enable hf transfer and add unzip to image

2023-10-29 04:53:14 -04:00
93 changed files with 912 additions and 3741 deletions
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -71,9 +71,8 @@ jobs:

      - name: Install dependencies
        run: |
-          pip3 install --extra-index-url https://download.pytorch.org/whl/cu118 -U torch==2.0.1
          pip3 uninstall -y transformers accelerate
-          pip3 install -U -e .[flash-attn,mamba-ssm]
+          pip3 install -U -e .[flash-attn]
          pip3 install -r requirements-tests.txt

      - name: Run e2e tests
--- a/.mypy.ini
+++ b/.mypy.ini
@@ -8,9 +8,6 @@ ignore_missing_imports = True
 [mypy-axolotl.monkeypatch.*]
 ignore_errors = True

-[mypy-axolotl.models.mixtral.*]
-ignore_errors = True
-
 [mypy-axolotl.models.phi.*]
 ignore_errors = True

--- a/README.md
+++ b/README.md
@@ -25,10 +25,8 @@ Features:
 - [Installation](#installation)
  - [Docker](#docker)
  - [Conda/Pip venv](#condapip-venv)
-  - [Runpod](#runpod)
  - [LambdaLabs](#lambdalabs)
  - [Windows](#windows)
-  - [Launching on public clouds via SkyPilot](#launching-on-public-clouds-via-skypilot)
 - [Dataset](#dataset)
  - [How to Add Custom Prompts](#how-to-add-custom-prompts)
  - [How to Use Custom Pretokenized Dataset](#how-to-use-your-custom-pretokenized-dataset)
@@ -65,21 +63,17 @@ Features:

 ## Axolotl supports

-|             | fp16/fp32 | lora | qlora | gptq | gptq w/flash attn | flash attn | xformers attn |
-|-------------|:----------|:-----|-------|------|-------------------|------------|--------------|
-| llama       | ✅         | ✅    | ✅     | ✅             | ✅                 | ✅          | ✅            |
-| Mistral     | ✅         | ✅    | ✅     | ✅             | ✅                 | ✅          | ✅            |
-| Mixtral-MoE | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
-| Pythia      | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
-| cerebras    | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
-| btlm        | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
-| mpt         | ✅         | ❌    | ❓     | ❌             | ❌                 | ❌          | ❓            |
-| falcon      | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
-| gpt-j       | ✅         | ✅    | ✅     | ❌             | ❌                 | ❓          | ❓            |
-| XGen        | ✅         | ❓    | ✅     | ❓             | ❓                 | ❓          | ✅            |
-| phi         | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
-| RWKV        | ✅         | ❓    | ❓     | ❓             | ❓                 | ❓          | ❓            |
-| Qwen        | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
+|          | fp16/fp32 | lora | qlora | gptq | gptq w/flash attn | flash attn | xformers attn |
+|----------|:----------|:-----|-------|------|-------------------|------------|--------------|
+| llama    | ✅         | ✅    | ✅     | ✅             | ✅                 | ✅          | ✅            |
+| Pythia   | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
+| cerebras | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
+| btlm     | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
+| mpt      | ✅         | ❌    | ❓     | ❌             | ❌                 | ❌          | ❓            |
+| falcon   | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
+| gpt-j    | ✅         | ✅    | ✅     | ❌             | ❌                 | ❓          | ❓            |
+| XGen     | ✅         | ❓    | ✅     | ❓             | ❓                 | ❓          | ✅            |
+| phi      | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |


 ## Quickstart ⚡
@@ -88,29 +82,20 @@ Get started with Axolotl in just a few steps! This quickstart guide will walk yo

 **Requirements**: Python >=3.9 and Pytorch >=2.0.

-`pip3 install "axolotl[flash-attn,deepspeed] @ git+https://github.com/OpenAccess-AI-Collective/axolotl"`
-
-### For developers
 ```bash
 git clone https://github.com/OpenAccess-AI-Collective/axolotl
 cd axolotl

 pip3 install packaging
 pip3 install -e '.[flash-attn,deepspeed]'
-```
+pip3 install -U git+https://github.com/huggingface/peft.git

-### Usage
-```bash
 # finetune lora
 accelerate launch -m axolotl.cli.train examples/openllama-3b/lora.yml

 # inference
 accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
    --lora_model_dir="./lora-out"
-
-# gradio
-accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
-    --lora_model_dir="./lora-out" --gradio
 ```

 ## Installation
@@ -121,6 +106,7 @@ accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
  ```bash
  docker run --gpus '"all"' --rm -it winglian/axolotl:main-py3.10-cu118-2.0.1
  ```
+  - `winglian/axolotl-runpod:main-latest`: for runpod or use this [direct link](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)

  Or run on the current files for development:

@@ -135,15 +121,13 @@ accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
  A more powerful Docker command to run would be this:

  ```bash
-  docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=volume,src=axolotl,target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface winglian/axolotl:main-py3.10-cu118-2.0.1
+  docker run --gpus '"all"' --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=volume,src=axolotl,target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface winglian/axolotl:main-py3.10-cu118-2.0.1
  ```

  It additionally:
  * Prevents memory issues when running e.g. deepspeed (e.g. you could hit SIGBUS/signal 7 error) through `--ipc` and `--ulimit` args.
  * Persists the downloaded HF data (models etc.) and your modifications to axolotl code through `--mount`/`-v` args.
  * The `--name` argument simply makes it easier to refer to the container in vscode (`Dev Containers: Attach to Running Container...`) or in your terminal.
-  * The `--privileged` flag gives all capabilities to the container.
-  * The `--shm-size 10g` argument increases the shared memory size. Use this if you see `exitcode: -7` errors using deepspeed.

  [More information on nvidia website](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#setincshmem)

@@ -165,10 +149,6 @@ accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
        ```
        Get the token at huggingface.co/settings/tokens

-#### Runpod
-
-Use `winglian/axolotl-runpod:main-latest` or use this [direct link](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)
-
 #### LambdaLabs
  <details>

@@ -216,28 +196,6 @@ Use `winglian/axolotl-runpod:main-latest` or use this [direct link](https://runp
 #### Windows
 Please use WSL or Docker!

-
-#### Launching on public clouds via SkyPilot
-To launch on GPU instances (both on-demand and spot instances) on 7+ clouds (GCP, AWS, Azure, OCI, and more), you can use [SkyPilot](https://skypilot.readthedocs.io/en/latest/index.html):
-```bash
-pip install "skypilot-nightly[gcp,aws,azure,oci,lambda,kubernetes,ibm,scp]"  # choose your clouds
-sky check
-```
-Get the [example YAMLs](https://github.com/skypilot-org/skypilot/tree/master/llm/axolotl) of using Axolotl to finetune `mistralai/Mistral-7B-v0.1`:
-```
-git clone https://github.com/skypilot-org/skypilot.git
-cd skypilot/llm/axolotl
-```
-Use one command to launch:
-```bash
-# On-demand
-HF_TOKEN=xx sky launch axolotl.yaml --env HF_TOKEN
-
-# Managed spot (auto-recovery on preemption)
-HF_TOKEN=xx BUCKET=<unique-name> sky spot launch axolotl-spot.yaml --env HF_TOKEN --env BUCKET
-```
-
-
 ### Dataset

 Axolotl supports a variety of dataset formats. Below are some of the formats you can use.
@@ -247,7 +205,7 @@ Have dataset(s) in one of the following format (JSONL recommended):
  ```json
  {"instruction": "...", "input": "...", "output": "..."}
  ```
- `sharegpt`: conversations where `from` is `human`/`gpt`. (optional: `system` to override default system prompt)
+- `sharegpt`: conversations where `from` is `human`/`gpt`
  ```json
  {"conversations": [{"from": "...", "value": "..."}]}
  ```
@@ -434,12 +392,6 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
    - path: knowrohit07/know_sql
      type: context_qa.load_v2
      train_on_split: validation
-
-  # loading from s3 or gcs
-  # s3 creds will be loaded from the system default and gcs only supports public access
-  dataset:
-    - path: s3://path_to_ds # Accepts folder with arrow/parquet or file path like above. Supports s3, gcs.
-      ...
  ```

 - loading
@@ -502,15 +454,6 @@ is_falcon_derived_model:
 is_llama_derived_model:
 # Please note that if you set this to true, `padding_side` will be set to "left" by default
 is_mistral_derived_model:
-is_qwen_derived_model:
-
-# optional overrides to the base model configuration
-model_config:
-  # RoPE Scaling https://github.com/huggingface/transformers/pull/24653
-  rope_scaling:
-    type: # linear | dynamic
-    factor: # float
-

 # Whether you are training a 4-bit GPTQ quantized model
 gptq: true
@@ -535,7 +478,7 @@ float16: true

 # A list of one or more datasets to finetune the model with
 datasets:
-  # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
+  # HuggingFace dataset repo | "json" for local dataset, make sure to fill data_files
  - path: vicgalle/alpaca-gpt4
  # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
    type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
@@ -543,12 +486,9 @@ datasets:
    data_files: # Optional[str] path to source data files
    shards: # Optional[int] number of shards to split data into
    name: # Optional[str] name of dataset configuration to load
-    train_on_split: train # Optional[str] name of dataset split to load from

    # Optional[str] fastchat conversation type, only used with type: sharegpt
    conversation:  # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
-    field_human: # Optional[str]. Human key to use for conversation.
-    field_model: # Optional[str]. Assistant key to use for conversation.

  # Custom user prompt
  - path: repo
@@ -614,12 +554,6 @@ eval_sample_packing:
 sample_packing_eff_est:
 total_num_tokens:

-# Passed through to transformers when loading the model when launched without accelerate
-# Use `sequential` when training w/ model parallelism to limit memory
-device_map:
-# Defines the max memory usage per gpu on the system. Passed through to transformers when loading the model.
-max_memory:
-
 # If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model
 adapter: lora
 # If you already have a lora model trained that you want to load, put that here.
@@ -667,8 +601,7 @@ wandb_mode: # "offline" to save run metadata locally and not sync to the server,
 wandb_project: # Your wandb project name
 wandb_entity: # A wandb Team name if using a Team
 wandb_watch:
-wandb_name: # Set the name of your wandb run
-wandb_run_id: # Set the ID of your wandb run
+wandb_run_id: # Set the name of your wandb run
 wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training

 # Where to save the full-finetuned model to
@@ -686,16 +619,13 @@ gradient_accumulation_steps: 1
 micro_batch_size: 2
 eval_batch_size:
 num_epochs: 4
-warmup_steps: 100  # cannot use with warmup_ratio
-warmup_ratio: 0.05  # cannot use with warmup_steps
+warmup_steps: 100
 learning_rate: 0.00003
 lr_quadratic_warmup:
 logging_steps:
-eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
-evals_per_epoch: # number of times per epoch to run evals, mutually exclusive with eval_steps
 save_strategy: # Set to `no` to skip checkpoint saves
 save_steps: # Leave empty to save at each epoch
-saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
+eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
 save_total_limit: # Checkpoints saved at a time
 # Maximum number of iterations to train for. It precedes num_epochs which means that
 # if both are set, num_epochs will not be guaranteed.
@@ -705,9 +635,6 @@ max_steps:
 eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
 eval_table_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128

-loss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)
-loss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)
-
 # Save model as safetensors (require safetensors package)
 save_safetensors:

@@ -794,6 +721,10 @@ landmark_attention:
 # xpos RoPE see https://github.com/kaiokendev/cutoff-len-is-context-len/blob/main/util/xpos_rope_llama_monkey_patch.py
 # LLaMA only
 xpos_rope:
+# RoPE Scaling https://github.com/huggingface/transformers/pull/24653
+rope_scaling:
+  type: # linear | dynamic
+  factor: # float

 # Resume from a specific checkpoint dir
 resume_from_checkpoint:
@@ -966,7 +897,7 @@ wandb_mode:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:
 ```

@@ -987,10 +918,6 @@ Pass the appropriate flag to the train command:
  cat /tmp/prompt.txt | python -m axolotl.cli.inference examples/your_config.yml \
    --base_model="./completed-model" --prompter=None --load_in_8bit=True
  ```
-- With gradio hosting
-  ```bash
-  python -m axolotl.cli.inference examples/your_config.yml --gradio
-  ```

 Please use `--sample_packing False` if you have it on and receive the error similar to below:

@@ -1022,10 +949,6 @@ Please reduce any below
  - `gradient_accumulation_steps`
  - `sequence_len`

-If it does not help, try running without deepspeed and without accelerate (replace "accelerate launch" with "python") in the command.
-
-Using adamw_bnb_8bit might also save you some memory.
-
 > `failed (exitcode: -9)`

 Usually means your system has run out of system memory.
--- a/deepspeed/zero1.json
+++ b/deepspeed/zero1.json
@@ -24,6 +24,16 @@
      "weight_decay": "auto"
    }
  },
+  "scheduler": {
+    "type": "WarmupDecayLR",
+    "params": {
+      "warmup_min_lr": "auto",
+      "warmup_max_lr": "auto",
+      "warmup_num_steps": "auto",
+      "warmup_type": "linear",
+      "total_num_steps": "auto"
+    }
+  },
  "gradient_accumulation_steps": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
--- a/deepspeed/zero2.json
+++ b/deepspeed/zero2.json
@@ -28,6 +28,16 @@
      "weight_decay": "auto"
    }
  },
+  "scheduler": {
+    "type": "WarmupDecayLR",
+    "params": {
+      "warmup_min_lr": "auto",
+      "warmup_max_lr": "auto",
+      "warmup_num_steps": "auto",
+      "warmup_type": "linear",
+      "total_num_steps": "auto"
+    }
+  },
  "gradient_accumulation_steps": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
--- a/deepspeed/zero3.json
+++ b/deepspeed/zero3.json
@@ -32,6 +32,16 @@
      "weight_decay": "auto"
    }
  },
+  "scheduler": {
+    "type": "WarmupDecayLR",
+    "params": {
+      "warmup_min_lr": "auto",
+      "warmup_max_lr": "auto",
+      "warmup_num_steps": "auto",
+      "warmup_type": "linear",
+      "total_num_steps": "auto"
+    }
+  },
  "gradient_accumulation_steps": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -8,6 +8,7 @@ ENV BNB_CUDA_VERSION=$CUDA
 ARG PYTORCH_VERSION="2.0.1"

 ENV PYTORCH_VERSION=$PYTORCH_VERSION
+ENV HF_HUB_ENABLE_HF_TRANSFER=1

 RUN apt-get update && \
    apt-get install -y vim curl
@@ -21,9 +22,9 @@ WORKDIR /workspace/axolotl
 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN sed -i "s/torch==.*/torch==$PYTORCH_VERSION/" requirements.txt
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install -e .[deepspeed,flash-attn,$AXOLOTL_EXTRAS]; \
+        pip install -e .[flash-attn,$AXOLOTL_EXTRAS]; \
    else \
-        pip install -e .[deepspeed,flash-attn]; \
+        pip install -e .[flash-attn]; \
    fi

 # fix so that git fetch/pull from remote works
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -10,13 +10,11 @@ ENV PATH="/root/miniconda3/bin:${PATH}"
 ARG PYTHON_VERSION="3.9"
 ARG PYTORCH_VERSION="2.0.1"
 ARG CUDA="118"
-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"

 ENV PYTHON_VERSION=$PYTHON_VERSION
-ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST

 RUN apt-get update \
-    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev && rm -rf /var/lib/apt/lists/* \
+    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev unzip && rm -rf /var/lib/apt/lists/* \
    && wget \
    https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
    && mkdir /root/.conda \
@@ -29,9 +27,47 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
 WORKDIR /workspace

 RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
-    python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} deepspeed-kernels --extra-index-url https://download.pytorch.org/whl/cu$CUDA
+    python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA

-RUN git lfs install --skip-repo && \
-    pip3 install awscli && \
+FROM base-builder AS deepspeed-builder
+
+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
+
+WORKDIR /workspace
+
+RUN git clone https://github.com/microsoft/DeepSpeed.git && \
+    cd DeepSpeed && \
+    MAX_CONCURRENCY=8 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_OPS=1 DS_BUILD_EVOFORMER_ATTN=0 python3 setup.py bdist_wheel
+
+FROM base-builder AS bnb-builder
+
+WORKDIR /workspace
+ARG CUDA="118"
+ENV CUDA=$CUDA
+ARG MAX_JOBS="-1"
+ENV MAX_JOBS=$MAX_JOBS
+
+RUN git clone https://github.com/TimDettmers/bitsandbytes.git && \
+    cd bitsandbytes && \
+    CUDA_VERSION=$CUDA make cuda11x && \
+    python setup.py bdist_wheel
+
+FROM base-builder
+
+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
+ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
+
+RUN mkdir -p /workspace/builds
+COPY --from=bnb-builder /workspace/bitsandbytes /workspace/builds/bitsandbytes
+
+RUN mkdir -p /workspace/wheels/bitsandbytes
+COPY --from=deepspeed-builder /workspace/DeepSpeed/dist/deepspeed-*.whl wheels
+COPY --from=bnb-builder /workspace/bitsandbytes/dist/bitsandbytes-*.whl wheels
+COPY --from=bnb-builder /workspace/bitsandbytes/bitsandbytes/libbitsandbytes*.so wheels/bitsandbytes
+
+RUN pip3 install wheels/deepspeed-*.whl
+RUN cd /workspace/builds/bitsandbytes && python3 setup.py install
+RUN git lfs install --skip-repo
+RUN pip3 install awscli && \
    # The base image ships with `pydantic==1.8.2` which is not working
    pip3 install -U --no-cache-dir pydantic==1.10.10
--- a/docker/Dockerfile-runpod
+++ b/docker/Dockerfile-runpod
@@ -4,7 +4,6 @@ FROM winglian/axolotl:$BASE_TAG
 ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
 ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
 ENV TRANSFORMERS_CACHE="/workspace/data/huggingface-cache/hub"
-ENV HF_HOME="/workspace/data/huggingface-cache/hub"

 COPY scripts/runpod-entrypoint.sh /root/runpod-entrypoint.sh

--- a/docs/faq.md
+++ b/docs/faq.md
@@ -12,7 +12,3 @@ This usually happens when you run out of system RAM.
 > Exitcode -7 while using deepspeed

 Try upgrading deepspeed w: `pip install -U deepspeed`
-
-> AttributeError: 'DummyOptim' object has no attribute 'step'
-
-You may be using deepspeed with single gpu. Please don't set `deepspeed:` in yaml or cli.
--- a/examples/cerebras/btlm-ft.yml
+++ b/examples/cerebras/btlm-ft.yml
@@ -14,7 +14,7 @@ datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
 dataset_prepared_path: last_prepared_run
-val_set_size: 0.05
+val_set_size: 0.01

 adapter:
 lora_model_dir:
@@ -35,7 +35,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:

 output_dir: btlm-out
@@ -72,8 +72,8 @@ gptq_groupsize:
 gptq_model_v1:

 warmup_steps: 32
-evals_per_epoch: 4
-saves_per_epoch: 1
+eval_steps:
+save_steps:
 save_total_limit:

 debug:
--- a/examples/cerebras/qlora.yml
+++ b/examples/cerebras/qlora.yml
@@ -7,7 +7,7 @@ datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
 dataset_prepared_path:
-val_set_size: 0.05
+val_set_size: 0.01
 adapter: qlora
 lora_model_dir:
 sequence_len: 2048
@@ -24,7 +24,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:
 output_dir: ./qlora-out
 batch_size: 4
@@ -49,8 +49,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
-evals_per_epoch: 4
-saves_per_epoch: 1
+eval_steps: 0.05
+save_steps:
 debug:
 deepspeed:
 weight_decay: 0.1
--- a/examples/code-llama/13b/lora.yml
+++ b/examples/code-llama/13b/lora.yml
@@ -11,7 +11,7 @@ datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
 dataset_prepared_path:
-val_set_size: 0.05
+val_set_size: 0.01
 output_dir: ./lora-out

 sequence_len: 4096
@@ -29,7 +29,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:

 gradient_accumulation_steps: 4
@@ -54,8 +54,8 @@ xformers_attention:
 flash_attention: true

 warmup_steps: 10
-evals_per_epoch: 4
-saves_per_epoch: 1
+eval_steps: 0.05
+save_steps:
 debug:
 deepspeed:
 weight_decay: 0.0
--- a/examples/code-llama/13b/qlora.yml
+++ b/examples/code-llama/13b/qlora.yml
@@ -11,7 +11,7 @@ datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
 dataset_prepared_path:
-val_set_size: 0.05
+val_set_size: 0.01
 output_dir: ./qlora-out

 adapter: qlora
@@ -31,7 +31,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:

 gradient_accumulation_steps: 4
@@ -56,8 +56,8 @@ xformers_attention:
 flash_attention: true

 warmup_steps: 10
-evals_per_epoch: 4
-saves_per_epoch: 1
+eval_steps: 0.05
+save_steps:
 debug:
 deepspeed:
 weight_decay: 0.0
--- a/examples/code-llama/34b/lora.yml
+++ b/examples/code-llama/34b/lora.yml
@@ -11,7 +11,7 @@ datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
 dataset_prepared_path:
-val_set_size: 0.05
+val_set_size: 0.01
 output_dir: ./lora-out

 sequence_len: 4096
@@ -29,7 +29,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:

 gradient_accumulation_steps: 4
@@ -54,8 +54,8 @@ xformers_attention:
 flash_attention: true

 warmup_steps: 10
-evals_per_epoch: 4
-saves_per_epoch: 1
+eval_steps: 0.05
+save_steps:
 debug:
 deepspeed:
 weight_decay: 0.0
--- a/examples/code-llama/34b/qlora.yml
+++ b/examples/code-llama/34b/qlora.yml
@@ -11,7 +11,7 @@ datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
 dataset_prepared_path:
-val_set_size: 0.05
+val_set_size: 0.01
 output_dir: ./qlora-out

 adapter: qlora
@@ -31,7 +31,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:

 gradient_accumulation_steps: 4
@@ -56,8 +56,8 @@ xformers_attention:
 flash_attention: true

 warmup_steps: 10
-evals_per_epoch: 4
-saves_per_epoch: 1
+eval_steps: 0.05
+save_steps:
 debug:
 deepspeed:
 weight_decay: 0.0
--- a/examples/code-llama/7b/lora.yml
+++ b/examples/code-llama/7b/lora.yml
@@ -11,7 +11,7 @@ datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
 dataset_prepared_path:
-val_set_size: 0.05
+val_set_size: 0.01
 output_dir: ./lora-out

 sequence_len: 4096
@@ -29,7 +29,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:

 gradient_accumulation_steps: 4
@@ -54,8 +54,8 @@ xformers_attention:
 flash_attention: true

 warmup_steps: 10
-evals_per_epoch: 4
-saves_per_epoch: 1
+eval_steps: 0.05
+save_steps:
 debug:
 deepspeed:
 weight_decay: 0.0
--- a/examples/code-llama/7b/qlora.yml
+++ b/examples/code-llama/7b/qlora.yml
@@ -11,7 +11,7 @@ datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
 dataset_prepared_path:
-val_set_size: 0.05
+val_set_size: 0.01
 output_dir: ./qlora-out

 adapter: qlora
@@ -31,7 +31,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:

 gradient_accumulation_steps: 4
@@ -56,8 +56,8 @@ xformers_attention:
 flash_attention: true

 warmup_steps: 10
-evals_per_epoch: 4
-saves_per_epoch: 1
+eval_steps: 0.05
+save_steps:
 debug:
 deepspeed:
 weight_decay: 0.0
--- a/examples/falcon/config-7b-lora.yml
+++ b/examples/falcon/config-7b-lora.yml
@@ -12,7 +12,7 @@ datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca:chat
 dataset_prepared_path:
-val_set_size: 0.05
+val_set_size: 0.01
 adapter: lora
 lora_model_dir:
 sequence_len: 2048
@@ -26,7 +26,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:
 output_dir: ./falcon-7b
 batch_size: 2
@@ -51,8 +51,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 40
-evals_per_epoch: 4
-saves_per_epoch: 1
+eval_steps: 5
+save_steps: 43
 debug:
 deepspeed:
 weight_decay: 0.0
--- a/examples/falcon/config-7b-qlora.yml
+++ b/examples/falcon/config-7b-qlora.yml
@@ -18,7 +18,7 @@ datasets:
      - Chain-of-Thought/formatted_cot_data/gsm8k_train.json
    type: "alpaca:chat"
 dataset_prepared_path:
-val_set_size: 0.05
+val_set_size: 0.01
 # enable QLoRA
 adapter: qlora
 lora_model_dir:
@@ -40,7 +40,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:
 output_dir: ./qlora-out

@@ -80,8 +80,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
-evals_per_epoch: 4
-saves_per_epoch: 1
+eval_steps: 5
+save_steps: 10
 debug:
 deepspeed:
 weight_decay: 0.000001
--- a/examples/falcon/config-7b.yml
+++ b/examples/falcon/config-7b.yml
@@ -12,7 +12,7 @@ datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca:chat
 dataset_prepared_path:
-val_set_size: 0.05
+val_set_size: 0.01
 adapter:
 lora_model_dir:
 sequence_len: 2048
@@ -26,7 +26,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:
 output_dir: ./falcon-7b
 batch_size: 2
@@ -51,8 +51,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 40
-evals_per_epoch: 4
-saves_per_epoch: 1
+eval_steps: 5
+save_steps: 43
 debug:
 deepspeed:
 weight_decay: 0.0
--- a/examples/gptj/qlora.yml
+++ b/examples/gptj/qlora.yml
@@ -7,7 +7,7 @@ datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
 dataset_prepared_path:
-val_set_size: 0.05
+val_set_size: 0.01
 adapter: qlora
 lora_model_dir:
 sequence_len: 2048
@@ -21,7 +21,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:
 output_dir: ./qlora-out
 gradient_accumulation_steps: 2
@@ -46,8 +46,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
-evals_per_epoch: 4
-saves_per_epoch: 1
+eval_steps: 0.05
+save_steps:
 debug:
 deepspeed:
 weight_decay: 0.1
--- a/examples/jeopardy-bot/config.yml
+++ b/examples/jeopardy-bot/config.yml
@@ -19,7 +19,7 @@ lora_fan_in_fan_out: false
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:
 output_dir: ./jeopardy-bot-7b
 gradient_accumulation_steps: 1
@@ -42,8 +42,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
-evals_per_epoch: 4
-saves_per_epoch: 1
+eval_steps: 110
+save_steps: 660
 debug:
 deepspeed:
 weight_decay: 0.1
--- a/examples/llama-2/fft_optimized.yml
+++ b/examples/llama-2/fft_optimized.yml
@@ -11,7 +11,7 @@ datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
 dataset_prepared_path: last_run_prepared
-val_set_size: 0.05
+val_set_size: 0.01
 output_dir: ./out

 sequence_len: 4096
@@ -29,7 +29,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:

 gradient_accumulation_steps: 1
@@ -58,9 +58,9 @@ flash_attn_fuse_qkv: false
 flash_attn_fuse_mlp: true

 warmup_steps: 100
-evals_per_epoch: 4
+eval_steps: 0.05
 eval_table_size:
-saves_per_epoch: 1
+save_steps:
 debug:
 deepspeed: #deepspeed/zero2.json # multi-gpu only
 weight_decay: 0.1
--- a/examples/llama-2/gptq-lora.yml
+++ b/examples/llama-2/gptq-lora.yml
@@ -15,7 +15,7 @@ datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
 dataset_prepared_path:
-val_set_size: 0.05
+val_set_size: 0.01
 adapter: lora
 lora_model_dir:
 sequence_len: 4096
@@ -32,7 +32,7 @@ lora_target_linear:
 lora_fan_in_fan_out:
 wandb_project:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:
 output_dir: ./model-out
 gradient_accumulation_steps: 1
@@ -62,8 +62,8 @@ flash_attention:
 sdp_attention:
 flash_optimum:
 warmup_steps: 100
-evals_per_epoch: 4
-saves_per_epoch: 1
+eval_steps:
+save_steps:
 debug:
 deepspeed:
 weight_decay: 0.1
--- a/examples/llama-2/lora.yml
+++ b/examples/llama-2/lora.yml
@@ -11,7 +11,7 @@ datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
 dataset_prepared_path:
-val_set_size: 0.05
+val_set_size: 0.01
 output_dir: ./lora-out

 sequence_len: 4096
@@ -29,7 +29,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:

 gradient_accumulation_steps: 4
@@ -54,10 +54,10 @@ xformers_attention:
 flash_attention: true

 warmup_steps: 10
-evals_per_epoch: 4
+eval_steps: 0.05
 eval_table_size:
 eval_table_max_new_tokens: 128
-saves_per_epoch: 1
+save_steps:
 debug:
 deepspeed:
 weight_decay: 0.0
--- a/examples/llama-2/qlora.yml
+++ b/examples/llama-2/qlora.yml
@@ -11,7 +11,7 @@ datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
 dataset_prepared_path:
-val_set_size: 0.05
+val_set_size: 0.01
 output_dir: ./qlora-out

 adapter: qlora
@@ -31,7 +31,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:

 gradient_accumulation_steps: 4
@@ -56,9 +56,9 @@ xformers_attention:
 flash_attention: true

 warmup_steps: 10
-evals_per_epoch: 4
+eval_steps: 0.05
 eval_table_size:
-saves_per_epoch: 1
+save_steps:
 debug:
 deepspeed:
 weight_decay: 0.0
--- a/examples/llama-2/relora.yml
+++ b/examples/llama-2/relora.yml
@@ -11,7 +11,7 @@ datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
 dataset_prepared_path:
-val_set_size: 0.05
+val_set_size: 0.01
 output_dir: ./relora-out

 adapter: qlora
@@ -35,7 +35,7 @@ relora_cpu_offload: false
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:

 gradient_accumulation_steps: 4
@@ -60,8 +60,8 @@ xformers_attention:
 flash_attention: true

 warmup_steps: 10
-evals_per_epoch: 4
-saves_per_epoch: 1
+eval_steps: 0.05
+save_steps: 50
 debug:
 deepspeed:
 weight_decay: 0.0
--- a/examples/llama-2/tiny-llama.yml
+++ b/examples/llama-2/tiny-llama.yml
@@ -1,4 +1,4 @@
-base_model: PY007/TinyLlama-1.1B-intermediate-step-715k-1.5T
+base_model: PY007/TinyLlama-1.1B-step-50K-105b

 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
@@ -12,7 +12,7 @@ datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
 dataset_prepared_path:
-val_set_size: 0.05
+val_set_size: 0.01
 output_dir: ./lora-out

 sequence_len: 4096
@@ -29,7 +29,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:

 gradient_accumulation_steps: 4
@@ -54,9 +54,9 @@ xformers_attention:
 flash_attention: true

 warmup_steps: 10
-evals_per_epoch: 4
+eval_steps: 0.05
 eval_table_size:
-saves_per_epoch: 1
+save_steps:
 debug:
 deepspeed:
 weight_decay: 0.0
--- a/examples/mamba/config.yml
+++ b/examples/mamba/config.yml
@@ -1,61 +0,0 @@
-base_model: state-spaces/mamba-2.8b
-model_type: MambaLMHeadModel
-tokenizer_type: AutoTokenizer
-tokenizer_config: EleutherAI/gpt-neox-20b
-
-load_in_8bit: false
-load_in_4bit: false
-strict: false
-
-datasets:
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
-dataset_prepared_path:
-val_set_size: 0.0
-output_dir: ./out
-
-sequence_len: 2048
-sample_packing: false
-pad_to_sequence_len: false
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 1
-num_epochs: 2
-optimizer: paged_adamw_8bit
-lr_scheduler: cosine
-learning_rate: 5e-5
-
-train_on_inputs: false
-group_by_length: true
-
-bf16: true
-fp16: false
-tf32: true
-
-gradient_checkpointing: false
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention:
-
-warmup_steps: 10
-evals_per_epoch: 4
-eval_table_size:
-eval_table_max_new_tokens: 128
-saves_per_epoch: 1
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
-special_tokens:
-tokens:
-save_safetensors: False
--- a/examples/mistral/config.yml
+++ b/examples/mistral/config.yml
@@ -11,7 +11,7 @@ datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
 dataset_prepared_path:
-val_set_size: 0.05
+val_set_size: 0.01
 output_dir: ./out

 sequence_len: 8192
@@ -21,7 +21,7 @@ pad_to_sequence_len: true
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:

 gradient_accumulation_steps: 4
@@ -46,10 +46,10 @@ xformers_attention:
 flash_attention: true

 warmup_steps: 10
-evals_per_epoch: 4
+eval_steps: 0.05
 eval_table_size:
 eval_table_max_new_tokens: 128
-saves_per_epoch: 1
+save_steps:
 debug:
 deepspeed:
 weight_decay: 0.0
--- a/examples/mistral/mixtral.yml
+++ b/examples/mistral/mixtral.yml
@@ -1,79 +0,0 @@
-base_model: mistralai/Mixtral-8x7B-v0.1
-model_type: AutoModelForCausalLM
-tokenizer_type: LlamaTokenizer
-trust_remote_code: true
-
-load_in_8bit: false
-load_in_4bit: true
-strict: false
-
-datasets:
-  - path: tatsu-lab/alpaca
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.0
-output_dir: ./qlora-out
-
-adapter: qlora
-lora_model_dir:
-
-sequence_len: 4096
-sample_packing: true
-pad_to_sequence_len: true
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-lora_fan_in_fan_out:
-#lora_target_modules:
-#  - gate
-#  - q_proj
-#  - k_proj
-#  - v_proj
-#  - o_proj
-#  - w1
-#  - w2
-#  - w3
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 2
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: true
-fp16: false
-tf32: false
-
-gradient_checkpointing: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-
-loss_watchdog_threshold: 5.0
-loss_watchdog_patience: 3
-
-warmup_steps: 10
-evals_per_epoch: 4
-eval_table_size:
-eval_table_max_new_tokens: 128
-saves_per_epoch: 1
-debug:
-deepspeed: deepspeed/zero2.json
-weight_decay: 0.0
-fsdp:
-fsdp_config:
-special_tokens:
--- a/examples/mistral/qlora.yml
+++ b/examples/mistral/qlora.yml
@@ -11,7 +11,7 @@ datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
 dataset_prepared_path: last_run_prepared
-val_set_size: 0.05
+val_set_size: 0.01
 output_dir: ./qlora-out

 adapter: qlora
@@ -38,7 +38,7 @@ lora_target_modules:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:

 gradient_accumulation_steps: 4
@@ -62,14 +62,11 @@ logging_steps: 1
 xformers_attention:
 flash_attention: true

-loss_watchdog_threshold: 5.0
-loss_watchdog_patience: 3
-
 warmup_steps: 10
-evals_per_epoch: 4
+eval_steps: 0.05
 eval_table_size:
 eval_table_max_new_tokens: 128
-saves_per_epoch: 1
+save_steps:
 debug:
 deepspeed:
 weight_decay: 0.0
--- a/examples/mpt-7b/config.yml
+++ b/examples/mpt-7b/config.yml
@@ -21,7 +21,7 @@ lora_fan_in_fan_out: false
 wandb_project: mpt-alpaca-7b
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:
 output_dir: ./mpt-alpaca-7b
 gradient_accumulation_steps: 1
@@ -44,8 +44,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
-evals_per_epoch: 4
-saves_per_epoch: 1
+eval_steps: 110
+save_steps: 660
 debug:
 deepspeed:
 weight_decay: 0.0001
--- a/examples/openllama-3b/config.yml
+++ b/examples/openllama-3b/config.yml
@@ -23,7 +23,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:
 output_dir: ./openllama-out
 gradient_accumulation_steps: 1
@@ -49,8 +49,8 @@ flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
-evals_per_epoch: 4
-saves_per_epoch: 1
+eval_steps: 0.05
+save_steps:
 debug:
 deepspeed:
 weight_decay: 0.1
--- a/examples/openllama-3b/lora.yml
+++ b/examples/openllama-3b/lora.yml
@@ -29,7 +29,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-out
 gradient_accumulation_steps: 1
@@ -54,8 +54,8 @@ flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
-evals_per_epoch: 4
-saves_per_epoch: 1
+eval_steps: 0.05
+save_steps:
 debug:
 deepspeed:
 weight_decay: 0.1
--- a/examples/openllama-3b/qlora.yml
+++ b/examples/openllama-3b/qlora.yml
@@ -9,7 +9,7 @@ datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
 dataset_prepared_path:
-val_set_size: 0.05
+val_set_size: 0.01
 adapter: qlora
 lora_model_dir:
 sequence_len: 1024
@@ -23,7 +23,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:
 output_dir: ./qlora-out
 gradient_accumulation_steps: 1
@@ -48,8 +48,8 @@ flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
-evals_per_epoch: 4
-saves_per_epoch: 1
+eval_steps: 0.05
+save_steps:
 debug:
 deepspeed:
 weight_decay: 0.1
--- a/examples/phi/phi-ft.yml
+++ b/examples/phi/phi-ft.yml
@@ -1,5 +1,5 @@
 base_model: microsoft/phi-1_5
-model_type: PhiForCausalLM
+model_type: MixFormerSequentialForCausalLM
 tokenizer_type: AutoTokenizer
 is_llama_derived_model: false
 trust_remote_code: true
@@ -31,7 +31,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:

 gradient_accumulation_steps: 1
@@ -59,8 +59,8 @@ xformers_attention:
 flash_attention:

 warmup_steps: 100
-evals_per_epoch: 4
-saves_per_epoch: 1
+eval_steps: 0.05
+save_steps:
 debug:
 deepspeed:
 weight_decay: 0.1
--- a/examples/phi/phi-qlora.yml
+++ b/examples/phi/phi-qlora.yml
@@ -31,7 +31,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:

 gradient_accumulation_steps: 1
@@ -59,8 +59,8 @@ xformers_attention:
 flash_attention:

 warmup_steps: 100
-evals_per_epoch: 4
-saves_per_epoch: 1
+eval_steps: 0.05
+save_steps:
 debug:
 deepspeed:
 weight_decay: 0.1
--- a/examples/pythia-12b/config.yml
+++ b/examples/pythia-12b/config.yml
@@ -24,7 +24,7 @@ lora_fan_in_fan_out: true  # pythia/GPTNeoX lora specific
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:
 output_dir: ./pythia-12b
 gradient_accumulation_steps: 1
--- a/examples/pythia/lora.yml
+++ b/examples/pythia/lora.yml
@@ -18,7 +18,7 @@ lora_fan_in_fan_out: true  # pythia/GPTNeoX lora specific
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-alpaca-pythia
 gradient_accumulation_steps: 1
@@ -33,5 +33,5 @@ early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 weight_decay: 0.1
-evals_per_epoch: 4
+eval_steps: 0.05
 logging_steps: 1
--- a/examples/qwen/lora.yml
+++ b/examples/qwen/lora.yml
@@ -1,68 +0,0 @@
-base_model: Qwen/Qwen-7B
-model_type: AutoModelForCausalLM
-tokenizer_type: AutoTokenizer
-
-is_qwen_derived_model: true
-trust_remote_code: true
-
-load_in_8bit: true
-load_in_4bit: false
-strict: false
-
-datasets:
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
-dataset_prepared_path:
-val_set_size: 0.05
-output_dir: ./lora-out
-
-sequence_len: 2048  # supports up to 8192
-sample_packing: false
-pad_to_sequence_len:
-
-adapter: lora
-lora_model_dir:
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-lora_fan_in_fan_out:
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 4
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: true
-fp16: false
-tf32: false
-
-gradient_checkpointing: false
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention:
-
-warmup_steps: 10
-evals_per_epoch: 4
-eval_table_size:
-eval_table_max_new_tokens: 128
-saves_per_epoch: 1
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
-special_tokens:
--- a/examples/qwen/qlora.yml
+++ b/examples/qwen/qlora.yml
@@ -1,68 +0,0 @@
-base_model: Qwen/Qwen-7B
-model_type: AutoModelForCausalLM
-tokenizer_type: AutoTokenizer
-
-is_qwen_derived_model: true
-trust_remote_code: true
-
-load_in_8bit: false
-load_in_4bit: true
-strict: false
-
-datasets:
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
-dataset_prepared_path:
-val_set_size: 0.05
-output_dir: ./lora-out
-
-sequence_len: 2048  # supports up to 8192
-sample_packing: false
-pad_to_sequence_len:
-
-adapter: qlora
-lora_model_dir:
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-lora_fan_in_fan_out:
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 4
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: true
-fp16: false
-tf32: false
-
-gradient_checkpointing: false
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention:
-
-warmup_steps: 10
-evals_per_epoch: 4
-eval_table_size:
-eval_table_max_new_tokens: 128
-saves_per_epoch: 1
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
-special_tokens:
--- a/examples/redpajama/config-3b.yml
+++ b/examples/redpajama/config-3b.yml
@@ -22,7 +22,7 @@ lora_fan_in_fan_out: false
 wandb_project: redpajama-alpaca-3b
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:
 output_dir: ./redpajama-alpaca-3b
 batch_size: 4
@@ -45,8 +45,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
-evals_per_epoch: 4
-saves_per_epoch: 1
+eval_steps: 110
+save_steps: 660
 debug:
 deepspeed:
 weight_decay: 0.0001
--- a/examples/replit-3b/config-lora.yml
+++ b/examples/replit-3b/config-lora.yml
@@ -21,7 +21,7 @@ lora_fan_in_fan_out:
 wandb_project: lora-replit
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-replit
 batch_size: 8
@@ -45,8 +45,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
-evals_per_epoch: 4
-saves_per_epoch: 1
+eval_steps: 50
+save_steps:
 debug:
 deepspeed:
 weight_decay: 0
--- a/examples/xgen-7b/xgen-7b-8k-qlora.yml
+++ b/examples/xgen-7b/xgen-7b-8k-qlora.yml
@@ -16,7 +16,7 @@ datasets:
      - openassistant_best_replies_train.jsonl
    type: "completion"
 dataset_prepared_path:
-val_set_size: 0.05
+val_set_size: 0.01
 # enable QLoRA
 adapter: qlora
 lora_model_dir:
@@ -38,7 +38,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_name:
+wandb_run_id:
 wandb_log_model:
 output_dir: ./qlora-out

@@ -78,8 +78,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
-evals_per_epoch: 4
-saves_per_epoch: 1
+eval_steps: 50
+save_steps: 50
 debug:
 deepspeed:
 weight_decay: 0.0
--- a/gitbook/README.md
+++ b/gitbook/README.md
@@ -0,0 +1 @@
+# Page
--- a/gitbook/SUMMARY.md
+++ b/gitbook/SUMMARY.md
@@ -0,0 +1,4 @@
+# Table of contents
+
+* [Page](README.md)
+* [Small dev details](small-dev-details.md)
--- a/gitbook/small-dev-details.md
+++ b/gitbook/small-dev-details.md
@@ -0,0 +1,3 @@
+# Small dev details
+
+/
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,22 +1,23 @@
+--extra-index-url https://download.pytorch.org/whl/cu118
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
-auto-gptq==0.5.1
+torch==2.0.1
+auto-gptq
 packaging
-peft==0.6.0
-transformers @ git+https://github.com/huggingface/transformers.git@e5079b0b2abcef11ecbdae60ba4a6636c57b725d
-tokenizers==0.15.0
+peft @ git+https://github.com/huggingface/peft.git
+transformers @ git+https://github.com/huggingface/transformers.git@acc394c4f5e1283c19783581790b3dc3105a3697
 bitsandbytes>=0.41.1
-accelerate==0.24.1
+accelerate @ git+https://github.com/huggingface/accelerate@80da9cfb09bb3cc9f1b385cb55d6b90d025a5fd9
 deepspeed
 addict
 fire
 PyYAML>=6.0
-datasets>=2.15.0
-flash-attn==2.3.3
+datasets
+flash-attn>=2.3.0
 sentencepiece
 wandb
 einops
-xformers==0.0.22
-optimum==1.13.2
+xformers>=0.0.22
+optimum
 hf_transfer
 colorama
 numba
@@ -29,11 +30,4 @@ scipy
 scikit-learn==1.2.2
 pynvml
 art
-fschat==0.2.34
-gradio==3.50.2
-tensorboard
-
-# remote filesystems
-s3fs
-gcsfs
-# adlfs
+fschat==0.2.29
--- a/setup.py
+++ b/setup.py
@@ -46,13 +46,10 @@ setup(
    dependency_links=dependency_links,
    extras_require={
        "flash-attn": [
-            "flash-attn==2.3.3",
+            "flash-attn>=2.3.0",
        ],
        "deepspeed": [
            "deepspeed",
        ],
-        "mamba-ssm": [
-            "mamba-ssm==1.0.1",
-        ],
    },
 )
--- a/src/axolotl/cli/init.py
+++ b/src/axolotl/cli/init.py
@@ -6,10 +6,8 @@ import os
 import random
 import sys
 from pathlib import Path
-from threading import Thread
 from typing import Any, Dict, List, Optional, Union

-import gradio as gr
 import torch
 import yaml

@@ -18,7 +16,7 @@ from accelerate.commands.config import config_args
 from art import text2art
 from huggingface_hub import HfApi
 from huggingface_hub.utils import LocalTokenNotFoundError
-from transformers import GenerationConfig, TextIteratorStreamer, TextStreamer
+from transformers import GenerationConfig, TextStreamer

 from axolotl.common.cli import TrainerCliArgs, load_model_and_tokenizer
 from axolotl.logging_config import configure_logging
@@ -29,7 +27,6 @@ from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import is_main_process
 from axolotl.utils.models import load_tokenizer
 from axolotl.utils.tokenization import check_dataset_labels
-from axolotl.utils.trainer import prepare_optim_env
 from axolotl.utils.wandb_ import setup_wandb_env_vars

 project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
@@ -47,7 +44,7 @@ def print_axolotl_text_art(suffix=None):
    ascii_text = "  axolotl"
    if suffix:
        ascii_text += f"  x  {suffix}"
-    ascii_art = text2art(ascii_text, font=font)
+    ascii_art = text2art(" axolotl", font=font)

    if is_main_process():
        print(ascii_art)
@@ -72,7 +69,7 @@ def do_merge_lora(

    LOG.info("running merge of LoRA with base model")
    model = model.merge_and_unload()
-    model.to(dtype=cfg.torch_dtype)
+    model.to(dtype=torch.float16)

    if cfg.local_rank == 0:
        LOG.info(f"saving merged model to: {str(Path(cfg.output_dir) / 'merged')}")
@@ -156,91 +153,6 @@ def do_inference(
        print(tokenizer.decode(generated["sequences"].cpu().tolist()[0]))


-def do_inference_gradio(
-    *,
-    cfg: DictDefault,
-    cli_args: TrainerCliArgs,
-):
-    model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args)
-    prompter = cli_args.prompter
-    default_tokens = {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
-
-    for token, symbol in default_tokens.items():
-        # If the token isn't already specified in the config, add it
-        if not (cfg.special_tokens and token in cfg.special_tokens):
-            tokenizer.add_special_tokens({token: symbol})
-
-    prompter_module = None
-    if prompter:
-        prompter_module = getattr(
-            importlib.import_module("axolotl.prompters"), prompter
-        )
-
-    if cfg.landmark_attention:
-        from axolotl.monkeypatch.llama_landmark_attn import set_model_mem_id
-
-        set_model_mem_id(model, tokenizer)
-        model.set_mem_cache_args(
-            max_seq_len=255, mem_freq=50, top_k=5, max_cache_size=None
-        )
-
-    model = model.to(cfg.device)
-
-    def generate(instruction):
-        if not instruction:
-            return
-        if prompter_module:
-            # pylint: disable=stop-iteration-return
-            prompt: str = next(
-                prompter_module().build_prompt(instruction=instruction.strip("\n"))
-            )
-        else:
-            prompt = instruction.strip()
-        batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
-
-        model.eval()
-        with torch.no_grad():
-            generation_config = GenerationConfig(
-                repetition_penalty=1.1,
-                max_new_tokens=1024,
-                temperature=0.9,
-                top_p=0.95,
-                top_k=40,
-                bos_token_id=tokenizer.bos_token_id,
-                eos_token_id=tokenizer.eos_token_id,
-                pad_token_id=tokenizer.pad_token_id,
-                do_sample=True,
-                use_cache=True,
-                return_dict_in_generate=True,
-                output_attentions=False,
-                output_hidden_states=False,
-                output_scores=False,
-            )
-            streamer = TextIteratorStreamer(tokenizer)
-            generation_kwargs = {
-                "inputs": batch["input_ids"].to(cfg.device),
-                "generation_config": generation_config,
-                "streamer": streamer,
-            }
-
-            thread = Thread(target=model.generate, kwargs=generation_kwargs)
-            thread.start()
-
-            all_text = ""
-
-            for new_text in streamer:
-                all_text += new_text
-                yield all_text
-
-    demo = gr.Interface(
-        fn=generate,
-        inputs="textbox",
-        outputs="text",
-        title=cfg.get("gradio_title", "Axolotl Gradio Interface"),
-    )
-    demo.queue().launch(show_api=False, share=True)
-
-
 def choose_config(path: Path):
    yaml_files = list(path.glob("*.yml"))

@@ -297,8 +209,6 @@ def load_cfg(config: Path = Path("examples/"), **kwargs):

    validate_config(cfg)

-    prepare_optim_env(cfg)
-
    normalize_config(cfg)

    setup_wandb_env_vars(cfg)
--- a/src/axolotl/cli/inference.py
+++ b/src/axolotl/cli/inference.py
@@ -6,16 +6,11 @@ from pathlib import Path
 import fire
 import transformers

-from axolotl.cli import (
-    do_inference,
-    do_inference_gradio,
-    load_cfg,
-    print_axolotl_text_art,
-)
+from axolotl.cli import do_inference, load_cfg, print_axolotl_text_art
 from axolotl.common.cli import TrainerCliArgs


-def do_cli(config: Path = Path("examples/"), gradio=False, **kwargs):
+def do_cli(config: Path = Path("examples/"), **kwargs):
    # pylint: disable=duplicate-code
    print_axolotl_text_art()
    parsed_cfg = load_cfg(config, **kwargs)
@@ -26,10 +21,7 @@ def do_cli(config: Path = Path("examples/"), gradio=False, **kwargs):
    )
    parsed_cli_args.inference = True

-    if gradio:
-        do_inference_gradio(cfg=parsed_cfg, cli_args=parsed_cli_args)
-    else:
-        do_inference(cfg=parsed_cfg, cli_args=parsed_cli_args)
+    do_inference(cfg=parsed_cfg, cli_args=parsed_cli_args)


 if __name__ == "__main__":
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -6,36 +6,33 @@ import abc
 import importlib
 import logging
 import math
+import os
 import sys
 from abc import abstractmethod
 from dataclasses import dataclass, field
 from functools import partial
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Union

 import torch
 import transformers
 from datasets import Dataset
 from torch.optim.lr_scheduler import OneCycleLR
-from torch.utils.data import BatchSampler, DataLoader, RandomSampler, SequentialSampler
+from torch.utils.data import DataLoader, DistributedSampler, SequentialSampler
 from transformers import EarlyStoppingCallback, Trainer, TrainingArguments
-from transformers.trainer_utils import seed_worker
+from transformers.trainer_pt_utils import SequentialDistributedSampler

 from axolotl.monkeypatch.relora import ReLoRACallback, ReLoRAScheduler
 from axolotl.utils.callbacks import (
    EvalFirstStepCallback,
    GPUStatsCallback,
-    LossWatchDogCallback,
    SaveAxolotlConfigtoWandBCallback,
    SaveBetterTransformerModelCallback,
    bench_eval_callback_factory,
    log_prediction_callback_factory,
 )
-from axolotl.utils.collators import (
-    BatchSamplerDataCollatorForSeq2Seq,
-    MambaDataCollator,
-)
-from axolotl.utils.samplers import MultipackBatchSampler
+from axolotl.utils.collators import DataCollatorForSeq2Seq
+from axolotl.utils.dataloader import MultipackDistributedDataloader
 from axolotl.utils.schedulers import get_cosine_schedule_with_quadratic_warmup

 try:
@@ -52,9 +49,6 @@ class AxolotlTrainingArguments(TrainingArguments):
    Extend the base TrainingArguments for axolotl helpers
    """

-    model_type: Optional[str] = field(
-        default=None, metadata={"help": "HF model configuration model_type."}
-    )
    lr_quadratic_warmup: bool = field(
        default=False,
        metadata={"help": "Use quadratic warmup for cosine scheduling."},
@@ -108,10 +102,6 @@ class AxolotlTrainingArguments(TrainingArguments):
    bench_source_max_len: int = field(
        default=2048, metadata={"help": "Maximum source sequence length for bench."}
    )
-    dataloader_prefetch_factor: Optional[int] = field(
-        default=None,
-        metadata={"help": "prefetch_factor argument to the dataloader"},
-    )


 class AxolotlTrainer(Trainer):
@@ -155,102 +145,70 @@ class AxolotlTrainer(Trainer):
        return self.lr_scheduler

    def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
-        if self.args.sample_packing:
-            return MultipackBatchSampler(
-                RandomSampler(self.train_dataset),
-                self.args.train_batch_size,
-                drop_last=True,
-                batch_max_len=self._train_batch_size * self.args.max_seq_length,
-                lengths=(
-                    self.train_dataset.data.column("position_ids")
-                    .to_pandas()
-                    .apply(lambda x: x[-1] + 1)
-                    .values
-                ),
-                packing_efficiency_estimate=self.args.sample_packing_efficiency,
+        if self.args.world_size > 1 and self.args.sample_packing:
+            return DistributedSampler(
+                self.train_dataset,
+                num_replicas=self.args.world_size,
+                rank=self.args.process_index,
+                seed=self.args.seed,
            )
        return super()._get_train_sampler()

    def _get_eval_sampler(
        self, eval_dataset: Dataset
    ) -> Optional[torch.utils.data.Sampler]:
-        if self.args.sample_packing and self.args.eval_sample_packing is not False:
-            return MultipackBatchSampler(
-                SequentialSampler(eval_dataset),
-                self.args.per_device_eval_batch_size,
-                drop_last=True,
-                batch_max_len=self.args.eval_batch_size * self.args.max_seq_length,
-                lengths=(
-                    eval_dataset.data.column("position_ids")
-                    .to_pandas()
-                    .apply(lambda x: x[-1] + 1)
-                    .values
-                ),
-                packing_efficiency_estimate=self.args.sample_packing_efficiency,
+        if (
+            self.args.world_size > 1
+            and self.args.sample_packing
+            and self.args.eval_sample_packing is not False
+        ):
+            return SequentialDistributedSampler(
+                eval_dataset,
+                num_replicas=self.args.world_size,
+                rank=self.args.process_index,
+                batch_size=self.args.per_device_eval_batch_size,
            )
        return super()._get_eval_sampler(eval_dataset)

-    def get_train_dataloader(self) -> DataLoader:
+    def get_train_dataloader(self) -> Union[DataLoader, MultipackDistributedDataloader]:
        if self.args.sample_packing:
-            train_dataset = self.train_dataset
-            train_dataset = train_dataset.remove_columns(["length"])
-            data_collator = self.data_collator
-            dataloader_params = {
-                "batch_size": self._train_batch_size,
-                "collate_fn": data_collator,
-                "num_workers": self.args.dataloader_num_workers,
-                "pin_memory": self.args.dataloader_pin_memory,
-            }
-            if self.args.dataloader_prefetch_factor:
-                dataloader_params[
-                    "prefetch_factor"
-                ] = self.args.dataloader_prefetch_factor
-
-            sampler = self._get_train_sampler()
-            if isinstance(sampler, BatchSampler):
-                dataloader_params["batch_sampler"] = sampler
-                del dataloader_params["batch_size"]
-            else:
-                dataloader_params["sampler"] = sampler
-                dataloader_params["drop_last"] = self.args.dataloader_drop_last
-            dataloader_params["worker_init_fn"] = seed_worker
-
-            self.accelerator.even_batches = False
-            return self.accelerator.prepare_data_loader(
-                DataLoader(train_dataset, **dataloader_params)
+            train_sampler = self._get_train_sampler()
+            return self.accelerator.prepare(
+                MultipackDistributedDataloader(
+                    self.train_dataset,
+                    batch_size=self._train_batch_size,
+                    seq_max_length=self.args.max_seq_length,
+                    collate_fn=self.data_collator,
+                    sampler=train_sampler,
+                    packing_efficiency_estimate=self.args.sample_packing_efficiency,
+                    sample_packing_seq_len_multiplier=self.args.sample_packing_seq_len_multiplier,
+                    device_count=int(os.environ.get("WORLD_SIZE", 1)),
+                    num_epochs=self.num_epochs,
+                )
            )
        return super().get_train_dataloader()

-    def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader:
+    def get_eval_dataloader(
+        self, eval_dataset: Optional[Dataset] = None
+    ) -> Union[DataLoader, MultipackDistributedDataloader]:
        if self.args.sample_packing and self.args.eval_sample_packing is not False:
            eval_dataset = (
                eval_dataset if eval_dataset is not None else self.eval_dataset
            )

            eval_sampler = self._get_eval_sampler(eval_dataset)
-            eval_dataset = eval_dataset.remove_columns(["length"])
-            data_collator = self.data_collator
-            dataloader_params = {
-                "batch_size": self.args.eval_batch_size,
-                "collate_fn": data_collator,
-                "num_workers": self.args.dataloader_num_workers,
-                "pin_memory": self.args.dataloader_pin_memory,
-            }
-            if self.args.dataloader_prefetch_factor:
-                dataloader_params[
-                    "prefetch_factor"
-                ] = self.args.dataloader_prefetch_factor
-
-            if isinstance(eval_sampler, BatchSampler):
-                dataloader_params["batch_sampler"] = eval_sampler
-                del dataloader_params["batch_size"]
-            else:
-                dataloader_params["sampler"] = eval_sampler
-                dataloader_params["drop_last"] = self.args.dataloader_drop_last
-
-            self.accelerator.even_batches = False
-            return self.accelerator.prepare_data_loader(
-                DataLoader(eval_dataset, **dataloader_params)
+            return self.accelerator.prepare(
+                MultipackDistributedDataloader(
+                    eval_dataset,
+                    batch_size=self.args.eval_batch_size,
+                    seq_max_length=self.args.max_seq_length,
+                    collate_fn=self.data_collator,
+                    sampler=eval_sampler,
+                    packing_efficiency_estimate=self.args.sample_packing_efficiency,
+                    sample_packing_seq_len_multiplier=self.args.eval_batch_size,
+                    device_count=int(os.environ.get("WORLD_SIZE", 1)),
+                    num_epochs=self.num_epochs,
+                )
            )
        return super().get_eval_dataloader(eval_dataset)

@@ -264,15 +222,13 @@ class AxolotlTrainer(Trainer):
    def get_bench_dataloader(
        self,
        bench_dataset: Dataset,
-    ) -> DataLoader:
+    ) -> Union[DataLoader, MultipackDistributedDataloader]:
        dataloader_params = {
            "batch_size": self.args.eval_batch_size,
            "collate_fn": self.bench_data_collator,
            "num_workers": self.args.dataloader_num_workers,
            "pin_memory": self.args.dataloader_pin_memory,
        }
-        if self.args.dataloader_prefetch_factor:
-            dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor

        if not isinstance(bench_dataset, torch.utils.data.IterableDataset):
            dataloader_params["sampler"] = self._get_bench_sampler(bench_dataset)
@@ -291,32 +247,6 @@ class AxolotlTrainer(Trainer):
        return super().compute_loss(model, inputs, return_outputs=return_outputs)


-class AxolotlMambaTrainer(AxolotlTrainer):
-    """
-    Mamba specific trainer to handle loss calculation
-    """
-
-    def compute_loss(
-        self,
-        model,
-        inputs,
-        return_outputs=False,  # pylint: disable=unused-argument
-    ):
-        input_ids = inputs.pop("input_ids")
-        lm_logits = model(input_ids).logits
-
-        labels = input_ids.to(lm_logits.device)
-        shift_logits = lm_logits[:, :-1, :].contiguous()
-        labels = labels[:, 1:].contiguous()
-
-        loss_fct = torch.nn.CrossEntropyLoss()
-        lm_loss = loss_fct(
-            shift_logits.view(-1, shift_logits.size(-1)), labels.view(-1)
-        )
-
-        return lm_loss
-
-
 class OneCycleLRSchedulerTrainer(AxolotlTrainer):
    """
    Trainer subclass that uses the OneCycleLR scheduler
@@ -463,9 +393,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
                SaveAxolotlConfigtoWandBCallback(self.cfg.axolotl_config_path)
            )

-        if self.cfg.loss_watchdog_threshold is not None:
-            callbacks.append(LossWatchDogCallback(self.cfg))
-
        return callbacks

    def get_post_trainer_create_callbacks(self, trainer):
@@ -494,19 +421,14 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
            return OneCycleLRSchedulerTrainer
        if self.cfg.relora_steps:
            return ReLoRATrainer
-        if self.cfg.model_config_type == "mamba":
-            return AxolotlMambaTrainer
        return AxolotlTrainer

    def build(self, total_num_steps):
-        warmup_steps = None
-        if self.cfg.warmup_steps is not None:
-            warmup_steps = self.cfg.warmup_steps
-        elif self.cfg.warmup_ratio is not None:
-            warmup_steps = max(int(self.cfg.warmup_ratio * total_num_steps), 0)
-        else:
-            warmup_steps = min(int(0.03 * total_num_steps), 100)
-
+        warmup_steps = (
+            self.cfg.warmup_steps
+            if self.cfg.warmup_steps is not None
+            else min(int(0.03 * total_num_steps), 100)
+        )
        logging_steps = (
            self.cfg.logging_steps
            if self.cfg.logging_steps is not None
@@ -563,7 +485,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
            if self.cfg.hub_strategy:
                training_arguments_kwargs["hub_strategy"] = self.cfg.hub_strategy

-        if self.cfg.save_safetensors is not None:
+        if self.cfg.save_safetensors:
            training_arguments_kwargs["save_safetensors"] = self.cfg.save_safetensors

        if self.cfg.sample_packing_eff_est:
@@ -571,29 +493,16 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
                "sample_packing_efficiency"
            ] = self.cfg.sample_packing_eff_est

-        if self.cfg.dataloader_pin_memory is not None:
-            training_arguments_kwargs[
-                "dataloader_pin_memory"
-            ] = self.cfg.dataloader_pin_memory
-        if self.cfg.dataloader_num_workers is not None:
-            training_arguments_kwargs[
-                "dataloader_num_workers"
-            ] = self.cfg.dataloader_num_workers
-        if self.cfg.dataloader_prefetch_factor is not None:
-            training_arguments_kwargs[
-                "dataloader_prefetch_factor"
-            ] = self.cfg.dataloader_prefetch_factor
-
-        if self.cfg.val_set_size == 0:
-            # no eval set, so don't eval
-            training_arguments_kwargs["evaluation_strategy"] = "no"
-        elif self.cfg.eval_steps:
+        if self.cfg.eval_steps:
            training_arguments_kwargs["evaluation_strategy"] = "steps"
            training_arguments_kwargs["eval_steps"] = self.cfg.eval_steps
        elif self.cfg.evaluation_strategy:
            training_arguments_kwargs[
                "evaluation_strategy"
            ] = self.cfg.evaluation_strategy
+        elif self.cfg.val_set_size == 0:
+            # no eval set, so don't eval
+            training_arguments_kwargs["evaluation_strategy"] = "no"
        else:
            # we have an eval set, but no steps defined, default to use epoch
            training_arguments_kwargs["evaluation_strategy"] = "epoch"
@@ -681,7 +590,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        training_arguments_kwargs["group_by_length"] = self.cfg.group_by_length
        training_arguments_kwargs["report_to"] = "wandb" if self.cfg.use_wandb else None
        training_arguments_kwargs["run_name"] = (
-            self.cfg.wandb_name if self.cfg.use_wandb else None
+            self.cfg.wandb_run_id if self.cfg.use_wandb else None
        )
        training_arguments_kwargs["optim"] = (
            self.cfg.optimizer if self.cfg.optimizer else "adamw_hf"
@@ -699,9 +608,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
            self.cfg.sample_packing if self.cfg.sample_packing else False
        )
        training_arguments_kwargs["eval_sample_packing"] = (
-            self.cfg.sample_packing
-            if self.cfg.eval_sample_packing is not False
-            else False
+            self.cfg.sample_packing if self.cfg.sample_packing else False
        )
        training_arguments_kwargs[
            "sample_packing_seq_len_multiplier"
@@ -711,7 +618,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        training_arguments_kwargs = self.hook_pre_create_training_args(
            training_arguments_kwargs
        )
-        training_arguments_kwargs["model_type"] = self.cfg.model_config_type
        training_args = (
            AxolotlTrainingArguments(  # pylint: disable=unexpected-keyword-arg
                **training_arguments_kwargs,
@@ -766,7 +672,11 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            args=training_args,
-            data_collator=self.build_collator(**data_collator_kwargs),
+            data_collator=DataCollatorForSeq2Seq(
+                self.tokenizer,
+                return_tensors="pt",
+                **data_collator_kwargs,
+            ),
            bench_data_collator=transformers.DataCollatorForSeq2Seq(
                self.tokenizer,
                return_tensors="pt",
@@ -780,19 +690,4 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        for callback in self.get_post_trainer_create_callbacks(trainer):
            trainer.add_callback(callback)

-        if self.cfg.deepspeed and self.cfg.sample_packing:
-            trainer.accelerator.state.deepspeed_plugin.deepspeed_config[
-                "train_micro_batch_size_per_gpu"
-            ] = self.cfg.micro_batch_size
-
        return trainer
-
-    def build_collator(self, **kwargs):
-        if self.cfg.model_config_type == "mamba":
-            return MambaDataCollator(tokenizer=self.tokenizer)
-
-        return BatchSamplerDataCollatorForSeq2Seq(
-            self.tokenizer,
-            return_tensors="pt",
-            **kwargs,
-        )
--- a/src/axolotl/datasets.py
+++ b/src/axolotl/datasets.py
@@ -2,7 +2,7 @@

 import logging
 import os
-from typing import List, Optional
+from typing import List

 import torch
 from datasets import Dataset, IterableDataset
@@ -30,20 +30,14 @@ class TokenizedPromptDataset(Dataset):
        self,
        prompt_tokenizer: PromptTokenizingStrategy,
        dataset: IterableDataset,
-        process_count: Optional[int] = None,
        **kwargs,
    ):
        self.prompt_tokenizer = prompt_tokenizer
-        self.process_count = process_count
        super().__init__(self.process(dataset).data, **kwargs)

    def process(self, dataset):
        features = dataset.features.keys()
-        num_proc = (
-            min(64, self.process_count)
-            if self.process_count
-            else min(64, os.cpu_count())
-        )
+        num_proc = min(64, os.cpu_count())
        map_kwargs = {}
        if self.prompt_tokenizer.supports_batched:
            map_kwargs["batched"] = True
--- a/src/axolotl/models/mamba/__init__.py
+++ b/src/axolotl/models/mamba/__init__.py
@@ -1,12 +0,0 @@
-"""
-Modeling module for Mamba models
-"""
-
-
-def fix_mamba_attn_for_loss():
-    from mamba_ssm.models import mixer_seq_simple
-
-    from .modeling_mamba import MambaLMHeadModel as MambaLMHeadModelFixed
-
-    mixer_seq_simple.MambaLMHeadModel = MambaLMHeadModelFixed
-    return mixer_seq_simple.MambaLMHeadModel  # pylint: disable=invalid-name
--- a/src/axolotl/models/mamba/configuration_mamba.py
+++ b/src/axolotl/models/mamba/configuration_mamba.py
@@ -1,42 +0,0 @@
-"""
-HF Transformers MambaConfig
-"""
-from transformers import PretrainedConfig
-
-
-class MambaConfig(PretrainedConfig):
-    """
-    modeling configuration for state space model/mamba
-    """
-
-    model_type = "mamba"
-
-    def __init__(
-        self,
-        vocab_size=50280,
-        d_model=2560,
-        n_layer=64,
-        rms_norm=True,
-        residual_in_fp32=True,
-        fused_add_norm=True,
-        pad_vocab_size_multiple=8,
-        pad_token_id=50277,
-        bos_token_id=0,
-        eos_token_id=0,
-        tie_word_embeddings=False,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.d_model = d_model
-        self.n_layer = n_layer
-        self.rms_norm = rms_norm
-        self.residual_in_fp32 = residual_in_fp32
-        self.fused_add_norm = fused_add_norm
-        self.pad_vocab_size_multiple = pad_vocab_size_multiple
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
--- a/src/axolotl/models/mamba/modeling_mamba.py
+++ b/src/axolotl/models/mamba/modeling_mamba.py
@@ -1,128 +0,0 @@
-# pylint: skip-file
-import os
-from collections import namedtuple
-from functools import partial
-from typing import Optional, Union
-
-import torch
-from mamba_ssm.models.mixer_seq_simple import MixerModel, _init_weights
-from mamba_ssm.utils.generation import GenerationMixin
-from mamba_ssm.utils.hf import load_config_hf, load_state_dict_hf
-from torch import nn
-from torch.nn import CrossEntropyLoss
-
-from axolotl.models.mamba.configuration_mamba import MambaConfig
-
-
-class MambaLMHeadModel(nn.Module, GenerationMixin):
-    def __init__(
-        self,
-        d_model: int,
-        n_layer: int,
-        vocab_size: int,
-        initializer_cfg=None,
-        pad_vocab_size_multiple: int = 1,
-        device=None,
-        dtype=None,
-        **backbone_kwargs,
-    ) -> None:
-        factory_kwargs = {"device": device, "dtype": dtype}
-        super().__init__()
-        if vocab_size % pad_vocab_size_multiple != 0:
-            vocab_size += pad_vocab_size_multiple - (
-                vocab_size % pad_vocab_size_multiple
-            )
-        self.config = MambaConfig(
-            vocab_size=vocab_size,
-            d_model=d_model,
-            n_layer=n_layer,
-            pad_vocab_size_multiple=pad_vocab_size_multiple,
-        )
-        self.backbone = MixerModel(
-            d_model=d_model,
-            n_layer=n_layer,
-            vocab_size=vocab_size,
-            initializer_cfg=initializer_cfg,
-            **backbone_kwargs,
-            **factory_kwargs,
-        )
-        self.lm_head = nn.Linear(d_model, vocab_size, bias=False, **factory_kwargs)
-
-        # Initialize weights and apply final processing
-        self.apply(
-            partial(
-                _init_weights,
-                n_layer=n_layer,
-                **(initializer_cfg if initializer_cfg is not None else {}),
-            )
-        )
-        self.tie_weights()
-
-    def tie_weights(self):
-        self.lm_head.weight = self.backbone.embedding.weight
-
-    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
-        return self.backbone.allocate_inference_cache(
-            batch_size, max_seqlen, dtype=dtype, **kwargs
-        )
-
-    def forward(
-        self,
-        input_ids,
-        position_ids=None,
-        inference_params=None,
-        num_last_tokens=0,
-        labels=None,
-        **kwargs,
-    ):
-        """
-        "position_ids" is just to be compatible with Transformer generation. We don't use it.
-        num_last_tokens: if > 0, only return the logits for the last n tokens
-        """
-        hidden_states = self.backbone(input_ids, inference_params=inference_params)
-        if num_last_tokens > 0:
-            hidden_states = hidden_states[:, -num_last_tokens:]
-        lm_logits = self.lm_head(hidden_states)
-
-        CausalLMOutput = namedtuple("CausalLMOutput", ["logits"])
-        return CausalLMOutput(logits=lm_logits)
-
-        loss = None
-        if labels is not None:
-            logits = lm_logits
-            # Shift so that tokens < n predict n
-            shift_logits = logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss()
-            shift_logits = shift_logits.view(-1, self.config.vocab_size)
-            shift_labels = shift_labels.view(-1)
-            # Enable model parallelism
-            shift_labels = shift_labels.to(shift_logits.device)
-            loss = loss_fct(shift_logits, shift_labels)
-            CausalLMOutput = namedtuple("CausalLMOutput", ["logits", "loss"])
-            print(loss)
-            return CausalLMOutput(logits=lm_logits, loss=loss)
-
-        else:
-            CausalLMOutput = namedtuple("CausalLMOutput", ["logits"])
-            return CausalLMOutput(logits=lm_logits)
-
-    def save_pretrained(
-        self,
-        save_directory: Union[str, os.PathLike],
-        state_dict: Optional[dict] = None,
-        safe_serialization: Optional[bool] = None,  # pylint: disable=unused-argument
-    ):
-        if state_dict is None:
-            state_dict = self.state_dict()
-        torch.save(state_dict, os.path.join(save_directory, "pytorch_model.bin"))
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name, device=None, dtype=None, **kwargs):
-        config = load_config_hf(pretrained_model_name)
-        model = cls(**config, device=device, dtype=dtype, **kwargs)
-        model.load_state_dict(
-            load_state_dict_hf(pretrained_model_name, device={"": device}, dtype=dtype)
-        )
-        return model
--- a/src/axolotl/models/phi/__init__.py
+++ b/src/axolotl/models/phi/__init__.py
@@ -3,6 +3,4 @@ MixFormers model architecture used for phi models
 """

 from .configuration_mixformer_sequential import MixFormerSequentialConfig  # noqa
-from .configuration_phi import PhiConfig  # noqa
 from .modeling_mixformer_sequential import MixFormerSequentialForCausalLM  # noqa
-from .modeling_phi import PhiForCausalLM  # noqa
--- a/src/axolotl/models/phi/configuration_phi.py
+++ b/src/axolotl/models/phi/configuration_phi.py
@@ -1,65 +0,0 @@
-# pylint: skip-file
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-import math
-from typing import Optional
-
-from transformers import PretrainedConfig
-
-
-class PhiConfig(PretrainedConfig):
-    """Phi configuration."""
-
-    model_type = "phi"
-    attribute_map = {
-        "max_position_embeddings": "n_positions",
-        "hidden_size": "n_embd",
-        "num_attention_heads": "n_head",
-        "num_hidden_layers": "n_layer",
-    }
-
-    def __init__(
-        self,
-        vocab_size: int = 50304,
-        n_positions: int = 2048,
-        n_embd: int = 1024,
-        n_layer: int = 20,
-        n_inner: Optional[int] = None,
-        n_head: int = 16,
-        n_head_kv: Optional[int] = None,
-        rotary_dim: Optional[int] = 32,
-        activation_function: Optional[str] = "gelu_new",
-        flash_attn: bool = False,
-        flash_rotary: bool = False,
-        fused_dense: bool = False,
-        attn_pdrop: float = 0.0,
-        embd_pdrop: float = 0.0,
-        resid_pdrop: float = 0.0,
-        layer_norm_epsilon: float = 1e-5,
-        initializer_range: float = 0.02,
-        tie_word_embeddings: bool = False,
-        pad_vocab_size_multiple: int = 64,
-        **kwargs
-    ) -> None:
-        self.vocab_size = int(
-            math.ceil(vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
-        )
-        self.n_positions = n_positions
-        self.n_embd = n_embd
-        self.n_layer = n_layer
-        self.n_inner = n_inner
-        self.n_head = n_head
-        self.n_head_kv = n_head_kv
-        self.rotary_dim = min(rotary_dim, n_embd // n_head)
-        self.activation_function = activation_function
-        self.flash_attn = flash_attn
-        self.flash_rotary = flash_rotary
-        self.fused_dense = fused_dense
-        self.attn_pdrop = attn_pdrop
-        self.embd_pdrop = embd_pdrop
-        self.resid_pdrop = resid_pdrop
-        self.layer_norm_epsilon = layer_norm_epsilon
-        self.initializer_range = initializer_range
-
-        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
--- a/src/axolotl/models/phi/modeling_phi.py
+++ b/src/axolotl/models/phi/modeling_phi.py
--- a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
@@ -321,8 +321,6 @@ def flashattn_forward(
        # only on first autoregressive step q,k,v have same seqlen
        is_causal = key_states.shape == query_states.shape

-    dropout_rate = 0.0 if not self.training else getattr(self, "attention_dropout", 0.0)
-
    if cu_seqlens is not None and max_seqlen is not None and cu_seqlens.dim() == 1:
        # special handling using sample packing
        qkv = torch.stack(
@@ -332,12 +330,7 @@ def flashattn_forward(
        qkv = rearrange(qkv, "b s ... -> (b s) ...")

        output = flash_attn_varlen_qkvpacked_func(
-            qkv,
-            cu_seqlens,
-            max_seqlen,
-            dropout_p=dropout_rate,
-            softmax_scale=None,
-            causal=True,
+            qkv, cu_seqlens, max_seqlen, 0.0, softmax_scale=None, causal=True
        )
        output = rearrange(output, "(b s) ... -> b s ...", b=bsz)
    elif query_states.shape == key_states.shape:
@@ -360,7 +353,7 @@ def flashattn_forward(
            qkv_unpad,
            cu_seqlens_q,
            max_seqlen_q,
-            dropout_p=dropout_rate,
+            0.0,
            softmax_scale=None,
            causal=is_causal,
        )
@@ -373,7 +366,6 @@ def flashattn_forward(
            output = flash_attn_kvpacked_func(
                query_states,
                torch.stack([key_states, value_states], 2),
-                dropout_p=dropout_rate,
                causal=is_causal,
            )
        else:
@@ -406,7 +398,7 @@ def flashattn_forward(
                cu_seqlens_k,
                max_seqlen_q,
                max_seqlen_k,
-                dropout_p=dropout_rate,
+                0.0,
                softmax_scale=None,
                causal=is_causal,
            )
--- a/src/axolotl/monkeypatch/llama_attn_hijack_sdp.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_sdp.py
@@ -25,8 +25,6 @@ def sdp_attention_forward(
    past_key_value: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
-    padding_mask: Optional[torch.LongTensor] = None,  # pylint: disable=unused-argument
-    **kwargs,  # pylint: disable=unused-argument
 ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    # pylint: disable=duplicate-code
    bsz, q_len, _ = hidden_states.size()
--- a/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py
@@ -29,8 +29,6 @@ def xformers_forward(
    past_key_value: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
-    padding_mask: Optional[torch.LongTensor] = None,  # pylint: disable=unused-argument
-    **kwargs,  # pylint: disable=unused-argument
 ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    # pylint: disable=duplicate-code
    bsz, q_len, _ = hidden_states.size()
--- a/src/axolotl/monkeypatch/mistral_attn_hijack_flash.py
+++ b/src/axolotl/monkeypatch/mistral_attn_hijack_flash.py
@@ -201,8 +201,6 @@ def flashattn_forward(
        # only on first autoregressive step q,k,v have same seqlen
        is_causal = key_states.shape == query_states.shape

-    dropout_rate = 0.0 if not self.training else getattr(self, "attention_dropout", 0.0)
-
    if cu_seqlens is not None and max_seqlen is not None and cu_seqlens.dim() == 1:
        # special handling using sample packing
        qkv = torch.stack(
@@ -215,7 +213,7 @@ def flashattn_forward(
            qkv,
            cu_seqlens,
            max_seqlen,
-            dropout_p=dropout_rate,
+            0.0,
            softmax_scale=None,
            causal=True,
            window_size=window_size,
@@ -241,7 +239,7 @@ def flashattn_forward(
            qkv_unpad,
            cu_seqlens_q,
            max_seqlen_q,
-            dropout_p=dropout_rate,
+            0.0,
            softmax_scale=None,
            causal=is_causal,
            window_size=window_size,
@@ -255,7 +253,6 @@ def flashattn_forward(
            output = flash_attn_kvpacked_func(
                query_states,
                torch.stack([key_states, value_states], 2),
-                dropout_p=dropout_rate,
                causal=is_causal,
                window_size=window_size,
            )
@@ -289,7 +286,7 @@ def flashattn_forward(
                cu_seqlens_k,
                max_seqlen_q,
                max_seqlen_k,
-                dropout_p=dropout_rate,
+                0.0,
                softmax_scale=None,
                causal=is_causal,
                window_size=window_size,
--- a/src/axolotl/monkeypatch/mixtral/__init__.py
+++ b/src/axolotl/monkeypatch/mixtral/__init__.py
@@ -1,22 +0,0 @@
-"""
-Patches to support multipack for mixtral
-"""
-import transformers
-
-
-def replace_mixtral_attn_with_multipack_flash_attn():
-    from .modeling_mixtral import (
-        MixtralMultipackFlashAttention2,
-        mixtral_decoder_layer_forward,
-        mixtral_model_forward,
-    )
-
-    transformers.models.mixtral.modeling_mixtral.MixtralDecoderLayer.forward = (
-        mixtral_decoder_layer_forward
-    )
-    transformers.models.mixtral.modeling_mixtral.MixtralModel.forward = (
-        mixtral_model_forward
-    )
-    transformers.models.mixtral.modeling_mixtral.MISTRAL_ATTENTION_CLASSES[
-        "flash_attention_2"
-    ] = MixtralMultipackFlashAttention2
--- a/src/axolotl/monkeypatch/mixtral/modeling_mixtral.py
+++ b/src/axolotl/monkeypatch/mixtral/modeling_mixtral.py
@@ -1,379 +0,0 @@
-"""
-Mixtral modeling for multipack
-"""
-# pylint: disable=missing-module-docstring,unused-argument,protected-access,pointless-string-statement,duplicate-code
-import logging
-import warnings
-from typing import List, Optional, Tuple, Union
-
-import torch
-from einops import rearrange
-from flash_attn import flash_attn_varlen_qkvpacked_func
-from transformers import Cache, DynamicCache
-from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
-from transformers.modeling_outputs import MoeModelOutputWithPast
-from transformers.models.mixtral.modeling_mixtral import (
-    MixtralFlashAttention2,
-    apply_rotary_pos_emb,
-    repeat_kv,
-)
-
-from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids
-
-LOG = logging.getLogger("axolotl.monkeypatch.mixtral")
-
-
-class MixtralMultipackFlashAttention2(MixtralFlashAttention2):
-    """
-    Custom multipack implementation w flash attention 2
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self._flash_attn_uses_top_left_mask = True
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Cache] = None,
-        output_attentions: bool = False,
-        use_cache: bool = False,
-        cu_seqlens: Optional[torch.Tensor] = None,
-        max_seqlen: Optional[torch.Tensor] = None,
-        **kwargs,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        if "padding_mask" in kwargs:
-            warnings.warn(
-                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
-            )
-        bsz, q_len, _ = hidden_states.size()
-
-        query_states = self.q_proj(hidden_states)
-        key_states = self.k_proj(hidden_states)
-        value_states = self.v_proj(hidden_states)
-
-        query_states = query_states.view(
-            bsz, q_len, self.num_heads, self.head_dim
-        ).transpose(1, 2)
-        key_states = key_states.view(
-            bsz, q_len, self.num_key_value_heads, self.head_dim
-        ).transpose(1, 2)
-        value_states = value_states.view(
-            bsz, q_len, self.num_key_value_heads, self.head_dim
-        ).transpose(1, 2)
-
-        kv_seq_len = key_states.shape[-2]
-        if past_key_value is not None:
-            if self.layer_idx is None:
-                raise ValueError(
-                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
-                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
-                    "with a layer index."
-                )
-            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
-        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-        query_states, key_states = apply_rotary_pos_emb(
-            query_states, key_states, cos, sin, position_ids
-        )
-
-        if past_key_value is not None:
-            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
-            key_states, value_states = past_key_value.update(
-                key_states, value_states, self.layer_idx, cache_kwargs
-            )
-
-        # repeat k/v heads if n_kv_heads < n_heads
-        key_states = repeat_kv(key_states, self.num_key_value_groups)
-        value_states = repeat_kv(value_states, self.num_key_value_groups)
-
-        if cu_seqlens is not None and max_seqlen is not None and cu_seqlens.dim() == 1:
-            # special handling using sample packing
-            qkv = torch.stack(
-                [query_states, key_states, value_states], dim=2
-            )  # [bsz, nh, 3, q_len, hd]
-            qkv = qkv.transpose(1, 3)  # [bsz, q_len, 3, nh, hd]
-            qkv = rearrange(qkv, "b s ... -> (b s) ...")
-
-            attn_output = flash_attn_varlen_qkvpacked_func(
-                qkv,
-                cu_seqlens,
-                max_seqlen,
-                dropout_p=self.attention_dropout,
-                softmax_scale=None,
-                causal=True,
-            )
-            attn_output = rearrange(attn_output, "(b s) ... -> b s ...", b=bsz)
-
-        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
-        attn_output = self.o_proj(attn_output)
-
-        if not output_attentions:
-            attn_weights = None
-
-        return attn_output, attn_weights, past_key_value
-
-
-def mixtral_decoder_layer_forward(
-    self,
-    hidden_states: torch.Tensor,
-    attention_mask: Optional[torch.Tensor] = None,
-    position_ids: Optional[torch.LongTensor] = None,
-    past_key_value: Optional[Tuple[torch.Tensor]] = None,
-    output_attentions: Optional[bool] = False,
-    output_router_logits: Optional[bool] = False,
-    use_cache: Optional[bool] = False,
-    cu_seqlens: Optional[torch.Tensor] = None,
-    max_seqlen: Optional[torch.Tensor] = None,
-    **kwargs,
-) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
-    if "padding_mask" in kwargs:
-        warnings.warn(
-            "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
-        )
-    """
-    Args:
-        hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
-        attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
-            `(batch, sequence_length)` where padding elements are indicated by 0.
-        past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-            returned tensors for more detail.
-        output_router_logits (`bool`, *optional*):
-            Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
-            should not be returned during inference.
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
-            (see `past_key_values`).
-    """
-
-    residual = hidden_states
-
-    hidden_states = self.input_layernorm(hidden_states)
-
-    # Self Attention
-    hidden_states, self_attn_weights, present_key_value = self.self_attn(
-        hidden_states=hidden_states,
-        attention_mask=attention_mask,
-        position_ids=position_ids,
-        past_key_value=past_key_value,
-        output_attentions=output_attentions,
-        use_cache=use_cache,
-        cu_seqlens=cu_seqlens,
-        max_seqlen=max_seqlen,
-    )
-    hidden_states = residual + hidden_states
-
-    # Fully Connected
-    residual = hidden_states
-    hidden_states = self.post_attention_layernorm(hidden_states)
-    hidden_states, router_logits = self.block_sparse_moe(hidden_states)
-    hidden_states = residual + hidden_states
-
-    outputs = (hidden_states,)
-
-    if output_attentions:
-        outputs += (self_attn_weights,)
-
-    if use_cache:
-        outputs += (present_key_value,)
-
-    if output_router_logits:
-        outputs += (router_logits,)
-
-    return outputs
-
-
-def mixtral_model_forward(
-    self,
-    input_ids: torch.LongTensor = None,
-    attention_mask: Optional[torch.Tensor] = None,
-    position_ids: Optional[torch.LongTensor] = None,
-    past_key_values: Optional[List[torch.FloatTensor]] = None,
-    inputs_embeds: Optional[torch.FloatTensor] = None,
-    use_cache: Optional[bool] = None,
-    output_attentions: Optional[bool] = None,
-    output_hidden_states: Optional[bool] = None,
-    output_router_logits: Optional[bool] = None,
-    return_dict: Optional[bool] = None,
-) -> Union[Tuple, MoeModelOutputWithPast]:
-    output_attentions = (
-        output_attentions
-        if output_attentions is not None
-        else self.config.output_attentions
-    )
-    output_router_logits = (
-        output_router_logits
-        if output_router_logits is not None
-        else self.config.output_router_logits
-    )
-    output_hidden_states = (
-        output_hidden_states
-        if output_hidden_states is not None
-        else self.config.output_hidden_states
-    )
-    use_cache = use_cache if use_cache is not None else self.config.use_cache
-
-    return_dict = (
-        return_dict if return_dict is not None else self.config.use_return_dict
-    )
-
-    # retrieve input_ids and inputs_embeds
-    if input_ids is not None and inputs_embeds is not None:
-        raise ValueError(
-            "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
-        )
-    if input_ids is not None:
-        batch_size, seq_length = input_ids.shape
-    elif inputs_embeds is not None:
-        batch_size, seq_length, _ = inputs_embeds.shape
-    else:
-        raise ValueError(
-            "You have to specify either decoder_input_ids or decoder_inputs_embeds"
-        )
-
-    past_key_values_length = 0
-
-    if use_cache:
-        use_legacy_cache = not isinstance(past_key_values, Cache)
-        if use_legacy_cache:
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-        past_key_values_length = past_key_values.get_usable_length(seq_length)
-
-    cu_seqlens = None
-    max_seqlen = None
-    if position_ids is None:
-        device = input_ids.device if input_ids is not None else inputs_embeds.device
-        position_ids = torch.arange(
-            past_key_values_length,
-            seq_length + past_key_values_length,
-            dtype=torch.long,
-            device=device,
-        )
-        position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
-    else:
-        position_ids = position_ids.view(-1, seq_length).long()
-        cu_seqlens, max_seqlen = get_cu_seqlens_from_pos_ids(position_ids)
-        cu_seqlens = cu_seqlens.squeeze()
-
-    if inputs_embeds is None:
-        inputs_embeds = self.embed_tokens(input_ids)
-
-    if attention_mask is not None and self._use_flash_attention_2 and use_cache:
-        is_padding_right = attention_mask[:, -1].sum().item() != batch_size
-        if is_padding_right:
-            raise ValueError(
-                "You are attempting to perform batched generation with padding_side='right'"
-                " this may lead to unexpected behaviour for Flash Attention version of Mixtral. Make sure to "
-                " call `tokenizer.padding_side  = 'left'` before tokenizing the input. "
-            )
-
-    if self._use_flash_attention_2:
-        # 2d mask is passed through the layers
-        attention_mask = (
-            attention_mask
-            if (attention_mask is not None and 0 in attention_mask)
-            else None
-        )
-    else:
-        # 4d mask is passed through the layers
-        attention_mask = _prepare_4d_causal_attention_mask(
-            attention_mask,
-            (batch_size, seq_length),
-            inputs_embeds,
-            past_key_values_length,
-            sliding_window=self.config.sliding_window,
-        )
-
-    hidden_states = inputs_embeds
-
-    if self.gradient_checkpointing and self.training:
-        if use_cache:
-            LOG.warning_once(
-                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-            )
-            use_cache = False
-
-    # decoder layers
-    all_hidden_states = () if output_hidden_states else None
-    all_self_attns = () if output_attentions else None
-    all_router_logits = () if output_router_logits else None
-    next_decoder_cache = None
-
-    for decoder_layer in self.layers:
-        if output_hidden_states:
-            all_hidden_states += (hidden_states,)
-
-        if self.gradient_checkpointing and self.training:
-            layer_outputs = self._gradient_checkpointing_func(
-                decoder_layer.__call__,
-                hidden_states,
-                attention_mask,
-                position_ids,
-                past_key_values,
-                output_attentions,
-                output_router_logits,
-                use_cache,
-                cu_seqlens,
-                max_seqlen,
-            )
-        else:
-            layer_outputs = decoder_layer(
-                hidden_states,
-                attention_mask=attention_mask,
-                position_ids=position_ids,
-                past_key_value=past_key_values,
-                output_attentions=output_attentions,
-                output_router_logits=output_router_logits,
-                use_cache=use_cache,
-                cu_seqlens=cu_seqlens,
-                max_seqlen=max_seqlen,
-            )
-
-        hidden_states = layer_outputs[0]
-
-        if use_cache:
-            next_decoder_cache = layer_outputs[2 if output_attentions else 1]
-
-        if output_attentions:
-            all_self_attns += (layer_outputs[1],)
-
-        if output_router_logits:
-            all_router_logits += (layer_outputs[-1],)
-
-    hidden_states = self.norm(hidden_states)
-
-    # add hidden states from the last decoder layer
-    if output_hidden_states:
-        all_hidden_states += (hidden_states,)
-
-    next_cache = None
-    if use_cache:
-        next_cache = (
-            next_decoder_cache.to_legacy_cache()
-            if use_legacy_cache
-            else next_decoder_cache
-        )
-
-    if not return_dict:
-        return tuple(
-            v
-            for v in [
-                hidden_states,
-                next_cache,
-                all_hidden_states,
-                all_self_attns,
-                all_router_logits,
-            ]
-            if v is not None
-        )
-
-    return MoeModelOutputWithPast(
-        last_hidden_state=hidden_states,
-        past_key_values=next_cache,
-        hidden_states=all_hidden_states,
-        attentions=all_self_attns,
-        router_logits=all_router_logits,
-    )
--- a/src/axolotl/prompt_strategies/llama2_chat.py
+++ b/src/axolotl/prompt_strategies/llama2_chat.py
@@ -81,9 +81,8 @@ class LLama2ChatTokenizingStrategy(PromptTokenizingStrategy):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
-        self.tokenizer.add_special_tokens(
-            {"pad_token": getattr(self.tokenizer, "pad_token", "<pad>")}
-        )
+        self.sequence_len = 4096
+        self.tokenizer.add_special_tokens({"pad_token": "<pad>"})
        # https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/main/added_tokens.json

    def tokenize_prompt(self, prompt):
--- a/src/axolotl/prompt_strategies/sharegpt.py
+++ b/src/axolotl/prompt_strategies/sharegpt.py
@@ -13,7 +13,7 @@ register_conv_template(
        system_message="You are a helpful assistant.",
        roles=["<|im_start|>user", "<|im_start|>assistant"],
        sep_style=SeparatorStyle.CHATML,
-        sep="<|im_end|>",
+        sep="<|im_end|>\n",
    )
 )

--- a/src/axolotl/prompters.py
+++ b/src/axolotl/prompters.py
@@ -22,19 +22,13 @@ class PromptStyle(Enum):
    CHATML = "chatml"


-class Prompter:
-    """
-    Base prompter class for all prompters
-    """
-
-
-class AlpacaPrompter(Prompter):
+class AlpacaPrompter:
    """
    Base class for alpaca prompters
    """

-    system_prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request."
-    system_no_input_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
+    system_prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n"
+    system_no_input_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
    system_format: str = "{system}"
    turn_format: str
    turn_no_input_format: str
@@ -75,7 +69,7 @@ class AlpacaPrompter(Prompter):
        else:
            res = (
                self.system_format.format(system=self.system_no_input_prompt)
-                if self.system_no_input_prompt
+                if self.system_prompt
                else ""
            ) + self.turn_no_input_format.format(instruction=instruction)
        if output:
@@ -165,7 +159,7 @@ class NomicGPT4AllPrompter(AlpacaPrompter):
    """


-class ReflectAlpacaPrompter(Prompter):
+class ReflectAlpacaPrompter:
    """
    Prompter for ReflectAlpaca
    """
@@ -260,7 +254,7 @@ SHAREGPT_ASSERTION_FAILED_ROLE = (
 )


-class ShareGPTPrompter(Prompter):  # pylint: disable=too-few-public-methods
+class ShareGPTPrompter:  # pylint: disable=too-few-public-methods
    """
    A prompter that generates prompts for the ShareGPT
    """
@@ -355,7 +349,7 @@ class ShareGPTPrompterV2(ShareGPTPrompter):
        )


-class UnsupportedPrompter(Prompter):
+class UnsupportedPrompter:
    """
    A dummy class for custom prompters
    """
--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -1,5 +1,6 @@
 """Prepare and train a model on a dataset. Can also infer from a model or merge lora"""

+import logging
 import os
 import signal
 import sys
@@ -9,7 +10,6 @@ from typing import Optional

 import torch
 import transformers.modelcard
-from accelerate.logging import get_logger
 from datasets import Dataset
 from optimum.bettertransformer import BetterTransformer
 from transformers.deepspeed import is_deepspeed_zero3_enabled
@@ -26,7 +26,7 @@ src_dir = os.path.join(project_root, "src")
 sys.path.insert(0, src_dir)

 configure_logging()
-LOG = get_logger("axolotl.train")
+LOG = logging.getLogger("axolotl.train")


@dataclass
@@ -44,10 +44,7 @@ def train(
    *, cfg: DictDefault, cli_args: TrainerCliArgs, dataset_meta: TrainDatasetMeta
 ):
    # load the tokenizer first
-    LOG.debug(
-        f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}",
-        main_process_only=True,
-    )
+    LOG.info(f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}")
    tokenizer = load_tokenizer(cfg)

    train_dataset = dataset_meta.train_dataset
@@ -55,10 +52,7 @@ def train(
    total_num_steps = dataset_meta.total_num_steps

    # Load the model and tokenizer
-    msg = "loading model"
-    if cfg.adapter:
-        msg += " and peft_config..."
-    LOG.debug(msg)
+    LOG.info("loading model and (optionally) peft_config...")
    model, peft_config = load_model(cfg, tokenizer, inference=cli_args.inference)

    safe_serialization = cfg.save_safetensors is True
@@ -82,8 +76,7 @@ def train(
        cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps
    )

-    if hasattr(model, "config"):
-        model.config.use_cache = False
+    model.config.use_cache = False

    # go ahead and presave, so we have the adapter config available to inspect
    if peft_config:
@@ -93,8 +86,7 @@ def train(
    if not Path(cfg.output_dir).is_dir():
        os.makedirs(cfg.output_dir, exist_ok=True)
    tokenizer.save_pretrained(str(Path(cfg.output_dir)))
-    if hasattr(model, "config"):
-        model.config.save_pretrained(str(Path(cfg.output_dir)))
+    model.config.save_pretrained(str(Path(cfg.output_dir)))

    # In case we want to stop early with ctrl+c, this is a nice to have to save the pretrained model
    if cfg.local_rank == 0:
--- a/src/axolotl/utils/callbacks.py
+++ b/src/axolotl/utils/callbacks.py
@@ -124,36 +124,6 @@ class GPUStatsCallback(
        return control


-class LossWatchDogCallback(TrainerCallback):
-    """Callback to track loss and stop training if loss is too high"""
-
-    def __init__(self, cfg):
-        self.cfg = cfg
-        self.logged = False
-        self.violations = 0
-        self.threshold = cfg.loss_watchdog_threshold
-        self.patience = cfg.loss_watchdog_patience or 3
-
-    def on_step_end(
-        self,
-        _args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **_kwargs,
-    ):
-        if len(state.log_history) > 0 and "loss" in state.log_history[-1]:
-            if state.log_history[-1]["loss"] > self.threshold:
-                self.violations += 1
-                if self.violations >= self.patience:
-                    LOG.warning(
-                        "Loss is too high, stopping training (loss_watchdog_threshold)"
-                    )
-                    control.should_training_stop = True
-            else:
-                self.violations = 0
-        return control
-
-
 def bench_eval_callback_factory(trainer, tokenizer):
    accuracy = evaluate.load("accuracy")
    abcd_idx = [
--- a/src/axolotl/utils/collators.py
+++ b/src/axolotl/utils/collators.py
@@ -2,16 +2,12 @@
 DataCollator for axolotl to pad labels and position_ids for packed sequences
 """
 from dataclasses import dataclass
-from typing import Any, Dict, Optional, Sequence, Union
+from typing import Any, Optional, Union

 import numpy as np
-import torch
-import transformers
 from transformers import PreTrainedTokenizerBase
 from transformers.utils import PaddingStrategy

-IGNORE_INDEX = -100
-

@dataclass
 class DataCollatorForSeq2Seq:
@@ -123,58 +119,3 @@ class DataCollatorForSeq2Seq:
            features["decoder_input_ids"] = decoder_input_ids

        return features
-
-
-@dataclass
-class BatchSamplerDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
-    """
-    Collator for multipack specific to the using the BatchSampler
-    """
-
-    def __call__(self, features, return_tensors=None):
-        chunked_data = {}
-        for feature in features[0].keys():
-            if feature == "length":
-                continue
-            if feature == "attention_mask":
-                arrays = [
-                    (1) * np.array(item[feature])
-                    for item in features
-                    if feature in item
-                ]
-                chunked_data[feature] = np.concatenate(arrays)
-            else:
-                arrays = [
-                    np.array(item[feature]) for item in features if feature in item
-                ]
-                chunked_data[feature] = np.concatenate(arrays)
-        features = [chunked_data]
-        return super().__call__(features, return_tensors=return_tensors)
-
-
-@dataclass
-class MambaDataCollator:
-    """
-    Collator for State Space Models (Mamba)
-    """
-
-    tokenizer: transformers.PreTrainedTokenizer
-
-    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
-        input_ids, labels = tuple(
-            [torch.LongTensor(instance[key]) for instance in instances]
-            for key in ("input_ids", "labels")
-        )
-        input_ids = torch.nn.utils.rnn.pad_sequence(
-            input_ids,
-            batch_first=True,
-            padding_value=self.tokenizer.pad_token_id,
-        )
-        labels = torch.nn.utils.rnn.pad_sequence(
-            labels, batch_first=True, padding_value=IGNORE_INDEX
-        )
-
-        return {
-            "input_ids": input_ids,
-            "labels": labels,
-        }
--- a/src/axolotl/utils/config.py
+++ b/src/axolotl/utils/config.py
@@ -27,7 +27,7 @@ def choose_device(cfg):

    cfg.device = get_device()
    if cfg.world_size == 1:
-        cfg.device_map = cfg.device_map or "auto"
+        cfg.device_map = "auto"
    else:
        if cfg.device.startswith("cuda"):
            cfg.device_map = {"": torch.cuda.current_device()}
@@ -77,15 +77,6 @@ def normalize_config(cfg):
    else:
        cfg.torch_dtype = torch.float32

-    if cfg.saves_per_epoch:
-        save_steps = 1.0 / (cfg.saves_per_epoch * cfg.num_epochs)
-        if save_steps < 1.0:  # prevent saves on every step
-            cfg.save_steps = save_steps
-    if cfg.evals_per_epoch:
-        eval_steps = 1.0 / (cfg.evals_per_epoch * cfg.num_epochs)
-        if eval_steps < 1.0:  # prevent evals on every step
-            cfg.eval_steps = eval_steps
-
    cfg.dataset_processes = cfg.dataset_processes or os.cpu_count()

    if not cfg.base_model_config:
@@ -131,19 +122,6 @@ def normalize_config(cfg):
        or (cfg.model_type and "mistral" in cfg.model_type.lower())
    )

-    cfg.is_qwen_derived_model = (
-        (
-            hasattr(model_config, "model_type")
-            and model_config.model_type
-            in [
-                "qwen",
-            ]
-        )
-        or cfg.is_qwen_derived_model
-        or "qwen" in cfg.base_model.lower()
-        or (cfg.model_type and "qwen" in cfg.model_type.lower())
-    )
-
    if isinstance(cfg.learning_rate, str):
        cfg.learning_rate = float(cfg.learning_rate)

@@ -187,11 +165,7 @@ def validate_config(cfg):
            "batch_size is not recommended. Please use gradient_accumulation_steps instead.",
            "To calculate the equivalent gradient_accumulation_steps, divide batch_size / micro_batch_size / number of gpus.",
        )
-    if (
-        cfg.eval_batch_size
-        and cfg.micro_batch_size
-        and cfg.eval_batch_size != cfg.micro_batch_size
-    ):
+    if cfg.eval_batch_size != cfg.micro_batch_size:
        LOG.warning(
            "eval_batch_size != micro_batch_size. This can lead to VRAM instability."
        )
@@ -361,27 +335,6 @@ def validate_config(cfg):
                cfg.datasets[idx].type = cfg.datasets[idx].type.replace(
                    "sharegpt_simple", "sharegpt"
                )
-
-    if cfg.saves_per_epoch and cfg.save_steps:
-        raise ValueError(
-            "save_steps and saves_per_epoch are mutually exclusive and cannot be used together."
-        )
-    if cfg.saves_per_epoch and cfg.save_strategy and cfg.save_strategy != "steps":
-        raise ValueError(
-            "save_strategy must be empty or set to `steps` when used with saves_per_epoch."
-        )
-    if cfg.evals_per_epoch and cfg.eval_steps:
-        raise ValueError(
-            "eval_steps and evals_per_epoch are mutually exclusive and cannot be used together."
-        )
-    if (
-        cfg.evals_per_epoch
-        and cfg.evaluation_strategy
-        and cfg.evaluation_strategy != "steps"
-    ):
-        raise ValueError(
-            "evaluation_strategy must be empty or set to `steps` when used with evals_per_epoch."
-        )
    if cfg.save_strategy and cfg.save_steps and cfg.save_strategy != "steps":
        raise ValueError(
            "save_strategy and save_steps mismatch. Please set save_strategy to 'steps' or remove save_steps."
@@ -416,24 +369,6 @@ def validate_config(cfg):
            "If you want to full finetune, please turn off load_in_8bit and load_in_4bit."
        )

-    if cfg.rope_scaling:
-        LOG.warning("`rope_scaling` should now be be a key under `model_config`")
-
-    if cfg.warmup_steps and cfg.warmup_ratio:
-        raise ValueError("warmup_steps and warmup_ratio are mutually exclusive")
-
-    if cfg.is_qwen_derived_model and cfg.gradient_checkpointing:
-        LOG.warning(
-            "Gradient checkpointing is broken for Qwen models for transformers>=4.35.0, except main branch."
-        )
-
-    if cfg.wandb_run_id and not cfg.wandb_name:
-        cfg.wandb_name = cfg.wandb_run_id
-
-        LOG.warning(
-            "wandb_run_id sets the ID of the run. If you would like to set the name, please use wandb_name instead."
-        )
-
    # TODO
    # MPT 7b
    # https://github.com/facebookresearch/bitsandbytes/issues/25
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -3,7 +3,7 @@ import functools
 import hashlib
 import logging
 from pathlib import Path
-from typing import Dict, List, Tuple, Union
+from typing import Any, Dict, List, Tuple, Union

 import torch
 from datasets import (
@@ -34,7 +34,6 @@ from axolotl.prompters import (
    JeopardyPrompter,
    MultipleChoiceConcisePrompter,
    MultipleChoiceExplainPrompter,
-    Prompter,
    ReflectAlpacaPrompter,
    SummarizeTLDRPrompter,
    UnsupportedPrompter,
@@ -79,27 +78,19 @@ def prepare_dataset(cfg, tokenizer):
        train_dataset, eval_dataset = process_datasets_for_packing(
            cfg, train_dataset, eval_dataset, tokenizer
        )
-
-    if eval_dataset and cfg.sample_packing and cfg.eval_sample_packing is not False:
-        total_eval_steps = calculate_total_num_steps(cfg, eval_dataset, update=False)
-        if total_eval_steps == 0:
-            raise ValueError(
-                "eval dataset split is too small for sample_packing. You should set `eval_sample_packing: False`. "
-            )
-
    if cfg.max_steps:
        total_num_steps = min(
-            calculate_total_num_steps(cfg, train_dataset), cfg.max_steps
+            calculate_total_num_steps(cfg, train_dataset, tokenizer), cfg.max_steps
        )
        LOG.info(f"Maximum number of steps set at {total_num_steps}")
    else:
-        total_num_steps = calculate_total_num_steps(cfg, train_dataset)
+        total_num_steps = calculate_total_num_steps(cfg, train_dataset, tokenizer)
    return train_dataset, eval_dataset, total_num_steps, prompters


 def load_tokenized_prepared_datasets(
    tokenizer, cfg, default_dataset_prepared_path
-) -> Tuple[DatasetDict, List[Prompter]]:
+) -> DatasetDict:
    tokenizer_name = tokenizer.__class__.__name__
    ds_hash = str(
        md5(
@@ -107,12 +98,7 @@ def load_tokenized_prepared_datasets(
                str(cfg.sequence_len)
                + "@"
                + "|".join(
-                    sorted(
-                        [
-                            f"{d.path}:{d.type}:{d.shards}:{d.conversation}"
-                            for d in cfg.datasets
-                        ]
-                    )
+                    sorted([f"{d.path}:{d.type}:{d.shards}" for d in cfg.datasets])
                )
                + "|"
                + tokenizer_name
@@ -178,66 +164,6 @@ def load_tokenized_prepared_datasets(
            except (FileNotFoundError, ConnectionError):
                pass

-            ds_from_cloud = False
-            storage_options = {}
-            remote_file_system = None
-            if config_dataset.path.startswith("s3://"):
-                try:
-                    import aiobotocore.session  # type: ignore
-                    import s3fs  # type: ignore
-                except ImportError as exc:
-                    raise ImportError(
-                        "s3:// paths require aiobotocore and s3fs to be installed"
-                    ) from exc
-
-                # Takes credentials from ~/.aws/credentials for default profile
-                s3_session = aiobotocore.session.AioSession(profile="default")
-                storage_options = {"session": s3_session}
-                remote_file_system = s3fs.S3FileSystem(**storage_options)
-            elif config_dataset.path.startswith(
-                "gs://"
-            ) or config_dataset.path.startswith("gcs://"):
-                try:
-                    import gcsfs  # type: ignore
-                except ImportError as exc:
-                    raise ImportError(
-                        "gs:// or gcs:// paths require gcsfs to be installed"
-                    ) from exc
-
-                # gcsfs will use default credentials from the environment else anon
-                # https://gcsfs.readthedocs.io/en/latest/#credentials
-                storage_options = {"token": None}
-                remote_file_system = gcsfs.GCSFileSystem(**storage_options)
-            # TODO: Figure out how to get auth creds passed
-            # elif config_dataset.path.startswith("adl://") or config_dataset.path.startswith("abfs://"):
-            #     try:
-            #         import adlfs
-            #     except ImportError as exc:
-            #        raise ImportError(
-            #            "adl:// or abfs:// paths require adlfs to be installed"
-            #        ) from exc
-
-            #     # Gen 1
-            #     storage_options = {
-            #         "tenant_id": TENANT_ID,
-            #         "client_id": CLIENT_ID,
-            #         "client_secret": CLIENT_SECRET,
-            #     }
-            #     # Gen 2
-            #     storage_options = {
-            #         "account_name": ACCOUNT_NAME,
-            #         "account_key": ACCOUNT_KEY,
-            #     }
-
-            #     remote_file_system = adlfs.AzureBlobFileSystem(**storage_options)
-            try:
-                if remote_file_system and remote_file_system.exists(
-                    config_dataset.path
-                ):
-                    ds_from_cloud = True
-            except (FileNotFoundError, ConnectionError):
-                pass
-
            # prefer local dataset, even if hub exists
            local_path = Path(config_dataset.path)
            if local_path.exists():
@@ -251,8 +177,17 @@ def load_tokenized_prepared_datasets(
                        split=None,
                    )
                elif local_path.is_file():
-                    ds_type = get_ds_type(config_dataset)
-
+                    ds_type = "json"
+                    if config_dataset.ds_type:
+                        ds_type = config_dataset.ds_type
+                    elif ".parquet" in config_dataset.path:
+                        ds_type = "parquet"
+                    elif ".arrow" in config_dataset.path:
+                        ds_type = "arrow"
+                    elif ".csv" in config_dataset.path:
+                        ds_type = "csv"
+                    elif ".txt" in config_dataset.path:
+                        ds_type = "text"
                    ds = load_dataset(
                        ds_type,
                        name=config_dataset.name,
@@ -272,22 +207,6 @@ def load_tokenized_prepared_datasets(
                    data_files=config_dataset.data_files,
                    token=use_auth_token,
                )
-            elif ds_from_cloud and remote_file_system:
-                if remote_file_system.isdir(config_dataset.path):
-                    ds = load_from_disk(
-                        config_dataset.path,
-                        storage_options=storage_options,
-                    )
-                elif remote_file_system.isfile(config_dataset.path):
-                    ds_type = get_ds_type(config_dataset)
-                    ds = load_dataset(
-                        ds_type,
-                        name=config_dataset.name,
-                        data_files=config_dataset.path,
-                        streaming=False,
-                        split=None,
-                        storage_options=storage_options,
-                    )
            else:
                if isinstance(config_dataset.data_files, str):
                    fp = hf_hub_download(
@@ -379,29 +298,11 @@ def load_tokenized_prepared_datasets(
    return dataset, prompters


-def get_ds_type(config_dataset: DictDefault):
-    """
-    Get the dataset type from the path if it's not specified
-    """
-    ds_type = "json"
-    if config_dataset.ds_type:
-        ds_type = config_dataset.ds_type
-    elif ".parquet" in config_dataset.path:
-        ds_type = "parquet"
-    elif ".arrow" in config_dataset.path:
-        ds_type = "arrow"
-    elif ".csv" in config_dataset.path:
-        ds_type = "csv"
-    elif ".txt" in config_dataset.path:
-        ds_type = "text"
-    return ds_type
-
-
 def load_prepare_datasets(
    tokenizer: PreTrainedTokenizerBase,
    cfg,
    default_dataset_prepared_path,
-) -> Tuple[Dataset, Dataset, List[Prompter]]:
+) -> Tuple[Dataset, Dataset, List[Any]]:
    max_packed_sequence_len = (
        cfg.max_packed_sequence_len if cfg.max_packed_sequence_len else cfg.sequence_len
    )
@@ -410,7 +311,7 @@ def load_prepare_datasets(
    )  # make sure we don't accidentally set it larger than sequence_len

    tokenizer_name = tokenizer.__class__.__name__
-    prompters: List[Prompter] = []
+    prompters = []
    if cfg.max_packed_sequence_len is not None:
        # see if we can go ahead and load the stacked dataset
        seed = f"@{str(cfg.seed)}" if cfg.seed else ""
@@ -544,13 +445,14 @@ def load_prepare_datasets(
        train_fingerprint = md5(to_hash_train)
        test_fingerprint = md5(to_hash_test)

-        dataset = dataset.train_test_split(
-            test_size=cfg.val_set_size,
-            shuffle=False,
-            seed=cfg.seed or 42,
-            train_new_fingerprint=train_fingerprint,
-            test_new_fingerprint=test_fingerprint,
-        )
+        with zero_first(is_main_process()):
+            dataset = dataset.train_test_split(
+                test_size=cfg.val_set_size,
+                shuffle=False,
+                seed=cfg.seed or 42,
+                train_new_fingerprint=train_fingerprint,
+                test_new_fingerprint=test_fingerprint,
+            )

        train_dataset = dataset["train"]
        eval_dataset = dataset["test"]
@@ -580,14 +482,10 @@ def get_dataset_wrapper(
            "user_defined", tokenizer, cfg, config_dataset.type.to_dict()
        )
        dataset_prompter = UnsupportedPrompter()
-        dataset_wrapper = TokenizedPromptDataset(
-            ds_strategy, dataset, process_count=cfg.dataset_processes
-        )
+        dataset_wrapper = TokenizedPromptDataset(ds_strategy, dataset)
    elif ds_strategy := load(config_dataset.type, tokenizer, cfg, config_dataset):
        dataset_prompter = UnsupportedPrompter()
-        dataset_wrapper = TokenizedPromptDataset(
-            ds_strategy, dataset, process_count=cfg.dataset_processes
-        )
+        dataset_wrapper = TokenizedPromptDataset(ds_strategy, dataset)
    elif d_base_type == "alpaca":
        dataset_prompter = AlpacaPrompter(d_prompt_style)
        ds_strategy = AlpacaPromptTokenizingStrategy(
@@ -596,9 +494,7 @@ def get_dataset_wrapper(
            cfg.train_on_inputs,
            cfg.sequence_len,
        )
-        ds_wrapper = TokenizedPromptDataset(
-            ds_strategy, dataset, process_count=cfg.dataset_processes
-        )
+        ds_wrapper = TokenizedPromptDataset(ds_strategy, dataset)
        dataset_wrapper = ds_wrapper
    elif d_base_type == "explainchoice":
        dataset_prompter = MultipleChoiceExplainPrompter(d_prompt_style)
@@ -608,9 +504,7 @@ def get_dataset_wrapper(
            cfg.train_on_inputs,
            cfg.sequence_len,
        )
-        ds_wrapper = TokenizedPromptDataset(
-            ds_strategy, dataset, process_count=cfg.dataset_processes
-        )
+        ds_wrapper = TokenizedPromptDataset(ds_strategy, dataset)
        dataset_wrapper = ds_wrapper
    elif d_base_type == "concisechoice":
        dataset_prompter = MultipleChoiceConcisePrompter(d_prompt_style)
@@ -620,9 +514,7 @@ def get_dataset_wrapper(
            cfg.train_on_inputs,
            cfg.sequence_len,
        )
-        ds_wrapper = TokenizedPromptDataset(
-            ds_strategy, dataset, process_count=cfg.dataset_processes
-        )
+        ds_wrapper = TokenizedPromptDataset(ds_strategy, dataset)
        dataset_wrapper = ds_wrapper
    elif d_base_type == "summarizetldr":
        dataset_prompter = SummarizeTLDRPrompter(d_prompt_style)
@@ -632,9 +524,7 @@ def get_dataset_wrapper(
            cfg.train_on_inputs,
            cfg.sequence_len,
        )
-        ds_wrapper = TokenizedPromptDataset(
-            ds_strategy, dataset, process_count=cfg.dataset_processes
-        )
+        ds_wrapper = TokenizedPromptDataset(ds_strategy, dataset)
        dataset_wrapper = ds_wrapper
    elif d_base_type == "jeopardy":
        dataset_prompter = JeopardyPrompter(d_prompt_style)
@@ -644,9 +534,7 @@ def get_dataset_wrapper(
            cfg.train_on_inputs,
            cfg.sequence_len,
        )
-        ds_wrapper = TokenizedPromptDataset(
-            ds_strategy, dataset, process_count=cfg.dataset_processes
-        )
+        ds_wrapper = TokenizedPromptDataset(ds_strategy, dataset)
        dataset_wrapper = ds_wrapper
    elif d_base_type == "oasst":
        dataset_prompter = AlpacaPrompter(d_prompt_style)
@@ -656,9 +544,7 @@ def get_dataset_wrapper(
            cfg.train_on_inputs,
            cfg.sequence_len,
        )
-        ds_wrapper = TokenizedPromptDataset(
-            ds_strategy, dataset, process_count=cfg.dataset_processes
-        )
+        ds_wrapper = TokenizedPromptDataset(ds_strategy, dataset)
        dataset_wrapper = ds_wrapper
    elif d_base_type == "gpteacher":
        dataset_prompter = GPTeacherPrompter(d_prompt_style)
@@ -668,9 +554,7 @@ def get_dataset_wrapper(
            cfg.train_on_inputs,
            cfg.sequence_len,
        )
-        ds_wrapper = TokenizedPromptDataset(
-            ds_strategy, dataset, process_count=cfg.dataset_processes
-        )
+        ds_wrapper = TokenizedPromptDataset(ds_strategy, dataset)
        dataset_wrapper = ds_wrapper
    elif d_base_type == "reflection":
        dataset_prompter = ReflectAlpacaPrompter(d_prompt_style)
@@ -680,9 +564,7 @@ def get_dataset_wrapper(
            cfg.train_on_inputs,
            cfg.sequence_len,
        )
-        ds_wrapper = TokenizedPromptDataset(
-            ds_strategy, dataset, process_count=cfg.dataset_processes
-        )
+        ds_wrapper = TokenizedPromptDataset(ds_strategy, dataset)
        dataset_wrapper = ds_wrapper
    else:
        suffix = ""
--- a/src/axolotl/utils/dataloader.py
+++ b/src/axolotl/utils/dataloader.py
@@ -0,0 +1,342 @@
+# pylint: skip-file
+import hashlib
+import itertools
+import logging
+import math
+import time
+from queue import Queue
+from threading import Thread
+from typing import Any, Callable, List, Union
+
+import numba
+import numpy as np
+from torch.utils.data import DistributedSampler, Sampler
+
+# Module-level logger shared by everything in this dataloader module
+LOG = logging.getLogger("axolotl.utils.dataloader")
+
+
+@numba.njit
+def ffd_check(a: np.ndarray, c: int, n: int):
+    """
+    First-fit-decreasing feasibility test.
+
+    Reports whether every size in ``a`` can be placed into ``n`` bins of
+    capacity ``c`` when the sizes are handled largest-first and each one goes
+    into the first bin that still has room.
+    https://en.wikipedia.org/wiki/First-fit-decreasing_bin_packing
+
+    :param a: array of item sizes
+    :param c: capacity of each bin
+    :param n: number of bins available
+    :return: True when all items were placed, False as soon as one does not fit
+    """
+    sizes_desc = np.sort(a)[::-1]
+    free = np.full((n,), c, dtype=sizes_desc.dtype)
+    for item in sizes_desc:
+        placed = False
+        for slot in range(n):
+            if free[slot] >= item:
+                # first bin with enough room wins
+                free[slot] -= item
+                placed = True
+                break
+
+        if not placed:
+            return False
+
+    return True
+
+
+@numba.njit
+def ffd_with_result(a: np.ndarray, c: int, start_index: int):
+    """
+    First-fit-decreasing bin packing that returns the actual assignment.
+
+    Items are visited from largest to smallest; each one is put into the first
+    already-open bin with enough remaining capacity, otherwise a new bin is
+    opened for it. There is no upper limit on the number of bins opened here.
+
+    :param a: array of item sizes
+    :param c: capacity of each bin
+    :param start_index: offset added to every position in ``a`` before it is
+        recorded in the result
+    :return: tuple ``(bins_result, len(a))`` where ``bins_result`` is a list of
+        bins, each bin being the list of ``position + start_index`` values
+        placed in it
+    """
+    # First-fit-decreasing bin packing (with result return)
+
+    # positions of the items ordered by decreasing size
+    indices = np.argsort(a)[::-1]
+    a = a[indices]
+
+    # bins[i] is the remaining capacity of bin i, bins_result[i] its members
+    bins: List[Any] = []
+    bins_result: List[Any] = []
+    for a_id, size in enumerate(a):
+        add_new = True
+        for idx in range(len(bins)):
+            if bins[idx] >= size:
+                bins[idx] -= size
+                bins_result[idx].append(indices[a_id] + start_index)
+                add_new = False
+                break
+
+        if add_new:
+            # NOTE(review): an item larger than c still opens a bin, leaving a
+            # negative remaining capacity - confirm callers never pass one
+            bins.append(c - size)
+            bins_result.append([indices[a_id] + start_index])
+
+    return bins_result, len(a)
+
+
+@numba.njit
+def allocate(
+    lengths: np.ndarray, lengths_cumsum: np.ndarray, rank: int, c: int, n: int
+):
+    """
+    Walk through the samples in order, repeatedly taking the longest run of
+    consecutive samples that ``ffd_check`` reports as fitting into ``n`` bins
+    of capacity ``c``, packing that run with ``ffd_with_result`` and keeping
+    the bin at position ``rank``.
+
+    :param lengths: array of lengths of each sample
+    :param lengths_cumsum: cumulative sum of consecutive lengths
+    :param rank: rank for this process
+    :param c: length of tokens per batch
+    :param n: number of ranks
+    :return: tuple ``(result, result_totseqs, s, total_slots)`` where
+        ``result`` holds, for every packed run, the sample indices of the bin
+        picked for ``rank``; ``result_totseqs`` holds the number of samples in
+        each run across all bins; ``s`` is the cumulative length of all samples
+        consumed; and ``total_slots`` is ``len(result) * c * n``
+    """
+    # Dynamic batch allocator, similar to Multifit
+    # https://en.wikipedia.org/wiki/Multifit_algorithm
+    # ~99.5% efficiency on OpenChat training set (12 * 2048 ctx len)
+
+    # s: cumulative length consumed so far, start_index: first unconsumed sample
+    s = 0
+    start_index = 0
+    result = []
+    result_totseqs = []
+
+    while True:
+        # binary search [left, right)
+        # right is one past the number of remaining samples whose cumulative
+        # length does not exceed the total capacity c * n of this round
+        # NOTE(review): a run of length 1 is never passed to ffd_check, it is
+        # assumed to fit
+        left = 1
+        right = 1 + np.searchsorted(lengths_cumsum[start_index:], s + c * n, "right")
+
+        while right - left > 1:
+            mid = (left + right) // 2
+            if ffd_check(lengths[start_index : start_index + mid], c, n):
+                left = mid
+            else:
+                right = mid
+
+        # use length left
+        batch, tot_seqs = ffd_with_result(
+            lengths[start_index : start_index + left], c, start_index
+        )
+        # fewer bins than ranks: stop here, the remaining samples are not returned
+        if len(batch) < n:
+            break
+
+        start_index += left
+        s = lengths_cumsum[start_index - 1]
+
+        # add local rank
+        result.append(batch[rank])
+        # add total seqs for all ranks
+        result_totseqs.append(tot_seqs)
+        # yield batch[rank], tot_seqs, s, len(result) * c * n
+    return result, result_totseqs, s, len(result) * c * n
+
+def chunk(iterable, n):
+    """
+    Lazily split ``iterable`` into consecutive tuples of at most ``n`` items.
+
+    The last tuple may be shorter, e.g. ``chunk('ABCDEFG', 3)`` gives
+    ``ABC DEF G``. Because this is a generator, the ``ValueError`` for
+    ``n < 1`` is raised on first iteration rather than at call time.
+    """
+    if n < 1:
+        raise ValueError("n must be at least one")
+    source = iter(iterable)
+    while True:
+        group = tuple(itertools.islice(source, n))
+        if not group:
+            return
+        yield group
+
+
+def hash_indices(lst: List[int]) -> str:
+    """
+    Fingerprint a sequence of indices.
+
+    The values are rendered as text, joined with commas and fed through
+    SHA-256; the hex digest is returned.
+    """
+    payload = ",".join(str(value) for value in lst).encode()
+    return hashlib.sha256(payload).hexdigest()
+
+
+class MultipackDistributedDataloader:
+    """Unpadded data loading using Multipack.
+    Adapted from https://github.com/imoneoi/openchat/blob/v3_fix_mle_loss/ochat/training_deepspeed/multipack_dataloader.py
+    Approximate (at most ~1.22x) the optimal solution of the identical-machines scheduling problem, which is NP-hard.
+
+    Packed batches are produced on a daemon thread and handed to the consumer
+    through a bounded queue; a ``None`` entry on the queue marks the end of an
+    epoch.
+    """
+
+    def __init__(
+        self,
+        dataset: Any,
+        collate_fn: Callable,
+        seq_max_length: int = 2048,
+        batch_size: int = 1,
+        sampler: Union[Sampler, DistributedSampler, None] = None,
+        packing_efficiency_estimate: float = 1.0,
+        sample_packing_seq_len_multiplier: int = 1,
+        device_count: int = 1,
+        prefetch_max: int = 1000,
+        num_epochs: int = 1,
+    ):
+        """
+        :param dataset: tokenized dataset; must expose ``data.column("position_ids")``,
+            ``features`` and integer indexing
+        :param collate_fn: called with a list of packed feature dicts to build one batch
+        :param seq_max_length: token capacity of one packed sequence
+        :param batch_size: must be a multiple of ``sample_packing_seq_len_multiplier``
+        :param sampler: optional sampler providing the sample order for each epoch
+        :param packing_efficiency_estimate: expected packing efficiency used for the
+            length estimate; falsy values fall back to 1.0
+        :param sample_packing_seq_len_multiplier: multiplies ``seq_max_length`` to get
+            the capacity of one packed bin
+        :param device_count: number of devices the total token count is divided by
+            when estimating the length
+        :param prefetch_max: maximum number of batches kept in the prefetch queue
+        :param num_epochs: number of epochs the worker thread produces
+        """
+        # Dataset
+        self.dataset = dataset
+        # length of a sample is its last position id + 1
+        self.lengths = (
+            dataset.data.column("position_ids")
+            .to_pandas()
+            .apply(lambda x: x[-1] + 1)
+            .values
+        )
+        assert isinstance(self.lengths, np.ndarray)
+        assert batch_size % sample_packing_seq_len_multiplier == 0
+        assert batch_size >= sample_packing_seq_len_multiplier
+        self.sampler = sampler
+        self.batch_size = batch_size
+        self.sample_packing_seq_len_multiplier = sample_packing_seq_len_multiplier
+        self.seq_max_length = seq_max_length
+        self.batch_max_length = batch_size * seq_max_length
+        self.collate_fn = collate_fn
+        self.num_epochs = num_epochs
+
+        # NOTE(review): packing always runs as a single replica; presumably the
+        # per-rank split is left to the sampler - confirm
+        self.num_replicas = 1
+        self.rank = 0
+
+        # statistics
+        self.eff_total_used = 0
+        self.eff_total_slots = 0
+        self.packing_efficiency_estimate = packing_efficiency_estimate or 1.0
+        self.device_count = device_count
+
+        # maxsize is maximum number of samples in queue
+        self.prefetch_max = prefetch_max
+        self.queue: Queue = Queue(maxsize=prefetch_max)
+        self.thread = None
+
+    def _worker(self):
+        """
+        Producer loop run on the background thread: pushes every batch of every
+        epoch onto the queue, followed by ``None`` at the end of each epoch.
+        """
+        LOG.info(
+            f"[WORKER] Epochs: {self.num_epochs}, Samples: {self.len_w_stats()*self.batch_size}"
+        )
+        for _ in range(self.num_epochs):
+            for sample in self._internal_batch_generator():
+                # Queue.put blocks while the queue is full, so no polling is needed
+                self.queue.put(sample)
+
+            # stop the queue when epoch is done
+            self.queue.put(None)
+
+    def __iter__(self):
+        """
+        Yield the batches of one epoch.
+
+        Advances the sampler epoch when the sampler supports it, starts the
+        worker thread on first use and then drains the queue up to the next
+        ``None`` marker.
+        """
+        if hasattr(self.sampler, "set_epoch"):
+            new_epoch = self.sampler.epoch + 1
+            self.sampler.set_epoch(new_epoch)
+            LOG.info(f"calling sampler.set_epoch({new_epoch})")
+
+        if self.thread is None:
+            self.thread = Thread(target=self._worker, daemon=True)
+            self.thread.start()
+
+        while True:
+            item = self.queue.get()
+
+            if item is None:
+                break
+            yield item
+
+    def generate_batches(self, set_stats=False):
+        """
+        Pack the samples, in sampler order when a sampler is set, into bins.
+
+        :param set_stats: when True, add this run's used/total token counts to
+            the efficiency statistics
+        :return: tuple ``(batches, totseqs)``; ``batches`` is a list of bins,
+            each a list of dataset indices, and ``totseqs`` is the per-round
+            sample count returned by ``allocate``
+        """
+        LOG.info("generating packed batches")
+        if self.sampler:
+            indices = list(self.sampler)
+        else:
+            indices = range(0, len(self.dataset))
+
+        LOG.info(hash_indices(indices))
+        lengths = self.lengths[indices]
+        lengths_cumsum = np.cumsum(lengths)
+
+        batches, totseqs, total_used, total_slots = allocate(
+            lengths=lengths,
+            lengths_cumsum=lengths_cumsum,
+            rank=self.rank,
+            # c=self.batch_max_length,
+            c=self.seq_max_length * self.sample_packing_seq_len_multiplier,
+            n=self.num_replicas,
+        )
+
+        # allocate works on positions within `indices`; map back to dataset indices
+        batches = [[indices[b_idx] for b_idx in batch] for batch in batches]
+
+        # statistics
+        if set_stats:
+            self.eff_total_used += total_used
+            self.eff_total_slots += total_slots
+
+        return batches, totseqs
+
+    def _internal_batch_generator(self):
+        """
+        Yield exactly ``len(self)`` collated batches for one epoch.
+
+        Real packed batches come first; if packing produced fewer than
+        ``len(self)`` of them, single-token no-op batches make up the rest.
+        """
+        all_batches, _ = self.generate_batches(set_stats=True)
+        features = self.dataset.features.keys()
+        # use the same clamped value that __len__ reports so the number of
+        # yielded batches always matches len(self), even when the raw estimate
+        # is zero or negative
+        len_remaining = len(self)
+        for batches in chunk(
+            all_batches, self.batch_size // self.sample_packing_seq_len_multiplier
+        ):
+            chunked_data = []
+            attn_mask_cum_idx = 0
+            for batch in batches:
+                concatenated = {}
+                batched_data = [self.dataset[batch_idx] for batch_idx in batch]
+                for feature in features:
+                    if feature == "length":
+                        continue
+                    if feature == "attention_mask":
+                        # scale each sample's mask by a running 1-based id so the
+                        # samples packed together stay distinguishable
+                        arrays = [
+                            (attn_mask_cum_idx + idx + 1) * np.array(item[feature])
+                            for idx, item in enumerate(batched_data)
+                            if feature in item
+                        ]
+                        attn_mask_cum_idx += len(batched_data)
+                        concatenated[feature] = np.concatenate(arrays)
+                    else:
+                        arrays = [
+                            np.array(item[feature])
+                            for item in batched_data
+                            if feature in item
+                        ]
+                        concatenated[feature] = np.concatenate(arrays)
+                chunked_data.append(concatenated)
+            yield self.collate_fn(chunked_data)
+            len_remaining -= 1
+            if not len_remaining:
+                return
+        # yield a no-op for cases where we don't have any data left to pack
+        for _ in range(len_remaining):
+            yield self.collate_fn(
+                [
+                    {
+                        "input_ids": [0],
+                        "labels": [-100],
+                        "attention_mask": [True],
+                        "position_ids": [0],
+                    }
+                ]
+            )
+
+    def _len_est(self):
+        """
+        Estimate the number of batches per epoch from the total token count.
+
+        The result is not clamped and can be zero or negative for very small
+        datasets; use ``len(self)`` for a value that is at least 1.
+        """
+        lengths_sum = np.sum(self.lengths)
+        lengths_sum_per_device = lengths_sum // self.device_count
+        LOG.info(
+            f"packing_efficiency_estimate: {self.packing_efficiency_estimate} "
+            f"total_num_tokens per device: {lengths_sum_per_device}"
+        )
+
+        # shave off 1% + 1 for dealing with variance in packing from random sampler to sampler
+        return (
+            math.floor(
+                0.99
+                * lengths_sum_per_device
+                / self.packing_efficiency_estimate
+                // self.seq_max_length
+                // self.batch_size
+            )
+            - 1
+        )
+
+    def __len__(self):
+        """Estimated number of batches per epoch, never less than 1."""
+        # this doesn't return the actual length b/c with distributed samplers, not all dataloaders get
+        # the same share of total tokens
+        return max(1, self._len_est())
+
+    def len_w_stats(self):
+        """
+        Same value as ``len(self)``, but first makes sure packing statistics
+        exist and logs the estimated versus actual packing efficiency.
+        """
+        if not self.eff_total_used:
+            self.generate_batches(set_stats=True)
+        LOG.info(
+            f"packing_efficiency_estimate: {self.packing_efficiency_estimate} "
+            f"actual packing efficiency: {self.efficiency()}"
+        )
+        return max(1, self._len_est())
+
+    def efficiency(self):
+        """
+        Fraction of the allocated token slots that hold real tokens.
+
+        Returns 0.0 when nothing has been allocated yet instead of dividing by
+        zero.
+        """
+        if not self.eff_total_slots:
+            return 0.0
+        return self.eff_total_used / self.eff_total_slots
--- a/src/axolotl/utils/distributed.py
+++ b/src/axolotl/utils/distributed.py
@@ -50,17 +50,6 @@ def get_world_size():
    return int(os.getenv("WORLD_SIZE", "1"))


-@contextmanager
-def zero_only():
-    """
-    Context manager that only runs the enclosed block on the main rank.
-    """
-    if is_main_process():
-        yield
-    else:
-        yield None
-
-
@contextmanager
 def zero_first(is_main):
    """
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -4,7 +4,6 @@ import math
 import os
 from typing import Optional, Tuple  # noqa: F401

-import addict
 import bitsandbytes as bnb
 import torch
 import transformers
@@ -18,11 +17,11 @@ from transformers import (  # noqa: F401
    AutoTokenizer,
    BitsAndBytesConfig,
    GPTQConfig,
+    LlamaConfig,
    PreTrainedModel,
    PreTrainedTokenizerBase,
 )

-from axolotl.models.mamba import fix_mamba_attn_for_loss
 from axolotl.prompt_tokenizers import LLAMA_DEFAULT_EOS_TOKEN
 from axolotl.utils.bench import log_gpu_memory_usage
 from axolotl.utils.dict import DictDefault
@@ -30,51 +29,12 @@ from axolotl.utils.dict import DictDefault
 LOG = logging.getLogger("axolotl")


-def check_model_config(cfg: DictDefault, model_config: AutoConfig):
-    quant_config_exists = hasattr(model_config, "quantization_config")
-    quant_config_method_is_gptq = (
-        quant_config_exists
-        and "quant_method" in model_config.quantization_config
-        and model_config.quantization_config["quant_method"] == "gptq"
-    )
-
-    if cfg.gptq and not quant_config_method_is_gptq:
-        raise ValueError(
-            "model_config.quantization_config is not set or quant_method is not set to gptq. "
-            "Please make sure to point to a GPTQ model."
-        )
-
-    if not cfg.gptq and quant_config_exists:
-        raise ValueError(
-            "model_config.quantization_config is set but `gptq` flag is not. "
-            "Please use the `gptq` flag to train quantized model or point to a non-quantized model."
-        )
-
-
 def load_model_config(cfg):
    model_config_name = cfg.base_model_config or cfg.base_model
    trust_remote_code = cfg.trust_remote_code is True
-
-    try:
-        model_config = AutoConfig.from_pretrained(
-            model_config_name, trust_remote_code=trust_remote_code
-        )
-    except ValueError as err:
-        if "mamba" in model_config_name:
-            return addict.Dict(
-                {
-                    "model_type": "mamba",
-                }
-            )
-        raise err
-
-    if cfg.model_config:
-        for key, val in cfg.model_config.items():
-            setattr(model_config, key, val)
-
-    check_model_config(cfg, model_config)
-
-    return model_config
+    return AutoConfig.from_pretrained(
+        model_config_name, trust_remote_code=trust_remote_code
+    )


 def load_tokenizer(cfg):
@@ -91,7 +51,7 @@ def load_tokenizer(cfg):
    if cfg.tokenizer_type:
        tokenizer_cls = getattr(transformers, cfg.tokenizer_type)

-    tokenizer_config = cfg.tokenizer_config or cfg.base_model_config or cfg.base_model
+    tokenizer_config = cfg.tokenizer_config or cfg.base_model_config
    tokenizer = tokenizer_cls.from_pretrained(
        tokenizer_config,
        trust_remote_code=cfg.trust_remote_code or False,
@@ -105,7 +65,6 @@ def load_tokenizer(cfg):
            "LlamaTokenizer",
            "LlamaTokenizerFast",
            "CodeLlamaTokenizer",
-            "CodeLlamaTokenizerFast",
        ]
        and hasattr(tokenizer, "pad_token")
        and not tokenizer.pad_token
@@ -113,6 +72,11 @@ def load_tokenizer(cfg):
        # set a pad_token, but use eos_token so we don't add a new token
        tokenizer.pad_token = LLAMA_DEFAULT_EOS_TOKEN

+    LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
+    LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
+    LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
+    LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")
+
    if tokenizer.__class__.__name__ == "GPTNeoXTokenizerFast":
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -121,40 +85,11 @@ def load_tokenizer(cfg):
    if cfg.is_mistral_derived_model and cfg.flash_attention and not cfg.sample_packing:
        tokenizer.padding_side = "left"

-    # Qwen base only has single token, so we need to set the special tokens
-    if cfg.is_qwen_derived_model:
-        token_ids = ["bos_token_id", "eos_token_id", "pad_token_id", "unk_token_id"]
-        for attr_name in token_ids:
-            if getattr(tokenizer, attr_name) is None:
-                setattr(tokenizer, attr_name, tokenizer.eod_id)
-
-        token_names = ["bos_token", "eos_token", "pad_token", "unk_token"]
-        for attr_name in token_names:
-            if getattr(tokenizer, attr_name) is None:
-                setattr(tokenizer, attr_name, "<|endoftext|>")
-
    if cfg.special_tokens:
        for k, val in cfg.special_tokens.items():
            tokenizer.add_special_tokens(
                {k: AddedToken(val, rstrip=False, lstrip=False, normalized=False)}
            )
-
-        # If we add bos_token and eos_token, we need to update the post processor to
-        # handle them correctly.
-        # https://github.com/huggingface/transformers/pull/24132
-        bos_or_eos_in_special_tokens = (
-            "bos_token" in cfg.special_tokens and "eos_token" in cfg.special_tokens
-        )
-        if (
-            tokenizer.__class__.__name__
-            in (
-                "LlamaTokenizerFast",
-                "CodeLlamaTokenizerFast",
-            )
-            and bos_or_eos_in_special_tokens
-        ):
-            tokenizer.update_post_processor()
-
    if cfg.tokens:
        tokenizer.add_tokens(
            [
@@ -163,11 +98,6 @@ def load_tokenizer(cfg):
            ]
        )

-    LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
-    LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
-    LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
-    LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")
-
    return tokenizer


@@ -180,6 +110,7 @@ def load_model(
    Load a model for a given configuration and tokenizer.
    """
    base_model = cfg.base_model
+    base_model_config = cfg.base_model_config
    model_type = cfg.model_type
    model_config = load_model_config(cfg)

@@ -249,18 +180,6 @@ def load_model(
        LOG.info("patching with flash attention")
        replace_mistral_attn_with_flash_attn(packed=cfg.sample_packing)

-    if (
-        cfg.model_config_type == "mixtral"
-        and cfg.flash_attention
-        and cfg.sample_packing
-    ):
-        from axolotl.monkeypatch.mixtral import (
-            replace_mixtral_attn_with_multipack_flash_attn,
-        )
-
-        LOG.info("patching with flash attention")
-        replace_mixtral_attn_with_multipack_flash_attn()
-
    if cfg.is_llama_derived_model and cfg.xpos_rope:
        from axolotl.monkeypatch.xpos_rope_llama_monkey_patch import (
            replace_llama_rope_with_xpos_rope,
@@ -282,7 +201,6 @@ def load_model(
    model_kwargs = {}

    model_kwargs["device_map"] = cfg.device_map
-    model_kwargs["max_memory"] = cfg.max_memory
    model_kwargs["torch_dtype"] = cfg.torch_dtype

    if cfg.model_revision:
@@ -308,30 +226,28 @@ def load_model(
            bnb_4bit_quant_type="nf4",
        )
    # sample packing uses custom FA2 patch
-    if cfg.flash_attention:
-        if not cfg.sample_packing:
-            if (
-                cfg.is_llama_derived_model
-                or cfg.is_falcon_derived_model
-                or cfg.is_mistral_derived_model
-                or model_config.model_type == "mixtral"
-            ):
-                model_config._attn_implementation = (  # pylint: disable=protected-access
-                    "flash_attention_2"
-                )
-        else:
-            if model_config.model_type == "mixtral":
-                model_config._attn_implementation = (  # pylint: disable=protected-access
-                    "flash_attention_2"
-                )
+    if cfg.flash_attention and not cfg.sample_packing:
+        if (
+            cfg.is_llama_derived_model
+            or cfg.is_falcon_derived_model
+            or cfg.is_mistral_derived_model
+        ):
+            model_kwargs["use_flash_attention_2"] = True

    try:
        if cfg.is_llama_derived_model and not cfg.trust_remote_code and not cfg.gptq:
            from transformers import LlamaForCausalLM

+            config_kwargs = {}
+            if cfg.rope_scaling:
+                config_kwargs["rope_scaling"] = cfg.rope_scaling
+            config = LlamaConfig.from_pretrained(
+                base_model_config,
+                **config_kwargs,
+            )
            model = LlamaForCausalLM.from_pretrained(
                base_model,
-                config=model_config,
+                config=config,
                load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
                load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
                **model_kwargs,
@@ -376,99 +292,92 @@ def load_model(
        #         device=cfg.device,
        #     )
        #     model.train() # sets to train instead of eval mode
-        elif model_type == "PhiForCausalLM":
-            from axolotl.models.phi import PhiForCausalLM
+        elif model_type == "MixFormerSequentialForCausalLM":
+            from axolotl.models.phi import MixFormerSequentialForCausalLM

-            model = PhiForCausalLM.from_pretrained(
+            model = MixFormerSequentialForCausalLM.from_pretrained(
                base_model,
                load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
                load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
                **model_kwargs,
            )
-        elif model_type == "MambaLMHeadModel":
-            # FIXME this is janky at best and hacked together to make it work
-            MambaLMHeadModel = fix_mamba_attn_for_loss()  # pylint: disable=invalid-name
-
-            model_kwargs["dtype"] = model_kwargs["torch_dtype"]
-            model_kwargs["device"] = torch.cuda.current_device()
-            del model_kwargs["torch_dtype"]
-            del model_kwargs["device_map"]
-            del model_kwargs["max_memory"]
-
-            model = MambaLMHeadModel.from_pretrained(
-                base_model,
-                **model_kwargs,
-            )
        elif model_type and not cfg.trust_remote_code:
            if cfg.gptq:
                model = AutoModelForCausalLM.from_pretrained(
                    base_model,
-                    config=model_config,
                    trust_remote_code=cfg.trust_remote_code or False,
                    **model_kwargs,
                )
            else:
                model = getattr(transformers, model_type).from_pretrained(
                    base_model,
-                    config=model_config,
                    load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
                    load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
                    trust_remote_code=cfg.trust_remote_code or False,
                    **model_kwargs,
                )
        else:
+            config = AutoConfig.from_pretrained(
+                base_model,
+                trust_remote_code=cfg.trust_remote_code or False,
+            )
            # Shouldn't be a problem most of the time. will obviously error if the model doesn't support this
            # when training starts
            if (
-                hasattr(model_config, "max_seq_len")
-                and model_config.max_seq_len
-                and cfg.sequence_len > model_config.max_seq_len
+                hasattr(config, "max_seq_len")
+                and config.max_seq_len
+                and cfg.sequence_len > config.max_seq_len
            ):
-                model_config.max_seq_len = cfg.sequence_len
+                config.max_seq_len = cfg.sequence_len
                LOG.warning(f"increasing context length to {cfg.sequence_len}")
            elif (
-                hasattr(model_config, "max_sequence_length")
-                and model_config.max_sequence_length
-                and cfg.sequence_len > model_config.max_sequence_length
+                hasattr(config, "max_sequence_length")
+                and config.max_sequence_length
+                and cfg.sequence_len > config.max_sequence_length
            ):
-                model_config.max_sequence_length = cfg.sequence_len
+                config.max_sequence_length = cfg.sequence_len
                LOG.warning(f"increasing context length to {cfg.sequence_len}")
            if cfg.gptq:
                model = AutoModelForCausalLM.from_pretrained(
                    base_model,
-                    config=model_config,
+                    config=config,
                    trust_remote_code=cfg.trust_remote_code or False,
                    **model_kwargs,
                )
            else:
                model = AutoModelForCausalLM.from_pretrained(
                    base_model,
-                    config=model_config,
+                    config=config,
                    load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
                    load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
                    trust_remote_code=cfg.trust_remote_code or False,
                    **model_kwargs,
                )
    except Exception as err:  # pylint: disable=broad-exception-caught
+        LOG.error(
+            "Exception raised attempting to load model, retrying with AutoModelForCausalLM"
+        )
        LOG.exception(err)
-        raise err
+        model = AutoModelForCausalLM.from_pretrained(
+            base_model,
+            load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
+            load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
+            trust_remote_code=cfg.trust_remote_code or False,
+            **model_kwargs,
+        )

    embeddings_len = (
        math.ceil(len(tokenizer) / 32) * 32
        if cfg.resize_token_embeddings_to_32x
        else len(tokenizer)
    )
-    if (
-        hasattr(model, "get_input_embeddings")
-        and model.get_input_embeddings().num_embeddings < embeddings_len
-    ):
+    if model.get_input_embeddings().num_embeddings < embeddings_len:
        model.resize_token_embeddings(embeddings_len)
    else:
        model.tie_weights()

    if (
-        hasattr(model, "config")
-        and hasattr(model.config, "max_position_embeddings")
+        hasattr(model.config, "max_position_embeddings")
        and model.config.max_position_embeddings
        and cfg.sequence_len > model.config.max_position_embeddings
    ):
@@ -477,23 +386,7 @@ def load_model(
        )
        model.config.max_position_embeddings = cfg.sequence_len

-    if (
-        hasattr(model, "config")
-        and hasattr(model.config, "bos_token_id")
-        and model.config.bos_token_id
-        and model.config.bos_token_id != tokenizer.bos_token_id
-    ):
-        model.config.bos_token_id = tokenizer.bos_token_id
-
-    if (
-        hasattr(model, "config")
-        and hasattr(model.config, "eos_token_id")
-        and model.config.eos_token_id
-        and model.config.eos_token_id != tokenizer.eos_token_id
-    ):
-        model.config.eos_token_id = tokenizer.eos_token_id
-
-    if hasattr(model, "device") and model.device.type == "cuda":
+    if model.device.type == "cuda":
        log_gpu_memory_usage(LOG, "after model load", model.device)

    # make sure these are fp32 per Ramesh et al. (2021)
@@ -508,22 +401,15 @@ def load_model(
                module.to(torch.float32)

    needs_fa2_dtype = cfg.adapter or cfg.fsdp
-    skip_prepare_model_for_kbit_training = False
-
-    if cfg.model_config_type == "qwen" and cfg.adapter == "lora":
-        # Qwen doesn't play nicely with LoRA if this is enabled
-        skip_prepare_model_for_kbit_training = True
-
    if (cfg.adapter == "lora" and load_in_8bit) or (
        cfg.adapter == "qlora" and cfg.load_in_4bit
    ):
        LOG.info("converting PEFT model w/ prepare_model_for_kbit_training")
        if cfg.gradient_checkpointing:
            model.gradient_checkpointing_enable()
-        if not skip_prepare_model_for_kbit_training:
-            model = prepare_model_for_kbit_training(
-                model, use_gradient_checkpointing=cfg.gradient_checkpointing
-            )
+        model = prepare_model_for_kbit_training(
+            model, use_gradient_checkpointing=cfg.gradient_checkpointing
+        )
        needs_fa2_dtype = True

    # LlamaRMSNorm layers are in fp32 after kbit_training or full finetune, so we need to
@@ -542,7 +428,14 @@ def load_model(
    if cfg.ddp and not load_in_8bit:
        model.to(f"cuda:{cfg.local_rank}")

-    if torch.cuda.device_count() > 1 and int(os.getenv("WORLD_SIZE", "1")) == 1:
+    if (
+        torch.cuda.device_count() > 1
+        and int(os.getenv("WORLD_SIZE", "1")) > 1
+        and (cfg.load_in_4bit)
+    ):
+        # llama is PROBABLY model parallelizable, but the default isn't that it is
+        # so let's only set it for the 4bit, see
+        # https://github.com/johnsmith0031/alpaca_lora_4bit/blob/08b3fca4a4a9e0d3945be1bab4529f100a428636/finetune.py#L130-L133
        setattr(model, "is_parallelizable", True)
        setattr(model, "model_parallel", True)

@@ -552,8 +445,7 @@ def load_model(
            requires_grad.append(f"{name}: {param.requires_grad}")
    if len(requires_grad) == 0:
        LOG.warning("there are no parameters that require gradient updates")
-    if hasattr(model, "config"):
-        model.config.use_cache = False
+    model.config.use_cache = False

    if cfg.flash_optimum:
        model = BetterTransformer.transform(model)
--- a/src/axolotl/utils/samplers/init.py
+++ b/src/axolotl/utils/samplers/init.py
@@ -1,4 +0,0 @@
-"""
-axolotl samplers module
-"""
-from .multipack import MultipackBatchSampler  # noqa: F401
--- a/src/axolotl/utils/samplers/multipack.py
+++ b/src/axolotl/utils/samplers/multipack.py
@@ -1,196 +0,0 @@
-# pylint: skip-file
-"""
-Multipack Batch Sampler
-"""
-import logging
-import math
-import os
-from typing import Any, Iterable, List, Union
-
-import numba
-import numpy as np
-from torch.utils.data import BatchSampler, Sampler
-
-LOG = logging.getLogger("axolotl.utils.samplers.multipack")
-
-
-@numba.njit
-def ffd_check(a: np.ndarray, c: int, n: int):
-    # First-fit-decreasing bin packing
-    # Check if a[] could fit in n bins with capacity c
-    # https://en.wikipedia.org/wiki/First-fit-decreasing_bin_packing
-
-    a = np.sort(a)[::-1]
-    bins = np.full((n,), c, dtype=a.dtype)
-    for size in a:
-        not_found = True
-        for idx in range(n):
-            if bins[idx] >= size:
-                bins[idx] -= size
-                not_found = False
-                break
-
-        if not_found:
-            return False
-
-    return True
-
-
-@numba.njit
-def ffd_with_result(a: np.ndarray, c: int, start_index: int):
-    # First-fit-decreasing bin packing (with result return)
-
-    indices = np.argsort(a)[::-1]
-    a = a[indices]
-
-    bins: List[Any] = []
-    bins_result: List[Any] = []
-    for a_id, size in enumerate(a):
-        add_new = True
-        for idx in range(len(bins)):
-            if bins[idx] >= size:
-                bins[idx] -= size
-                bins_result[idx].append(indices[a_id] + start_index)
-                add_new = False
-                break
-
-        if add_new:
-            bins.append(c - size)
-            bins_result.append([indices[a_id] + start_index])
-
-    return bins_result
-
-
-@numba.njit
-def allocate(
-    lengths: np.ndarray, lengths_cumsum: np.ndarray, rank: int, c: int, n: int
-):
-    # Dynamic batch allocator, similar to Multifit
-    # https://en.wikipedia.org/wiki/Multifit_algorithm
-    # ~99.5% efficiency on OpenChat training set (12 * 2048 ctx len)
-
-    s = 0
-    start_index = 0
-    result = []
-
-    while True:
-        # binary search [l, r)
-        left = 1
-        right = 1 + np.searchsorted(lengths_cumsum[start_index:], s + c * n, "right")
-
-        while right - left > 1:
-            mid = (left + right) // 2
-            if ffd_check(lengths[start_index : start_index + mid], c, n):
-                left = mid
-            else:
-                right = mid
-
-        # use length l
-        batch = ffd_with_result(
-            lengths[start_index : start_index + left], c, start_index
-        )
-        assert len(batch) <= n
-        if len(batch) < n:
-            break
-
-        start_index += left
-        s = lengths_cumsum[start_index - 1]
-
-        # add local rank
-        result.append(batch[rank])
-
-    return result, s, len(result) * c * n
-
-
-class MultipackBatchSampler(BatchSampler):
-    """
-    Batch Sampler class for multipack
-    """
-
-    def __init__(
-        self,
-        sampler: Union[Sampler[int], Iterable[int]],
-        batch_size: int,
-        drop_last: bool,
-        batch_max_len: int,
-        lengths: np.ndarray,
-        packing_efficiency_estimate: float = 1.0,
-    ):
-        super().__init__(sampler, batch_size, drop_last)
-        self.batch_size = None
-        self.batch_max_len = batch_max_len
-        self.lengths: np.ndarray = lengths
-        self.packing_efficiency_estimate = packing_efficiency_estimate or 1.0
-
-        assert isinstance(self.lengths, np.ndarray)
-
-        self.epoch = 0
-
-        # statistics
-        self.eff_total_used = 0
-        self.eff_total_slots = 0
-
-    def set_epoch(self, epoch: int):
-        self.epoch = epoch
-
-    def generate_batches(self, set_stats=False):
-        indices = [idx for idx in self.sampler]
-
-        lengths = self.lengths[indices]
-        lengths_cumsum = np.cumsum(lengths)
-
-        batches, total_used, total_slots = allocate(
-            lengths=lengths,
-            lengths_cumsum=lengths_cumsum,
-            rank=0,
-            c=self.batch_max_len,
-            n=1,
-        )
-
-        batches = [[indices[b_idx] for b_idx in batch] for batch in batches]
-
-        # statistics
-        if set_stats:
-            self.eff_total_used += total_used
-            self.eff_total_slots += total_slots
-
-        return batches
-
-    def __iter__(self):
-        batches = self.generate_batches(set_stats=True)
-        return iter(batches)
-
-    def num_batches(self):
-        batches = self.generate_batches(set_stats=True)
-        return len(batches)
-
-    def efficiency(self):
-        return self.eff_total_used / self.eff_total_slots
-
-    def __len__(self):
-        self.num_batches()
-        return self._len_est()
-
-    def _len_est(self):
-        world_size = int(os.getenv("WORLD_SIZE", "1"))
-        lengths_sum = np.sum(self.lengths)
-        lengths_sum_per_device = lengths_sum // world_size
-        LOG.info(
-            f"packing_efficiency_estimate: {self.packing_efficiency_estimate} "
-            f"total_num_tokens per device: {lengths_sum_per_device}"
-        )
-
-        # shave off 1% + 1 for dealing with variance in packing from random sampler to sampler
-        return max(
-            0,
-            (
-                world_size
-                * math.floor(
-                    0.99
-                    * lengths_sum_per_device
-                    / self.packing_efficiency_estimate
-                    // self.batch_max_len
-                )
-                - 1
-            ),
-        )
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -1,4 +1,5 @@
 """Module containing the Trainer class and related functions"""
+import logging
 import math
 import os
 from contextlib import contextmanager
@@ -8,15 +9,21 @@ from typing import List
 import numpy as np
 import torch
 import torch.cuda
-from accelerate.logging import get_logger
+import torch.distributed as dist
 from datasets import set_caching_enabled
-from torch.utils.data import DataLoader, RandomSampler
+from torch.utils.data import DistributedSampler, RandomSampler

 from axolotl.core.trainer_builder import HFCausalTrainerBuilder
-from axolotl.utils.distributed import is_main_process, reduce_and_broadcast, zero_first
-from axolotl.utils.samplers import MultipackBatchSampler
+from axolotl.utils.collators import DataCollatorForSeq2Seq
+from axolotl.utils.dataloader import MultipackDistributedDataloader
+from axolotl.utils.distributed import (
+    is_distributed,
+    is_main_process,
+    reduce_and_broadcast,
+    zero_first,
+)

-LOG = get_logger("axolotl")
+LOG = logging.getLogger("axolotl")


@torch.jit.script
@@ -131,10 +138,8 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
                    )

        # Phi doesn't want the attention_mask feature when training
-        if (
-            "CodeGenTokenizer" in tokenizer.__class__.__name__
-            or (cfg.is_mistral_derived_model and cfg.flash_attention)
-            or cfg.model_config_type == "mamba"
+        if "CodeGenTokenizer" in tokenizer.__class__.__name__ or (
+            cfg.is_mistral_derived_model and cfg.flash_attention
        ):
            train_dataset = train_dataset.remove_columns("attention_mask")
            if eval_dataset:
@@ -143,37 +148,30 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
    return train_dataset, eval_dataset


-def calculate_total_num_steps(cfg, train_dataset, update=True):
-    if not cfg.total_num_tokens:
-        total_num_tokens = np.sum(
-            train_dataset.data.column("input_ids")
-            .to_pandas()
-            .apply(lambda x: len(x))  # pylint: disable=unnecessary-lambda
-            .values
-        )
-        LOG.debug(f"total_num_tokens: {total_num_tokens}", main_process_only=True)
-        if update:
-            cfg.total_num_tokens = total_num_tokens
-
-    skip_estimates = cfg.model_config_type == "mamba"
-
-    if not skip_estimates and not cfg.total_supervised_tokens:
-        total_supervised_tokens = (
-            train_dataset.data.column("labels")
-            .to_pandas()
-            .apply(lambda x: np.sum(np.array(x) != -100))
-            .sum()
-        )
-        LOG.debug(
-            f"`total_supervised_tokens: {total_supervised_tokens}`",
-            main_process_only=True,
-        )
-        if update:
-            cfg.total_supervised_tokens = total_supervised_tokens
-
-    if not skip_estimates and cfg.sample_packing:
+def calculate_total_num_steps(cfg, train_dataset, tokenizer):
+    if cfg.sample_packing:
        # we have to drop anything longer then sequence len otherwise
        # flash attention with position ids fails
+        if not cfg.total_num_tokens:
+            LOG.info("calculating total_num_tokens")
+            total_num_tokens = np.sum(
+                train_dataset.data.column("input_ids")
+                .to_pandas()
+                .apply(lambda x: len(x))  # pylint: disable=unnecessary-lambda
+                .values
+            )
+            LOG.info(f"total_num_tokens: {total_num_tokens}")
+            cfg.total_num_tokens = total_num_tokens
+
+        if not cfg.total_supervised_tokens:
+            total_supervised_tokens = (
+                train_dataset.data.column("labels")
+                .to_pandas()
+                .apply(lambda x: np.sum(np.array(x) != -100))
+                .sum()
+            )
+            LOG.info(f"`total_supervised_tokens: {total_supervised_tokens}`")
+            cfg.total_supervised_tokens = total_supervised_tokens

        if cfg.sample_packing_eff_est:
            total_num_steps = (
@@ -191,41 +189,41 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
                )
                * cfg.num_epochs
            )
-            LOG.debug(
-                f"total_num_tokens: {cfg.total_num_tokens}, total_num_steps: {total_num_steps}",
-                main_process_only=True,
+            LOG.info(
+                f"total_num_tokens: {cfg.total_num_tokens}, total_num_steps: {total_num_steps}"
            )
        else:
-            sampler = MultipackBatchSampler(
-                sampler=RandomSampler(train_dataset),
-                batch_size=cfg.micro_batch_size,
-                drop_last=True,
-                batch_max_len=cfg.micro_batch_size
-                * (cfg.max_packed_sequence_len or cfg.sequence_len),
-                lengths=(
-                    train_dataset.data.column("position_ids")
-                    .to_pandas()
-                    .apply(lambda x: x[-1] + 1)
-                    .values
-                ),
-            )
+            if cfg.world_size > 1 and is_distributed():
+                sampler = DistributedSampler(
+                    train_dataset,
+                    num_replicas=cfg.world_size,
+                    rank=dist.get_rank(),
+                    seed=cfg.seed or 42,
+                )
+            else:
+                sampler = RandomSampler(train_dataset)

-            data_loader = DataLoader(
-                train_dataset.remove_columns(["length"]),
-                batch_sampler=sampler,
+            data_loader = MultipackDistributedDataloader(
+                train_dataset,
+                batch_size=cfg.micro_batch_size,
+                seq_max_length=cfg.max_packed_sequence_len or cfg.sequence_len,
+                collate_fn=DataCollatorForSeq2Seq(
+                    tokenizer,
+                    return_tensors="pt",
+                    padding="longest",
+                ),
+                sampler=sampler,
+                packing_efficiency_estimate=cfg.sample_packing_eff_est,
+                sample_packing_seq_len_multiplier=cfg.micro_batch_size,
+                device_count=int(os.environ.get("WORLD_SIZE", 1)),
+                num_epochs=cfg.num_epochs,
            )
-            data_loader_len = len(data_loader)
-            actual_eff = sampler.efficiency()
-            LOG.debug(f"data_loader_len: {data_loader_len}", main_process_only=True)
+            data_loader_len = data_loader.len_w_stats()
+            actual_eff = data_loader.efficiency()
+            LOG.info(f"data_loader_len: {data_loader_len}")
            # FIXME: is there a bug here somewhere? the total num steps depends
            # on the agreed on value for sample_packing_eff_est
-            total_num_steps = int(
-                math.floor(
-                    data_loader_len
-                    * cfg.num_epochs
-                    / int(os.environ.get("WORLD_SIZE", 1))
-                )
-            )
+            total_num_steps = int(math.floor(data_loader_len * cfg.num_epochs))

            def calc_sample_packing_eff_est(estimates: List[float]):
                LOG.info(f"sample_packing_eff_est across ranks: {repr(estimates)}")
@@ -238,22 +236,13 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
            sample_packing_eff_est = (
                math.ceil(sample_packing_actual_eff_all * 100.0) / 100.0
            )
-            if update:
-                cfg.sample_packing_eff_est = sample_packing_eff_est
-            LOG.debug(
-                f"sample_packing_eff_est: {cfg.sample_packing_eff_est}",
-                main_process_only=True,
-            )
+            cfg.sample_packing_eff_est = sample_packing_eff_est
+            LOG.info(f"sample_packing_eff_est: {cfg.sample_packing_eff_est}")
    else:
        total_num_steps = int(
-            math.ceil(
-                len(train_dataset)
-                * cfg.num_epochs
-                / int(os.environ.get("WORLD_SIZE", 1))
-                / cfg.batch_size
-            )
+            math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
        )
-    LOG.debug(f"total_num_steps: {total_num_steps}", main_process_only=True)
+    LOG.info(f"total_num_steps: {total_num_steps}")
    return total_num_steps


@@ -271,14 +260,12 @@ def setup_fsdp_envs(cfg):
        ] = cfg.fsdp_config.fsdp_transformer_layer_cls_to_wrap


-def prepare_optim_env(cfg):
+def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps):
    if cfg.fsdp:
        setup_fsdp_envs(cfg)
    elif cfg.deepspeed:
        os.environ["ACCELERATE_USE_DEEPSPEED"] = "true"

-
-def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps):
    trainer_builder = HFCausalTrainerBuilder(cfg, model, tokenizer)
    trainer_builder.train_dataset = train_dataset
    trainer_builder.eval_dataset = eval_dataset
--- a/src/axolotl/utils/wandb_.py
+++ b/src/axolotl/utils/wandb_.py
@@ -2,20 +2,20 @@

 import os

-from axolotl.utils.dict import DictDefault

-
-def setup_wandb_env_vars(cfg: DictDefault):
-    for key in cfg.keys():
-        if key.startswith("wandb_"):
-            value = cfg.get(key, "")
-
-            if value and isinstance(value, str) and len(value) > 0:
-                os.environ[key.upper()] = value
-
-    # Enable wandb if project name is present
-    if cfg.wandb_project and len(cfg.wandb_project) > 0:
+def setup_wandb_env_vars(cfg):
+    if cfg.wandb_mode and cfg.wandb_mode == "offline":
+        os.environ["WANDB_MODE"] = cfg.wandb_mode
+    elif cfg.wandb_project and len(cfg.wandb_project) > 0:
+        os.environ["WANDB_PROJECT"] = cfg.wandb_project
        cfg.use_wandb = True
-        os.environ.pop("WANDB_DISABLED", None)  # Remove if present
+        if cfg.wandb_entity and len(cfg.wandb_entity) > 0:
+            os.environ["WANDB_ENTITY"] = cfg.wandb_entity
+        if cfg.wandb_watch and len(cfg.wandb_watch) > 0:
+            os.environ["WANDB_WATCH"] = cfg.wandb_watch
+        if cfg.wandb_log_model and len(cfg.wandb_log_model) > 0:
+            os.environ["WANDB_LOG_MODEL"] = cfg.wandb_log_model
+        if cfg.wandb_run_id and len(cfg.wandb_run_id) > 0:
+            os.environ["WANDB_RUN_ID"] = cfg.wandb_run_id
    else:
        os.environ["WANDB_DISABLED"] = "true"
--- a/tests/e2e/__init__.py
+++ b/tests/e2e/__init__.py
--- a/tests/e2e/test_fused_llama.py
+++ b/tests/e2e/test_fused_llama.py
@@ -4,6 +4,7 @@ E2E tests for lora llama

 import logging
 import os
+import tempfile
 import unittest
 from pathlib import Path

@@ -15,8 +16,6 @@ from axolotl.train import train
 from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault

-from .utils import with_temp_dir
-
 LOG = logging.getLogger("axolotl.tests.e2e")
 os.environ["WANDB_DISABLED"] = "true"

@@ -26,9 +25,9 @@ class TestFusedLlama(unittest.TestCase):
    Test case for Llama models using Fused layers
    """

-    @with_temp_dir
-    def test_fft_packing(self, temp_dir):
+    def test_fft_packing(self):
        # pylint: disable=duplicate-code
+        output_dir = tempfile.mkdtemp()
        cfg = DictDefault(
            {
                "base_model": "JackFram/llama-68m",
@@ -52,7 +51,7 @@ class TestFusedLlama(unittest.TestCase):
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
-                "output_dir": temp_dir,
+                "output_dir": output_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch",
                "lr_scheduler": "cosine",
@@ -70,4 +69,4 @@ class TestFusedLlama(unittest.TestCase):
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

        train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
-        assert (Path(temp_dir) / "pytorch_model.bin").exists()
+        assert (Path(output_dir) / "pytorch_model.bin").exists()
--- a/tests/e2e/test_lora_llama.py
+++ b/tests/e2e/test_lora_llama.py
@@ -4,6 +4,7 @@ E2E tests for lora llama

 import logging
 import os
+import tempfile
 import unittest
 from pathlib import Path

@@ -13,8 +14,6 @@ from axolotl.train import train
 from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault

-from .utils import with_temp_dir
-
 LOG = logging.getLogger("axolotl.tests.e2e")
 os.environ["WANDB_DISABLED"] = "true"

@@ -24,9 +23,9 @@ class TestLoraLlama(unittest.TestCase):
    Test case for Llama models using LoRA
    """

-    @with_temp_dir
-    def test_lora(self, temp_dir):
+    def test_lora(self):
        # pylint: disable=duplicate-code
+        output_dir = tempfile.mkdtemp()
        cfg = DictDefault(
            {
                "base_model": "JackFram/llama-68m",
@@ -53,7 +52,7 @@ class TestLoraLlama(unittest.TestCase):
                "num_epochs": 2,
                "micro_batch_size": 8,
                "gradient_accumulation_steps": 1,
-                "output_dir": temp_dir,
+                "output_dir": output_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch",
                "lr_scheduler": "cosine",
@@ -64,11 +63,11 @@ class TestLoraLlama(unittest.TestCase):
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

        train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
-        assert (Path(temp_dir) / "adapter_model.bin").exists()
+        assert (Path(output_dir) / "adapter_model.bin").exists()

-    @with_temp_dir
-    def test_lora_packing(self, temp_dir):
+    def test_lora_packing(self):
        # pylint: disable=duplicate-code
+        output_dir = tempfile.mkdtemp()
        cfg = DictDefault(
            {
                "base_model": "JackFram/llama-68m",
@@ -97,11 +96,10 @@ class TestLoraLlama(unittest.TestCase):
                "num_epochs": 2,
                "micro_batch_size": 8,
                "gradient_accumulation_steps": 1,
-                "output_dir": temp_dir,
+                "output_dir": output_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch",
                "lr_scheduler": "cosine",
-                "bf16": True,
            }
        )
        normalize_config(cfg)
@@ -109,11 +107,11 @@ class TestLoraLlama(unittest.TestCase):
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

        train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
-        assert (Path(temp_dir) / "adapter_model.bin").exists()
+        assert (Path(output_dir) / "adapter_model.bin").exists()

-    @with_temp_dir
-    def test_lora_gptq(self, temp_dir):
+    def test_lora_gptq(self):
        # pylint: disable=duplicate-code
+        output_dir = tempfile.mkdtemp()
        cfg = DictDefault(
            {
                "base_model": "TheBlokeAI/jackfram_llama-68m-GPTQ",
@@ -146,7 +144,7 @@ class TestLoraLlama(unittest.TestCase):
                "save_steps": 0.5,
                "micro_batch_size": 8,
                "gradient_accumulation_steps": 1,
-                "output_dir": temp_dir,
+                "output_dir": output_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch",
                "lr_scheduler": "cosine",
@@ -157,4 +155,4 @@ class TestLoraLlama(unittest.TestCase):
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

        train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
-        assert (Path(temp_dir) / "adapter_model.bin").exists()
+        assert (Path(output_dir) / "adapter_model.bin").exists()
--- a/tests/e2e/test_mamba.py
+++ b/tests/e2e/test_mamba.py
@@ -1,65 +0,0 @@
-"""
-E2E tests for lora llama
-"""
-
-import logging
-import os
-import unittest
-from pathlib import Path
-
-from axolotl.cli import load_datasets
-from axolotl.common.cli import TrainerCliArgs
-from axolotl.train import train
-from axolotl.utils.config import normalize_config
-from axolotl.utils.dict import DictDefault
-
-from .utils import with_temp_dir
-
-LOG = logging.getLogger("axolotl.tests.e2e")
-os.environ["WANDB_DISABLED"] = "true"
-
-
-class TestMistral(unittest.TestCase):
-    """
-    Test case for Llama models using LoRA
-    """
-
-    @with_temp_dir
-    def test_fft(self, temp_dir):
-        # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "state-spaces/mamba-130m",
-                "model_type": "MambaLMHeadModel",
-                "tokenizer_type": "AutoTokenizer",
-                "tokenizer_config": "EleutherAI/gpt-neox-20b",
-                "flash_attention": False,
-                "sequence_len": 1024,
-                "load_in_8bit": False,
-                "val_set_size": 0.0,
-                "datasets": [
-                    {
-                        "path": "mhenrichsen/alpaca_2k_test",
-                        "type": "alpaca",
-                    },
-                ],
-                "gradient_checkpointing": False,
-                "num_epochs": 2,
-                "micro_batch_size": 2,
-                "gradient_accumulation_steps": 1,
-                "output_dir": temp_dir,
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch",
-                "lr_scheduler": "cosine",
-                "max_steps": 20,
-                "save_steps": 10,
-                "eval_steps": None,
-                "save_safetensors": False,
-            }
-        )
-        normalize_config(cfg)
-        cli_args = TrainerCliArgs()
-        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
-
-        train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
-        assert (Path(temp_dir) / "pytorch_model.bin").exists()
--- a/tests/e2e/test_mistral.py
+++ b/tests/e2e/test_mistral.py
@@ -4,6 +4,7 @@ E2E tests for lora llama

 import logging
 import os
+import tempfile
 import unittest
 from pathlib import Path

@@ -15,8 +16,6 @@ from axolotl.train import train
 from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault

-from .utils import with_temp_dir
-
 LOG = logging.getLogger("axolotl.tests.e2e")
 os.environ["WANDB_DISABLED"] = "true"

@@ -26,9 +25,9 @@ class TestMistral(unittest.TestCase):
    Test case for Llama models using LoRA
    """

-    @with_temp_dir
-    def test_lora(self, temp_dir):
+    def test_lora(self):
        # pylint: disable=duplicate-code
+        output_dir = tempfile.mkdtemp()
        cfg = DictDefault(
            {
                "base_model": "openaccess-ai-collective/tiny-mistral",
@@ -55,7 +54,7 @@ class TestMistral(unittest.TestCase):
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
-                "output_dir": temp_dir,
+                "output_dir": output_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch",
                "lr_scheduler": "cosine",
@@ -69,11 +68,11 @@ class TestMistral(unittest.TestCase):
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

        train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
-        assert (Path(temp_dir) / "adapter_model.bin").exists()
+        assert (Path(output_dir) / "adapter_model.bin").exists()

-    @with_temp_dir
-    def test_ft(self, temp_dir):
+    def test_ft(self):
        # pylint: disable=duplicate-code
+        output_dir = tempfile.mkdtemp()
        cfg = DictDefault(
            {
                "base_model": "openaccess-ai-collective/tiny-mistral",
@@ -94,7 +93,7 @@ class TestMistral(unittest.TestCase):
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
-                "output_dir": temp_dir,
+                "output_dir": output_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch",
                "lr_scheduler": "cosine",
@@ -112,4 +111,4 @@ class TestMistral(unittest.TestCase):
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

        train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
-        assert (Path(temp_dir) / "pytorch_model.bin").exists()
+        assert (Path(output_dir) / "pytorch_model.bin").exists()
--- a/tests/e2e/test_mistral_samplepack.py
+++ b/tests/e2e/test_mistral_samplepack.py
@@ -4,6 +4,7 @@ E2E tests for lora llama

 import logging
 import os
+import tempfile
 import unittest
 from pathlib import Path

@@ -15,8 +16,6 @@ from axolotl.train import train
 from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault

-from .utils import with_temp_dir
-
 LOG = logging.getLogger("axolotl.tests.e2e")
 os.environ["WANDB_DISABLED"] = "true"

@@ -26,9 +25,9 @@ class TestMistral(unittest.TestCase):
    Test case for Llama models using LoRA
    """

-    @with_temp_dir
-    def test_lora_packing(self, temp_dir):
+    def test_lora_packing(self):
        # pylint: disable=duplicate-code
+        output_dir = tempfile.mkdtemp()
        cfg = DictDefault(
            {
                "base_model": "openaccess-ai-collective/tiny-mistral",
@@ -56,7 +55,7 @@ class TestMistral(unittest.TestCase):
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
-                "output_dir": temp_dir,
+                "output_dir": output_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch",
                "lr_scheduler": "cosine",
@@ -70,11 +69,11 @@ class TestMistral(unittest.TestCase):
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

        train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
-        assert (Path(temp_dir) / "adapter_model.bin").exists()
+        assert (Path(output_dir) / "adapter_model.bin").exists()

-    @with_temp_dir
-    def test_ft_packing(self, temp_dir):
+    def test_ft_packing(self):
        # pylint: disable=duplicate-code
+        output_dir = tempfile.mkdtemp()
        cfg = DictDefault(
            {
                "base_model": "openaccess-ai-collective/tiny-mistral",
@@ -96,7 +95,7 @@ class TestMistral(unittest.TestCase):
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
-                "output_dir": temp_dir,
+                "output_dir": output_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch",
                "lr_scheduler": "cosine",
@@ -114,4 +113,4 @@ class TestMistral(unittest.TestCase):
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

        train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
-        assert (Path(temp_dir) / "pytorch_model.bin").exists()
+        assert (Path(output_dir) / "pytorch_model.bin").exists()
--- a/tests/e2e/test_phi.py
+++ b/tests/e2e/test_phi.py
@@ -4,8 +4,8 @@ E2E tests for lora llama

 import logging
 import os
+import tempfile
 import unittest
-from pathlib import Path

 from axolotl.cli import load_datasets
 from axolotl.common.cli import TrainerCliArgs
@@ -13,8 +13,6 @@ from axolotl.train import train
 from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault

-from .utils import with_temp_dir
-
 LOG = logging.getLogger("axolotl.tests.e2e")
 os.environ["WANDB_DISABLED"] = "true"

@@ -24,14 +22,13 @@ class TestPhi(unittest.TestCase):
    Test case for Llama models using LoRA
    """

-    @with_temp_dir
-    def test_ft(self, temp_dir):
+    def test_ft(self):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
                "base_model": "microsoft/phi-1_5",
                "trust_remote_code": True,
-                "model_type": "PhiForCausalLM",
+                "model_type": "MixFormerSequentialForCausalLM",
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 512,
                "sample_packing": False,
@@ -55,7 +52,7 @@ class TestPhi(unittest.TestCase):
                "num_epochs": 1,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
-                "output_dir": temp_dir,
+                "output_dir": tempfile.mkdtemp(),
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
@@ -67,16 +64,14 @@ class TestPhi(unittest.TestCase):
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

        train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
-        assert (Path(temp_dir) / "pytorch_model.bin").exists()

-    @with_temp_dir
-    def test_ft_packed(self, temp_dir):
+    def test_ft_packed(self):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
                "base_model": "microsoft/phi-1_5",
                "trust_remote_code": True,
-                "model_type": "PhiForCausalLM",
+                "model_type": "MixFormerSequentialForCausalLM",
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 512,
                "sample_packing": True,
@@ -100,7 +95,7 @@ class TestPhi(unittest.TestCase):
                "num_epochs": 1,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
-                "output_dir": temp_dir,
+                "output_dir": tempfile.mkdtemp(),
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
@@ -112,4 +107,3 @@ class TestPhi(unittest.TestCase):
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

        train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
-        assert (Path(temp_dir) / "pytorch_model.bin").exists()
--- a/tests/e2e/test_resume.py
+++ b/tests/e2e/test_resume.py
@@ -1,95 +0,0 @@
-"""
-E2E tests for resuming training
-"""
-
-import logging
-import os
-import re
-import subprocess
-import unittest
-from pathlib import Path
-
-from transformers.utils import is_torch_bf16_gpu_available
-
-from axolotl.cli import load_datasets
-from axolotl.common.cli import TrainerCliArgs
-from axolotl.train import train
-from axolotl.utils.config import normalize_config
-from axolotl.utils.dict import DictDefault
-
-from .utils import most_recent_subdir, with_temp_dir
-
-LOG = logging.getLogger("axolotl.tests.e2e")
-os.environ["WANDB_DISABLED"] = "true"
-
-
-class TestResumeLlama(unittest.TestCase):
-    """
-    Test case for resuming training of llama models
-    """
-
-    @with_temp_dir
-    def test_resume_qlora(self, temp_dir):
-        # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "JackFram/llama-68m",
-                "tokenizer_type": "LlamaTokenizer",
-                "sequence_len": 1024,
-                "sample_packing": True,
-                "flash_attention": True,
-                "load_in_4bit": True,
-                "adapter": "qlora",
-                "lora_r": 32,
-                "lora_alpha": 64,
-                "lora_dropout": 0.05,
-                "lora_target_linear": True,
-                "val_set_size": 0.1,
-                "special_tokens": {},
-                "datasets": [
-                    {
-                        "path": "vicgalle/alpaca-gpt4",
-                        "type": "alpaca",
-                    },
-                ],
-                "num_epochs": 2,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": 1,
-                "output_dir": temp_dir,
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch",
-                "lr_scheduler": "cosine",
-                "save_steps": 10,
-                "save_total_limit": 5,
-                "max_steps": 40,
-            }
-        )
-        if is_torch_bf16_gpu_available():
-            cfg.bf16 = True
-        else:
-            cfg.fp16 = True
-        normalize_config(cfg)
-        cli_args = TrainerCliArgs()
-        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
-
-        train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
-
-        resume_cfg = cfg | DictDefault(
-            {
-                "resume_from_checkpoint": f"{temp_dir}/checkpoint-30/",
-            }
-        )
-        normalize_config(resume_cfg)
-        cli_args = TrainerCliArgs()
-
-        train(cfg=resume_cfg, cli_args=cli_args, dataset_meta=dataset_meta)
-        assert (Path(temp_dir) / "adapter_model.bin").exists()
-
-        tb_log_path_1 = most_recent_subdir(temp_dir + "/runs")
-        cmd = f"tensorboard --inspect  --logdir {tb_log_path_1}"
-        res = subprocess.run(
-            cmd, shell=True, text=True, capture_output=True, check=True
-        )
-        pattern = r"first_step\s+(\d+)"
-        first_steps = int(re.findall(pattern, res.stdout)[0])
-        assert first_steps == 31
--- a/tests/e2e/utils.py
+++ b/tests/e2e/utils.py
@@ -1,33 +0,0 @@
-"""
-helper utils for tests
-"""
-import os
-import shutil
-import tempfile
-from functools import wraps
-from pathlib import Path
-
-
-def with_temp_dir(test_func):
-    @wraps(test_func)
-    def wrapper(*args, **kwargs):
-        # Create a temporary directory
-        temp_dir = tempfile.mkdtemp()
-        try:
-            # Pass the temporary directory to the test function
-            test_func(*args, temp_dir=temp_dir, **kwargs)
-        finally:
-            # Clean up the directory after the test
-            shutil.rmtree(temp_dir)
-
-    return wrapper
-
-
-def most_recent_subdir(path):
-    base_path = Path(path)
-    subdirectories = [d for d in base_path.iterdir() if d.is_dir()]
-    if not subdirectories:
-        return None
-    subdir = max(subdirectories, key=os.path.getctime)
-
-    return subdir
--- a/tests/fixtures/conversation.tokenized_llama2chat.json
+++ b/tests/fixtures/conversation.tokenized_llama2chat.json
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -1,7 +1,6 @@
 """Module for testing the validation module"""

 import logging
-import os
 import unittest
 from typing import Optional

@@ -9,7 +8,6 @@ import pytest

 from axolotl.utils.config import validate_config
 from axolotl.utils.dict import DictDefault
-from axolotl.utils.wandb_ import setup_wandb_env_vars


 class ValidationTest(unittest.TestCase):
@@ -651,113 +649,3 @@ class ValidationTest(unittest.TestCase):
        )

        validate_config(cfg)
-
-    def test_warmup_step_no_conflict(self):
-        cfg = DictDefault(
-            {
-                "warmup_steps": 10,
-                "warmup_ratio": 0.1,
-            }
-        )
-
-        with pytest.raises(
-            ValueError,
-            match=r".*warmup_steps and warmup_ratio are mutually exclusive*",
-        ):
-            validate_config(cfg)
-
-        cfg = DictDefault(
-            {
-                "warmup_steps": 10,
-            }
-        )
-
-        validate_config(cfg)
-
-        cfg = DictDefault(
-            {
-                "warmup_ratio": 0.1,
-            }
-        )
-
-        validate_config(cfg)
-
-
-class ValidationWandbTest(ValidationTest):
-    """
-    Validation test for wandb
-    """
-
-    def test_wandb_set_run_id_to_name(self):
-        cfg = DictDefault(
-            {
-                "wandb_run_id": "foo",
-            }
-        )
-
-        with self._caplog.at_level(logging.WARNING):
-            validate_config(cfg)
-            assert any(
-                "wandb_run_id sets the ID of the run. If you would like to set the name, please use wandb_name instead."
-                in record.message
-                for record in self._caplog.records
-            )
-
-            assert cfg.wandb_name == "foo" and cfg.wandb_run_id == "foo"
-
-        cfg = DictDefault(
-            {
-                "wandb_name": "foo",
-            }
-        )
-
-        validate_config(cfg)
-
-        assert cfg.wandb_name == "foo" and cfg.wandb_run_id is None
-
-    def test_wandb_sets_env(self):
-        cfg = DictDefault(
-            {
-                "wandb_project": "foo",
-                "wandb_name": "bar",
-                "wandb_run_id": "bat",
-                "wandb_entity": "baz",
-                "wandb_mode": "online",
-                "wandb_watch": "false",
-                "wandb_log_model": "checkpoint",
-            }
-        )
-
-        validate_config(cfg)
-
-        setup_wandb_env_vars(cfg)
-
-        assert os.environ.get("WANDB_PROJECT", "") == "foo"
-        assert os.environ.get("WANDB_NAME", "") == "bar"
-        assert os.environ.get("WANDB_RUN_ID", "") == "bat"
-        assert os.environ.get("WANDB_ENTITY", "") == "baz"
-        assert os.environ.get("WANDB_MODE", "") == "online"
-        assert os.environ.get("WANDB_WATCH", "") == "false"
-        assert os.environ.get("WANDB_LOG_MODEL", "") == "checkpoint"
-        assert os.environ.get("WANDB_DISABLED", "") != "true"
-
-    def test_wandb_set_disabled(self):
-        cfg = DictDefault({})
-
-        validate_config(cfg)
-
-        setup_wandb_env_vars(cfg)
-
-        assert os.environ.get("WANDB_DISABLED", "") == "true"
-
-        cfg = DictDefault(
-            {
-                "wandb_project": "foo",
-            }
-        )
-
-        validate_config(cfg)
-
-        setup_wandb_env_vars(cfg)
-
-        assert os.environ.get("WANDB_DISABLED", "") != "true"