smart resize embeddings

Feat(doc): Add max_steps to readme (#389 )
Feat(config): add max steps (#387 )
2023-08-14 23:44:15 -04:00 · 2023-08-15 00:34:22 +09:00 · 2023-08-14 11:19:29 -04:00 · 2023-08-14 10:59:23 -04:00 · 2023-08-14 07:12:55 -04:00 · 2023-08-14 17:40:40 +09:00
50 changed files with 1600 additions and 197 deletions
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@@ -0,0 +1,13 @@
+# These are supported funding model platforms
+
+github: OpenAccess-AI-Collective # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
+patreon: # Replace with a single Patreon username
+open_collective: # Replace with a single Open Collective username
+ko_fi: # Replace with a single Ko-fi username
+tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
+community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
+liberapay: # Replace with a single Liberapay username
+issuehunt: # Replace with a single IssueHunt username
+otechie: # Replace with a single Otechie username
+lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
+custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
--- a/README.md
+++ b/README.md
@@ -136,7 +136,7 @@ Have dataset(s) in one of the following format (JSONL recommended):
  ```json
  {"instruction": "...", "input": "...", "output": "..."}
  ```
- `sharegpt:chat`: conversations
+- `sharegpt:chat`: conversations where `from` is `human`/`gpt`
  ```json
  {"conversations": [{"from": "...", "value": "..."}]}
  ```
@@ -225,6 +225,10 @@ Have dataset(s) in one of the following format (JSONL recommended):
  ```json
  {"conversations": [{"role": "...", "value": "..."}]}
  ```
+- `sharegpt_simple.load_guanaco`: conversations where `from` is `prompter`/`assistant` instead of default sharegpt
+  ```json
+  {"conversations": [{"from": "...", "value": "..."}]}
+  ```
 - `sharegpt_jokes`: creates a chat where bot is asked to tell a joke, then explain why the joke is funny
  ```json
  {"conversations": [{"title": "...", "text": "...", "explanation": "..."}]}
@@ -322,9 +326,9 @@ tokenizer_type: AutoTokenizer
 trust_remote_code:
 # use_fast option for tokenizer loading from_pretrained, default to True
 tokenizer_use_fast:
-# resize the model embeddings when new tokens are added to multiples of 32
-# this is reported to improve training speed on some models
-resize_token_embeddings_to_32x:
+# resize the model embeddings when new tokens are added to multiples of N
+# multiples of 32 are reported to improve training speed on some models
+resize_token_embeddings_multiple:

 # whether you are training a 4-bit GPTQ quantized model
 gptq: true
@@ -360,6 +364,9 @@ dataset_prepared_path: data/last_run_prepared
 push_dataset_to_hub: # repo path
 # push checkpoints to hub
 hub_model_id: # repo path to push finetuned model
+# how to push checkpoints to hub
+# https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy
+hub_strategy:
 # whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
 # required to be true when used in combination with `push_dataset_to_hub`
 hf_use_auth_token: # boolean
@@ -375,7 +382,14 @@ dataset_shard_idx:
 sequence_len: 2048
 # max sequence length to concatenate training samples together up to
 # inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
+# FutureWarning: This will soon be DEPRECATED
 max_packed_sequence_len: 1024
+# use efficient multi-packing with block diagonal attention and per-sequence position_ids. Recommended to set to 'true'
+sample_packing:
+# you can set these packing optimizations AFTER starting a training run at least once.
+# The trainer will provide recommended values for these settings.
+sample_packing_eff_est:
+total_num_tokens:

 # if you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model
 adapter: lora
@@ -401,11 +415,12 @@ lora_out_dir:
 lora_fan_in_fan_out: false

 # wandb configuration if you're using it
-wandb_mode:
-wandb_project:
+wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb
+wandb_project: # your wandb project name
+wandb_entity: # a wandb Team name if using a Team
 wandb_watch:
-wandb_run_id:
-wandb_log_model: # 'checkpoint'
+wandb_run_id: # set the name of your wandb run
+wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training

 # where to save the finished model to
 output_dir: ./completed-model
@@ -420,13 +435,17 @@ learning_rate: 0.00003
 logging_steps:
 save_steps:
 eval_steps:
+save_total_limit: # maximum number of checkpoints kept at a time
+max_steps:

 # save model as safetensors (require safetensors package)
 save_safetensors:

 # whether to mask out or include the human's prompt from the training labels
 train_on_inputs: false
-# don't use this, leads to wonky training (according to someone on the internet)
+# group similarly sized data to minimize padding
+# may be slower to start, as it must download and sort the entire dataset
+# note that training loss may have an oscillating pattern with this enabled
 group_by_length: false

 # Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
@@ -472,6 +491,10 @@ landmark_attention:
 # xpos RoPE see https://github.com/kaiokendev/cutoff-len-is-context-len/blob/main/util/xpos_rope_llama_monkey_patch.py
 # llama only
 xpos_rope:
+# RoPE Scaling https://github.com/huggingface/transformers/pull/24653
+rope_scaling:
+  type: # linear | dynamic
+  factor: # float

 # resume from a specific checkpoint dir
 resume_from_checkpoint:
@@ -503,6 +526,9 @@ torchdistx_path:
 # Set padding for data collator to 'longest'
 collator_pad_to_longest:

+# Set to an HF dataset (type: 'completion') to stream it instead of pre-tokenizing
+pretraining_dataset:
+
 # Debug mode
 debug:

@@ -522,7 +548,14 @@ Run
 accelerate launch scripts/finetune.py configs/your_config.yml
 ```

-#### Multi-GPU Config
+#### Multi-GPU
+
+You can optionally pre-tokenize the dataset with the following before finetuning:
+```bash
+CUDA_VISIBLE_DEVICES="" accelerate ... --prepare_ds_only
+```
+
+##### Config

 - llama FSDP
 ```yaml
@@ -537,6 +570,18 @@ fsdp_config:

 - llama Deepspeed: append `ACCELERATE_USE_DEEPSPEED=true` in front of finetune command

+##### Weights & Biases Logging
+
+- wandb options
+```yaml
+wandb_mode:
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+```
+
 ### Inference

 Pass the appropriate flag to the train command:
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -40,7 +40,7 @@ ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"

 RUN git clone https://github.com/Dao-AILab/flash-attention.git && \
    cd flash-attention && \
-    git checkout v2.0.1  && \
+    git checkout v2.0.4  && \
    python3 setup.py bdist_wheel && \
    cd csrc/fused_dense_lib && \
    python3 setup.py bdist_wheel && \
--- a/examples/cerebras/qlora.yml
+++ b/examples/cerebras/qlora.yml
@@ -23,6 +23,7 @@ lora_target_modules:
 lora_target_linear:
 lora_fan_in_fan_out:
 wandb_project:
+wandb_entity:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
@@ -35,7 +36,7 @@ torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.0002
 train_on_inputs: false
-group_by_length: true
+group_by_length: false
 bf16: true
 fp16: false
 tf32: true
--- a/examples/falcon/config-7b-lora.yml
+++ b/examples/falcon/config-7b-lora.yml
@@ -24,6 +24,7 @@ lora_target_modules:
 lora_target_linear: true
 lora_fan_in_fan_out:
 wandb_project:
+wandb_entity:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
--- a/examples/falcon/config-7b-qlora.yml
+++ b/examples/falcon/config-7b-qlora.yml
@@ -38,6 +38,7 @@ lora_target_linear: true
 lora_fan_in_fan_out:

 wandb_project:
+wandb_entity:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
--- a/examples/falcon/config-7b.yml
+++ b/examples/falcon/config-7b.yml
@@ -24,6 +24,7 @@ lora_target_modules:
 lora_target_linear: true
 lora_fan_in_fan_out:
 wandb_project:
+wandb_entity:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
--- a/examples/gptj/qlora.yml
+++ b/examples/gptj/qlora.yml
@@ -20,6 +20,7 @@ lora_target_modules:
 lora_target_linear: true
 lora_fan_in_fan_out:
 wandb_project:
+wandb_entity:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
@@ -32,7 +33,7 @@ torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.0001
 train_on_inputs: false
-group_by_length: true
+group_by_length: false
 bf16: true
 fp16: false
 tf32: true
--- a/examples/gptq-lora-7b/config.yml
+++ b/examples/gptq-lora-7b/config.yml
@@ -22,6 +22,7 @@ lora_target_modules:
  - v_proj
 lora_fan_in_fan_out: false
 wandb_project: llama-7b-lora-int4
+wandb_entity:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
--- a/examples/jeopardy-bot/config.yml
+++ b/examples/jeopardy-bot/config.yml
@@ -18,6 +18,7 @@ lora_dropout:
 lora_target_modules:
 lora_fan_in_fan_out: false
 wandb_project:
+wandb_entity:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
--- a/examples/llama-2/lora.yml
+++ b/examples/llama-2/lora.yml
@@ -15,7 +15,7 @@ val_set_size: 0.01
 output_dir: ./lora-out

 sequence_len: 4096
-max_packed_sequence_len: 4096
+sample_packing: true

 adapter: lora
 lora_model_dir:
@@ -26,6 +26,7 @@ lora_target_linear: true
 lora_fan_in_fan_out:

 wandb_project:
+wandb_entity:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
@@ -38,7 +39,7 @@ lr_scheduler: cosine
 learning_rate: 0.0002

 train_on_inputs: false
-group_by_length: true
+group_by_length: false
 bf16: true
 fp16: false
 tf32: false
@@ -48,8 +49,8 @@ early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
-xformers_attention: true
-flash_attention:
+xformers_attention:
+flash_attention: true

 warmup_steps: 10
 eval_steps: 20
@@ -63,4 +64,3 @@ special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"
-  pad_token: "<pad>"
--- a/examples/llama-2/qlora.yml
+++ b/examples/llama-2/qlora.yml
@@ -18,7 +18,8 @@ adapter: qlora
 lora_model_dir:

 sequence_len: 4096
-max_packed_sequence_len: 4096
+sample_packing: true
+
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
@@ -27,6 +28,7 @@ lora_target_linear: true
 lora_fan_in_fan_out:

 wandb_project:
+wandb_entity:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
@@ -39,7 +41,7 @@ lr_scheduler: cosine
 learning_rate: 0.0002

 train_on_inputs: false
-group_by_length: true
+group_by_length: false
 bf16: true
 fp16: false
 tf32: false
@@ -49,8 +51,8 @@ early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
-xformers_attention: true
-flash_attention:
+xformers_attention:
+flash_attention: true

 warmup_steps: 10
 eval_steps: 20
@@ -64,4 +66,3 @@ special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"
-  pad_token: "<pad>"
--- a/examples/mpt-7b/config.yml
+++ b/examples/mpt-7b/config.yml
@@ -20,6 +20,7 @@ lora_target_modules:
  - v_proj
 lora_fan_in_fan_out: false
 wandb_project: mpt-alpaca-7b
+wandb_entity:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
--- a/examples/openllama-3b/config.yml
+++ b/examples/openllama-3b/config.yml
@@ -22,6 +22,7 @@ lora_target_modules:
 lora_target_linear:
 lora_fan_in_fan_out:
 wandb_project:
+wandb_entity:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
--- a/examples/openllama-3b/lora.yml
+++ b/examples/openllama-3b/lora.yml
@@ -28,6 +28,7 @@ lora_target_modules:
  - o_proj
 lora_fan_in_fan_out:
 wandb_project:
+wandb_entity:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
--- a/examples/openllama-3b/qlora.yml
+++ b/examples/openllama-3b/qlora.yml
@@ -22,6 +22,7 @@ lora_target_modules:
 lora_target_linear: true
 lora_fan_in_fan_out:
 wandb_project:
+wandb_entity:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
@@ -34,7 +35,7 @@ torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.0002
 train_on_inputs: false
-group_by_length: true
+group_by_length: false
 bf16: true
 fp16: false
 tf32: true
--- a/examples/pythia-12b/config.yml
+++ b/examples/pythia-12b/config.yml
@@ -23,6 +23,7 @@ lora_target_modules:
 lora_target_linear: true
 lora_fan_in_fan_out: true  # pythia/GPTNeoX lora specific
 wandb_project:
+wandb_entity:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
--- a/examples/pythia/lora.yml
+++ b/examples/pythia/lora.yml
@@ -17,6 +17,7 @@ lora_target_modules:
 lora_target_linear:
 lora_fan_in_fan_out: true  # pythia/GPTNeoX lora specific
 wandb_project:
+wandb_entity:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
--- a/examples/redpajama/config-3b.yml
+++ b/examples/redpajama/config-3b.yml
@@ -21,6 +21,7 @@ lora_target_modules:
  - v_proj
 lora_fan_in_fan_out: false
 wandb_project: redpajama-alpaca-3b
+wandb_entity:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
--- a/examples/replit-3b/config-lora.yml
+++ b/examples/replit-3b/config-lora.yml
@@ -20,6 +20,7 @@ lora_target_modules:
  - mlp_down
 lora_fan_in_fan_out:
 wandb_project: lora-replit
+wandb_entity:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
--- a/examples/xgen-7b/xgen-7b-8k-qlora.yml
+++ b/examples/xgen-7b/xgen-7b-8k-qlora.yml
@@ -37,6 +37,7 @@ lora_target_linear: true
 lora_fan_in_fan_out:

 wandb_project:
+wandb_entity:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 peft @ git+https://github.com/huggingface/peft.git
 transformers @ git+https://github.com/huggingface/transformers.git
-bitsandbytes>=0.39.0
+bitsandbytes>=0.41.1
 accelerate @ git+https://github.com/huggingface/accelerate@2a289f6108e77a77a4efffb3f6316bc98538413b
 addict
 fire
@@ -13,9 +13,12 @@ einops
 xformers
 optimum
 hf_transfer
+numba
+numpy==1.24.4
 # qlora things
 bert-score==0.3.13
 evaluate==0.4.0
 rouge-score==0.1.2
 scipy
 scikit-learn==1.2.2
+pynvml
--- a/scripts/finetune.py
+++ b/scripts/finetune.py
@@ -18,12 +18,17 @@ from optimum.bettertransformer import BetterTransformer
 from transformers import GenerationConfig, TextStreamer

 from axolotl.logging_config import configure_logging
+from axolotl.utils.config import normalize_config, validate_config
 from axolotl.utils.data import load_prepare_datasets, load_pretraining_dataset
 from axolotl.utils.dict import DictDefault
+from axolotl.utils.distributed import barrier, is_main_process
 from axolotl.utils.models import load_model, load_tokenizer
 from axolotl.utils.tokenization import check_dataset_labels
-from axolotl.utils.trainer import setup_trainer
-from axolotl.utils.validation import validate_config
+from axolotl.utils.trainer import (
+    calculate_total_num_steps,
+    process_datasets_for_packing,
+    setup_trainer,
+)
 from axolotl.utils.wandb import setup_wandb_env_vars

 project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
@@ -38,27 +43,6 @@ DEFAULT_DATASET_PREPARED_PATH = "last_run_prepared"
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"


-def choose_device(cfg):
-    def get_device():
-        try:
-            if torch.cuda.is_available():
-                return f"cuda:{cfg.local_rank}"
-
-            if torch.backends.mps.is_available():
-                return "mps"
-
-            raise SystemError("No CUDA/mps device found")
-        except Exception:  # pylint: disable=broad-exception-caught
-            return "cpu"
-
-    cfg.device = get_device()
-    if cfg.device_map != "auto":
-        if cfg.device.startswith("cuda"):
-            cfg.device_map = {"": cfg.local_rank}
-        else:
-            cfg.device_map = {"": cfg.device}
-
-
 def get_multi_line_input() -> Optional[str]:
    print("Give me an instruction (Ctrl + D to finish): ")
    instruction = ""
@@ -188,36 +172,13 @@ def train(

    validate_config(cfg)

-    # setup some derived config / hyperparams
-    cfg.gradient_accumulation_steps = cfg.gradient_accumulation_steps or (
-        cfg.batch_size // cfg.micro_batch_size
-    )
-    cfg.batch_size = (
-        cfg.batch_size or cfg.micro_batch_size * cfg.gradient_accumulation_steps
-    )
-    cfg.world_size = int(os.environ.get("WORLD_SIZE", 1))
-    cfg.local_rank = int(os.environ.get("LOCAL_RANK", 0))
-    choose_device(cfg)
-    cfg.ddp = cfg.ddp if cfg.ddp is not None else cfg.world_size != 1
-    if cfg.ddp:
-        cfg.device_map = {"": int(os.environ.get("LOCAL_RANK", 0))}
-        cfg.batch_size = cfg.batch_size * cfg.world_size
+    normalize_config(cfg)

    setup_wandb_env_vars(cfg)
-    if cfg.device == "mps":
-        cfg.load_in_8bit = False
-        cfg.tf32 = False
-        if cfg.bf16:
-            cfg.fp16 = True
-        cfg.bf16 = False
-
-    if cfg.tf32:
-        torch.backends.cuda.matmul.allow_tf32 = True

    # load the tokenizer first
-    tokenizer_config = cfg.tokenizer_config or cfg.base_model_config
-    LOG.info(f"loading tokenizer... {tokenizer_config}")
-    tokenizer = load_tokenizer(tokenizer_config, cfg.tokenizer_type, cfg)
+    LOG.info(f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}")
+    tokenizer = load_tokenizer(cfg)

    if (
        check_not_in(["shard", "merge_lora"], kwargs) and not cfg.inference
@@ -231,12 +192,31 @@ def train(
                cfg.pretraining_dataset,
                tokenizer,
                max_tokens=cfg.sequence_len,
-                seed=cfg.seed,
+                seed=cfg.seed or 42,
            )
            # https://discuss.huggingface.co/t/how-to-use-huggingface-trainer-streaming-datasets-without-wrapping-it-with-torchdatas-iterablewrapper/25230
            train_dataset = train_dataset.with_format("torch")
            eval_dataset = None

+        if is_main_process():
+            # process on rank 0 first so it gets cached so other ranks load from cache
+            train_dataset, eval_dataset = process_datasets_for_packing(
+                cfg, train_dataset, eval_dataset
+            )
+        barrier()
+        if not is_main_process():
+            train_dataset, eval_dataset = process_datasets_for_packing(
+                cfg, train_dataset, eval_dataset
+            )
+        barrier()
+        if cfg.max_steps:
+            total_num_steps = min(
+                calculate_total_num_steps(cfg, train_dataset, tokenizer), cfg.max_steps
+            )
+            LOG.info(f"Maximum number of steps set at {total_num_steps}")
+        else:
+            total_num_steps = calculate_total_num_steps(cfg, train_dataset, tokenizer)
+
    if cfg.debug or "debug" in kwargs:
        LOG.info("check_dataset_labels...")
        check_dataset_labels(
@@ -251,15 +231,10 @@ def train(
        return

    # Load the model and tokenizer
-    LOG.info("loading model and peft_config...")
-    model, peft_config = load_model(
-        cfg.base_model,
-        cfg.base_model_config,
-        cfg.model_type,
-        tokenizer,
-        cfg,
-        adapter=cfg.adapter,
-    )
+    LOG.info("loading model and (optionally) peft_config...")
+    model, peft_config = load_model(cfg, tokenizer)
+
+    safe_serialization = cfg.save_safetensors is True

    if "merge_lora" in kwargs and cfg.adapter is not None:
        LOG.info("running merge of LoRA with base model")
@@ -268,7 +243,11 @@ def train(

        if cfg.local_rank == 0:
            LOG.info("saving merged model")
-            model.save_pretrained(str(Path(cfg.output_dir) / "merged"))
+            model.save_pretrained(
+                str(Path(cfg.output_dir) / "merged"),
+                safe_serialization=safe_serialization,
+            )
+            tokenizer.save_pretrained(str(Path(cfg.output_dir) / "merged"))
        return

    if cfg.inference:
@@ -283,10 +262,12 @@ def train(
        return

    if "shard" in kwargs:
-        model.save_pretrained(cfg.output_dir)
+        model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
        return

-    trainer = setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer)
+    trainer = setup_trainer(
+        cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps
+    )

    model.config.use_cache = False

@@ -305,7 +286,7 @@ def train(
        def terminate_handler(_, __, model):
            if cfg.flash_optimum:
                model = BetterTransformer.reverse(model)
-            model.save_pretrained(cfg.output_dir)
+            model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
            sys.exit(0)

        signal.signal(
@@ -332,6 +313,7 @@ def train(

    if not Path(cfg.output_dir).is_dir():
        os.makedirs(cfg.output_dir, exist_ok=True)
+    tokenizer.save_pretrained(cfg.output_dir)
    if cfg.flash_optimum:
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=True, enable_mem_efficient=True
@@ -345,13 +327,11 @@ def train(
    # TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading
    # only save on rank 0, otherwise it corrupts output on multi-GPU when multiple processes attempt to write the same file
    if cfg.fsdp:
-        model.save_pretrained(cfg.output_dir)
+        trainer.save_model(cfg.output_dir)
    elif cfg.local_rank == 0:
        if cfg.flash_optimum:
            model = BetterTransformer.reverse(model)
-        model.save_pretrained(cfg.output_dir)
-
-    # trainer.save_model(cfg.output_dir)  # TODO this may be needed for deepspeed to work? need to review another time
+        model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)


 if __name__ == "__main__":
--- a/src/axolotl/datasets.py
+++ b/src/axolotl/datasets.py
@@ -5,7 +5,7 @@ import os
 from typing import List

 import torch
-from datasets import IterableDataset
+from datasets import Dataset, IterableDataset

 from .prompt_tokenizers import PromptTokenizingStrategy

@@ -18,9 +18,9 @@ from .prompt_tokenizers import PromptTokenizingStrategy
 LOG = logging.getLogger("axolotl")


-class TokenizedPromptDataset(IterableDataset):
+class TokenizedPromptDataset(Dataset):
    """
-    Iterable dataset that returns tokenized prompts from a stream of text files.
+    Dataset that returns tokenized prompts from a stream of text files.
        Args:
            prompt_tokenizer (PromptTokenizingStrategy): The prompt tokenizing method for proccessing the data.
            dataset (dataset.Dataset): Dataset with text files.
@@ -30,19 +30,18 @@ class TokenizedPromptDataset(IterableDataset):
        self,
        prompt_tokenizer: PromptTokenizingStrategy,
        dataset: IterableDataset,
+        **kwargs,
    ):
        self.prompt_tokenizer = prompt_tokenizer
-        self.dataset = dataset
+        super().__init__(self.process(dataset).data, **kwargs)

-    def __iter__(self):
-        features = self.dataset.features.keys()
-        num_proc = os.cpu_count()
-        return iter(
-            self.dataset.map(
-                self.prompt_tokenizer.tokenize_prompt,
-                num_proc=num_proc,
-                remove_columns=features,
-            )
+    def process(self, dataset):
+        features = dataset.features.keys()
+        num_proc = min(64, os.cpu_count())
+        return dataset.map(
+            self.prompt_tokenizer.tokenize_prompt,
+            num_proc=num_proc,
+            remove_columns=features,
        )


@@ -77,14 +76,21 @@ class ConstantLengthDataset(IterableDataset):
            self.tokens_dtype = torch.int64

    def __iter__(self):
-        buffer = {"input_ids": [], "attention_mask": [], "labels": []}
+        buffer = {
+            "input_ids": [],
+            "attention_mask": [],
+            "labels": [],
+            "position_ids": [],
+        }
        buffer_len = 0
        for dataset in self.datasets:
+            idx = 0
            iterator = iter(dataset)
            more_examples = True
            while more_examples:
                try:
                    example = next(iterator)
+                    idx += 1
                except StopIteration:
                    more_examples = False
                    example = None
@@ -106,6 +112,9 @@ class ConstantLengthDataset(IterableDataset):
                        attention_mask = torch.cat(buffer["attention_mask"], dim=-1)[
                            : self.seq_length
                        ]
+                        position_ids = torch.cat(buffer["position_ids"], dim=-1)[
+                            : self.seq_length
+                        ]
                        labels = torch.cat(buffer["labels"], dim=-1)[: self.seq_length]
                        if labels.size() == input_ids.size() and (
                            attention_mask.size() == input_ids.size()
@@ -114,6 +123,7 @@ class ConstantLengthDataset(IterableDataset):
                                "input_ids": input_ids,
                                "labels": labels,
                                "attention_mask": attention_mask,
+                                "position_ids": position_ids,
                            }
                        else:
                            LOG.warning(
@@ -123,8 +133,10 @@ class ConstantLengthDataset(IterableDataset):
                        "input_ids": [],
                        "attention_mask": [],
                        "labels": [],
+                        "position_ids": [],
                    }
                    buffer_len = 0
+                    idx = 1

                if example:
                    # FIXME
@@ -133,11 +145,6 @@ class ConstantLengthDataset(IterableDataset):
                        input_ids = example["input_ids"]
                        attention_mask = example["attention_mask"]
                        labels = example["labels"]
-                        if (
-                            buffer["input_ids"]
-                            and input_ids[0] == self.tokenizer.bos_token_id
-                        ):
-                            attention_mask[0] = 0

                        if add_concat_token:
                            input_ids.append(self.concat_token_id)
@@ -148,13 +155,17 @@ class ConstantLengthDataset(IterableDataset):
                            input_ids, dtype=self.tokens_dtype
                        )
                        attention_mask_with_concat = torch.tensor(
-                            attention_mask, dtype=self.tokens_dtype
+                            [idx * m for m in attention_mask], dtype=torch.int16
                        )
                        labels_with_concat = torch.tensor(
                            labels, dtype=self.tokens_dtype
                        )
+                        position_ids = torch.arange(
+                            len(input_ids), dtype=self.tokens_dtype
+                        )

                        buffer["input_ids"].append(input_ids_with_concat)
                        buffer["attention_mask"].append(attention_mask_with_concat)
                        buffer["labels"].append(labels_with_concat)
+                        buffer["position_ids"].append(position_ids)
                        buffer_len += len(input_ids)
--- a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
@@ -8,9 +8,18 @@ import torch
 import transformers
 from einops import rearrange
 from flash_attn.bert_padding import pad_input, unpad_input
-from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
+
+try:
+    from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
+except ImportError:
+    from flash_attn.flash_attn_interface import (
+        flash_attn_unpadded_qkvpacked_func as flash_attn_varlen_qkvpacked_func,
+    )
+
 from transformers.models.llama.modeling_llama import apply_rotary_pos_emb

+from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids
+

 def forward(
    self,
@@ -79,6 +88,16 @@ def forward(
            dtype=torch.int32,
            device=qkv.device,
        )
+        output = flash_attn_varlen_qkvpacked_func(
+            qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
+        )
+        output = rearrange(output, "(b s) ... -> b s ...", b=bsz)
+    elif attention_mask.shape[0] == 1:
+        # special handling using sample packing
+        qkv = rearrange(qkv, "b s ... -> (b s) ...")
+        cu_q_lens, max_s = get_cu_seqlens_from_pos_ids(position_ids)
+        cu_q_lens = cu_q_lens.squeeze()
+
        output = flash_attn_varlen_qkvpacked_func(
            qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
        )
@@ -113,6 +132,7 @@ def forward(
            "b s (h d) -> b s h d",
            h=nheads,
        )
+
    return (
        self.o_proj(rearrange(output, "b s h d -> b s (h d)")),
        None,
--- a/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py
@@ -128,6 +128,7 @@ def xformers_forward(
                query_states,
                key_states,
                value_states,
+                # attn_bias=attention_mask,
                attn_bias=xformers.ops.LowerTriangularMask(),
            )
        attn_weights = None
--- a/src/axolotl/monkeypatch/llama_expand_mask.py
+++ b/src/axolotl/monkeypatch/llama_expand_mask.py
@@ -0,0 +1,52 @@
+"""
+expands the binary attention mask per 3.2.2 of https://arxiv.org/pdf/2107.02027.pdf
+"""
+from typing import Optional
+
+import torch
+
+
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    This expansion handles packed sequences so that sequences share the same attention mask integer value
+    when they attend to each other within that sequence.
+    This expansion transforms the mask to lower triangular form to prevent future peeking.
+    """
+    bsz, src_len = mask.size()
+    tgt_len = tgt_len if tgt_len is not None else src_len
+
+    mask = mask.unsqueeze(1).unsqueeze(2)
+    mask = mask.expand(bsz, 1, tgt_len, src_len)
+
+    # Create a binary mask from the original mask where zeros remain zeros and all other values are set to one
+    binary_mask = torch.where(
+        mask != 0,
+        torch.tensor(1).to(dtype),
+        torch.tensor(0).to(dtype),
+    )
+
+    # Create a block-diagonal mask.
+    # we multiply by the binary mask so that 0's in the original mask are correctly excluded
+    zero_one_mask = torch.eq(mask, mask.transpose(-1, -2)).int() * binary_mask
+
+    # Now let's create a lower triangular mask of ones that will zero out the upper triangular part
+    lower_triangular_ones = torch.tril(torch.ones((tgt_len, src_len), dtype=dtype)).to(
+        mask.device
+    )
+
+    # Use the lower triangular mask to zero out the upper triangular part of the zero_one_mask
+    masked_zero_one_mask = zero_one_mask * lower_triangular_ones
+    inverted_mask = 1.0 - masked_zero_one_mask
+
+    return inverted_mask.masked_fill(
+        inverted_mask.to(torch.bool), torch.finfo(dtype).min
+    )
+
+
+def hijack_expand_mask():
+    import transformers
+
+    transformers.models.llama.modeling_llama._expand_mask = (  # pylint: disable=protected-access
+        _expand_mask
+    )
--- a/src/axolotl/monkeypatch/utils.py
+++ b/src/axolotl/monkeypatch/utils.py
@@ -0,0 +1,103 @@
+"""
+Shared utils for the monkeypatches
+"""
+import torch
+
+
+def get_cu_seqlens(attn_mask):
+    """
+    generate a cumulative sequence length mask for flash attention using attn mask
+
+    Each row of `attn_mask` is scanned for runs of equal non-zero values; every run is one sequence.
+    A 1-D input is treated as a batch with a single row.
+
+    Returns a tuple `(cu_seqlens, max_seq_lens)`: the per-row cumulative sequence lengths stacked into
+    one int32 tensor, and the per-row longest segment length stacked into one tensor.
+    Because the rows are combined with `torch.stack`, every row has to produce the same number of segments.
+    """
+    if len(attn_mask.shape) == 1:
+        attn_mask = attn_mask.unsqueeze(0)
+
+    device = attn_mask.device
+    results = []
+    max_seq_lens = []
+
+    for row in attn_mask:
+        # Exclude zeros to avoid adding their positions to the mask
+        t_non_zeros = row[row != 0]
+        # Find where the sequence number changes (including the first position)
+        seq_change = torch.cat(
+            [
+                torch.tensor([1], dtype=torch.int32, device=device),
+                t_non_zeros[1:] != t_non_zeros[:-1],
+            ]
+        )
+        # Get the indices where the sequence changes
+        # (the number of non-zero entries is appended as the end of the last sequence)
+        change_indices = torch.cat(
+            [
+                (seq_change == 1).nonzero(as_tuple=True)[0],
+                torch.tensor([len(t_non_zeros)], dtype=torch.int32, device=device),
+            ]
+        )
+        # Calculate the sequence lengths
+        seq_lengths = change_indices[1:] - change_indices[:-1]
+        # Calculate the length of the final sequence or padding
+        # (row length minus the number of non-zero entries, i.e. how many zeros the row contains)
+        final_seq_length = len(row) - change_indices[-1]
+        # Append the length of the final sequence or padding to seq_lengths
+        # NOTE(review): all zeros are counted as one trailing segment - assumes zeros only occur at the end of a row
+        if final_seq_length.item():
+            seq_lengths = torch.cat(
+                [
+                    seq_lengths,
+                    torch.tensor(
+                        [final_seq_length.item()], dtype=torch.int32, device=device
+                    ),
+                ]
+            )
+        # Calculate the cumulative sequence lengths
+        cu_seqlens = torch.cat(
+            [torch.tensor([0], dtype=torch.int32, device=device), seq_lengths.cumsum(0)]
+        )
+        # longest segment of this row (the zero segment, if any, is included)
+        max_seq_len = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
+        results.append(cu_seqlens)
+        max_seq_lens.append(max_seq_len)
+
+    return torch.stack(results).to(dtype=torch.int32), torch.stack(max_seq_lens)
+
+
+def get_cu_seqlens_from_pos_ids(position_ids):
+    """
+    generate a cumulative sequence length mask for flash attention using pos ids
+
+    A new sequence is assumed to start wherever a position id is 0; the run of zeros at the end of a
+    row is treated as padding. A 1-D input is treated as a batch with a single row.
+
+    Returns a tuple `(cu_seqlens, max_seq_lens)`: the per-row cumulative sequence lengths stacked into
+    one int32 tensor, and the per-row longest segment length stacked into one tensor.
+    Because the rows are combined with `torch.stack`, every row has to produce the same number of segments.
+    """
+    if len(position_ids.shape) == 1:
+        position_ids = position_ids.unsqueeze(0)
+
+    device = position_ids.device
+    results = []
+    max_seq_lens = []
+
+    for row in position_ids:
+        # Count the number of consecutive zeros from the right side
+        # (the cumprod of the flipped zero-indicator stays 1 only until the first non-zero entry)
+        padding_length = (row == 0).int().flip(dims=[0]).cumprod(dim=0).sum().item()
+
+        # Adjust the row to exclude padding
+        adjusted_row = row[:-padding_length] if padding_length else row.clone()
+
+        # Find where the position resets to 0 (indicating a new sequence)
+        # the first position always counts as a start
+        seq_starts = torch.cat(
+            [
+                torch.tensor([True], dtype=torch.bool, device=device),
+                adjusted_row[1:] == 0,
+            ]
+        )
+        # Get the indices where the sequence starts
+        # (the unpadded length is appended as the end of the last sequence)
+        start_indices = torch.cat(
+            [
+                (seq_starts).nonzero(as_tuple=True)[0],
+                torch.tensor([len(adjusted_row)], dtype=torch.int32, device=device),
+            ]
+        )
+        # Calculate the sequence lengths
+        seq_lengths = start_indices[1:] - start_indices[:-1]
+        # Calculate the cumulative sequence lengths
+        cu_seqlens = torch.cat(
+            [torch.tensor([0], dtype=torch.int32, device=device), seq_lengths.cumsum(0)]
+        )
+        # Append the padding length to the cumulative sequence lengths
+        # (the padding becomes one extra segment that ends at the full row length)
+        if padding_length:
+            cu_seqlens = torch.cat(
+                [cu_seqlens, torch.tensor([len(row)], dtype=torch.int32, device=device)]
+            )
+        # longest segment of this row (the padding segment, if any, is included)
+        max_seq_len = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
+        results.append(cu_seqlens)
+        max_seq_lens.append(max_seq_len)
+
+    return torch.stack(results).to(dtype=torch.int32), torch.stack(max_seq_lens)
--- a/src/axolotl/prompt_strategies/alpaca_w_system.py
+++ b/src/axolotl/prompt_strategies/alpaca_w_system.py
@@ -66,7 +66,11 @@ class SystemDataPrompter(AlpacaPrompter):
    ) -> Generator[str, None, None]:
        # returns the full prompt from instruction and optional input
        # if a label (=response, =output) is provided, it's also appended.
-        formatted_sys_prompt = f"### System:\n{system}\n\n" if system else ""
+        formatted_sys_prompt = (
+            self.system_format.format(system=system)
+            if system and self.system_format
+            else ""
+        )
        if input:
            res = formatted_sys_prompt + self.turn_format.format(
                instruction=instruction, input=input
@@ -86,12 +90,20 @@ class OpenOrcaSystemDataPrompter(SystemDataPrompter):
    """

    def match_prompt_style(self):
+        """Set the turn templates and the system template for the configured prompt style."""
+        # pylint: disable=duplicate-code
        if self.prompt_style == PromptStyle.INSTRUCT.value:
            self.turn_format = "### User:\n{instruction}\n\n### Additional Context:\n{input}\n\n### Assistant:\n"
            self.turn_no_input_format = "### User:\n{instruction}\n\n### Assistant:\n"
+            # the system prompt is now rendered through self.system_format, so the instruct style
+            # has to define it too; this is the header that used to be hard-coded for every style
+            self.system_format = "### System:\n{system}\n\n"
        if self.prompt_style == PromptStyle.CHAT.value:
            self.turn_format = "USER: {instruction}\n{input}\nASSISTANT:"
            self.turn_no_input_format = "USER: {instruction}\nASSISTANT:"
+            self.system_format = "SYSTEM: {system}\n"
+        if self.prompt_style == PromptStyle.CHATML.value:
+            self.turn_format = "<|im_start|>user\n{instruction}\n{input}<|im_end|>\n<|im_start|>assistant\n"
+            self.turn_no_input_format = (
+                "<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"
+            )
+            self.system_format = "<|im_start|>system\n{system}<|im_end|>\n"


 class OpenOrcaPromptTokenizingStrategy(InstructionWSystemPromptTokenizingStrategy):
@@ -137,3 +149,12 @@ def load_open_orca(tokenizer, cfg):
        cfg.train_on_inputs,
        cfg.sequence_len,
    )
+
+
+def load_open_orca_chatml(tokenizer, cfg):
+    """Create the OpenOrca tokenizing strategy with a prompter that uses the ChatML prompt style."""
+    chatml_prompter = OpenOrcaSystemDataPrompter(PromptStyle.CHATML.value)
+    return OpenOrcaPromptTokenizingStrategy(
+        chatml_prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
+    )
--- a/src/axolotl/prompt_strategies/llama2_chat.py
+++ b/src/axolotl/prompt_strategies/llama2_chat.py
@@ -29,7 +29,7 @@ from dataclasses import dataclass, field
 from typing import Generator, List, Sequence

 from axolotl.prompt_tokenizers import PromptTokenizingStrategy
-from axolotl.prompters import IGNORE_TOKEN_ID
+from axolotl.prompters import IGNORE_TOKEN_ID, SHAREGPT_ASSERTION_FAILED_ROLE


@dataclass
@@ -190,7 +190,7 @@ class Llama2ChatPrompter:  # pylint: disable=too-few-public-methods
        conv.messages = []  # pylint: disable=R0801
        for j, sentence in enumerate(source):
            role = roles[sentence["from"]]
-            assert role == conv.roles[j % 2]
+            assert role == conv.roles[j % 2], SHAREGPT_ASSERTION_FAILED_ROLE
            if sentence["value"]:
                conv.append_message(role, sentence["value"])
        yield conv
--- a/src/axolotl/prompters.py
+++ b/src/axolotl/prompters.py
@@ -16,6 +16,7 @@ class PromptStyle(Enum):

    INSTRUCT = "instruct"
    CHAT = "chat"
+    CHATML = "chatml"


 class AlpacaPrompter:
@@ -25,6 +26,7 @@ class AlpacaPrompter:

    system_prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n"
    system_no_input_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
+    system_format: str
    turn_format: str
    turn_no_input_format: str
    prompt_style: Optional[PromptStyle] = None
@@ -34,14 +36,23 @@ class AlpacaPrompter:
        self.match_prompt_style()

    def match_prompt_style(self):
+        """
+        Set `turn_format`, `turn_no_input_format` and `system_format` for the configured prompt style.
+        Nothing is assigned when `prompt_style` matches none of the three known styles.
+        """
+        # pylint: disable=duplicate-code
        if self.prompt_style == PromptStyle.INSTRUCT.value:
            self.turn_format = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
            self.turn_no_input_format = (
                "### Instruction:\n{instruction}\n\n### Response:\n"
            )
+            self.system_format = "### System:\n{system}\n\n"
        if self.prompt_style == PromptStyle.CHAT.value:
            self.turn_format = "USER: {instruction}\n{input}\nASSISTANT:"
            self.turn_no_input_format = "USER: {instruction}\nASSISTANT:"
+            self.system_format = "SYSTEM: {system}\n"
+        if self.prompt_style == PromptStyle.CHATML.value:
+            self.turn_format = "<|im_start|>user\n{instruction}\n{input}<|im_end|>\n<|im_start|>assistant\n"
+            self.turn_no_input_format = (
+                "<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"
+            )
+            self.system_format = "<|im_start|>system\n{system}<|im_end|>\n"

    def build_prompt(
        self,
@@ -260,6 +271,11 @@ class Conversation:
        self.messages.append([role, message])


+# assertion message used when the roles of a sharegpt conversation do not alternate turn by turn
+SHAREGPT_ASSERTION_FAILED_ROLE = (
+    "Role did not alternate between turns (gpt and human). Please check your data."
+)
+
+
 class ShareGPTPrompter:  # pylint: disable=too-few-public-methods
    """
    A prompter that generates prompts for the ShareGPT
@@ -296,7 +312,9 @@ class ShareGPTPrompter:  # pylint: disable=too-few-public-methods
        if len(source) < 2:
            # If there isn't a back and forth conversation, ignore it
            # also happens on the data splitting leaving empty conversations
-            raise IndexError
+            raise IndexError(
+                f"A conversation entry has less than 2 messages :\n{source}"
+            )

        conv = self._conversation.copy()
        roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
@@ -316,7 +334,7 @@ class ShareGPTPrompter:  # pylint: disable=too-few-public-methods
        conv.messages = []
        for j, sentence in enumerate(source):
            role = roles[sentence["from"]]
-            assert role == conv.roles[j % 2]
+            assert role == conv.roles[j % 2], SHAREGPT_ASSERTION_FAILED_ROLE
            conv.append_message(role, sentence["value"])

        for part in conv.get_prompt():
--- a/src/axolotl/utils/bench.py
+++ b/src/axolotl/utils/bench.py
@@ -0,0 +1,43 @@
+"""Benchmarking and measurement utilities"""
+
+import pynvml
+import torch
+
+
+def gpu_memory_usage(device=0):
+    """Return the memory torch reports as allocated on `device`, in GiB"""
+    allocated_bytes = torch.cuda.memory_allocated(device)
+    return allocated_bytes / 1024.0**3
+
+
+def gpu_memory_usage_all(device=0):
+    """
+    Return a 3-tuple of GiB values for `device`:
+    memory allocated by torch, memory reserved by torch beyond what is allocated, and
+    memory reported as used by NVML beyond what torch has reserved (never negative)
+    """
+    allocated = torch.cuda.memory_allocated(device) / 1024.0**3
+    reserved = torch.cuda.memory_reserved(device) / 1024.0**3
+    nvml_used = gpu_memory_usage_smi(device)
+    cache = reserved - allocated
+    misc = max(0, nvml_used - reserved)
+    return allocated, cache, misc
+
+
+def gpu_memory_usage_smi(device=0):
+    """
+    Return the used memory that NVML reports for `device`, in GiB
+    `device` may be an index, a `torch.device` or a string of the form "cuda:<index>"
+    """
+    index = device.index if isinstance(device, torch.device) else device
+    if isinstance(index, str) and index.startswith("cuda:"):
+        # keep only the number after the "cuda:" prefix
+        index = int(index[5:])
+
+    pynvml.nvmlInit()
+    memory = pynvml.nvmlDeviceGetMemoryInfo(pynvml.nvmlDeviceGetHandleByIndex(index))
+    return memory.used / 1024.0**3
+
+
+def log_gpu_memory_usage(log, msg, device):
+    """
+    Log the GPU memory usage of `device` through `log` and return it as `(usage, cache, misc)` in GiB
+    Returns `(0, 0, 0)` without logging when CUDA is not available
+    """
+    if not torch.cuda.is_available():
+        return (0, 0, 0)
+
+    usage, cache, misc = gpu_memory_usage_all(device)
+    # only the parts that are greater than zero are listed inside the parentheses
+    extras = []
+    for amount, text in (
+        (cache, f"+{cache:.03f}GB cache"),
+        (misc, f"+{misc:.03f}GB misc"),
+    ):
+        if amount > 0:
+            extras.append(text)
+    # stacklevel=2 attributes the record to the caller of this function
+    log.info(
+        f"GPU memory usage {msg}: {usage:.03f}GB ({', '.join(extras)})", stacklevel=2
+    )
+    return usage, cache, misc
--- a/src/axolotl/utils/callbacks.py
+++ b/src/axolotl/utils/callbacks.py
@@ -1,5 +1,6 @@
 """Callbacks for Trainer class"""

+import logging
 import os

 from optimum.bettertransformer import BetterTransformer
@@ -11,6 +12,10 @@ from transformers import (
 )
 from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, IntervalStrategy

+from axolotl.utils.bench import log_gpu_memory_usage
+
+LOG = logging.getLogger("axolotl.callbacks")
+

 class SavePeftModelCallback(TrainerCallback):  # pylint: disable=too-few-public-methods
    """Callback to save the PEFT adapter"""
@@ -67,3 +72,25 @@ class SaveBetterTransformerModelCallback(
            # the trainer will raise an exception since it can't save a BetterTransformer wrapped model
            control.should_save = False
        return control
+
+
+class GPUStatsCallback(
+    TrainerCallback
+):  # pylint: disable=too-few-public-methods disable=unused-argument
+    """Callback that logs the GPU memory usage a single time, once training is past its first step"""
+
+    def __init__(self, cfg):
+        self.cfg = cfg
+        self.logged = False
+
+    def on_step_end(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        **kwargs,
+    ):
+        # nothing to do once the usage has been logged, or while global_step is not yet above 1
+        if self.logged or state.global_step <= 1:
+            return control
+
+        log_gpu_memory_usage(LOG, "while training", self.cfg.device)
+        self.logged = True
+        return control
--- a/src/axolotl/utils/collators.py
+++ b/src/axolotl/utils/collators.py
@@ -0,0 +1,121 @@
+"""
+DataCollator for axolotl to pad labels and position_ids for packed sequences
+"""
+from dataclasses import dataclass
+from typing import Any, Optional, Union
+
+import numpy as np
+from transformers import PreTrainedTokenizerBase
+from transformers.utils import PaddingStrategy
+
+
+@dataclass
+class DataCollatorForSeq2Seq:
+    """
+    Data collator that will dynamically pad the inputs received, as well as the labels and position_ids
+
+    Args:
+        tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
+            The tokenizer used for encoding the data.
+        model ([`PreTrainedModel`]):
+            The model that is being trained. If set and has the *prepare_decoder_input_ids_from_labels*, use it to
+            prepare the *decoder_input_ids*
+
+            This is useful when using *label_smoothing* to avoid calculating loss twice.
+        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
+            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
+            among:
+
+            - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
+              sequence is provided).
+            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+              acceptable input length for the model if that argument is not provided.
+            - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different lengths).
+        max_length (`int`, *optional*):
+            Maximum length of the returned list and optionally padding length (see above).
+        pad_to_multiple_of (`int`, *optional*):
+            If set will pad the sequence to a multiple of the provided value.
+
+            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
+            7.0 (Volta).
+        label_pad_token_id (`int`, *optional*, defaults to -100):
+            The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
+        position_pad_token_id (`int`, *optional*, defaults to 0):
+            The id to use when padding the position_ids.
+        return_tensors (`str`):
+            The type of Tensor to return. Allowable values are "np", "pt" and "tf".
+    """
+
+    tokenizer: PreTrainedTokenizerBase
+    model: Optional[Any] = None
+    padding: Union[bool, str, PaddingStrategy] = True
+    max_length: Optional[int] = None
+    pad_to_multiple_of: Optional[int] = None
+    label_pad_token_id: int = -100
+    position_pad_token_id: int = 0
+    return_tensors: str = "pt"
+
+    def __call__(self, features, return_tensors=None):
+        # `features` is a list of per-sample dicts; the "labels" and "position_ids" entries are padded in place
+        labels = None
+        if return_tensors is None:
+            return_tensors = self.return_tensors
+
+        # pad each of these optional features with its own pad id; presence is checked on the first sample only
+        for feature_name, pad_token_id in [
+            ("labels", self.label_pad_token_id),
+            ("position_ids", self.position_pad_token_id),
+        ]:
+            feat = (
+                [feature[feature_name] for feature in features]
+                if feature_name in features[0].keys()
+                else None
+            )
+            # remember the labels so we know below whether decoder_input_ids can be prepared
+            labels = feat if feat and feature_name == "labels" else labels
+            # We have to pad the labels and position_ids before calling `tokenizer.pad` as this method won't pad
+            # them and needs them of the same length to return tensors.
+            if feat is not None:
+                max_feature_length = max(len(l) for l in feat)  # noqa: E741
+                if self.pad_to_multiple_of is not None:
+                    # round up to the next multiple of pad_to_multiple_of
+                    max_feature_length = (
+                        (max_feature_length + self.pad_to_multiple_of - 1)
+                        // self.pad_to_multiple_of
+                        * self.pad_to_multiple_of
+                    )
+
+                padding_side = self.tokenizer.padding_side
+                for feature in features:
+                    remainder = [pad_token_id] * (
+                        max_feature_length - len(feature[feature_name])
+                    )
+                    # lists are extended with list concatenation, anything else goes through numpy as int64
+                    if isinstance(feature[feature_name], list):
+                        feature[feature_name] = (
+                            feature[feature_name] + remainder
+                            if padding_side == "right"
+                            else remainder + feature[feature_name]
+                        )
+                    elif padding_side == "right":
+                        feature[feature_name] = np.concatenate(
+                            [feature[feature_name], remainder]
+                        ).astype(np.int64)
+                    else:
+                        feature[feature_name] = np.concatenate(
+                            [remainder, feature[feature_name]]
+                        ).astype(np.int64)
+
+        # the tokenizer pads the remaining features and converts everything to tensors
+        features = self.tokenizer.pad(
+            features,
+            padding=self.padding,
+            max_length=self.max_length,
+            pad_to_multiple_of=self.pad_to_multiple_of,
+            return_tensors=return_tensors,
+        )
+
+        # prepare decoder_input_ids
+        if (
+            labels is not None
+            and self.model is not None
+            and hasattr(self.model, "prepare_decoder_input_ids_from_labels")
+        ):
+            decoder_input_ids = self.model.prepare_decoder_input_ids_from_labels(
+                labels=features["labels"]
+            )
+            features["decoder_input_ids"] = decoder_input_ids
+
+        return features
--- a/src/axolotl/utils/validation.py
+++ b/src/axolotl/utils/validation.py
@@ -1,13 +1,84 @@
-"""Module for validating config files"""
+"""Module for working with config dicts"""

 import logging
+import os

 import torch

+from axolotl.utils.bench import log_gpu_memory_usage
+
 LOG = logging.getLogger("axolotl")


+def choose_device(cfg):
+    """
+    Set `cfg.device` to "cuda:<local_rank>", "mps" or "cpu" and derive `cfg.device_map` from it.
+    A `device_map` of "auto" is kept as is; when an `ACCELERATE_USE_*` environment variable is
+    present the `device_map` is cleared.
+    """
+    try:
+        if torch.cuda.is_available():
+            device = f"cuda:{cfg.local_rank}"
+        elif torch.backends.mps.is_available():
+            device = "mps"
+        else:
+            device = "cpu"
+    except Exception:  # pylint: disable=broad-exception-caught
+        # any error while probing for an accelerator also falls back to the cpu
+        device = "cpu"
+    cfg.device = device
+
+    if cfg.device_map != "auto":
+        # cuda devices are mapped by rank index, everything else by device name
+        target = cfg.local_rank if cfg.device.startswith("cuda") else cfg.device
+        cfg.device_map = {"": target}
+
+    # in `accelerate launch`, we need to not pass through any device map and let
+    # accelerate figure out which parts of the model to put on which gpu
+    if any(name.startswith("ACCELERATE_USE_") for name in os.environ):
+        cfg.device_map = None
+
+
+def normalize_config(cfg):
+    """
+    Fill in the derived settings on `cfg`: gradient accumulation steps / batch size, world size,
+    local rank, device and device map, ddp, and the precision flags that depend on the device.
+    Logs the baseline GPU memory usage at the end.
+    """
+    # setup some derived config / hyperparams
+    if not cfg.gradient_accumulation_steps:
+        cfg.gradient_accumulation_steps = cfg.batch_size // cfg.micro_batch_size
+    if not cfg.batch_size:
+        cfg.batch_size = cfg.micro_batch_size * cfg.gradient_accumulation_steps
+
+    env = os.environ
+    cfg.world_size = int(env.get("WORLD_SIZE", 1))
+    cfg.local_rank = int(env.get("LOCAL_RANK", 0))
+    choose_device(cfg)
+
+    # ddp defaults to on whenever more than one process takes part
+    if cfg.ddp is None:
+        cfg.ddp = cfg.world_size != 1
+    if cfg.ddp:
+        cfg.device_map = {"": int(env.get("LOCAL_RANK", 0))}
+        cfg.batch_size = cfg.batch_size * cfg.world_size
+
+    if cfg.device != "mps":
+        torch.backends.cuda.matmul.allow_tf32 = cfg.tf32 or False
+    else:
+        # on mps 8bit loading, tf32 and bf16 are switched off; a bf16 request becomes fp16
+        cfg.load_in_8bit = False
+        cfg.tf32 = False
+        if cfg.bf16:
+            cfg.fp16 = True
+        cfg.bf16 = False
+
+    log_gpu_memory_usage(LOG, "baseline", cfg.device)
+
+
 def validate_config(cfg):
+    if cfg.max_packed_sequence_len and cfg.sample_packing:
+        raise ValueError(
+            "please set only one of max_packed_sequence_len (deprecated soon) or sample_packing"
+        )
+    if cfg.max_packed_sequence_len:
+        LOG.warning(
+            str(
+                PendingDeprecationWarning(
+                    "max_packed_sequence_len will be deprecated in favor of sample_packing"
+                )
+            )
+        )
+
    if cfg.gradient_accumulation_steps and cfg.batch_size:
        raise ValueError(
            "please set only one of gradient_accumulation_steps or batch_size"
@@ -97,6 +168,24 @@ def validate_config(cfg):
            "push_to_hub_model_id is deprecated. Please use hub_model_id instead."
        )

+    if cfg.gptq and cfg.model_revision:
+        raise ValueError(
+            "model_revision is not supported for GPTQ models. "
+            + "Please download the model from HuggingFace Hub manually for correct branch, "
+            + "point to its path, and remove model_revision from the config."
+        )
+
+    if cfg.sample_packing and cfg.sdp_attention:
+        # incompatible due to bug w/ accelerate causing 0.0 loss when using llama2
+        raise ValueError(
+            "sample_packing not compatible with sdp_attention. Use flash_attention"
+        )
+
+    if cfg.sample_packing and cfg.xformers_attention:
+        raise ValueError(
+            "sample_packing not compatible with xformers_attention. Use flash_attention"
+        )
+
    # TODO
    # MPT 7b
    # https://github.com/facebookresearch/bitsandbytes/issues/25
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -1,13 +1,19 @@
 """Module containing data utilities"""
 import functools
-import itertools
+import hashlib
 import logging
 from hashlib import md5
 from pathlib import Path
-from typing import List, Tuple, Union
+from typing import Tuple, Union

 import torch
-from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
+from datasets import (
+    Dataset,
+    DatasetDict,
+    concatenate_datasets,
+    load_dataset,
+    load_from_disk,
+)
 from huggingface_hub import hf_hub_download
 from transformers import PreTrainedTokenizerBase

@@ -35,6 +41,7 @@ from axolotl.prompters import (
    ShareGPTPrompter,
    SummarizeTLDRPrompter,
 )
+from axolotl.utils.distributed import barrier, is_main_process

 LOG = logging.getLogger("axolotl")

@@ -109,6 +116,7 @@ def load_tokenized_prepared_datasets(
            local_path = Path(d.path)
            if local_path.exists():
                if local_path.is_dir():
+                    # TODO dirs with arrow or parquet files could be loaded with `load_from_disk`
                    ds = load_dataset(
                        d.path,
                        name=d.name,
@@ -262,20 +270,12 @@ def load_tokenized_prepared_datasets(
                raise ValueError(
                    f"unhandled prompt tokenization strategy: {d.type} {suffix}"
                )
-        LOG.info("tokenizing, merging, and shuffling master dataset")
+        LOG.info("merging datasets")
+        dataset = concatenate_datasets(datasets)

-        samples: List[int] = []
-        chunk_size = 1000
-        for d in datasets:
-            d_iter = iter(d)
-            while True:
-                chunk = list(itertools.islice(d_iter, chunk_size))
-                if not chunk:
-                    break
-                samples.extend(chunk)
-
-        LOG.info("shuffle")
-        dataset = Dataset.from_list(samples).shuffle(seed=seed)
+        if len(datasets) > 1:
+            LOG.info("shuffle merged datasets")
+            dataset = dataset.shuffle(seed=seed)
        if cfg.local_rank == 0:
            LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
            dataset.save_to_disk(prepared_ds_path)
@@ -374,6 +374,7 @@ def load_prepare_datasets(
            dataset = Dataset.from_list(list(constant_len_dataset))

            # filter out bad data
+            # TODO convert to dataset.filter(...)
            dataset = Dataset.from_list(
                [
                    d
@@ -413,7 +414,51 @@ def load_prepare_datasets(
        )

    if cfg.val_set_size:
-        dataset = dataset.train_test_split(test_size=cfg.val_set_size, shuffle=False)
+        # ensure we end up with the same fingerprint by doing rank0 first and being able to cache
+        to_hash_train = (
+            dataset._fingerprint  # pylint: disable=protected-access
+            + "|"
+            + str(cfg.val_set_size)
+            + "|"
+            + "train"
+            + "|"
+            + str(cfg.seed or 42)
+        )
+        to_hash_test = (
+            dataset._fingerprint  # pylint: disable=protected-access
+            + "|"
+            + str(cfg.val_set_size)
+            + "|"
+            + "test"
+            + "|"
+            + str(cfg.seed or 42)
+        )
+        train_fingerprint = hashlib.md5(
+            to_hash_train.encode(), usedforsecurity=False
+        ).hexdigest()
+        test_fingerprint = hashlib.md5(
+            to_hash_test.encode(), usedforsecurity=False
+        ).hexdigest()
+
+        if is_main_process():
+            dataset = dataset.train_test_split(
+                test_size=cfg.val_set_size,
+                shuffle=False,
+                seed=cfg.seed or 42,
+                train_new_fingerprint=train_fingerprint,
+                test_new_fingerprint=test_fingerprint,
+            )
+        barrier()
+        if not is_main_process():
+            dataset = dataset.train_test_split(
+                test_size=cfg.val_set_size,
+                shuffle=False,
+                seed=cfg.seed or 42,
+                train_new_fingerprint=train_fingerprint,
+                test_new_fingerprint=test_fingerprint,
+            )
+        barrier()
+
        train_dataset = dataset["train"]
        eval_dataset = dataset["test"]
    else:
--- a/src/axolotl/utils/dataloader.py
+++ b/src/axolotl/utils/dataloader.py
@@ -0,0 +1,288 @@
+# pylint: skip-file
+import hashlib
+import itertools
+import logging
+import math
+from typing import Any, Callable, List, Union
+
+import numba
+import numpy as np
+from torch.utils.data import DistributedSampler, Sampler
+
+LOG = logging.getLogger("axolotl.utils.dataloader")
+
+
+@numba.njit
+def ffd_check(a: np.ndarray, c: int, n: int):
+    # First-fit-decreasing bin packing
+    # Check if a[] could fit in n bins with capacity c
+    # https://en.wikipedia.org/wiki/First-fit-decreasing_bin_packing
+    # Returns True when every item was placed, False as soon as one item fits in no bin
+
+    # largest items first
+    a = np.sort(a)[::-1]
+    # remaining capacity of each of the n bins
+    bins = np.full((n,), c, dtype=a.dtype)
+    for size in a:
+        not_found = True
+        # put the item into the first bin that still has room for it
+        for idx in range(n):
+            if bins[idx] >= size:
+                bins[idx] -= size
+                not_found = False
+                break
+
+        if not_found:
+            return False
+
+    return True
+
+
+@numba.njit
+def ffd_with_result(a: np.ndarray, c: int, start_index: int):
+    # First-fit-decreasing bin packing (with result return)
+    # Returns (bins_result, len(a)): bins_result holds, per bin, the positions of its items in the
+    # original (unsorted) a[], each shifted by start_index
+
+    # visit the items from largest to smallest, remembering where each one came from
+    indices = np.argsort(a)[::-1]
+    a = a[indices]
+
+    # bins: remaining capacity per bin, bins_result: item indices per bin
+    bins: List[Any] = []
+    bins_result: List[Any] = []
+    for a_id, size in enumerate(a):
+        add_new = True
+        # put the item into the first existing bin that still has room for it
+        for idx in range(len(bins)):
+            if bins[idx] >= size:
+                bins[idx] -= size
+                bins_result[idx].append(indices[a_id] + start_index)
+                add_new = False
+                break
+
+        # no existing bin had room: open a new bin of capacity c for this item
+        if add_new:
+            bins.append(c - size)
+            bins_result.append([indices[a_id] + start_index])
+
+    return bins_result, len(a)
+
+
+@numba.njit
+def allocate(
+    lengths: np.ndarray, lengths_cumsum: np.ndarray, rank: int, c: int, n: int
+):
+    """
+    :param lengths: array of lengths of each sample
+    :param lengths_cumsum: cumulative sum of consecutive lengths
+    :param rank: rank for this process
+    :param c: length of tokens per batch
+    :param n: number of ranks
+    :return: tuple of (bins picked for this rank, one list of sample indices per step;
+        number of samples packed across all ranks per step;
+        total length of all samples that were packed;
+        total capacity offered, i.e. number of steps * c * n)
+    """
+    # Dynamic batch allocator, similar to Multifit
+    # https://en.wikipedia.org/wiki/Multifit_algorithm
+    # ~99.5% efficiency on OpenChat training set (12 * 2048 ctx len)
+
+    # s: cumulative length of the samples consumed so far, start_index: first sample not consumed yet
+    s = 0
+    start_index = 0
+    result = []
+    result_totseqs = []
+
+    while True:
+        # binary search [left, right)
+        # for the largest number of upcoming samples that ffd_check accepts for n bins of capacity c;
+        # the upper bound is where the cumulative length exceeds the total capacity c * n
+        # (left starts at 1 and that initial value is never passed to ffd_check itself)
+        left = 1
+        right = 1 + np.searchsorted(lengths_cumsum[start_index:], s + c * n, "right")
+
+        while right - left > 1:
+            mid = (left + right) // 2
+            if ffd_check(lengths[start_index : start_index + mid], c, n):
+                left = mid
+            else:
+                right = mid
+
+        # use length left
+        batch, tot_seqs = ffd_with_result(
+            lengths[start_index : start_index + left], c, start_index
+        )
+        # stop when fewer than n bins were produced; the samples of this last round are not returned
+        if len(batch) < n:
+            break
+
+        start_index += left
+        s = lengths_cumsum[start_index - 1]
+
+        # add local rank
+        result.append(batch[rank])
+        # add total seqs for all ranks
+        result_totseqs.append(tot_seqs)
+        # yield batch[rank], tot_seqs, s, len(result) * c * n
+    return result, result_totseqs, s, len(result) * c * n
+
+
+def chunk(iterable, n):
+    """
+    Yield consecutive tuples of n items taken from iterable; the last tuple may hold fewer items
+    """
+    # chunk('ABCDEFG', 3) --> ABC DEF G
+    if n < 1:
+        raise ValueError("n must be at least one")
+    iterator = iter(iterable)
+    while True:
+        piece = tuple(itertools.islice(iterator, n))
+        if not piece:
+            return
+        yield piece
+
+
+def hash_indices(lst: List[int]) -> str:
+    # Join the decimal form of every index with commas and return the SHA-256 hex digest of that text
+    joined = ",".join(str(index) for index in lst)
+    return hashlib.sha256(joined.encode()).hexdigest()
+
+
+class MultipackDistributedDataloader:
+    """Unpadded data loading using Multipack.
+    Adapted from https://github.com/imoneoi/openchat/blob/v3_fix_mle_loss/ochat/training_deepspeed/multipack_dataloader.py
+    Approximate (at most ~1.22x) the optimal solution of the identical-machines scheduling problem, which is NP-hard.
+    """
+
+    def __init__(
+        self,
+        dataset: Any,
+        collate_fn: Callable,
+        seq_max_length: int = 2048,
+        batch_size: int = 1,
+        sampler: Union[Sampler, DistributedSampler] = None,
+        packing_efficiency_estimate: float = 1.0,
+        sample_packing_seq_len_multiplier: int = 1,
+        device_count: int = 1,
+    ):
+        # Dataset
+        self.dataset = dataset
+        self.lengths = (
+            dataset.data.column("position_ids")
+            .to_pandas()
+            .apply(lambda x: x[-1] + 1)
+            .values
+        )
+        assert isinstance(self.lengths, np.ndarray)
+        assert batch_size % sample_packing_seq_len_multiplier == 0
+        assert batch_size >= sample_packing_seq_len_multiplier
+        self.sampler = sampler
+        self.batch_size = batch_size
+        self.sample_packing_seq_len_multiplier = sample_packing_seq_len_multiplier
+        self.seq_max_length = seq_max_length
+        self.batch_max_length = batch_size * seq_max_length
+        self.collate_fn = collate_fn
+
+        self.num_replicas = 1
+        self.rank = 0
+
+        # statistics
+        self.eff_total_used = 0
+        self.eff_total_slots = 0
+        self.packing_efficiency_estimate = packing_efficiency_estimate or 1.0
+        self.device_count = device_count
+
+    def generate_batches(self, set_stats=False):
+        LOG.info("generating packed batches")
+        if self.sampler:
+            indices = [idx for idx in self.sampler]
+        else:
+            indices = range(0, len(self.dataset))
+
+        LOG.info(hash_indices(indices))
+        lengths = self.lengths[indices]
+        lengths_cumsum = np.cumsum(lengths)
+
+        batches, totseqs, total_used, total_slots = allocate(
+            lengths=lengths,
+            lengths_cumsum=lengths_cumsum,
+            rank=self.rank,
+            # c=self.batch_max_length,
+            c=self.seq_max_length * self.sample_packing_seq_len_multiplier,
+            n=self.num_replicas,
+        )
+
+        batches = [[indices[b_idx] for b_idx in batch] for batch in batches]
+
+        # statistics
+        if set_stats:
+            self.eff_total_used += total_used
+            self.eff_total_slots += total_slots
+
+        return batches, totseqs
+
+    def __iter__(self):
+        if hasattr(self.sampler, "set_epoch"):
+            new_epoch = self.sampler.epoch + 1
+            self.sampler.set_epoch(new_epoch)
+            LOG.info(f"calling sampler.set_epoch({new_epoch})")
+        all_batches, _ = self.generate_batches(set_stats=True)
+        features = self.dataset.features.keys()
+        len_remaining = self._len_est()
+        for batches in chunk(
+            all_batches, self.batch_size // self.sample_packing_seq_len_multiplier
+        ):
+            chunked_data = []
+            attn_mask_cum_idx = 0
+            for batch in batches:
+                concatenated = {}
+                batched_data = [self.dataset[batch_idx] for batch_idx in batch]
+                for feature in features:
+                    if feature == "attention_mask":
+                        arrays = [
+                            (attn_mask_cum_idx + idx + 1) * np.array(item[feature])
+                            for idx, item in enumerate(batched_data)
+                            if feature in item
+                        ]
+                        attn_mask_cum_idx += len(batched_data)
+                        concatenated[feature] = np.concatenate(arrays)
+                    else:
+                        arrays = [
+                            np.array(item[feature])
+                            for item in batched_data
+                            if feature in item
+                        ]
+                        concatenated[feature] = np.concatenate(arrays)
+                chunked_data.append(concatenated)
+            yield self.collate_fn(chunked_data)
+            len_remaining -= 1
+            if not len_remaining:
+                return
+
+    def _len_est(self):
+        lengths_sum = np.sum(self.lengths)
+        lengths_sum_per_device = lengths_sum // self.device_count
+        LOG.info(
+            f"packing_efficiency_estimate: {self.packing_efficiency_estimate} "
+            f"total_num_tokens per device: {lengths_sum_per_device}"
+        )
+
+        # shave off 1% + 1 for dealing with variance in packing from random sampler to sampler
+        return (
+            math.floor(
+                0.99
+                * lengths_sum_per_device
+                / self.packing_efficiency_estimate
+                // self.seq_max_length
+                // self.batch_size
+            )
+            - 1
+        )
+
+    def __len__(self):
+        # this doesn't return the actual length b/c with distributed samplers, not all dataloaders get
+        # the same share of total tokens
+        # if not self.eff_total_used:
+        #     batches, _ = self.generate_batches(set_stats=True)
+        # LOG.info(
+        #     f"packing_efficiency_estimate: {self.packing_efficiency_estimate} "
+        #     f"actual packing efficiency: {self.efficiency()}"
+        # )
+        return max(1, self._len_est())
+
+    def len_w_stats(self):
+        if not self.eff_total_used:
+            batches, _ = self.generate_batches(set_stats=True)
+        LOG.info(
+            f"packing_efficiency_estimate: {self.packing_efficiency_estimate} "
+            f"actual packing efficiency: {self.efficiency()}"
+        )
+        return max(1, self._len_est())
+
+    def efficiency(self):
+        return self.eff_total_used / self.eff_total_slots
--- a/src/axolotl/utils/dict.py
+++ b/src/axolotl/utils/dict.py
@@ -10,3 +10,6 @@ class DictDefault(Dict):

    def __missing__(self, key):
        return None
+
+    def __or__(self, other):
+        return DictDefault(super().__or__(other))
--- a/src/axolotl/utils/distributed.py
+++ b/src/axolotl/utils/distributed.py
@@ -0,0 +1,41 @@
+"""
+utility helpers for distributed checks
+"""
+import torch.distributed as dist
+from accelerate import Accelerator
+
+accelerate = None  # pylint: disable=invalid-name
+
+
+def load_accelerate():
+    """
+    Create a new Accelerator and store it in the module-level ``accelerate``
+    variable, replacing any previous instance.
+    """
+    global accelerate  # pylint: disable=global-statement
+    accelerate = Accelerator()
+
+
+def is_distributed():
+    """
+    Check if distributed training is initialized.
+    """
+    global accelerate  # pylint: disable=global-statement
+    if not accelerate:
+        accelerate = Accelerator()
+    return dist.is_available() and dist.is_initialized()
+
+
+def barrier():
+    """
+    Acts as a barrier to wait for all processes. This ensures that all processes
+    reach the barrier before proceeding further.
+    """
+    if is_distributed():
+        dist.barrier()
+
+
+def is_main_process():
+    """
+    Check if the current process is the main process.
+    If not in distributed mode, always return True.
+    """
+    if not is_distributed():
+        return True
+    return dist.get_rank() == 0
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -22,6 +22,7 @@ from transformers import (  # noqa: F401
 )

 from axolotl.prompt_tokenizers import LLAMA_DEFAULT_PAD_TOKEN
+from axolotl.utils.bench import log_gpu_memory_usage

 LOG = logging.getLogger("axolotl")

@@ -31,31 +32,66 @@ if TYPE_CHECKING:
    from axolotl.utils.dict import DictDefault  # noqa: F401


-def load_tokenizer(
-    tokenizer_config,
-    tokenizer_type,
-    cfg,
+def smart_tokenizer_and_embedding_resize(
+    tokenizer: transformers.PreTrainedTokenizer,
+    model: transformers.PreTrainedModel,
+    resize_token_embeddings_multiple: Optional[int] = None,
 ):
-    use_fast = True  # this is the default
-    if cfg.tokenizer_use_fast is not None:
-        use_fast = cfg.tokenizer_use_fast
-    if tokenizer_type:
-        tokenizer = getattr(transformers, tokenizer_type).from_pretrained(
-            tokenizer_config,
-            trust_remote_code=cfg.trust_remote_code or False,
-            use_fast=use_fast,
+    """Resize tokenizer and embedding.
+
+    Note: This function resizes the tokenizer to accommodate additional special tokens and the
+    embedding matrix of the model to match the new size of the tokenizer. If any new special tokens
+    have been added, the function computes the average embedding values of the existing embeddings
+    and sets those values for the new special token embeddings. This is done separately for the input
+    embeddings and output embeddings of the model.
+    """
+
+    old_tokens = model.get_input_embeddings().weight.data.shape[0]
+    num_new_tokens = len(tokenizer) - old_tokens
+    embeddings_len = (
+        math.ceil(len(tokenizer) / resize_token_embeddings_multiple)
+        * resize_token_embeddings_multiple
+        if resize_token_embeddings_multiple
+        else len(tokenizer)
+    )
+    model.resize_token_embeddings(embeddings_len)
+
+    if num_new_tokens > 0:
+        input_embeddings = model.get_input_embeddings().weight.data
+        output_embeddings = model.get_output_embeddings().weight.data
+
+        input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
+            dim=0, keepdim=True
        )
-    else:
-        tokenizer = AutoTokenizer.from_pretrained(
-            tokenizer_config,
-            trust_remote_code=cfg.trust_remote_code or False,
-            use_fast=use_fast,
+        output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
+            dim=0, keepdim=True
        )

-    LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
-    LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
-    LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
-    LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")
+        input_embeddings[-num_new_tokens:] = input_embeddings_avg
+        output_embeddings[-num_new_tokens:] = output_embeddings_avg
+
+
+def load_tokenizer(cfg):
+    tokenizer_kwargs = {}
+    use_fast = True  # this is the default
+
+    if cfg.tokenizer_use_fast is not None:
+        use_fast = cfg.tokenizer_use_fast
+    if cfg.tokenizer_legacy is not None:
+        # True is the default w/ https://github.com/huggingface/transformers/pull/25224
+        tokenizer_kwargs["legacy"] = cfg.tokenizer_legacy
+
+    tokenizer_cls = AutoTokenizer
+    if cfg.tokenizer_type:
+        tokenizer_cls = getattr(transformers, cfg.tokenizer_type)
+
+    tokenizer_config = cfg.tokenizer_config or cfg.base_model_config
+    tokenizer = tokenizer_cls.from_pretrained(
+        tokenizer_config,
+        trust_remote_code=cfg.trust_remote_code or False,
+        use_fast=use_fast,
+        **tokenizer_kwargs,
+    )

    if tokenizer.__class__.__name__ in [
        "LlamaTokenizer",
@@ -63,6 +99,11 @@ def load_tokenizer(
    ]:
        tokenizer.pad_token = LLAMA_DEFAULT_PAD_TOKEN

+    LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
+    LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
+    LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
+    LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")
+
    if tokenizer.__class__.__name__ == "GPTNeoXTokenizerFast":
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -77,17 +118,21 @@ def load_tokenizer(


 def load_model(
-    base_model, base_model_config, model_type, tokenizer, cfg, adapter="lora"
-):
-    # type: (str, str, str, PreTrainedTokenizerBase, DictDefault, Optional[str]) -> Tuple[PreTrainedModel, Optional[PeftConfig]]
+    cfg, tokenizer
+):  # type: (DictDefault, PreTrainedTokenizerBase) -> Tuple[PreTrainedModel, Optional[PeftConfig]]
    """
-    Load a model from a base model and a model type.
+    Load a model for a given configuration and tokenizer.
    """
+    base_model = cfg.base_model
+    base_model_config = cfg.base_model_config
+    model_type = cfg.model_type

    # TODO refactor as a kwarg
    load_in_8bit = cfg.load_in_8bit
-    cfg.is_llama_derived_model = "llama" in base_model or (
-        cfg.model_type and "llama" in cfg.model_type.lower()
+    cfg.is_llama_derived_model = (
+        "llama" in base_model
+        or (cfg.model_type and "llama" in cfg.model_type.lower())
+        or cfg.is_llama_derived_model
    )

    if cfg.is_llama_derived_model and cfg.flash_attention:
@@ -132,6 +177,14 @@ def load_model(
        LOG.info("patching with xpos rope")
        replace_llama_rope_with_xpos_rope()

+    if cfg.is_llama_derived_model and (
+        cfg.max_packed_sequence_len or cfg.sample_packing
+    ):
+        from axolotl.monkeypatch.llama_expand_mask import hijack_expand_mask
+
+        LOG.info("patching _expand_mask")
+        hijack_expand_mask()
+
    if cfg.bf16 or cfg.bfloat16:
        torch_dtype = torch.bfloat16
    elif cfg.load_in_8bit or cfg.fp16 or cfg.float16:
@@ -215,14 +268,20 @@ def load_model(
        elif cfg.is_llama_derived_model and not cfg.trust_remote_code:
            from transformers import LlamaForCausalLM

-            config = LlamaConfig.from_pretrained(base_model_config)
+            config_kwargs = {}
+            if cfg.rope_scaling:
+                config_kwargs["rope_scaling"] = cfg.rope_scaling
+            config = LlamaConfig.from_pretrained(
+                base_model_config,
+                **config_kwargs,
+            )
            model = LlamaForCausalLM.from_pretrained(
                base_model,
                config=config,
+                device_map=cfg.device_map,
                load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
                load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
                torch_dtype=torch_dtype,
-                device_map="auto" if cfg.world_size == 1 else cfg.device_map,
                **model_kwargs,
            )
        # elif model_type == "GPTNeoXForCausalLM" and cfg.flash_attention:
@@ -254,10 +313,10 @@ def load_model(
        elif model_type and not cfg.trust_remote_code:
            model = getattr(transformers, model_type).from_pretrained(
                base_model,
+                device_map=cfg.device_map,
                load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
                load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
                torch_dtype=torch_dtype,
-                device_map=cfg.device_map,
                trust_remote_code=cfg.trust_remote_code or False,
                **model_kwargs,
            )
@@ -285,10 +344,10 @@ def load_model(
            model = AutoModelForCausalLM.from_pretrained(
                base_model,
                config=config,
+                device_map=cfg.device_map,
                load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
                load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
                torch_dtype=torch_dtype,
-                device_map=cfg.device_map,
                trust_remote_code=cfg.trust_remote_code or False,
                **model_kwargs,
            )
@@ -299,31 +358,33 @@ def load_model(
        LOG.exception(err)
        model = AutoModelForCausalLM.from_pretrained(
            base_model,
+            device_map=cfg.device_map,
            load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
            load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
            torch_dtype=torch_dtype,
-            device_map=cfg.device_map,
            trust_remote_code=cfg.trust_remote_code or False,
            **model_kwargs,
        )

-    embeddings_len = (
-        math.ceil(len(tokenizer) / 32) * 32
-        if cfg.resize_token_embeddings_to_32x
-        else len(tokenizer)
+    smart_tokenizer_and_embedding_resize(
+        tokenizer,
+        model,
+        resize_token_embeddings_multiple=cfg.resize_token_embeddings_multiple,
    )
-    model.resize_token_embeddings(embeddings_len)

    if (
        hasattr(model.config, "max_position_embeddings")
        and model.config.max_position_embeddings
-        and cfg.sequence_len >= model.config.max_position_embeddings
+        and cfg.sequence_len > model.config.max_position_embeddings
    ):
        LOG.warning(
            f"increasing model.config.max_position_embeddings to {cfg.sequence_len}"
        )
        model.config.max_position_embeddings = cfg.sequence_len

+    if model.device.type == "cuda":
+        log_gpu_memory_usage(LOG, "after model load", model.device)
+
    if not cfg.gptq and (
        (cfg.adapter == "lora" and load_in_8bit)
        or (cfg.adapter == "qlora" and cfg.load_in_4bit)
@@ -343,7 +404,7 @@ def load_model(
                    if hasattr(module, "weight"):
                        module.to(torch_dtype)

-    model, lora_config = load_adapter(model, cfg, adapter)
+    model, lora_config = load_adapter(model, cfg, cfg.adapter)

    if cfg.ddp and not load_in_8bit:
        model.to(f"cuda:{cfg.local_rank}")
@@ -382,6 +443,9 @@ def load_model(
    if cfg.flash_optimum:
        model = BetterTransformer.transform(model)

+    if cfg.adapter is not None:
+        log_gpu_memory_usage(LOG, "after adapters", model.device)
+
    # TODO resume_from_checkpoint handling
    return model, lora_config

--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -1,26 +1,33 @@
 """Module containing the Trainer class and related functions"""
-
 import importlib
 import logging
 import math
 import os
 import sys
+from contextlib import contextmanager
 from dataclasses import dataclass, field
+from functools import partial
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Union

 import bitsandbytes as bnb
+import numpy as np
 import torch.cuda
 import transformers
+from datasets import Dataset, set_caching_enabled
 from torch import nn
 from torch.optim.lr_scheduler import OneCycleLR
+from torch.utils.data import DataLoader, DistributedSampler, RandomSampler
 from transformers import EarlyStoppingCallback, Trainer, TrainingArguments
 from transformers.trainer_pt_utils import get_parameter_names

 from axolotl.utils.callbacks import (
+    GPUStatsCallback,
    SaveBetterTransformerModelCallback,
    SavePeftModelCallback,
 )
+from axolotl.utils.collators import DataCollatorForSeq2Seq
+from axolotl.utils.dataloader import MultipackDistributedDataloader
 from axolotl.utils.schedulers import (
    InterpolatingLogScheduler,
    get_cosine_schedule_with_quadratic_warmup,
@@ -29,6 +36,68 @@ from axolotl.utils.schedulers import (
 LOG = logging.getLogger("axolotl")


+@torch.jit.script
+def weighted_cross_entropy(
+    logits: torch.Tensor, labels: torch.Tensor, weights: torch.Tensor
+):
+    # Flatten the logits, labels, and weights tensors
+    logits = logits.view(
+        -1, logits.size(-1)
+    )  # logits becomes of shape [batch_size*sequence_length, vocab_size]
+    labels = labels.view(-1)  # labels becomes of shape [batch_size*sequence_length]
+    weights = weights.view(-1)  # weights becomes of shape [batch_size*sequence_length]
+
+    # Compute the unweighted cross entropy loss
+    losses = torch.nn.functional.cross_entropy(logits, labels, reduction="none")
+
+    # Apply the weights to the losses and compute their sum
+    return (weights * losses).sum()
+
+
+@torch.jit.script
+def create_weighted_mask(labels: torch.Tensor):
+    """
+    Build per-label weights from runs of unmasked labels.
+
+    A label equal to -100 is treated as masked and gets weight 0. Every other
+    label gets weight 1 / (number of unmasked labels in its contiguous run), so
+    the weights of each run sum to 1. The result is squeezed before returning.
+    """
+    # Check if the tensor is 2D. If not, unsqueeze it to make it 2D
+    if len(labels.shape) == 1:
+        labels = labels.unsqueeze(0)
+
+    weights = torch.zeros_like(labels).float()
+    for i in range(labels.shape[0]):
+        mask = labels[i] != -100
+
+        # Create a tensor to track group ids
+        group_ids = torch.zeros_like(labels[i]).int()
+        curr_group_id = 0
+
+        # index 0 keeps group id 0; a run that starts there is therefore group 0
+        for j in range(1, len(labels[i])):
+            if mask[j] and not mask[j - 1]:  # switch from masked to unmasked label
+                curr_group_id += 1  # start new group
+            group_ids[j] = (
+                curr_group_id if mask[j] else 0
+            )  # assign group id if unmasked label
+
+        # Count only unmasked labels in each group
+        group_counts = torch.bincount(group_ids[mask])
+
+        mask_weights = torch.zeros_like(labels[i]).float()
+        # each unmasked label is weighted by the inverse size of its group
+        mask_weights[mask] = 1.0 / group_counts[group_ids[mask]]
+
+        weights[i] = mask_weights
+
+    return weights.squeeze()  # squeeze the output to match the input dimension
+
+
+def trainer_weighted_loss(model_output, labels, shift_labels=True):
+    logits = (
+        model_output["logits"] if isinstance(model_output, dict) else model_output[0]
+    )
+    if shift_labels:
+        logits = logits[..., :-1, :].contiguous()
+        labels = labels[..., 1:].contiguous()
+
+    weights = create_weighted_mask(labels)
+    return weighted_cross_entropy(logits, labels, weights)
+
+
@dataclass
 class AxolotlTrainingArguments(TrainingArguments):
    """
@@ -39,6 +108,22 @@ class AxolotlTrainingArguments(TrainingArguments):
        default=False,
        metadata={"help": "Use quadratic warmup for cosine scheduling."},
    )
+    sample_packing: bool = field(
+        default=False,
+        metadata={"help": "Use sample packing for efficient training."},
+    )
+    sample_packing_efficiency: float = field(
+        default=1.0,
+        metadata={"help": "Sample packing efficiency for calculating batch length."},
+    )
+    max_seq_length: int = field(
+        default=2048,
+        metadata={"help": "The maximum sequence length the model can handle"},
+    )
+    sample_packing_seq_len_multiplier: int = field(
+        default=1,
+        metadata={"help": "the multiplier for the max len for packed sequences"},
+    )


 class AxolotlTrainer(Trainer):
@@ -76,6 +161,64 @@ class AxolotlTrainer(Trainer):
                return super().create_scheduler(num_training_steps, optimizer)
        return self.lr_scheduler

+    def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
+        if self.args.world_size > 1 and self.args.sample_packing:
+            return DistributedSampler(
+                self.train_dataset,
+                num_replicas=self.args.world_size,
+                rank=self.args.process_index,
+                seed=self.args.seed,
+            )
+        return super()._get_train_sampler()
+
+    def get_train_dataloader(self) -> Union[DataLoader, MultipackDistributedDataloader]:
+        if self.args.sample_packing:
+            train_sampler = self._get_train_sampler()
+            return self.accelerator.prepare(
+                MultipackDistributedDataloader(
+                    self.train_dataset,
+                    batch_size=self._train_batch_size,
+                    seq_max_length=self.args.max_seq_length,
+                    collate_fn=self.data_collator,
+                    sampler=train_sampler,
+                    packing_efficiency_estimate=self.args.sample_packing_efficiency,
+                    sample_packing_seq_len_multiplier=self.args.sample_packing_seq_len_multiplier,
+                    device_count=int(os.environ.get("WORLD_SIZE", 1)),
+                )
+            )
+        return super().get_train_dataloader()
+
+    def get_eval_dataloader(
+        self, eval_dataset: Optional[Dataset] = None
+    ) -> Union[DataLoader, MultipackDistributedDataloader]:
+        if self.args.sample_packing:
+            eval_dataset = (
+                eval_dataset if eval_dataset is not None else self.eval_dataset
+            )
+            eval_sampler = self._get_eval_sampler(eval_dataset)
+            return self.accelerator.prepare(
+                MultipackDistributedDataloader(
+                    eval_dataset,
+                    batch_size=self.args.eval_batch_size,
+                    seq_max_length=self.args.max_seq_length,
+                    collate_fn=self.data_collator,
+                    sampler=eval_sampler,
+                    packing_efficiency_estimate=self.args.sample_packing_efficiency,
+                    sample_packing_seq_len_multiplier=self.args.eval_batch_size,
+                    device_count=int(os.environ.get("WORLD_SIZE", 1)),
+                )
+            )
+        return super().get_eval_dataloader(eval_dataset)
+
+    def compute_loss(self, model, inputs, return_outputs=False):
+        """
+        Delegate to the parent Trainer's loss computation.
+
+        The weighted-loss path for sample packing below is commented out, so
+        this override currently changes nothing.
+        """
+        # use one's weighted cross entropy loss calc
+        # if self.args.sample_packing:
+        #     labels = inputs.pop("labels")
+        #     outputs = model(**inputs)
+        #     loss = trainer_weighted_loss(outputs, labels, shift_labels=True)
+        #     return (loss, outputs) if return_outputs else loss
+        return super().compute_loss(model, inputs, return_outputs=return_outputs)
+

 class OneCycleLRSchedulerTrainer(AxolotlTrainer):
    """
@@ -106,10 +249,121 @@ class OneCycleLRSchedulerTrainer(AxolotlTrainer):
        return self.lr_scheduler


-def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
-    total_num_steps = int(
-        math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
-    )
+def add_position_ids(sample):
+    sample["position_ids"] = torch.arange(len(sample["input_ids"]))
+    return sample
+
+
+def drop_long_seq(sample, sequence_len=2048):
+    return len(sample["input_ids"]) <= sequence_len
+
+
+@contextmanager
+def disable_datasets_caching():
+    try:
+        set_caching_enabled(False)
+        yield
+    finally:
+        set_caching_enabled(True)
+
+
+def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
+    if cfg.sample_packing:
+        drop_long = partial(drop_long_seq, sequence_len=cfg.sequence_len)
+        train_dataset = train_dataset.filter(drop_long, num_proc=os.cpu_count()).map(
+            add_position_ids, num_proc=os.cpu_count()
+        )
+        if eval_dataset:
+            eval_dataset = eval_dataset.filter(drop_long, num_proc=os.cpu_count()).map(
+                add_position_ids, num_proc=os.cpu_count()
+            )
+    return train_dataset, eval_dataset
+
+
+def calculate_total_num_steps(cfg, train_dataset, tokenizer):
+    """
+    Compute the total number of training steps for the run.
+
+    Without sample packing this is ceil(len(train_dataset) * num_epochs /
+    batch_size). With sample packing it is derived either from
+    ``cfg.sample_packing_eff_est`` and the total token count, or, when no
+    estimate is configured, from a MultipackDistributedDataloader built here.
+
+    Side effects: may set ``cfg.total_num_tokens`` and
+    ``cfg.sample_packing_eff_est`` and logs the values to copy into the config.
+    """
+    if cfg.sample_packing:
+        # we have to drop anything longer than sequence len otherwise
+        # flash attention with position ids fails
+        if not cfg.total_num_tokens:
+            LOG.info("calculating total_num_tokens")
+            total_num_tokens = np.sum(
+                train_dataset.data.column("input_ids")
+                .to_pandas()
+                .apply(lambda x: len(x))  # pylint: disable=unnecessary-lambda
+                .values
+            )
+            LOG.info(f"📝 UPDATE CONFIG WITH: `total_num_tokens: {total_num_tokens}`")
+            cfg.total_num_tokens = total_num_tokens
+
+        if cfg.sample_packing_eff_est:
+            total_num_steps = (
+                # match count to len est in dataloader
+                (
+                    math.floor(
+                        0.99
+                        * cfg.total_num_tokens
+                        / cfg.sample_packing_eff_est
+                        / cfg.sequence_len
+                        // cfg.batch_size
+                        // int(os.environ.get("WORLD_SIZE", 1))
+                    )
+                    - 1
+                )
+                * cfg.num_epochs
+            )
+            LOG.info(
+                f"total_num_tokens: {cfg.total_num_tokens}, total_num_steps: {total_num_steps}"
+            )
+        else:
+            # no estimate configured: pack once to measure the real efficiency
+            sampler = RandomSampler(train_dataset)
+            data_loader = MultipackDistributedDataloader(
+                train_dataset,
+                batch_size=cfg.micro_batch_size,
+                seq_max_length=cfg.max_packed_sequence_len or cfg.sequence_len,
+                collate_fn=DataCollatorForSeq2Seq(
+                    tokenizer,
+                    return_tensors="pt",
+                    padding="longest",
+                ),
+                sampler=sampler,
+                packing_efficiency_estimate=cfg.sample_packing_eff_est,
+                sample_packing_seq_len_multiplier=cfg.micro_batch_size,
+                device_count=int(os.environ.get("WORLD_SIZE", 1)),
+            )
+            data_loader_len = data_loader.len_w_stats()
+            actual_eff = data_loader.efficiency()
+            LOG.info(f"data_loader_len: {data_loader_len}")
+            total_num_steps = int(
+                math.floor(
+                    data_loader_len
+                    * cfg.micro_batch_size
+                    * cfg.num_epochs
+                    // cfg.batch_size
+                )
+            )
+            LOG.info(
+                f"📝 UPDATE CONFIG WITH: `sample_packing_eff_est: {math.ceil(actual_eff * 100.0) / 100.0}`"
+            )
+            # efficiency rounded up to two decimals
+            cfg.sample_packing_eff_est = math.ceil(actual_eff * 100.0) / 100.0
+    else:
+        total_num_steps = int(
+            math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
+        )
+    LOG.info(f"total_num_steps: {total_num_steps}")
+    return total_num_steps
+
+
+def setup_fsdp_envs(cfg):
+    os.environ["ACCELERATE_USE_FSDP"] = "true"
+    if cfg.fsdp_config.fsdp_sync_module_states:
+        os.environ["FSDP_SYNC_MODULE_STATES"] = "true"
+    if cfg.fsdp_config.fsdp_state_dict_type:
+        os.environ["FSDP_STATE_DICT_TYPE"] = cfg.fsdp_config.fsdp_state_dict_type
+
+
+def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps):
+    if cfg.fsdp:
+        setup_fsdp_envs(cfg)
    warmup_steps = (
        cfg.warmup_steps
        if cfg.warmup_steps is not None
@@ -186,10 +440,29 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
        training_arguments_kwargs["push_to_hub"] = True
        training_arguments_kwargs["hub_private_repo"] = True

+        if cfg.hub_strategy:
+            training_arguments_kwargs["hub_strategy"] = cfg.hub_strategy
+
    if cfg.save_safetensors:
        training_arguments_kwargs["save_safetensors"] = cfg.save_safetensors

+    if cfg.sample_packing_eff_est:
+        training_arguments_kwargs[
+            "sample_packing_efficiency"
+        ] = cfg.sample_packing_eff_est
+
+    if cfg.val_set_size == 0:
+        evaluation_strategy = "no"
+    elif cfg.eval_steps < 1:
+        # eval every epoch
+        evaluation_strategy = "epoch"
+    else:
+        # eval every eval_steps steps
+        evaluation_strategy = "steps"
+
    training_args = AxolotlTrainingArguments(  # pylint: disable=unexpected-keyword-arg
+        max_steps=total_num_steps if cfg.max_steps else -1,
+        max_seq_length=cfg.sequence_len,
        per_device_train_batch_size=cfg.micro_batch_size,
        per_device_eval_batch_size=cfg.eval_batch_size
        if cfg.eval_batch_size is not None
@@ -198,12 +471,12 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
        eval_accumulation_steps=cfg.gradient_accumulation_steps,
        num_train_epochs=cfg.num_epochs,
        learning_rate=cfg.learning_rate,
-        evaluation_strategy="steps" if cfg.val_set_size > 0 else "no",
+        evaluation_strategy=evaluation_strategy,
        save_strategy="steps" if cfg.save_steps else "epoch",
        eval_steps=cfg.eval_steps if cfg.val_set_size > 0 else None,
        save_steps=cfg.save_steps,
        output_dir=cfg.output_dir,
-        save_total_limit=3,
+        save_total_limit=cfg.save_total_limit if cfg.save_total_limit else 4,
        load_best_model_at_end=(
            cfg.load_best_model_at_end is not False
            and cfg.val_set_size > 0
@@ -221,6 +494,8 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
        if cfg.lr_scheduler and cfg.lr_scheduler not in ("one_cycle", "log_sweep")
        else "cosine",
        weight_decay=cfg.weight_decay if cfg.weight_decay is not None else 0.0,
+        sample_packing=cfg.sample_packing if cfg.sample_packing else False,
+        sample_packing_seq_len_multiplier=cfg.micro_batch_size,
        **training_arguments_kwargs,
    )

@@ -292,6 +567,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
        trainer_kwargs["optimizers"] = (optimizer, lr_scheduler)

    callbacks = []
+    callbacks.append(GPUStatsCallback(cfg))
    # TODO on_save callback to sync checkpoints to GCP/AWS in background
    if cfg.early_stopping_patience:
        early_stop_cb = EarlyStoppingCallback(
@@ -314,11 +590,11 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
    if cfg.collator_pad_to_longest:
        data_collator_kwargs["padding"] = "longest"
    else:
-        data_collator_kwargs["pad_to_multiple_of"] = 8
+        # A100 is best at 64, while others at 8. Let's use the larger so we don't have to check
+        # https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html
+        data_collator_kwargs["pad_to_multiple_of"] = 64

    if cfg.is_llama_derived_model and cfg.landmark_attention:
-        from functools import partial
-
        from axolotl.monkeypatch.llama_landmark_attn import (
            add_mem_tokens,
            get_mem_id,
@@ -346,7 +622,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        args=training_args,
-        data_collator=transformers.DataCollatorForSeq2Seq(
+        data_collator=DataCollatorForSeq2Seq(
            tokenizer,
            return_tensors="pt",
            **data_collator_kwargs,
--- a/src/axolotl/utils/wandb.py
+++ b/src/axolotl/utils/wandb.py
@@ -9,6 +9,8 @@ def setup_wandb_env_vars(cfg):
    elif cfg.wandb_project and len(cfg.wandb_project) > 0:
        os.environ["WANDB_PROJECT"] = cfg.wandb_project
        cfg.use_wandb = True
+        if cfg.wandb_entity and len(cfg.wandb_entity) > 0:
+            os.environ["WANDB_ENTITY"] = cfg.wandb_entity
        if cfg.wandb_watch and len(cfg.wandb_watch) > 0:
            os.environ["WANDB_WATCH"] = cfg.wandb_watch
        if cfg.wandb_log_model and len(cfg.wandb_log_model) > 0:
--- a/tests/monkeypatch/test_llama_attn_hijack_flash.py
+++ b/tests/monkeypatch/test_llama_attn_hijack_flash.py
@@ -0,0 +1,30 @@
+"""
+Unit tests for the monkeypatch utils
+"""
+import unittest
+
+import torch
+
+from axolotl.monkeypatch.utils import get_cu_seqlens, get_cu_seqlens_from_pos_ids
+
+
+class TestMonkeyPatchUtils(unittest.TestCase):
+    """
+    Checks that both cu_seqlens helpers return run boundaries for a single-row input
+    """
+
+    def test_get_cu_seqlens_1d(self):  # ids 1-4 in runs of 4/3/5/2, then 0, 0
+        attn_mask = torch.tensor([[1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 0, 0]])
+        target_res = torch.tensor([0, 4, 7, 12, 14, 16], dtype=torch.int32)
+        self.assertTrue(torch.allclose(get_cu_seqlens(attn_mask)[0], target_res))
+
+    def test_get_cu_seqlens_from_pos_ids_1d(self):  # runs of 4/3/5/2, then 0, 0
+        position_ids = torch.tensor([[0, 1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 4, 0, 1, 0, 0]])
+        target_res = torch.tensor([0, 4, 7, 12, 14, 16], dtype=torch.int32)
+        self.assertTrue(
+            torch.allclose(get_cu_seqlens_from_pos_ids(position_ids)[0], target_res)
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_dict.py
+++ b/tests/test_dict.py
@@ -72,6 +72,13 @@ class DictDefaultTest(unittest.TestCase):

        assert cfg.random_key is None, "DictDefault should return None for missing keys"

+    def test_dict_or(self):  # result of | must keep the None default
+        cfg = DictDefault({}) | DictDefault({})
+
+        assert (
+            cfg.random_key is None
+        ), "DictDefault should return None for missing keys after | operation"
+
    def test_dict_nested_missingparentkey(self):
        """
        Due to subclassing Dict, DictDefault will error if we try to access a nested key whose parent key does not exist.
--- a/tests/test_expand_mask.py
+++ b/tests/test_expand_mask.py
@@ -0,0 +1,44 @@
+"""
+Unit tests for the monkey patch for expand mask to handle packed sequences
+"""
+import unittest
+
+import torch
+
+from axolotl.monkeypatch.llama_expand_mask import _expand_mask
+
+
+class TestExpandMask(unittest.TestCase):
+    """
+    Test class for attention mask expansion for packed sequences
+    """
+
+    def test_output(self):
+        mask = torch.tensor([[1, 1, 1, 2], [2, 3, 3, 0]])  # 2 rows x 4 positions
+        dtype = torch.float32
+        expected_output = torch.tensor(
+            [
+                [
+                    [
+                        [0.0000e00, -3.4028e38, -3.4028e38, -3.4028e38],
+                        [0.0000e00, 0.0000e00, -3.4028e38, -3.4028e38],
+                        [0.0000e00, 0.0000e00, 0.0000e00, -3.4028e38],
+                        [-3.4028e38, -3.4028e38, -3.4028e38, 0.0000e00],
+                    ]
+                ],
+                [
+                    [
+                        [0.0000e00, -3.4028e38, -3.4028e38, -3.4028e38],
+                        [-3.4028e38, 0.0000e00, -3.4028e38, -3.4028e38],
+                        [-3.4028e38, 0.0000e00, 0.0000e00, -3.4028e38],
+                        [-3.4028e38, -3.4028e38, -3.4028e38, -3.4028e38],
+                    ]
+                ],
+            ]
+        )
+        # Expect 0.0 where column <= row and both share a nonzero id, else -3.4028e38
+        self.assertTrue(torch.allclose(_expand_mask(mask, dtype), expected_output))
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_packed_dataset.py
+++ b/tests/test_packed_dataset.py
@@ -27,7 +27,7 @@ class TestPacking(unittest.TestCase):
            }
        )

-    def test_resets_attention(self):
+    def test_increments_attention(self):
        prompter = AlpacaPrompter("chat")
        strat = AlpacaPromptTokenizingStrategy(
            prompter,
@@ -55,10 +55,14 @@ class TestPacking(unittest.TestCase):
        # first example doesn't have mask reset
        assert example["input_ids"][0] == self.tokenizer.bos_token_id
        assert example["attention_mask"][0] == 1
+        assert example["position_ids"][0] == 0
+        assert example["position_ids"][1] == 1

        # but subsequent one does
        assert example["input_ids"][next_bos_index] == self.tokenizer.bos_token_id
-        assert example["attention_mask"][next_bos_index] == 0
+        assert example["attention_mask"][next_bos_index] == 2
+        assert example["position_ids"][next_bos_index] == 0
+        assert example["position_ids"][next_bos_index + 1] == 1


 if __name__ == "__main__":
--- a/tests/test_prompt_tokenizers.py
+++ b/tests/test_prompt_tokenizers.py
@@ -134,9 +134,15 @@ class InstructionWSystemPromptTokenizingStrategyTest(unittest.TestCase):
            "output": "Hi! How can I help?",
        }
        example = strat.tokenize_prompt(sample)
-        assert example["input_ids"][0:4] == [1, 835, 2184, 29901]  # "<s>### System:"
-        assert example["input_ids"][5:7] == [1509, 20118]  # "use cot"
-        assert example["input_ids"][9] == 11889  # USER
+        assert example["input_ids"][0:5] == [
+            1,
+            28962,
+            1254,
+            12665,
+            29901,
+        ]  # "<s>SYSTEM:"
+        assert example["input_ids"][5:7] == [671, 20118]  # " use cot"
+        assert example["input_ids"][8] == 11889  # USER


 class Llama2ChatTokenizationTest(unittest.TestCase):
--- a/tests/test_prompters.py
+++ b/tests/test_prompters.py
@@ -70,7 +70,7 @@ class AlpacaPrompterTest(unittest.TestCase):
            )
        )
        assert "use cot" in res
-        assert res.startswith("### System:")
+        assert res.startswith("SYSTEM:")
        assert "### Instruction:" not in res
        assert "### Input:" not in res
        assert "alpacas" in res
--- a/tests/test_tokenizers.py
+++ b/tests/test_tokenizers.py
@@ -13,17 +13,22 @@ class TestTokenizers(unittest.TestCase):
    """

    def test_default_use_fast(self):
-        cfg = DictDefault({})
-        tokenizer = load_tokenizer("huggyllama/llama-7b", None, cfg)
+        cfg = DictDefault(
+            {
+                "tokenizer_config": "huggyllama/llama-7b",
+            }
+        )
+        tokenizer = load_tokenizer(cfg)
        assert "Fast" in tokenizer.__class__.__name__

    def test_dont_use_fast(self):
        cfg = DictDefault(
            {
+                "tokenizer_config": "huggyllama/llama-7b",
                "tokenizer_use_fast": False,
            }
        )
-        tokenizer = load_tokenizer("huggyllama/llama-7b", None, cfg)
+        tokenizer = load_tokenizer(cfg)
        assert "Fast" not in tokenizer.__class__.__name__


--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -6,8 +6,8 @@ from typing import Optional

 import pytest

+from axolotl.utils.config import validate_config
 from axolotl.utils.dict import DictDefault
-from axolotl.utils.validation import validate_config


 class ValidationTest(unittest.TestCase):
@@ -313,3 +313,27 @@ class ValidationTest(unittest.TestCase):
        )

        validate_config(cfg)
+
+    def test_packing(self):
+        cfg = DictDefault(  # old option alone: expect a deprecation warning
+            {
+                "max_packed_sequence_len": 2048,
+            }
+        )
+        with self._caplog.at_level(logging.WARNING):
+            validate_config(cfg)
+            assert any(
+                "max_packed_sequence_len will be deprecated in favor of sample_packing"
+                in record.message
+                for record in self._caplog.records
+            )
+
+        cfg = DictDefault(  # both options set: expect ValueError
+            {
+                "max_packed_sequence_len": 2048,
+                "sample_packing": True,
+            }
+        )
+        regex_exp = r".*set only one of max_packed_sequence_len \(deprecated soon\) or sample_packing.*"
+        with pytest.raises(ValueError, match=regex_exp):
+            validate_config(cfg)
Author	SHA1	Message	Date
Wing Lian	31079cd5fd	smart resize embeddings Some checks failed pre-commit / pre-commit (push) Has been cancelled Details PyTest / test (3.10) (push) Has been cancelled Details PyTest / test (3.9) (push) Has been cancelled Details	2023-08-14 23:44:15 -04:00
NanoCode012	41ecb451c2	Feat(doc): Add max_steps to readme (#389 )	2023-08-15 00:34:22 +09:00
Gabriel Puliatti	3c2ad00d07	Feat(config): add max steps (#387 )	2023-08-14 11:19:29 -04:00
florian peyron	5d48a10548	Added "epoch" evaluation_strategy (#388 )	2023-08-14 10:59:23 -04:00
NanoCode012	73a0b6ead5	Feat(config): Add hub_strategy (#386 )	2023-08-14 07:12:55 -04:00
florian peyron	63fdb5a7fb	Error msg for sharegpt if conv has less than 2 msg (#379 )	2023-08-14 17:40:40 +09:00
mhenrichsen	fdffef5940	new llama-2 default settings (#370 ) * new default settings * fix whitespace * rm max packed sequence length --------- Co-authored-by: Mads Henrichsen <mads@BrbartiendeMads.lan>	2023-08-14 17:39:09 +09:00
Wing Lian	919246fbc1	don't pass rope_scaling kwarg if it's None (#383 )	2023-08-13 18:57:38 -04:00
Wing Lian	ffac902c1b	bump flash-attn to 2.0.4 for the base docker image (#382 )	2023-08-13 17:55:04 -04:00
Charles Goddard	15f6e57eaa	Fix crash when running without CUDA	2023-08-13 13:36:40 -07:00
NanoCode012	729c299256	Feat(doc): Improve sharegpt doc (#378 ) * Feat(doc): Improve sharegpt doc * Fix typo	2023-08-14 00:36:00 +09:00
Wing Lian	86a91e260b	save tokenizer before training starts (#380 )	2023-08-13 11:28:58 -04:00
Aman Gupta Karmani	094fc2c6e6	try to detect accelerate and only use device_map=None in that case (#373 )	2023-08-13 00:32:07 -04:00
Wing Lian	2dafa730ef	Create FUNDING.yml	2023-08-13 00:30:34 -04:00
Wing Lian	343ac84e5a	fix check for flash attn branching (#377 )	2023-08-12 22:48:08 -04:00
Aman Karmani	0c967279ce	remove unnecessary local variable	2023-08-13 01:58:39 +00:00
Aman Karmani	efb3b2c95e	simplify `load_tokenizer`	2023-08-12 18:55:06 -07:00
Aman Karmani	7b55fe6419	improve GPU logging to break out pytorch cache and system mem	2023-08-12 18:52:57 -07:00
Aman Karmani	e029ab34ea	quiet noise from llama tokenizer by setting pad token earlier	2023-08-12 18:31:40 -07:00
Aman Karmani	8cec513447	extract module for working with cfg	2023-08-12 18:25:27 -07:00
Aman Karmani	a13e45d548	fix DefaultDict.__or__	2023-08-13 01:15:50 +00:00
Wing Lian	918f1b0dfb	revert previous change and build ax images w docker on gpu (#371 )	2023-08-12 20:23:00 -04:00
Wing Lian	c3fde36ada	attempt to run non-base docker builds on regular cpu hosts (#369 )	2023-08-12 19:07:38 -04:00
Wing Lian	2bb0b78975	Attention mask and position id fixes for packing (#285 ) * fix attetion mask with packing * set position ids and use block diagonal attn mask * fix expand mask for multiple batch items, make sure we pad position_ids * don't move masks to cpu * use multi pack dataloader w random sampler * add position_ids back * more fixes for dataloader integration * est total tokens, fix field loop * more fixes, position_ids seems broken * more fixes for sample packing * use distributed sampler, avoid accelerate prepare * use accelerator prepare for dataloader * fix for position_ids w packing * Update src/axolotl/utils/dataloader.py * validation for sample packing and doc * more fixes for 4k and optimizations * optimized expand mask fn * better handling of variance in multipack dataloader length and trainer hanging when it runs out of data * fix rounding of len of batches to int * better handling so that all devices have the same dataloader len * fix step calc for packing * pass sample packing efficiency to training args * add a test for the mask expansion for sequence packing * only process eval dataset for packing if not None * don't split batches when packing * weighted CE losses * weighted CEL fixes * limit packing to sequences of max seq len * seq_len_multiple for packing * make sure the chunk size is an int * sample_packing_seq_len_multiplier config * use cumulative seq len with var len flash attn v2 w packing * properly calculate max len * fix flash-attn, xformers, packing, support chatml * fix chatml system prompt for openorca, legacy tokenizer opts * add chatml * add unit tests for cum seq lens, add ability to build cu_seq_lens from positional ids, fix prompt test * fix test and pylint checks * more packing and dataset optimizations and fixes * filter w multiple cpus * more fixes and optimizations * fixes and go back to distributed sampler since batch sampler won't work * fix counts by accounting for num devices * fix steps calculation * previous 
accelerate is still most performant * add numba to requirements. * use custom distributed checks * fix sampler to prevent overfit w new epochs * let's not cleanup the cached datasets * calculate cum seq lens with pos_ids instead of mask, simplify packing params, fix distributed barrier * speed optimizations and set accelerate fsdp env vars * optimize dataset concatenation? * more optimizations for dataset handling * fix import for annotation * manual pre-commit fixes * another sum optimization and bug fix for calc steps * fix packing estimations * fix formatting * pylint problems * add back flash attention branch for handling unpacked sequences seperately * Address PR feedback * add optional sample packing config params to readme	2023-08-12 15:14:56 -04:00
NanoCode012	a276c9c88d	Fix(save): Save as safetensors (#363 )	2023-08-13 01:22:52 +09:00
Morgan McGuire	7019509daa	Add wandb_entity to wandb options, update example configs, update README (#361 ) * Update wandb_entity and add wandb descriptions * add wandb to config section * remove trailing whitespace for pre-commit hook * remove trailing whitespace for pre-commit hook --------- Co-authored-by: Morgan McGuire <morganmcguire@Morgans-MacBook-Pro.local> Co-authored-by: Wing Lian <wing.lian@gmail.com>	2023-08-12 12:17:11 -04:00
NanoCode012	96bd6ae1c4	Fix(model loading): Warn when model revision is passed to gptq (#364 ) * fix(model loading): warn when model revision is passed to gptq * chore: improve message	2023-08-13 01:16:59 +09:00
NanoCode012	e37d9358e6	Fix(message): Improve error message for bad format (#365 )	2023-08-13 01:16:18 +09:00
NanoCode012	b5212068ac	Feat: Add rope scaling (#343 ) * Feat: Add rope scaling * fix: move rope config	2023-08-13 00:50:15 +09:00
NanoCode012	289d5c403d	feat(merge): save tokenizer on merge (#362 )	2023-08-13 00:18:10 +09:00
Aman Gupta Karmani	35c8b90306	Merge pull request #355 from tmm1/bitsandbytes-fixes bump to latest bitsandbytes release with major bug fixes	2023-08-11 15:15:38 -07:00
NanoCode012	fae6ed8092	Update README.md on pretraining_dataset (#360 ) * Update README.md on pretraining_dataset * Fix message	2023-08-11 12:17:07 +09:00
NanoCode012	94d03c8402	Clarify pre-tokenize before multigpu (#359 )	2023-08-11 11:27:42 +09:00
Aman Gupta Karmani	11ddccb80f	Merge pull request #356 from tmm1/load_model-args simplify `load_model` signature	2023-08-09 18:24:34 -07:00
Aman Gupta Karmani	964312199e	Merge pull request #354 from tmm1/gpu-util GPU memory usage logging	2023-08-09 15:44:18 -07:00
Aman Karmani	718102271f	simplify load_model signature	2023-08-09 22:36:02 +00:00
Aman Gupta Karmani	f5c11f8262	Merge pull request #350 from tmm1/group-len-false-examples set `group_by_length` to false in all examples	2023-08-09 14:48:48 -07:00
Aman Karmani	fce40aab23	bump to latest bitsandbytes release with major bug fixes	2023-08-09 21:47:11 +00:00
Aman Karmani	9c314101d5	use newer pynvml package	2023-08-09 21:06:28 +00:00
Aman Karmani	e303d64728	log GPU memory usage	2023-08-09 18:26:28 +00:00
Aman Karmani	b4d1d22782	note pattern when using groups	2023-08-07 16:18:42 -07:00
Aman Karmani	9f99104038	update comment for group_by_length	2023-08-07 01:04:56 -07:00
Aman Karmani	36fefcf94b	set group_by_length to false in examples	2023-08-06 23:59:09 -07:00