From e3a38450ded395feb8ef1bc227d09b8b476d18f9 Mon Sep 17 00:00:00 2001
From: Byron Hsu <byronhsu1230@gmail.com>
Date: Thu, 29 Aug 2024 05:19:18 -0700
Subject: [PATCH 01/13] Add liger kernel to features (#1881) [skip ci]

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index af604fad5..c84f1cb8c 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ Features:
 - Supports fullfinetune, lora, qlora, relora, and gptq
 - Customize configurations using a simple yaml file or CLI overwrite
 - Load different dataset formats, use custom formats, or bring your own tokenized datasets
-- Integrated with xformer, flash attention, rope scaling, and multipacking
+- Integrated with xformer, flash attention, [liger kernel](https://github.com/linkedin/Liger-Kernel), rope scaling, and multipacking
 - Works with single GPU or multiple GPUs via FSDP or Deepspeed
 - Easily run with Docker locally or on the cloud
 - Log results and optionally checkpoints to wandb or mlflow

From ce33e1ed839cc15b9351d3567ab44076cd4809c6 Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Fri, 30 Aug 2024 17:51:18 -0400
Subject: [PATCH 02/13] pin liger-kernel to latest 0.2.1 (#1882) [skip ci]

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index f5fb547a2..b8d0a388b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -34,7 +34,7 @@ tensorboard
 python-dotenv==1.0.1
 autoawq>=0.2.5
 triton>=2.3.0
-liger-kernel
+liger-kernel==0.2.1
 
 mamba-ssm==1.2.0.post1
 

From 15408d0f09062463a50d39cb0ff356fa1590a2b3 Mon Sep 17 00:00:00 2001
From: DocShotgun <126566557+DocShotgun@users.noreply.github.com>
Date: Sat, 31 Aug 2024 18:59:48 -0700
Subject: [PATCH 03/13] Update supported models for Liger Kernel (#1875)

* Update supported models for Liger Kernel

Add Mistral LCE, Gemma LCE, Gemma 2 without LCE (softcapping is not yet implemented for Gemma in Liger Kernel LCE forward), Phi3 without LCE

* move import to their appropriate conditions

* Integrate Phi3 LCE support

https://github.com/linkedin/Liger-Kernel/pull/103/

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
---
 src/axolotl/integrations/liger/__init__.py | 51 +++++++++++++++++++---
 1 file changed, 44 insertions(+), 7 deletions(-)

diff --git a/src/axolotl/integrations/liger/__init__.py b/src/axolotl/integrations/liger/__init__.py
index 2a3e95163..d58349932 100644
--- a/src/axolotl/integrations/liger/__init__.py
+++ b/src/axolotl/integrations/liger/__init__.py
@@ -23,7 +23,6 @@ import sys
 
 from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss
 from liger_kernel.transformers.geglu import LigerGEGLUMLP
-from liger_kernel.transformers.model.llama import lce_forward as llama_lce_forward
 from liger_kernel.transformers.rms_norm import LigerRMSNorm
 from liger_kernel.transformers.rope import liger_rotary_pos_emb
 from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
@@ -43,6 +42,9 @@ class LigerPlugin(BasePlugin):
 
     def pre_model_load(self, cfg):
         if cfg.model_config_type == "llama":
+            from liger_kernel.transformers.model.llama import (
+                lce_forward as llama_lce_forward,
+            )
             from transformers.models.llama import modeling_llama
 
             if cfg.liger_rope:
@@ -57,6 +59,9 @@ class LigerPlugin(BasePlugin):
                 modeling_llama.LlamaForCausalLM.forward = llama_lce_forward
 
         elif cfg.model_config_type == "mistral":
+            from liger_kernel.transformers.model.mistral import (
+                lce_forward as mistral_lce_forward,
+            )
             from transformers.models.mistral import modeling_mistral
 
             if cfg.liger_rope:
@@ -68,11 +73,12 @@ class LigerPlugin(BasePlugin):
             if cfg.liger_cross_entropy:
                 modeling_mistral.CrossEntropyLoss = LigerCrossEntropyLoss
             if cfg.liger_fused_linear_cross_entropy:
-                logging.warning(
-                    "Fused linear cross entropy is not supported for Mistral."
-                )
+                modeling_mistral.MistralForCausalLM.forward = mistral_lce_forward
 
         elif cfg.model_config_type == "gemma":
+            from liger_kernel.transformers.model.gemma import (
+                lce_forward as gemma_lce_forward,
+            )
             from transformers.models.gemma import modeling_gemma
 
             if cfg.liger_rope:
@@ -84,9 +90,7 @@ class LigerPlugin(BasePlugin):
             if cfg.liger_cross_entropy:
                 modeling_gemma.CrossEntropyLoss = LigerCrossEntropyLoss
             if cfg.liger_fused_linear_cross_entropy:
-                logging.warning(
-                    "Fused linear cross entropy is not supported for Gemma."
-                )
+                modeling_gemma.GemmaForCausalLM.forward = gemma_lce_forward
 
         elif cfg.model_config_type == "jamba":
             from transformers.models.jamba import modeling_jamba
@@ -145,3 +149,36 @@ class LigerPlugin(BasePlugin):
                 modeling_mod.CrossEntropyLoss = LigerCrossEntropyLoss
             if cfg.liger_fused_linear_cross_entropy:
                 modeling_mod.DeepseekV2ForCausalLM.forward = deepseekv2_lce_forward
+
+        elif cfg.model_config_type == "gemma2":
+            from transformers.models.gemma2 import modeling_gemma2
+
+            if cfg.liger_rope:
+                modeling_gemma2.apply_rotary_pos_emb = liger_rotary_pos_emb
+            if cfg.liger_rms_norm:
+                modeling_gemma2.Gemma2RMSNorm = LigerRMSNorm
+            if cfg.liger_swiglu:
+                modeling_gemma2.Gemma2MLP = LigerGEGLUMLP
+            if cfg.liger_cross_entropy:
+                modeling_gemma2.CrossEntropyLoss = LigerCrossEntropyLoss
+            if cfg.liger_fused_linear_cross_entropy:
+                logging.warning(
+                    "Fused linear cross entropy is not supported for Gemma 2."
+                )
+
+        elif cfg.model_config_type == "phi3":
+            from liger_kernel.transformers.model.phi3 import (
+                lce_forward as phi3_lce_forward,
+            )
+            from transformers.models.phi3 import modeling_phi3
+
+            if cfg.liger_rope:
+                modeling_phi3.apply_rotary_pos_emb = liger_rotary_pos_emb
+            if cfg.liger_rms_norm:
+                modeling_phi3.Phi3RMSNorm = LigerRMSNorm
+            if cfg.liger_swiglu:
+                modeling_phi3.Phi3MLP = LigerSwiGLUMLP
+            if cfg.liger_cross_entropy:
+                modeling_phi3.CrossEntropyLoss = LigerCrossEntropyLoss
+            if cfg.liger_fused_linear_cross_entropy:
+                modeling_phi3.Phi3ForCausalLM.forward = phi3_lce_forward

From 3c6b9eda2ecffb0204eb1c29635f75c0115317b3 Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Sat, 31 Aug 2024 22:49:35 -0400
Subject: [PATCH 04/13] run pytests with varied pytorch versions too (#1883)

---
 .github/workflows/tests-nightly.yml | 5 +++++
 .github/workflows/tests.yml         | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/.github/workflows/tests-nightly.yml b/.github/workflows/tests-nightly.yml
index 6b35698cb..30ed397ce 100644
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -25,6 +25,7 @@ jobs:
       fail-fast: false
       matrix:
         python_version: ["3.10", "3.11"]
+        pytorch_version: ["2.3.1", "2.4.0"]
     timeout-minutes: 20
 
     steps:
@@ -37,6 +38,10 @@ jobs:
           python-version: ${{ matrix.python_version }}
           cache: 'pip' # caching pip dependencies
 
+      - name: Install PyTorch
+        run: |
+          pip3 install torch==${{ matrix.pytorch_version }} --index-url https://download.pytorch.org/whl/cpu
+
       - name: Update requirements.txt
         run: |
           sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 74b4bcfbd..c104e92c2 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -36,6 +36,7 @@ jobs:
       fail-fast: false
       matrix:
         python_version: ["3.10", "3.11"]
+        pytorch_version: ["2.3.1", "2.4.0"]
     timeout-minutes: 20
 
     steps:
@@ -48,6 +49,10 @@ jobs:
           python-version: ${{ matrix.python_version }}
           cache: 'pip' # caching pip dependencies
 
+      - name: Install PyTorch
+        run: |
+          pip3 install torch==${{ matrix.pytorch_version }} --index-url https://download.pytorch.org/whl/cpu
+
       - name: Install dependencies
         run: |
           pip3 install --upgrade pip

From bdab3ec587f9e85f5684cf57302a1bf93e78de1e Mon Sep 17 00:00:00 2001
From: Chiwan Park <chiwanpark@hotmail.com>
Date: Mon, 2 Sep 2024 07:34:24 +0900
Subject: [PATCH 05/13] Fix RMSNorm monkey patch for Gemma models (#1886)

---
 src/axolotl/integrations/liger/__init__.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/axolotl/integrations/liger/__init__.py b/src/axolotl/integrations/liger/__init__.py
index d58349932..2047f3815 100644
--- a/src/axolotl/integrations/liger/__init__.py
+++ b/src/axolotl/integrations/liger/__init__.py
@@ -20,6 +20,7 @@ It is designed to be performant, correct, and light-weight.
 """
 import logging
 import sys
+from functools import partial
 
 from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss
 from liger_kernel.transformers.geglu import LigerGEGLUMLP
@@ -84,7 +85,9 @@ class LigerPlugin(BasePlugin):
             if cfg.liger_rope:
                 modeling_gemma.apply_rotary_pos_emb = liger_rotary_pos_emb
             if cfg.liger_rms_norm:
-                modeling_gemma.GemmaRMSNorm = LigerRMSNorm
+                modeling_gemma.GemmaRMSNorm = partial(
+                    LigerRMSNorm, offset=1.0, init_fn="zeros", casting_mode="gemma"
+                )
             if cfg.liger_swiglu:
                 modeling_gemma.GemmaMLP = LigerGEGLUMLP
             if cfg.liger_cross_entropy:
@@ -156,7 +159,9 @@ class LigerPlugin(BasePlugin):
             if cfg.liger_rope:
                 modeling_gemma2.apply_rotary_pos_emb = liger_rotary_pos_emb
             if cfg.liger_rms_norm:
-                modeling_gemma2.Gemma2RMSNorm = LigerRMSNorm
+                modeling_gemma2.Gemma2RMSNorm = partial(
+                    LigerRMSNorm, offset=1.0, init_fn="zeros", casting_mode="gemma"
+                )
             if cfg.liger_swiglu:
                 modeling_gemma2.Gemma2MLP = LigerGEGLUMLP
             if cfg.liger_cross_entropy:

From 0aeb277456f0ed79ab46191a12998fccc257d414 Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Sun, 1 Sep 2024 19:29:37 -0400
Subject: [PATCH 06/13] add e2e smoke tests for llama liger integration (#1884)

* add e2e smoke tests for llama liger integration

* fix import

* don't use __main__ for test

* consolidate line
---
 cicd/cicd.sh                       |   4 +-
 tests/e2e/integrations/__init__.py |   0
 tests/e2e/integrations/liger.py    | 110 +++++++++++++++++++++++++++++
 3 files changed, 112 insertions(+), 2 deletions(-)
 create mode 100644 tests/e2e/integrations/__init__.py
 create mode 100644 tests/e2e/integrations/liger.py

diff --git a/cicd/cicd.sh b/cicd/cicd.sh
index eceda9b37..104a8f84a 100755
--- a/cicd/cicd.sh
+++ b/cicd/cicd.sh
@@ -2,5 +2,5 @@
 set -e
 
 pytest --ignore=tests/e2e/ /workspace/axolotl/tests/
-pytest -n1 --dist loadfile -v /workspace/axolotl/tests/e2e/patched/
-pytest --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ /workspace/axolotl/tests/e2e/
+pytest -n1 --dist loadfile -v /workspace/axolotl/tests/e2e/patched/ /workspace/axolotl/tests/e2e/integrations/
+pytest --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ --ignore=tests/e2e/integrations/ /workspace/axolotl/tests/e2e/
diff --git a/tests/e2e/integrations/__init__.py b/tests/e2e/integrations/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/e2e/integrations/liger.py b/tests/e2e/integrations/liger.py
new file mode 100644
index 000000000..4497cebe3
--- /dev/null
+++ b/tests/e2e/integrations/liger.py
@@ -0,0 +1,110 @@
+"""
+Simple end-to-end test for Liger integration
+"""
+
+import unittest
+from pathlib import Path
+
+from axolotl.cli import load_datasets
+from axolotl.common.cli import TrainerCliArgs
+from axolotl.train import train
+from axolotl.utils.config import normalize_config
+from axolotl.utils.dict import DictDefault
+
+from ..utils import with_temp_dir
+
+
+class LigerIntegrationTestCase(unittest.TestCase):
+    """
+    e2e tests for liger integration with Axolotl
+    """
+
+    @with_temp_dir
+    def test_llama_wo_flce(self, temp_dir):
+        cfg = DictDefault(
+            {
+                "base_model": "JackFram/llama-68m",
+                "tokenizer_type": "LlamaTokenizer",
+                "plugins": [
+                    "axolotl.integrations.liger.LigerPlugin",
+                ],
+                "liger_rope": True,
+                "liger_rms_norm": True,
+                "liger_swiglu": True,
+                "liger_cross_entropy": True,
+                "liger_fused_linear_cross_entropy": False,
+                "sequence_len": 1024,
+                "val_set_size": 0.1,
+                "special_tokens": {
+                    "unk_token": "<unk>",
+                    "bos_token": "<s>",
+                    "eos_token": "</s>",
+                },
+                "datasets": [
+                    {
+                        "path": "mhenrichsen/alpaca_2k_test",
+                        "type": "alpaca",
+                    },
+                ],
+                "num_epochs": 1,
+                "micro_batch_size": 8,
+                "gradient_accumulation_steps": 1,
+                "output_dir": temp_dir,
+                "learning_rate": 0.00001,
+                "optimizer": "adamw_torch",
+                "lr_scheduler": "cosine",
+                "save_safetensors": True,
+                "bf16": "auto",
+            }
+        )
+        normalize_config(cfg)
+        cli_args = TrainerCliArgs()
+        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
+
+        train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
+        assert (Path(temp_dir) / "model.safetensors").exists()
+
+    @with_temp_dir
+    def test_llama_w_flce(self, temp_dir):
+        cfg = DictDefault(
+            {
+                "base_model": "JackFram/llama-68m",
+                "tokenizer_type": "LlamaTokenizer",
+                "plugins": [
+                    "axolotl.integrations.liger.LigerPlugin",
+                ],
+                "liger_rope": True,
+                "liger_rms_norm": True,
+                "liger_swiglu": True,
+                "liger_cross_entropy": False,
+                "liger_fused_linear_cross_entropy": True,
+                "sequence_len": 1024,
+                "val_set_size": 0.1,
+                "special_tokens": {
+                    "unk_token": "<unk>",
+                    "bos_token": "<s>",
+                    "eos_token": "</s>",
+                },
+                "datasets": [
+                    {
+                        "path": "mhenrichsen/alpaca_2k_test",
+                        "type": "alpaca",
+                    },
+                ],
+                "num_epochs": 1,
+                "micro_batch_size": 8,
+                "gradient_accumulation_steps": 1,
+                "output_dir": temp_dir,
+                "learning_rate": 0.00001,
+                "optimizer": "adamw_torch",
+                "lr_scheduler": "cosine",
+                "save_safetensors": True,
+                "bf16": "auto",
+            }
+        )
+        normalize_config(cfg)
+        cli_args = TrainerCliArgs()
+        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
+
+        train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
+        assert (Path(temp_dir) / "model.safetensors").exists()

From 4e5400c732c6b8baf2d0f7a700ce773b1501b1fc Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Tue, 3 Sep 2024 20:02:44 -0400
Subject: [PATCH 07/13] support for auto_find_batch_size when packing (#1885)

* support for auto_find_batch_size when packing

* make sure to return data from validation

* make sure to return data from validation

* actually expose multipack_real_batches in the config

* calculate gathered efficiency in sampler

* tweak to fix auto find and use actual sampler len for multipack

* uncomment

* use args for bsz when not available from auto find
---
 src/axolotl/core/trainer_builder.py           | 15 ++++---
 .../config/models/input/v0_4_1/__init__.py    |  3 ++
 src/axolotl/utils/samplers/multipack.py       | 40 +++++++++++++++++--
 src/axolotl/utils/trainer.py                  |  2 +-
 4 files changed, 50 insertions(+), 10 deletions(-)

diff --git a/src/axolotl/core/trainer_builder.py b/src/axolotl/core/trainer_builder.py
index 656ded255..f4cd25783 100755
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -506,9 +506,10 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
                 batch_max_len = self.args.max_seq_length
             else:
                 batch_size = 1
-                batch_max_len = (
-                    self.args.per_device_train_batch_size * self.args.max_seq_length
+                train_batch_size = (
+                    self.state.train_batch_size or self.args.per_device_train_batch_size
                 )
+                batch_max_len = train_batch_size * self.args.max_seq_length
             return MultipackBatchSampler(
                 RandomSampler(self.train_dataset),
                 lengths=get_dataset_lengths(self.train_dataset),
@@ -1379,6 +1380,10 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
             training_arguments_kwargs[
                 "per_device_eval_batch_size"
             ] = self.cfg.eval_batch_size
+        if self.cfg.auto_find_batch_size is not None:
+            training_arguments_kwargs[
+                "auto_find_batch_size"
+            ] = self.cfg.auto_find_batch_size
         training_arguments_kwargs[
             "gradient_accumulation_steps"
         ] = self.cfg.gradient_accumulation_steps
@@ -1461,9 +1466,9 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
         )
 
         training_arguments_kwargs["sample_packing"] = bool(self.cfg.sample_packing)
-        training_arguments_kwargs[
-            "multipack_real_batches"
-        ] = not self.cfg.flash_attention
+        training_arguments_kwargs["multipack_real_batches"] = (
+            not self.cfg.flash_attention or self.cfg.multipack_real_batches
+        )
         training_arguments_kwargs["eval_sample_packing"] = bool(
             self.cfg.eval_sample_packing
         )
diff --git a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
index 65a2c5409..9044047cc 100644
--- a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
+++ b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
@@ -355,6 +355,8 @@ class HyperparametersConfig(BaseModel):
         },
     )
 
+    auto_find_batch_size: Optional[bool] = None
+
     train_on_inputs: Optional[bool] = False
     group_by_length: Optional[bool] = None
 
@@ -592,6 +594,7 @@ class AxolotlInputConfig(
     eval_sample_packing: Optional[bool] = None
     pad_to_sequence_len: Optional[bool] = None
     curriculum_sampling: Optional[bool] = None
+    multipack_real_batches: Optional[bool] = None
 
     # for PoSE context length extension
     use_pose: Optional[bool] = None
diff --git a/src/axolotl/utils/samplers/multipack.py b/src/axolotl/utils/samplers/multipack.py
index 957ca5746..205c2894d 100644
--- a/src/axolotl/utils/samplers/multipack.py
+++ b/src/axolotl/utils/samplers/multipack.py
@@ -11,6 +11,8 @@ import numba
 import numpy as np
 from torch.utils.data import BatchSampler, Sampler
 
+from axolotl.utils.distributed import reduce_and_broadcast
+
 LOG = logging.getLogger("axolotl.utils.samplers.multipack")
 
 
@@ -174,16 +176,46 @@ class MultipackBatchSampler(BatchSampler):
     def efficiency(self):
         return self.eff_total_used / self.eff_total_slots
 
+    def gather_efficiency(self):
+        def calc_sample_packing_eff_est(estimates: List[float]):
+            LOG.debug(f"sample_packing_eff_est across ranks: {repr(estimates)}")
+            return math.floor(0.997 * max(estimates))
+
+        sample_packing_actual_eff_all = reduce_and_broadcast(
+            lambda: self.efficiency(),  # pylint: disable=unnecessary-lambda
+            calc_sample_packing_eff_est,
+        )
+        sample_packing_eff_est = (
+            math.ceil(sample_packing_actual_eff_all * 200.0) / 200.0
+        )
+        return sample_packing_eff_est
+
+    def gather_len_batches(self, num):
+        def calc_min_len(estimates: list[(int, float)]):
+            LOG.info(f"gather_len_batches: {repr(estimates)}")
+            return math.floor(0.998 * min(estimates))
+
+        min_len_batches = reduce_and_broadcast(
+            lambda: num,
+            calc_min_len,
+        )
+        return min_len_batches
+
     def __len__(self):
-        self.num_batches()
-        return self._len_est()
+        len_batches = self.num_batches()
+        return self.gather_len_batches(len_batches)
 
     def _len_est(self):
+        efficiency = (
+            self.packing_efficiency_estimate
+            if self.packing_efficiency_estimate
+            else self.gather_efficiency()
+        )
         world_size = int(os.getenv("WORLD_SIZE", "1"))
         lengths_sum = np.sum(self.lengths)
         lengths_sum_per_device = lengths_sum // world_size
         LOG.info(
-            f"packing_efficiency_estimate: {self.packing_efficiency_estimate} "
+            f"packing_efficiency_estimate: {efficiency} "
             f"total_num_tokens per device: {lengths_sum_per_device}"
         )
 
@@ -195,7 +227,7 @@ class MultipackBatchSampler(BatchSampler):
                 * math.floor(
                     0.99
                     * lengths_sum_per_device
-                    / self.packing_efficiency_estimate
+                    / efficiency
                     // (self.batch_max_len * self.batch_size)
                 )
                 - 1
diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py
index f4e1fc6cb..1029fff13 100644
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -357,7 +357,7 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
                 main_process_only=True,
             )
         else:
-            if cfg.flash_attention:
+            if cfg.flash_attention and not cfg.multipack_real_batches:
                 sampler_batch_size = 1
                 batch_max_len = cfg.micro_batch_size * cfg.sequence_len
             else:

From dca1fe47d44d69c4558729070a3716b6f79e555c Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Wed, 4 Sep 2024 11:28:47 -0400
Subject: [PATCH 08/13] fix optimizer + fsdp combination in example (#1893)

---
 examples/llama-3/fft-8b-liger-fsdp.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/llama-3/fft-8b-liger-fsdp.yaml b/examples/llama-3/fft-8b-liger-fsdp.yaml
index a64965d20..e84d221f8 100644
--- a/examples/llama-3/fft-8b-liger-fsdp.yaml
+++ b/examples/llama-3/fft-8b-liger-fsdp.yaml
@@ -31,7 +31,7 @@ wandb_log_model:
 gradient_accumulation_steps: 4
 micro_batch_size: 2
 num_epochs: 1
-optimizer: paged_adamw_8bit
+optimizer: adamw_torch
 lr_scheduler: cosine
 learning_rate: 2e-5
 

From f18f4268b53be3c9518e2223b1f0aa874d5938c3 Mon Sep 17 00:00:00 2001
From: Tijmen de Haan <tijm6140@gmail.com>
Date: Thu, 5 Sep 2024 18:33:19 +0900
Subject: [PATCH 09/13] Docs for AMD-based HPC systems (#1891)

* Add documentation for installing on AMD-based HPC systems.

* Accept suggestion to add note about deepspeed

Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>

* Update _quarto.yml with amd_hpc doc

---------

Co-authored-by: Tijmen de Haan <tijmen.dehaan@gmail.com>
Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>
---
 _quarto.yml      |   1 +
 docs/amd_hpc.qmd | 108 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 109 insertions(+)
 create mode 100644 docs/amd_hpc.qmd

diff --git a/_quarto.yml b/_quarto.yml
index 6b2eed971..acb487258 100644
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -37,6 +37,7 @@ website:
             - docs/mac.qmd
             - docs/multi-node.qmd
             - docs/unsloth.qmd
+            - docs/amd_hpc.qmd
         - section: "Dataset Formats"
           contents: docs/dataset-formats/*
         - section: "Reference"
diff --git a/docs/amd_hpc.qmd b/docs/amd_hpc.qmd
new file mode 100644
index 000000000..92eadee03
--- /dev/null
+++ b/docs/amd_hpc.qmd
@@ -0,0 +1,108 @@
+---
+title: Training with AMD GPUs on HPC Systems
+description: A comprehensive guide for using Axolotl on distributed systems with AMD GPUs
+---
+
+This guide provides step-by-step instructions for installing and configuring Axolotl on a High-Performance Computing (HPC) environment equipped with AMD GPUs.
+
+## Setup
+
+### 1. Install Python
+
+We recommend using Miniforge, a minimal conda-based Python distribution:
+
+```bash
+curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
+bash Miniforge3-$(uname)-$(uname -m).sh
+```
+
+### 2. Configure Python Environment
+Add Python to your PATH and ensure it's available at login:
+
+```bash
+echo 'export PATH=~/miniforge3/bin:$PATH' >> ~/.bashrc
+echo 'if [ -f ~/.bashrc ]; then . ~/.bashrc; fi' >> ~/.bash_profile
+```
+
+### 3. Load AMD GPU Software
+
+Load the ROCm module:
+
+```bash
+module load rocm/5.7.1
+```
+
+Note: The specific module name and version may vary depending on your HPC system. Consult your system documentation for the correct module name.
+
+### 4. Install PyTorch
+
+Install PyTorch with ROCm support:
+
+```bash
+pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.7 --force-reinstall
+```
+
+### 5. Install Flash Attention
+
+Clone and install the Flash Attention repository:
+
+```bash
+git clone --recursive https://github.com/ROCmSoftwarePlatform/flash-attention.git
+export GPU_ARCHS="gfx90a"
+cd flash-attention
+export PYTHON_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])')
+patch "${PYTHON_SITE_PACKAGES}/torch/utils/hipify/hipify_python.py" hipify_patch.patch
+pip install .
+```
+
+### 6. Install Axolotl
+
+Clone and install Axolotl:
+
+```bash
+git clone https://github.com/axolotl-ai-cloud/axolotl
+cd axolotl
+pip install packaging ninja
+pip install -e .
+```
+
+### 7. Apply xformers Workaround
+
+xformers appears to be incompatible with ROCm. Apply the following workarounds:
+ - Edit $HOME/packages/axolotl/src/axolotl/monkeypatch/llama_attn_hijack_flash.py modifying the code to always return `False` for SwiGLU availability from xformers.
+ - Edit $HOME/miniforge3/lib/python3.10/site-packages/xformers/ops/swiglu_op.py replacing the "SwiGLU" function with a pass statement.
+
+### 8. Prepare Job Submission Script
+
+Create a script for job submission using your HPC's particular software (e.g. Slurm, PBS). Include necessary environment setup and the command to run Axolotl training. If the compute node(s) do(es) not have internet access, it is recommended to include
+
+```bash
+export TRANSFORMERS_OFFLINE=1
+export HF_DATASETS_OFFLINE=1
+```
+
+### 9. Download Base Model
+
+Download a base model using the Hugging Face CLI:
+
+```bash
+huggingface-cli download meta-llama/Meta-Llama-3.1-8B --local-dir ~/hfdata/llama3.1-8B
+```
+
+### 10. Create Axolotl Configuration
+
+Create an Axolotl configuration file (YAML format) tailored to your specific training requirements and dataset. Use FSDP for multi-node training.
+
+Note: Deepspeed did not work at the time of testing. However, if anyone managed to get it working, please let us know.
+
+### 11. Preprocess Data
+
+Run preprocessing on the login node:
+
+```bash
+CUDA_VISIBLE_DEVICES="" python -m axolotl.cli.preprocess /path/to/your/config.yaml
+```
+
+### 12. Train
+
+You are now ready to submit your previously prepared job script. 🚂
\ No newline at end of file

From 93b769a9792db5908885537ed42fb7eef80f0f1c Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Thu, 5 Sep 2024 09:58:21 -0400
Subject: [PATCH 10/13] lint fix and update gha regex (#1899)

---
 .github/workflows/lint.yml | 2 +-
 docs/amd_hpc.qmd           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 671be4b65..919cfd654 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -6,7 +6,7 @@ on:
        - '**.py'
        - 'requirements.txt'
        - '.github/workflows/*.yml'
-       - "*.md"
+       - "*.[q]md"
        - "examples/**/*.y[a]?ml"
   workflow_dispatch:
 
diff --git a/docs/amd_hpc.qmd b/docs/amd_hpc.qmd
index 92eadee03..d1c274e15 100644
--- a/docs/amd_hpc.qmd
+++ b/docs/amd_hpc.qmd
@@ -105,4 +105,4 @@ CUDA_VISIBLE_DEVICES="" python -m axolotl.cli.preprocess /path/to/your/config.ya
 
 ### 12. Train
 
-You are now ready to submit your previously prepared job script. 🚂
\ No newline at end of file
+You are now ready to submit your previously prepared job script. 🚂

From ab461d83c4b78df70d310ce45e33ef145796611d Mon Sep 17 00:00:00 2001
From: Alpay Ariyak <98838263+alpayariyak@users.noreply.github.com>
Date: Thu, 5 Sep 2024 07:11:31 -0700
Subject: [PATCH 11/13] Fix documentation for pre-tokenized dataset (#1894)

It's currently asking to not add BOS and EOS, stating that Axolotl adds them, but this is not true
---
 docs/dataset-formats/tokenized.qmd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/dataset-formats/tokenized.qmd b/docs/dataset-formats/tokenized.qmd
index b2ea003c0..61028cae7 100644
--- a/docs/dataset-formats/tokenized.qmd
+++ b/docs/dataset-formats/tokenized.qmd
@@ -7,7 +7,7 @@ order: 5
 - Pass an empty `type:` in your axolotl config.
 - Columns in Dataset must be exactly `input_ids`, `attention_mask`, `labels`
 - To indicate that a token should be ignored during training, set its corresponding label to `-100`.
-- Do not add BOS/EOS. Axolotl will add them for you based on the default tokenizer for the model you're using.
+- You must add BOS and EOS, and make sure that you are training on EOS by not setting its label to -100.
 - For pretraining, do not truncate/pad documents to the context window length.
 - For instruction training, documents must be truncated/padded as desired.
 

From 6e354682e3c1735d3f7fb9e362280c38e922260f Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Thu, 5 Sep 2024 10:58:50 -0400
Subject: [PATCH 12/13] fix zero3 integration (#1897)

* fix zero3 integration

* bump transformers and accelerate too
---
 requirements.txt             | 4 ++--
 src/axolotl/utils/trainer.py | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index b8d0a388b..c61216e63 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,10 +1,10 @@
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
 packaging==23.2
 peft==0.12.0
-transformers==4.44.0
+transformers==4.44.2
 tokenizers>=0.19.1
 bitsandbytes==0.43.3
-accelerate==0.33.0
+accelerate==0.34.0
 datasets==2.20.0
 deepspeed==0.14.4
 pydantic==2.6.3
diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py
index 1029fff13..89ae4e697 100644
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -425,7 +425,8 @@ def setup_deepspeed_env(cfg, stage=None):
         os.environ["ACCELERATE_DEEPSPEED_ZERO_STAGE"] = str(stage)
         if stage == 3:
             os.environ["ACCELERATE_DEEPSPEED_ZERO3_INIT"] = "true"
-    HfTrainerDeepSpeedConfig(cfg.deepspeed)
+    # If we don't assign this, it doesn't actually get set in the accelerate weakref
+    _ = HfTrainerDeepSpeedConfig(cfg.deepspeed)
 
 
 def setup_fsdp_envs(cfg):

From 3853ab7ae9220dfbd78cd628e54fde75fb89df97 Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Sat, 7 Sep 2024 14:39:31 -0400
Subject: [PATCH 13/13] bump accelerate to 0.34.2 (#1901)

* bump accelerate

* add fixture to predownload the test model

* change fixture
---
 .github/workflows/multi-gpu-e2e.yml | 3 +++
 requirements.txt                    | 2 +-
 tests/e2e/multigpu/test_llama.py    | 7 +++++++
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/multi-gpu-e2e.yml b/.github/workflows/multi-gpu-e2e.yml
index 91cbaf957..ab886c67f 100644
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -1,6 +1,9 @@
 name: docker-multigpu-tests-biweekly
 
 on:
+  pull_request:
+    paths:
+      - 'tests/e2e/multigpu/*.py'
   workflow_dispatch:
   schedule:
     - cron: '0 0 * * 1,4'  # Runs at 00:00 UTC every monday & thursday
diff --git a/requirements.txt b/requirements.txt
index c61216e63..83116af60 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ peft==0.12.0
 transformers==4.44.2
 tokenizers>=0.19.1
 bitsandbytes==0.43.3
-accelerate==0.34.0
+accelerate==0.34.2
 datasets==2.20.0
 deepspeed==0.14.4
 pydantic==2.6.3
diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py
index 344c57fb8..61bb8ed32 100644
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -10,6 +10,7 @@ from pathlib import Path
 import pytest
 import yaml
 from accelerate.test_utils import execute_subprocess_async
+from huggingface_hub import snapshot_download
 
 from axolotl.utils.dict import DictDefault
 
@@ -19,6 +20,12 @@ LOG = logging.getLogger("axolotl.tests.e2e.multigpu")
 os.environ["WANDB_DISABLED"] = "true"
 
 
+@pytest.fixture(scope="session", autouse=True)
+def download_model():
+    # download the model
+    snapshot_download("TinyLlama/TinyLlama_v1.1")
+
+
 class TestMultiGPULlama(unittest.TestCase):
     """
     Test case for Llama models using LoRA