From 54392ac8a662f0746f4128e68d1088edb58a3711 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Fri, 9 Aug 2024 11:50:13 -0400
Subject: [PATCH] Attempt to run multigpu in PR CI for now to ensure it works
 (#1815) [skip ci]

* Attempt to run multigpu in PR CI for now to ensure it works
* fix yaml file
* forgot to include multigpu tests
* fix call to cicd.multigpu
* dump dictdefault to dict for yaml conversion
* use to_dict instead of casting
* 16bit-lora w flash attention, 8bit lora seems problematic
* add llama fsdp test
* more tests
* Add test for qlora + fsdp with prequant
* limit accelerate to 2 processes and disable broken qlora+fsdp+bnb test
* move multigpu tests to biweekly
---
 .github/workflows/multi-gpu-e2e.yml |  44 ++++
 cicd/cicd.sh                        |   2 +-
 cicd/multigpu.py                    |  77 +++++++
 cicd/multigpu.sh                    |   5 +
 cicd/tests.py                       |   8 +-
 tests/e2e/multigpu/__init__.py      |   0
 tests/e2e/multigpu/test_llama.py    | 341 ++++++++++++++++++++++++++++
 7 files changed, 473 insertions(+), 4 deletions(-)
 create mode 100644 .github/workflows/multi-gpu-e2e.yml
 create mode 100644 cicd/multigpu.py
 create mode 100755 cicd/multigpu.sh
 create mode 100644 tests/e2e/multigpu/__init__.py
 create mode 100644 tests/e2e/multigpu/test_llama.py

diff --git a/.github/workflows/multi-gpu-e2e.yml b/.github/workflows/multi-gpu-e2e.yml
new file mode 100644
index 000000000..c854af9ab
--- /dev/null
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -0,0 +1,44 @@
+name: docker-multigpu-tests-biweekly
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '0 0 * * 1,4'  # Runs at 00:00 UTC every Monday & Thursday
+
+jobs:
+  test-axolotl-multigpu:
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - cuda: 121
+            cuda_version: 12.1.1
+            python_version: "3.11"
+            pytorch: 2.3.1
+            axolotl_extras:
+            num_gpus: 2
+    runs-on: [self-hosted, modal]
+    timeout-minutes: 120
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Install Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+      - name: Install Modal
+        run: |
+          python -m pip install --upgrade pip
+          pip install modal==0.63.64 jinja2
+      - name: Update env vars
+        run: |
+          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
+          echo "PYTORCH_VERSION=${{ matrix.pytorch }}" >> $GITHUB_ENV
+          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args }}" >> $GITHUB_ENV
+          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras }}" >> $GITHUB_ENV
+          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
+          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
+      - name: Run tests job on Modal
+        run: |
+          modal run cicd.multigpu
diff --git a/cicd/cicd.sh b/cicd/cicd.sh
index 180150ea2..eceda9b37 100755
--- a/cicd/cicd.sh
+++ b/cicd/cicd.sh
@@ -3,4 +3,4 @@ set -e
 
 pytest --ignore=tests/e2e/ /workspace/axolotl/tests/
 pytest -n1 --dist loadfile -v /workspace/axolotl/tests/e2e/patched/
-pytest --ignore=tests/e2e/patched/ /workspace/axolotl/tests/e2e/
+pytest --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ /workspace/axolotl/tests/e2e/
diff --git a/cicd/multigpu.py b/cicd/multigpu.py
new file mode 100644
index 000000000..be10fbc73
--- /dev/null
+++ b/cicd/multigpu.py
@@ -0,0 +1,77 @@
+"""
+modal application to run axolotl gpu tests in Modal
+"""
+# pylint: disable=duplicate-code
+
+import os
+import pathlib
+import tempfile
+
+import jinja2
+import modal
+from jinja2 import select_autoescape
+from modal import Image, Stub
+
+cicd_path = pathlib.Path(__file__).parent.resolve()
+
+template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
+template_env = jinja2.Environment(
+    loader=template_loader, autoescape=select_autoescape()
+)
+df_template = template_env.get_template("Dockerfile.jinja")
+
+df_args = {
+    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
+    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
+    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.3.1"),
+    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.3.1"),
+    "CUDA": os.environ.get("CUDA", "121"),
+    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
+    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
+}
+
+dockerfile_contents = df_template.render(**df_args)
+
+temp_dir = tempfile.mkdtemp()
+with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
+    f.write(dockerfile_contents)
+
+cicd_image = (
+    Image.from_dockerfile(
+        pathlib.Path(temp_dir) / "Dockerfile",
+        force_build=True,
+        gpu="A10G",
+    )
+    .env(df_args)
+    .pip_install("fastapi==0.110.0", "pydantic==2.6.3")
+)
+
+stub = Stub("Axolotl CI/CD", secrets=[])
+
+
+N_GPUS = int(os.environ.get("N_GPUS", 2))
+GPU_CONFIG = modal.gpu.H100(count=N_GPUS)
+
+
+def run_cmd(cmd: str, run_folder: str):
+    import subprocess  # nosec
+
+    # Propagate errors from subprocess.
+    if exit_code := subprocess.call(cmd.split(), cwd=run_folder):  # nosec
+        exit(exit_code)  # pylint: disable=consider-using-sys-exit
+
+
+@stub.function(
+    image=cicd_image,
+    gpu=GPU_CONFIG,
+    timeout=45 * 60,
+    cpu=8.0,
+    memory=131072 * N_GPUS,
+)
+def cicd_pytest():
+    run_cmd("./cicd/multigpu.sh", "/workspace/axolotl")
+
+
+@stub.local_entrypoint()
+def main():
+    cicd_pytest.remote()
diff --git a/cicd/multigpu.sh b/cicd/multigpu.sh
new file mode 100755
index 000000000..ff7f9f19a
--- /dev/null
+++ b/cicd/multigpu.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+set -e
+
+# only run one test at a time so as not to OOM the GPU
+pytest -n1 /workspace/axolotl/tests/e2e/multigpu/
diff --git a/cicd/tests.py b/cicd/tests.py
index bfbdb7b90..c21467637 100644
--- a/cicd/tests.py
+++ b/cicd/tests.py
@@ -1,6 +1,8 @@
 """
 modal application to run axolotl gpu tests in Modal
 """
+# pylint: disable=duplicate-code
+
 import os
 import pathlib
 import tempfile
@@ -21,9 +23,9 @@ df_template = template_env.get_template("Dockerfile.jinja")
 df_args = {
     "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
     "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
-    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.0.1"),
-    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.10-cu118-2.0.1"),
-    "CUDA": os.environ.get("CUDA", "118"),
+    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.3.1"),
+    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.3.1"),
+    "CUDA": os.environ.get("CUDA", "121"),
     "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
     "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
 }
diff --git a/tests/e2e/multigpu/__init__.py b/tests/e2e/multigpu/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py
new file mode 100644
index 000000000..344c57fb8
--- /dev/null
+++ b/tests/e2e/multigpu/test_llama.py
@@ -0,0 +1,341 @@
+"""
+E2E tests for multigpu lora tinyllama
+"""
+
+import logging
+import os
+import unittest
+from pathlib import Path
+
+import pytest
+import yaml
+from accelerate.test_utils import execute_subprocess_async
+
+from axolotl.utils.dict import DictDefault
+
+from ..utils import with_temp_dir
+
+LOG = logging.getLogger("axolotl.tests.e2e.multigpu")
+os.environ["WANDB_DISABLED"] = "true"
+
+
+class TestMultiGPULlama(unittest.TestCase):
+    """
+    Test case for Llama models using LoRA
+    """
+
+    @with_temp_dir
+    def test_lora_ddp(self, temp_dir):
+        # pylint: disable=duplicate-code
+        cfg = DictDefault(
+            {
+                "base_model": "TinyLlama/TinyLlama_v1.1",
+                "tokenizer_type": "LlamaTokenizer",
+                "sequence_len": 2048,
+                "adapter": "lora",
+                "lora_r": 8,
+                "lora_alpha": 16,
+                "lora_dropout": 0.05,
+                "lora_target_linear": True,
+                "val_set_size": 0.05,
+                "special_tokens": {
+                    "unk_token": "<unk>",
+                    "bos_token": "<s>",
+                    "eos_token": "</s>",
+                },
+                "datasets": [
+                    {
+                        "path": "tatsu-lab/alpaca",
+                        "type": "alpaca",
+                    },
+                ],
+                "num_epochs": 1,
+                "max_steps": 100,
+                "micro_batch_size": 4,
+                "gradient_accumulation_steps": 4,
+                "output_dir": temp_dir,
+                "learning_rate": 0.00001,
+                "optimizer": "adamw_8bit",
+                "lr_scheduler": "cosine",
+                "flash_attention": True,
+            }
+        )
+
+        # write cfg to yaml file
+        Path(temp_dir).mkdir(parents=True, exist_ok=True)
+        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
+            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
+
+        execute_subprocess_async(
+            [
+                "accelerate",
+                "launch",
+                "--num-processes",
+                "2",
+                "-m",
+                "axolotl.cli.train",
+                str(Path(temp_dir) / "config.yaml"),
+            ]
+        )
+
+    @with_temp_dir
+    def test_lora_ddp_packed(self, temp_dir):
+        # pylint: disable=duplicate-code
+        cfg = DictDefault(
+            {
+                "base_model": "TinyLlama/TinyLlama_v1.1",
+                "tokenizer_type": "LlamaTokenizer",
+                "sequence_len": 2048,
+                "sample_packing": True,
+                "eval_sample_packing": False,
+                "pad_to_sequence_len": True,
+                "adapter": "lora",
+                "lora_r": 8,
+                "lora_alpha": 16,
+                "lora_dropout": 0.05,
+                "lora_target_linear": True,
+                "val_set_size": 0.05,
+                "special_tokens": {
+                    "unk_token": "<unk>",
+                    "bos_token": "<s>",
+                    "eos_token": "</s>",
+                },
+                "datasets": [
+                    {
+                        "path": "tatsu-lab/alpaca",
+                        "type": "alpaca",
+                    },
+                ],
+                "num_epochs": 1,
+                "max_steps": 50,
+                "micro_batch_size": 4,
+                "gradient_accumulation_steps": 4,
+                "output_dir": temp_dir,
+                "learning_rate": 0.00001,
+                "optimizer": "adamw_8bit",
+                "lr_scheduler": "cosine",
+                "flash_attention": True,
+            }
+        )
+
+        # write cfg to yaml file
+        Path(temp_dir).mkdir(parents=True, exist_ok=True)
+        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
+            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
+
+        execute_subprocess_async(
+            [
+                "accelerate",
+                "launch",
+                "--num-processes",
+                "2",
+                "-m",
+                "axolotl.cli.train",
+                str(Path(temp_dir) / "config.yaml"),
+            ]
+        )
+
+    @with_temp_dir
+    def test_fsdp(self, temp_dir):
+        # pylint: disable=duplicate-code
+        cfg = DictDefault(
+            {
+                "base_model": "TinyLlama/TinyLlama_v1.1",
+                "tokenizer_type": "LlamaTokenizer",
+                "sequence_len": 2048,
+                "val_set_size": 0.05,
+                "special_tokens": {
+                    "unk_token": "<unk>",
+                    "bos_token": "<s>",
+                    "eos_token": "</s>",
+                },
+                "datasets": [
+                    {
+                        "path": "tatsu-lab/alpaca",
+                        "type": "alpaca",
+                    },
+                ],
+                "num_epochs": 1,
+                "max_steps": 100,
+                "micro_batch_size": 4,
+                "gradient_accumulation_steps": 4,
+                "output_dir": temp_dir,
+                "learning_rate": 0.00001,
+                "optimizer": "adamw_torch",
+                "lr_scheduler": "cosine",
+                "flash_attention": True,
+                "fsdp": [
+                    "full_shard",
+                    "auto_wrap",
+                ],
+                "fsdp_config": {
+                    "fsdp_limit_all_gathers": True,
+                    "fsdp_offload_params": False,
+                    "fsdp_sync_module_states": True,
+                    "fsdp_use_orig_params": False,
+                    "fsdp_cpu_ram_efficient_loading": False,
+                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
"LlamaDecoderLayer", + "fsdp_state_dict_type": "SHARDED_STATE_DICT", + "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", + }, + } + ) + + # write cfg to yaml file + Path(temp_dir).mkdir(parents=True, exist_ok=True) + with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: + fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) + + execute_subprocess_async( + [ + "accelerate", + "launch", + "--num-processes", + "2", + "-m", + "axolotl.cli.train", + str(Path(temp_dir) / "config.yaml"), + ] + ) + + @with_temp_dir + def test_fsdp_packed(self, temp_dir): + # pylint: disable=duplicate-code + cfg = DictDefault( + { + "base_model": "TinyLlama/TinyLlama_v1.1", + "tokenizer_type": "LlamaTokenizer", + "sample_packing": True, + "eval_sample_packing": False, + "pad_to_sequence_len": True, + "sequence_len": 2048, + "val_set_size": 0.05, + "special_tokens": { + "unk_token": "", + "bos_token": "", + "eos_token": "", + }, + "datasets": [ + { + "path": "tatsu-lab/alpaca", + "type": "alpaca", + }, + ], + "num_epochs": 1, + "max_steps": 100, + "micro_batch_size": 4, + "gradient_accumulation_steps": 4, + "output_dir": temp_dir, + "learning_rate": 0.00001, + "optimizer": "adamw_torch", + "lr_scheduler": "cosine", + "flash_attention": True, + "fsdp": [ + "full_shard", + "auto_wrap", + ], + "fsdp_config": { + "fsdp_limit_all_gathers": True, + "fsdp_offload_params": False, + "fsdp_sync_module_states": True, + "fsdp_use_orig_params": False, + "fsdp_cpu_ram_efficient_loading": False, + "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer", + "fsdp_state_dict_type": "SHARDED_STATE_DICT", + "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", + }, + } + ) + + # write cfg to yaml file + Path(temp_dir).mkdir(parents=True, exist_ok=True) + with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: + fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) + + execute_subprocess_async( + [ + "accelerate", + "launch", + "--num-processes", + "2", + "-m", + "axolotl.cli.train", + str(Path(temp_dir) / "config.yaml"), + ] + ) + + @pytest.mark.skip("disabled due to upstream issue") + @with_temp_dir + def test_fsdp_qlora_prequant_packed(self, temp_dir): + # pylint: disable=duplicate-code + cfg = DictDefault( + { + "base_model": "axolotl-ai-co/TinyLlama_v1.1-bnb-nf4-bf16", + "tokenizer_type": "AutoTokenizer", + "adapter": "qlora", + "load_in_4bit": True, + "lora_r": 8, + "lora_alpha": 16, + "lora_dropout": 0.05, + "lora_target_linear": True, + "lora_modules_to_save": [ + "embed_tokens", + "lm_head", + ], + "sample_packing": True, + "eval_sample_packing": False, + "pad_to_sequence_len": True, + "sequence_len": 2048, + "val_set_size": 0.05, + "special_tokens": { + "pad_token": "<|end_of_text|>", + }, + "datasets": [ + { + "path": "tatsu-lab/alpaca", + "type": "alpaca", + "split": "train[:25%]", + }, + ], + "num_epochs": 1, + "max_steps": 100, + "micro_batch_size": 4, + "gradient_accumulation_steps": 4, + "output_dir": temp_dir, + "learning_rate": 0.00001, + "optimizer": "adamw_torch", + "lr_scheduler": "cosine", + "flash_attention": True, + "fsdp": [ + "full_shard", + "auto_wrap", + ], + "fsdp_config": { + "fsdp_limit_all_gathers": True, + "fsdp_offload_params": False, + "fsdp_sync_module_states": True, + "fsdp_use_orig_params": False, + "fsdp_cpu_ram_efficient_loading": True, + "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer", + "fsdp_state_dict_type": "SHARDED_STATE_DICT", + "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", + }, + } + ) + + # write cfg to yaml file + 
+        Path(temp_dir).mkdir(parents=True, exist_ok=True)
+        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
+            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
+
+        execute_subprocess_async(
+            [
+                "accelerate",
+                "launch",
+                "--num-processes",
+                "2",
+                "-m",
+                "axolotl.cli.train",
+                str(Path(temp_dir) / "config.yaml"),
+            ]
+        )