additional validation for fsdp2, bump dep versions

make sure to patch all the loaded models
more fixes to flex for fsdp2
2025-04-06 15:18:56 -04:00 · 2025-04-06 14:45:30 -04:00 · 2025-04-06 14:24:50 -04:00 · 2025-04-06 13:55:46 -04:00 · 2025-04-06 12:30:14 -04:00 · 2025-04-06 07:55:54 -04:00
7 changed files with 26 additions and 39 deletions
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -24,13 +24,6 @@ jobs:
      fail-fast: false
      matrix:
        include:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.6.0
            axolotl_extras: vllm
            num_gpus: 2
            nightly_build: "true"
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
@@ -45,6 +38,13 @@ jobs:
            axolotl_extras: vllm
            num_gpus: 2
            nightly_build: "true"
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.6.0
            axolotl_extras: vllm
            num_gpus: 2
            nightly_build: "true"
    runs-on: [self-hosted, modal]
    timeout-minutes: 120
    steps:
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -211,7 +211,7 @@ jobs:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.6.0
+            pytorch: 2.5.1
            num_gpus: 1
            axolotl_extras: vllm
    steps:
@@ -258,7 +258,7 @@ jobs:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.5.1
+            pytorch: 2.6.0
            num_gpus: 1
            axolotl_extras: vllm
    steps:
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,14 +11,13 @@ liger-kernel==0.5.5
 packaging==23.2
-peft==0.15.1
+peft==0.15.0
 transformers==4.51.0
 tokenizers>=0.21.1
 accelerate==1.6.0
 datasets==3.5.0
 deepspeed>=0.15.4
 trl==0.16.1
 hf_xet==1.0.0
 optimum==1.16.2
 hf_transfer
--- a/src/axolotl/monkeypatch/multipack.py
+++ b/src/axolotl/monkeypatch/multipack.py
@@ -13,7 +13,6 @@ from axolotl.monkeypatch.utils import get_unpad_data
 SUPPORTED_MULTIPACK_MODEL_TYPES = [
    "mllama_text_model",
    "llama",
    "llama4",
    "mistral",
    "mixtral",
    "qwen2",
--- a/src/axolotl/utils/chat_templates.py
+++ b/src/axolotl/utils/chat_templates.py
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -7,11 +7,9 @@ import os
 from pathlib import Path
 import pytest
 import transformers
 import yaml
 from accelerate.test_utils import execute_subprocess_async
 from huggingface_hub import snapshot_download
 from packaging import version
 from transformers.testing_utils import get_torch_dist_unique_port
 from axolotl.utils.dict import DictDefault
@@ -30,10 +28,6 @@ def download_model():
    snapshot_download("HuggingFaceTB/SmolLM2-135M")
 def transformers_version_eq(required_version):
    return version.parse(transformers.__version__) == version.parse(required_version)
 class TestMultiGPULlama:
    """
    Test case for Llama models using LoRA
@@ -62,7 +56,7 @@ class TestMultiGPULlama:
                ],
                "num_epochs": 1,
                "max_steps": 2,
-                "micro_batch_size": 1,
+                "micro_batch_size": 4,
                "gradient_accumulation_steps": 4,
                # "gradient_checkpointing": True,
                "output_dir": temp_dir,
@@ -114,7 +108,7 @@ class TestMultiGPULlama:
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
@@ -122,7 +116,6 @@ class TestMultiGPULlama:
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:20%]",
                    },
                ],
                "num_epochs": 1,
@@ -200,7 +193,7 @@ class TestMultiGPULlama:
                ],
                "num_epochs": 1,
                "max_steps": 2,
-                "micro_batch_size": 2,
+                "micro_batch_size": 4,
                "gradient_accumulation_steps": 4,
                # "gradient_checkpointing": True,
                "output_dir": temp_dir,
@@ -397,7 +390,7 @@ class TestMultiGPULlama:
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sample_packing": True,
                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
+                "sequence_len": 2048,
                "val_set_size": 0.01,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
@@ -410,7 +403,7 @@ class TestMultiGPULlama:
                ],
                "num_epochs": 1,
                "max_steps": 2,
-                "micro_batch_size": 2,
+                "micro_batch_size": 4,
                "gradient_accumulation_steps": 2,
                # "gradient_checkpointing": True,
                "output_dir": temp_dir,
@@ -558,7 +551,7 @@ class TestMultiGPULlama:
                "sample_packing": True,
                "eval_sample_packing": False,
                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
+                "sequence_len": 2048,
                "val_set_size": 0.01,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
@@ -572,7 +565,7 @@ class TestMultiGPULlama:
                ],
                "num_epochs": 1,
                "max_steps": 2,
-                "micro_batch_size": 2,
+                "micro_batch_size": 4,
                "gradient_accumulation_steps": 2,
                # "gradient_checkpointing": True,
                "output_dir": temp_dir,
@@ -619,11 +612,8 @@ class TestMultiGPULlama:
            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
        )
-    # TODO: remove skip once deepspeed regression is fixed
+    @pytest.mark.skip(
-    # see https://github.com/huggingface/transformers/pull/37324
+        reason="ds-zero3 broken in main until transformers#37281 resolved"
+    )
    @pytest.mark.skipif(
        transformers_version_eq("4.51.0"),
        reason="zero3 is not supported with transformers==4.51.0",
    )
    @pytest.mark.parametrize(
        "gradient_accumulation_steps",
@@ -661,7 +651,7 @@ class TestMultiGPULlama:
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sample_packing": True,
                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
+                "sequence_len": 2048,
                "val_set_size": 0.01,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
@@ -734,7 +724,7 @@ class TestMultiGPULlama:
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sample_packing": True,
                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
+                "sequence_len": 2048,
                "val_set_size": 0.01,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
@@ -807,7 +797,7 @@ class TestMultiGPULlama:
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sample_packing": True,
                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
+                "sequence_len": 2048,
                "val_set_size": 0.01,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
@@ -895,7 +885,7 @@ class TestMultiGPULlama:
                "sample_packing": True,
                "bf16": True,
                "save_safetensors": True,
-                # "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
+                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
                "use_tensorboard": True,
            }
        )
--- a/tests/e2e/multigpu/test_ray.py
+++ b/tests/e2e/multigpu/test_ray.py
@@ -31,7 +31,7 @@ class TestMultiGPURay:
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sequence_len": 1024,
+                "sequence_len": 2048,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
@@ -94,8 +94,8 @@ class TestMultiGPURay:
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sample_packing": True,
                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
+                "sequence_len": 2048,
-                "val_set_size": 0.01,
+                "val_set_size": 0.05,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
Author	SHA1	Message	Date
Wing Lian	c7f1c191a3	additional validation for fsdp2, bump dep versions	2025-04-06 15:18:56 -04:00
Wing Lian	1a5d445413	make sure to patch all the loaded models	2025-04-06 14:45:30 -04:00
Wing Lian	7e410ab480	more fixes to flex for fsdp2	2025-04-06 14:24:50 -04:00
Wing Lian	b5a51c378b	okay, actually use fsdp2...	2025-04-06 13:55:46 -04:00
Wing Lian	c902f4222d	make sure both flex and flash attn work with fsdp2, skip fix untrained tokens	2025-04-06 12:30:14 -04:00
Wing Lian	9329db9c3a	fix fsdp2 config for ci	2025-04-06 07:55:54 -04:00
Wing Lian	ad7293f617	skip zero3 tests for this PR for now	2025-04-06 07:49:38 -04:00
Wing Lian	475125e4ca	use transformers commit with fsdp2 support	2025-04-06 07:49:06 -04:00
Wing Lian	2b5e546da0	add fsdp2 e2e tests	2025-04-06 07:49:06 -04:00
Wing Lian	252dc5c91b	liger + torch compile fix	2025-04-06 07:49:06 -04:00
Wing Lian	af3f981f51	allow 8bit optims with fsdp2	2025-04-06 07:49:06 -04:00
Wing Lian	52b96031b4	use accelerate release 1.6.0	2025-04-06 07:49:05 -04:00
Wing Lian	03dcf1a5ea	fsdp2 support	2025-04-06 07:49:05 -04:00