Compare commits

..

13 Commits

Author SHA1 Message Date
Wing Lian
c7f1c191a3 additional validation for fsdp2, bump dep versions 2025-04-06 15:18:56 -04:00
Wing Lian
1a5d445413 make sure to patch all the loaded models 2025-04-06 14:45:30 -04:00
Wing Lian
7e410ab480 more fixes to flex for fsdp2 2025-04-06 14:24:50 -04:00
Wing Lian
b5a51c378b okay, actually use fsdp2... 2025-04-06 13:55:46 -04:00
Wing Lian
c902f4222d make sure both flex and flash attn work with fsdp2, skip fix untrained tokens 2025-04-06 12:30:14 -04:00
Wing Lian
9329db9c3a fix fsdp2 config for ci 2025-04-06 07:55:54 -04:00
Wing Lian
ad7293f617 skip zero3 tests for this PR for now 2025-04-06 07:49:38 -04:00
Wing Lian
475125e4ca use transformers commit with fsdp2 support 2025-04-06 07:49:06 -04:00
Wing Lian
2b5e546da0 add fsdp2 e2e tests 2025-04-06 07:49:06 -04:00
Wing Lian
252dc5c91b liger + torch compile fix 2025-04-06 07:49:06 -04:00
Wing Lian
af3f981f51 allow 8bit optims with fsdp2 2025-04-06 07:49:06 -04:00
Wing Lian
52b96031b4 use accelerate release 1.6.0 2025-04-06 07:49:05 -04:00
Wing Lian
03dcf1a5ea fsdp2 support 2025-04-06 07:49:05 -04:00
7 changed files with 26 additions and 39 deletions

View File

@@ -24,13 +24,6 @@ jobs:
fail-fast: false
matrix:
include:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.6.0
axolotl_extras: vllm
num_gpus: 2
nightly_build: "true"
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
@@ -45,6 +38,13 @@ jobs:
axolotl_extras: vllm
num_gpus: 2
nightly_build: "true"
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.6.0
axolotl_extras: vllm
num_gpus: 2
nightly_build: "true"
runs-on: [self-hosted, modal]
timeout-minutes: 120
steps:

View File

@@ -211,7 +211,7 @@ jobs:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.6.0
pytorch: 2.5.1
num_gpus: 1
axolotl_extras: vllm
steps:
@@ -258,7 +258,7 @@ jobs:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.5.1
pytorch: 2.6.0
num_gpus: 1
axolotl_extras: vllm
steps:

View File

@@ -11,14 +11,13 @@ liger-kernel==0.5.5
packaging==23.2
peft==0.15.1
peft==0.15.0
transformers==4.51.0
tokenizers>=0.21.1
accelerate==1.6.0
datasets==3.5.0
deepspeed>=0.15.4
trl==0.16.1
hf_xet==1.0.0
optimum==1.16.2
hf_transfer

View File

@@ -13,7 +13,6 @@ from axolotl.monkeypatch.utils import get_unpad_data
SUPPORTED_MULTIPACK_MODEL_TYPES = [
"mllama_text_model",
"llama",
"llama4",
"mistral",
"mixtral",
"qwen2",

File diff suppressed because one or more lines are too long

View File

@@ -7,11 +7,9 @@ import os
from pathlib import Path
import pytest
import transformers
import yaml
from accelerate.test_utils import execute_subprocess_async
from huggingface_hub import snapshot_download
from packaging import version
from transformers.testing_utils import get_torch_dist_unique_port
from axolotl.utils.dict import DictDefault
@@ -30,10 +28,6 @@ def download_model():
snapshot_download("HuggingFaceTB/SmolLM2-135M")
def transformers_version_eq(required_version):
return version.parse(transformers.__version__) == version.parse(required_version)
class TestMultiGPULlama:
"""
Test case for Llama models using LoRA
@@ -62,7 +56,7 @@ class TestMultiGPULlama:
],
"num_epochs": 1,
"max_steps": 2,
"micro_batch_size": 1,
"micro_batch_size": 4,
"gradient_accumulation_steps": 4,
# "gradient_checkpointing": True,
"output_dir": temp_dir,
@@ -114,7 +108,7 @@ class TestMultiGPULlama:
"lora_alpha": 16,
"lora_dropout": 0.05,
"lora_target_linear": True,
"val_set_size": 0.05,
"val_set_size": 0.01,
"special_tokens": {
"pad_token": "<|endoftext|>",
},
@@ -122,7 +116,6 @@ class TestMultiGPULlama:
{
"path": "tatsu-lab/alpaca",
"type": "alpaca",
"split": "train[:20%]",
},
],
"num_epochs": 1,
@@ -200,7 +193,7 @@ class TestMultiGPULlama:
],
"num_epochs": 1,
"max_steps": 2,
"micro_batch_size": 2,
"micro_batch_size": 4,
"gradient_accumulation_steps": 4,
# "gradient_checkpointing": True,
"output_dir": temp_dir,
@@ -397,7 +390,7 @@ class TestMultiGPULlama:
"base_model": "HuggingFaceTB/SmolLM2-135M",
"sample_packing": True,
"pad_to_sequence_len": True,
"sequence_len": 1024,
"sequence_len": 2048,
"val_set_size": 0.01,
"special_tokens": {
"pad_token": "<|endoftext|>",
@@ -410,7 +403,7 @@ class TestMultiGPULlama:
],
"num_epochs": 1,
"max_steps": 2,
"micro_batch_size": 2,
"micro_batch_size": 4,
"gradient_accumulation_steps": 2,
# "gradient_checkpointing": True,
"output_dir": temp_dir,
@@ -558,7 +551,7 @@ class TestMultiGPULlama:
"sample_packing": True,
"eval_sample_packing": False,
"pad_to_sequence_len": True,
"sequence_len": 1024,
"sequence_len": 2048,
"val_set_size": 0.01,
"special_tokens": {
"pad_token": "<|endoftext|>",
@@ -572,7 +565,7 @@ class TestMultiGPULlama:
],
"num_epochs": 1,
"max_steps": 2,
"micro_batch_size": 2,
"micro_batch_size": 4,
"gradient_accumulation_steps": 2,
# "gradient_checkpointing": True,
"output_dir": temp_dir,
@@ -619,11 +612,8 @@ class TestMultiGPULlama:
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
)
# TODO: remove skip once deepspeed regression is fixed
# see https://github.com/huggingface/transformers/pull/37324
@pytest.mark.skipif(
transformers_version_eq("4.51.0"),
reason="zero3 is not supported with transformers==4.51.0",
@pytest.mark.skip(
reason="ds-zero3 broken in main until transformers#37281 resolved"
)
@pytest.mark.parametrize(
"gradient_accumulation_steps",
@@ -661,7 +651,7 @@ class TestMultiGPULlama:
"base_model": "HuggingFaceTB/SmolLM2-135M",
"sample_packing": True,
"pad_to_sequence_len": True,
"sequence_len": 1024,
"sequence_len": 2048,
"val_set_size": 0.01,
"special_tokens": {
"pad_token": "<|endoftext|>",
@@ -734,7 +724,7 @@ class TestMultiGPULlama:
"base_model": "HuggingFaceTB/SmolLM2-135M",
"sample_packing": True,
"pad_to_sequence_len": True,
"sequence_len": 1024,
"sequence_len": 2048,
"val_set_size": 0.01,
"special_tokens": {
"pad_token": "<|endoftext|>",
@@ -807,7 +797,7 @@ class TestMultiGPULlama:
"base_model": "HuggingFaceTB/SmolLM2-135M",
"sample_packing": True,
"pad_to_sequence_len": True,
"sequence_len": 1024,
"sequence_len": 2048,
"val_set_size": 0.01,
"special_tokens": {
"pad_token": "<|endoftext|>",
@@ -895,7 +885,7 @@ class TestMultiGPULlama:
"sample_packing": True,
"bf16": True,
"save_safetensors": True,
# "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
"deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
"use_tensorboard": True,
}
)

View File

@@ -31,7 +31,7 @@ class TestMultiGPURay:
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
"sequence_len": 1024,
"sequence_len": 2048,
"adapter": "lora",
"lora_r": 8,
"lora_alpha": 16,
@@ -94,8 +94,8 @@ class TestMultiGPURay:
"base_model": "HuggingFaceTB/SmolLM2-135M",
"sample_packing": True,
"pad_to_sequence_len": True,
"sequence_len": 1024,
"val_set_size": 0.01,
"sequence_len": 2048,
"val_set_size": 0.05,
"special_tokens": {
"pad_token": "<|endoftext|>",
},