Compare commits

..

13 Commits

Author SHA1 Message Date
Wing Lian
c7f1c191a3 additional validation for fsdp2, bump dep versions 2025-04-06 15:18:56 -04:00
Wing Lian
1a5d445413 make sure to patch all the loaded models 2025-04-06 14:45:30 -04:00
Wing Lian
7e410ab480 more fixes to flex for fsdp2 2025-04-06 14:24:50 -04:00
Wing Lian
b5a51c378b okay, actually use fsdp2... 2025-04-06 13:55:46 -04:00
Wing Lian
c902f4222d make sure both flex and flash attn work with fsdp2, skip fix untrained tokens 2025-04-06 12:30:14 -04:00
Wing Lian
9329db9c3a fix fsdp2 config for ci 2025-04-06 07:55:54 -04:00
Wing Lian
ad7293f617 skip zero3 tests for this PR for now 2025-04-06 07:49:38 -04:00
Wing Lian
475125e4ca use transformers commit with fsdp2 support 2025-04-06 07:49:06 -04:00
Wing Lian
2b5e546da0 add fsdp2 e2e tests 2025-04-06 07:49:06 -04:00
Wing Lian
252dc5c91b liger + torch compile fix 2025-04-06 07:49:06 -04:00
Wing Lian
af3f981f51 allow 8bit optims with fsdp2 2025-04-06 07:49:06 -04:00
Wing Lian
52b96031b4 use accelerate release 1.6.0 2025-04-06 07:49:05 -04:00
Wing Lian
03dcf1a5ea fsdp2 support 2025-04-06 07:49:05 -04:00
7 changed files with 26 additions and 39 deletions

View File

@@ -24,13 +24,6 @@ jobs:
fail-fast: false fail-fast: false
matrix: matrix:
include: include:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.6.0
axolotl_extras: vllm
num_gpus: 2
nightly_build: "true"
- cuda: 124 - cuda: 124
cuda_version: 12.4.1 cuda_version: 12.4.1
python_version: "3.11" python_version: "3.11"
@@ -45,6 +38,13 @@ jobs:
axolotl_extras: vllm axolotl_extras: vllm
num_gpus: 2 num_gpus: 2
nightly_build: "true" nightly_build: "true"
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.6.0
axolotl_extras: vllm
num_gpus: 2
nightly_build: "true"
runs-on: [self-hosted, modal] runs-on: [self-hosted, modal]
timeout-minutes: 120 timeout-minutes: 120
steps: steps:

View File

@@ -211,7 +211,7 @@ jobs:
- cuda: 124 - cuda: 124
cuda_version: 12.4.1 cuda_version: 12.4.1
python_version: "3.11" python_version: "3.11"
pytorch: 2.6.0 pytorch: 2.5.1
num_gpus: 1 num_gpus: 1
axolotl_extras: vllm axolotl_extras: vllm
steps: steps:
@@ -258,7 +258,7 @@ jobs:
- cuda: 124 - cuda: 124
cuda_version: 12.4.1 cuda_version: 12.4.1
python_version: "3.11" python_version: "3.11"
pytorch: 2.5.1 pytorch: 2.6.0
num_gpus: 1 num_gpus: 1
axolotl_extras: vllm axolotl_extras: vllm
steps: steps:

View File

@@ -11,14 +11,13 @@ liger-kernel==0.5.5
packaging==23.2 packaging==23.2
peft==0.15.1 peft==0.15.0
transformers==4.51.0 transformers==4.51.0
tokenizers>=0.21.1 tokenizers>=0.21.1
accelerate==1.6.0 accelerate==1.6.0
datasets==3.5.0 datasets==3.5.0
deepspeed>=0.15.4 deepspeed>=0.15.4
trl==0.16.1 trl==0.16.1
hf_xet==1.0.0
optimum==1.16.2 optimum==1.16.2
hf_transfer hf_transfer

View File

@@ -13,7 +13,6 @@ from axolotl.monkeypatch.utils import get_unpad_data
SUPPORTED_MULTIPACK_MODEL_TYPES = [ SUPPORTED_MULTIPACK_MODEL_TYPES = [
"mllama_text_model", "mllama_text_model",
"llama", "llama",
"llama4",
"mistral", "mistral",
"mixtral", "mixtral",
"qwen2", "qwen2",

File diff suppressed because one or more lines are too long

View File

@@ -7,11 +7,9 @@ import os
from pathlib import Path from pathlib import Path
import pytest import pytest
import transformers
import yaml import yaml
from accelerate.test_utils import execute_subprocess_async from accelerate.test_utils import execute_subprocess_async
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from packaging import version
from transformers.testing_utils import get_torch_dist_unique_port from transformers.testing_utils import get_torch_dist_unique_port
from axolotl.utils.dict import DictDefault from axolotl.utils.dict import DictDefault
@@ -30,10 +28,6 @@ def download_model():
snapshot_download("HuggingFaceTB/SmolLM2-135M") snapshot_download("HuggingFaceTB/SmolLM2-135M")
def transformers_version_eq(required_version):
return version.parse(transformers.__version__) == version.parse(required_version)
class TestMultiGPULlama: class TestMultiGPULlama:
""" """
Test case for Llama models using LoRA Test case for Llama models using LoRA
@@ -62,7 +56,7 @@ class TestMultiGPULlama:
], ],
"num_epochs": 1, "num_epochs": 1,
"max_steps": 2, "max_steps": 2,
"micro_batch_size": 1, "micro_batch_size": 4,
"gradient_accumulation_steps": 4, "gradient_accumulation_steps": 4,
# "gradient_checkpointing": True, # "gradient_checkpointing": True,
"output_dir": temp_dir, "output_dir": temp_dir,
@@ -114,7 +108,7 @@ class TestMultiGPULlama:
"lora_alpha": 16, "lora_alpha": 16,
"lora_dropout": 0.05, "lora_dropout": 0.05,
"lora_target_linear": True, "lora_target_linear": True,
"val_set_size": 0.05, "val_set_size": 0.01,
"special_tokens": { "special_tokens": {
"pad_token": "<|endoftext|>", "pad_token": "<|endoftext|>",
}, },
@@ -122,7 +116,6 @@ class TestMultiGPULlama:
{ {
"path": "tatsu-lab/alpaca", "path": "tatsu-lab/alpaca",
"type": "alpaca", "type": "alpaca",
"split": "train[:20%]",
}, },
], ],
"num_epochs": 1, "num_epochs": 1,
@@ -200,7 +193,7 @@ class TestMultiGPULlama:
], ],
"num_epochs": 1, "num_epochs": 1,
"max_steps": 2, "max_steps": 2,
"micro_batch_size": 2, "micro_batch_size": 4,
"gradient_accumulation_steps": 4, "gradient_accumulation_steps": 4,
# "gradient_checkpointing": True, # "gradient_checkpointing": True,
"output_dir": temp_dir, "output_dir": temp_dir,
@@ -397,7 +390,7 @@ class TestMultiGPULlama:
"base_model": "HuggingFaceTB/SmolLM2-135M", "base_model": "HuggingFaceTB/SmolLM2-135M",
"sample_packing": True, "sample_packing": True,
"pad_to_sequence_len": True, "pad_to_sequence_len": True,
"sequence_len": 1024, "sequence_len": 2048,
"val_set_size": 0.01, "val_set_size": 0.01,
"special_tokens": { "special_tokens": {
"pad_token": "<|endoftext|>", "pad_token": "<|endoftext|>",
@@ -410,7 +403,7 @@ class TestMultiGPULlama:
], ],
"num_epochs": 1, "num_epochs": 1,
"max_steps": 2, "max_steps": 2,
"micro_batch_size": 2, "micro_batch_size": 4,
"gradient_accumulation_steps": 2, "gradient_accumulation_steps": 2,
# "gradient_checkpointing": True, # "gradient_checkpointing": True,
"output_dir": temp_dir, "output_dir": temp_dir,
@@ -558,7 +551,7 @@ class TestMultiGPULlama:
"sample_packing": True, "sample_packing": True,
"eval_sample_packing": False, "eval_sample_packing": False,
"pad_to_sequence_len": True, "pad_to_sequence_len": True,
"sequence_len": 1024, "sequence_len": 2048,
"val_set_size": 0.01, "val_set_size": 0.01,
"special_tokens": { "special_tokens": {
"pad_token": "<|endoftext|>", "pad_token": "<|endoftext|>",
@@ -572,7 +565,7 @@ class TestMultiGPULlama:
], ],
"num_epochs": 1, "num_epochs": 1,
"max_steps": 2, "max_steps": 2,
"micro_batch_size": 2, "micro_batch_size": 4,
"gradient_accumulation_steps": 2, "gradient_accumulation_steps": 2,
# "gradient_checkpointing": True, # "gradient_checkpointing": True,
"output_dir": temp_dir, "output_dir": temp_dir,
@@ -619,11 +612,8 @@ class TestMultiGPULlama:
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
) )
# TODO: remove skip once deepspeed regression is fixed @pytest.mark.skip(
# see https://github.com/huggingface/transformers/pull/37324 reason="ds-zero3 broken in main until transformers#37281 resolved"
@pytest.mark.skipif(
transformers_version_eq("4.51.0"),
reason="zero3 is not supported with transformers==4.51.0",
) )
@pytest.mark.parametrize( @pytest.mark.parametrize(
"gradient_accumulation_steps", "gradient_accumulation_steps",
@@ -661,7 +651,7 @@ class TestMultiGPULlama:
"base_model": "HuggingFaceTB/SmolLM2-135M", "base_model": "HuggingFaceTB/SmolLM2-135M",
"sample_packing": True, "sample_packing": True,
"pad_to_sequence_len": True, "pad_to_sequence_len": True,
"sequence_len": 1024, "sequence_len": 2048,
"val_set_size": 0.01, "val_set_size": 0.01,
"special_tokens": { "special_tokens": {
"pad_token": "<|endoftext|>", "pad_token": "<|endoftext|>",
@@ -734,7 +724,7 @@ class TestMultiGPULlama:
"base_model": "HuggingFaceTB/SmolLM2-135M", "base_model": "HuggingFaceTB/SmolLM2-135M",
"sample_packing": True, "sample_packing": True,
"pad_to_sequence_len": True, "pad_to_sequence_len": True,
"sequence_len": 1024, "sequence_len": 2048,
"val_set_size": 0.01, "val_set_size": 0.01,
"special_tokens": { "special_tokens": {
"pad_token": "<|endoftext|>", "pad_token": "<|endoftext|>",
@@ -807,7 +797,7 @@ class TestMultiGPULlama:
"base_model": "HuggingFaceTB/SmolLM2-135M", "base_model": "HuggingFaceTB/SmolLM2-135M",
"sample_packing": True, "sample_packing": True,
"pad_to_sequence_len": True, "pad_to_sequence_len": True,
"sequence_len": 1024, "sequence_len": 2048,
"val_set_size": 0.01, "val_set_size": 0.01,
"special_tokens": { "special_tokens": {
"pad_token": "<|endoftext|>", "pad_token": "<|endoftext|>",
@@ -895,7 +885,7 @@ class TestMultiGPULlama:
"sample_packing": True, "sample_packing": True,
"bf16": True, "bf16": True,
"save_safetensors": True, "save_safetensors": True,
# "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"), "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
"use_tensorboard": True, "use_tensorboard": True,
} }
) )

View File

@@ -31,7 +31,7 @@ class TestMultiGPURay:
cfg = DictDefault( cfg = DictDefault(
{ {
"base_model": "HuggingFaceTB/SmolLM2-135M", "base_model": "HuggingFaceTB/SmolLM2-135M",
"sequence_len": 1024, "sequence_len": 2048,
"adapter": "lora", "adapter": "lora",
"lora_r": 8, "lora_r": 8,
"lora_alpha": 16, "lora_alpha": 16,
@@ -94,8 +94,8 @@ class TestMultiGPURay:
"base_model": "HuggingFaceTB/SmolLM2-135M", "base_model": "HuggingFaceTB/SmolLM2-135M",
"sample_packing": True, "sample_packing": True,
"pad_to_sequence_len": True, "pad_to_sequence_len": True,
"sequence_len": 1024, "sequence_len": 2048,
"val_set_size": 0.01, "val_set_size": 0.05,
"special_tokens": { "special_tokens": {
"pad_token": "<|endoftext|>", "pad_token": "<|endoftext|>",
}, },