llama4 support (#2493)

* llama4 support

* add xet support [skip ci]

* be flexible on the transformers version and skip tests based on the installed version

* don't use deepspeed for the fix_untrained_tokens test

* reordering to trigger torch 2.6.0 tests first

* slightly smaller train set

* use 4.51.0 for now

* remove stray print, add llama4 chat template to schema, bump peft to 0.15.1

* patches to make llama4 performant

* add preliminary fp8 support
commit 8bbad21bfd (parent 5f4af3665d)
Author: Wing Lian
Date: 2025-04-07 10:49:15 -04:00
Committed by: GitHub

17 changed files with 409 additions and 34 deletions
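The recurring theme in this diff is replacing a blanket skip with a version-gated skip, so the suite stays flexible across transformers releases instead of pinning the dependency. Below is a minimal standalone sketch of that pattern; the helper and decorator mirror the hunks that follow, while the test name and body are hypothetical placeholders:

import pytest
import transformers
from packaging import version


def transformers_version_eq(required_version):
    # True only when the installed transformers release matches exactly
    return version.parse(transformers.__version__) == version.parse(required_version)


# skip only on the known-broken release rather than pinning transformers
@pytest.mark.skipif(
    transformers_version_eq("4.51.0"),
    reason="zero3 is not supported with transformers==4.51.0",
)
def test_ds_zero3():  # hypothetical placeholder test
    ...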


@@ -7,9 +7,11 @@ import os
 from pathlib import Path

 import pytest
+import transformers
 import yaml
 from accelerate.test_utils import execute_subprocess_async
 from huggingface_hub import snapshot_download
+from packaging import version
 from transformers.testing_utils import get_torch_dist_unique_port

 from axolotl.utils.dict import DictDefault
@@ -28,6 +30,10 @@ def download_model():
     snapshot_download("HuggingFaceTB/SmolLM2-135M")


+def transformers_version_eq(required_version):
+    return version.parse(transformers.__version__) == version.parse(required_version)
+
+
 class TestMultiGPULlama:
     """
     Test case for Llama models using LoRA
@@ -56,7 +62,7 @@ class TestMultiGPULlama:
                 ],
                 "num_epochs": 1,
                 "max_steps": 2,
-                "micro_batch_size": 4,
+                "micro_batch_size": 1,
                 "gradient_accumulation_steps": 4,
                 # "gradient_checkpointing": True,
                 "output_dir": temp_dir,
@@ -108,7 +114,7 @@ class TestMultiGPULlama:
                 "lora_alpha": 16,
                 "lora_dropout": 0.05,
                 "lora_target_linear": True,
-                "val_set_size": 0.01,
+                "val_set_size": 0.05,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
@@ -116,6 +122,7 @@ class TestMultiGPULlama:
                     {
                         "path": "tatsu-lab/alpaca",
                         "type": "alpaca",
+                        "split": "train[:20%]",
                     },
                 ],
                 "num_epochs": 1,
@@ -193,7 +200,7 @@ class TestMultiGPULlama:
                 ],
                 "num_epochs": 1,
                 "max_steps": 2,
-                "micro_batch_size": 4,
+                "micro_batch_size": 2,
                 "gradient_accumulation_steps": 4,
                 # "gradient_checkpointing": True,
                 "output_dir": temp_dir,
@@ -390,7 +397,7 @@ class TestMultiGPULlama:
                 "base_model": "HuggingFaceTB/SmolLM2-135M",
                 "sample_packing": True,
                 "pad_to_sequence_len": True,
-                "sequence_len": 2048,
+                "sequence_len": 1024,
                 "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
@@ -403,7 +410,7 @@ class TestMultiGPULlama:
                 ],
                 "num_epochs": 1,
                 "max_steps": 2,
-                "micro_batch_size": 4,
+                "micro_batch_size": 2,
                 "gradient_accumulation_steps": 2,
                 # "gradient_checkpointing": True,
                 "output_dir": temp_dir,
@@ -493,9 +500,7 @@ class TestMultiGPULlama:
                 ],
                 "fsdp_config": {
                     "fsdp_version": 2,
-                    "fsdp_forward_prefetch": True,
-                    "fsdp_sync_module_states": True,
-                    "fsdp_use_orig_params": True,
+                    # "fsdp_forward_prefetch": True,  # not yet implemented in accelerate
                     "fsdp_offload_params": False,
                     "fsdp_cpu_ram_efficient_loading": False,
                     "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
@@ -551,7 +556,7 @@ class TestMultiGPULlama:
                 "sample_packing": True,
                 "eval_sample_packing": False,
                 "pad_to_sequence_len": True,
-                "sequence_len": 2048,
+                "sequence_len": 1024,
                 "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
@@ -565,7 +570,7 @@ class TestMultiGPULlama:
                 ],
                 "num_epochs": 1,
                 "max_steps": 2,
-                "micro_batch_size": 4,
+                "micro_batch_size": 2,
                 "gradient_accumulation_steps": 2,
                 # "gradient_checkpointing": True,
                 "output_dir": temp_dir,
@@ -612,8 +617,11 @@ class TestMultiGPULlama:
             temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
         )

-    @pytest.mark.skip(
-        reason="ds-zero3 broken in main until transformers#37281 resolved"
+    # TODO: remove skip once deepspeed regression is fixed
+    # see https://github.com/huggingface/transformers/pull/37324
+    @pytest.mark.skipif(
+        transformers_version_eq("4.51.0"),
+        reason="zero3 is not supported with transformers==4.51.0",
     )
     @pytest.mark.parametrize(
         "gradient_accumulation_steps",
@@ -651,7 +659,7 @@ class TestMultiGPULlama:
                 "base_model": "HuggingFaceTB/SmolLM2-135M",
                 "sample_packing": True,
                 "pad_to_sequence_len": True,
-                "sequence_len": 2048,
+                "sequence_len": 1024,
                 "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
@@ -724,7 +732,7 @@ class TestMultiGPULlama:
                 "base_model": "HuggingFaceTB/SmolLM2-135M",
                 "sample_packing": True,
                 "pad_to_sequence_len": True,
-                "sequence_len": 2048,
+                "sequence_len": 1024,
                 "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
@@ -797,7 +805,7 @@ class TestMultiGPULlama:
                 "base_model": "HuggingFaceTB/SmolLM2-135M",
                 "sample_packing": True,
                 "pad_to_sequence_len": True,
-                "sequence_len": 2048,
+                "sequence_len": 1024,
                 "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
@@ -885,7 +893,7 @@ class TestMultiGPULlama:
                 "sample_packing": True,
                 "bf16": True,
                 "save_safetensors": True,
-                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
+                # "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
                 "use_tensorboard": True,
             }
         )


@@ -31,7 +31,7 @@ class TestMultiGPURay:
         cfg = DictDefault(
             {
                 "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sequence_len": 2048,
+                "sequence_len": 1024,
                 "adapter": "lora",
                 "lora_r": 8,
                 "lora_alpha": 16,
@@ -94,8 +94,8 @@ class TestMultiGPURay:
                 "base_model": "HuggingFaceTB/SmolLM2-135M",
                 "sample_packing": True,
                 "pad_to_sequence_len": True,
-                "sequence_len": 2048,
-                "val_set_size": 0.05,
+                "sequence_len": 1024,
+                "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
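For the headline change itself: with the llama4 chat template added to the schema (per the commit message), a config in the style of these tests could reference it as sketched below. This is a hypothetical illustration, not part of the diff; the base_model id and the remaining values are assumptions:

from axolotl.utils.dict import DictDefault

cfg = DictDefault(
    {
        "base_model": "meta-llama/Llama-4-Scout-17B-16E",  # assumed model id, not from this diff
        "chat_template": "llama4",  # template name added to the schema by this commit
        "sequence_len": 1024,
        "micro_batch_size": 1,
        "gradient_accumulation_steps": 4,
        "num_epochs": 1,
        "output_dir": "./outputs/llama4",
    }
)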