Compare commits

...

8 Commits

Author  SHA1  Message  Date
Wing Lian  320553850a  update peft to 0.15.1  2025-04-06 19:55:07 -04:00
Wing Lian  b38f70e068  use 4.51.0 for now  2025-04-06 18:14:14 -04:00
Wing Lian  cf4c84e21d  slightly smaller train set  2025-04-06 17:11:52 -04:00
Wing Lian  98d98ea1dd  reordering to trigger torch 2.6.0 tests first  2025-04-06 17:11:52 -04:00
Wing Lian  0cf42ab8a3  don't use deepspeed for the fix_untrained_tokens test  2025-04-06 17:11:52 -04:00
Wing Lian  3d0ab75a0c  be flexible on transformers version and skip test on version  2025-04-06 17:11:50 -04:00
Wing Lian  d375be90ff  add xet support [skip ci]  2025-04-06 17:09:23 -04:00
Wing Lian  98827e8f3b  llama4 support  2025-04-06 17:08:57 -04:00
7 changed files with 39 additions and 26 deletions

View File

@@ -24,6 +24,13 @@ jobs:
       fail-fast: false
       matrix:
         include:
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.6.0
+            axolotl_extras: vllm
+            num_gpus: 2
+            nightly_build: "true"
           - cuda: 124
             cuda_version: 12.4.1
             python_version: "3.11"
@@ -38,13 +45,6 @@ jobs:
             axolotl_extras: vllm
             num_gpus: 2
             nightly_build: "true"
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.6.0
-            axolotl_extras: vllm
-            num_gpus: 2
-            nightly_build: "true"
     runs-on: [self-hosted, modal]
     timeout-minutes: 120
     steps:

View File

@@ -211,7 +211,7 @@ jobs:
           - cuda: 124
             cuda_version: 12.4.1
             python_version: "3.11"
-            pytorch: 2.5.1
+            pytorch: 2.6.0
             num_gpus: 1
             axolotl_extras: vllm
     steps:
@@ -258,7 +258,7 @@ jobs:
           - cuda: 124
             cuda_version: 12.4.1
             python_version: "3.11"
-            pytorch: 2.6.0
+            pytorch: 2.5.1
             num_gpus: 1
             axolotl_extras: vllm
     steps:

View File

@@ -11,13 +11,14 @@ liger-kernel==0.5.5
 packaging==23.2
-peft==0.15.0
+peft==0.15.1
 transformers==4.51.0
 tokenizers>=0.21.1
 accelerate==1.6.0
 datasets==3.5.0
 deepspeed>=0.15.4
 trl==0.16.1
+hf_xet==1.0.0
 optimum==1.16.2
 hf_transfer
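
Note on the xet commit above: hf_xet is an optional dependency; when it is installed, recent huggingface_hub releases route snapshot_download / hf_hub_download through the Xet storage backend automatically, so no axolotl code change is needed. A minimal sketch to verify the backend is importable (the helper name is illustrative, not a library API):

import importlib.util

def xet_backend_available() -> bool:
    # hf_xet is the pip package pinned above; huggingface_hub only needs
    # the module to be importable to pick up the Xet download backend.
    return importlib.util.find_spec("hf_xet") is not None

if __name__ == "__main__":
    print("Xet download backend available:", xet_backend_available())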

View File

@@ -13,6 +13,7 @@ from axolotl.monkeypatch.utils import get_unpad_data
 SUPPORTED_MULTIPACK_MODEL_TYPES = [
     "mllama_text_model",
     "llama",
+    "llama4",
     "mistral",
     "mixtral",
     "qwen2",

File diff suppressed because one or more lines are too long

View File

@@ -7,9 +7,11 @@ import os
 from pathlib import Path
 
 import pytest
+import transformers
 import yaml
 from accelerate.test_utils import execute_subprocess_async
 from huggingface_hub import snapshot_download
+from packaging import version
 from transformers.testing_utils import get_torch_dist_unique_port
 
 from axolotl.utils.dict import DictDefault
@@ -28,6 +30,10 @@ def download_model():
     snapshot_download("HuggingFaceTB/SmolLM2-135M")
 
 
+def transformers_version_eq(required_version):
+    return version.parse(transformers.__version__) == version.parse(required_version)
+
+
 class TestMultiGPULlama:
     """
     Test case for Llama models using LoRA
@@ -56,7 +62,7 @@ class TestMultiGPULlama:
                 ],
                 "num_epochs": 1,
                 "max_steps": 2,
-                "micro_batch_size": 4,
+                "micro_batch_size": 1,
                 "gradient_accumulation_steps": 4,
                 # "gradient_checkpointing": True,
                 "output_dir": temp_dir,
@@ -108,7 +114,7 @@ class TestMultiGPULlama:
                 "lora_alpha": 16,
                 "lora_dropout": 0.05,
                 "lora_target_linear": True,
-                "val_set_size": 0.01,
+                "val_set_size": 0.05,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },
@@ -116,6 +122,7 @@ class TestMultiGPULlama:
                     {
                         "path": "tatsu-lab/alpaca",
                         "type": "alpaca",
+                        "split": "train[:20%]",
                    },
                 ],
                 "num_epochs": 1,
@@ -193,7 +200,7 @@ class TestMultiGPULlama:
                 ],
                 "num_epochs": 1,
                 "max_steps": 2,
-                "micro_batch_size": 4,
+                "micro_batch_size": 2,
                 "gradient_accumulation_steps": 4,
                 # "gradient_checkpointing": True,
                 "output_dir": temp_dir,
@@ -390,7 +397,7 @@ class TestMultiGPULlama:
                 "base_model": "HuggingFaceTB/SmolLM2-135M",
                 "sample_packing": True,
                 "pad_to_sequence_len": True,
-                "sequence_len": 2048,
+                "sequence_len": 1024,
                 "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
@@ -403,7 +410,7 @@ class TestMultiGPULlama:
                 ],
                 "num_epochs": 1,
                 "max_steps": 2,
-                "micro_batch_size": 4,
+                "micro_batch_size": 2,
                 "gradient_accumulation_steps": 2,
                 # "gradient_checkpointing": True,
                 "output_dir": temp_dir,
@@ -551,7 +558,7 @@ class TestMultiGPULlama:
                 "sample_packing": True,
                 "eval_sample_packing": False,
                 "pad_to_sequence_len": True,
-                "sequence_len": 2048,
+                "sequence_len": 1024,
                 "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
@@ -565,7 +572,7 @@ class TestMultiGPULlama:
                 ],
                 "num_epochs": 1,
                 "max_steps": 2,
-                "micro_batch_size": 4,
+                "micro_batch_size": 2,
                 "gradient_accumulation_steps": 2,
                 # "gradient_checkpointing": True,
                 "output_dir": temp_dir,
@@ -612,8 +619,11 @@ class TestMultiGPULlama:
             temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
         )
 
-    @pytest.mark.skip(
-        reason="ds-zero3 broken in main until transformers#37281 resolved"
+    # TODO: remove skip once deepspeed regression is fixed
+    # see https://github.com/huggingface/transformers/pull/37324
+    @pytest.mark.skipif(
+        transformers_version_eq("4.51.0"),
+        reason="zero3 is not supported with transformers==4.51.0",
     )
     @pytest.mark.parametrize(
         "gradient_accumulation_steps",
@@ -651,7 +661,7 @@ class TestMultiGPULlama:
                 "base_model": "HuggingFaceTB/SmolLM2-135M",
                 "sample_packing": True,
                 "pad_to_sequence_len": True,
-                "sequence_len": 2048,
+                "sequence_len": 1024,
                 "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
@@ -724,7 +734,7 @@ class TestMultiGPULlama:
                 "base_model": "HuggingFaceTB/SmolLM2-135M",
                 "sample_packing": True,
                 "pad_to_sequence_len": True,
-                "sequence_len": 2048,
+                "sequence_len": 1024,
                 "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
@@ -797,7 +807,7 @@ class TestMultiGPULlama:
                 "base_model": "HuggingFaceTB/SmolLM2-135M",
                 "sample_packing": True,
                 "pad_to_sequence_len": True,
-                "sequence_len": 2048,
+                "sequence_len": 1024,
                 "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
@@ -885,7 +895,7 @@ class TestMultiGPULlama:
                 "sample_packing": True,
                 "bf16": True,
                 "save_safetensors": True,
-                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
+                # "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
                 "use_tensorboard": True,
             }
         )
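
The marker change above swaps a blanket @pytest.mark.skip for a version-gated @pytest.mark.skipif tied to the exact transformers release that breaks zero3. A self-contained sketch of the same pattern, assuming only pytest, transformers, and packaging are installed (the test body is a placeholder, not the real deepspeed test):

import pytest
import transformers
from packaging import version

def transformers_version_eq(required_version):
    # Exact-match gate: skip only on the one release known to break zero3.
    return version.parse(transformers.__version__) == version.parse(required_version)

@pytest.mark.skipif(
    transformers_version_eq("4.51.0"),
    reason="zero3 is not supported with transformers==4.51.0",
)
def test_zero3_placeholder():
    assert True  # stands in for the real multi-GPU zero3 test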

View File

@@ -31,7 +31,7 @@ class TestMultiGPURay:
         cfg = DictDefault(
             {
                 "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sequence_len": 2048,
+                "sequence_len": 1024,
                 "adapter": "lora",
                 "lora_r": 8,
                 "lora_alpha": 16,
@@ -94,8 +94,8 @@
                 "base_model": "HuggingFaceTB/SmolLM2-135M",
                 "sample_packing": True,
                 "pad_to_sequence_len": True,
-                "sequence_len": 2048,
-                "val_set_size": 0.05,
+                "sequence_len": 1024,
+                "val_set_size": 0.01,
                 "special_tokens": {
                     "pad_token": "<|endoftext|>",
                 },