diff --git a/cicd/multigpu.sh b/cicd/multigpu.sh index 84dfc6f71..008a74bff 100755 --- a/cicd/multigpu.sh +++ b/cicd/multigpu.sh @@ -2,5 +2,5 @@ set -e # only run one test at a time so as not to OOM the GPU -pytest -v -n2 /workspace/axolotl/tests/e2e/multigpu/ --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ -pytest -v -n1 /workspace/axolotl/tests/e2e/multigpu/solo/ +pytest -v --durations=10 -n2 /workspace/axolotl/tests/e2e/multigpu/ --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ +pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/solo/ diff --git a/requirements.txt b/requirements.txt index dde64f392..78ced5728 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,7 +22,7 @@ trl==0.16.0 optimum==1.16.2 hf_transfer sentencepiece -gradio==3.50.2 +gradio==5.23.3 modal==0.70.5 pydantic==2.10.6 @@ -59,8 +59,8 @@ langdetect==1.0.9 immutabledict==4.2.0 antlr4-python3-runtime==4.13.2 -torchao==0.7.0 -schedulefree==1.3.0 +torchao==0.9.0 +schedulefree==1.4.1 axolotl-contribs-lgpl==0.0.6 axolotl-contribs-mit==0.0.3 diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py index 0e228aef0..ee1869f7d 100644 --- a/tests/e2e/multigpu/test_llama.py +++ b/tests/e2e/multigpu/test_llama.py @@ -44,7 +44,7 @@ class TestMultiGPULlama: "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, - "val_set_size": 0.05, + "val_set_size": 0.01, "special_tokens": { "pad_token": "<|endoftext|>", }, @@ -58,7 +58,7 @@ class TestMultiGPULlama: "max_steps": 2, "micro_batch_size": 4, "gradient_accumulation_steps": 4, - "gradient_checkpointing": True, + # "gradient_checkpointing": True, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_8bit", @@ -108,7 +108,7 @@ class TestMultiGPULlama: "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, - "val_set_size": 0.05, + "val_set_size": 0.01, "special_tokens": { "pad_token": "<|endoftext|>", }, @@ -122,7 +122,7 @@ class TestMultiGPULlama: "max_steps": 2, "micro_batch_size": 1, "gradient_accumulation_steps": gradient_accumulation_steps, - "gradient_checkpointing": True, + # "gradient_checkpointing": True, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_8bit", @@ -169,7 +169,7 @@ class TestMultiGPULlama: "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, - "val_set_size": 0.05, + "val_set_size": 0.01, "special_tokens": { "pad_token": "<|endoftext|>", }, @@ -195,7 +195,7 @@ class TestMultiGPULlama: "max_steps": 2, "micro_batch_size": 4, "gradient_accumulation_steps": 4, - "gradient_checkpointing": True, + # "gradient_checkpointing": True, "output_dir": temp_dir, "warmup_steps": 0, "learning_rate": 0.00001, @@ -247,7 +247,7 @@ class TestMultiGPULlama: "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, - "val_set_size": 0.05, + "val_set_size": 0.01, "special_tokens": { "pad_token": "<|endoftext|>", }, @@ -273,7 +273,7 @@ class TestMultiGPULlama: "max_steps": 2, "micro_batch_size": 2, "gradient_accumulation_steps": 4, - "gradient_checkpointing": True, + # "gradient_checkpointing": True, "output_dir": temp_dir, "warmup_steps": 0, "learning_rate": 0.00001, @@ -334,7 +334,7 @@ class TestMultiGPULlama: "max_steps": 2, "micro_batch_size": 2, "gradient_accumulation_steps": gradient_accumulation_steps, - "gradient_checkpointing": True, + # "gradient_checkpointing": True, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", @@ -391,7 +391,7 @@ class TestMultiGPULlama: "sample_packing": True, "pad_to_sequence_len": True, "sequence_len": 2048, - "val_set_size": 0.05, + "val_set_size": 0.01, "special_tokens": { "pad_token": "<|endoftext|>", }, @@ -405,7 +405,7 @@ class TestMultiGPULlama: "max_steps": 2, "micro_batch_size": 4, "gradient_accumulation_steps": 2, - "gradient_checkpointing": True, + # "gradient_checkpointing": True, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", @@ -470,7 +470,7 @@ class TestMultiGPULlama: "eval_sample_packing": False, "pad_to_sequence_len": True, "sequence_len": 2048, - "val_set_size": 0.05, + "val_set_size": 0.01, "special_tokens": { "pad_token": "<|endoftext|>", }, @@ -485,7 +485,7 @@ class TestMultiGPULlama: "max_steps": 2, "micro_batch_size": 4, "gradient_accumulation_steps": 2, - "gradient_checkpointing": True, + # "gradient_checkpointing": True, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", @@ -567,7 +567,7 @@ class TestMultiGPULlama: "sample_packing": True, "pad_to_sequence_len": True, "sequence_len": 2048, - "val_set_size": 0.05, + "val_set_size": 0.01, "special_tokens": { "pad_token": "<|endoftext|>", }, @@ -640,7 +640,7 @@ class TestMultiGPULlama: "sample_packing": True, "pad_to_sequence_len": True, "sequence_len": 2048, - "val_set_size": 0.05, + "val_set_size": 0.01, "special_tokens": { "pad_token": "<|endoftext|>", }, @@ -713,7 +713,7 @@ class TestMultiGPULlama: "sample_packing": True, "pad_to_sequence_len": True, "sequence_len": 2048, - "val_set_size": 0.05, + "val_set_size": 0.01, "special_tokens": { "pad_token": "<|endoftext|>", }, @@ -788,7 +788,7 @@ class TestMultiGPULlama: "max_steps": 2, "micro_batch_size": 1, "gradient_accumulation_steps": 1, - "gradient_checkpointing": True, + # "gradient_checkpointing": True, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", diff --git a/tests/e2e/multigpu/test_qwen2.py b/tests/e2e/multigpu/test_qwen2.py index af39c6361..9599c3abf 100644 --- a/tests/e2e/multigpu/test_qwen2.py +++ b/tests/e2e/multigpu/test_qwen2.py @@ -37,7 +37,7 @@ class TestMultiGPUQwen2: "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_linear": True, - "val_set_size": 0.05, + "val_set_size": 0.01, "datasets": [ { "path": "Intel/orca_dpo_pairs", @@ -57,7 +57,7 @@ class TestMultiGPUQwen2: "flash_attention": True, "bf16": "auto", "tf32": True, - "gradient_checkpointing": True, + # "gradient_checkpointing": True, "gradient_checkpointing_kwargs": { "use_reentrant": False, },