Update dependencies and show slow tests in CI (#2492)

* use latest torchao, gradio, schedule-free
* get info on slow tests
* speed up tests by avoiding gradient checkpointing and reducing eval size
2025-04-05 17:41:31 -04:00
parent 949471039f
commit e7e0cd97ce
4 changed files with 24 additions and 24 deletions
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -44,7 +44,7 @@ class TestMultiGPULlama:
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
@@ -58,7 +58,7 @@ class TestMultiGPULlama:
                "max_steps": 2,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 4,
-                "gradient_checkpointing": True,
+                # "gradient_checkpointing": True,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
@@ -108,7 +108,7 @@ class TestMultiGPULlama:
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
@@ -122,7 +122,7 @@ class TestMultiGPULlama:
                "max_steps": 2,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": gradient_accumulation_steps,
-                "gradient_checkpointing": True,
+                # "gradient_checkpointing": True,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
@@ -169,7 +169,7 @@ class TestMultiGPULlama:
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
@@ -195,7 +195,7 @@ class TestMultiGPULlama:
                "max_steps": 2,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 4,
-                "gradient_checkpointing": True,
+                # "gradient_checkpointing": True,
                "output_dir": temp_dir,
                "warmup_steps": 0,
                "learning_rate": 0.00001,
@@ -247,7 +247,7 @@ class TestMultiGPULlama:
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
@@ -273,7 +273,7 @@ class TestMultiGPULlama:
                "max_steps": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 4,
-                "gradient_checkpointing": True,
+                # "gradient_checkpointing": True,
                "output_dir": temp_dir,
                "warmup_steps": 0,
                "learning_rate": 0.00001,
@@ -334,7 +334,7 @@ class TestMultiGPULlama:
                "max_steps": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": gradient_accumulation_steps,
-                "gradient_checkpointing": True,
+                # "gradient_checkpointing": True,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
@@ -391,7 +391,7 @@ class TestMultiGPULlama:
                "sample_packing": True,
                "pad_to_sequence_len": True,
                "sequence_len": 2048,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
@@ -405,7 +405,7 @@ class TestMultiGPULlama:
                "max_steps": 2,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 2,
-                "gradient_checkpointing": True,
+                # "gradient_checkpointing": True,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
@@ -470,7 +470,7 @@ class TestMultiGPULlama:
                "eval_sample_packing": False,
                "pad_to_sequence_len": True,
                "sequence_len": 2048,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
@@ -485,7 +485,7 @@ class TestMultiGPULlama:
                "max_steps": 2,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 2,
-                "gradient_checkpointing": True,
+                # "gradient_checkpointing": True,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
@@ -567,7 +567,7 @@ class TestMultiGPULlama:
                "sample_packing": True,
                "pad_to_sequence_len": True,
                "sequence_len": 2048,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
@@ -640,7 +640,7 @@ class TestMultiGPULlama:
                "sample_packing": True,
                "pad_to_sequence_len": True,
                "sequence_len": 2048,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
@@ -713,7 +713,7 @@ class TestMultiGPULlama:
                "sample_packing": True,
                "pad_to_sequence_len": True,
                "sequence_len": 2048,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
@@ -788,7 +788,7 @@ class TestMultiGPULlama:
                "max_steps": 2,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
-                "gradient_checkpointing": True,
+                # "gradient_checkpointing": True,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
--- a/tests/e2e/multigpu/test_qwen2.py
+++ b/tests/e2e/multigpu/test_qwen2.py
@@ -37,7 +37,7 @@ class TestMultiGPUQwen2:
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
-                "val_set_size": 0.05,
+                "val_set_size": 0.01,
                "datasets": [
                    {
                        "path": "Intel/orca_dpo_pairs",
@@ -57,7 +57,7 @@ class TestMultiGPUQwen2:
                "flash_attention": True,
                "bf16": "auto",
                "tf32": True,
-                "gradient_checkpointing": True,
+                # "gradient_checkpointing": True,
                "gradient_checkpointing_kwargs": {
                    "use_reentrant": False,
                },