diff --git a/requirements.txt b/requirements.txt index 6e0d98c5e..1fc3a9ff7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ # START section of dependencies that don't install on Darwin/MacOS -bitsandbytes==0.45.4 +bitsandbytes==0.46.0 triton>=3.0.0 mamba-ssm==1.2.0.post1 xformers>=0.0.23.post1 @@ -15,7 +15,7 @@ huggingface_hub==0.32.2 peft==0.15.2 transformers==4.52.4 tokenizers>=0.21.1 -accelerate==1.7.0 +accelerate==1.8.1 datasets==3.6.0 deepspeed>=0.17.0 trl==0.18.2 diff --git a/src/axolotl/train.py b/src/axolotl/train.py index a476385d0..a731316b6 100644 --- a/src/axolotl/train.py +++ b/src/axolotl/train.py @@ -223,8 +223,9 @@ def execute_training( ) LOG.info("Starting trainer...") - if cfg.bf16: - torch.set_default_dtype(torch.bfloat16) + # TODO: disabling for now as not compatible with FSDP2 + torchao low bit optimizers + # if cfg.bf16: + # torch.set_default_dtype(torch.bfloat16) trainer.train(resume_from_checkpoint=resume_from_checkpoint) diff --git a/tests/conftest.py b/tests/conftest.py index 8ab8fd6a4..12e79c0e3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -10,6 +10,7 @@ import sys import tempfile import time from pathlib import Path +from typing import Generator import datasets import pytest @@ -411,7 +412,7 @@ def tokenizer_mistral_7b_instruct_chatml(tokenizer_mistral_7b_instruct): @pytest.fixture -def temp_dir(): +def temp_dir() -> Generator[str, None, None]: # Create a temporary directory _temp_dir = tempfile.mkdtemp() yield _temp_dir diff --git a/tests/e2e/multigpu/patched/test_sp.py b/tests/e2e/multigpu/patched/test_sp.py index 8883e0135..31a728f20 100644 --- a/tests/e2e/multigpu/patched/test_sp.py +++ b/tests/e2e/multigpu/patched/test_sp.py @@ -54,6 +54,7 @@ class TestSequenceParallelism: "micro_batch_size": micro_batch_size, "gradient_accumulation_steps": 2, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_8bit", "lr_scheduler": "cosine", diff --git a/tests/e2e/multigpu/solo/test_flex.py b/tests/e2e/multigpu/solo/test_flex.py index c8f14330d..b892fe213 100644 --- a/tests/e2e/multigpu/solo/test_flex.py +++ b/tests/e2e/multigpu/solo/test_flex.py @@ -54,6 +54,7 @@ class TestPackedFlex: "gradient_accumulation_steps": 2, "gradient_checkpointing": True, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", diff --git a/tests/e2e/multigpu/solo/test_grpo.py b/tests/e2e/multigpu/solo/test_grpo.py index 1daf58472..c595d3fc0 100644 --- a/tests/e2e/multigpu/solo/test_grpo.py +++ b/tests/e2e/multigpu/solo/test_grpo.py @@ -309,6 +309,7 @@ def oai_gsm8k_transform(cfg, *args, **kwargs): "warmup_steps": 10, "val_set_size": 0.0, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.0001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", @@ -400,6 +401,7 @@ def oai_gsm8k_transform(cfg, *args, **kwargs): "warmup_steps": 10, "val_set_size": 0.0, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.0001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", diff --git a/tests/e2e/multigpu/test_eval.py b/tests/e2e/multigpu/test_eval.py index 379562e40..d6429cf63 100644 --- a/tests/e2e/multigpu/test_eval.py +++ b/tests/e2e/multigpu/test_eval.py @@ -38,12 +38,13 @@ class TestMultiGPUEval: "lora_dropout": 0.05, "lora_target_linear": True, "lora_modules_to_save": ["embed_tokens", "lm_head"], - "val_set_size": 0.004, + "val_set_size": 0.05, "special_tokens": {"pad_token": "<|endoftext|>"}, "datasets": [ { "path": "teknium/GPT4-LLM-Cleaned", "type": "alpaca", + "split": "train[:5%]", }, ], "num_epochs": 1, @@ -51,6 +52,7 @@ class TestMultiGPUEval: "micro_batch_size": 2, "gradient_accumulation_steps": 2, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_8bit", "lr_scheduler": "cosine", @@ -107,12 +109,13 @@ class TestMultiGPUEval: "lora_dropout": 0.05, "lora_target_linear": True, "lora_modules_to_save": ["embed_tokens", "lm_head"], - "val_set_size": 0.0004, + "val_set_size": 0.01, "special_tokens": {"pad_token": "<|endoftext|>"}, "datasets": [ { "path": "teknium/GPT4-LLM-Cleaned", "type": "alpaca", + "split": "train[:5%]", }, ], "num_epochs": 1, @@ -120,6 +123,7 @@ class TestMultiGPUEval: "micro_batch_size": 2, "gradient_accumulation_steps": 2, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_8bit", "lr_scheduler": "cosine", diff --git a/tests/e2e/multigpu/test_gemma3.py b/tests/e2e/multigpu/test_gemma3.py index b4cb6e59d..3868d90f0 100644 --- a/tests/e2e/multigpu/test_gemma3.py +++ b/tests/e2e/multigpu/test_gemma3.py @@ -64,6 +64,7 @@ class TestMultiGPUGemma3: }, "gradient_accumulation_steps": 2, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.0001, "optimizer": "adamw_8bit", "lr_scheduler": "cosine", diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py index a8ed6bda0..d84505714 100644 --- a/tests/e2e/multigpu/test_llama.py +++ b/tests/e2e/multigpu/test_llama.py @@ -62,6 +62,7 @@ class TestMultiGPULlama: "gradient_accumulation_steps": 2, # "gradient_checkpointing": True, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_8bit", "lr_scheduler": "cosine", @@ -127,6 +128,7 @@ class TestMultiGPULlama: "gradient_accumulation_steps": gradient_accumulation_steps, # "gradient_checkpointing": True, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_8bit", "lr_scheduler": "cosine", @@ -200,6 +202,7 @@ class TestMultiGPULlama: "gradient_accumulation_steps": 2, # "gradient_checkpointing": True, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "warmup_steps": 0, "learning_rate": 0.00001, "optimizer": "adamw_8bit", @@ -278,6 +281,7 @@ class TestMultiGPULlama: "gradient_accumulation_steps": 2, # "gradient_checkpointing": True, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "warmup_steps": 0, "learning_rate": 0.00001, "optimizer": "adamw_8bit", @@ -340,6 +344,7 @@ class TestMultiGPULlama: "gradient_accumulation_steps": gradient_accumulation_steps, # "gradient_checkpointing": True, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", @@ -412,6 +417,7 @@ class TestMultiGPULlama: "gradient_accumulation_steps": 2, # "gradient_checkpointing": True, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", @@ -491,6 +497,7 @@ class TestMultiGPULlama: "gradient_accumulation_steps": 2, "gradient_checkpointing": True, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_torch_8bit", "lr_scheduler": "cosine", @@ -573,6 +580,7 @@ class TestMultiGPULlama: "gradient_accumulation_steps": 2, # "gradient_checkpointing": True, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", @@ -669,6 +677,7 @@ class TestMultiGPULlama: "micro_batch_size": 1, "gradient_accumulation_steps": gradient_accumulation_steps, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", @@ -743,6 +752,7 @@ class TestMultiGPULlama: "micro_batch_size": 1, "gradient_accumulation_steps": gradient_accumulation_steps, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", @@ -817,6 +827,7 @@ class TestMultiGPULlama: "micro_batch_size": 1, "gradient_accumulation_steps": gradient_accumulation_steps, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", diff --git a/tests/e2e/multigpu/test_qwen2.py b/tests/e2e/multigpu/test_qwen2.py index fa4efa32b..bd57dbcef 100644 --- a/tests/e2e/multigpu/test_qwen2.py +++ b/tests/e2e/multigpu/test_qwen2.py @@ -46,6 +46,7 @@ class TestMultiGPUQwen2: "micro_batch_size": 2, "gradient_accumulation_steps": 2, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", diff --git a/tests/e2e/multigpu/test_ray.py b/tests/e2e/multigpu/test_ray.py index 22023507a..43a722b48 100644 --- a/tests/e2e/multigpu/test_ray.py +++ b/tests/e2e/multigpu/test_ray.py @@ -48,6 +48,7 @@ class TestMultiGPURay: "micro_batch_size": 4, "gradient_accumulation_steps": 2, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_8bit", "lr_scheduler": "cosine", @@ -107,6 +108,7 @@ class TestMultiGPURay: "micro_batch_size": 1, "gradient_accumulation_steps": gradient_accumulation_steps, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_torch", "lr_scheduler": "cosine",