diff --git a/ds_config.json b/ds_config.json
index 65955377c..1e150ba95 100644
--- a/ds_config.json
+++ b/ds_config.json
@@ -1,35 +1,6 @@
 {
-  "bf16": {
-    "enabled": "auto"
-  },
-  "fp16": {
-    "enabled": "auto",
-    "loss_scale": 0,
-    "loss_scale_window": 1000,
-    "initial_scale_power": 16,
-    "hysteresis": 2,
-    "min_loss_scale": 1
-  },
-  "optimizer": {
-    "type": "Adam",
-    "params": {
-      "lr": "auto",
-      "betas": "auto",
-      "eps": "auto",
-      "weight_decay": "auto"
-    }
-  },
-  "scheduler": {
-    "type": "WarmupDecayLR",
-    "params": {
-      "warmup_min_lr": "auto",
-      "warmup_max_lr": "auto",
-      "warmup_num_steps": "auto",
-      "total_num_steps": "auto"
-    }
-  },
   "zero_optimization": {
-    "stage": 2,
+    "stage": 3,
     "offload_optimizer": {
       "device": "cpu",
       "pin_memory": true
@@ -39,20 +10,48 @@
       "pin_memory": true
     },
     "overlap_comm": true,
-    "allgather_partitions": true,
-    "allgather_bucket_size": 5e8,
     "contiguous_gradients": true,
+    "sub_group_size": 0,
     "reduce_bucket_size": "auto",
-    "reduce_scatter": true,
+    "stage3_prefetch_bucket_size": "auto",
+    "stage3_param_persistence_threshold": "auto",
     "stage3_max_live_parameters": 0,
     "stage3_max_reuse_distance": 0,
     "stage3_gather_16bit_weights_on_model_save": true
   },
-  "gradient_accumulation_steps": "auto",
-  "gradient_clipping": "auto",
-  "steps_per_print": 5,
+  "bf16": {
+    "enabled": "auto"
+  },
+  "fp16": {
+    "enabled": "auto",
+    "auto_cast": false,
+    "loss_scale": 0,
+    "initial_scale_power": 32,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "optimizer": {
+    "type": "AdamW",
+    "params": {
+      "lr": "auto",
+      "betas": [
+        0.9,
+        0.999
+      ],
+      "eps": 1e-8,
+      "weight_decay": "auto"
+    }
+  },
+  "scheduler": {
+    "type": "OneCycle",
+    "params": {
+      "cycle_min_lr": 0.00001,
+      "cycle_max_lr": 0.00003,
+      "cycle_first_step_size": 120
+    }
+  },
   "train_batch_size": "auto",
   "train_micro_batch_size_per_gpu": "auto",
-  "wall_clock_breakdown": false,
-  "round_robin_gradients": true
+  "wall_clock_breakdown": false
 }
diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index 774802a7d..07872a16e 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -125,7 +125,7 @@ def load_model(
             load_in_4bit=True,
             llm_int8_threshold=6.0,
             llm_int8_has_fp16_weight=False,
-            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_compute_dtype=torch_dtype,
             bnb_4bit_use_double_quant=True,
             bnb_4bit_quant_type="nf4",
         )
@@ -174,7 +174,7 @@ def load_model(
             load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
             load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
             torch_dtype=torch_dtype,
-            device_map=cfg.device_map,
+            device_map="auto" if cfg.world_size == 1 else cfg.device_map,
             **model_kwargs,
         )
     # elif model_type == "GPTNeoXForCausalLM" and cfg.flash_attention:
@@ -273,13 +273,13 @@ def load_model(
     if (
         torch.cuda.device_count() > 1
         and int(os.getenv("WORLD_SIZE", "1")) > 1
-        and cfg.gptq
+        and (cfg.gptq or cfg.load_in_4bit)
     ):
         # llama is PROBABLY model parallelizable, but the default isn't that it is
         # so let's only set it for the 4bit, see
         # https://github.com/johnsmith0031/alpaca_lora_4bit/blob/08b3fca4a4a9e0d3945be1bab4529f100a428636/finetune.py#L130-L133
-        model.is_parallelizable = True
-        model.model_parallel = True
+        setattr(model, 'is_parallelizable', True)
+        setattr(model, 'model_parallel', True)

     requires_grad = []
     for name, param in model.named_parameters(recurse=True):
diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py
index cb67eac7d..97b02baba 100644
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -113,7 +113,8 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
         output_dir=cfg.output_dir,
         save_total_limit=3,
         load_best_model_at_end=True
-        if cfg.val_set_size > 0
+        if cfg.load_best_model_at_end is not False  # if explicitly set to False, fall back to False
+        and cfg.val_set_size > 0
         and save_steps is not None
         and save_steps % eval_steps == 0
         and cfg.load_in_8bit is not True
@@ -218,7 +219,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):

     trainer_cls = (
         OneCycleLRSchedulerTrainer
-        if cfg.lr_scheduler == "one_cycle" and cfg.fsdp
+        if cfg.lr_scheduler == "one_cycle" and (cfg.fsdp or cfg.adapter == "qlora")
         else transformers.Trainer
     )
     trainer = trainer_cls(
diff --git a/tests/test_prompt_tokenizers.py b/tests/test_prompt_tokenizers.py
index 76ff59a0c..7595ffbe4 100644
--- a/tests/test_prompt_tokenizers.py
+++ b/tests/test_prompt_tokenizers.py
@@ -1,6 +1,7 @@
 import json
 import logging
 import unittest
+from pathlib import Path

 from transformers import AutoTokenizer

@@ -22,10 +23,11 @@ class TestPromptTokenizationStrategies(unittest.TestCase):
         )

     def test_sharegpt_integration(self):
-        with open("./fixtures/conversation.json", "r") as fin:
+        print(Path(__file__).parent)
+        with open(Path(__file__).parent / "fixtures/conversation.json", "r") as fin:
             data = fin.read()
             conversation = json.loads(data)
-        with open("./fixtures/conversation.tokenized.json", "r") as fin:
+        with open(Path(__file__).parent / "fixtures/conversation.tokenized.json", "r") as fin:
             data = fin.read()
             tokenized_conversation = json.loads(data)
         prompter = ShareGPTPrompter("chat")