From df3eb645da88fe18f026c1ae201540da2622f984 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Tue, 25 Jul 2023 10:22:05 -0400
Subject: [PATCH] better handling of variance in multipack dataloader length
 and trainer hanging when it runs out of data

The reported batch count is reduced by 1% and truncated to an integer:
Python requires __len__ to return an int, so a bare `len(batches) * 0.99`
would raise TypeError on any len(dataloader) call. num_batches applies the
same truncation so both methods report the same value.

save_total_limit is now read from cfg.save_total_limit and falls back to 4
when that value is unset or falsy.

---
 src/axolotl/utils/dataloader.py | 6 ++++--
 src/axolotl/utils/trainer.py    | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/axolotl/utils/dataloader.py b/src/axolotl/utils/dataloader.py
index 77cc933b3..f1ba51bb7 100644
--- a/src/axolotl/utils/dataloader.py
+++ b/src/axolotl/utils/dataloader.py
@@ -193,11 +193,13 @@ class MultipackDistributedDataloader:
 
     def __len__(self):
         batches, _ = self.generate_batches()
-        return len(batches)
+        return int(
+            len(batches) * 0.99
+        )  # shave off 1% for dealing with variance in packing and dataset length
 
     def num_batches(self):
         batches, _ = self.generate_batches()
-        return len(batches)
+        return int(len(batches) * 0.99)  # same 1% margin as __len__, kept integral
 
     def efficiency(self):
         return self.eff_total_used / self.eff_total_slots
diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py
index 0d05db337..8507d604f 100644
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -320,7 +320,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
         eval_steps=cfg.eval_steps if cfg.val_set_size > 0 else None,
         save_steps=cfg.save_steps,
         output_dir=cfg.output_dir,
-        save_total_limit=3,
+        save_total_limit=cfg.save_total_limit if cfg.save_total_limit else 4,
         load_best_model_at_end=(
             cfg.load_best_model_at_end is not False
             and cfg.val_set_size > 0