diff --git a/examples/pythia-12b/README.md b/examples/pythia-12b/README.md
index d28d5e77d..123ffa710 100644
--- a/examples/pythia-12b/README.md
+++ b/examples/pythia-12b/README.md
@@ -1,4 +1,4 @@
-# Python 12B
+# Pythia 12B
 
 - Single-GPU A100 only (?)
 
diff --git a/examples/pythia-12b/config.yml b/examples/pythia-12b/config.yml
index 28e822c77..3b3d91630 100644
--- a/examples/pythia-12b/config.yml
+++ b/examples/pythia-12b/config.yml
@@ -22,7 +22,7 @@ lora_dropout: 0.0
 lora_target_modules:
 lora_target_linear: true
 lora_fan_in_fan_out: true # pythia/GPTNeoX lora specific
-wandb_project: pythia-12b
+wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
@@ -45,5 +45,5 @@ resume_from_checkpoint:
 local_rank:
 gradient_checkpointing: true
 fsdp:
-fsdp_transformer_layer_cls_to_wrap:
+fsdp_config:
 collator_pad_to_longest: true
diff --git a/scripts/finetune.py b/scripts/finetune.py
index ab226f68f..47aada411 100644
--- a/scripts/finetune.py
+++ b/scripts/finetune.py
@@ -208,7 +208,10 @@ def train(
         )
     else:
         train_dataset = load_pretraining_dataset(
-            cfg.pretraining_dataset, tokenizer, max_tokens=cfg.sequence_len
+            cfg.pretraining_dataset,
+            tokenizer,
+            max_tokens=cfg.sequence_len,
+            seed=cfg.seed,
         )
         # https://discuss.huggingface.co/t/how-to-use-huggingface-trainer-streaming-datasets-without-wrapping-it-with-torchdatas-iterablewrapper/25230
         train_dataset = train_dataset.with_format("torch")
diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py
index 492d8059b..058c24bcd 100644
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -505,10 +505,10 @@ def encode_pretraining(tokenizer, max_tokens, examples):
     return ret
 
 
-def load_pretraining_dataset(path, tokenizer, max_tokens=2048):
+def load_pretraining_dataset(path, tokenizer, max_tokens=2048, seed=42):
     encode = functools.partial(encode_pretraining, tokenizer, max_tokens)
     dataset = load_dataset(path, streaming=True, split="train")
-    dataset = dataset.shuffle(seed=42, buffer_size=10_000)
+    dataset = dataset.shuffle(seed=seed, buffer_size=10_000)
     # TODO dynamically figure out which columns/features to remove
     dataset = dataset.map(encode, batched=True, remove_columns=["text", "meta"])
     return dataset
diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py
index 59b1dc803..57a08aa53 100644
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -1,7 +1,6 @@
 """Module containing the Trainer class and related functions"""
 
 import importlib
-import logging
 import math
 import os
 import sys
@@ -232,7 +231,6 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
         callbacks.append(SavePeftModelCallback)
 
     if hasattr(model, "use_bettertransformer") and model.use_bettertransformer is True:
-        logging.info("Setting up SaveBetterTransformerModelCallback.")
         callbacks.append(SaveBetterTransformerModelCallback)
 
     data_collator_kwargs = {
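
The substantive change above threads `cfg.seed` through `load_pretraining_dataset` into the streaming dataset's buffered shuffle, replacing the hard-coded `seed=42`, so the shuffle order of a streamed pretraining corpus follows the run config. A minimal sketch of what the patched call does, standalone; the dataset name (`c4`/`en`) and `seed=1234` here are illustrative stand-ins for `cfg.pretraining_dataset` and `cfg.seed`, not values from this patch:

```python
# Minimal sketch: seeded buffered shuffle of a streamed HF dataset.
# Assumptions: the `datasets` library is installed; "c4"/"en" and seed=1234
# are hypothetical stand-ins for cfg.pretraining_dataset and cfg.seed.
import itertools

from datasets import load_dataset

seed = 1234  # in the patched trainer this value comes from cfg.seed

# streaming=True yields an IterableDataset that is read lazily from the hub
dataset = load_dataset("c4", "en", streaming=True, split="train")

# The buffered shuffle fills a 10k-example buffer and samples from it.
# With a fixed seed, repeated runs visit examples in the same order, so
# the shuffle order is reproducible per config rather than pinned to 42.
dataset = dataset.shuffle(seed=seed, buffer_size=10_000)

# Peek at the first few shuffled examples
for example in itertools.islice(dataset, 3):
    print(example["text"][:80])
```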