diff --git a/src/axolotl/core/trainer_builder.py b/src/axolotl/core/trainer_builder.py
index 5391904fc..662b64896 100755
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -290,6 +290,18 @@ class AxolotlTrainer(Trainer):
         if self.args.orpo_alpha:
             self.loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
 
+    def _wrap_model(self, model, training=True, dataloader=None):
+        if self.args.torch_compile:
+            torch._dynamo.config.accumulated_cache_size_limit = (  # pylint: disable=protected-access
+                256
+            )
+            model = torch.compile(
+                model,
+                backend=self.args.torch_compile_backend,
+                mode=self.args.torch_compile_mode,
+            )
+        return super()._wrap_model(model, training=training, dataloader=dataloader)
+
     def create_optimizer(self):
         if (
             self.args.loraplus_lr_ratio is None
diff --git a/src/axolotl/train.py b/src/axolotl/train.py
index 99a9b0ba9..5de1bc114 100644
--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -52,6 +52,13 @@ class TrainDatasetMeta:
 def train(
     *, cfg: DictDefault, cli_args: TrainerCliArgs, dataset_meta: TrainDatasetMeta
 ) -> Tuple[Union[PeftModel, PreTrainedModel], PreTrainedTokenizer]:
+    # enable expandable segments for cuda allocation to improve VRAM usage
+    # torch_version = torch.__version__.split(".")
+    # torch_major, torch_minor = int(torch_version[0]), int(torch_version[1])
+    # if torch_major == 2 and torch_minor >= 2:
+    #     if os.getenv("PYTORCH_CUDA_ALLOC_CONF") is None:
+    #         os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+
     # load the tokenizer first
     LOG.debug(
         f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}",
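
For context, a minimal standalone sketch of what the new `_wrap_model` override does, outside the Trainer machinery. The toy model and the `backend`/`mode` values are assumptions standing in for `args.torch_compile_backend` / `args.torch_compile_mode`; the private `torch._dynamo.config.accumulated_cache_size_limit` knob is the same one the patch bumps to 256.

# Hypothetical standalone sketch (not part of the patch): compile a toy
# model the way the patched _wrap_model compiles the training model.
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 16))

# Raise dynamo's recompile-cache ceiling, as the patch does; _dynamo is a
# private namespace, hence the pylint disable in the diff.
torch._dynamo.config.accumulated_cache_size_limit = 256  # pylint: disable=protected-access

# backend/mode here stand in for args.torch_compile_backend /
# args.torch_compile_mode; "inductor" and None are torch.compile's defaults.
compiled = torch.compile(model, backend="inductor", mode=None)

out = compiled(torch.randn(4, 16))
print(out.shape)  # torch.Size([4, 16])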
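
The train.py hunk lands deliberately commented out; below is a hedged sketch of the version gate it describes, mirroring the values in the diff. Note the allocator only honors PYTORCH_CUDA_ALLOC_CONF if it is set before the first CUDA allocation, which is presumably why the patch would set it at the top of train().

# Sketch of the (commented-out) expandable-segments gate from the diff;
# assumes a "major.minor.patch"-style version string such as "2.2.0+cu121".
import os
import torch

torch_major, torch_minor = (int(part) for part in torch.__version__.split(".")[:2])
if torch_major == 2 and torch_minor >= 2:
    # Only set it if the user hasn't already configured the allocator,
    # and do so before the first CUDA allocation so it takes effect.
    if os.getenv("PYTORCH_CUDA_ALLOC_CONF") is None:
        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"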