diff --git a/src/axolotl/core/trainer_builder.py b/src/axolotl/core/trainer_builder.py
index 5391904fc..662b64896 100755
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -290,6 +290,18 @@ class AxolotlTrainer(Trainer):
         if self.args.orpo_alpha:
             self.loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
 
+    def _wrap_model(self, model, training=True, dataloader=None):
+        if self.args.torch_compile:
+            torch._dynamo.config.accumulated_cache_size_limit = (  # pylint: disable=protected-access
+                256
+            )
+            model = torch.compile(
+                model,
+                backend=self.args.torch_compile_backend,
+                mode=self.args.torch_compile_mode,
+            )
+        return super()._wrap_model(model, training=training, dataloader=dataloader)
+
     def create_optimizer(self):
         if (
             self.args.loraplus_lr_ratio is None
diff --git a/src/axolotl/train.py b/src/axolotl/train.py
index 99a9b0ba9..5de1bc114 100644
--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -52,6 +52,13 @@ class TrainDatasetMeta:
 def train(
     *, cfg: DictDefault, cli_args: TrainerCliArgs, dataset_meta: TrainDatasetMeta
 ) -> Tuple[Union[PeftModel, PreTrainedModel], PreTrainedTokenizer]:
+    # enable expandable segments for cuda allocation to improve VRAM usage
+    # torch_version = torch.__version__.split(".")
+    # torch_major, torch_minor = int(torch_version[0]), int(torch_version[1])
+    # if torch_major == 2 and torch_minor >= 2:
+    #     if os.getenv("PYTORCH_CUDA_ALLOC_CONF") is None:
+    #         os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+
     # load the tokenizer first
     LOG.debug(
         f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}",
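
For context, a minimal standalone sketch of what the new `_wrap_model` override does, outside the Trainer machinery. The toy model and the `backend`/`mode` values are assumptions standing in for `args.torch_compile_backend` / `args.torch_compile_mode`; the private `torch._dynamo.config.accumulated_cache_size_limit` knob is the same one the patch bumps to 256.

# Hypothetical standalone sketch (not part of the patch): compile a toy
# model the way the patched _wrap_model compiles the training model.
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 16))

# Raise dynamo's recompile-cache ceiling, as the patch does; _dynamo is a
# private namespace, hence the pylint disable in the diff.
torch._dynamo.config.accumulated_cache_size_limit = 256  # pylint: disable=protected-access

# backend/mode here stand in for args.torch_compile_backend /
# args.torch_compile_mode; "inductor" and None are torch.compile's defaults.
compiled = torch.compile(model, backend="inductor", mode=None)

out = compiled(torch.randn(4, 16))
print(out.shape)  # torch.Size([4, 16])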
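
The train.py hunk lands deliberately commented out; below is a hedged sketch of the version gate it describes, mirroring the values in the diff. Note the allocator only honors PYTORCH_CUDA_ALLOC_CONF if it is set before the first CUDA allocation, which is presumably why the patch would set it at the top of train().

# Sketch of the (commented-out) expandable-segments gate from the diff;
# assumes a "major.minor.patch"-style version string such as "2.2.0+cu121".
import os
import torch

torch_major, torch_minor = (int(part) for part in torch.__version__.split(".")[:2])
if torch_major == 2 and torch_minor >= 2:
    # Only set it if the user hasn't already configured the allocator,
    # and do so before the first CUDA allocation so it takes effect.
    if os.getenv("PYTORCH_CUDA_ALLOC_CONF") is None:
        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"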