diff --git a/docs/config.qmd b/docs/config.qmd index 3a11666a5..4686b8b3a 100644 --- a/docs/config.qmd +++ b/docs/config.qmd @@ -78,6 +78,9 @@ tf32: true # require >=ampere bfloat16: true # require >=ampere float16: true +# Tensor parallel +tp_size: 1 # should be set to the number of CUDA devices available + # Limit the memory for all available GPUs to this amount (if an integer, expressed in gigabytes); default: unset gpu_memory_limit: 20GiB # Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index 755d60908..16e5c2c73 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -543,18 +543,11 @@ def setup_fsdp_envs(cfg): ] = cfg.fsdp_config.fsdp_transformer_layer_cls_to_wrap -def setup_tp_envs(): - os.environ["ACCELERATE_USE_TP"] = "true" - - def prepare_optim_env(cfg): if not check_cuda_p2p_ib_support(): if os.getenv("NCCL_P2P_DISABLE") is None: os.environ["NCCL_P2P_DISABLE"] = "1" - if cfg.tp_size > 1: - setup_tp_envs() - if cfg.fsdp: setup_fsdp_envs(cfg) elif cfg.deepspeed: