From a54c1be9722f8b93d3b24334c40c174374e63fda Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Mon, 4 Aug 2025 21:23:36 +0700 Subject: [PATCH] Fix: shorten mem logs to 2 decimal places and renamed nd docs (#3011) [skip ci] * fix: shorten memory logs * fix: title name --- docs/nd_parallelism.qmd | 4 +++- src/axolotl/core/trainers/base.py | 8 ++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/nd_parallelism.qmd b/docs/nd_parallelism.qmd index 7c2d2e0cb..d27a15663 100644 --- a/docs/nd_parallelism.qmd +++ b/docs/nd_parallelism.qmd @@ -1,4 +1,6 @@ -# N-D Parallelism +--- +title: "N-D Parallelism" +--- Axolotl enables training models at scale by composing different parallelism techniques. This is essential when: diff --git a/src/axolotl/core/trainers/base.py b/src/axolotl/core/trainers/base.py index f739d19e9..617506eb2 100644 --- a/src/axolotl/core/trainers/base.py +++ b/src/axolotl/core/trainers/base.py @@ -567,10 +567,10 @@ class AxolotlTrainer( # Add memory usage try: active, allocated, reserved = get_gpu_memory_usage() - logs["memory/max_memory_active"] = active - logs["memory/max_memory_allocated"] = allocated - logs["memory/device_memory_reserved"] = reserved - except (ValueError, FileNotFoundError): + logs["memory/max_memory_active(gib)"] = round(active, 2) + logs["memory/max_memory_allocated(gib)"] = round(allocated, 2) + logs["memory/device_memory_reserved(gib)"] = round(reserved, 2) + except (ValueError, TypeError, FileNotFoundError): pass del self._stored_metrics[train_eval]