diff --git a/docs/nd_parallelism.qmd b/docs/nd_parallelism.qmd
index 7c2d2e0cb..d27a15663 100644
--- a/docs/nd_parallelism.qmd
+++ b/docs/nd_parallelism.qmd
@@ -1,4 +1,6 @@
-# N-D Parallelism
+---
+title: "N-D Parallelism"
+---
 
 Axolotl enables training models at scale by composing different parallelism techniques. This is essential when:
 
diff --git a/src/axolotl/core/trainers/base.py b/src/axolotl/core/trainers/base.py
index f739d19e9..617506eb2 100644
--- a/src/axolotl/core/trainers/base.py
+++ b/src/axolotl/core/trainers/base.py
@@ -567,10 +567,10 @@ class AxolotlTrainer(
         # Add memory usage
         try:
             active, allocated, reserved = get_gpu_memory_usage()
-            logs["memory/max_memory_active"] = active
-            logs["memory/max_memory_allocated"] = allocated
-            logs["memory/device_memory_reserved"] = reserved
-        except (ValueError, FileNotFoundError):
+            logs["memory/max_memory_active(gib)"] = round(active, 2)
+            logs["memory/max_memory_allocated(gib)"] = round(allocated, 2)
+            logs["memory/device_memory_reserved(gib)"] = round(reserved, 2)
+        except (ValueError, TypeError, FileNotFoundError):
             pass
 
         del self._stored_metrics[train_eval]
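
For context, here is a minimal standalone sketch of the logging pattern after this patch. The body of get_gpu_memory_usage is an assumption for illustration only (the real helper lives elsewhere in axolotl and is not shown in the diff); the sketch assumes it returns (active, allocated, reserved) in GiB, built here from standard torch.cuda memory queries:

import torch

def get_gpu_memory_usage() -> tuple[float, float, float]:
    # Hypothetical stand-in for axolotl's helper: peak active, peak
    # allocated, and currently reserved CUDA memory, converted to GiB.
    gib = 1024**3
    stats = torch.cuda.memory_stats()
    active = stats.get("active_bytes.all.peak", 0) / gib
    allocated = torch.cuda.max_memory_allocated() / gib
    reserved = torch.cuda.memory_reserved() / gib
    return active, allocated, reserved

logs: dict[str, float] = {}
try:
    active, allocated, reserved = get_gpu_memory_usage()
    # Keys now carry a "(gib)" unit suffix and values are rounded to
    # two decimals, matching the patched lines above.
    logs["memory/max_memory_active(gib)"] = round(active, 2)
    logs["memory/max_memory_allocated(gib)"] = round(allocated, 2)
    logs["memory/device_memory_reserved(gib)"] = round(reserved, 2)
except (ValueError, TypeError, FileNotFoundError):
    # TypeError is newly caught here: round(None, 2) raises it, so this
    # guards against the helper yielding None (an assumption about why
    # the patch adds it; the diff itself does not say).
    pass

The rounding keeps tracker dashboards (e.g. W&B) readable, and the unit suffix in the key makes the GiB scale explicit where a bare float was previously ambiguous.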