Fix: shorten memory logs to 2 decimal places and rename N-D docs title (#3011) [skip ci]
* fix: shorten memory logs
* fix: title name
This commit is contained in:
@@ -1,4 +1,6 @@
|
|||||||
# N-D Parallelism
|
---
|
||||||
|
title: "N-D Parallelism"
|
||||||
|
---
|
||||||
|
|
||||||
Axolotl enables training models at scale by composing different parallelism techniques. This is essential when:
|
Axolotl enables training models at scale by composing different parallelism techniques. This is essential when:
|
||||||
|
|
||||||
|
|||||||
@@ -567,10 +567,10 @@ class AxolotlTrainer(
|
|||||||
# Add memory usage
|
# Add memory usage
|
||||||
try:
|
try:
|
||||||
active, allocated, reserved = get_gpu_memory_usage()
|
active, allocated, reserved = get_gpu_memory_usage()
|
||||||
logs["memory/max_memory_active"] = active
|
logs["memory/max_memory_active(gib)"] = round(active, 2)
|
||||||
logs["memory/max_memory_allocated"] = allocated
|
logs["memory/max_memory_allocated(gib)"] = round(allocated, 2)
|
||||||
logs["memory/device_memory_reserved"] = reserved
|
logs["memory/device_memory_reserved(gib)"] = round(reserved, 2)
|
||||||
except (ValueError, FileNotFoundError):
|
except (ValueError, TypeError, FileNotFoundError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
del self._stored_metrics[train_eval]
|
del self._stored_metrics[train_eval]
|
||||||
|
|||||||
Reference in New Issue
Block a user