From 87726322bf45690ca5e50ee3d5c0c264ba817df8 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 30 Apr 2025 03:32:44 -0400 Subject: [PATCH] upload the deepspeed json to wandb (#2593) [skip ci] --- src/axolotl/utils/callbacks/__init__.py | 40 +++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/src/axolotl/utils/callbacks/__init__.py b/src/axolotl/utils/callbacks/__init__.py index ffe4699f8..21b14d986 100644 --- a/src/axolotl/utils/callbacks/__init__.py +++ b/src/axolotl/utils/callbacks/__init__.py @@ -3,6 +3,7 @@ from __future__ import annotations import gc +import json import logging import os import traceback @@ -808,11 +809,44 @@ class SaveAxolotlConfigtoWandBCallback(TrainerCallback): artifact.add_file(temp_file.name) wandb.log_artifact(artifact) wandb.save(temp_file.name) - LOG.info( - "The Axolotl config has been saved to the WandB run under files." - ) + LOG.info( + "The Axolotl config has been saved to the WandB run under files." + ) except (FileNotFoundError, ConnectionError) as err: LOG.warning(f"Error while saving Axolotl config to WandB: {err}") + + if args.deepspeed: + try: + # sync config to top level in run, cannot delete file right away because wandb schedules it to be synced even w/policy = 'now', so let OS delete it later. + with NamedTemporaryFile( + mode="w", + delete=False, + suffix=".json", + prefix="deepspeed_config_", + ) as temp_file: + skip_upload = False + if isinstance(args.deepspeed, dict): + json.dump(args.deepspeed, temp_file, indent=4) + elif isinstance(args.deepspeed, str) and os.path.exists( + args.deepspeed + ): + copyfile(args.deepspeed, temp_file.name) + else: + skip_upload = True + if not skip_upload: + artifact = wandb.Artifact( + f"deepspeed-config-{wandb.run.id}", + type="deepspeed-config", + ) + artifact.add_file(temp_file.name) + wandb.log_artifact(artifact) + wandb.save(temp_file.name) + LOG.info( + "The DeepSpeed config has been saved to the WandB run under files." + ) + except (FileNotFoundError, ConnectionError) as err: + LOG.warning(f"Error while saving DeepSpeed config to WandB: {err}") + return control