Feat(wandb): Refactor to be more flexible (#767)

* Feat: Update to handle wandb env better
* chore: rename wandb_run_id to wandb_name
* feat: add new recommendation and update config
* fix: indent and pop disabled env if project passed
* feat: test env set for wandb and recommendation
* feat: update to use wandb_name and allow id
* chore: add info to readme
2023-12-04 22:17:25 +09:00
parent 58ec8b1113
commit a1da39cd48
39 changed files with 140 additions and 50 deletions
--- a/README.md
+++ b/README.md
@@ -659,7 +659,8 @@ wandb_mode: # "offline" to save run metadata locally and not sync to the server,
 wandb_project: # Your wandb project name
 wandb_entity: # A wandb Team name if using a Team
 wandb_watch:
-wandb_run_id: # Set the name of your wandb run
+wandb_name: # Set the name of your wandb run
+wandb_run_id: # Set the ID of your wandb run
 wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training

 # Where to save the full-finetuned model to
@@ -955,7 +956,7 @@ wandb_mode:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:
 ```

--- a/examples/cerebras/btlm-ft.yml
+++ b/examples/cerebras/btlm-ft.yml
@@ -35,7 +35,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:

 output_dir: btlm-out
--- a/examples/cerebras/qlora.yml
+++ b/examples/cerebras/qlora.yml
@@ -24,7 +24,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:
 output_dir: ./qlora-out
 batch_size: 4
--- a/examples/code-llama/13b/lora.yml
+++ b/examples/code-llama/13b/lora.yml
@@ -29,7 +29,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:

 gradient_accumulation_steps: 4
--- a/examples/code-llama/13b/qlora.yml
+++ b/examples/code-llama/13b/qlora.yml
@@ -31,7 +31,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:

 gradient_accumulation_steps: 4
--- a/examples/code-llama/34b/lora.yml
+++ b/examples/code-llama/34b/lora.yml
@@ -29,7 +29,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:

 gradient_accumulation_steps: 4
--- a/examples/code-llama/34b/qlora.yml
+++ b/examples/code-llama/34b/qlora.yml
@@ -31,7 +31,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:

 gradient_accumulation_steps: 4
--- a/examples/code-llama/7b/lora.yml
+++ b/examples/code-llama/7b/lora.yml
@@ -29,7 +29,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:

 gradient_accumulation_steps: 4
--- a/examples/code-llama/7b/qlora.yml
+++ b/examples/code-llama/7b/qlora.yml
@@ -31,7 +31,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:

 gradient_accumulation_steps: 4
--- a/examples/falcon/config-7b-lora.yml
+++ b/examples/falcon/config-7b-lora.yml
@@ -26,7 +26,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:
 output_dir: ./falcon-7b
 batch_size: 2
--- a/examples/falcon/config-7b-qlora.yml
+++ b/examples/falcon/config-7b-qlora.yml
@@ -40,7 +40,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:
 output_dir: ./qlora-out

--- a/examples/falcon/config-7b.yml
+++ b/examples/falcon/config-7b.yml
@@ -26,7 +26,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:
 output_dir: ./falcon-7b
 batch_size: 2
--- a/examples/gptj/qlora.yml
+++ b/examples/gptj/qlora.yml
@@ -21,7 +21,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:
 output_dir: ./qlora-out
 gradient_accumulation_steps: 2
--- a/examples/jeopardy-bot/config.yml
+++ b/examples/jeopardy-bot/config.yml
@@ -19,7 +19,7 @@ lora_fan_in_fan_out: false
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:
 output_dir: ./jeopardy-bot-7b
 gradient_accumulation_steps: 1
--- a/examples/llama-2/fft_optimized.yml
+++ b/examples/llama-2/fft_optimized.yml
@@ -29,7 +29,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:

 gradient_accumulation_steps: 1
--- a/examples/llama-2/gptq-lora.yml
+++ b/examples/llama-2/gptq-lora.yml
@@ -32,7 +32,7 @@ lora_target_linear:
 lora_fan_in_fan_out:
 wandb_project:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:
 output_dir: ./model-out
 gradient_accumulation_steps: 1
--- a/examples/llama-2/lora.yml
+++ b/examples/llama-2/lora.yml
@@ -29,7 +29,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:

 gradient_accumulation_steps: 4
--- a/examples/llama-2/qlora.yml
+++ b/examples/llama-2/qlora.yml
@@ -31,7 +31,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:

 gradient_accumulation_steps: 4
--- a/examples/llama-2/relora.yml
+++ b/examples/llama-2/relora.yml
@@ -35,7 +35,7 @@ relora_cpu_offload: false
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:

 gradient_accumulation_steps: 4
--- a/examples/llama-2/tiny-llama.yml
+++ b/examples/llama-2/tiny-llama.yml
@@ -29,7 +29,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:

 gradient_accumulation_steps: 4
--- a/examples/mistral/config.yml
+++ b/examples/mistral/config.yml
@@ -21,7 +21,7 @@ pad_to_sequence_len: true
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:

 gradient_accumulation_steps: 4
--- a/examples/mistral/qlora.yml
+++ b/examples/mistral/qlora.yml
@@ -38,7 +38,7 @@ lora_target_modules:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:

 gradient_accumulation_steps: 4
--- a/examples/mpt-7b/config.yml
+++ b/examples/mpt-7b/config.yml
@@ -21,7 +21,7 @@ lora_fan_in_fan_out: false
 wandb_project: mpt-alpaca-7b
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:
 output_dir: ./mpt-alpaca-7b
 gradient_accumulation_steps: 1
--- a/examples/openllama-3b/config.yml
+++ b/examples/openllama-3b/config.yml
@@ -23,7 +23,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:
 output_dir: ./openllama-out
 gradient_accumulation_steps: 1
--- a/examples/openllama-3b/lora.yml
+++ b/examples/openllama-3b/lora.yml
@@ -29,7 +29,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:
 output_dir: ./lora-out
 gradient_accumulation_steps: 1
--- a/examples/openllama-3b/qlora.yml
+++ b/examples/openllama-3b/qlora.yml
@@ -23,7 +23,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:
 output_dir: ./qlora-out
 gradient_accumulation_steps: 1
--- a/examples/phi/phi-ft.yml
+++ b/examples/phi/phi-ft.yml
@@ -31,7 +31,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:

 gradient_accumulation_steps: 1
--- a/examples/phi/phi-qlora.yml
+++ b/examples/phi/phi-qlora.yml
@@ -31,7 +31,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:

 gradient_accumulation_steps: 1
--- a/examples/pythia-12b/config.yml
+++ b/examples/pythia-12b/config.yml
@@ -24,7 +24,7 @@ lora_fan_in_fan_out: true  # pythia/GPTNeoX lora specific
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:
 output_dir: ./pythia-12b
 gradient_accumulation_steps: 1
--- a/examples/pythia/lora.yml
+++ b/examples/pythia/lora.yml
@@ -18,7 +18,7 @@ lora_fan_in_fan_out: true  # pythia/GPTNeoX lora specific
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:
 output_dir: ./lora-alpaca-pythia
 gradient_accumulation_steps: 1
--- a/examples/qwen/lora.yml
+++ b/examples/qwen/lora.yml
@@ -31,7 +31,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:

 gradient_accumulation_steps: 4
--- a/examples/qwen/qlora.yml
+++ b/examples/qwen/qlora.yml
@@ -31,7 +31,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:

 gradient_accumulation_steps: 4
--- a/examples/redpajama/config-3b.yml
+++ b/examples/redpajama/config-3b.yml
@@ -22,7 +22,7 @@ lora_fan_in_fan_out: false
 wandb_project: redpajama-alpaca-3b
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:
 output_dir: ./redpajama-alpaca-3b
 batch_size: 4
--- a/examples/replit-3b/config-lora.yml
+++ b/examples/replit-3b/config-lora.yml
@@ -21,7 +21,7 @@ lora_fan_in_fan_out:
 wandb_project: lora-replit
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:
 output_dir: ./lora-replit
 batch_size: 8
--- a/examples/xgen-7b/xgen-7b-8k-qlora.yml
+++ b/examples/xgen-7b/xgen-7b-8k-qlora.yml
@@ -38,7 +38,7 @@ lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
-wandb_run_id:
+wandb_name:
 wandb_log_model:
 output_dir: ./qlora-out

--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -647,7 +647,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        training_arguments_kwargs["group_by_length"] = self.cfg.group_by_length
        training_arguments_kwargs["report_to"] = "wandb" if self.cfg.use_wandb else None
        training_arguments_kwargs["run_name"] = (
-            self.cfg.wandb_run_id if self.cfg.use_wandb else None
+            self.cfg.wandb_name if self.cfg.use_wandb else None
        )
        training_arguments_kwargs["optim"] = (
            self.cfg.optimizer if self.cfg.optimizer else "adamw_hf"
--- a/src/axolotl/utils/config.py
+++ b/src/axolotl/utils/config.py
@@ -397,6 +397,13 @@ def validate_config(cfg):
            "Gradient checkpointing is broken for Qwen models for transformers>=4.35.0, except main branch."
        )

+    if cfg.wandb_run_id and not cfg.wandb_name:
+        cfg.wandb_name = cfg.wandb_run_id
+
+        LOG.warning(
+            "wandb_run_id sets the ID of the run. If you would like to set the name, please use wandb_name instead."
+        )
+
    # TODO
    # MPT 7b
    # https://github.com/facebookresearch/bitsandbytes/issues/25
--- a/src/axolotl/utils/wandb_.py
+++ b/src/axolotl/utils/wandb_.py
@@ -2,20 +2,20 @@

 import os

+from axolotl.utils.dict import DictDefault

-def setup_wandb_env_vars(cfg):
-    if cfg.wandb_mode and cfg.wandb_mode == "offline":
-        os.environ["WANDB_MODE"] = cfg.wandb_mode
-    elif cfg.wandb_project and len(cfg.wandb_project) > 0:
-        os.environ["WANDB_PROJECT"] = cfg.wandb_project
+
+def setup_wandb_env_vars(cfg: DictDefault):
+    for key in cfg.keys():
+        if key.startswith("wandb_"):
+            value = cfg.get(key, "")
+
+            if value and isinstance(value, str) and len(value) > 0:
+                os.environ[key.upper()] = value
+
+    # Enable wandb if project name is present
+    if cfg.wandb_project and len(cfg.wandb_project) > 0:
        cfg.use_wandb = True
-        if cfg.wandb_entity and len(cfg.wandb_entity) > 0:
-            os.environ["WANDB_ENTITY"] = cfg.wandb_entity
-        if cfg.wandb_watch and len(cfg.wandb_watch) > 0:
-            os.environ["WANDB_WATCH"] = cfg.wandb_watch
-        if cfg.wandb_log_model and len(cfg.wandb_log_model) > 0:
-            os.environ["WANDB_LOG_MODEL"] = cfg.wandb_log_model
-        if cfg.wandb_run_id and len(cfg.wandb_run_id) > 0:
-            os.environ["WANDB_RUN_ID"] = cfg.wandb_run_id
+        os.environ.pop("WANDB_DISABLED", None)  # Remove if present
    else:
        os.environ["WANDB_DISABLED"] = "true"
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -1,6 +1,7 @@
 """Module for testing the validation module"""

 import logging
+import os
 import unittest
 from typing import Optional

@@ -8,6 +9,7 @@ import pytest

 from axolotl.utils.config import validate_config
 from axolotl.utils.dict import DictDefault
+from axolotl.utils.wandb_ import setup_wandb_env_vars


 class ValidationTest(unittest.TestCase):
@@ -679,3 +681,83 @@ class ValidationTest(unittest.TestCase):
        )

        validate_config(cfg)
+
+
+class ValidationWandbTest(ValidationTest):
+    """
+    Validation test for wandb
+    """
+
+    def test_wandb_set_run_id_to_name(self):
+        cfg = DictDefault(
+            {
+                "wandb_run_id": "foo",
+            }
+        )
+
+        with self._caplog.at_level(logging.WARNING):
+            validate_config(cfg)
+            assert any(
+                "wandb_run_id sets the ID of the run. If you would like to set the name, please use wandb_name instead."
+                in record.message
+                for record in self._caplog.records
+            )
+
+            assert cfg.wandb_name == "foo" and cfg.wandb_run_id == "foo"
+
+        cfg = DictDefault(
+            {
+                "wandb_name": "foo",
+            }
+        )
+
+        validate_config(cfg)
+
+        assert cfg.wandb_name == "foo" and cfg.wandb_run_id is None
+
+    def test_wandb_sets_env(self):
+        cfg = DictDefault(
+            {
+                "wandb_project": "foo",
+                "wandb_name": "bar",
+                "wandb_run_id": "bat",
+                "wandb_entity": "baz",
+                "wandb_mode": "online",
+                "wandb_watch": "false",
+                "wandb_log_model": "checkpoint",
+            }
+        )
+
+        validate_config(cfg)
+
+        setup_wandb_env_vars(cfg)
+
+        assert os.environ.get("WANDB_PROJECT", "") == "foo"
+        assert os.environ.get("WANDB_NAME", "") == "bar"
+        assert os.environ.get("WANDB_RUN_ID", "") == "bat"
+        assert os.environ.get("WANDB_ENTITY", "") == "baz"
+        assert os.environ.get("WANDB_MODE", "") == "online"
+        assert os.environ.get("WANDB_WATCH", "") == "false"
+        assert os.environ.get("WANDB_LOG_MODEL", "") == "checkpoint"
+        assert os.environ.get("WANDB_DISABLED", "") != "true"
+
+    def test_wandb_set_disabled(self):
+        cfg = DictDefault({})
+
+        validate_config(cfg)
+
+        setup_wandb_env_vars(cfg)
+
+        assert os.environ.get("WANDB_DISABLED", "") == "true"
+
+        cfg = DictDefault(
+            {
+                "wandb_project": "foo",
+            }
+        )
+
+        validate_config(cfg)
+
+        setup_wandb_env_vars(cfg)
+
+        assert os.environ.get("WANDB_DISABLED", "") != "true"