diff --git a/src/axolotl/cli/config.py b/src/axolotl/cli/config.py index 43c030f4f..cb0eece7f 100644 --- a/src/axolotl/cli/config.py +++ b/src/axolotl/cli/config.py @@ -13,7 +13,6 @@ import torch import yaml from transformers.utils import is_torch_bf16_gpu_available -from axolotl.cli.redaction import redact_sensitive_info from axolotl.integrations.base import PluginManager from axolotl.utils.comet_ import setup_comet_env_vars from axolotl.utils.config import ( @@ -29,6 +28,8 @@ from axolotl.utils.wandb_ import setup_wandb_env_vars LOG = get_logger(__name__) +API_KEY_FIELDS = {"comet_api_key"} + def check_remote_config(config: Union[str, Path]) -> Union[str, Path]: """ @@ -234,12 +235,15 @@ def load_cfg( setup_comet_env_vars(cfg) plugin_set_cfg(cfg) - redacted_cfg = redact_sensitive_info(cfg) - redacted_cfg = {k: v for k, v in redacted_cfg.items() if v is not None} + cfg_to_log = { + k: "[REDACTED]" if k in API_KEY_FIELDS else v + for k, v in cfg.items() + if v is not None + } LOG.info( "config:\n%s", - json.dumps(redacted_cfg, indent=2, default=str, sort_keys=True), + json.dumps(cfg_to_log, indent=2, default=str, sort_keys=True), ) return cfg diff --git a/src/axolotl/cli/redaction.py b/src/axolotl/cli/redaction.py deleted file mode 100644 index 51ce73a9b..000000000 --- a/src/axolotl/cli/redaction.py +++ /dev/null @@ -1,96 +0,0 @@ -"""Utils for redaction of sensitive information in config.""" - -from pathlib import Path -from typing import Any - -import yaml - -# NOTE: Borrowed from the telemetry logic. Should be unified with it once merged. -WHITELIST_PATH = str(Path(__file__).parent / "redaction_whitelist.yaml") - -with open(WHITELIST_PATH, encoding="utf-8") as f: - WHITELIST = yaml.safe_load(f) - - # Send org strings to lowercase since model names are case insensitive - WHITELIST["organizations"] = {org.lower() for org in WHITELIST["organizations"]} - - -# NOTE: Need to keep these up to date with any config schema changes. -FIELDS_TO_REDACT = { - "base_model", - "tokenizer_config", - "base_model_config", - "pretraining_dataset", # NOTE: this field may be a string or a dictionary. - "resume_from_checkpoint", - "hub_model_id", -} -PREFIXES_TO_REDACT = {"wandb_", "comet_", "mlflow_", "gradio_"} -PATH_INDICATORS = {"path", "dir"} - - -def is_whitelisted(value: str) -> bool: - """ - Check if model / dataset / etc. org is in whitelist. - - This logic is borrowed from the telemetry logic. Should be unified with it once - merged. - - Args: - value: Value for one of `FIELDS_WITH_ORGS` ("base_model", etc.). - - Returns: - Boolean indicating whitelist membership. - """ - # NOTE: This membership-checking logic can be improved. - # What happens when a local model path matches a whitelisted org? - parts = value.split("/") - if len(parts) < 2: - return False - org = parts[0] - whitelisted = org.lower() in WHITELIST["organizations"] - - return whitelisted - - -def redact_sensitive_info(properties: dict[str, Any]) -> dict[str, Any]: - """ - Redact properties to remove any paths, API keys, etc., so as to avoid collecting - private or personally identifiable information (PII). - - This logic is borrowed from the telemetry logic. It can be unified with it once - merged. - - Args: - properties: Dictionary of properties to redact. - - Returns: - Properties dictionary with redaction applied. - """ - if not properties: - return {} - - def redact_value(value: Any, key: str = "") -> Any: - """Recursively sanitize values, redacting those with path-like keys""" - if isinstance(key, str) and isinstance(value, str): - # Other redaction special cases - if ( - key in FIELDS_TO_REDACT - or any(prefix in key for prefix in PREFIXES_TO_REDACT) - or any(indicator in key.lower() for indicator in PATH_INDICATORS) - ): - # Fields with whitelisted orgs don't need to be redacted - if not is_whitelisted(value): - return "[REDACTED]" - - # Handle nested values - if isinstance(value, dict): - return {k: redact_value(v, k) for k, v in value.items()} - if isinstance(value, list): - return [redact_value(item) for item in value] - - return value - - # Create new dict with redacted values - redacted = {k: redact_value(v, k) for k, v in properties.items()} - - return redacted diff --git a/src/axolotl/cli/redaction_whitelist.yaml b/src/axolotl/cli/redaction_whitelist.yaml deleted file mode 100644 index 62eb2139e..000000000 --- a/src/axolotl/cli/redaction_whitelist.yaml +++ /dev/null @@ -1,17 +0,0 @@ -organizations: - - "axolotl-ai-co" - - "meta-llama" - - "huggingface" - - "nvidia" - - "facebook" - - "google" - - "microsoft" - - "deepseek-ai" - - "HuggingFaceTB" - - "mistralai" - - "Qwen" - - "unsloth" - - "NousResearch" - - "allenai" - - "amd" - - "tiiuae"