just redact api keys

This commit is contained in:
Dan Saunders
2025-06-24 14:19:00 -04:00
parent 700791deb9
commit b594f18f6e
3 changed files with 8 additions and 117 deletions

View File

@@ -13,7 +13,6 @@ import torch
import yaml
from transformers.utils import is_torch_bf16_gpu_available
from axolotl.cli.redaction import redact_sensitive_info
from axolotl.integrations.base import PluginManager
from axolotl.utils.comet_ import setup_comet_env_vars
from axolotl.utils.config import (
@@ -29,6 +28,8 @@ from axolotl.utils.wandb_ import setup_wandb_env_vars
LOG = get_logger(__name__)
API_KEY_FIELDS = {"comet_api_key"}
def check_remote_config(config: Union[str, Path]) -> Union[str, Path]:
"""
@@ -234,12 +235,15 @@ def load_cfg(
setup_comet_env_vars(cfg)
plugin_set_cfg(cfg)
redacted_cfg = redact_sensitive_info(cfg)
redacted_cfg = {k: v for k, v in redacted_cfg.items() if v is not None}
cfg_to_log = {
k: "[REDACTED]" if k in API_KEY_FIELDS else v
for k, v in cfg.items()
if v is not None
}
LOG.info(
"config:\n%s",
json.dumps(redacted_cfg, indent=2, default=str, sort_keys=True),
json.dumps(cfg_to_log, indent=2, default=str, sort_keys=True),
)
return cfg

View File

@@ -1,96 +0,0 @@
"""Utils for redaction of sensitive information in config."""
from pathlib import Path
from typing import Any
import yaml
# NOTE: Borrowed from the telemetry logic. Should be unified with it once merged.
# Keys whose string values are treated as sensitive wherever they appear.
# NOTE: Need to keep these up to date with any config schema changes.
FIELDS_TO_REDACT = {
    "base_model",
    "base_model_config",
    "tokenizer_config",
    "pretraining_dataset",  # NOTE: this field may be a string or a dictionary.
    "resume_from_checkpoint",
    "hub_model_id",
}

# Key fragments that mark a field as belonging to an external service.
PREFIXES_TO_REDACT = {"wandb_", "comet_", "mlflow_", "gradio_"}

# Key fragments that mark a field as holding a filesystem location.
PATH_INDICATORS = {"path", "dir"}

# The whitelist file sits next to this module.
WHITELIST_PATH = str(Path(__file__).with_name("redaction_whitelist.yaml"))

with open(WHITELIST_PATH, encoding="utf-8") as f:
    WHITELIST = yaml.safe_load(f)

# Model names are case insensitive, so organizations are stored lower-cased.
WHITELIST["organizations"] = set(map(str.lower, WHITELIST["organizations"]))
def is_whitelisted(value: str) -> bool:
    """
    Report whether the organization part of `value` is in the whitelist.

    The organization is the text before the first "/" in `value`; it is
    lower-cased before being looked up in `WHITELIST["organizations"]`. A value
    that contains no "/" has no organization part and is never whitelisted.

    This logic is borrowed from the telemetry logic. Should be unified with it once
    merged.

    Args:
        value: String value of a redaction candidate ("base_model", etc.).

    Returns:
        Boolean indicating whitelist membership.
    """
    # NOTE: This membership-checking logic can be improved.
    # What happens when a local model path matches a whitelisted org?
    org, separator, _ = value.partition("/")
    if not separator:
        return False

    return org.lower() in WHITELIST["organizations"]
def redact_sensitive_info(properties: dict[str, Any]) -> dict[str, Any]:
    """
    Redact properties to remove any paths, API keys, etc., so as to avoid collecting
    private or personally identifiable information (PII).

    A string value is replaced with "[REDACTED]" when its key is sensitive and
    the value is not whitelisted (see `is_whitelisted`). A key is sensitive when
    it is listed in `FIELDS_TO_REDACT`, contains one of `PREFIXES_TO_REDACT`, or
    (case-insensitively) contains one of `PATH_INDICATORS`. Dictionaries and
    lists are walked recursively; strings inside a list are judged by the key
    the list is stored under, so a list of paths is redacted just like a single
    path would be. The input is not modified.

    This logic is borrowed from the telemetry logic. It can be unified with it once
    merged.

    Args:
        properties: Dictionary of properties to redact.

    Returns:
        Properties dictionary with redaction applied. Empty dictionary if
        `properties` is empty or `None`.
    """
    if not properties:
        return {}

    def is_sensitive_key(key: str) -> bool:
        """Tell whether string values stored under `key` must be redacted."""
        return (
            key in FIELDS_TO_REDACT
            # NOTE: substring match, so e.g. "my_wandb_project" also counts.
            or any(prefix in key for prefix in PREFIXES_TO_REDACT)
            or any(indicator in key.lower() for indicator in PATH_INDICATORS)
        )

    def redact_value(value: Any, key: str = "") -> Any:
        """Recursively sanitize values, redacting those with sensitive keys."""
        if isinstance(value, dict):
            # Each nested entry is judged by its own key.
            return {k: redact_value(v, k) for k, v in value.items()}

        if isinstance(value, list):
            # Items have no key of their own; they inherit the list's key so
            # that strings inside a sensitive list do not slip through.
            return [redact_value(item, key) for item in value]

        if (
            isinstance(value, str)
            and isinstance(key, str)
            and is_sensitive_key(key)
            # Fields with whitelisted orgs don't need to be redacted
            and not is_whitelisted(value)
        ):
            return "[REDACTED]"

        return value

    # Build a new dict; the caller's mapping is left untouched.
    return {k: redact_value(v, k) for k, v in properties.items()}

View File

@@ -1,17 +0,0 @@
organizations:
- "axolotl-ai-co"
- "meta-llama"
- "huggingface"
- "nvidia"
- "facebook"
- "google"
- "microsoft"
- "deepseek-ai"
- "HuggingFaceTB"
- "mistralai"
- "Qwen"
- "unsloth"
- "NousResearch"
- "allenai"
- "amd"
- "tiiuae"