just redact api keys
This commit is contained in:
@@ -13,7 +13,6 @@ import torch
|
|||||||
import yaml
|
import yaml
|
||||||
from transformers.utils import is_torch_bf16_gpu_available
|
from transformers.utils import is_torch_bf16_gpu_available
|
||||||
|
|
||||||
from axolotl.cli.redaction import redact_sensitive_info
|
|
||||||
from axolotl.integrations.base import PluginManager
|
from axolotl.integrations.base import PluginManager
|
||||||
from axolotl.utils.comet_ import setup_comet_env_vars
|
from axolotl.utils.comet_ import setup_comet_env_vars
|
||||||
from axolotl.utils.config import (
|
from axolotl.utils.config import (
|
||||||
@@ -29,6 +28,8 @@ from axolotl.utils.wandb_ import setup_wandb_env_vars
|
|||||||
|
|
||||||
LOG = get_logger(__name__)
|
LOG = get_logger(__name__)
|
||||||
|
|
||||||
|
API_KEY_FIELDS = {"comet_api_key"}
|
||||||
|
|
||||||
|
|
||||||
def check_remote_config(config: Union[str, Path]) -> Union[str, Path]:
|
def check_remote_config(config: Union[str, Path]) -> Union[str, Path]:
|
||||||
"""
|
"""
|
||||||
@@ -234,12 +235,15 @@ def load_cfg(
|
|||||||
setup_comet_env_vars(cfg)
|
setup_comet_env_vars(cfg)
|
||||||
plugin_set_cfg(cfg)
|
plugin_set_cfg(cfg)
|
||||||
|
|
||||||
redacted_cfg = redact_sensitive_info(cfg)
|
cfg_to_log = {
|
||||||
redacted_cfg = {k: v for k, v in redacted_cfg.items() if v is not None}
|
k: "[REDACTED]" if k in API_KEY_FIELDS else v
|
||||||
|
for k, v in cfg.items()
|
||||||
|
if v is not None
|
||||||
|
}
|
||||||
|
|
||||||
LOG.info(
|
LOG.info(
|
||||||
"config:\n%s",
|
"config:\n%s",
|
||||||
json.dumps(redacted_cfg, indent=2, default=str, sort_keys=True),
|
json.dumps(cfg_to_log, indent=2, default=str, sort_keys=True),
|
||||||
)
|
)
|
||||||
|
|
||||||
return cfg
|
return cfg
|
||||||
|
|||||||
@@ -1,96 +0,0 @@
|
|||||||
"""Utils for redaction of sensitive information in config."""
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
import yaml
|
|
||||||
|
|
||||||
# NOTE: Borrowed from the telemetry logic. Should be unified with it once merged.
WHITELIST_PATH = str(Path(__file__).parent / "redaction_whitelist.yaml")

# The whitelist is read once, when this module is imported.
with open(WHITELIST_PATH, encoding="utf-8") as f:
    WHITELIST = yaml.safe_load(f)

# Model names are case insensitive, so org strings are stored lowercased.
WHITELIST["organizations"] = {
    org_name.lower() for org_name in WHITELIST["organizations"]
}


# NOTE: Need to keep these up to date with any config schema changes.
# Keys whose string values are redacted regardless of how the key is spelled.
FIELDS_TO_REDACT = {
    "base_model",
    "tokenizer_config",
    "base_model_config",
    "pretraining_dataset",  # NOTE: this field may be a string or a dictionary.
    "resume_from_checkpoint",
    "hub_model_id",
}
# Key fragments identifying integration settings (matched as substrings of the key).
PREFIXES_TO_REDACT = {"wandb_", "comet_", "mlflow_", "gradio_"}
# Key fragments identifying filesystem locations (matched against the lowercased key).
PATH_INDICATORS = {"path", "dir"}
|
|
||||||
|
|
||||||
|
|
||||||
def is_whitelisted(value: str) -> bool:
    """
    Check if model / dataset / etc. org is in whitelist.

    This logic is borrowed from the telemetry logic. Should be unified with it once
    merged.

    Only values of the exact form `org/name` are considered. Values without a
    slash, with an empty org or name, or with more than one slash (e.g. a relative
    local path such as `google/private-run/checkpoint-10`) are never whitelisted,
    so they remain subject to redaction.

    Args:
        value: String value of a field that is a candidate for redaction
            ("base_model", etc.).

    Returns:
        Boolean indicating whitelist membership.
    """
    org, slash, name = value.partition("/")

    # Reject anything that is not exactly two non-empty segments; a deeper path
    # whose first directory merely happens to match an org must not be exempted.
    if not slash or not org or not name or "/" in name:
        return False

    # Whitelisted orgs are stored lowercased, so compare case-insensitively.
    return org.lower() in WHITELIST["organizations"]
|
|
||||||
|
|
||||||
|
|
||||||
def redact_sensitive_info(properties: dict[str, Any]) -> dict[str, Any]:
    """
    Redact properties to remove any paths, API keys, etc., so as to avoid collecting
    private or personally identifiable information (PII).

    This logic is borrowed from the telemetry logic. It can be unified with it once
    merged.

    A string value is replaced with "[REDACTED]" when its key is in
    `FIELDS_TO_REDACT`, contains one of `PREFIXES_TO_REDACT`, or (after
    lowercasing) contains one of `PATH_INDICATORS`, unless `is_whitelisted`
    accepts the value. Dicts and lists are processed recursively; strings inside
    a list are judged by the key of the list that holds them.

    Args:
        properties: Dictionary of properties to redact.

    Returns:
        New properties dictionary with redaction applied. An empty dictionary is
        returned when `properties` is empty or None.
    """
    if not properties:
        return {}

    def is_sensitive_key(key: Any) -> bool:
        """Return True if string values stored under `key` should be redacted."""
        if not isinstance(key, str):
            return False
        return (
            key in FIELDS_TO_REDACT
            # Substring match (not a strict prefix test): errs toward redacting.
            or any(prefix in key for prefix in PREFIXES_TO_REDACT)
            or any(indicator in key.lower() for indicator in PATH_INDICATORS)
        )

    def redact_value(value: Any, key: Any = "") -> Any:
        """Recursively sanitize values, redacting strings held under sensitive keys."""
        if isinstance(value, str):
            # Fields with whitelisted orgs don't need to be redacted
            if is_sensitive_key(key) and not is_whitelisted(value):
                return "[REDACTED]"
            return value

        # Handle nested values
        if isinstance(value, dict):
            return {k: redact_value(v, k) for k, v in value.items()}
        if isinstance(value, list):
            # Propagate the parent key so strings in a list under a sensitive key
            # (e.g. a list of directories) are redacted like a lone string would be.
            return [redact_value(item, key) for item in value]

        return value

    # Create new dict with redacted values
    return {k: redact_value(v, k) for k, v in properties.items()}
|
|
||||||
@@ -1,17 +0,0 @@
|
|||||||
organizations:
|
|
||||||
- "axolotl-ai-co"
|
|
||||||
- "meta-llama"
|
|
||||||
- "huggingface"
|
|
||||||
- "nvidia"
|
|
||||||
- "facebook"
|
|
||||||
- "google"
|
|
||||||
- "microsoft"
|
|
||||||
- "deepseek-ai"
|
|
||||||
- "HuggingFaceTB"
|
|
||||||
- "mistralai"
|
|
||||||
- "Qwen"
|
|
||||||
- "unsloth"
|
|
||||||
- "NousResearch"
|
|
||||||
- "allenai"
|
|
||||||
- "amd"
|
|
||||||
- "tiiuae"
|
|
||||||
Reference in New Issue
Block a user