simplifying

This commit is contained in:
Dan Saunders
2025-02-28 10:19:40 -05:00
committed by Dan Saunders
parent 2d36c11264
commit 035e7a2f4c
2 changed files with 16 additions and 16 deletions

View File

@@ -21,7 +21,7 @@ POSTHOG_HOST = "https://app.posthog.com"
POSTHOG_WRITE_KEY = "phc_1kUR0o04oJKKTTeSsIz2Mfm5mpiVsQEf2WOlzljMD7y" POSTHOG_WRITE_KEY = "phc_1kUR0o04oJKKTTeSsIz2Mfm5mpiVsQEf2WOlzljMD7y"
OPT_IN_WARNING_SLEEP_SECONDS = 10 OPT_IN_WARNING_SLEEP_SECONDS = 10
OPT_IN_INFO = ( OPT_IN_WARNING = (
"\nTelemetry is currently disabled by default. If you'd like to help improve " "\nTelemetry is currently disabled by default. If you'd like to help improve "
"Axolotl, consider enabling it by setting AXOLOTL_DO_NOT_TRACK=0 in your environment.\n\n" "Axolotl, consider enabling it by setting AXOLOTL_DO_NOT_TRACK=0 in your environment.\n\n"
"Telemetry data helps us understand:\n" "Telemetry data helps us understand:\n"
@@ -38,14 +38,15 @@ OPT_IN_INFO = (
WHITELIST_PATH = str(Path(__file__).parent / "whitelist.yaml") WHITELIST_PATH = str(Path(__file__).parent / "whitelist.yaml")
# NOTE: Keep these up to date with any config schema changes # NOTE: Need to keep these up to date with any config schema changes
FIELDS_WITH_ORGS = { FIELDS_TO_REDACT = {
"base_model", "base_model",
"tokenizer_config", "tokenizer_config",
"base_model_config", "base_model_config",
"pretraining_dataset", # NOTE: this field may be a string or a dictionary "pretraining_dataset", # NOTE: this field may be a string or a dictionary
"resume_from_checkpoint",
"hub_model_id",
} }
FIELDS_TO_REDACT = {"resume_from_checkpoint", "hub_model_id"}
PREFIXES_TO_REDACT = {"wandb_", "comet_", "mlflow_", "gradio_"} PREFIXES_TO_REDACT = {"wandb_", "comet_", "mlflow_", "gradio_"}
PATH_INDICATORS = {"path", "dir"} PATH_INDICATORS = {"path", "dir"}
@@ -187,7 +188,7 @@ class TelemetryManager:
): ):
# Print opt-in info message for main process only # Print opt-in info message for main process only
if is_main_process(): if is_main_process():
LOG.info(OPT_IN_INFO) LOG.warning(OPT_IN_WARNING)
time.sleep(OPT_IN_WARNING_SLEEP_SECONDS) time.sleep(OPT_IN_WARNING_SLEEP_SECONDS)
return False return False
@@ -224,7 +225,8 @@ class TelemetryManager:
Check if model / dataset / etc. org is in whitelist. Check if model / dataset / etc. org is in whitelist.
Args: Args:
value: Value for one of FIELDS_WITH_ORGS ("base_model", etc.). value: Value for one of `axolotl.telemetry.manager.FIELDS_TO_REDACT`
("base_model", etc.).
Returns: Returns:
Boolean indicating whitelist membership. Boolean indicating whitelist membership.
@@ -259,20 +261,17 @@ class TelemetryManager:
def redact_value(value: Any, key: str = "") -> Any: def redact_value(value: Any, key: str = "") -> Any:
"""Recursively sanitize values, redacting those with path-like keys""" """Recursively sanitize values, redacting those with path-like keys"""
if isinstance(key, str) and isinstance(value, str): if isinstance(key, str) and isinstance(value, str):
# Fields that should be redacted if org is not whitelisted
if key in FIELDS_WITH_ORGS:
if not self._is_whitelisted(value):
return "[REDACTED]"
# Other redaction special cases # Other redaction special cases
if ( if (
key in FIELDS_TO_REDACT key in FIELDS_TO_REDACT
or any(prefix in key for prefix in PREFIXES_TO_REDACT) or any(prefix in key for prefix in PREFIXES_TO_REDACT)
or any(indicator in key.lower() for indicator in PATH_INDICATORS) or any(indicator in key.lower() for indicator in PATH_INDICATORS)
): ):
return "[REDACTED]" # Fields with whitelisted orgs don't need to be redacted
if not self._is_whitelisted(value):
return "[REDACTED]"
# Handle nested structures # Handle nested values
if isinstance(value, dict): if isinstance(value, dict):
return {k: redact_value(v, k) for k, v in value.items()} return {k: redact_value(v, k) for k, v in value.items()}
if isinstance(value, list): if isinstance(value, list):

View File

@@ -106,11 +106,12 @@ def test_telemetry_disabled_for_non_main_process(telemetry_manager_class):
def test_opt_in_info_displayed(telemetry_manager_class): def test_opt_in_info_displayed(telemetry_manager_class):
"""Test that opt-in info is displayed when telemetry is not configured""" """Test that opt-in info is displayed when telemetry is not configured"""
with patch.dict(os.environ, {"RANK": "0"}, clear=True), patch( with patch.dict(os.environ, {"RANK": "0"}, clear=True), patch(
"logging.Logger.info" "logging.Logger.warning"
) as mock_info, patch("time.sleep"): ) as mock_warning, patch("time.sleep"):
telemetry_manager_class() telemetry_manager_class()
info_displayed = False info_displayed = False
for call in mock_info.call_args_list: for call in mock_warning.call_args_list:
print(f"call: {call}")
if "Telemetry is currently disabled by default" in str(call): if "Telemetry is currently disabled by default" in str(call):
info_displayed = True info_displayed = True
break break