improved redaction, send system info during model config load telemetry, etc.

This commit is contained in:
Dan Saunders
2025-02-24 15:39:02 +00:00
parent 9dd1092f8f
commit 71ae6f9f87
4 changed files with 46 additions and 16 deletions

View File

@@ -42,6 +42,14 @@ ENABLED_WARNING = (
WHITELIST_PATH = str(Path(__file__).parent / "whitelist.yaml") WHITELIST_PATH = str(Path(__file__).parent / "whitelist.yaml")
# Config keys whose string values are treated as "<org>/<name>" identifiers.
# During redaction the text before the first "/" is looked up under the
# whitelist's "organizations" entry; values from other orgs become "[REDACTED]".
FIELDS_WITH_ORGS = ["base_model", "tokenizer_config", "base_model_config"]

# Config keys whose string values are always replaced with "[REDACTED]".
FIELDS_TO_REDACT = [
    "resume_from_checkpoint",
    "hub_model_id",
]

# Key fragments for third-party integrations. Any key that contains one of
# these has its string value replaced with "[REDACTED]".
PREFIXES_TO_REDACT = [
    "wandb_",
    "comet_",
    "mlflow_",
    "gradio_",
]
class TelemetryManager: class TelemetryManager:
"""Manages telemetry collection and transmission""" """Manages telemetry collection and transmission"""
@@ -154,13 +162,14 @@ class TelemetryManager:
def _redact_paths(self, properties: dict[str, Any]) -> dict[str, Any]: def _redact_paths(self, properties: dict[str, Any]) -> dict[str, Any]:
""" """
Redact properties to remove any paths, so as to avoid inadvertently collecting Redact properties to remove any paths, so as to avoid inadvertently collecting
private or personally identifiable information (PII). private or personally identifiable information (PII). We also remove
information related to Wandb, MLflow, etc. configuration.
Args: Args:
properties: Dictionary of properties to redact. properties: Dictionary of properties to redact.
Returns: Returns:
Properties dictionary with paths redacted. Properties dictionary with redaction applied.
""" """
if not properties: if not properties:
return {} return {}
@@ -170,15 +179,19 @@ class TelemetryManager:
def redact_value(value: Any, key: str = "") -> Any: def redact_value(value: Any, key: str = "") -> Any:
"""Recursively sanitize values, redacting those with path-like keys""" """Recursively sanitize values, redacting those with path-like keys"""
# Special case: base_model should be redacted if org is not whitelisted if isinstance(key, str) and isinstance(value, str):
if key == "base_model": # Fields that should be redacted if org is not whitelisted
org = value.split("/")[0] if key in FIELDS_WITH_ORGS:
if org not in self.whitelist["organizations"]: org = value.split("/")[0]
return "[REDACTED]" if org not in self.whitelist["organizations"]:
return "[REDACTED]"
if isinstance(value, str): # Other redaction special cases
# If the key suggests this is a path, redact it if (
if any(indicator in key.lower() for indicator in path_indicators): key in FIELDS_TO_REDACT
or any(prefix in key for prefix in PREFIXES_TO_REDACT)
or any(indicator in key.lower() for indicator in path_indicators)
):
return "[REDACTED]" return "[REDACTED]"
# Handle nested structures # Handle nested structures
@@ -231,17 +244,21 @@ class TelemetryManager:
# Wrap PostHog errors in try / except to not raise errors during Axolotl usage # Wrap PostHog errors in try / except to not raise errors during Axolotl usage
try: try:
LOG.warning(f"*** Sending telemetry for {event_type} ***")
# Send event via PostHog # Send event via PostHog
posthog.capture( posthog.capture(
distinct_id=self.run_id, distinct_id=self.run_id,
event=event_type, event=event_type,
properties=properties, properties=properties,
disable_geoip=True,
) )
except Exception as e: # pylint: disable=broad-exception-caught except Exception as e: # pylint: disable=broad-exception-caught
LOG.warning(f"Failed to send telemetry event: {e}") LOG.warning(f"Failed to send telemetry event: {e}")
# Additionally, send system info telemetry when loading config.
# NOTE: Is this the best place for this?
if event_type == "config-loaded":
self.send_system_info()
def send_system_info(self): def send_system_info(self):
"""Helper method for sending system info""" """Helper method for sending system info"""
self.send_event(event_type="system-info", properties=self.system_info) self.send_event(event_type="system-info", properties=self.system_info)

View File

@@ -1,6 +1,4 @@
""" """Shared pytest fixtures"""
shared pytest fixtures
"""
import functools import functools
import importlib import importlib
@@ -559,3 +557,9 @@ def test_load_fixtures(
download_llama2_model_fixture, download_llama2_model_fixture,
): ):
pass pass
@pytest.fixture(autouse=True)
def disable_telemetry(monkeypatch):
    """Set ``AXOLOTL_DO_NOT_TRACK=1`` in the environment for every test.

    The fixture is autouse, so tests do not need to request it. The change is
    made through ``monkeypatch``, which undoes it when the test finishes.
    """
    # NOTE(review): presumably this variable is what the telemetry code checks
    # to stay disabled -- confirm against the telemetry manager.
    monkeypatch.setenv(name="AXOLOTL_DO_NOT_TRACK", value="1")
    yield

View File

@@ -0,0 +1,9 @@
"""Shared pytest fixtures for telemetry tests."""
import pytest
@pytest.fixture(autouse=True)
def disable_telemetry(monkeypatch):
    """Remove ``AXOLOTL_DO_NOT_TRACK`` from the environment for every test here.

    NOTE(review): despite its name, this fixture *removes* the opt-out
    variable. The name is presumably kept so that it overrides a same-named
    autouse fixture defined at a higher level -- confirm before renaming.
    """
    # raising=False: the variable is not guaranteed to be set when this runs
    # (e.g. a clean CI environment). With the default raising=True, delenv
    # would raise KeyError during fixture setup and fail every test.
    monkeypatch.delenv("AXOLOTL_DO_NOT_TRACK", raising=False)
    yield

View File

@@ -146,7 +146,7 @@ def test_is_whitelisted(manager, mock_whitelist):
def test_system_info_collection(manager): def test_system_info_collection(manager):
"""Test system information collection""" """Test system information collection"""
system_info = manager.system_info system_info = manager._get_system_info()
# Check essential keys # Check essential keys
assert "os" in system_info assert "os" in system_info