diff --git a/src/axolotl/telemetry/manager.py b/src/axolotl/telemetry/manager.py index 06699ebff..7f2d6da44 100644 --- a/src/axolotl/telemetry/manager.py +++ b/src/axolotl/telemetry/manager.py @@ -42,6 +42,14 @@ ENABLED_WARNING = ( WHITELIST_PATH = str(Path(__file__).parent / "whitelist.yaml") +FIELDS_WITH_ORGS = [ + "base_model", + "tokenizer_config", + "base_model_config", +] +FIELDS_TO_REDACT = ["resume_from_checkpoint", "hub_model_id"] +PREFIXES_TO_REDACT = ["wandb_", "comet_", "mlflow_", "gradio_"] + class TelemetryManager: """Manages telemetry collection and transmission""" @@ -154,13 +162,14 @@ class TelemetryManager: def _redact_paths(self, properties: dict[str, Any]) -> dict[str, Any]: """ Redact properties to remove any paths, so as to avoid inadvertently collecting - private or personally identifiable information (PII). + private or personally identifiable information (PII). We also remove + information related to Wandb, MLflow, etc. configuration. Args: properties: Dictionary of properties to redact. Returns: - Properties dictionary with paths redacted. + Properties dictionary with redaction applied. """ if not properties: return {} @@ -170,15 +179,19 @@ class TelemetryManager: def redact_value(value: Any, key: str = "") -> Any: """Recursively sanitize values, redacting those with path-like keys""" - # Special case: base_model should be redacted if org is not whitelisted - if key == "base_model": - org = value.split("/")[0] - if org not in self.whitelist["organizations"]: - return "[REDACTED]" + if isinstance(key, str) and isinstance(value, str): + # Fields that should be redacted if org is not whitelisted + if key in FIELDS_WITH_ORGS: + org = value.split("/")[0] + if org not in self.whitelist["organizations"]: + return "[REDACTED]" - if isinstance(value, str): - # If the key suggests this is a path, redact it - if any(indicator in key.lower() for indicator in path_indicators): + # Other redaction special cases + if ( + key in FIELDS_TO_REDACT + or any(prefix in key for prefix in PREFIXES_TO_REDACT) + or any(indicator in key.lower() for indicator in path_indicators) + ): return "[REDACTED]" # Handle nested structures @@ -231,17 +244,21 @@ class TelemetryManager: # Wrap PostHog errors in try / except to not raise errors during Axolotl usage try: - LOG.warning(f"*** Sending telemetry for {event_type} ***") - # Send event via PostHog posthog.capture( distinct_id=self.run_id, event=event_type, properties=properties, + disable_geoip=True, ) except Exception as e: # pylint: disable=broad-exception-caught LOG.warning(f"Failed to send telemetry event: {e}") + # Additionally, send system info telemetry when loading config. + # NOTE: Is this the best place for this? + if event_type == "config-loaded": + self.send_system_info() + def send_system_info(self): """Helper method for sending system info""" self.send_event(event_type="system-info", properties=self.system_info) diff --git a/tests/conftest.py b/tests/conftest.py index 8ab8fd6a4..12014e78e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,4 @@ -""" -shared pytest fixtures -""" +"""Shared pytest fixtures""" import functools import importlib @@ -559,3 +557,9 @@ def test_load_fixtures( download_llama2_model_fixture, ): pass + + +@pytest.fixture(autouse=True) +def disable_telemetry(monkeypatch): + monkeypatch.setenv("AXOLOTL_DO_NOT_TRACK", "1") + yield diff --git a/tests/telemetry/conftest.py b/tests/telemetry/conftest.py new file mode 100644 index 000000000..0055eeb52 --- /dev/null +++ b/tests/telemetry/conftest.py @@ -0,0 +1,9 @@ +"""Shared pytest fixtures for telemetry tests.""" + +import pytest + + +@pytest.fixture(autouse=True) +def disable_telemetry(monkeypatch): + monkeypatch.delenv("AXOLOTL_DO_NOT_TRACK") + yield diff --git a/tests/telemetry/test_manager.py b/tests/telemetry/test_manager.py index 4608b337c..b813b8892 100644 --- a/tests/telemetry/test_manager.py +++ b/tests/telemetry/test_manager.py @@ -146,7 +146,7 @@ def test_is_whitelisted(manager, mock_whitelist): def test_system_info_collection(manager): """Test system information collection""" - system_info = manager.system_info + system_info = manager._get_system_info() # Check essential keys assert "os" in system_info