improved redaction, send system info during model config load telemetry, etc.
This commit is contained in:
@@ -42,6 +42,14 @@ ENABLED_WARNING = (
|
|||||||
|
|
||||||
WHITELIST_PATH = str(Path(__file__).parent / "whitelist.yaml")
|
WHITELIST_PATH = str(Path(__file__).parent / "whitelist.yaml")
|
||||||
|
|
||||||
|
# Config keys whose string values are split on "/" to obtain an organization;
# redact_value replaces the value with "[REDACTED]" when that organization is
# not listed in the whitelist's "organizations" entry.
FIELDS_WITH_ORGS = [
|
||||||
|
"base_model",
|
||||||
|
"tokenizer_config",
|
||||||
|
"base_model_config",
|
||||||
|
]
|
||||||
|
# Config keys whose string values are always replaced with "[REDACTED]".
FIELDS_TO_REDACT = ["resume_from_checkpoint", "hub_model_id"]
|
||||||
|
# Key fragments for integrations whose string settings are redacted.
# NOTE(review): redact_value tests these with `prefix in key` (substring
# containment), not `key.startswith(prefix)`, so a fragment matches anywhere in
# the key -- confirm whether that is intended.
PREFIXES_TO_REDACT = ["wandb_", "comet_", "mlflow_", "gradio_"]
|
||||||
|
|
||||||
|
|
||||||
class TelemetryManager:
|
class TelemetryManager:
|
||||||
"""Manages telemetry collection and transmission"""
|
"""Manages telemetry collection and transmission"""
|
||||||
@@ -154,13 +162,14 @@ class TelemetryManager:
|
|||||||
def _redact_paths(self, properties: dict[str, Any]) -> dict[str, Any]:
|
def _redact_paths(self, properties: dict[str, Any]) -> dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Redact properties to remove any paths, so as to avoid inadvertently collecting
|
Redact properties to remove any paths, so as to avoid inadvertently collecting
|
||||||
private or personally identifiable information (PII).
|
private or personally identifiable information (PII). We also remove
|
||||||
|
information related to Wandb, MLflow, etc. configuration.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
properties: Dictionary of properties to redact.
|
properties: Dictionary of properties to redact.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Properties dictionary with paths redacted.
|
Properties dictionary with redaction applied.
|
||||||
"""
|
"""
|
||||||
if not properties:
|
if not properties:
|
||||||
return {}
|
return {}
|
||||||
@@ -170,15 +179,19 @@ class TelemetryManager:
|
|||||||
|
|
||||||
def redact_value(value: Any, key: str = "") -> Any:
|
def redact_value(value: Any, key: str = "") -> Any:
|
||||||
"""Recursively sanitize values, redacting those with path-like keys"""
|
"""Recursively sanitize values, redacting those with path-like keys"""
|
||||||
# Special case: base_model should be redacted if org is not whitelisted
|
if isinstance(key, str) and isinstance(value, str):
|
||||||
if key == "base_model":
|
# Fields that should be redacted if org is not whitelisted
|
||||||
org = value.split("/")[0]
|
if key in FIELDS_WITH_ORGS:
|
||||||
if org not in self.whitelist["organizations"]:
|
org = value.split("/")[0]
|
||||||
return "[REDACTED]"
|
if org not in self.whitelist["organizations"]:
|
||||||
|
return "[REDACTED]"
|
||||||
|
|
||||||
if isinstance(value, str):
|
# Other redaction special cases
|
||||||
# If the key suggests this is a path, redact it
|
if (
|
||||||
if any(indicator in key.lower() for indicator in path_indicators):
|
key in FIELDS_TO_REDACT
|
||||||
|
or any(prefix in key for prefix in PREFIXES_TO_REDACT)
|
||||||
|
or any(indicator in key.lower() for indicator in path_indicators)
|
||||||
|
):
|
||||||
return "[REDACTED]"
|
return "[REDACTED]"
|
||||||
|
|
||||||
# Handle nested structures
|
# Handle nested structures
|
||||||
@@ -231,17 +244,21 @@ class TelemetryManager:
|
|||||||
|
|
||||||
# Wrap PostHog errors in try / except to not raise errors during Axolotl usage
|
# Wrap PostHog errors in try / except to not raise errors during Axolotl usage
|
||||||
try:
|
try:
|
||||||
LOG.warning(f"*** Sending telemetry for {event_type} ***")
|
|
||||||
|
|
||||||
# Send event via PostHog
|
# Send event via PostHog
|
||||||
posthog.capture(
|
posthog.capture(
|
||||||
distinct_id=self.run_id,
|
distinct_id=self.run_id,
|
||||||
event=event_type,
|
event=event_type,
|
||||||
properties=properties,
|
properties=properties,
|
||||||
|
disable_geoip=True,
|
||||||
)
|
)
|
||||||
except Exception as e: # pylint: disable=broad-exception-caught
|
except Exception as e: # pylint: disable=broad-exception-caught
|
||||||
LOG.warning(f"Failed to send telemetry event: {e}")
|
LOG.warning(f"Failed to send telemetry event: {e}")
|
||||||
|
|
||||||
|
# Additionally, send system info telemetry when loading config.
|
||||||
|
# NOTE: Is this the best place for this?
|
||||||
|
if event_type == "config-loaded":
|
||||||
|
self.send_system_info()
|
||||||
|
|
||||||
def send_system_info(self):
|
def send_system_info(self):
|
||||||
"""Helper method for sending system info"""
|
"""Helper method for sending system info"""
|
||||||
self.send_event(event_type="system-info", properties=self.system_info)
|
self.send_event(event_type="system-info", properties=self.system_info)
|
||||||
|
|||||||
@@ -1,6 +1,4 @@
|
|||||||
"""
|
"""Shared pytest fixtures"""
|
||||||
shared pytest fixtures
|
|
||||||
"""
|
|
||||||
|
|
||||||
import functools
|
import functools
|
||||||
import importlib
|
import importlib
|
||||||
@@ -559,3 +557,9 @@ def test_load_fixtures(
|
|||||||
download_llama2_model_fixture,
|
download_llama2_model_fixture,
|
||||||
):
|
):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
# Autouse fixture: sets AXOLOTL_DO_NOT_TRACK=1 via monkeypatch for every test
# covered by this conftest. Presumably this opts the test run out of telemetry
# (the code that reads the variable is not visible here -- confirm).
# monkeypatch undoes the change when the test finishes.
@pytest.fixture(autouse=True)
|
||||||
|
def disable_telemetry(monkeypatch):
|
||||||
|
monkeypatch.setenv("AXOLOTL_DO_NOT_TRACK", "1")
|
||||||
|
# No teardown code follows the yield.
yield
|
||||||
|
|||||||
9
tests/telemetry/conftest.py
Normal file
9
tests/telemetry/conftest.py
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
"""Shared pytest fixtures for telemetry tests."""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
def disable_telemetry(monkeypatch):
    """Remove AXOLOTL_DO_NOT_TRACK from the environment for each telemetry test.

    Despite its name, this fixture clears the opt-out variable rather than
    setting it. The name is kept unchanged so that any existing fixture
    resolution by name continues to work. monkeypatch restores the original
    environment when the test finishes.
    """
    # raising=False: the variable may legitimately be unset (nothing guarantees
    # that another fixture or the shell exported it first), and by default
    # delenv raises KeyError for a missing variable, which would error every
    # test in this directory during setup.
    monkeypatch.delenv("AXOLOTL_DO_NOT_TRACK", raising=False)
    yield
|
||||||
@@ -146,7 +146,7 @@ def test_is_whitelisted(manager, mock_whitelist):
|
|||||||
|
|
||||||
def test_system_info_collection(manager):
|
def test_system_info_collection(manager):
|
||||||
"""Test system information collection"""
|
"""Test system information collection"""
|
||||||
system_info = manager.system_info
|
system_info = manager._get_system_info()
|
||||||
|
|
||||||
# Check essential keys
|
# Check essential keys
|
||||||
assert "os" in system_info
|
assert "os" in system_info
|
||||||
|
|||||||
Reference in New Issue
Block a user