updated sanitization logic, tests

This commit is contained in:
Dan Saunders
2025-02-24 20:05:55 +00:00
parent 1edd6b9524
commit d8b0522ea0
4 changed files with 666 additions and 97 deletions

View File

@@ -1,6 +1,7 @@
"""Telemetry utilities for exception and traceback information."""
import logging
import os
import re
import traceback
from functools import wraps
@@ -16,13 +17,16 @@ ERROR_HANDLED = False
def sanitize_stack_trace(stack_trace: str) -> str:
"""
Remove personal information from stack trace messages while keeping Axolotl codepaths.
Remove personal information from stack trace messages while keeping Python package codepaths.
This function identifies Python packages by looking for common patterns in virtual environment
and site-packages directories, preserving the package path while removing user-specific paths.
Args:
stack_trace: The original stack trace string.
Returns:
A sanitized version of the stack trace with only axolotl paths preserved.
A sanitized version of the stack trace with Python package paths preserved.
"""
# Split the stack trace into lines to process each file path separately
lines = stack_trace.split("\n")
@@ -31,23 +35,66 @@ def sanitize_stack_trace(stack_trace: str) -> str:
# Regular expression to find file paths in the stack trace
path_pattern = re.compile(r'(?:File ")(.*?)(?:")')
# Regular expression to identify paths in site-packages or dist-packages
# This matches path segments like "site-packages/package_name" or "dist-packages/package_name"
site_packages_pattern = re.compile(
r"(?:site-packages|dist-packages)[/\\]([\w\-\.]+)"
)
# Additional common virtual environment patterns
venv_lib_pattern = re.compile(
r"(?:lib|Lib)[/\\](?:python\d+(?:\.\d+)?[/\\])?(?:site-packages|dist-packages)[/\\]([\w\-\.]+)"
)
for line in lines:
# Check if this line contains a file path
path_match = path_pattern.search(line)
if path_match:
full_path = path_match.group(1)
sanitized_path = ""
if "axolotl/" in full_path:
# Keep only the 'axolotl' part and onward
axolotl_idx = full_path.rfind("axolotl/")
if axolotl_idx >= 0:
# Replace the original path with the sanitized one
sanitized_path = full_path[axolotl_idx:]
line = line.replace(full_path, sanitized_path)
# Try to match site-packages pattern
site_packages_match = site_packages_pattern.search(full_path)
venv_lib_match = venv_lib_pattern.search(full_path)
if site_packages_match:
# Find the index where the matched pattern starts
idx = full_path.find("site-packages")
if idx == -1:
idx = full_path.find("dist-packages")
# Keep from 'site-packages' onward
if idx >= 0:
sanitized_path = full_path[idx:]
elif venv_lib_match:
# For other virtual environment patterns, find the package directory
match_idx = venv_lib_match.start(1)
if match_idx > 0:
# Keep from the package name onward
package_name = venv_lib_match.group(1)
idx = full_path.rfind(
package_name, 0, match_idx + len(package_name)
)
if idx >= 0:
sanitized_path = full_path[idx:]
# If we couldn't identify a package pattern but path contains 'axolotl'
elif "axolotl" in full_path:
idx = full_path.rfind("axolotl")
if idx >= 0:
sanitized_path = full_path[idx:]
# Apply the sanitization to the line
if sanitized_path:
line = line.replace(full_path, sanitized_path)
else:
# For non-axolotl paths, replace with an empty string or a placeholder
line = line.replace(full_path, "")
# If we couldn't identify a package pattern, just keep the filename
filename = os.path.basename(full_path)
if filename:
line = line.replace(full_path, filename)
else:
line = line.replace(full_path, "")
sanitized_lines.append(line)
@@ -72,6 +119,7 @@ def send_errors(func: Callable) -> Callable:
@wraps(func)
def wrapper(*args, **kwargs) -> Any:
telemetry_manager = TelemetryManager.get_instance()
if not telemetry_manager.enabled:
return func(*args, **kwargs)
@@ -79,7 +127,7 @@ def send_errors(func: Callable) -> Callable:
return func(*args, **kwargs)
except Exception as exception:
# Only track if we're not already handling an error. This prevents us from
# capturing an error more than once in nested decorated function calls.
# capturing an error more than once in nested decorated function calls.
global ERROR_HANDLED # pylint: disable=global-statement
if not ERROR_HANDLED:
ERROR_HANDLED = True

View File

@@ -4,6 +4,7 @@ import atexit
import logging
import os
import platform
import re
import time
import uuid
from dataclasses import dataclass
@@ -122,6 +123,9 @@ class TelemetryManager:
axolotl_do_not_track = os.getenv("AXOLOTL_DO_NOT_TRACK")
do_not_track = os.getenv("DO_NOT_TRACK")
# If explicitly enabled, we'll disable the telemetry warning message
explicit_enabled = axolotl_do_not_track in ["0", "false"]
if axolotl_do_not_track is None:
axolotl_do_not_track = "0"
@@ -134,9 +138,6 @@ class TelemetryManager:
"true",
) and do_not_track.lower() not in ("1", "true")
# If explicitly enabled, we'll disable the telemetry warning message
explicit_enabled = axolotl_do_not_track in ["0", "false"]
return enabled, explicit_enabled
def _load_whitelist(self) -> dict:
@@ -145,7 +146,7 @@ class TelemetryManager:
return yaml.safe_load(f)
def _is_whitelisted(self, base_model: str) -> bool:
"""Check if model/org is in whitelist"""
"""Check if model org is in whitelist"""
if not base_model:
return False
@@ -159,9 +160,66 @@ class TelemetryManager:
posthog.project_api_key = POSTHOG_WRITE_KEY
posthog.host = self.config.host
def _sanitize_path(self, path: str) -> str:
    """
    Strip directory components from a path, keeping only its final component.

    Args:
        path: File path that may contain user-specific directories.

    Returns:
        The last component of `path` as reported by `pathlib.Path.name`.
    """
    final_component = Path(path).name
    return final_component
def _sanitize_properties(self, properties: dict[str, Any]) -> dict[str, Any]:
    """
    Sanitize properties to remove any personally identifiable information such as:
    - Cloud storage locations (S3, GCS, Azure): replaced by "[REDACTED_CLOUD]"
    - URLs / Links: replaced by "[REDACTED_URL]"
    - File paths (Unix and Windows): reduced to their final component

    Strings are sanitized wherever they appear as values inside nested dicts, lists
    and tuples. Dictionary keys and non-string values are passed through unchanged.
    The input mapping is not mutated.

    Args:
        properties: Dictionary of properties to sanitize.

    Returns:
        Sanitized properties dictionary (empty if `properties` is empty or None).
    """
    if not properties:
        return {}

    # Characters that may belong to a URI; stops at whitespace, quotes and angle
    # brackets so surrounding text / markup is left intact.
    uri_char = r"""[^\s"'<>]"""

    # Whole cloud URIs are matched (not just the scheme) so that bucket / container
    # names are never left behind after redaction.
    cloud_pattern = re.compile(
        rf"\b(?:s3|gs|azure)://{uri_char}*"
        rf"|{uri_char}*blob\.core\.windows\.net{uri_char}*",
        re.IGNORECASE,
    )
    url_pattern = re.compile(rf"https?://{uri_char}+", re.IGNORECASE)
    # A separator, at least one directory, then a final component; an optional
    # Windows drive letter is consumed so it is dropped together with the directories.
    file_path_pattern = re.compile(
        r"(?:(?<![A-Za-z])[A-Za-z]:)?[/\\](?:[^/\\]+[/\\])+[^/\\]+"
    )

    def basename(match):
        """Return the final path component, splitting on both `/` and `\\`."""
        # Done by hand rather than with `Path(...).name`, which does not treat
        # backslashes as separators on POSIX hosts.
        return re.split(r"[/\\]", match.group(0))[-1]

    def sanitize_value(value):
        """Recursively sanitize values within nested structures"""
        if isinstance(value, str):
            # URIs must be redacted before file paths: the path pattern would
            # otherwise consume the tail of a URI ("https://host/a/b" -> "https:/b")
            # and prevent the URL / cloud patterns from ever matching.
            value = cloud_pattern.sub("[REDACTED_CLOUD]", value)
            value = url_pattern.sub("[REDACTED_URL]", value)
            return file_path_pattern.sub(basename, value)
        if isinstance(value, dict):
            return {k: sanitize_value(v) for k, v in value.items()}
        if isinstance(value, list):
            return [sanitize_value(item) for item in value]
        if isinstance(value, tuple):
            return tuple(sanitize_value(item) for item in value)
        return value

    return {key: sanitize_value(value) for key, value in properties.items()}
def _get_system_info(self) -> dict[str, Any]:
"""Collect system information"""
@@ -195,6 +253,9 @@ class TelemetryManager:
if properties is None:
properties = {}
# Sanitize properties to remove PII
properties = self._sanitize_properties(properties)
# Wrap PostHog errors in try / except to not raise errors during Axolotl usage
try:
LOG.warning(f"*** Sending telemetry for {event_type} ***")