updated sanitization logic, tests
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
"""Telemetry utilities for exception and traceback information."""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import traceback
|
||||
from functools import wraps
|
||||
@@ -16,13 +17,16 @@ ERROR_HANDLED = False
|
||||
|
||||
def sanitize_stack_trace(stack_trace: str) -> str:
|
||||
"""
|
||||
Remove personal information from stack trace messages while keeping Axolotl codepaths.
|
||||
Remove personal information from stack trace messages while keeping Python package codepaths.
|
||||
|
||||
This function identifies Python packages by looking for common patterns in virtual environment
|
||||
and site-packages directories, preserving the package path while removing user-specific paths.
|
||||
|
||||
Args:
|
||||
stack_trace: The original stack trace string.
|
||||
|
||||
Returns:
|
||||
A sanitized version of the stack trace with only axolotl paths preserved.
|
||||
A sanitized version of the stack trace with Python package paths preserved.
|
||||
"""
|
||||
# Split the stack trace into lines to process each file path separately
|
||||
lines = stack_trace.split("\n")
|
||||
@@ -31,23 +35,66 @@ def sanitize_stack_trace(stack_trace: str) -> str:
|
||||
# Regular expression to find file paths in the stack trace
|
||||
path_pattern = re.compile(r'(?:File ")(.*?)(?:")')
|
||||
|
||||
# Regular expression to identify paths in site-packages or dist-packages
|
||||
# This matches path segments like "site-packages/package_name" or "dist-packages/package_name"
|
||||
site_packages_pattern = re.compile(
|
||||
r"(?:site-packages|dist-packages)[/\\]([\w\-\.]+)"
|
||||
)
|
||||
|
||||
# Additional common virtual environment patterns
|
||||
venv_lib_pattern = re.compile(
|
||||
r"(?:lib|Lib)[/\\](?:python\d+(?:\.\d+)?[/\\])?(?:site-packages|dist-packages)[/\\]([\w\-\.]+)"
|
||||
)
|
||||
|
||||
for line in lines:
|
||||
# Check if this line contains a file path
|
||||
path_match = path_pattern.search(line)
|
||||
|
||||
if path_match:
|
||||
full_path = path_match.group(1)
|
||||
sanitized_path = ""
|
||||
|
||||
if "axolotl/" in full_path:
|
||||
# Keep only the 'axolotl' part and onward
|
||||
axolotl_idx = full_path.rfind("axolotl/")
|
||||
if axolotl_idx >= 0:
|
||||
# Replace the original path with the sanitized one
|
||||
sanitized_path = full_path[axolotl_idx:]
|
||||
line = line.replace(full_path, sanitized_path)
|
||||
# Try to match site-packages pattern
|
||||
site_packages_match = site_packages_pattern.search(full_path)
|
||||
venv_lib_match = venv_lib_pattern.search(full_path)
|
||||
|
||||
if site_packages_match:
|
||||
# Find the index where the matched pattern starts
|
||||
idx = full_path.find("site-packages")
|
||||
if idx == -1:
|
||||
idx = full_path.find("dist-packages")
|
||||
|
||||
# Keep from 'site-packages' onward
|
||||
if idx >= 0:
|
||||
sanitized_path = full_path[idx:]
|
||||
elif venv_lib_match:
|
||||
# For other virtual environment patterns, find the package directory
|
||||
match_idx = venv_lib_match.start(1)
|
||||
if match_idx > 0:
|
||||
# Keep from the package name onward
|
||||
package_name = venv_lib_match.group(1)
|
||||
idx = full_path.rfind(
|
||||
package_name, 0, match_idx + len(package_name)
|
||||
)
|
||||
if idx >= 0:
|
||||
sanitized_path = full_path[idx:]
|
||||
|
||||
# If we couldn't identify a package pattern but path contains 'axolotl'
|
||||
elif "axolotl" in full_path:
|
||||
idx = full_path.rfind("axolotl")
|
||||
if idx >= 0:
|
||||
sanitized_path = full_path[idx:]
|
||||
|
||||
# Apply the sanitization to the line
|
||||
if sanitized_path:
|
||||
line = line.replace(full_path, sanitized_path)
|
||||
else:
|
||||
# For non-axolotl paths, replace with an empty string or a placeholder
|
||||
line = line.replace(full_path, "")
|
||||
# If we couldn't identify a package pattern, just keep the filename
|
||||
filename = os.path.basename(full_path)
|
||||
if filename:
|
||||
line = line.replace(full_path, filename)
|
||||
else:
|
||||
line = line.replace(full_path, "")
|
||||
|
||||
sanitized_lines.append(line)
|
||||
|
||||
@@ -72,6 +119,7 @@ def send_errors(func: Callable) -> Callable:
|
||||
@wraps(func)
|
||||
def wrapper(*args, **kwargs) -> Any:
|
||||
telemetry_manager = TelemetryManager.get_instance()
|
||||
|
||||
if not telemetry_manager.enabled:
|
||||
return func(*args, **kwargs)
|
||||
|
||||
@@ -79,7 +127,7 @@ def send_errors(func: Callable) -> Callable:
|
||||
return func(*args, **kwargs)
|
||||
except Exception as exception:
|
||||
# Only track if we're not already handling an error. This prevents us from
|
||||
# capturing an error more than once in nested decorated function calls.
|
||||
# capturing an error more than once in nested decorated function calls.
|
||||
global ERROR_HANDLED # pylint: disable=global-statement
|
||||
if not ERROR_HANDLED:
|
||||
ERROR_HANDLED = True
|
||||
|
||||
@@ -4,6 +4,7 @@ import atexit
|
||||
import logging
|
||||
import os
|
||||
import platform
|
||||
import re
|
||||
import time
|
||||
import uuid
|
||||
from dataclasses import dataclass
|
||||
@@ -122,6 +123,9 @@ class TelemetryManager:
|
||||
axolotl_do_not_track = os.getenv("AXOLOTL_DO_NOT_TRACK")
|
||||
do_not_track = os.getenv("DO_NOT_TRACK")
|
||||
|
||||
# If explicitly enabled, we'll disable the telemetry warning message
|
||||
explicit_enabled = axolotl_do_not_track in ["0", "false"]
|
||||
|
||||
if axolotl_do_not_track is None:
|
||||
axolotl_do_not_track = "0"
|
||||
|
||||
@@ -134,9 +138,6 @@ class TelemetryManager:
|
||||
"true",
|
||||
) and do_not_track.lower() not in ("1", "true")
|
||||
|
||||
# If explicitly enabled, we'll disable the telemetry warning message
|
||||
explicit_enabled = axolotl_do_not_track in ["0", "false"]
|
||||
|
||||
return enabled, explicit_enabled
|
||||
|
||||
def _load_whitelist(self) -> dict:
|
||||
@@ -145,7 +146,7 @@ class TelemetryManager:
|
||||
return yaml.safe_load(f)
|
||||
|
||||
def _is_whitelisted(self, base_model: str) -> bool:
|
||||
"""Check if model/org is in whitelist"""
|
||||
"""Check if model org is in whitelist"""
|
||||
if not base_model:
|
||||
return False
|
||||
|
||||
@@ -159,9 +160,66 @@ class TelemetryManager:
|
||||
posthog.project_api_key = POSTHOG_WRITE_KEY
|
||||
posthog.host = self.config.host
|
||||
|
||||
def _sanitize_path(self, path: str) -> str:
|
||||
"""Remove personal information from file paths"""
|
||||
return Path(path).name
|
||||
def _sanitize_properties(self, properties: dict[str, Any]) -> dict[str, Any]:
|
||||
"""
|
||||
Sanitize properties to remove any personally identifiable information such as:
|
||||
- File paths
|
||||
- URLs / Links
|
||||
- Cloud storage locations
|
||||
|
||||
Args:
|
||||
properties: Dictionary of properties to sanitize.
|
||||
|
||||
Returns:
|
||||
Sanitized properties dictionary.
|
||||
"""
|
||||
if not properties:
|
||||
return {}
|
||||
|
||||
# Define regex patterns for different types of personal information
|
||||
patterns = {
|
||||
# File paths (Unix and Windows)
|
||||
"file_path": re.compile(r"(?:/|\\)(?:[^/\\]+(?:/|\\))+[^/\\]+"),
|
||||
# URLs/Links
|
||||
"url": re.compile(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+(?:/[^/\s]*)*"),
|
||||
# Cloud storage paths (S3, GCS, Azure)
|
||||
"cloud_path": re.compile(r"s3://|gs://|azure://|blob.core.windows.net"),
|
||||
}
|
||||
|
||||
# Deep copy isn't needed; we'll create a new dict with sanitized values
|
||||
sanitized = {}
|
||||
|
||||
def sanitize_value(value):
|
||||
"""Recursively sanitize values within nested structures"""
|
||||
if isinstance(value, str):
|
||||
# For file paths, extract just the filename
|
||||
path_match = patterns["file_path"].search(value)
|
||||
if path_match:
|
||||
try:
|
||||
# Try to extract just the filename
|
||||
path_str = path_match.group(0)
|
||||
value = value.replace(path_str, Path(path_str).name)
|
||||
except (ValueError, RuntimeError):
|
||||
# If path extraction fails, just redact the path
|
||||
value = patterns["file_path"].sub("[REDACTED_PATH]", value)
|
||||
|
||||
# Redact other sensitive information
|
||||
value = patterns["url"].sub("[REDACTED_URL]", value)
|
||||
value = patterns["cloud_path"].sub("[REDACTED_CLOUD]", value)
|
||||
|
||||
return value
|
||||
if isinstance(value, dict):
|
||||
return {k: sanitize_value(v) for k, v in value.items()}
|
||||
if isinstance(value, list):
|
||||
return [sanitize_value(item) for item in value]
|
||||
|
||||
return value
|
||||
|
||||
# Apply the sanitization to all properties
|
||||
for key, value in properties.items():
|
||||
sanitized[key] = sanitize_value(value)
|
||||
|
||||
return sanitized
|
||||
|
||||
def _get_system_info(self) -> dict[str, Any]:
|
||||
"""Collect system information"""
|
||||
@@ -195,6 +253,9 @@ class TelemetryManager:
|
||||
if properties is None:
|
||||
properties = {}
|
||||
|
||||
# Sanitize properties to remove PII
|
||||
properties = self._sanitize_properties(properties)
|
||||
|
||||
# Wrap PostHog errors in try / except to not raise errors during Axolotl usage
|
||||
try:
|
||||
LOG.warning(f"*** Sending telemetry for {event_type} ***")
|
||||
|
||||
Reference in New Issue
Block a user