diff --git a/docs/telemetry.qmd b/docs/telemetry.qmd
index 0837dd25f..aab3ff529 100644
--- a/docs/telemetry.qmd
+++ b/docs/telemetry.qmd
@@ -31,8 +31,8 @@ Telemetry is implemented using PostHog and consists of:
 telemetry system and provides methods for tracking events.
 - `axolotl.telemetry.errors.send_errors`: A decorator that captures exceptions and
 sends sanitized stack traces.
-- `axolotl.telemetry.runtime_metrics.RuntimeMetrics`: A dataclass that tracks runtime
-metrics during training.
+- `axolotl.telemetry.runtime_metrics.RuntimeMetricsTracker`: A class that tracks
+runtime metrics during training.
 - `axolotl.telemetry.callbacks.TelemetryCallback`: A Trainer callback that sends
 runtime metrics telemetry.

@@ -44,7 +44,7 @@ aware of data collection, unless telemetry is explicitly enabled or disabled.
 Telemetry is **enabled by default** on an opt-out basis. To disable it, set either:

 - `AXOLOTL_DO_NOT_TRACK=1` (Axolotl-specific)
-- `DO_NOT_TRACK=1` (Global standard)
+- `DO_NOT_TRACK=1` (Global standard; see https://consoledonottrack.com/)

 To acknowledge and explicitly enable telemetry (and remove the warning message), set:
 `AXOLOTL_DO_NOT_TRACK=0`.
diff --git a/src/axolotl/telemetry/callbacks.py b/src/axolotl/telemetry/callbacks.py
index ca0aaae92..7650ec7c9 100644
--- a/src/axolotl/telemetry/callbacks.py
+++ b/src/axolotl/telemetry/callbacks.py
@@ -131,17 +131,20 @@ class TelemetryCallback(TrainerCallback):
             # Update memory metrics
             self.tracker.update_memory_metrics()

+            loss = state.log_history[-1].get("loss", 0) if state.log_history else 0
+            learning_rate = (
+                state.log_history[-1].get("learning_rate", 0)
+                if state.log_history
+                else 0
+            )
+
             # Prepare metrics to report
             metrics = {
                 "step": step,
                 "epoch": self.current_epoch,
                 "progress": state.epoch,  # Fractional epoch progress
-                "loss": state.log_history[-1].get("loss", 0)
-                if state.log_history
-                else 0,
-                "learning_rate": state.log_history[-1].get("learning_rate", 0)
-                if state.log_history
-                else 0,
+                "loss": loss,
+                "learning_rate": learning_rate,
                 "steps_per_second": steps_per_second,
                 "elapsed_time": current_time - self.start_time,
                 "time_since_last_report": time_since_last_report,
@@ -149,7 +152,7 @@

             # Add memory metrics
             memory_metrics = self.tracker.get_memory_metrics()
-            metrics.update(memory_metrics)
+            metrics.update({"memory": memory_metrics})

             # Send telemetry
             self.telemetry_manager.send_event(
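For orientation, the per-step payload assembled in `on_step_end` after these changes looks roughly like the sketch below, with the flattened memory dictionary nested under a single `memory` key. The keys mirror the dict built in the hunk above; every value is an invented placeholder, not output from a real run.

```python
# Hypothetical shape of the per-step telemetry payload; all values are made up.
metrics = {
    "step": 500,
    "epoch": 1,
    "progress": 1.25,  # fractional epoch progress (state.epoch)
    "loss": 1.87,
    "learning_rate": 2e-5,
    "steps_per_second": 3.4,
    "elapsed_time": 147.2,
    "time_since_last_report": 29.6,
    # Nested via metrics.update({"memory": memory_metrics})
    "memory": {
        "cpu_memory_bytes": 8_589_934_592,
        "peak_cpu_memory_bytes": 9_663_676_416,
        "gpu_0_memory_bytes": 12_884_901_888,
        "gpu_0_peak_memory_bytes": 15_032_385_536,
    },
}
```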
diff --git a/src/axolotl/telemetry/manager.py b/src/axolotl/telemetry/manager.py
index 7f2d6da44..301c99c3d 100644
--- a/src/axolotl/telemetry/manager.py
+++ b/src/axolotl/telemetry/manager.py
@@ -1,6 +1,7 @@
 """Telemetry manager and associated utilities."""

 import atexit
+import importlib.metadata
 import logging
 import os
 import platform
@@ -12,10 +13,8 @@
 from typing import Any

 import posthog
 import psutil
 import torch
-import transformers
 import yaml

-import axolotl
 from axolotl.utils.distributed import is_main_process

 LOG = logging.getLogger(__name__)
@@ -32,7 +31,7 @@ ENABLED_WARNING = (
     "This data helps us prioritize features, optimize performance, and fix bugs.\n\n"
     "To disable telemetry, set either:\n"
     "- AXOLOTL_DO_NOT_TRACK=1 (Axolotl-specific)\n"
-    "- DO_NOT_TRACK=1 (Global standard)\n\n"
+    "- DO_NOT_TRACK=1 (Global standard; see https://consoledonottrack.com/)\n\n"
     "To remove this warning and continue with telemetry enabled,"
     "explicitly set AXOLOTL_DO_NOT_TRACK=0 (and leave DO_NOT_TRACK unset / set to 0)\n\n"
     "No personally identifiable information is collected."
@@ -42,13 +41,39 @@ ENABLED_WARNING = (

 WHITELIST_PATH = str(Path(__file__).parent / "whitelist.yaml")

-FIELDS_WITH_ORGS = [
+# NOTE: Keep these up to date with any config schema changes
+FIELDS_WITH_ORGS = {
     "base_model",
     "tokenizer_config",
     "base_model_config",
-]
-FIELDS_TO_REDACT = ["resume_from_checkpoint", "hub_model_id"]
-PREFIXES_TO_REDACT = ["wandb_", "comet_", "mlflow_", "gradio_"]
+    "pretraining_dataset",  # NOTE: this field may be a string or a dictionary
+}
+FIELDS_TO_REDACT = {"resume_from_checkpoint", "hub_model_id"}
+PREFIXES_TO_REDACT = {"wandb_", "comet_", "mlflow_", "gradio_"}
+PATH_INDICATORS = {"path", "dir"}
+
+RELEVANT_PACKAGES = {
+    "torch",
+    "transformers",
+    "trl",
+    "datasets",
+    "peft",
+    "bitsandbytes",
+    "accelerate",
+    "optimum",
+    "deepspeed",
+    "ray",
+    "axolotl",
+    "triton",
+    "mamba-ssm",
+    "flash-attn",
+    "xformers",
+    "autoawq",
+    "tokenizers",
+    "sentencepiece",
+    "torchao",
+    "lm_eval",
+}


 class TelemetryManager:
@@ -78,7 +103,13 @@ class TelemetryManager:
         if self.enabled:
             self.run_id = str(uuid.uuid4())
             self.whitelist = self._load_whitelist()
-            self.system_info = self._get_system_info()
+
+            try:
+                self.system_info = self._get_system_info()
+            except Exception as e:  # pylint: disable=broad-exception-caught
+                LOG.warning(f"Error during system info collection: {e}")
+                self.system_info = None
+
             self._init_posthog()

             # Register shutdown method to flush posthog telemetry
@@ -174,9 +205,6 @@ class TelemetryManager:
         if not properties:
             return {}

-        # NOTE: Keep this up to date with any config schema changes
-        path_indicators = {"path", "dir"}
-
        def redact_value(value: Any, key: str = "") -> Any:
            """Recursively sanitize values, redacting those with path-like keys"""
            if isinstance(key, str) and isinstance(value, str):
@@ -190,7 +218,7 @@ class TelemetryManager:
            if (
                key in FIELDS_TO_REDACT
                or any(prefix in key for prefix in PREFIXES_TO_REDACT)
-                or any(indicator in key.lower() for indicator in path_indicators)
+                or any(indicator in key.lower() for indicator in PATH_INDICATORS)
            ):
                return "[REDACTED]"

@@ -208,27 +236,100 @@ class TelemetryManager:
         return redacted

     def _get_system_info(self) -> dict[str, Any]:
-        """Collect system information"""
+        """Collect system information for various hardware accelerators"""
         gpu_info = []
+        accelerator_type = "none"
+
+        # NVIDIA GPUs
         if torch.cuda.is_available():
+            accelerator_type = "cuda"
             for i in range(torch.cuda.device_count()):
                 gpu_info.append(
                     {
                         "name": torch.cuda.get_device_name(i),
                         "memory": torch.cuda.get_device_properties(i).total_memory,
+                        "type": "cuda",
                     }
                 )
+
+        # AMD GPUs
+        elif hasattr(torch, "hip") and torch.hip.is_available():
+            accelerator_type = "hip"
+            for i in range(torch.hip.device_count()):
+                gpu_info.append(
+                    {
+                        "name": torch.hip.get_device_name(i),
+                        "memory": torch.hip.get_device_properties(i).total_memory
+                        if hasattr(torch.hip, "get_device_properties")
+                        else None,
+                        "type": "hip",
+                    }
+                )
+
+        # Apple Silicon
+        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            accelerator_type = "mps"
+            gpu_info.append(
+                {
+                    "name": "Apple Silicon",
+                    # NOTE: this is memory allocated to this process, not total memory
+                    "memory": torch.mps.driver_allocated_memory(),
+                    "type": "mps",
+                }
+            )
+
+        # Intel GPUs
+        elif hasattr(torch, "xpu") and torch.xpu.is_available():
+            accelerator_type = "xpu"
+            for i in range(torch.xpu.device_count()):
+                memory = None
+                if hasattr(torch.xpu, "get_device_properties"):
+                    memory = torch.xpu.get_device_properties(i).total_memory
+
+                gpu_info.append(
+                    {
+                        "name": torch.xpu.get_device_name(i),
+                        "memory": memory,
+                        "type": "xpu",
+                    }
+                )
+
+        # NPUs
+        elif hasattr(torch, "npu") and torch.npu.is_available():
+            accelerator_type = "npu"
+            for i in range(torch.npu.device_count()):
+                name = getattr(torch.npu, "get_device_name", lambda x: "NPU")(i)
+
+                memory = None
+                if hasattr(torch.npu, "get_device_properties"):
+                    memory = torch.npu.get_device_properties(i).total_memory
+
+                gpu_info.append(
+                    {
+                        "name": name,
+                        "memory": memory,
+                        "type": "npu",
+                    }
+                )
+
+        # Get relevant package versions
+        installed_packages = {}
+        for package in RELEVANT_PACKAGES:
+            try:
+                version = importlib.metadata.version(package)
+                installed_packages[f"{package}_version"] = version
+            except importlib.metadata.PackageNotFoundError:
+                pass
+
         return {
             "os": platform.system(),
             "python_version": platform.python_version(),
-            "pytorch_version": torch.__version__,
-            "transformers_version": transformers.__version__,
-            "axolotl_version": axolotl.__version__,
             "cpu_count": psutil.cpu_count(),
             "memory_total": psutil.virtual_memory().total,
-            "gpu_count": len(gpu_info),
-            "gpu_info": gpu_info,
+            "accelerator_type": accelerator_type,
+            "accelerator_count": len(gpu_info),
+            "accelerator_info": gpu_info,
+            **installed_packages,
         }

     def send_event(self, event_type: str, properties: dict[str, Any] | None = None):
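The version-collection loop added to `_get_system_info` can be exercised on its own. The sketch below is a minimal standalone rendering of the same logic, assuming only the standard-library `importlib.metadata` module; the package set shown is a small illustrative subset of `RELEVANT_PACKAGES`.

```python
# Minimal, self-contained version of the package-version collection above.
import importlib.metadata

packages = {"torch", "transformers", "axolotl"}  # illustrative subset

installed_packages = {}
for package in packages:
    try:
        installed_packages[f"{package}_version"] = importlib.metadata.version(package)
    except importlib.metadata.PackageNotFoundError:
        # Distribution not installed in this environment; omit it rather than failing.
        pass

print(installed_packages)
```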
diff --git a/src/axolotl/telemetry/runtime_metrics.py b/src/axolotl/telemetry/runtime_metrics.py
index d0f52b88b..229c74edf 100644
--- a/src/axolotl/telemetry/runtime_metrics.py
+++ b/src/axolotl/telemetry/runtime_metrics.py
@@ -112,6 +112,8 @@ class RuntimeMetrics:
 class RuntimeMetricsTracker:
     """Tracker for runtime metrics during training."""

+    update_interval = 100
+
     def __init__(self):
         """Initialize the runtime metrics tracker."""
         self.metrics = RuntimeMetrics(start_time=time.time())
@@ -132,23 +134,62 @@
         self.metrics.current_step = step
         self.metrics.total_steps += 1

-        # Periodically update memory metrics (e.g., every 100 steps)
-        if step % 100 == 0:
+        # Periodically update memory metrics
+        if step % self.update_interval == 0:
             self.update_memory_metrics()

+    def _get_allocated_memory(self) -> dict[int, int]:
+        """
+        Helper function for getting accelerator-agnostic allocated memory.
+
+        Returns:
+            A dictionary mapping device IDs to allocated memory in bytes
+        """
+        memory_used: dict[int, int] = {}
+
+        # NVIDIA GPUs
+        if torch.cuda.is_available():
+            for i in range(torch.cuda.device_count()):
+                memory_used[i] = torch.cuda.memory_allocated(i)
+
+        # AMD GPUs
+        elif hasattr(torch, "hip") and torch.hip.is_available():
+            for i in range(torch.hip.device_count()):
+                if hasattr(torch.hip, "memory_allocated"):
+                    memory_used[i] = torch.hip.memory_allocated(i)
+
+        # Apple Silicon
+        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            # MPS doesn't have per-device memory stats since there's only one device
+            if hasattr(torch.mps, "current_allocated_memory"):
+                memory_used[0] = torch.mps.current_allocated_memory()
+
+        # Intel GPUs
+        elif hasattr(torch, "xpu") and torch.xpu.is_available():
+            for i in range(torch.xpu.device_count()):
+                if hasattr(torch.xpu, "memory_allocated"):
+                    memory_used[i] = torch.xpu.memory_allocated(i)
+
+        # NPUs
+        elif hasattr(torch, "npu") and torch.npu.is_available():
+            for i in range(torch.npu.device_count()):
+                if hasattr(torch.npu, "memory_allocated"):
+                    memory_used[i] = torch.npu.memory_allocated(i)
+
+        return memory_used
+
     def update_memory_metrics(self):
         """Update peak memory usage metrics."""
         # CPU memory
         cpu_memory = psutil.Process().memory_info().rss
         self.metrics.peak_cpu_memory = max(self.metrics.peak_cpu_memory, cpu_memory)

-        # GPU memory if available
-        if torch.cuda.is_available():
-            for i in range(torch.cuda.device_count()):
-                memory_used = torch.cuda.memory_allocated(i)
-                self.metrics.peak_gpu_memory[i] = max(
-                    self.metrics.peak_gpu_memory.get(i, 0), memory_used
-                )
+        # GPU memory (if available)
+        memory_used = self._get_allocated_memory()
+        for i, memory in memory_used.items():
+            self.metrics.peak_gpu_memory[i] = max(
+                self.metrics.peak_gpu_memory.get(i, 0), memory
+            )

     def get_memory_metrics(self) -> dict[str, Any]:
         """Get the current memory metrics as a dictionary."""
@@ -157,11 +198,12 @@ class RuntimeMetricsTracker:
             "peak_cpu_memory_bytes": self.metrics.peak_cpu_memory,
         }

-        if torch.cuda.is_available():
-            for i in range(torch.cuda.device_count()):
-                memory_metrics[f"gpu_{i}_memory_bytes"] = torch.cuda.memory_allocated(i)
-                memory_metrics[
-                    f"gpu_{i}_peak_memory_bytes"
-                ] = self.metrics.peak_gpu_memory.get(i, 0)
+        # GPU memory (if available)
+        memory_used = self._get_allocated_memory()
+        for i, memory in memory_used.items():
+            memory_metrics[f"gpu_{i}_memory_bytes"] = memory
+            memory_metrics[
+                f"gpu_{i}_peak_memory_bytes"
+            ] = self.metrics.peak_gpu_memory.get(i, 0)

-        return {"memory": memory_metrics}
+        return memory_metrics
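As a rough standalone illustration of the dispatch pattern `_get_allocated_memory` follows, the sketch below keeps only the CUDA and MPS branches (the helper above additionally probes HIP, XPU, and NPU) and relies only on documented `torch.cuda` and `torch.mps` calls.

```python
# Simplified sketch of the accelerator-agnostic memory lookup, CUDA/MPS branches only.
import torch


def allocated_memory() -> dict[int, int]:
    memory_used: dict[int, int] = {}

    if torch.cuda.is_available():
        # One entry per visible CUDA device.
        for i in range(torch.cuda.device_count()):
            memory_used[i] = torch.cuda.memory_allocated(i)
    elif torch.backends.mps.is_available():
        # MPS exposes a single device, so report it as device 0.
        memory_used[0] = torch.mps.current_allocated_memory()

    return memory_used


if __name__ == "__main__":
    print(allocated_memory())
```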
diff --git a/src/axolotl/telemetry/whitelist.yaml b/src/axolotl/telemetry/whitelist.yaml
index ab13522f5..88d72054e 100644
--- a/src/axolotl/telemetry/whitelist.yaml
+++ b/src/axolotl/telemetry/whitelist.yaml
@@ -1,9 +1,15 @@
 organizations:
+  - "axolotl-ai-co"
   - "meta-llama"
   - "huggingface"
   - "nvidia"
   - "facebook"
+  - "google"
+  - "microsoft"
+  - "deepseek-ai"
+  - "HuggingFaceTB"
   - "mistralai"
+  - "Qwen"
   - "briaai"
   - "unsloth"
   - "NousResearch"
diff --git a/src/axolotl/train.py b/src/axolotl/train.py
index 79c558f24..441f8997d 100644
--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -69,7 +69,7 @@ def setup_model_and_tokenizer(
     """
     # Load tokenizer
     LOG.debug(
-        f"loading tokenizer... {cfg.tokenizer_cocnfig or cfg.base_model_config}",
+        f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}",
         main_process_only=True,
     )
     tokenizer = load_tokenizer(cfg)
@@ -597,9 +597,7 @@ def train(
     setup_model_card(cfg)

     # Execute the training
-    TELEMETRY_MANAGER.send_event(event_type="train-start")
     execute_training(cfg, trainer, resume_from_checkpoint)
-    TELEMETRY_MANAGER.send_event(event_type="train-end")

     # Save the trained model and cleanup
     save_trained_model(cfg, trainer, model, safe_serialization)
diff --git a/tests/telemetry/test_manager.py b/tests/telemetry/test_manager.py
index b813b8892..441d94a10 100644
--- a/tests/telemetry/test_manager.py
+++ b/tests/telemetry/test_manager.py
@@ -151,12 +151,12 @@ def test_system_info_collection(manager):
     # Check essential keys
     assert "os" in system_info
     assert "python_version" in system_info
-    assert "pytorch_version" in system_info
+    assert "torch_version" in system_info
     assert "transformers_version" in system_info
     assert "axolotl_version" in system_info
     assert "cpu_count" in system_info
     assert "memory_total" in system_info
-    assert "gpu_count" in system_info
+    assert "accelerator_count" in system_info


 def test_send_event(manager):
diff --git a/tests/telemetry/test_runtime_metrics.py b/tests/telemetry/test_runtime_metrics.py
index 11c7faf98..5a6ef5a08 100644
--- a/tests/telemetry/test_runtime_metrics.py
+++ b/tests/telemetry/test_runtime_metrics.py
@@ -331,30 +331,26 @@ class TestRuntimeMetricsTracker:
         }

         # Get memory metrics
-        result = tracker.get_memory_metrics()
-
-        # Verify structure
-        assert "memory" in result
-        memory = result["memory"]
+        memory_metrics = tracker.get_memory_metrics()

         # Verify CPU memory
         assert (
-            memory["cpu_memory_bytes"] == 1 * 1024 * 1024 * 1024
+            memory_metrics["cpu_memory_bytes"] == 1 * 1024 * 1024 * 1024
         )  # Current value from mock
         assert (
-            memory["peak_cpu_memory_bytes"] == 2 * 1024 * 1024 * 1024
+            memory_metrics["peak_cpu_memory_bytes"] == 2 * 1024 * 1024 * 1024
         )  # Peak value we set

         # Verify GPU memory
         assert (
-            memory["gpu_0_memory_bytes"] == 1 * 1024 * 1024 * 1024
+            memory_metrics["gpu_0_memory_bytes"] == 1 * 1024 * 1024 * 1024
         )  # Current value from mock
         assert (
-            memory["gpu_0_peak_memory_bytes"] == 3 * 1024 * 1024 * 1024
+            memory_metrics["gpu_0_peak_memory_bytes"] == 3 * 1024 * 1024 * 1024
         )  # Peak value we set
         assert (
-            memory["gpu_1_memory_bytes"] == 2 * 1024 * 1024 * 1024
+            memory_metrics["gpu_1_memory_bytes"] == 2 * 1024 * 1024 * 1024
         )  # Current value from mock
         assert (
-            memory["gpu_1_peak_memory_bytes"] == 4 * 1024 * 1024 * 1024
+            memory_metrics["gpu_1_peak_memory_bytes"] == 4 * 1024 * 1024 * 1024
         )  # Peak value we set