diff --git a/docs/telemetry.qmd b/docs/telemetry.qmd
index 0837dd25f..aab3ff529 100644
--- a/docs/telemetry.qmd
+++ b/docs/telemetry.qmd
@@ -31,8 +31,8 @@ Telemetry is implemented using PostHog and consists of:
telemetry system and provides methods for tracking events.
- `axolotl.telemetry.errors.send_errors`: A decorator that captures exceptions and
sends sanitized stack traces.
-- `axolotl.telemetry.runtime_metrics.RuntimeMetrics`: A dataclass that tracks runtime
-metrics during training.
+- `axolotl.telemetry.runtime_metrics.RuntimeMetricsTracker`: A class that tracks
+runtime metrics during training.
- `axolotl.telemetry.callbacks.TelemetryCallback`: A Trainer callback that sends
runtime metrics telemetry.
@@ -44,7 +44,7 @@ aware of data collection, unless telemetry is explicitly enabled or disabled.
Telemetry is **enabled by default** on an opt-out basis. To disable it, set either:
- `AXOLOTL_DO_NOT_TRACK=1` (Axolotl-specific)
-- `DO_NOT_TRACK=1` (Global standard)
+- `DO_NOT_TRACK=1` (Global standard; see https://consoledonottrack.com/)
To acknowledge and explicitly enable telemetry (and remove the warning message), set:
`AXOLOTL_DO_NOT_TRACK=0`.
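
As a quick illustration of the opt-out semantics documented above, a minimal sketch of the decision logic (assuming the two documented environment variables are the only inputs; the authoritative check lives in `axolotl.telemetry.manager` and may differ in detail):

import os

def telemetry_enabled() -> bool:
    # Opt-out: setting either variable to "1" disables telemetry.
    if os.getenv("AXOLOTL_DO_NOT_TRACK") == "1" or os.getenv("DO_NOT_TRACK") == "1":
        return False
    # AXOLOTL_DO_NOT_TRACK=0 opts in explicitly and silences the warning;
    # leaving both unset keeps telemetry enabled, with a warning shown.
    return True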
diff --git a/src/axolotl/telemetry/callbacks.py b/src/axolotl/telemetry/callbacks.py
index d664931e0..6a1aa5235 100644
--- a/src/axolotl/telemetry/callbacks.py
+++ b/src/axolotl/telemetry/callbacks.py
@@ -131,17 +131,20 @@ class TelemetryCallback(TrainerCallback):
# Update memory metrics
self.tracker.update_memory_metrics()
+ loss = state.log_history[-1].get("loss", 0) if state.log_history else 0
+ learning_rate = (
+ state.log_history[-1].get("learning_rate", 0)
+ if state.log_history
+ else 0
+ )
+
# Prepare metrics to report
metrics = {
"step": step,
"epoch": self.current_epoch,
"progress": state.epoch, # Fractional epoch progress
- "loss": state.log_history[-1].get("loss", 0)
- if state.log_history
- else 0,
- "learning_rate": state.log_history[-1].get("learning_rate", 0)
- if state.log_history
- else 0,
+ "loss": loss,
+ "learning_rate": learning_rate,
"steps_per_second": steps_per_second,
"elapsed_time": current_time - self.start_time,
"time_since_last_report": time_since_last_report,
@@ -149,7 +152,7 @@ class TelemetryCallback(TrainerCallback):
# Add memory metrics
memory_metrics = self.tracker.get_memory_metrics()
- metrics.update(memory_metrics)
+ metrics.update({"memory": memory_metrics})
# Send telemetry
self.telemetry_manager.send_event(
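
For context on the `log_history` access refactored in this hunk: the Hugging Face `Trainer` appends one dict per logging step to `state.log_history`, so the last entry (when any exists) holds the most recent scalars. A minimal sketch, with illustrative values:

# state.log_history is a list of per-log-step dicts, for example:
log_history = [{"loss": 1.92, "learning_rate": 2e-4, "epoch": 0.25}]

# Same guarded access as the hunk above: fall back to 0 before the first log.
loss = log_history[-1].get("loss", 0) if log_history else 0
learning_rate = log_history[-1].get("learning_rate", 0) if log_history else 0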
diff --git a/src/axolotl/telemetry/manager.py b/src/axolotl/telemetry/manager.py
index 7f2d6da44..301c99c3d 100644
--- a/src/axolotl/telemetry/manager.py
+++ b/src/axolotl/telemetry/manager.py
@@ -1,6 +1,7 @@
"""Telemetry manager and associated utilities."""
import atexit
+import importlib.metadata
import logging
import os
import platform
@@ -12,10 +13,8 @@ from typing import Any
import posthog
import psutil
import torch
-import transformers
import yaml
-import axolotl
from axolotl.utils.distributed import is_main_process
LOG = logging.getLogger(__name__)
@@ -32,7 +31,7 @@ ENABLED_WARNING = (
"This data helps us prioritize features, optimize performance, and fix bugs.\n\n"
"To disable telemetry, set either:\n"
"- AXOLOTL_DO_NOT_TRACK=1 (Axolotl-specific)\n"
- "- DO_NOT_TRACK=1 (Global standard)\n\n"
+ "- DO_NOT_TRACK=1 (Global standard; see https://consoledonottrack.com/)\n\n"
"To remove this warning and continue with telemetry enabled,"
"explicitly set AXOLOTL_DO_NOT_TRACK=0 (and leave DO_NOT_TRACK unset / set to 0)\n\n"
"No personally identifiable information is collected."
@@ -42,13 +41,39 @@ ENABLED_WARNING = (
WHITELIST_PATH = str(Path(__file__).parent / "whitelist.yaml")
-FIELDS_WITH_ORGS = [
+# NOTE: Keep these up to date with any config schema changes
+FIELDS_WITH_ORGS = {
"base_model",
"tokenizer_config",
"base_model_config",
-]
-FIELDS_TO_REDACT = ["resume_from_checkpoint", "hub_model_id"]
-PREFIXES_TO_REDACT = ["wandb_", "comet_", "mlflow_", "gradio_"]
+ "pretraining_dataset", # NOTE: this field may be a string or a dictionary
+}
+FIELDS_TO_REDACT = {"resume_from_checkpoint", "hub_model_id"}
+PREFIXES_TO_REDACT = {"wandb_", "comet_", "mlflow_", "gradio_"}
+PATH_INDICATORS = {"path", "dir"}
+
+RELEVANT_PACKAGES = {
+ "torch",
+ "transformers",
+ "trl",
+ "datasets",
+ "peft",
+ "bitsandbytes",
+ "accelerate",
+ "optimum",
+ "deepspeed",
+ "ray",
+ "axolotl",
+ "triton",
+ "mamba-ssm",
+ "flash-attn",
+ "xformers",
+ "autoawq",
+ "tokenizers",
+ "sentencepiece",
+ "torchao",
+ "lm_eval",
+}
class TelemetryManager:
@@ -78,7 +103,13 @@ class TelemetryManager:
if self.enabled:
self.run_id = str(uuid.uuid4())
self.whitelist = self._load_whitelist()
- self.system_info = self._get_system_info()
+
+ try:
+ self.system_info = self._get_system_info()
+ except Exception as e: # pylint: disable=broad-exception-caught
+ LOG.warning(f"Error during system info collection: {e}")
+ self.system_info = None
+
self._init_posthog()
# Register shutdown method to flush posthog telemetry
@@ -174,9 +205,6 @@ class TelemetryManager:
if not properties:
return {}
- # NOTE: Keep this up to date with any config schema changes
- path_indicators = {"path", "dir"}
-
def redact_value(value: Any, key: str = "") -> Any:
"""Recursively sanitize values, redacting those with path-like keys"""
if isinstance(key, str) and isinstance(value, str):
@@ -190,7 +218,7 @@ class TelemetryManager:
if (
key in FIELDS_TO_REDACT
or any(prefix in key for prefix in PREFIXES_TO_REDACT)
- or any(indicator in key.lower() for indicator in path_indicators)
+ or any(indicator in key.lower() for indicator in PATH_INDICATORS)
):
return "[REDACTED]"
@@ -208,27 +236,100 @@ class TelemetryManager:
return redacted
def _get_system_info(self) -> dict[str, Any]:
- """Collect system information"""
+ """Collect system information for various hardware accelerators"""
gpu_info = []
+ accelerator_type = "none"
+
+ # NVIDIA GPUs
if torch.cuda.is_available():
+ accelerator_type = "cuda"
for i in range(torch.cuda.device_count()):
gpu_info.append(
{
"name": torch.cuda.get_device_name(i),
"memory": torch.cuda.get_device_properties(i).total_memory,
+ "type": "cuda",
}
)
+ # AMD GPUs
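+    # NOTE: ROCm builds of PyTorch usually expose AMD GPUs through the CUDA
+    # APIs (torch.cuda.is_available() with torch.version.hip set), so this
+    # branch is a defensive fallback rather than the common AMD path.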
+ elif hasattr(torch, "hip") and torch.hip.is_available():
+ accelerator_type = "hip"
+ for i in range(torch.hip.device_count()):
+ gpu_info.append(
+ {
+ "name": torch.hip.get_device_name(i),
+ "memory": torch.hip.get_device_properties(i).total_memory
+ if hasattr(torch.hip, "get_device_properties")
+ else None,
+ "type": "hip",
+ }
+ )
+
+ # Apple Silicon
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+ accelerator_type = "mps"
+ gpu_info.append(
+ {
+ "name": "Apple Silicon",
+ # NOTE: this is memory allocated to this process, not total memory
+ "memory": torch.mps.driver_allocated_memory(),
+ "type": "mps",
+ }
+ )
+
+ # Intel GPUs
+ elif hasattr(torch, "xpu") and torch.xpu.is_available():
+ accelerator_type = "xpu"
+ for i in range(torch.xpu.device_count()):
+ memory = None
+ if hasattr(torch.xpu, "get_device_properties"):
+ memory = torch.xpu.get_device_properties(i).total_memory
+
+ gpu_info.append(
+ {
+ "name": torch.xpu.get_device_name(i),
+ "memory": memory,
+ "type": "xpu",
+ }
+ )
+
+ # NPUs
+ elif hasattr(torch, "npu") and torch.npu.is_available():
+ accelerator_type = "npu"
+ for i in range(torch.npu.device_count()):
+ name = getattr(torch.npu, "get_device_name", lambda x: "NPU")(i)
+
+ memory = None
+ if hasattr(torch.npu, "get_device_properties"):
+ memory = torch.npu.get_device_properties(i).total_memory
+
+ gpu_info.append(
+ {
+ "name": name,
+ "memory": memory,
+ "type": "npu",
+ }
+ )
+
+ # Get relevant package versions
+ installed_packages = {}
+ for package in RELEVANT_PACKAGES:
+ try:
+ version = importlib.metadata.version(package)
+ installed_packages[f"{package}_version"] = version
+ except importlib.metadata.PackageNotFoundError:
+ pass
+
return {
"os": platform.system(),
"python_version": platform.python_version(),
- "pytorch_version": torch.__version__,
- "transformers_version": transformers.__version__,
- "axolotl_version": axolotl.__version__,
"cpu_count": psutil.cpu_count(),
"memory_total": psutil.virtual_memory().total,
- "gpu_count": len(gpu_info),
- "gpu_info": gpu_info,
+ "accelerator_type": accelerator_type,
+ "accelerator_count": len(gpu_info),
+ "accelerator_info": gpu_info,
+ **installed_packages,
}
def send_event(self, event_type: str, properties: dict[str, Any] | None = None):
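
For reference, a hypothetical example of the payload `_get_system_info` now assembles (all values illustrative, not taken from a real run):

system_info = {
    "os": "Linux",
    "python_version": "3.11.9",
    "cpu_count": 32,
    "memory_total": 540_672_331_776,
    "accelerator_type": "cuda",
    "accelerator_count": 2,
    "accelerator_info": [
        {"name": "NVIDIA A100-SXM4-80GB", "memory": 85_899_345_920, "type": "cuda"},
        {"name": "NVIDIA A100-SXM4-80GB", "memory": 85_899_345_920, "type": "cuda"},
    ],
    # One "<package>_version" entry per installed package in RELEVANT_PACKAGES:
    "torch_version": "2.4.0",
    "transformers_version": "4.44.2",
    "axolotl_version": "0.8.0.dev0",
}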
diff --git a/src/axolotl/telemetry/runtime_metrics.py b/src/axolotl/telemetry/runtime_metrics.py
index d0f52b88b..229c74edf 100644
--- a/src/axolotl/telemetry/runtime_metrics.py
+++ b/src/axolotl/telemetry/runtime_metrics.py
@@ -112,6 +112,8 @@ class RuntimeMetrics:
class RuntimeMetricsTracker:
"""Tracker for runtime metrics during training."""
+ update_interval = 100
+
def __init__(self):
"""Initialize the runtime metrics tracker."""
self.metrics = RuntimeMetrics(start_time=time.time())
@@ -132,23 +134,62 @@ class RuntimeMetricsTracker:
self.metrics.current_step = step
self.metrics.total_steps += 1
- # Periodically update memory metrics (e.g., every 100 steps)
- if step % 100 == 0:
+ # Periodically update memory metrics
+ if step % self.update_interval == 0:
self.update_memory_metrics()
+ def _get_allocated_memory(self) -> dict[int, int]:
+ """
+ Helper function for getting accelerator-agnostic allocated memory.
+
+ Returns:
+ A dictionary mapping device IDs to allocated memory in bytes
+ """
+ memory_used: dict[int, int] = {}
+
+ # NVIDIA GPUs
+ if torch.cuda.is_available():
+ for i in range(torch.cuda.device_count()):
+ memory_used[i] = torch.cuda.memory_allocated(i)
+
+ # AMD GPUs
+ elif hasattr(torch, "hip") and torch.hip.is_available():
+ for i in range(torch.hip.device_count()):
+ if hasattr(torch.hip, "memory_allocated"):
+ memory_used[i] = torch.hip.memory_allocated(i)
+
+ # Apple Silicon
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+ # MPS doesn't have per-device memory stats since there's only one device
+ if hasattr(torch.mps, "current_allocated_memory"):
+ memory_used[0] = torch.mps.current_allocated_memory()
+
+ # Intel GPUs
+ elif hasattr(torch, "xpu") and torch.xpu.is_available():
+ for i in range(torch.xpu.device_count()):
+ if hasattr(torch.xpu, "memory_allocated"):
+ memory_used[i] = torch.xpu.memory_allocated(i)
+
+ # NPUs
+ elif hasattr(torch, "npu") and torch.npu.is_available():
+ for i in range(torch.npu.device_count()):
+ if hasattr(torch.npu, "memory_allocated"):
+ memory_used[i] = torch.npu.memory_allocated(i)
+
+ return memory_used
+
def update_memory_metrics(self):
"""Update peak memory usage metrics."""
# CPU memory
cpu_memory = psutil.Process().memory_info().rss
self.metrics.peak_cpu_memory = max(self.metrics.peak_cpu_memory, cpu_memory)
- # GPU memory if available
- if torch.cuda.is_available():
- for i in range(torch.cuda.device_count()):
- memory_used = torch.cuda.memory_allocated(i)
- self.metrics.peak_gpu_memory[i] = max(
- self.metrics.peak_gpu_memory.get(i, 0), memory_used
- )
+ # GPU memory (if available)
+ memory_used = self._get_allocated_memory()
+ for i, memory in memory_used.items():
+ self.metrics.peak_gpu_memory[i] = max(
+ self.metrics.peak_gpu_memory.get(i, 0), memory
+ )
def get_memory_metrics(self) -> dict[str, Any]:
"""Get the current memory metrics as a dictionary."""
@@ -157,11 +198,12 @@ class RuntimeMetricsTracker:
"peak_cpu_memory_bytes": self.metrics.peak_cpu_memory,
}
- if torch.cuda.is_available():
- for i in range(torch.cuda.device_count()):
- memory_metrics[f"gpu_{i}_memory_bytes"] = torch.cuda.memory_allocated(i)
- memory_metrics[
- f"gpu_{i}_peak_memory_bytes"
- ] = self.metrics.peak_gpu_memory.get(i, 0)
+ # GPU memory (if available)
+ memory_used = self._get_allocated_memory()
+ for i, memory in memory_used.items():
+ memory_metrics[f"gpu_{i}_memory_bytes"] = memory
+ memory_metrics[
+ f"gpu_{i}_peak_memory_bytes"
+ ] = self.metrics.peak_gpu_memory.get(i, 0)
- return {"memory": memory_metrics}
+ return memory_metrics
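
Since `get_memory_metrics` now returns a flat dict (the `TelemetryCallback` adds the `"memory"` wrapper itself, per the callbacks.py hunk above), the shape is roughly as follows, with one key pair per detected device and illustrative values:

memory_metrics = {
    "cpu_memory_bytes": 1_073_741_824,
    "peak_cpu_memory_bytes": 2_147_483_648,
    "gpu_0_memory_bytes": 1_073_741_824,
    "gpu_0_peak_memory_bytes": 3_221_225_472,
}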
diff --git a/src/axolotl/telemetry/whitelist.yaml b/src/axolotl/telemetry/whitelist.yaml
index ab13522f5..88d72054e 100644
--- a/src/axolotl/telemetry/whitelist.yaml
+++ b/src/axolotl/telemetry/whitelist.yaml
@@ -1,9 +1,15 @@
organizations:
+ - "axolotl-ai-co"
- "meta-llama"
- "huggingface"
- "nvidia"
- "facebook"
+ - "google"
+ - "microsoft"
+ - "deepseek-ai"
+ - "HuggingFaceTB"
- "mistralai"
+ - "Qwen"
- "briaai"
- "unsloth"
- "NousResearch"
diff --git a/src/axolotl/train.py b/src/axolotl/train.py
index b6d4304c7..45b35bebb 100644
--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -1,5 +1,6 @@
"""Prepare and train a model on a dataset. Can also infer from a model or merge lora"""
+import importlib.metadata
import inspect
import os
import signal
@@ -13,7 +14,6 @@ import transformers.modelcard
from accelerate.logging import get_logger
from accelerate.utils import save_fsdp_model
from peft import PeftModel
-from pkg_resources import get_distribution # type: ignore
from transformers import PreTrainedModel, PreTrainedTokenizer
from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
@@ -50,7 +50,7 @@ def train(
) -> Tuple[Union[PeftModel, PreTrainedModel], PreTrainedTokenizer]:
# Load tokenizer
LOG.debug(
- f"loading tokenizer... {cfg.tokenizer_cocnfig or cfg.base_model_config}",
+ f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}",
main_process_only=True,
)
tokenizer = load_tokenizer(cfg)
@@ -179,7 +179,7 @@ def train(
if getattr(cfg, "axolotl_config_path"):
raw_axolotl_cfg = Path(cfg.axolotl_config_path)
- version = get_distribution("axolotl").version
+ version = importlib.metadata.version("axolotl")
if raw_axolotl_cfg.is_file():
             transformers.modelcard.AUTOGENERATED_TRAINER_COMMENT += f"\n<details><summary>See axolotl config</summary>\n\naxolotl version: `{version}`\n```yaml\n{raw_axolotl_cfg.read_text(encoding='utf-8')}\n```\n\n</details><br>\n"
diff --git a/tests/telemetry/test_manager.py b/tests/telemetry/test_manager.py
index b813b8892..441d94a10 100644
--- a/tests/telemetry/test_manager.py
+++ b/tests/telemetry/test_manager.py
@@ -151,12 +151,12 @@ def test_system_info_collection(manager):
# Check essential keys
assert "os" in system_info
assert "python_version" in system_info
- assert "pytorch_version" in system_info
+ assert "torch_version" in system_info
assert "transformers_version" in system_info
assert "axolotl_version" in system_info
assert "cpu_count" in system_info
assert "memory_total" in system_info
- assert "gpu_count" in system_info
+ assert "accelerator_count" in system_info
def test_send_event(manager):
diff --git a/tests/telemetry/test_runtime_metrics.py b/tests/telemetry/test_runtime_metrics.py
index 11c7faf98..5a6ef5a08 100644
--- a/tests/telemetry/test_runtime_metrics.py
+++ b/tests/telemetry/test_runtime_metrics.py
@@ -331,30 +331,26 @@ class TestRuntimeMetricsTracker:
}
# Get memory metrics
- result = tracker.get_memory_metrics()
-
- # Verify structure
- assert "memory" in result
- memory = result["memory"]
+ memory_metrics = tracker.get_memory_metrics()
# Verify CPU memory
assert (
- memory["cpu_memory_bytes"] == 1 * 1024 * 1024 * 1024
+ memory_metrics["cpu_memory_bytes"] == 1 * 1024 * 1024 * 1024
) # Current value from mock
assert (
- memory["peak_cpu_memory_bytes"] == 2 * 1024 * 1024 * 1024
+ memory_metrics["peak_cpu_memory_bytes"] == 2 * 1024 * 1024 * 1024
) # Peak value we set
# Verify GPU memory
assert (
- memory["gpu_0_memory_bytes"] == 1 * 1024 * 1024 * 1024
+ memory_metrics["gpu_0_memory_bytes"] == 1 * 1024 * 1024 * 1024
) # Current value from mock
assert (
- memory["gpu_0_peak_memory_bytes"] == 3 * 1024 * 1024 * 1024
+ memory_metrics["gpu_0_peak_memory_bytes"] == 3 * 1024 * 1024 * 1024
) # Peak value we set
assert (
- memory["gpu_1_memory_bytes"] == 2 * 1024 * 1024 * 1024
+ memory_metrics["gpu_1_memory_bytes"] == 2 * 1024 * 1024 * 1024
) # Current value from mock
assert (
- memory["gpu_1_peak_memory_bytes"] == 4 * 1024 * 1024 * 1024
+ memory_metrics["gpu_1_peak_memory_bytes"] == 4 * 1024 * 1024 * 1024
) # Peak value we set