From 06ac407b92168f79c512df33d42387b6af5aa7f9 Mon Sep 17 00:00:00 2001
From: NanoCode012
Date: Tue, 10 Feb 2026 23:01:34 +0700
Subject: [PATCH] feat: improve telemetry log (#3398)

* fix: redact trackio and data_files

* fix: add new orgs to whitelist

* feat: add run id to logs for users to easily share

* fix: update to add more metrics

* fix: add missed experiment tracker

* chore: formatting in main
---
 src/axolotl/telemetry/callbacks.py   | 18 ++++++++++++++++--
 src/axolotl/telemetry/errors.py      |  4 ++++
 src/axolotl/telemetry/manager.py     |  4 ++--
 src/axolotl/telemetry/whitelist.yaml |  7 +++++++
 src/axolotl/utils/datasets.py        |  2 ++
 5 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/src/axolotl/telemetry/callbacks.py b/src/axolotl/telemetry/callbacks.py
index 0ce52ffa4..1c13bf0cd 100644
--- a/src/axolotl/telemetry/callbacks.py
+++ b/src/axolotl/telemetry/callbacks.py
@@ -153,13 +153,27 @@ class TelemetryCallback(TrainerCallback):
         self.last_report_step = step
 
     def _extract_last_metrics(self, state: TrainerState) -> dict:
-        """Extract last loss, learning_rate, and grad_norm from log history."""
+        """Extract last loss, learning_rate, grad_norm, and token metrics from log history."""
         if not state.log_history:
-            return {"loss": 0, "learning_rate": 0, "grad_norm": 0}
+            return {
+                "loss": 0,
+                "ppl": 0,
+                "learning_rate": 0,
+                "grad_norm": 0,
+                "tokens/total": 0,
+                "tokens/trainable": 0,
+                "tokens/train_per_sec_per_gpu": 0,
+            }
 
         last_log = state.log_history[-1]
         return {
             "loss": last_log.get("loss", 0),
+            "ppl": last_log.get("ppl", 0),
             "learning_rate": last_log.get("learning_rate", 0),
             "grad_norm": last_log.get("grad_norm", 0),
+            "tokens/total": last_log.get("tokens/total", 0),
+            "tokens/trainable": last_log.get("tokens/trainable", 0),
+            "tokens/train_per_sec_per_gpu": last_log.get(
+                "tokens/train_per_sec_per_gpu", 0
+            ),
         }
diff --git a/src/axolotl/telemetry/errors.py b/src/axolotl/telemetry/errors.py
index 27f2d2192..a0c868235 100644
--- a/src/axolotl/telemetry/errors.py
+++ b/src/axolotl/telemetry/errors.py
@@ -155,6 +155,10 @@ def send_errors(func: Callable) -> Callable:
                     },
                 )
 
+                LOG.error(
+                    f"Error captured in telemetry. Run ID: {telemetry_manager.run_id}"
+                )
+
             raise
 
     return wrapper
diff --git a/src/axolotl/telemetry/manager.py b/src/axolotl/telemetry/manager.py
index 46ef389aa..0774b68a8 100644
--- a/src/axolotl/telemetry/manager.py
+++ b/src/axolotl/telemetry/manager.py
@@ -30,8 +30,8 @@ FIELDS_TO_REDACT = {
     "resume_from_checkpoint",
     "hub_model_id",
 }
-PREFIXES_TO_REDACT = {"wandb_", "comet_", "mlflow_", "gradio_"}
-PATH_INDICATORS = {"path", "dir"}
+PREFIXES_TO_REDACT = {"wandb_", "comet_", "mlflow_", "gradio_", "trackio_", "swanlab_"}
+PATH_INDICATORS = {"path", "dir", "data_files"}
 
 # pylint: disable=duplicate-code
 RELEVANT_PACKAGES = {
diff --git a/src/axolotl/telemetry/whitelist.yaml b/src/axolotl/telemetry/whitelist.yaml
index 6c94d6e79..c75ee0fec 100644
--- a/src/axolotl/telemetry/whitelist.yaml
+++ b/src/axolotl/telemetry/whitelist.yaml
@@ -31,3 +31,10 @@ organizations:
   - "mistral-community"
   - "llava-hf"
   - "ByteDance-Seed"
+  - "ACE-Step"
+  - "openbmb"
+  - "MiniMaxAI"
+  - "stepfun-ai"
+  - "internlm"
+  - "katanemo"
+  - "XiaomiMiMo"
diff --git a/src/axolotl/utils/datasets.py b/src/axolotl/utils/datasets.py
index 19ad71640..7beeb2733 100644
--- a/src/axolotl/utils/datasets.py
+++ b/src/axolotl/utils/datasets.py
@@ -1,10 +1,12 @@
 """helper functions for datasets"""
 
 import os
+
 from axolotl.utils.logging import get_logger
 
 LOG = get_logger(__name__)
 
+
 def get_default_process_count():
     if axolotl_dataset_num_proc := os.environ.get("AXOLOTL_DATASET_NUM_PROC"):
         return int(axolotl_dataset_num_proc)