feat: improve telemetry log (#3398)
* fix: redact trackio and data_files
* fix: add new orgs to whitelist
* feat: add run id to logs for users to easily share
* fix: update to add more metrics
* fix: add missed experiment tracker
* chore: formatting in main
This commit is contained in:
@@ -153,13 +153,27 @@ class TelemetryCallback(TrainerCallback):
|
|||||||
self.last_report_step = step
|
self.last_report_step = step
|
||||||
|
|
||||||
def _extract_last_metrics(self, state: TrainerState) -> dict:
|
def _extract_last_metrics(self, state: TrainerState) -> dict:
|
||||||
"""Extract last loss, learning_rate, and grad_norm from log history."""
|
"""Extract last loss, learning_rate, grad_norm, and token metrics from log history."""
|
||||||
if not state.log_history:
|
if not state.log_history:
|
||||||
return {"loss": 0, "learning_rate": 0, "grad_norm": 0}
|
return {
|
||||||
|
"loss": 0,
|
||||||
|
"ppl": 0,
|
||||||
|
"learning_rate": 0,
|
||||||
|
"grad_norm": 0,
|
||||||
|
"tokens/total": 0,
|
||||||
|
"tokens/trainable": 0,
|
||||||
|
"tokens/train_per_sec_per_gpu": 0,
|
||||||
|
}
|
||||||
|
|
||||||
last_log = state.log_history[-1]
|
last_log = state.log_history[-1]
|
||||||
return {
|
return {
|
||||||
"loss": last_log.get("loss", 0),
|
"loss": last_log.get("loss", 0),
|
||||||
|
"ppl": last_log.get("ppl", 0),
|
||||||
"learning_rate": last_log.get("learning_rate", 0),
|
"learning_rate": last_log.get("learning_rate", 0),
|
||||||
"grad_norm": last_log.get("grad_norm", 0),
|
"grad_norm": last_log.get("grad_norm", 0),
|
||||||
|
"tokens/total": last_log.get("tokens/total", 0),
|
||||||
|
"tokens/trainable": last_log.get("tokens/trainable", 0),
|
||||||
|
"tokens/train_per_sec_per_gpu": last_log.get(
|
||||||
|
"tokens/train_per_sec_per_gpu", 0
|
||||||
|
),
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -155,6 +155,10 @@ def send_errors(func: Callable) -> Callable:
|
|||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
LOG.error(
|
||||||
|
f"Error captured in telemetry. Run ID: {telemetry_manager.run_id}"
|
||||||
|
)
|
||||||
|
|
||||||
raise
|
raise
|
||||||
|
|
||||||
return wrapper
|
return wrapper
|
||||||
|
|||||||
@@ -30,8 +30,8 @@ FIELDS_TO_REDACT = {
|
|||||||
"resume_from_checkpoint",
|
"resume_from_checkpoint",
|
||||||
"hub_model_id",
|
"hub_model_id",
|
||||||
}
|
}
|
||||||
PREFIXES_TO_REDACT = {"wandb_", "comet_", "mlflow_", "gradio_"}
|
PREFIXES_TO_REDACT = {"wandb_", "comet_", "mlflow_", "gradio_", "trackio_", "swanlab_"}
|
||||||
PATH_INDICATORS = {"path", "dir"}
|
PATH_INDICATORS = {"path", "dir", "data_files"}
|
||||||
|
|
||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
RELEVANT_PACKAGES = {
|
RELEVANT_PACKAGES = {
|
||||||
|
|||||||
@@ -31,3 +31,10 @@ organizations:
|
|||||||
- "mistral-community"
|
- "mistral-community"
|
||||||
- "llava-hf"
|
- "llava-hf"
|
||||||
- "ByteDance-Seed"
|
- "ByteDance-Seed"
|
||||||
|
- "ACE-Step"
|
||||||
|
- "openbmb"
|
||||||
|
- "MiniMaxAI"
|
||||||
|
- "stepfun-ai"
|
||||||
|
- "internlm"
|
||||||
|
- "katanemo"
|
||||||
|
- "XiaomiMiMo"
|
||||||
|
|||||||
@@ -1,10 +1,12 @@
|
|||||||
"""helper functions for datasets"""
|
"""helper functions for datasets"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from axolotl.utils.logging import get_logger
|
from axolotl.utils.logging import get_logger
|
||||||
|
|
||||||
LOG = get_logger(__name__)
|
LOG = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def get_default_process_count():
|
def get_default_process_count():
|
||||||
if axolotl_dataset_num_proc := os.environ.get("AXOLOTL_DATASET_NUM_PROC"):
|
if axolotl_dataset_num_proc := os.environ.get("AXOLOTL_DATASET_NUM_PROC"):
|
||||||
return int(axolotl_dataset_num_proc)
|
return int(axolotl_dataset_num_proc)
|
||||||
|
|||||||
Reference in New Issue
Block a user