This commit is contained in:
Dan Saunders
2025-06-05 23:41:46 +00:00
parent 25fa4df70f
commit f0dde8e2d5
14 changed files with 81 additions and 1465 deletions

View File

@@ -11,12 +11,11 @@ from accelerate.logging import get_logger
from datasets import Dataset
from transformers.trainer import Trainer
from axolotl.telemetry.errors import send_errors
from axolotl.train import (
TrainDatasetMeta,
setup_model_and_tokenizer,
)
from axolotl.telemetry.errors import send_errors
from axolotl.train import TrainDatasetMeta
from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import cleanup_distributed
from axolotl.utils.trainer import setup_trainer

View File

@@ -19,6 +19,7 @@ from peft import (
from transformers import PreTrainedModel
from axolotl.loaders.utils import get_linear_embedding_layers
from axolotl.telemetry.errors import send_errors
from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger

View File

@@ -46,6 +46,7 @@ from axolotl.loaders.utils import (
load_model_config,
)
from axolotl.models.mamba import fix_mamba_attn_for_loss
from axolotl.telemetry.errors import send_errors
from axolotl.utils.bench import log_gpu_memory_usage
from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import (

View File

@@ -8,6 +8,7 @@ from transformers import (
PreTrainedTokenizerBase,
)
from axolotl.telemetry.errors import send_errors
from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger

View File

@@ -12,6 +12,7 @@ from transformers import (
from axolotl.integrations.base import PluginManager
from axolotl.loaders.utils import get_linear_embedding_layers, load_model_config
from axolotl.prompt_tokenizers import LLAMA_DEFAULT_EOS_TOKEN
from axolotl.telemetry.errors import send_errors
from axolotl.utils.chat_templates import get_chat_template_from_config
from axolotl.utils.distributed import (
barrier,

View File

@@ -59,12 +59,14 @@ class TelemetryCallback(TrainerCallback):
self.telemetry_manager.send_event(
event_type="train-end",
properties={
"loss": state.log_history[-1].get("loss", 0)
if state.log_history
else None,
"learning_rate": state.log_history[-1].get("learning_rate", 0)
if state.log_history
else None,
"loss": (
state.log_history[-1].get("loss", 0) if state.log_history else None
),
"learning_rate": (
state.log_history[-1].get("learning_rate", 0)
if state.log_history
else None
),
}
| self.tracker.metrics.to_dict(),
)

View File

@@ -307,9 +307,11 @@ class TelemetryManager:
gpu_info.append(
{
"name": torch.hip.get_device_name(i),
"memory": torch.hip.get_device_properties(i).total_memory
if hasattr(torch.hip, "get_device_properties")
else None,
"memory": (
torch.hip.get_device_properties(i).total_memory
if hasattr(torch.hip, "get_device_properties")
else None
),
}
)

View File

@@ -202,8 +202,8 @@ class RuntimeMetricsTracker:
memory_used = self._get_allocated_memory()
for i, memory in memory_used.items():
memory_metrics[f"gpu_{i}_memory_bytes"] = memory
memory_metrics[
f"gpu_{i}_peak_memory_bytes"
] = self.metrics.peak_gpu_memory.get(i, 0)
memory_metrics[f"gpu_{i}_peak_memory_bytes"] = (
self.metrics.peak_gpu_memory.get(i, 0)
)
return memory_metrics

View File

@@ -33,7 +33,6 @@ from axolotl.loaders import (
load_tokenizer,
)
from axolotl.telemetry.errors import send_errors
from axolotl.telemetry.errors import send_errors
from axolotl.telemetry.manager import TelemetryManager
from axolotl.utils.ctx_managers.sequence_parallel import SequenceParallelContextManager
from axolotl.utils.dict import DictDefault
@@ -91,11 +90,11 @@ def setup_model_and_tokenizer(
if model.generation_config is not None:
model.generation_config.do_sample = True
TELEMETRY_MANAGER.track_event(
TELEMETRY_MANAGER.send_event(
event_type="model-load", properties=model.config.to_dict()
)
if peft_config:
TELEMETRY_MANAGER.track_event(
TELEMETRY_MANAGER.send_event(
event_type="peft-config-load", properties=peft_config.to_dict()
)

File diff suppressed because it is too large Load Diff