Distributed Muon Optimizer (#3264)
* init * working * updating configs * removing unneeded files * lint * comments * lint * fix regex match * bump contribs version * comments * fixing tests and imports * muon imports in test v2 * test cleanup * bump contribs version --------- Co-authored-by: Salman Mohammadi <“salman.mohammadi@outlook.com”>
This commit is contained in:
@@ -281,11 +281,22 @@ class TrainerBuilderBase(abc.ABC):
|
||||
adam_kwargs["eps"] = training_args_kwargs.get("adam_epsilon")
|
||||
|
||||
if self.cfg.optimizer == "muon":
|
||||
from axolotl.contribs.mit.muon import (
|
||||
MuonOptimizerFactory,
|
||||
)
|
||||
_, device_mesh = build_parallelism_config(self.cfg)
|
||||
|
||||
if device_mesh is not None:
|
||||
from axolotl.contribs.mit.muon.dist_muon import (
|
||||
DistMuonOptimizerFactory,
|
||||
)
|
||||
|
||||
optimizer_cls = DistMuonOptimizerFactory
|
||||
optimizer_kwargs["device_mesh"] = device_mesh
|
||||
else:
|
||||
from axolotl.contribs.mit.muon import (
|
||||
MuonOptimizerFactory,
|
||||
)
|
||||
|
||||
optimizer_cls = MuonOptimizerFactory
|
||||
|
||||
optimizer_cls = MuonOptimizerFactory
|
||||
optimizer_kwargs.update(adam_kwargs)
|
||||
elif self.cfg.optimizer == "dion":
|
||||
from axolotl.contribs.mit.dion import (
|
||||
|
||||
@@ -751,12 +751,19 @@ class OptimizationValidationMixin:
|
||||
@model_validator(mode="before")
|
||||
@classmethod
|
||||
def check_muon_deepspeed_fsdp(cls, data):
|
||||
if data.get("optimizer") == "muon" and (
|
||||
data.get("deepspeed") or data.get("fsdp") or data.get("fsdp_config")
|
||||
):
|
||||
raise ValueError(
|
||||
"Muon optimizer is currently incompatible with DeepSpeed and FSDP"
|
||||
)
|
||||
if data.get("optimizer") == "muon":
|
||||
if data.get("deepspeed"):
|
||||
raise ValueError(
|
||||
"Muon optimizer is currently incompatible with DeepSpeed"
|
||||
)
|
||||
if data.get("fsdp") or data.get("fsdp_config"):
|
||||
fsdp_version = data.get("fsdp_version")
|
||||
if fsdp_version is None:
|
||||
fsdp_version = data.get("fsdp_config", {}).get("fsdp_version", 1)
|
||||
if str(fsdp_version) != "2":
|
||||
raise ValueError(
|
||||
"Muon optimizer is only compatible with FSDP2. Set fsdp_version: 2 to use Muon with FSDP."
|
||||
)
|
||||
return data
|
||||
|
||||
@model_validator(mode="before")
|
||||
@@ -840,40 +847,6 @@ class OptimizationValidationMixin:
|
||||
|
||||
return data
|
||||
|
||||
@model_validator(mode="before")
|
||||
@classmethod
|
||||
def check_fsdp_version_in_fsdp_config(cls, data):
|
||||
fsdp_config = data.get("fsdp_config") or {}
|
||||
if fsdp_config and fsdp_config.get("fsdp_version"):
|
||||
LOG.warning(
|
||||
"Configuring `fsdp_version` in `fsdp_config` is deprecated. "
|
||||
"Please configure `fsdp_version` as a top-level field."
|
||||
)
|
||||
data["fsdp_version"] = fsdp_config.pop("fsdp_version")
|
||||
return data
|
||||
|
||||
@model_validator(mode="before")
|
||||
@classmethod
|
||||
def check_fsdp_config_kwargs_prefix(cls, data):
|
||||
if fsdp_config := data.get("fsdp_config"):
|
||||
should_fix = False
|
||||
for key, _ in fsdp_config.items():
|
||||
if key.startswith("fsdp_"):
|
||||
should_fix = True
|
||||
LOG.warning_once(
|
||||
"Configuring FSDP fields with the `fsdp_` prefix is deprecated. "
|
||||
"Please omit the `fsdp_` prefix from the any fields in `fsdp_config`."
|
||||
)
|
||||
if should_fix:
|
||||
update_fsdp_config = {}
|
||||
for key, value in fsdp_config.items():
|
||||
if key.startswith("fsdp_") and key != "fsdp_version":
|
||||
update_fsdp_config[key.replace("fsdp_", "")] = value
|
||||
else:
|
||||
update_fsdp_config[key] = value
|
||||
data["fsdp_config"] = update_fsdp_config
|
||||
return data
|
||||
|
||||
@model_validator(mode="after")
|
||||
def check_fsdp_offload_w_8bit_optimizer(self):
|
||||
if (
|
||||
@@ -975,6 +948,40 @@ class OptimizationValidationMixin:
|
||||
|
||||
return data
|
||||
|
||||
@model_validator(mode="before")
|
||||
@classmethod
|
||||
def check_fsdp_version_in_fsdp_config(cls, data):
|
||||
fsdp_config = data.get("fsdp_config") or {}
|
||||
if fsdp_config and fsdp_config.get("fsdp_version"):
|
||||
LOG.warning(
|
||||
"Configuring `fsdp_version` in `fsdp_config` is deprecated. "
|
||||
"Please configure `fsdp_version` as a top-level field."
|
||||
)
|
||||
data["fsdp_version"] = fsdp_config.pop("fsdp_version")
|
||||
return data
|
||||
|
||||
@model_validator(mode="before")
|
||||
@classmethod
|
||||
def check_fsdp_config_kwargs_prefix(cls, data):
|
||||
if fsdp_config := data.get("fsdp_config"):
|
||||
should_fix = False
|
||||
for key, _ in fsdp_config.items():
|
||||
if key.startswith("fsdp_"):
|
||||
should_fix = True
|
||||
LOG.warning_once(
|
||||
"Configuring FSDP fields with the `fsdp_` prefix is deprecated. "
|
||||
"Please omit the `fsdp_` prefix from the any fields in `fsdp_config`."
|
||||
)
|
||||
if should_fix:
|
||||
update_fsdp_config = {}
|
||||
for key, value in fsdp_config.items():
|
||||
if key.startswith("fsdp_") and key != "fsdp_version":
|
||||
update_fsdp_config[key.replace("fsdp_", "")] = value
|
||||
else:
|
||||
update_fsdp_config[key] = value
|
||||
data["fsdp_config"] = update_fsdp_config
|
||||
return data
|
||||
|
||||
|
||||
class SystemValidationMixin:
|
||||
"""Validation methods related to system and hardware configuration."""
|
||||
|
||||
Reference in New Issue
Block a user