log warning re: logged losses / gradient scaling per rank

2025-04-07 18:46:58 +00:00
parent c64c881460
commit 954b989e88
1 changed files with 12 additions and 0 deletions
--- a/src/axolotl/utils/schemas/config.py
+++ b/src/axolotl/utils/schemas/config.py
@@ -1171,6 +1171,18 @@ class AxolotlInputConfig(
                    "or `pip install ring-flash-attn>=0.1.4`."
                ) from exception
            # TODO: monkeypatch / callback to average losses correctly across SP ranks
            # / fix gradient scaling across SP ranks. Losses, grads should be scaled
            # according to the proportion of non-padding tokens per rank.
            LOG.warning(
                "Sequence parallelism (SP) is enabled with "
                f"sequence_parallel_degree={value}. Please note that logged losses may "
                "differ slightly to the non-SP losses due to transformers Trainer "
                "implementation details. Please see "
                "https://github.com/axolotl-ai-cloud/axolotl/pull/2495#issuecomment-2784022042 "
                "for more details."
            )
        return value
    @model_validator(mode="before")