Sequence parallelism quick follow-ups; remove ModelCallback (#2450)
* guard return if ring attn alrady registered * add docs link, bits in multi-gpu docs, remove save model callback (subsumed by HF trainers) * configurable heads_k_stride from ring-flash-attn hf adapter
This commit is contained in:
@@ -243,6 +243,7 @@ website:
|
|||||||
- docs/unsloth.qmd
|
- docs/unsloth.qmd
|
||||||
- docs/torchao.qmd
|
- docs/torchao.qmd
|
||||||
- docs/custom_integrations.qmd
|
- docs/custom_integrations.qmd
|
||||||
|
- docs/sequence_parallelism.qmd
|
||||||
|
|
||||||
- section: "Troubleshooting"
|
- section: "Troubleshooting"
|
||||||
contents:
|
contents:
|
||||||
|
|||||||
@@ -658,6 +658,9 @@ ddp_broadcast_buffers:
|
|||||||
# subsequences, or set to 4 to split into four equal-sized subsequences.
|
# subsequences, or set to 4 to split into four equal-sized subsequences.
|
||||||
# See https://axolotl-ai-cloud.github.io/axolotl/docs/sequence_parallelism.html for more details.
|
# See https://axolotl-ai-cloud.github.io/axolotl/docs/sequence_parallelism.html for more details.
|
||||||
sequence_parallel_degree:
|
sequence_parallel_degree:
|
||||||
|
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
|
||||||
|
# Must evenly divide the number of KV heads in your model.
|
||||||
|
heads_k_stride: 1
|
||||||
|
|
||||||
# Path to torch distx for optim 'adamw_anyprecision'
|
# Path to torch distx for optim 'adamw_anyprecision'
|
||||||
torchdistx_path:
|
torchdistx_path:
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ Axolotl supports several methods for multi-GPU training:
|
|||||||
|
|
||||||
- DeepSpeed (recommended)
|
- DeepSpeed (recommended)
|
||||||
- FSDP (Fully Sharded Data Parallel)
|
- FSDP (Fully Sharded Data Parallel)
|
||||||
|
- Sequence parallelism
|
||||||
- FSDP + QLoRA
|
- FSDP + QLoRA
|
||||||
|
|
||||||
## DeepSpeed {#sec-deepspeed}
|
## DeepSpeed {#sec-deepspeed}
|
||||||
@@ -66,6 +67,28 @@ fsdp_config:
|
|||||||
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
|
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Sequence parallelism {#sec-sequence-parallelism}
|
||||||
|
|
||||||
|
We support sequence parallelism (SP) via the
|
||||||
|
[ring-flash-attention](https://github.com/zhuzilin/ring-flash-attention) project. This
|
||||||
|
allows one to split up sequences across GPUs, which is useful in the event that a
|
||||||
|
single sequence causes OOM errors during model training.
|
||||||
|
|
||||||
|
First, install `ring-flash-attn`, recommended via `pip install axolotl[ring-flash-attn]`,
|
||||||
|
or from source with `pip install .[ring-flash-attn]`.
|
||||||
|
|
||||||
|
Your Axolotl YAML config should contain the following lines:
|
||||||
|
|
||||||
|
```{.yaml}
|
||||||
|
sequence_parallel_degree: 4 # Split each sequence into 4 parts, one per GPU
|
||||||
|
flash_attention: true # Required with sequence parallelism
|
||||||
|
|
||||||
|
# Optional; strides across the key dimension. Larger values use more memory but will make training faster.
|
||||||
|
heads_k_stride: 1
|
||||||
|
```
|
||||||
|
|
||||||
|
See our [dedicated guide](sequence_parallelism.qmd) for more details.
|
||||||
|
|
||||||
### FSDP + QLoRA {#sec-fsdp-qlora}
|
### FSDP + QLoRA {#sec-fsdp-qlora}
|
||||||
|
|
||||||
For combining FSDP with QLoRA, see our [dedicated guide](fsdp_qlora.qmd).
|
For combining FSDP with QLoRA, see our [dedicated guide](fsdp_qlora.qmd).
|
||||||
|
|||||||
@@ -25,6 +25,8 @@ To enable sequence parallelism, add the following to your configuration file:
|
|||||||
```yaml
|
```yaml
|
||||||
# Set to a divisor (> 1) of the number of GPUs available
|
# Set to a divisor (> 1) of the number of GPUs available
|
||||||
sequence_parallel_degree: 4 # Split sequences across 4 GPUs
|
sequence_parallel_degree: 4 # Split sequences across 4 GPUs
|
||||||
|
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
|
||||||
|
heads_k_stride: 1
|
||||||
```
|
```
|
||||||
|
|
||||||
The `sequence_parallel_degree` should be a divisor of the total number of GPUs. For example:
|
The `sequence_parallel_degree` should be a divisor of the total number of GPUs. For example:
|
||||||
@@ -58,11 +60,16 @@ To use sequence parallelism, you need:
|
|||||||
## Example
|
## Example
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
# Example config with sequence parallelism
|
|
||||||
base_model: meta-llama/Llama-3-8B-Instruct
|
base_model: meta-llama/Llama-3-8B-Instruct
|
||||||
sequence_len: 8192
|
sequence_len: 8192
|
||||||
sequence_parallel_degree: 2 # Split each sequence into 4 parts
|
|
||||||
|
...
|
||||||
|
|
||||||
|
sequence_parallel_degree: 4 # Split each sequence into 4 parts, one per GPU
|
||||||
flash_attention: true # Required with sequence parallelism
|
flash_attention: true # Required with sequence parallelism
|
||||||
|
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
|
||||||
|
heads_k_stride: 1
|
||||||
|
|
||||||
...
|
...
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -69,7 +69,6 @@ from axolotl.utils.callbacks import (
|
|||||||
LossWatchDogCallback,
|
LossWatchDogCallback,
|
||||||
SaveAxolotlConfigtoWandBCallback,
|
SaveAxolotlConfigtoWandBCallback,
|
||||||
SaveBetterTransformerModelCallback,
|
SaveBetterTransformerModelCallback,
|
||||||
SaveModelCallback,
|
|
||||||
bench_eval_callback_factory,
|
bench_eval_callback_factory,
|
||||||
causal_lm_bench_eval_callback_factory,
|
causal_lm_bench_eval_callback_factory,
|
||||||
log_prediction_callback_factory,
|
log_prediction_callback_factory,
|
||||||
@@ -249,7 +248,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
|||||||
|
|
||||||
if self.cfg.gc_steps:
|
if self.cfg.gc_steps:
|
||||||
callbacks.append(GCCallback(gc_steps=self.cfg.gc_steps))
|
callbacks.append(GCCallback(gc_steps=self.cfg.gc_steps))
|
||||||
callbacks.append(SaveModelCallback())
|
|
||||||
|
|
||||||
return callbacks
|
return callbacks
|
||||||
|
|
||||||
@@ -937,7 +935,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
|
|||||||
|
|
||||||
def get_callbacks(self):
|
def get_callbacks(self):
|
||||||
callbacks = super().get_callbacks()
|
callbacks = super().get_callbacks()
|
||||||
callbacks.append(SaveModelCallback())
|
|
||||||
|
|
||||||
return callbacks
|
return callbacks
|
||||||
|
|
||||||
|
|||||||
@@ -38,13 +38,19 @@ def set_ring_attn_group(ring_attn_group: dist.ProcessGroup | None):
|
|||||||
RING_ATTN_GROUP = ring_attn_group
|
RING_ATTN_GROUP = ring_attn_group
|
||||||
|
|
||||||
|
|
||||||
def register_ring_attn(sequence_parallel_degree: int):
|
def register_ring_attn(sequence_parallel_degree: int, heads_k_stride: int | None):
|
||||||
"""
|
"""
|
||||||
Create ring attention group and substitute flash attn with ring flash attn.
|
Create ring attention group and substitute flash attn with ring flash attn.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
sequence_parallel_degree: Sequence parallelism factor.
|
sequence_parallel_degree: Sequence parallelism factor.
|
||||||
|
heads_k_stride: Sequence parallelism K head stride size. Passed
|
||||||
|
through to `ring_flash_attn.substitute_hf_flash_attn`.
|
||||||
"""
|
"""
|
||||||
|
if get_ring_attn_group() is not None:
|
||||||
|
LOG.info("Ring attention already registered, exiting early...")
|
||||||
|
return
|
||||||
|
|
||||||
LOG.info(
|
LOG.info(
|
||||||
"Enabling ring attention sequence parallelism: "
|
"Enabling ring attention sequence parallelism: "
|
||||||
f"each sequence will be processed across {sequence_parallel_degree} GPUs"
|
f"each sequence will be processed across {sequence_parallel_degree} GPUs"
|
||||||
@@ -84,6 +90,11 @@ def register_ring_attn(sequence_parallel_degree: int):
|
|||||||
if rank == 0:
|
if rank == 0:
|
||||||
LOG.info(f"Sequence parallel group assignments: {group_assignments}")
|
LOG.info(f"Sequence parallel group assignments: {group_assignments}")
|
||||||
|
|
||||||
|
if heads_k_stride is None:
|
||||||
|
heads_k_stride = 1
|
||||||
|
|
||||||
from ring_flash_attn import substitute_hf_flash_attn
|
from ring_flash_attn import substitute_hf_flash_attn
|
||||||
|
|
||||||
substitute_hf_flash_attn(get_ring_attn_group(), sequence_parallel_degree)
|
substitute_hf_flash_attn(
|
||||||
|
process_group=get_ring_attn_group(), heads_k_stride=heads_k_stride
|
||||||
|
)
|
||||||
|
|||||||
@@ -816,27 +816,6 @@ class SaveAxolotlConfigtoWandBCallback(TrainerCallback):
|
|||||||
return control
|
return control
|
||||||
|
|
||||||
|
|
||||||
class SaveModelCallback(TrainerCallback):
|
|
||||||
"""Callback to save model on train end"""
|
|
||||||
|
|
||||||
def on_step_end( # pylint: disable=unused-argument
|
|
||||||
self,
|
|
||||||
args: TrainingArguments,
|
|
||||||
state: TrainerState,
|
|
||||||
control: TrainerControl,
|
|
||||||
**kwargs,
|
|
||||||
):
|
|
||||||
# Save
|
|
||||||
if state.global_step >= state.max_steps:
|
|
||||||
control.should_save = True
|
|
||||||
|
|
||||||
def on_train_end( # pylint: disable=unused-argument
|
|
||||||
self, args, state, control, **kwargs
|
|
||||||
):
|
|
||||||
control.should_save = True
|
|
||||||
return control
|
|
||||||
|
|
||||||
|
|
||||||
class GCCallback(TrainerCallback):
|
class GCCallback(TrainerCallback):
|
||||||
"""Callback to garbage collect torch cache"""
|
"""Callback to garbage collect torch cache"""
|
||||||
|
|
||||||
|
|||||||
@@ -609,7 +609,10 @@ class ModelLoader:
|
|||||||
# Initialize ring attn for sequence parallelism. This must be done after
|
# Initialize ring attn for sequence parallelism. This must be done after
|
||||||
# model init but before the first forward pass, since it modifies flash
|
# model init but before the first forward pass, since it modifies flash
|
||||||
# attn to use ring comm for SP training across multiple GPUs.
|
# attn to use ring comm for SP training across multiple GPUs.
|
||||||
register_ring_attn(self.cfg.sequence_parallel_degree)
|
register_ring_attn(
|
||||||
|
sequence_parallel_degree=self.cfg.sequence_parallel_degree,
|
||||||
|
heads_k_stride=self.cfg.heads_k_stride,
|
||||||
|
)
|
||||||
|
|
||||||
def patch_attention(self) -> None:
|
def patch_attention(self) -> None:
|
||||||
if hasattr(self.model_config, "model_type"):
|
if hasattr(self.model_config, "model_type"):
|
||||||
|
|||||||
@@ -248,6 +248,7 @@ class AxolotlInputConfig(
|
|||||||
val_set_size: float | None = Field(default=0.0)
|
val_set_size: float | None = Field(default=0.0)
|
||||||
|
|
||||||
sequence_parallel_degree: int | None = None
|
sequence_parallel_degree: int | None = None
|
||||||
|
heads_k_stride: int | None = None
|
||||||
|
|
||||||
special_tokens: SpecialTokensConfig | None = None
|
special_tokens: SpecialTokensConfig | None = None
|
||||||
tokens: list[str] | None = None
|
tokens: list[str] | None = None
|
||||||
@@ -1108,7 +1109,7 @@ class AxolotlInputConfig(
|
|||||||
|
|
||||||
@field_validator("sequence_parallel_degree", mode="before")
|
@field_validator("sequence_parallel_degree", mode="before")
|
||||||
@classmethod
|
@classmethod
|
||||||
def check_sequence_parallel_config(cls, value, info):
|
def check_sequence_parallel_degree(cls, value, info):
|
||||||
if not value:
|
if not value:
|
||||||
value = 1
|
value = 1
|
||||||
|
|
||||||
|
|||||||
@@ -110,7 +110,7 @@ class TestRingAttention:
|
|||||||
mock_new_group.return_value = mock_group
|
mock_new_group.return_value = mock_group
|
||||||
|
|
||||||
# Call register_ring_attn with size 4
|
# Call register_ring_attn with size 4
|
||||||
register_ring_attn(sequence_parallel_degree=4)
|
register_ring_attn(sequence_parallel_degree=4, heads_k_stride=1)
|
||||||
|
|
||||||
# Verify the number of calls without examining the arguments
|
# Verify the number of calls without examining the arguments
|
||||||
assert mock_new_group.call_count == 2
|
assert mock_new_group.call_count == 2
|
||||||
|
|||||||
Reference in New Issue
Block a user