Compare commits: preprocess...transforme

1 commit: 0aa7c72c59
.coveragerc (14 lines removed)

@@ -1,14 +0,0 @@
-[run]
-source = axolotl
-omit =
-    */tests/*
-    setup.py
-
-[report]
-exclude_lines =
-    pragma: no cover
-    def __repr__
-    raise NotImplementedError
-    if __name__ == .__main__.:
-    pass
-    raise ImportError
.github/workflows/main.yml (2 changes, vendored)

@@ -29,7 +29,7 @@ jobs:
             cuda_version: 12.4.1
             python_version: "3.11"
             pytorch: 2.6.0
-            axolotl_extras: vllm
+            axolotl_extras:
             is_latest: true
     runs-on: axolotl-gpu-runner
     steps:
.github/workflows/tests.yml (13 changes, vendored)

@@ -102,16 +102,9 @@ jobs:

       - name: Run tests
         run: |
-          pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/ --cov=axolotl --cov-report=xml
-          pytest -v tests/patched/ --cov=axolotl --cov-append --cov-report=xml
-          pytest -v tests/cli/ --cov=axolotl --cov-append --cov-report=xml
-
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v5
-        with:
-          files: ./coverage.xml
-          flags: unittests,pytorch-${{ matrix.pytorch_version }}
-          fail_ci_if_error: false
+          pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
+          pytest -v tests/patched/
+          pytest -v tests/cli/

       - name: cleanup pip cache
         run: |
@@ -9,7 +9,6 @@
 <p align="center">
     <img src="https://img.shields.io/github/license/axolotl-ai-cloud/axolotl.svg?color=blue" alt="GitHub License">
     <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests.yml/badge.svg" alt="tests">
-    <a href="https://codecov.io/gh/axolotl-ai-cloud/axolotl"><img src="https://codecov.io/gh/axolotl-ai-cloud/axolotl/branch/main/graph/badge.svg" alt="codecov"></a>
     <a href="https://github.com/axolotl-ai-cloud/axolotl/releases"><img src="https://img.shields.io/github/release/axolotl-ai-cloud/axolotl.svg" alt="Releases"></a>
     <br/>
     <a href="https://github.com/axolotl-ai-cloud/axolotl/graphs/contributors"><img src="https://img.shields.io/github/contributors-anon/axolotl-ai-cloud/axolotl?color=yellow&style=flat-square" alt="contributors" style="height: 20px;"></a>
cicd/cicd.sh (63 changes)

@@ -3,59 +3,10 @@ set -e

 python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"

-# Run unit tests with initial coverage report
-pytest -v --durations=10 -n8 \
-    --ignore=tests/e2e/ \
-    --ignore=tests/patched/ \
-    --ignore=tests/cli \
-    /workspace/axolotl/tests/ \
-    --cov=axolotl \
-    --cov-report=xml:coverage.xml
-
-# Run lora kernels tests with coverage append
-pytest -v --durations=10 \
-    /workspace/axolotl/tests/e2e/patched/lora_kernels \
-    --cov=axolotl \
-    --cov-append
-
-# Run patched tests excluding lora kernels with coverage append
-pytest -v --durations=10 \
-    --ignore=tests/e2e/patched/lora_kernels \
-    /workspace/axolotl/tests/e2e/patched \
-    --cov=axolotl \
-    --cov-append
-
-# Run solo tests with coverage append
-pytest -v --durations=10 -n1 \
-    /workspace/axolotl/tests/e2e/solo/ \
-    --cov=axolotl \
-    --cov-append
-
-# Run integration tests with coverage append
-pytest -v --durations=10 \
-    /workspace/axolotl/tests/e2e/integrations/ \
-    --cov=axolotl \
-    --cov-append
-
-pytest -v --durations=10 /workspace/axolotl/tests/cli \
-    --cov=axolotl \
-    --cov-append
-
-# Run remaining e2e tests with coverage append and final report
-pytest -v --durations=10 \
-    --ignore=tests/e2e/solo/ \
-    --ignore=tests/e2e/patched/ \
-    --ignore=tests/e2e/multigpu/ \
-    --ignore=tests/e2e/integrations/ \
-    --ignore=tests/cli \
-    /workspace/axolotl/tests/e2e/ \
-    --cov=axolotl \
-    --cov-append \
-    --cov-report=xml:coverage.xml
-
-# Upload coverage to Codecov
-if [ -f e2e-coverage.xml ]; then
-    codecov -f e2e-coverage.xml -F e2e,pytorch-${PYTORCH_VERSION}
-else
-    echo "Coverage file not found. Coverage report may have failed."
-fi
+pytest -v --durations=10 -n8 --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli /workspace/axolotl/tests/
+pytest -v --durations=10 /workspace/axolotl/tests/e2e/patched/lora_kernels  # running these with the other patches causes a failure
+pytest -v --durations=10 --ignore=tests/e2e/patched/lora_kernels /workspace/axolotl/tests/e2e/patched
+pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/solo/
+pytest -v --durations=10 /workspace/axolotl/tests/e2e/integrations/
+pytest -v --durations=10 /workspace/axolotl/tests/cli
+pytest -v --durations=10 --ignore=tests/e2e/solo/ --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ --ignore=tests/e2e/integrations/ --ignore=tests/cli /workspace/axolotl/tests/e2e/
@@ -4,22 +4,3 @@ set -e
 # only run one test at a time so as not to OOM the GPU
 pytest -v --durations=10 -n2 /workspace/axolotl/tests/e2e/multigpu/ --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/
 pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/solo/
-
-# Only run two tests at a time to avoid OOM on GPU (with coverage collection)
-pytest -v -n2 \
-    --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ \
-    /workspace/axolotl/tests/e2e/multigpu/ \
-    --cov=axolotl \
-    --cov-report=xml:multigpu-coverage.xml
-
-pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/solo/ \
-    --cov=axolotl \
-    --cov-append \
-    --cov-report=xml:multigpu-coverage.xml
-
-# Upload coverage to Codecov
-if [ -f multigpu-coverage.xml ]; then
-    codecov -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION}
-else
-    echo "Coverage file not found. Coverage report may have failed."
-fi
codecov.yml (51 lines removed)

@@ -1,51 +0,0 @@
-codecov:
-  require_ci_to_pass: yes
-
-coverage:
-  precision: 2
-  round: down
-  range: "70...100"
-  status:
-    project:
-      default:
-        # basic
-        target: auto
-        threshold: 0%
-        base: auto
-        # advanced
-        branches: null
-        if_no_uploads: error
-        if_not_found: success
-        if_ci_failed: error
-        only_pulls: false
-        flags: null
-        paths: null
-    patch:
-      default:
-        # basic
-        target: auto
-        threshold: 0%
-        base: auto
-        # advanced
-        branches: null
-        if_no_uploads: error
-        if_not_found: success
-        if_ci_failed: error
-        only_pulls: false
-        flags: null
-        paths: null
-
-parsers:
-  gcov:
-    branch_detection:
-      conditional: yes
-      loop: yes
-      method: no
-      macro: no
-
-comment:
-  layout: "reach,diff,flags,files,footer"
-  behavior: default
-  require_changes: no
-  require_base: no
-  require_head: yes
||||||
@@ -693,9 +693,6 @@ sequence_parallel_degree:
|
|||||||
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
|
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
|
||||||
# Must evenly divide the number of KV heads in your model.
|
# Must evenly divide the number of KV heads in your model.
|
||||||
heads_k_stride: 1
|
heads_k_stride: 1
|
||||||
# One of "varlen_llama3", "batch_ring", "batch_zigzag", "batch_stripe". Defaults to "varlen_llama3"
|
|
||||||
# in the sample packing case, and "batch_ring" in the non-sample packing case.
|
|
||||||
ring_attn_func:
|
|
||||||
|
|
||||||
# Path to torch distx for optim 'adamw_anyprecision'
|
# Path to torch distx for optim 'adamw_anyprecision'
|
||||||
torchdistx_path:
|
torchdistx_path:
|
||||||
|
|||||||
@@ -27,9 +27,6 @@ To enable sequence parallelism, add the following to your configuration file:
 sequence_parallel_degree: 4  # Split sequences across 4 GPUs
 # Optional; strides across the key dimension. Larger values use more memory but should make training faster.
 heads_k_stride: 1
-# Optional; one of "varlen_llama3", "batch_ring", "batch_zigzag", "batch_stripe". Defaults to
-# "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise.
-ring_attn_func:
 ```

 The `sequence_parallel_degree` should be a divisor of the total number of GPUs. For example:
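(Editor-added illustration, not part of the diffed documentation: the hunk above ends on its last context line. As a rough sketch of that divisor rule, mirroring the group-assignment loop in `register_ring_attn` shown later in this compare, with `world_size` GPUs you get `world_size // sequence_parallel_degree` sequence-parallel groups of consecutive ranks.)

# Illustrative sketch only: 8 GPUs with sequence_parallel_degree 4 -> 2 groups.
world_size, sequence_parallel_degree = 8, 4
assert world_size % sequence_parallel_degree == 0

group_assignments = {
    rank: group
    for group in range(world_size // sequence_parallel_degree)
    for rank in range(
        group * sequence_parallel_degree, (group + 1) * sequence_parallel_degree
    )
}
print(group_assignments)  # {0: 0, 1: 0, 2: 0, 3: 0, 4: 1, 5: 1, 6: 1, 7: 1}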
@@ -1,28 +1,16 @@
 # Llama 4 by Meta AI

-## Flash Attention vs Flex Attention
-
-While Flash Attention to support is "enabled" for Llama-4, the upstream implementation is not correct and usage of Flex Attention is recommended.
-
 ## Available Examples

 ### Llama 4 Scout 17Bx16Experts (109B)

-Flex Attention
-- [Text Single GPU (H100) QLoRA](./scout-qlora-single-h100-flex.yaml)
-- [Text Multi GPU QLoRA w/ FSDP2](./scout-qlora-flexattn-fsdp2.yaml)
-
-[//]: # (Flash Attention (Do not use))
-
-[//]: # (- [Multi-Modal/Vision QLoRA w/ FSDP1](./scout-vision-qlora-fsdp.yaml))
-
-[//]: # (- [Text Single GPU (H100) QLoRA](./scout-qlora-single-h100.yaml))
-
-[//]: # (- [Text Multi GPU QLoRA w/ FSDP1](./scout-qlora-fsdp1.yaml))
-
-Our Single H100 implementation for Llama 4 Scout uses only 64.5GB VRAM for post-training with 4k context length @ 519 tokens/second. [WandB logs here](https://wandb.ai/axolotl-ai/llama4-flexattn-qlora/runs/wpie7dkj)
-Multi-GPU (4xH100) for Llama 4 Scout uses 62.8GB VRAM/GPU @ 4k contenxt length @ 280tps/gpu, [WandB logs here](https://wandb.ai/axolotl-ai/llama4-flexattn-qlora/runs/2lkezdj8)
+- [Multi-Modal/Vision QLoRA w/ FSDP1](./scout-vision-qlora-fsdp.yaml)
+- [Text Single GPU (H100) QLoRA](./scout-qlora-single-h100.yaml)
+- [Text Multi GPU QLoRA w/ FSDP1](./scout-qlora-fsdp1.yaml)
+
+Our Single H100 implementation for Llama 4 Scout uses only 68.5GB VRAM for post-training with 4k context length @ 546 tokens/second. [WandB logs here](https://wandb.ai/axolotl-ai/llama4-sft/runs/zic56rhd)

 ### Llama 4 Maverick 17Bx128Experts (400B)

-Coming Soon
+- [Text Multi GPU QLoRA w/FSDP1](./maverick-qlora-fsdp1.yaml)
+
+Our 4xH100 implementation for Llama 4 Maverick uses 79.5GB VRAM/GPU for post-training with 4k context length @ 206 tokens/second. [WandB logs here.](https://wandb.ai/axolotl-ai/llama-sft/runs/siyvwuxc?nw=nwuserwinglian)
@@ -1,86 +0,0 @@
-base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
-model_type: Llama4ForConditionalGeneration
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-plugins:
-  - axolotl.integrations.liger.LigerPlugin
-
-liger_glu_activation: true
-liger_rms_norm: true
-liger_layer_norm: true
-
-llama4_linearized_experts: true
-load_in_4bit: true
-adapter: qlora
-lora_r: 32
-lora_alpha: 64
-lora_target_modules:
-  - self_attn.q_proj
-  - self_attn.k_proj
-  - self_attn.v_proj
-  - self_attn.o_proj
-  - shared_expert.gate_proj
-  - shared_expert.up_proj
-  - shared_expert.down_proj
-#  - experts.gate_projs.[0-9]+$
-#  - experts.up_projs.[0-9]+$
-#  - experts.down_projs.[0-9]+$
-lora_modules_to_save:
-#  - lm_head
-#  - embed_tokens
-
-chat_template: llama4
-datasets:
-  - path: mlabonne/FineTome-100k
-    type: chat_template
-    split: train[:20%]
-    field_messages: conversations
-    message_property_mappings:
-      role: from
-      content: value
-
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.0
-output_dir: ./outputs/out
-
-sequence_len: 4096
-sample_packing: true
-pad_to_sequence_len: true
-
-gradient_accumulation_steps: 1
-micro_batch_size: 2
-num_epochs: 3
-optimizer: adamw_torch_4bit
-lr_scheduler: cosine
-learning_rate: 1e-4
-
-bf16: true
-tf32: true
-
-logging_steps: 1
-flex_attention: true
-flex_attn_compile_kwargs:
-  dynamic: false
-  mode: max-autotune-no-cudagraphs
-
-warmup_steps: 10
-evals_per_epoch: 1
-saves_per_epoch: 1
-weight_decay: 0.0
-fsdp:
-  - auto_wrap
-  - full_shard
-fsdp_config:
-  fsdp_version: 2
-  fsdp_offload_params: false
-  fsdp_cpu_ram_efficient_loading: true
-  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
-  fsdp_state_dict_type: SHARDED_STATE_DICT
-  fsdp_sharding_strategy: FULL_SHARD
-  fsdp_reshard_after_forward: true
-  fsdp_activation_checkpointing: true
-special_tokens:
-  pad_token: <|finetune_right_pad_id|>
-  eos_token: <|eot|>
@@ -1,85 +0,0 @@
-base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
-model_type: Llama4ForConditionalGeneration
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-plugins:
-  - axolotl.integrations.liger.LigerPlugin
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-
-liger_glu_activation: true
-liger_rms_norm: true
-liger_layer_norm: true
-cut_cross_entropy: true
-
-llama4_linearized_experts: true  # needed with custom linearized experts model
-load_in_4bit: true
-adapter: qlora
-lora_r: 32
-lora_alpha: 64
-lora_target_modules:
-  - self_attn.q_proj
-  - self_attn.k_proj
-  - self_attn.v_proj
-  - self_attn.o_proj
-  - shared_expert.gate_proj
-  - shared_expert.up_proj
-  - shared_expert.down_proj
-#  - experts.gate_projs.[0-9]+$  # optionally train the moe experts
-#  - experts.up_projs.[0-9]+$
-#  - experts.down_projs.[0-9]+$
-lora_modules_to_save:
-#  - lm_head  # needed if modifying vocabulary
-#  - embed_tokens
-
-lora_mlp_kernel: true
-lora_qkv_kernel: true
-lora_o_kernel: true
-
-chat_template: llama4
-datasets:
-  - path: mlabonne/FineTome-100k
-    type: chat_template
-    split: train[:20%]
-    field_messages: conversations
-    message_property_mappings:
-      role: from
-      content: value
-
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.0
-output_dir: ./outputs/out
-
-sequence_len: 4096  # up to 8k will work on a single H100
-sample_packing: true
-pad_to_sequence_len: true
-
-gradient_accumulation_steps: 1
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_torch_4bit
-lr_scheduler: cosine
-learning_rate: 1e-4
-
-bf16: true
-tf32: true
-
-torch_compile: true
-flex_attention: true
-flex_attn_compile_kwargs:
-  dynamic: false
-  mode: max-autotune-no-cudagraphs
-
-gradient_checkpointing: offload
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-
-logging_steps: 1
-warmup_steps: 20
-evals_per_epoch: 1
-saves_per_epoch: 1
-
-weight_decay: 0.0
-special_tokens:
-  pad_token: <|finetune_right_pad_id|>
-  eos_token: <|eot|>
@@ -1,89 +0,0 @@
-base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
-model_type: Llama4ForConditionalGeneration
-processor_type: Llama4Processor
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-# these 3 lines are needed for now to handle vision chat templates w images
-skip_prepare_dataset: true
-remove_unused_columns: false
-sample_packing: false
-
-sequence_len: 4096
-
-plugins:
-  - axolotl.integrations.liger.LigerPlugin
-
-liger_glu_activation: true
-liger_rms_norm: true
-liger_layer_norm: true
-
-llama4_linearized_experts: true  # use Axolotl's customized model
-load_in_4bit: true
-adapter: qlora
-lora_r: 32
-lora_alpha: 64
-lora_target_modules:
-  - self_attn.q_proj
-  - self_attn.k_proj
-  - self_attn.v_proj
-  - self_attn.o_proj
-  - shared_expert.gate_proj
-  - shared_expert.up_proj
-  - shared_expert.down_proj
-  - vision_adapter.mlp.fc1
-  - vision_adapter.mlp.fc2
-#  - experts.gate_projs.[0-9]+$
-#  - experts.up_projs.[0-9]+$
-#  - experts.down_projs.[0-9]+$
-lora_modules_to_save:
-  - lm_head
-  - embed_tokens
-
-chat_template: llama4
-datasets:
-  - path: HuggingFaceH4/llava-instruct-mix-vsft
-    type: chat_template
-    split: train[:1%]
-    field_messages: messages
-
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.0
-output_dir: ./outputs/out
-
-gradient_accumulation_steps: 1
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_torch_4bit
-lr_scheduler: cosine
-learning_rate: 1e-4
-
-bf16: true
-tf32: true
-
-logging_steps: 1
-flex_attention: true
-flex_attn_compile_kwargs:
-  dynamic: false
-  mode: max-autotune-no-cudagraphs
-
-warmup_steps: 10
-evals_per_epoch: 1
-saves_per_epoch: 1
-weight_decay: 0.0
-fsdp:
-  - auto_wrap
-  - full_shard
-fsdp_config:
-  fsdp_version: 2
-  fsdp_offload_params: false
-  fsdp_cpu_ram_efficient_loading: true
-  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
-  fsdp_state_dict_type: SHARDED_STATE_DICT
-  fsdp_sharding_strategy: FULL_SHARD
-  fsdp_reshard_after_forward: true
-  fsdp_activation_checkpointing: true
-special_tokens:
-  pad_token: <|finetune_right_pad_id|>
-  eos_token: <|eot|>
@@ -1,6 +1,6 @@
+pre-commit
 black
 mypy
-pre-commit
 types-requests
 quartodoc
 jupyter
@@ -1,7 +1,5 @@
-codecov
 pytest
-pytest-cov
+pytest-xdist
 pytest-retry
 pytest-sugar
-pytest-xdist
 tbparse
@@ -25,5 +25,5 @@ if cce_spec:
     print(
         UNINSTALL_PREFIX
-        + 'pip install "cut-cross-entropy[transformers] @ git+https://github.com/apple/ml-cross-entropy.git@bad6f7b49c75fdec69471abb71b4cddd0f0c6438"'
+        + 'pip install "cut-cross-entropy[transformers] @ git+https://github.com/apple/ml-cross-entropy.git@24fbe4b5dab9a6c250a014573613c1890190536c"'
     )
setup.py (2 changes)

@@ -67,7 +67,7 @@ def parse_requirements(extras_require_map):
         if (major, minor) >= (2, 6):
             _install_requires.pop(_install_requires.index(xformers_version))
             _install_requires.append("xformers==0.0.29.post2")
-            extras_require_map["vllm"] = ["vllm==0.8.3"]
+            extras_require_map["vllm"] = ["vllm==0.8.1"]
         elif (major, minor) >= (2, 5):
             _install_requires.pop(_install_requires.index(xformers_version))
             if patch == 0:
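(Editor-added sketch, not from the diff: the hunk above gates the vllm extra on the detected torch version. The standalone check below just echoes that pattern; the pin shown is the value on the new side of this hunk, not an authoritative recommendation.)

# Illustrative only: pick an extras pin based on the installed torch version.
import torch

major, minor = (int(part) for part in torch.__version__.split(".")[:2])
vllm_extra = ["vllm==0.8.1"] if (major, minor) >= (2, 6) else []
print(vllm_extra)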
|||||||
@@ -1,156 +0,0 @@
|
|||||||
"""
|
|
||||||
CLI tool to delinearize quantized/Linearized Llama-4 models.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Generator, Union
|
|
||||||
|
|
||||||
import fire
|
|
||||||
import torch
|
|
||||||
from accelerate import init_empty_weights
|
|
||||||
from dotenv import load_dotenv
|
|
||||||
from transformers import AutoProcessor
|
|
||||||
|
|
||||||
|
|
||||||
def iter_convert_patched_to_hf(model_state_dict, num_experts) -> Generator:
|
|
||||||
keys = list(model_state_dict.keys())
|
|
||||||
for key in keys:
|
|
||||||
if ".feed_forward.experts." not in key:
|
|
||||||
yield key, model_state_dict[key]
|
|
||||||
if ".feed_forward.experts.gate_projs" in key:
|
|
||||||
# gate gets fused with up so skip the yield on this and we'll fuse it when asking for the up
|
|
||||||
continue
|
|
||||||
if ".feed_forward.experts.up_projs" in key:
|
|
||||||
if ".feed_forward.experts.up_projs.0." in key:
|
|
||||||
# handle the re-shape and fusing of gate and up, and conversion from linear to parameter
|
|
||||||
prefix = key.split(".up_projs.0.")[0]
|
|
||||||
key = f"{prefix}.gate_up_proj"
|
|
||||||
# grab all the up_projs and gate_projs across all experts
|
|
||||||
gate_stacked = torch.stack(
|
|
||||||
[
|
|
||||||
model_state_dict[
|
|
||||||
f"{prefix}.gate_projs.{expert_idx}.weight"
|
|
||||||
].transpose(0, 1)
|
|
||||||
for expert_idx in range(num_experts)
|
|
||||||
]
|
|
||||||
)
|
|
||||||
up_stacked = torch.stack(
|
|
||||||
[
|
|
||||||
model_state_dict[
|
|
||||||
f"{prefix}.up_projs.{expert_idx}.weight"
|
|
||||||
].transpose(0, 1)
|
|
||||||
for expert_idx in range(num_experts)
|
|
||||||
]
|
|
||||||
)
|
|
||||||
gate_up_proj = torch.cat((gate_stacked, up_stacked), dim=-1)
|
|
||||||
del gate_stacked, up_stacked
|
|
||||||
yield key, gate_up_proj
|
|
||||||
else:
|
|
||||||
del model_state_dict[key]
|
|
||||||
continue
|
|
||||||
if ".feed_forward.experts.down_projs" in key:
|
|
||||||
if ".feed_forward.experts.down_projs.0." in key:
|
|
||||||
# handle the re-shape and fusing of gate and up, and conversion from linear to parameter
|
|
||||||
prefix = key.split(".down_projs.0.")[0]
|
|
||||||
key = f"{prefix}.down_proj"
|
|
||||||
# grab all the down_projs across all experts
|
|
||||||
down_stacked = torch.stack(
|
|
||||||
[
|
|
||||||
model_state_dict[
|
|
||||||
f"{prefix}.down_projs.{expert_idx}.weight"
|
|
||||||
].transpose(0, 1)
|
|
||||||
for expert_idx in range(num_experts)
|
|
||||||
]
|
|
||||||
)
|
|
||||||
yield key, down_stacked
|
|
||||||
else:
|
|
||||||
del model_state_dict[key]
|
|
||||||
continue
|
|
||||||
|
|
||||||
|
|
||||||
def do_cli(model: Union[Path, str], output: Union[Path, str]) -> None:
|
|
||||||
"""
|
|
||||||
Convert a patched HF format Llama4 model (with separated projections)
|
|
||||||
back to the original HF format (with fused projections).
|
|
||||||
|
|
||||||
Args:
|
|
||||||
model: Path to the patched HF model
|
|
||||||
output: Path to save the converted model
|
|
||||||
"""
|
|
||||||
print(f"Loading model from {model}")
|
|
||||||
from axolotl.monkeypatch.models.llama4.modeling import (
|
|
||||||
patch_llama4_linearized_modeling,
|
|
||||||
)
|
|
||||||
|
|
||||||
unpatch_llama4 = patch_llama4_linearized_modeling()
|
|
||||||
from transformers import Llama4ForConditionalGeneration
|
|
||||||
|
|
||||||
model_ = Llama4ForConditionalGeneration.from_pretrained(
|
|
||||||
model, torch_dtype=torch.bfloat16
|
|
||||||
)
|
|
||||||
processor = AutoProcessor.from_pretrained(model)
|
|
||||||
processor.save_pretrained(output)
|
|
||||||
|
|
||||||
device = model_.device.type
|
|
||||||
if device == "cuda":
|
|
||||||
print(
|
|
||||||
f"peak memory allocated: {torch.cuda.max_memory_allocated() / 1024**2} MB"
|
|
||||||
)
|
|
||||||
print(f"peak memory reserved: {torch.cuda.max_memory_reserved() / 1024**2} MB")
|
|
||||||
model_config = model_.config
|
|
||||||
config = model_.config.get_text_config()
|
|
||||||
|
|
||||||
# Get key dimensions from the config
|
|
||||||
hidden_size = config.hidden_size
|
|
||||||
intermediate_size = config.intermediate_size
|
|
||||||
num_experts = config.num_local_experts
|
|
||||||
|
|
||||||
print(
|
|
||||||
f"Model dimensions: hidden_size={hidden_size}, intermediate_size={intermediate_size}, num_experts={num_experts}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create output directory if it doesn't exist
|
|
||||||
os.makedirs(output, exist_ok=True)
|
|
||||||
|
|
||||||
# Get state dict
|
|
||||||
state_dict = model_.state_dict()
|
|
||||||
del model_
|
|
||||||
|
|
||||||
# Create a new state dict for the converted model
|
|
||||||
converted_state_dict = {}
|
|
||||||
|
|
||||||
# First, copy all keys that don't need modification
|
|
||||||
for key, value in iter_convert_patched_to_hf(state_dict, num_experts):
|
|
||||||
converted_state_dict[key] = value
|
|
||||||
|
|
||||||
del state_dict
|
|
||||||
if device == "cuda":
|
|
||||||
torch.cuda.empty_cache()
|
|
||||||
print("State dict converted.")
|
|
||||||
print(
|
|
||||||
f"peak memory allocated: {torch.cuda.max_memory_allocated() / 1024**2} MB"
|
|
||||||
)
|
|
||||||
print(f"peak memory reserved: {torch.cuda.max_memory_reserved() / 1024**2} MB")
|
|
||||||
# Ideally re-load the model import to load the converted state dict
|
|
||||||
# Save the converted model
|
|
||||||
with init_empty_weights():
|
|
||||||
unpatch_llama4()
|
|
||||||
model_ = Llama4ForConditionalGeneration(model_config)
|
|
||||||
|
|
||||||
if device == "cuda":
|
|
||||||
print("State dict loaded into model.")
|
|
||||||
print(
|
|
||||||
f"peak memory allocated: {torch.cuda.max_memory_allocated() / 1024**2} MB"
|
|
||||||
)
|
|
||||||
print(f"peak memory reserved: {torch.cuda.max_memory_reserved() / 1024**2} MB")
|
|
||||||
model_.load_state_dict(converted_state_dict, strict=False, assign=True)
|
|
||||||
print(f"Saving converted model to {output}...")
|
|
||||||
model_.save_pretrained(output)
|
|
||||||
|
|
||||||
print(f"Model successfully converted and saved to {output}")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
load_dotenv()
|
|
||||||
fire.Fire(do_cli)
|
|
||||||
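(Editor-added sketch, not part of the diff: the core transformation in the removed `iter_convert_patched_to_hf` above. Per-expert nn.Linear weights of shape [out_features, in_features] are transposed and stacked into fused per-expert parameters, with gate and up concatenated on the last dimension; tensor sizes below are toy values.)

# Illustrative only: fuse per-expert gate/up/down Linear weights into stacked parameters.
import torch

num_experts, hidden, intermediate = 4, 8, 16
gate = [torch.randn(intermediate, hidden) for _ in range(num_experts)]  # gate_projs.{i}.weight
up = [torch.randn(intermediate, hidden) for _ in range(num_experts)]    # up_projs.{i}.weight
down = [torch.randn(hidden, intermediate) for _ in range(num_experts)]  # down_projs.{i}.weight

gate_up_proj = torch.cat(
    (
        torch.stack([w.transpose(0, 1) for w in gate]),
        torch.stack([w.transpose(0, 1) for w in up]),
    ),
    dim=-1,
)  # [num_experts, hidden, 2 * intermediate]
down_proj = torch.stack([w.transpose(0, 1) for w in down])  # [num_experts, intermediate, hidden]
print(gate_up_proj.shape, down_proj.shape)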
@@ -330,15 +330,6 @@ def vllm_serve(config: str, **cli_args: VllmServeCliArgs):
     do_vllm_serve(config, cli_args)


-@cli.command()
-@click.argument("model", type=click.Path(exists=True, path_type=str))
-@click.argument("output", type=click.Path(exists=False, path_type=str))
-def delinearize_llama4(model: str, output: str) -> None:
-    from axolotl.cli.delinearize_llama4 import do_cli as do_delinearize_llama4
-
-    do_delinearize_llama4(model, output)
-
-
 cli.add_command(lm_eval)
@@ -40,7 +40,6 @@ def do_merge_lora(*, cfg: DictDefault) -> None:
         LOG.warning("Error raised: %s", e)

     model.generation_config.do_sample = True
-    model.config.use_cache = True

     if cfg.local_rank == 0:
         LOG.info(f"Saving merged model to: {str(Path(cfg.output_dir) / 'merged')}...")
@@ -129,19 +129,17 @@ def load_preference_datasets(
     total_num_steps = None

     if cli_args.debug or cfg.debug:
-        if not cfg.rl == "grpo":
-            LOG.info("check_dataset_labels...")
-
-            tokenizer = load_tokenizer(cfg)
-            train_samples = sample_dataset(train_dataset, cli_args.debug_num_examples)
-            check_dataset_labels(
-                train_samples,
-                tokenizer,
-                num_examples=cli_args.debug_num_examples,
-                text_only=cli_args.debug_text_only,
-                rl_mode=True,
-            )
+        LOG.info("check_dataset_labels...")
+
+        tokenizer = load_tokenizer(cfg)
+        train_samples = sample_dataset(train_dataset, cli_args.debug_num_examples)
+        check_dataset_labels(
+            train_samples,
+            tokenizer,
+            num_examples=cli_args.debug_num_examples,
+            text_only=cli_args.debug_text_only,
+            rl_mode=True,
+        )

     return TrainDatasetMeta(
         train_dataset=train_dataset,
@@ -776,7 +776,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
             training_arguments_kwargs["sequence_parallel_degree"] = (
                 self.cfg.sequence_parallel_degree
             )
-            training_arguments_kwargs["ring_attn_func"] = self.cfg.ring_attn_func

         if self.cfg.reward_model:
             training_args_cls = AxolotlRewardConfig
@@ -934,7 +933,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
         kwargs["return_tensors"] = "pt"
         if issubclass(collator, DataCollatorForSeq2Seq):
             kwargs["sequence_parallel_degree"] = training_args.sequence_parallel_degree
-            kwargs["ring_attn_func"] = training_args.ring_attn_func

         return collator(
             *collator_args,
@@ -9,8 +9,6 @@ from PIL.Image import Resampling
 from transformers import TrainingArguments
 from trl import CPOConfig, KTOConfig, ORPOConfig, PRMConfig, RewardConfig

-from axolotl.monkeypatch.attention.ring_attn.patch import RingAttnFunc
-

 @dataclass
 class AxolotlTrainingMixins:

@@ -220,12 +218,6 @@ class AxolotlTrainingMixins:
         default=1,
         metadata={"help": "The number of workers to use in sequence parallelism"},
     )
-    ring_attn_func: Optional[RingAttnFunc] = field(
-        default=None,
-        metadata={
-            "help": "The ring-flash-attn function to use in sequence parallelism"
-        },
-    )

     # multi-modal section
@@ -12,14 +12,12 @@ See https://github.com/apple/ml-cross-entropy

 Run the following command to install `cut_cross_entropy[transformers]` if you don't have it already.

-- If you are in dev environment
 ```bash
+# if you are in dev environment
 python scripts/cutcrossentropy_install.py | sh
-```

-- If you are installing from pip
-```bash
-pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/apple/ml-cross-entropy.git@bad6f7b49c75fdec69471abb71b4cddd0f0c6438"
+# if you are not in dev environment
+pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/apple/ml-cross-entropy.git@24fbe4b5dab9a6c250a014573613c1890190536c"
 ```

 ## Usage
@@ -33,7 +33,7 @@ LOG = logging.getLogger("axolotl.integrations.cut_cross_entropy")

 _CCE_INSTALL_MESSAGE = (
     "Please install cut_cross_entropy with transformers support using "
-    '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/apple/ml-cross-entropy.git@bad6f7b49c75fdec69471abb71b4cddd0f0c6438"`'
+    '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/apple/ml-cross-entropy.git@24fbe4b5dab9a6c250a014573613c1890190536c"`'
 )
@@ -165,7 +165,7 @@ def cce_forward(
 )
 def cce_forward_multimodal(
     self,
-    input_ids: torch.LongTensor | None = None,  # type: ignore
+    input_ids: torch.LongTensor | None = None,
     pixel_values: torch.FloatTensor | None = None,
     attention_mask: Optional[torch.Tensor] = None,
     position_ids: Optional[torch.LongTensor] = None,

@@ -254,7 +254,7 @@ def cce_forward_multimodal(
     )

     if inputs_embeds is None:
-        inputs_embeds = self.get_input_embeddings()(input_ids)  # type: ignore
+        inputs_embeds = self.get_input_embeddings()(input_ids)

     if pixel_values is not None:
         image_features = self.get_image_features(

@@ -263,13 +263,13 @@ def cce_forward_multimodal(
             vision_feature_select_strategy=vision_feature_select_strategy,
             image_sizes=image_sizes,
         )
-        original_inputs_embeds_shape = inputs_embeds.shape  # type: ignore
+        original_inputs_embeds_shape = inputs_embeds.shape

         vision_flat = image_features.view(-1, image_features.size(-1))
         projected_vision_flat = self.multi_modal_projector(vision_flat)

         special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
-        final_mask = special_image_mask.to(inputs_embeds.device)  # type: ignore
+        final_mask = special_image_mask.to(inputs_embeds.device)
         inputs_embeds = inputs_embeds.view(-1, inputs_embeds.size(-1))  # type: ignore

         final_mask_1d = final_mask[..., 0].reshape(-1)
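(Editor-added sketch, not from the diff: what the surrounding multimodal forward is doing when it builds `special_image_mask` and `final_mask_1d` — positions holding the image placeholder token get their text embedding replaced by a projected vision feature. Shapes and the placeholder id below are toy values.)

# Illustrative only: scatter projected vision features into text embeddings at image-token positions.
import torch

batch, seq, dim, image_token_id = 1, 6, 4, 99
input_ids = torch.tensor([[5, 99, 99, 7, 99, 2]])
inputs_embeds = torch.zeros(batch, seq, dim)
projected_vision = torch.ones((input_ids == image_token_id).sum(), dim)  # one feature per image token

flat = inputs_embeds.view(-1, dim)
mask_1d = (input_ids == image_token_id).reshape(-1)
flat[mask_1d] = projected_vision.to(flat.dtype)
inputs_embeds = flat.view(batch, seq, dim)
print(inputs_embeds[0, :, 0])  # rows 1, 2 and 4 now carry vision features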
@@ -49,7 +49,7 @@ def fsdp2_load_full_state_dict(accelerator, model: torch.nn.Module, full_sd: dict
         )
         sharded_sd[param_name] = sharded_tensor

-    model.load_state_dict(sharded_sd, assign=True)
+    model.load_state_dict(sharded_sd)


 def patch_accelerate_fsdp_utils():
@@ -7,11 +7,12 @@ import torch
 import transformers


-def patch_flex_wrapper(**flex_attn_compile_kwargs):
+def patch_flex_wrapper():
     # TODO remove this patch when transformers#37285 is merged and in a release
     is_torch_2_6 = torch.__version__.startswith("2.6")
+    is_transformers_below_4_51 = transformers.__version__ < "4.51.0"

-    if not is_torch_2_6:
+    if not (is_torch_2_6 and is_transformers_below_4_51):
         return

     from torch.nn.attention.flex_attention import flex_attention

@@ -31,24 +32,17 @@ def patch_flex_wrapper(**flex_attn_compile_kwargs):
                 cls._instance = super().__new__(cls)
             return cls._instance

-        @classmethod
-        def del_singleton(cls):
-            cls._instance = None
-
         @torch.compiler.disable(recursive=False)
-        def __init__(self, training):
+        def __init__(self):
             """
             Initialize or update the singleton instance.
             """
-            self.training = None
-            if not self._is_flex_compiled or training != self.training:
-                # In PyTorch 2.6.0, there's a known issue with flex attention compilation which may
-                # cause errors. The suggested fix is to compile with "max-autotune-no-cudagraphs"
-                # see https://github.com/pytorch/pytorch/issues/146260 for training
-                self.training = training
+            if not self._is_flex_compiled:
                 self._compiled_flex_attention = torch.compile(
                     flex_attention,
-                    **flex_attn_compile_kwargs,
+                    dynamic=False,
+                    mode="max-autotune-no-cudagraphs",
+                    fullgraph=True,
                 )
                 self._is_flex_compiled = True

@@ -56,22 +50,15 @@ def patch_flex_wrapper(**flex_attn_compile_kwargs):
             return self._compiled_flex_attention

     transformers.integrations.flex_attention.WrappedFlexAttention = WrappedFlexAttention
-    setattr(
-        sys.modules["transformers.integrations.flex_attention"],
-        "WrappedFlexAttention",
-        WrappedFlexAttention,
-    )


 def patch_flex_make_mask():
     is_torch_2_6 = torch.__version__.startswith("2.6")
+    is_transformers_eq_4_51 = transformers.__version__ == "4.51.0"

-    if not is_torch_2_6:
+    if not (is_torch_2_6 and is_transformers_eq_4_51):
         return

-    from torch.nn.attention.flex_attention import (
-        _DEFAULT_SPARSE_BLOCK_SIZE as flex_default_block_size,
-    )
     from torch.nn.attention.flex_attention import (
         BlockMask,
     )

@@ -117,16 +104,14 @@ def patch_flex_make_mask():
         if not query_length:
             query_length = total_seq_len
         attention_mask_2d = torch.nn.functional.pad(
-            attention_mask_2d,
-            value=0,
-            pad=(0, abs(total_seq_len - max(key_length, flex_default_block_size))),
+            attention_mask_2d, value=0, pad=(0, key_length)
         )
         device = attention_mask_2d.device
         document_ids = attention_mask_2d.clone()

         if attention_chunk_size is not None:
             # we create an arange, then we just // by chunk size to get [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
-            chunk_idxs = (document_ids.clone().fill_(1).cumsum(-1) - 1) // (
+            document_ids = (document_ids.fill_(1).cumsum(-1) - 1) // (
                 attention_chunk_size
             )

@@ -153,18 +138,6 @@ def patch_flex_make_mask():
             final_mask = causal_mask & padding_mask & document_mask
             return final_mask

-        def chunk_causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
-            """
-            Combines the chunk mask with the causal mask for chunked attention.
-            """
-            chunk_mask = chunk_idxs[batch_idx, q_idx] == chunk_idxs[batch_idx, kv_idx]
-            causal_doc_mask = causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx)
-            return chunk_mask & causal_doc_mask
-
-        mask_mod_maybe_combined = (
-            causal_mask_mod if attention_chunk_size is None else chunk_causal_mask_mod
-        )
-
         if offsets is not None:
             q_offset = offsets[0]
             kv_offset = offsets[1]

@@ -172,10 +145,10 @@ def patch_flex_make_mask():
             def mask_mod(batch_idx, head_idx, q_idx, kv_idx):
                 offset_q = q_idx + q_offset
                 offset_kv = kv_idx + kv_offset
-                return mask_mod_maybe_combined(batch_idx, head_idx, offset_q, offset_kv)
+                return causal_mask_mod(batch_idx, head_idx, offset_q, offset_kv)

         else:
-            mask_mod = mask_mod_maybe_combined
+            mask_mod = causal_mask_mod
         return create_block_causal_mask_flex(
             mask_mod=mask_mod,
             B=batch_size,

@@ -187,16 +160,11 @@ def patch_flex_make_mask():
         )

     for n in tuple(sys.modules):
-        if ".modeling_" in n:
+        if ".modeling_" in n and "llama4" not in n:
             if hasattr(sys.modules[n], "make_flex_block_causal_mask"):
                 sys.modules[n].make_flex_block_causal_mask = (
                     patched_make_flex_block_causal_mask
                 )
-                setattr(
-                    sys.modules[n],
-                    "make_flex_block_causal_mask",
-                    patched_make_flex_block_causal_mask,
-                )

     transformers.integrations.flex_attention.make_flex_block_causal_mask = (
         patched_make_flex_block_causal_mask
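(Editor-added sketch, not from the diff: the `WrappedFlexAttention` class patched above compiles `flex_attention` once and reuses it via a singleton. The standalone class below only illustrates that compile-once pattern; it does not reproduce the transformers implementation, and the compile mode simply echoes the PyTorch 2.6 workaround referenced in the removed comment.)

# Illustrative only: compile flex_attention a single time and cache the result.
import torch
from torch.nn.attention.flex_attention import flex_attention


class CompiledFlexOnce:
    _instance = None
    _compiled = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __call__(self):
        if self._compiled is None:
            self._compiled = torch.compile(
                flex_attention, dynamic=False, mode="max-autotune-no-cudagraphs"
            )
        return self._compiled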
@@ -6,8 +6,6 @@ package, specifically the `hf_adapter.substitute_hf_flash_attn` function to patch in
 their sequence parallel version of Flash Attention 2.
 """

-from enum import Enum
-
 import torch
 import torch.distributed as dist
 from accelerate.logging import get_logger

@@ -18,7 +16,6 @@ from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids
 configure_logging()
 LOG = get_logger(__name__)

-
 RING_ATTN_GROUP = None

@@ -43,22 +40,7 @@ def set_ring_attn_group(ring_attn_group: dist.ProcessGroup | None):
     RING_ATTN_GROUP = ring_attn_group


-class RingAttnFunc(str, Enum):
-    """Enum class for supported `ring-flash-attn` implementations"""
-
-    # VARLEN_RING = "varlen_ring"
-    # VARLEN_ZIGZAG = "varlen_zigzag"
-    VARLEN_LLAMA3 = "varlen_llama3"
-    BATCH_RING = "batch_ring"
-    BATCH_ZIGZAG = "batch_zigzag"
-    BATCH_STRIPE = "batch_stripe"
-
-
-def register_ring_attn(
-    sequence_parallel_degree: int,
-    heads_k_stride: int | None,
-    ring_attn_func: RingAttnFunc | None,
-):
+def register_ring_attn(sequence_parallel_degree: int, heads_k_stride: int | None):
     """
     Create ring attention group and substitute flash attn with ring flash attn.

@@ -66,9 +48,6 @@ def register_ring_attn(
         sequence_parallel_degree: Sequence parallelism factor.
         heads_k_stride: Sequence parallelism K head stride size. Passed
            through to `ring_flash_attn.substitute_hf_flash_attn`.
-        ring_attn_func: `ring_flash_attn` ring attention implemention. If sample
-            packing is enabled, it must be a `varlen` function; otherwise, it must be a
-            `batch` function.
     """
     if get_ring_attn_group() is not None:
         LOG.info("Ring attention already registered, exiting early...")

@@ -79,9 +58,7 @@ def register_ring_attn(
         f"each sequence will be processed across {sequence_parallel_degree} GPUs"
     )

-    rank = dist.get_rank()
     world_size = dist.get_world_size()

     assert sequence_parallel_degree <= world_size, (
         f"sequence_parallel_degree ({sequence_parallel_degree}) "
         f"must be less than or equal to world_size ({world_size})"

@@ -91,8 +68,10 @@ def register_ring_attn(
         f"must evenly divide world_size ({world_size})"
     )

-    # Assign ranks to sequence parallel groups
+    # Detailed logging of group formation
+    rank = dist.get_rank()
     group_assignments = {}
+
     for i in range(world_size // sequence_parallel_degree):
         ring_attn_ranks = list(
             range(

@@ -113,37 +92,35 @@ def register_ring_attn(
     if rank == 0:
         LOG.info(f"Sequence parallel group assignments: {group_assignments}")

-    if ring_attn_func is RingAttnFunc.VARLEN_LLAMA3:
-        from ring_flash_attn import substitute_hf_flash_attn
-
-        substitute_hf_flash_attn(
-            process_group=get_ring_attn_group(), heads_k_stride=heads_k_stride or 1
-        )
-    elif ring_attn_func in [
-        RingAttnFunc.BATCH_RING,
-        RingAttnFunc.BATCH_ZIGZAG,
-        RingAttnFunc.BATCH_STRIPE,
-    ]:
-        from axolotl.monkeypatch.attention.ring_attn.adapters.batch import (
-            substitute_hf_flash_attn,
-        )
-
-        substitute_hf_flash_attn(
-            process_group=get_ring_attn_group(),
-            ring_attn_func=ring_attn_func,
-        )
+    if heads_k_stride is None:
+        heads_k_stride = 1
+
+    from ring_flash_attn import substitute_hf_flash_attn
+
+    substitute_hf_flash_attn(
+        process_group=get_ring_attn_group(), heads_k_stride=heads_k_stride
+    )


-def update_ring_attn_params(position_ids: torch.Tensor | None):
+def update_ring_attn_params(batch: dict[str, torch.Tensor]):
     """
     Calculate the cumulative sequence lengths for the current forward pass and pass the
     value to the substituted `ring_flash_attn`.

     Args:
-        position_ids: Optional tensor of position IDs (for sample packed data).
+        batch: A dictionary with a batch of data. May or may not contain `position_ids`
+            data; if not, we compute it.
     """
     from ring_flash_attn import update_ring_flash_attn_params

+    input_ids = batch["input_ids"]
+    position_ids = batch.get("position_ids")
+    if position_ids is None:
+        seq_len = input_ids.shape[1]
+        position_ids = torch.arange(
+            0, seq_len, dtype=torch.long, device=input_ids.device
+        ).unsqueeze(0)
+
     cu_seqlens, _ = get_cu_seqlens_from_pos_ids(position_ids)
     cu_seqlens = cu_seqlens.squeeze().to(device=torch.cuda.current_device())
     update_ring_flash_attn_params(cu_seqlens, get_ring_attn_group())
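(Editor-added sketch, not from the diff: `update_ring_attn_params` above feeds cumulative sequence lengths derived from sample-packed position ids to `ring_flash_attn`. The snippet below only mimics the role of axolotl's `get_cu_seqlens_from_pos_ids`, not its exact implementation — position ids restart at 0 at each packed-sequence boundary, and the cumulative lengths mark those boundaries.)

# Illustrative only: cumulative sequence lengths from packed position_ids.
import torch

position_ids = torch.tensor([[0, 1, 2, 0, 1, 0, 1, 2, 3]])  # three packed sequences
starts = (position_ids[0] == 0).nonzero(as_tuple=True)[0]    # boundaries at 0, 3, 5
lengths = torch.diff(torch.cat([starts, torch.tensor([position_ids.shape[1]])]))
cu_seqlens = torch.cat([torch.tensor([0]), lengths.cumsum(0)]).to(torch.int32)
print(cu_seqlens)  # tensor([0, 3, 5, 9], dtype=torch.int32)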
@@ -1,12 +0,0 @@
-"""Init for ring attention monkeypatch module"""
-
-# pylint: disable=unused-import
-# flake8: noqa
-
-from .patch import (
-    RingAttnFunc,
-    get_ring_attn_group,
-    register_ring_attn,
-    set_ring_attn_group,
-    update_ring_attn_params,
-)
@@ -1,192 +0,0 @@
-"""
-HuggingFace flash attention adapter for basic ring attention (batch API).
-
-Inspired by
-https://github.com/zhuzilin/ring-flash-attention/blob/ce9fd3935ca0e5f0592bb0826cbed18ec69da729/ring_flash_attn/adapters/hf_adapter.py.
-Our implementation closely follows the structure of that module, but we've minified it
-somewhat to support only the latest versions of transformers.
-"""
-
-# pylint: disable=protected-access,cyclic-import
-
-import os
-from typing import Callable
-
-import torch
-import torch.distributed as dist
-import transformers
-import transformers.modeling_flash_attention_utils
-from ring_flash_attn import (
-    ring_flash_attn_func,
-    stripe_flash_attn_func,
-    zigzag_ring_flash_attn_func,
-)
-from ring_flash_attn.adapters.hf_adapter import check_params
-from transformers.modeling_flash_attention_utils import (
-    _flash_supports_window_size,
-    is_flash_attn_greater_or_equal,
-)
-from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
-
-from axolotl.monkeypatch.attention.ring_attn.patch import RingAttnFunc
-
-RING_ATTN_FUNC_MAPPING = {
-    RingAttnFunc.BATCH_RING: ring_flash_attn_func,
-    RingAttnFunc.BATCH_ZIGZAG: zigzag_ring_flash_attn_func,
-    RingAttnFunc.BATCH_STRIPE: stripe_flash_attn_func,
-}
-
-
-def create_flash_attn_forward(
-    process_group: dist.ProcessGroup, ring_attn_func: RingAttnFunc
-) -> Callable:
-    """
-    Create a ring flash attention forward function compatible with HuggingFace's
-    interface.
-
-    Args:
-        process_group: A PyTorch distributed process group.
-        ring_attn_func: Function from `ring_flash_attention` to replace HF flash
-            attention with.
-
-    Returns:
-        A function that implements the ring flash attention forward pass with the
-        signature expected by HuggingFace Transformers.
-    """
-
-    # transformers 4.48+
-    # pylint: disable=unused-argument
-    def _flash_attention_forward(
-        query_states: torch.Tensor,
-        key_states: torch.Tensor,
-        value_states: torch.Tensor,
-        attention_mask: torch.Tensor,
-        query_length: int,
-        is_causal: bool,
-        dropout: float = 0.0,
-        position_ids: torch.Tensor | None = None,
-        softmax_scale: float | None = None,
-        sliding_window: int | None = None,
-        use_top_left_mask: bool = False,
-        softcap: float | None = None,
-        deterministic: bool = None,
-        cu_seq_lens_q: torch.LongTensor | None = None,
-        cu_seq_lens_k: torch.LongTensor | None = None,
-        max_length_q: int | None = None,
-        max_length_k: int | None = None,
-        target_dtype: torch.dtype | None = None,
-        **kwargs,
-    ):
-        """
-        Calls the forward method of Ring Flash Attention.
-
-        Args:
-            query_states: Tensor containing the query vectors.
-            key_states: Tensor containing the key vectors.
-            value_states: Tensor containing the value vectors.
-            attention_mask: Not used in this implementation.
-            query_length: Integer representing the length of the query sequence.
-            is_causal: Boolean indicating whether to apply a causal mask to the attention.
-            dropout: Float representing the dropout probability. Default is 0.0.
-            position_ids: Not used in this implementation.
-            softmax_scale: Optional float value for the softmax scaling factor. Default is None.
-            sliding_window: Optional integer defining the size of the sliding attention window.
-                Default is None.
-            use_top_left_mask: Boolean indicating whether to use a top-left mask for the attention.
-                Default is False.
-            softcap: Not used in this implementation.
-            deterministic: Optional boolean to enforce deterministic computation. Default is None.
-            cu_seq_lens_q: Not used in this implementation.
-            cu_seq_lens_k: Not used in this implementation.
-            max_length_q: Not used in this implementation.
-            max_length_k: Not used in this implementation.
-            target_dtype: Not used in this implementation.
-            **kwargs: Additional keyword arguments. Not used in this implementation.
-
-        Returns:
-            torch.Tensor: The output of the attention mechanism, with shape
-            `[batch_size, query_length, num_heads, head_dim]`.
-        """
-        if not use_top_left_mask:
-            causal = is_causal
-        else:
-            causal = is_causal and query_length != 1
-
-        # Handle sliding window
-        use_sliding_windows = (
-            _flash_supports_window_size
-            and sliding_window is not None
-            and key_states.shape[1] > sliding_window
-        )
-        window_size = (
-            (sliding_window, sliding_window) if use_sliding_windows else (-1, -1)
-        )
-
-        # Handle deterministic mode
-        if is_flash_attn_greater_or_equal("2.4.1"):
-            if deterministic is None:
-                deterministic = (
-                    os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1"
-                )
-
-        # Call ring flash attention function
-        attn_output = RING_ATTN_FUNC_MAPPING[ring_attn_func](
-            query_states,
-            key_states,
-            value_states,
-            dropout_p=dropout,
-            softmax_scale=softmax_scale,
-            causal=causal,
-            window_size=window_size,
-            alibi_slopes=None,
-            deterministic=deterministic,
-            return_attn_probs=False,
-            group=process_group,
-        )
-
-        return attn_output
-
-    return _flash_attention_forward
-
-
-def substitute_hf_flash_attn(
-    process_group: dist.ProcessGroup, ring_attn_func: RingAttnFunc
-):
-    """
-    Substitute HuggingFace's flash attention implementation with ring-based implementation.
-
-    Args:
-        process_group: PyTorch distributed process group for communication.
-        ring_attn_func: Function from `ring_flash_attention` to replace HF flash
-            attention with.
-    """
-    try:
-        # Substitute flash attention
-        old_flash_attention_forward = (
-            transformers.modeling_flash_attention_utils._flash_attention_forward
-        )
-        new_flash_attention_forward = create_flash_attn_forward(
-            process_group=process_group, ring_attn_func=ring_attn_func
-        )
-
-        if check_params(old_flash_attention_forward, new_flash_attention_forward):
-            transformers.modeling_flash_attention_utils._flash_attention_forward = (
-                new_flash_attention_forward
-            )
-        else:
-            raise ValueError(
-                "The signature of the new flash attention forward function does not match the old one."
-            )
-    except Exception as exception:
-        raise ValueError(
-            f"The current transformer version {transformers.__version__} is not supported. "
-            "Please use pip install -U transformers to upgrade to the latest version. "
-            "If the code failed with the latest version, "
-            f"please file an issue."
-        ) from exception
-
-    # Register with ALL_ATTENTION_FUNCTIONS if available
-    if ALL_ATTENTION_FUNCTIONS is not None:
-        from ring_flash_attn.adapters.hf_adapter import flash_attention_forward
-
-        ALL_ATTENTION_FUNCTIONS["flash_attention_2"] = flash_attention_forward
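For context, `substitute_hf_flash_attn` above only takes effect once a sequence-parallel process group exists. A minimal sketch of how it could be driven, assuming `torch.distributed` is already initialized and the adapter above is importable (the group carving and the `BATCH_RING` choice are illustrative, not taken from this commit):

```python
# Sketch only: assumes dist.init_process_group() has run and that
# substitute_hf_flash_attn / RingAttnFunc are importable as defined above.
import torch.distributed as dist


def enable_batch_ring_attention(sequence_parallel_degree: int) -> None:
    world_size = dist.get_world_size()
    rank = dist.get_rank()

    # Carve the world into contiguous sequence-parallel groups.
    group = None
    for start in range(0, world_size, sequence_parallel_degree):
        ranks = list(range(start, start + sequence_parallel_degree))
        candidate = dist.new_group(ranks)
        if rank in ranks:
            group = candidate

    # Route HF flash attention through the batch-API ring implementation.
    substitute_hf_flash_attn(process_group=group, ring_attn_func=RingAttnFunc.BATCH_RING)
```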
@@ -93,20 +93,9 @@ def patch_llama4_linearized_modeling():
     """
     from transformers.models.llama4 import modeling_llama4
 
-    old_lamma_4_text_experts = modeling_llama4.Llama4TextExperts
     modeling_llama4.Llama4TextExperts = Llama4TextExperts
     setattr(
         sys.modules["transformers.models.llama4"],
         "Llama4TextExperts",
         Llama4TextExperts,
     )
-
-    def unpatch():
-        modeling_llama4.Llama4TextExperts = old_lamma_4_text_experts
-        setattr(
-            sys.modules["transformers.models.llama4"],
-            "Llama4TextExperts",
-            old_lamma_4_text_experts,
-        )
-
-    return unpatch
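The removed lines follow a common save-and-restore pattern: stash the original attribute, swap in the replacement, and return a closure that undoes the swap. A generic sketch of that pattern (names here are placeholders, not Axolotl APIs):

```python
import types


def patch_attribute(module: types.ModuleType, name: str, replacement):
    """Swap `module.name` for `replacement` and return a callable that restores it."""
    original = getattr(module, name)
    setattr(module, name, replacement)

    def unpatch():
        setattr(module, name, original)

    return unpatch
```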
@@ -1,78 +0,0 @@
-"""
-fix for FSDP2 evals when using torch.compile
-"""
-
-import inspect
-import logging
-
-from transformers import Trainer
-
-from axolotl.monkeypatch.utils import detab_code
-
-LOG = logging.getLogger(__name__)
-
-ORIGINAL_TRAINER_CODE = """
-    model.eval()
-"""
-
-PATCHED_TRAINER_CODE = """
-    if hasattr(model, "eval") and callable(model.eval):
-        self.model.eval()
-"""
-
-
-def get_evaluation_loop_code() -> str:
-    training_loop = inspect.getsource(Trainer.evaluation_loop)
-    return training_loop
-
-
-def check_evaluation_loop_is_patchable() -> bool:
-    eval_loop = get_evaluation_loop_code()
-    eval_loop, _ = detab_code(eval_loop)
-    return ORIGINAL_TRAINER_CODE in eval_loop
-
-
-def patch_evaluation_loop_for_fsdp2():
-    """
-    monkeypatch for fixing the eval loop for fsdp2 with torch.compile
-    """
-
-    try:
-        evaluation_loop = get_evaluation_loop_code()
-    except OSError:
-        return
-    Trainer._original_evaluation_loop = (  # pylint: disable=protected-access
-        evaluation_loop
-    )
-    evaluation_loop, _ = detab_code(evaluation_loop)
-    if ORIGINAL_TRAINER_CODE not in evaluation_loop:
-        return
-
-    evaluation_loop = evaluation_loop.replace(
-        ORIGINAL_TRAINER_CODE, PATCHED_TRAINER_CODE
-    )
-    evaluation_loop = evaluation_loop.replace(
-        "def evaluation_loop(",
-        "def _fixed_evaluation_loop(",
-        1,
-    )
-
-    # load imports necessary
-    import transformers.trainer
-
-    items_to_import = []
-    for item in dir(transformers.trainer):
-        if item in evaluation_loop:
-            items_to_import.append(item)
-
-    exec(  # pylint: disable=exec-used  # nosec B102
-        "from transformers.trainer import ("
-        + ", ".join(x for x in items_to_import)
-        + ")",
-        globals(),
-    )
-    exec(evaluation_loop, globals())  # pylint: disable=exec-used  # nosec B102
-    LOG.info("patching _inner_training_loop for fsdp optimizer save")
-    Trainer.evaluation_loop = (  # pylint: disable=protected-access
-        _fixed_evaluation_loop  # pylint: disable=undefined-variable  # noqa: F821
-    )
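The module above relies on rewriting the source text of `Trainer.evaluation_loop` and re-executing it. A stripped-down sketch of that getsource/replace/exec technique on a toy class (the class and replacement strings are placeholders for illustration):

```python
import inspect
import textwrap


class Widget:
    def describe(self):
        return "original"


# Fetch the source, rewrite it, and exec the patched copy under a new name.
source = textwrap.dedent(inspect.getsource(Widget.describe))
patched = source.replace('return "original"', 'return "patched"')
patched = patched.replace("def describe(", "def _patched_describe(", 1)

namespace: dict = {}
exec(patched, namespace)  # nosec B102 - illustration only
Widget.describe = namespace["_patched_describe"]

assert Widget().describe() == "patched"
```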
@@ -4,73 +4,30 @@ module for base dataset transform strategies
 
 import importlib
 import logging
-import sys
 
 LOG = logging.getLogger("axolotl")
 
 
-def import_from_path(module_name: str, file_path: str):
-    """
-    Import a module from a file path.
-
-    Args:
-        module_name: Name of the module.
-        file_path: Path to the file.
-
-    Returns:
-        module: The imported module.
-    """
-    spec = importlib.util.spec_from_file_location(module_name, file_path)
-    if spec is None:
-        raise ImportError(f"Could not create module spec for: {file_path}")
-    module = importlib.util.module_from_spec(spec)
-
-    sys.modules[module_name] = module
-    loader = importlib.machinery.SourceFileLoader(module_name, file_path)
-    spec.loader = loader
-    loader.exec_module(module)
-    return module
-
-
 def load(strategy, cfg, module_base=None, **kwargs):
-    if len(strategy.split(".")) == 1:
-        strategy = strategy + ".default"
-    load_fn = strategy.split(".")[-1]
-    func = None
-    if len(strategy.split(".")) > 1:
-        try:
-            mod = importlib.import_module(
-                strategy.split(".")[-2],
-                ".".join(strategy.split(".")[:-2]),
-            )
-            func = getattr(mod, load_fn)
-            return func(cfg, **kwargs)
-        except ModuleNotFoundError:
-            pass
-
-        try:
-            mod = importlib.import_module(
-                "." + ".".join(strategy.split(".")[:-1]), module_base
-            )
-            func = getattr(mod, load_fn)
-            return func(cfg, **kwargs)
-        except ModuleNotFoundError:
-            pass
-
-        try:
-            file_path = "/".join(strategy.split(".")[:-1]) + ".py"
-            module_name = strategy.split(".")[-2]
-            mod = import_from_path(module_name, file_path)
-            func = getattr(mod, load_fn)
-            if func is not None:
-                return func(cfg, **kwargs)
-        except FileNotFoundError:
-            pass
-    else:
-        strategy = "." + ".".join(strategy.split(".")[:-1])
-        mod = importlib.import_module(strategy, module_base)
-        func = getattr(mod, load_fn)
-        return func(cfg, **kwargs)
-
-    LOG.warning(f"unable to load strategy {strategy}")
-    return func
+    try:
+        if len(strategy.split(".")) == 1:
+            strategy = strategy + ".default"
+        load_fn = strategy.split(".")[-1]
+        if len(strategy.split(".")) > 1:
+            try:
+                importlib.import_module(
+                    strategy.split(".")[-2],
+                    ".".join(strategy.split(".")[:-2]),
+                )
+                module_base = ".".join(strategy.split(".")[:-2])
+                strategy = strategy.split(".")[-2]
+            except ModuleNotFoundError:
+                strategy = "." + ".".join(strategy.split(".")[:-1])
+        else:
+            strategy = "." + ".".join(strategy.split(".")[:-1])
+        mod = importlib.import_module(strategy, module_base)
+        func = getattr(mod, load_fn)
+        return func(cfg, **kwargs)
+    except Exception:  # pylint: disable=broad-exception-caught
+        LOG.warning(f"unable to load strategy {strategy}")
+        return None
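Both sides of this hunk resolve a dotted strategy string to a callable; the removed side adds a final fallback that loads a plain file via `import_from_path`. As a small worked example of how a `'{file_name}.{fn_name}'` string breaks down under that scheme (the names are illustrative):

```python
strategy = "rewards.rand_reward_func"

load_fn = strategy.split(".")[-1]                       # "rand_reward_func"
module_name = strategy.split(".")[-2]                   # "rewards"
file_path = "/".join(strategy.split(".")[:-1]) + ".py"  # "rewards.py"

# With the removed helper, the lookup would then finish roughly as:
#   mod = import_from_path(module_name, file_path)
#   func = getattr(mod, load_fn)
#   func(cfg, **kwargs)
```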
@@ -81,11 +81,6 @@ def setup_model_and_tokenizer(
     # Apply freezing if specified
     if cfg.unfrozen_parameters:
        freeze_layers_except(model, cfg.unfrozen_parameters)
-        if any(
-            any(embed in param for embed in ["lm_head", "embed_tokens"])
-            for param in cfg.unfrozen_parameters
-        ):
-            model.enable_input_require_grads()
 
     return model, tokenizer, peft_config, processor
 
@@ -4,7 +4,7 @@ includes logic for handling sequence parallelism collation.
 """
 
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, Optional, Union
 
 import numpy as np
 import torch
@@ -13,7 +13,6 @@ from transformers import PreTrainedTokenizerBase
 from transformers.utils import PaddingStrategy
 
 from axolotl.monkeypatch.attention.ring_attn import update_ring_attn_params
-from axolotl.monkeypatch.attention.ring_attn.patch import RingAttnFunc
 
 
 @dataclass
@@ -54,15 +53,14 @@ class DataCollatorForSeq2Seq:
     """
 
     tokenizer: PreTrainedTokenizerBase
-    model: Any | None = None
-    padding: bool | str | PaddingStrategy = True
-    max_length: int | None = None
-    pad_to_multiple_of: int | None = None
+    model: Optional[Any] = None
+    padding: Union[bool, str, PaddingStrategy] = True
+    max_length: Optional[int] = None
+    pad_to_multiple_of: Optional[int] = None
     label_pad_token_id: int = -100
     position_pad_token_id: int = 0
     return_tensors: str = "pt"
     sequence_parallel_degree: int = 1
-    ring_attn_func: RingAttnFunc | None = None
 
     def __post_init__(self):
         if self.sequence_parallel_degree > 1:
@@ -159,41 +157,19 @@ class DataCollatorForSeq2Seq:
             Sliced batch dictionary.
         """
         # Get local (start, end) for sequence parallelism slicing
-        total_seq_len = batch["input_ids"].size(1)
+        total_seq_len = batch["input_ids"].shape[1]
+        slice_size = total_seq_len // self.local_world_size
+        start = self.local_rank * slice_size
+        end = start + slice_size
 
-        # Update params for varlen ring attention calculation
-        if batch.get("position_ids") is not None:
-            update_ring_attn_params(position_ids=batch["position_ids"])
+        # Update params for ring attention calculation
+        update_ring_attn_params(batch=batch)
 
         # Slice batch for sequence parallel processing
-        for key in batch:
-            if batch[key].size(1) == total_seq_len:
-                if self.ring_attn_func in [
-                    RingAttnFunc.VARLEN_LLAMA3,
-                    RingAttnFunc.BATCH_RING,
-                ]:
-                    batch[key] = (
-                        batch[key]
-                        .chunk(self.local_world_size, dim=1)[self.local_rank]
-                        .contiguous()
-                    )
-                elif self.ring_attn_func is RingAttnFunc.BATCH_ZIGZAG:
-                    chunks = batch[key].chunk(2 * self.local_world_size, dim=1)
-
-                    # Take rank's chunk and opposing chunk for zigzag pattern
-                    selected_chunks = [
-                        chunks[self.local_rank],
-                        chunks[2 * self.local_world_size - self.local_rank - 1],
-                    ]
-                    batch[key] = torch.cat(selected_chunks, dim=1).contiguous()
-                elif self.ring_attn_func is RingAttnFunc.BATCH_STRIPE:
-                    # TODO(djsaunde): This doesn't seem to work as expected
-                    # Split into striped data and stack
-                    tensor = torch.stack(
-                        batch[key].split(self.local_world_size, dim=1),
-                        dim=1,
-                    ).transpose(1, 2)
-                    batch[key] = tensor[:, self.local_rank].contiguous()
+        keys_to_slice = ["input_ids", "attention_mask", "labels", "position_ids"]
+        for key in keys_to_slice:
+            if key in batch:
+                batch[key] = batch[key][:, start:end]
 
         return batch
 
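For intuition, the removed branches differ only in which chunks of each `(batch, seq_len)` tensor a rank keeps. A toy comparison of the ring and zigzag selections for two ranks (shapes and values invented for illustration):

```python
import torch

world_size, rank = 2, 0
x = torch.arange(8).unsqueeze(0)  # shape (1, 8): one sequence of length 8

# BATCH_RING / VARLEN_LLAMA3 path: contiguous chunk per rank -> rank 0 keeps [0..3]
ring = x.chunk(world_size, dim=1)[rank].contiguous()

# BATCH_ZIGZAG path: pair each rank's chunk with its mirror -> rank 0 keeps [0, 1, 6, 7]
chunks = x.chunk(2 * world_size, dim=1)
zigzag = torch.cat([chunks[rank], chunks[2 * world_size - rank - 1]], dim=1)

print(ring.tolist(), zigzag.tolist())  # [[0, 1, 2, 3]] [[0, 1, 6, 7]]
```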
@@ -332,23 +332,16 @@ def load_tokenized_prepared_datasets(
     if cfg.local_rank == 0 and not cfg.skip_prepare_dataset:
         LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
         if isinstance(dataset, IterableDataset):
-            num_workers = cfg.dataset_processes
 
-            def gen_from_iter_ds(_ds, worker_id: List[int], num_workers: List[int]):
-                """Generator function to correctly splice the dataset for each worker"""
-                for i, item in enumerate(_ds):
-                    if i % num_workers[0] == worker_id[0]:
-                        yield item
+            def gen_from_iter_ds(_ds, _=None):
+                yield from _ds
 
             ds_from_iter = Dataset.from_generator(
                 functools.partial(gen_from_iter_ds, dataset),
                 features=dataset.features,
-                num_proc=num_workers,
+                num_proc=cfg.dataset_processes,
                 split=split,
-                gen_kwargs={
-                    "worker_id": list(range(num_workers)),
-                    "num_workers": [num_workers] * num_workers,
-                },
+                gen_kwargs={"_": list(range(cfg.dataset_processes))},
             )
             ds_from_iter.save_to_disk(str(prepared_ds_path))
         else:
@@ -2,14 +2,13 @@
 module to freeze/unfreeze parameters by name
 """
 
+import logging
 import re
 from typing import Callable, List, Tuple, Union
 
-from accelerate.logging import get_logger
-
 from axolotl.utils.distributed import is_main_process
 
-LOG = get_logger(__name__)
+LOG = logging.getLogger("axolotl.utils.freeze")
 
 
 def freeze_layers_except(model, regex_patterns):
@@ -185,7 +184,7 @@ class LayerNamePattern:
         """
         self.raw_pattern = pattern
         name_pattern, self.range = self._parse_pattern(pattern)
-        self.name_regex = re.compile(re.sub(r"\.(?!\+)", "\\.", name_pattern))
+        self.name_regex = re.compile(name_pattern.replace(".", "\\."))
 
     def match(self, name: str) -> bool:
         """
@@ -542,17 +542,6 @@ class ModelLoader:
             from axolotl.monkeypatch.accelerate.fsdp2 import patch_accelerate_fsdp_utils
 
             patch_accelerate_fsdp_utils()
-
-        if self.cfg.flex_attention:
-            from axolotl.monkeypatch.attention.flex_attn import (
-                patch_flex_make_mask,
-                patch_flex_wrapper,
-            )
-
-            flex_attn_compile_kwargs = self.cfg.flex_attn_compile_kwargs or {}
-            patch_flex_wrapper(**flex_attn_compile_kwargs)
-            patch_flex_make_mask()
-
         # patch gemma3 conditional generation forward before loading plugins
         # as it could be overridden by plugins
         if self.cfg.model_config_type == "llama4":
@@ -655,7 +644,6 @@ class ModelLoader:
             register_ring_attn(
                 sequence_parallel_degree=self.cfg.sequence_parallel_degree,
                 heads_k_stride=self.cfg.heads_k_stride,
-                ring_attn_func=self.cfg.ring_attn_func,
             )
 
     def patch_attention(self) -> None:
@@ -917,6 +905,13 @@ class ModelLoader:
             self.model_config._attn_implementation = (  # pylint: disable=protected-access
                 "flex_attention"
             )
+            from axolotl.monkeypatch.attention.flex_attn import (
+                patch_flex_make_mask,
+                patch_flex_wrapper,
+            )
+
+            patch_flex_wrapper()
+            patch_flex_make_mask()
 
         elif self.cfg.flash_attention:
             if not self.cfg.sample_packing and self.cfg.s2_attention:
@@ -1120,7 +1115,7 @@ class ModelLoader:
 
         return skip_move_to_device
 
-    def adjust_model_config(self) -> None:
+    def ajust_model_config(self) -> None:
         if (
             hasattr(self.model, "config")
             and hasattr(self.model.config, "max_position_embeddings")
@@ -1280,7 +1275,7 @@ class ModelLoader:
         else:
             self.model.tie_weights()
 
-        self.adjust_model_config()
+        self.ajust_model_config()
 
         # log device memory usage
         if hasattr(self.model, "device") and self.model.device.type in (
@@ -225,7 +225,6 @@ class AxolotlInputConfig(
     sdp_attention: bool | None = None
     s2_attention: bool | None = None
     flex_attention: bool | None = None
-    flex_attn_compile_kwargs: dict[str, Any] | None = None
     flash_attention: bool | None = None
     flash_attn_cross_entropy: bool | None = None
     flash_attn_rms_norm: bool | None = None
@@ -259,7 +258,6 @@ class AxolotlInputConfig(
 
     sequence_parallel_degree: int | None = None
     heads_k_stride: int | None = None
-    ring_attn_func: str | None = None
 
     special_tokens: SpecialTokensConfig | None = None
     tokens: list[str] | None = None
@@ -1148,7 +1146,7 @@ class AxolotlInputConfig(
 
         return data
 
-    @field_validator("sequence_parallel_degree", mode="after")
+    @field_validator("sequence_parallel_degree", mode="before")
     @classmethod
     def check_sequence_parallel_degree(cls, value, info):
         if not value:
@@ -1160,12 +1158,9 @@ class AxolotlInputConfig(
                 "flash_attention: true must be set with sequence_parallel_degree > 1"
             )
 
-        if (
-            info.data.get("sample_packing")
-            and not info.data["micro_batch_size"] == 1
-        ):
+        if not info.data["micro_batch_size"] == 1:
             raise ValueError(
-                "micro_batch_size must be set to 1 when sample_packing is enabled"
+                "micro_batch_size must be set to 1 "
                 "due to a `ring-flash-attn` requirement"
             )
 
@@ -1192,34 +1187,6 @@ class AxolotlInputConfig(
 
         return value
 
-    @field_validator("ring_attn_func", mode="after")
-    @classmethod
-    def check_ring_attn_func(cls, value, info):
-        if not info.data.get("sequence_parallel_degree", 1) > 1:
-            return value
-
-        from axolotl.monkeypatch.attention.ring_attn.patch import RingAttnFunc
-
-        if value is not None:
-            # Set the ring attention function if passed in config
-            valid_funcs = list(RingAttnFunc)
-            if value in valid_funcs:
-                value = RingAttnFunc(value)
-            else:
-                raise ValueError(
-                    f"ring_attn_func: {value} must be one of {valid_funcs}"
-                )
-        else:
-            # Default ring attention function selection
-            sample_packing = info.data.get("sample_packing")
-            value = (
-                RingAttnFunc.VARLEN_LLAMA3
-                if sample_packing
-                else RingAttnFunc.BATCH_RING
-            )
-
-        return value
-
     @model_validator(mode="before")
     @classmethod
     def check_muon_deepspeed_fsdp(cls, data):
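The removed validator both checks an explicit `ring_attn_func` against the `RingAttnFunc` members and, when none is given, derives a default from `sample_packing`. That default rule reduces to a one-line conditional (string values used here for illustration; the removed code uses the enum members named above):

```python
def default_ring_attn_func(sample_packing: bool) -> str:
    # Packed samples use the varlen llama3 ring variant; otherwise batch ring.
    return "varlen_llama3" if sample_packing else "batch_ring"


assert default_ring_attn_func(True) == "varlen_llama3"
assert default_ring_attn_func(False) == "batch_ring"
```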
@@ -1309,14 +1276,11 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig):
         ):
             capabilities = data.get("capabilities")
             is_fsdp = data.get("fsdp") is not None
-            is_fsdp2 = (
-                data.get("fsdp_config") is not None
-                and str(data.get("fsdp_config").get("fsdp_version")) == "2"
-            )
-            if capabilities and capabilities.get("n_gpu", 0) > 1 and not is_fsdp2:
+            if capabilities and capabilities.get("n_gpu", 0) > 1:
                 if is_fsdp:
                     raise ValueError(
-                        "lora_mlp_kernel, lora_qkv_kernel, and lora_o_kernel are not compatible with FSDP1."
+                        "lora_mlp_kernel, lora_qkv_kernel, and lora_o_kernel are not compatible with FSDP."
                     )
         return data
 
@@ -17,7 +17,6 @@ from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
 from transformers.utils import is_torch_bf16_gpu_available
 
 from axolotl.core.trainer_builder import HFCausalTrainerBuilder, HFRLTrainerBuilder
-from axolotl.monkeypatch.trainer_eval_guard import patch_evaluation_loop_for_fsdp2
 from axolotl.utils.distributed import reduce_and_broadcast
 from axolotl.utils.environment import check_cuda_p2p_ib_support
 from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
@@ -236,8 +235,7 @@ def drop_long_seq(sample, sequence_len=2048, min_sequence_len=2):
 
 
 def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
-    drop_attn_mask = cfg.model_config_type in ["mamba", "gemma3"]
-    if drop_attn_mask:
+    if cfg.model_config_type in ["mamba", "gemma3"]:
         LOG.info("dropping attention_mask column")
         train_dataset = train_dataset.remove_columns("attention_mask")
         if eval_dataset:
@@ -627,12 +625,6 @@ def setup_trainer(
         A trainer instance (either `HFRLTrainer` or `HFCausalTrainer`) configured based
         on the provided parameters.
     """
-    if (
-        cfg.torch_compile
-        and cfg.fsdp_config
-        and str(cfg.fsdp_config.fsdp_version) == "2"
-    ):
-        patch_evaluation_loop_for_fsdp2()
     if cfg.rl:
         trainer_builder = HFRLTrainerBuilder(cfg, model, tokenizer, processor)
         trainer_builder.model_ref = model_ref
@@ -56,12 +56,11 @@ class TestPackedFlex:
                 "num_epochs": 1,
                 "micro_batch_size": 2,
                 "gradient_accumulation_steps": 2,
-                "gradient_checkpointing": True,
                 "output_dir": temp_dir,
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_torch_fused",
                 "lr_scheduler": "cosine",
-                "max_steps": 2,
+                "max_steps": 5,
                 "use_tensorboard": True,
                 "save_strategy": "no",
             }
@@ -177,7 +177,6 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
         "NCCL_P2P_LEVEL": "LOC",
         **current_env,
         "CUDA_VISIBLE_DEVICES": "1",
-        "VLLM_USE_V1": "0",
     }
     vllm_process_id = start_vllm(
         cfg.base_model,
@@ -265,7 +264,6 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
         "NCCL_P2P_LEVEL": "LOC",  # nccl can be brittle, assume P2P isn't reliable
         **current_env,
         "CUDA_VISIBLE_DEVICES": "1",
-        "VLLM_USE_V1": "0",
     }
     vllm_process_id = start_vllm(
         cfg.base_model,
@@ -621,6 +621,12 @@ class TestMultiGPULlama:
             temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
         )
 
+    # TODO: remove skip once deepspeed regression is fixed
+    # see https://github.com/huggingface/transformers/pull/37324
+    @pytest.mark.skipif(
+        transformers_version_eq("4.51.0"),
+        reason="zero3 is not supported with transformers==4.51.0",
+    )
     @pytest.mark.parametrize(
         "gradient_accumulation_steps",
         [1, 2],
@@ -3,7 +3,6 @@
 import os
 from pathlib import Path
 
-import pytest
 import yaml
 from accelerate.test_utils import execute_subprocess_async
 from transformers.testing_utils import get_torch_dist_unique_port
@@ -18,15 +17,8 @@ os.environ["WANDB_DISABLED"] = "true"
 class TestSequenceParallelism:
     """Test case for training with sequence parallelism enabled"""
 
-    def _run_sequence_parallel_test(
-        self,
-        temp_dir,
-        sample_packing=True,
-        micro_batch_size=1,
-        pad_to_sequence_len=True,
-        ring_attn_func=None,
-    ):
-        """Helper method to run sequence parallel tests with different configurations"""
+    def test_sequence_parallel_training(self, temp_dir):
+        # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
                 "base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -35,9 +27,9 @@ class TestSequenceParallelism:
                 "strict": False,
                 "sequence_len": 2048,
                 "adapter": "qlora",
-                "sample_packing": sample_packing,
-                "eval_sample_packing": sample_packing,
-                "pad_to_sequence_len": pad_to_sequence_len,
+                "sample_packing": True,
+                "eval_sample_packing": True,
+                "pad_to_sequence_len": True,
                 "lora_r": 8,
                 "lora_alpha": 16,
                 "lora_dropout": 0.05,
@@ -53,7 +45,7 @@ class TestSequenceParallelism:
                 ],
                 "num_epochs": 1,
                 "max_steps": 8,
-                "micro_batch_size": micro_batch_size,
+                "micro_batch_size": 1,
                 "gradient_accumulation_steps": 2,
                 "output_dir": temp_dir,
                 "learning_rate": 0.00001,
@@ -69,7 +61,6 @@ class TestSequenceParallelism:
                 "weight_decay": 0.0,
                 "use_tensorboard": True,
                 "sequence_parallel_degree": 2,
-                "ring_attn_func": ring_attn_func,
             }
         )
 
@@ -95,35 +86,3 @@ class TestSequenceParallelism:
         check_tensorboard(
             temp_dir + "/runs", "train/train_loss", 2.6, "Train Loss is too high"
         )
-
-    @pytest.mark.parametrize(
-        "sample_packing, micro_batch_size, pad_to_sequence_len, ring_attn_func",
-        [
-            (True, 1, True, None),  # defaults to varlen_llama3 ring_attn_func
-            (False, 2, True, None),  # defaults to batch_ring ring_attn_func
-            (False, 2, True, "batch_zigzag"),
-            # (False, 2, False),  # not yet working
-        ],
-        ids=[
-            "sample_packing, varlen_llama3 ring_attn_func",
-            "no sample_packing, no pad_to_sequence_len, batch_ring ring_attn_func",
-            "no sample_packing, no pad_to_sequence_len, batch_zigzag ring_attn_func",
-            # "no sample_packing, pad_to_sequence_len",  # not yet working
-        ],
-    )
-    def test_sequence_parallel_training(
-        self,
-        temp_dir,
-        sample_packing,
-        micro_batch_size,
-        pad_to_sequence_len,
-        ring_attn_func,
-    ):
-        """Test sequence parallel training with different configurations"""
-        self._run_sequence_parallel_test(
-            temp_dir,
-            sample_packing=sample_packing,
-            micro_batch_size=micro_batch_size,
-            pad_to_sequence_len=pad_to_sequence_len,
-            ring_attn_func=ring_attn_func,
-        )
@@ -144,7 +144,7 @@ def test_swiglu_mlp_integration(small_llama_model):
 def test_geglu_model_integration():
     """Test GeGLU activation with Gemma model."""
     model = AutoModelForCausalLM.from_pretrained(
-        "mhenrichsen/gemma-2b", torch_dtype=torch.float16, device_map="cuda:0"
+        "mhenrichsen/gemma-2b", torch_dtype=torch.float16, device_map="auto"
     )
     peft_config = get_peft_config(
         {
@@ -347,7 +347,7 @@ def test_model_architecture(model_config):
     """Test LoRA kernel patches across different model architectures."""
     # Load model with appropriate dtype
     model = AutoModelForCausalLM.from_pretrained(
-        model_config["name"], torch_dtype=model_config["dtype"], device_map="cuda:0"
+        model_config["name"], torch_dtype=model_config["dtype"], device_map="auto"
    )
 
     # Apply LoRA configuration
@@ -73,10 +73,7 @@ class TestRingAttention:
         self, mock_world_size, mock_rank, mock_new_group, partial_state
     ):
         """Test that ring attention groups are created correctly."""
-        from axolotl.monkeypatch.attention.ring_attn import (
-            RingAttnFunc,
-            register_ring_attn,
-        )
+        from axolotl.monkeypatch.attention.ring_attn import register_ring_attn
 
         # Setup mocks
         mock_world_size.return_value = 8  # 8 GPUs total
@@ -85,11 +82,7 @@ class TestRingAttention:
         mock_new_group.return_value = mock_group
 
         # Call register_ring_attn with size 4
-        register_ring_attn(
-            sequence_parallel_degree=4,
-            heads_k_stride=1,
-            ring_attn_func=RingAttnFunc.VARLEN_LLAMA3,
-        )
+        register_ring_attn(sequence_parallel_degree=4, heads_k_stride=1)
 
         # Verify the number of calls without examining the arguments
         assert mock_new_group.call_count == 2
@@ -1,85 +0,0 @@
-"""
-E2E tests for preprocessing
-"""
-
-import logging
-import os
-import unittest
-
-import transformers
-
-from axolotl.cli.args import PreprocessCliArgs
-from axolotl.common.datasets import load_preference_datasets
-from axolotl.utils.config import normalize_config, validate_config
-from axolotl.utils.dict import DictDefault
-
-from ..utils import with_temp_dir
-
-LOG = logging.getLogger("axolotl.tests.e2e")
-os.environ["WANDB_DISABLED"] = "true"
-
-
-class TestCustomRewardFunctionLoading(unittest.TestCase):
-    """
-    Test case for GRPO training using single GPU
-    """
-
-    def _utils_write_rewards(self):
-        # write cfg to yaml file
-        with open("rewards.py", "w", encoding="utf-8") as fout:
-            fout.write(
-                """import random
-def rand_reward_func(completions, **kwargs) -> list[float]:
-    return [random.uniform(0, 1) for _ in completions]
-
-def oai_gsm8k_transform(cfg, *args, **kwargs):
-    def transform_fn(example, tokenizer=None):
-        label = example["answer"].split("####")[-1].strip().replace(",", "")
-        return {
-            "prompt": [{"role": "user", "content": example["question"]},],
-            "answer": label,
-        }
-    return transform_fn, {"remove_columns": ["question"]}
-"""
-            )
-
-    @with_temp_dir
-    def test_custom_rewards_fn_preprocess(self, temp_dir):
-        # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "strict": False,
-                "rl": "grpo",
-                "trl": {
-                    "beta": 0.001,
-                    "max_completion_length": 256,
-                    "use_vllm": True,
-                    "num_generations": 4,
-                    "reward_funcs": [
-                        "rewards.rand_reward_func"
-                    ],  # format: '{file_name}.{fn_name}'
-                    "reward_weights": [1.0],
-                },
-                "datasets": [
-                    {
-                        "path": "openai/gsm8k",
-                        "name": "main",
-                        "type": "rewards.oai_gsm8k_transform",
-                    },
-                ],
-                "dataset_prepared_path": temp_dir,
-                "gradient_accumulation_steps": 1,
-                "micro_batch_size": 1,
-                "learning_rate": 0.000005,
-            }
-        )
-
-        self._utils_write_rewards()
-
-        cfg = validate_config(cfg)
-        normalize_config(cfg)
-        parser = transformers.HfArgumentParser(PreprocessCliArgs)
-        cli_args, _ = parser.parse_args_into_dataclasses(return_remaining_strings=True)
-
-        load_preference_datasets(cfg=cfg, cli_args=cli_args)