Compare commits

..

2 Commits

Author SHA1 Message Date
salman
e36d3c9f30 Merge branch 'main' into testingci 2025-07-24 09:13:15 +01:00
Salman Mohammadi
53614391ed wip 2025-07-24 09:12:55 +01:00
293 changed files with 4161 additions and 6897 deletions

View File

@@ -57,13 +57,6 @@ We welcome ideas for improvements and new features. To suggest an enhancement, o
5. Push your branch to your fork on GitHub. 5. Push your branch to your fork on GitHub.
6. Open a new pull request against the `main` branch of the axolotl repository. Include a clear and concise description of your changes, referencing any related issues. 6. Open a new pull request against the `main` branch of the axolotl repository. Include a clear and concise description of your changes, referencing any related issues.
#### Skipping CI Checks
You can skip certain CI checks by including specific keywords in your commit messages:
- `[skip ci]` or `skip ci` - Skips all CI checks for that commit
- `[skip-e2e]` or `skip-e2e` - Skips only end-to-end tests while running other CI checks
## Style Guidelines ## Style Guidelines
### Code Style ### Code Style

View File

@@ -54,7 +54,7 @@ jobs:
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
dockerfile: "Dockerfile-base" dockerfile: "Dockerfile-base"
- cuda: "128" - cuda: "128"
cuda_version: 12.8.1 cuda_version: 12.6.3
cudnn_version: "" cudnn_version: ""
python_version: "3.11" python_version: "3.11"
pytorch: 2.7.1 pytorch: 2.7.1
@@ -64,16 +64,9 @@ jobs:
cuda_version: 12.8.1 cuda_version: 12.8.1
cudnn_version: "" cudnn_version: ""
python_version: "3.11" python_version: "3.11"
pytorch: 2.8.0 pytorch: nightly
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
dockerfile: "Dockerfile-base" dockerfile: "Dockerfile-base-nightly"
# - cuda: "128"
# cuda_version: 12.8.1
# cudnn_version: ""
# python_version: "3.11"
# pytorch: nightly
# torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
# dockerfile: "Dockerfile-base-nightly"
# # "next" is for release candidates of pytorch # # "next" is for release candidates of pytorch
# - cuda: "128" # - cuda: "128"
# cuda_version: 12.8.1 # cuda_version: 12.8.1
@@ -129,13 +122,6 @@ jobs:
pytorch: 2.6.0 pytorch: 2.6.0
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
dockerfile: "Dockerfile-uv-base" dockerfile: "Dockerfile-uv-base"
- cuda: "126"
cuda_version: 12.6.3
cudnn_version: ""
python_version: "3.11"
pytorch: 2.7.1
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
dockerfile: "Dockerfile-uv-base"
- cuda: "128" - cuda: "128"
cuda_version: 12.8.1 cuda_version: 12.8.1
cudnn_version: "" cudnn_version: ""
@@ -143,13 +129,6 @@ jobs:
pytorch: 2.7.1 pytorch: 2.7.1
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
dockerfile: "Dockerfile-uv-base" dockerfile: "Dockerfile-uv-base"
- cuda: "128"
cuda_version: 12.8.1
cudnn_version: ""
python_version: "3.11"
pytorch: 2.8.0
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
dockerfile: "Dockerfile-uv-base"
steps: steps:
- name: Checkout - name: Checkout
uses: actions/checkout@v4 uses: actions/checkout@v4

View File

@@ -24,13 +24,12 @@ jobs:
cuda_version: 12.6.3 cuda_version: 12.6.3
python_version: "3.11" python_version: "3.11"
pytorch: 2.7.0 pytorch: 2.7.0
axolotl_extras: axolotl_extras: vllm
- cuda: 126 - cuda: 126
cuda_version: 12.6.3 cuda_version: 12.6.3
python_version: "3.11" python_version: "3.11"
pytorch: 2.7.1 pytorch: 2.7.1
axolotl_extras: vllm axolotl_extras:
is_latest: true
- cuda: 128 - cuda: 128
cuda_version: 12.8.1 cuda_version: 12.8.1
python_version: "3.11" python_version: "3.11"
@@ -98,12 +97,6 @@ jobs:
python_version: "3.11" python_version: "3.11"
pytorch: 2.7.1 pytorch: 2.7.1
axolotl_extras: axolotl_extras:
is_latest:
- cuda: 126
cuda_version: 12.6.3
python_version: "3.11"
pytorch: 2.7.1
axolotl_extras: vllm
is_latest: true is_latest: true
- cuda: 128 - cuda: 128
cuda_version: 12.8.1 cuda_version: 12.8.1
@@ -157,18 +150,6 @@ jobs:
python_version: "3.11" python_version: "3.11"
pytorch: 2.6.0 pytorch: 2.6.0
axolotl_extras: axolotl_extras:
- cuda: 126
cuda_version: 12.6.3
python_version: "3.11"
pytorch: 2.7.1
axolotl_extras:
is_latest:
- cuda: 126
cuda_version: 12.6.3
python_version: "3.11"
pytorch: 2.7.1
axolotl_extras: vllm
is_latest: true
runs-on: axolotl-gpu-runner runs-on: axolotl-gpu-runner
steps: steps:
- name: Checkout - name: Checkout

View File

@@ -37,14 +37,14 @@ jobs:
cuda_version: 12.6.3 cuda_version: 12.6.3
python_version: "3.11" python_version: "3.11"
pytorch: 2.7.0 pytorch: 2.7.0
axolotl_extras: axolotl_extras: vllm
num_gpus: 2 num_gpus: 2
nightly_build: "true" nightly_build: "true"
- cuda: 126 - cuda: 126
cuda_version: 12.6.3 cuda_version: 12.6.3
python_version: "3.11" python_version: "3.11"
pytorch: 2.7.1 pytorch: 2.7.1
axolotl_extras: vllm axolotl_extras:
num_gpus: 2 num_gpus: 2
nightly_build: "true" nightly_build: "true"
runs-on: [self-hosted, modal] runs-on: [self-hosted, modal]

View File

@@ -53,7 +53,6 @@ jobs:
- name: Netlify Publish - name: Netlify Publish
uses: nwtgck/actions-netlify@v3.0 uses: nwtgck/actions-netlify@v3.0
if: ${{ github.event.pull_request.head.repo.full_name == github.repository }}
id: netlify id: netlify
with: with:
publish-dir: './_site' publish-dir: './_site'

View File

@@ -105,8 +105,7 @@ jobs:
- name: Run tests - name: Run tests
run: | run: |
pytest -v --durations=10 -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ --ignore=tests/monkeypatch/ tests/ --cov=axolotl --cov-report=xml pytest -v --durations=10 -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/ --cov=axolotl --cov-report=xml
pytest -v --durations=10 tests/monkeypatch/ --cov=axolotl --cov-append --cov-report=xml
pytest -v --durations=10 tests/patched/ --cov=axolotl --cov-append --cov-report=xml pytest -v --durations=10 tests/patched/ --cov=axolotl --cov-append --cov-report=xml
pytest -v --durations=10 tests/cli/ --cov=axolotl --cov-append --cov-report=xml pytest -v --durations=10 tests/cli/ --cov=axolotl --cov-append --cov-report=xml
@@ -180,8 +179,8 @@ jobs:
- name: Run tests - name: Run tests
run: | run: |
pytest -v --durations=10 -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ --ignore=tests/monkeypatch/ tests/ --cov=axolotl --cov-report=xml pytest -v --durations=10 -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
pytest -v --durations=10 tests/monkeypatch/ --cov=axolotl --cov-append --cov-report=xml pytest -v --durations=10 tests/patched/
pytest -v --durations=10 tests/cli/ pytest -v --durations=10 tests/cli/
- name: cleanup pip cache - name: cleanup pip cache

View File

@@ -3,7 +3,7 @@ default_language_version:
repos: repos:
- repo: https://github.com/pre-commit/pre-commit-hooks - repo: https://github.com/pre-commit/pre-commit-hooks
rev: v6.0.0 rev: v5.0.0
hooks: hooks:
- id: check-yaml - id: check-yaml
- id: end-of-file-fixer - id: end-of-file-fixer
@@ -23,11 +23,11 @@ repos:
hooks: hooks:
- id: flake8 - id: flake8
- repo: https://github.com/pylint-dev/pylint - repo: https://github.com/pylint-dev/pylint
rev: v3.3.8 rev: v3.3.7
hooks: hooks:
- id: pylint - id: pylint
- repo: https://github.com/pre-commit/mirrors-mypy - repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.17.1 rev: v1.17.0
hooks: hooks:
- id: mypy - id: mypy
additional_dependencies: additional_dependencies:

View File

@@ -119,15 +119,14 @@ datasets:
## Dataset Processing ## Dataset Processing
| Option | Default | Description | | Option | Default | Description |
| --------------------------------- | -------------------------- | ----------------------------------- | | ----------------------------- | -------------------------- | --------------------------------- |
| `dataset_prepared_path` | `"data/last_run_prepared"` | Path for prepared dataset | | `dataset_prepared_path` | `"data/last_run_prepared"` | Path for prepared dataset |
| `push_dataset_to_hub` | `""` | Push dataset to HF hub | | `push_dataset_to_hub` | `""` | Push dataset to HF hub |
| `dataset_processes` | `4` | Number of preprocessing processes | | `dataset_processes` | `4` | Number of preprocessing processes |
| `dataset_keep_in_memory` | `false` | Keep dataset in memory | | `dataset_keep_in_memory` | `false` | Keep dataset in memory |
| `shuffle_merged_datasets` | `true` | Shuffle merged datasets | | `shuffle_merged_datasets` | `true` | Shuffle merged datasets |
| `shuffle_before_merging_datasets` | `false` | Shuffle each dataset before merging | | `dataset_exact_deduplication` | `true` | Deduplicate datasets |
| `dataset_exact_deduplication` | `true` | Deduplicate datasets |
## LoRA Configuration ## LoRA Configuration
@@ -185,6 +184,7 @@ datasets:
| `flash_attention` | `false` | Use flash attention | | `flash_attention` | `false` | Use flash attention |
| `flash_attn_cross_entropy` | `false` | Flash attention cross entropy | | `flash_attn_cross_entropy` | `false` | Flash attention cross entropy |
| `flash_attn_rms_norm` | `false` | Flash attention RMS norm | | `flash_attn_rms_norm` | `false` | Flash attention RMS norm |
| `flash_attn_fuse_qkv` | `false` | Fuse QKV operations |
| `flash_attn_fuse_mlp` | `false` | Fuse MLP operations | | `flash_attn_fuse_mlp` | `false` | Fuse MLP operations |
| `sdp_attention` | `false` | Use scaled dot product | | `sdp_attention` | `false` | Use scaled dot product |
| `s2_attention` | `false` | Use shifted sparse attention | | `s2_attention` | `false` | Use shifted sparse attention |

View File

@@ -296,6 +296,7 @@
# flash_attention: # flash_attention:
# flash_attn_cross_entropy: # Whether to use flash-attention cross entropy implementation - advanced use only # flash_attn_cross_entropy: # Whether to use flash-attention cross entropy implementation - advanced use only
# flash_attn_rms_norm: # Whether to use flash-attention rms norm implementation - advanced use only # flash_attn_rms_norm: # Whether to use flash-attention rms norm implementation - advanced use only
# flash_attn_fuse_qkv: # Whether to fuse QKV into a single operation
# flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation # flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation
# # Whether to use scaled-dot-product attention # # Whether to use scaled-dot-product attention
# # https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html # # https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
@@ -540,6 +541,7 @@ xformers_attention: ${XFORMERS_ATTENTION}
flash_attention: ${FLASH_ATTENTION} flash_attention: ${FLASH_ATTENTION}
flash_attn_cross_entropy: ${FLASH_ATTN_CROSS_ENTROPY} flash_attn_cross_entropy: ${FLASH_ATTN_CROSS_ENTROPY}
flash_attn_rms_norm: ${FLASH_ATTN_RMS_NORM} flash_attn_rms_norm: ${FLASH_ATTN_RMS_NORM}
flash_attn_fuse_qkv: ${FLASH_ATTN_FUSE_QKV}
flash_attn_fuse_mlp: ${FLASH_ATTN_FUSE_MLP} flash_attn_fuse_mlp: ${FLASH_ATTN_FUSE_MLP}
sdp_attention: ${SDP_ATTENTION} sdp_attention: ${SDP_ATTENTION}
s2_attention: ${S2_ATTENTION} s2_attention: ${S2_ATTENTION}

View File

@@ -1,10 +0,0 @@
cff-version: 1.2.0
type: software
title: "Axolotl: Post-Training for AI Models"
message: "If you use this software, please cite it as below."
authors:
- name: "Axolotl maintainers and contributors"
repository-code: "https://github.com/axolotl-ai-cloud/axolotl"
url: "https://axolotl.ai/"
license: Apache-2.0
date-released: "2023-05-30"

View File

@@ -25,28 +25,15 @@
## 🎉 Latest Updates ## 🎉 Latest Updates
- 2025/07:
- ND Parallelism support has been added into Axolotl. Compose Context Parallelism (CP), Tensor Parallelism (TP), and Fully Sharded Data Parallelism (FSDP) within a single node and across multiple nodes. Check out the [blog post](https://huggingface.co/blog/accelerate-nd-parallel) for more info.
- Axolotl adds more models: [GPT-OSS](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/gpt-oss), [Gemma 3n](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/gemma3n), [Liquid Foundation Model 2 (LFM2)](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/lfm2), and [Arcee Foundation Models (AFM)](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/afm).
- FP8 finetuning with fp8 gather op is now possible in Axolotl via `torchao`. Get started [here](https://docs.axolotl.ai/docs/mixed_precision.html#sec-fp8)!
- [Voxtral](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/voxtral), [Magistral 1.1](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral), and [Devstral](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/devstral) with mistral-common tokenizer support has been integrated in Axolotl!
- TiledMLP support for single-GPU to multi-GPU training with DDP, DeepSpeed and FSDP support has been added to support Arctic Long Sequence Training. (ALST). See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/alst) for using ALST with Axolotl!
- 2025/05: Quantization Aware Training (QAT) support has been added to Axolotl. Explore the [docs](https://docs.axolotl.ai/docs/qat.html) to learn more!
- 2025/03: Axolotl has implemented Sequence Parallelism (SP) support. Read the [blog](https://huggingface.co/blog/axolotl-ai-co/long-context-with-sequence-parallelism-in-axolotl) and [docs](https://docs.axolotl.ai/docs/sequence_parallelism.html) to learn how to scale your context length when fine-tuning.
<details>
<summary>Expand older updates</summary>
- 2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral) to start training your own Magistral models with Axolotl! - 2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral) to start training your own Magistral models with Axolotl!
- 2025/05: Quantization Aware Training (QAT) support has been added to Axolotl. Explore the [docs](https://docs.axolotl.ai/docs/qat.html) to learn more!
- 2025/04: Llama 4 support has been added in Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/llama-4) to start training your own Llama 4 models with Axolotl's linearized version! - 2025/04: Llama 4 support has been added in Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/llama-4) to start training your own Llama 4 models with Axolotl's linearized version!
- 2025/03: Axolotl has implemented Sequence Parallelism (SP) support. Read the [blog](https://huggingface.co/blog/axolotl-ai-co/long-context-with-sequence-parallelism-in-axolotl) and [docs](https://docs.axolotl.ai/docs/sequence_parallelism.html) to learn how to scale your context length when fine-tuning.
- 2025/03: (Beta) Fine-tuning Multimodal models is now supported in Axolotl. Check out the [docs](https://docs.axolotl.ai/docs/multimodal.html) to fine-tune your own! - 2025/03: (Beta) Fine-tuning Multimodal models is now supported in Axolotl. Check out the [docs](https://docs.axolotl.ai/docs/multimodal.html) to fine-tune your own!
- 2025/02: Axolotl has added LoRA optimizations to reduce memory usage and improve training speed for LoRA and QLoRA in single GPU and multi-GPU training (DDP and DeepSpeed). Jump into the [docs](https://docs.axolotl.ai/docs/lora_optims.html) to give it a try. - 2025/02: Axolotl has added LoRA optimizations to reduce memory usage and improve training speed for LoRA and QLoRA in single GPU and multi-GPU training (DDP and DeepSpeed). Jump into the [docs](https://docs.axolotl.ai/docs/lora_optims.html) to give it a try.
- 2025/02: Axolotl has added GRPO support. Dive into our [blog](https://huggingface.co/blog/axolotl-ai-co/training-llms-w-interpreter-feedback-wasm) and [GRPO example](https://github.com/axolotl-ai-cloud/grpo_code) and have some fun! - 2025/02: Axolotl has added GRPO support. Dive into our [blog](https://huggingface.co/blog/axolotl-ai-co/training-llms-w-interpreter-feedback-wasm) and [GRPO example](https://github.com/axolotl-ai-cloud/grpo_code) and have some fun!
- 2025/01: Axolotl has added Reward Modelling / Process Reward Modelling fine-tuning support. See [docs](https://docs.axolotl.ai/docs/reward_modelling.html). - 2025/01: Axolotl has added Reward Modelling / Process Reward Modelling fine-tuning support. See [docs](https://docs.axolotl.ai/docs/reward_modelling.html).
</details>
## ✨ Overview ## ✨ Overview
Axolotl is a tool designed to streamline post-training for various AI models. Axolotl is a tool designed to streamline post-training for various AI models.
@@ -92,20 +79,6 @@ docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest
Other installation approaches are described [here](https://docs.axolotl.ai/docs/installation.html). Other installation approaches are described [here](https://docs.axolotl.ai/docs/installation.html).
#### Cloud Providers
<details>
- [RunPod](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)
- [Vast.ai](https://cloud.vast.ai?ref_id=62897&template_id=bdd4a49fa8bce926defc99471864cace&utm_source=github&utm_medium=developer_community&utm_campaign=template_launch_axolotl&utm_content=readme)
- [PRIME Intellect](https://app.primeintellect.ai/dashboard/create-cluster?image=axolotl&location=Cheapest&security=Cheapest&show_spot=true)
- [Modal](https://www.modal.com?utm_source=github&utm_medium=github&utm_campaign=axolotl)
- [Novita](https://novita.ai/gpus-console?templateId=311)
- [JarvisLabs.ai](https://jarvislabs.ai/templates/axolotl)
- [Latitude.sh](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c)
</details>
### Your First Fine-tune ### Your First Fine-tune
```bash ```bash
@@ -147,22 +120,14 @@ Contributions are welcome! Please see our [Contributing Guide](https://github.co
## ❤️ Sponsors ## ❤️ Sponsors
Thank you to our sponsors who help make Axolotl possible:
- [Modal](https://www.modal.com?utm_source=github&utm_medium=github&utm_campaign=axolotl) - Modal lets you run
jobs in the cloud, by just writing a few lines of Python. Customers use Modal to deploy Gen AI models at large scale,
fine-tune large language models, run protein folding simulations, and much more.
Interested in sponsoring? Contact us at [wing@axolotl.ai](mailto:wing@axolotl.ai) Interested in sponsoring? Contact us at [wing@axolotl.ai](mailto:wing@axolotl.ai)
## 📝 Citing Axolotl
If you use Axolotl in your research or projects, please cite it as follows:
```bibtex
@software{axolotl,
title = {Axolotl: Post-Training for AI Models},
author = {{Axolotl maintainers and contributors}},
url = {https://github.com/axolotl-ai-cloud/axolotl},
license = {Apache-2.0},
year = {2023}
}
```
## 📜 License ## 📜 License
This project is licensed under the Apache 2.0 License - see the [LICENSE](LICENSE) file for details. This project is licensed under the Apache 2.0 License - see the [LICENSE](LICENSE) file for details.

10
TODO.md Normal file
View File

@@ -0,0 +1,10 @@
# todo list
- [] Validation of parameters for combinations that won't work
## things that are known not to work
- FSDP offload and gradient_checkpointing - https://github.com/pytorch/pytorch/issues/82203
- adamw_bnb_8bit doesn't play well with FSDP offload

View File

@@ -35,30 +35,25 @@ quartodoc:
- cli.train - cli.train
- cli.evaluate - cli.evaluate
- cli.args - cli.args
- cli.art
- cli.checks - cli.checks
- cli.config - cli.config
- cli.delinearize_llama4
- cli.inference - cli.inference
- cli.merge_lora - cli.merge_lora
- cli.merge_sharded_fsdp_weights - cli.merge_sharded_fsdp_weights
- cli.preprocess - cli.preprocess
- cli.quantize - cli.sweeps
- cli.utils
- cli.vllm_serve - cli.vllm_serve
- cli.cloud.base - cli.cloud.base
- cli.cloud.modal_ - cli.cloud.modal_
- cli.utils - cli.quantize
- cli.utils.args
- cli.utils.fetch
- cli.utils.load
- cli.utils.sweeps
- cli.utils.train
- title: Trainers - title: Trainers
desc: Training implementations desc: Training implementations
contents: contents:
- core.trainers.base - core.trainers.base
- core.trainers.trl - core.trainers.trl
- core.trainers.mamba - core.trainers.mamba
- core.trainers.relora
- core.trainers.dpo.trainer - core.trainers.dpo.trainer
- core.trainers.grpo.trainer - core.trainers.grpo.trainer
- core.trainers.grpo.sampler - core.trainers.grpo.sampler
@@ -274,7 +269,7 @@ website:
- docs/dataset_preprocessing.qmd - docs/dataset_preprocessing.qmd
- docs/multipack.qmd - docs/multipack.qmd
- docs/mixed_precision.qmd - docs/mixed_precision.qmd
- docs/optimizers.qmd - docs/gradient_accumulation.qmd
- section: "Advanced Features" - section: "Advanced Features"
contents: contents:
@@ -284,7 +279,6 @@ website:
- docs/custom_integrations.qmd - docs/custom_integrations.qmd
- docs/sequence_parallelism.qmd - docs/sequence_parallelism.qmd
- docs/gradient_checkpointing.qmd - docs/gradient_checkpointing.qmd
- docs/nd_parallelism.qmd
- section: "Troubleshooting" - section: "Troubleshooting"
contents: contents:

View File

@@ -2,7 +2,7 @@
set -e set -e
# Only run two tests at a time to avoid OOM on GPU (with coverage collection) # Only run two tests at a time to avoid OOM on GPU (with coverage collection)
pytest -v --durations=10 -n2 \ pytest -v -n2 \
--ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ \ --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ \
--ignore=/workspace/axolotl/tests/e2e/multigpu/patched/ \ --ignore=/workspace/axolotl/tests/e2e/multigpu/patched/ \
/workspace/axolotl/tests/e2e/multigpu/ \ /workspace/axolotl/tests/e2e/multigpu/ \
@@ -19,7 +19,5 @@ pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/patched/ \
--cov-append \ --cov-append \
--cov-report=xml:multigpu-coverage.xml --cov-report=xml:multigpu-coverage.xml
# Upload coverage to Codecov if CODECOV_TOKEN is available # Upload coverage to Codecov
if [ -n "$CODECOV_TOKEN" ]; then codecov upload-process -t "${CODECOV_TOKEN}" -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION} || true
codecov upload-process -t "${CODECOV_TOKEN}" -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION} || true
fi

View File

@@ -65,9 +65,6 @@ GPU_CONFIG = f"L40S:{N_GPUS}"
def run_cmd(cmd: str, run_folder: str): def run_cmd(cmd: str, run_folder: str):
import subprocess # nosec import subprocess # nosec
sp_env = os.environ.copy()
sp_env["AXOLOTL_DATASET_PROCESSES"] = "8"
# Propagate errors from subprocess. # Propagate errors from subprocess.
if exit_code := subprocess.call(cmd.split(), cwd=run_folder, env=sp_env): # nosec if exit_code := subprocess.call(cmd.split(), cwd=run_folder): # nosec
exit(exit_code) # pylint: disable=consider-using-sys-exit exit(exit_code) # pylint: disable=consider-using-sys-exit

View File

@@ -16,10 +16,7 @@ ENV PYTHON_VERSION=$PYTHON_VERSION
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
RUN apt-get update \ RUN apt-get update \
&& apt-get install -y --no-install-recommends \ && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config \
wget git build-essential ninja-build git-lfs libaio-dev pkg-config \
ibverbs-providers ibverbs-utils infiniband-diags \
librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm \
&& rm -rf /var/cache/apt/archives \ && rm -rf /var/cache/apt/archives \
&& rm -rf /var/lib/apt/lists/* \ && rm -rf /var/lib/apt/lists/* \
&& wget \ && wget \

View File

@@ -15,7 +15,7 @@ COPY scripts/motd /etc/motd
RUN pip install jupyterlab notebook ipywidgets && \ RUN pip install jupyterlab notebook ipywidgets && \
jupyter lab clean jupyter lab clean
RUN apt update && \ RUN apt update && \
apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop && \ apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm && \
rm -rf /var/cache/apt/archives && \ rm -rf /var/cache/apt/archives && \
rm -rf /var/lib/apt/lists/* && \ rm -rf /var/lib/apt/lists/* && \
mkdir -p ~/.ssh && \ mkdir -p ~/.ssh && \

View File

@@ -9,15 +9,13 @@ ENV HF_HUB_ENABLE_HF_TRANSFER="1"
EXPOSE 8888 EXPOSE 8888
EXPOSE 22 EXPOSE 22
COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh COPY scripts/cloud-entrypoint-term.sh /root/cloud-entrypoint.sh
COPY scripts/motd /etc/motd COPY scripts/motd /etc/motd
RUN pip install jupyterlab notebook ipywidgets && \ RUN pip install jupyterlab notebook ipywidgets && \
jupyter lab clean jupyter lab clean
RUN apt update && \ RUN apt install --yes --no-install-recommends openssh-server tmux sudo && \
apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm && \ pip3 install -U --no-cache-dir grpcio ray[default]==2.9.3 && \
rm -rf /var/cache/apt/archives && \
rm -rf /var/lib/apt/lists/* && \
mkdir -p ~/.ssh && \ mkdir -p ~/.ssh && \
chmod 700 ~/.ssh && \ chmod 700 ~/.ssh && \
printf "[ ! -z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \ printf "[ ! -z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \

View File

@@ -23,20 +23,6 @@ axolotl <command> [config.yml] [options]
The config file can be local or a URL to a raw YAML file. The config file can be local or a URL to a raw YAML file.
### Launcher Arguments
For commands that support multi-GPU (`train`, `evaluate`, ...), you can pass launcher-specific arguments using the `--` separator:
```bash
# Pass torchrun arguments
axolotl train config.yml --launcher torchrun -- --nproc_per_node=2 --nnodes=1
# Pass accelerate arguments
axolotl train config.yml --launcher accelerate -- --config_file=accelerate_config.yml --num_processes=4
```
Arguments after `--` are passed directly to the launcher (torchrun, accelerate launch, etc.).
## Command Reference ## Command Reference
### fetch ### fetch
@@ -94,11 +80,7 @@ axolotl train config.yml \
--num-epochs 3 --num-epochs 3
# Training without accelerate # Training without accelerate
axolotl train config.yml --launcher python axolotl train config.yml --no-accelerate
# Pass launcher-specific arguments using -- separator
axolotl train config.yml --launcher torchrun -- --nproc_per_node=2 --nnodes=1
axolotl train config.yml --launcher accelerate -- --config_file=accelerate_config.yml
# Resume training from checkpoint # Resume training from checkpoint
axolotl train config.yml --resume-from-checkpoint path/to/checkpoint axolotl train config.yml --resume-from-checkpoint path/to/checkpoint
@@ -193,9 +175,6 @@ Evaluates a model's performance (loss etc) on the train and eval datasets.
```bash ```bash
# Basic evaluation # Basic evaluation
axolotl evaluate config.yml axolotl evaluate config.yml
# Evaluation with launcher arguments
axolotl evaluate config.yml --launcher torchrun -- --nproc_per_node=2
``` ```
### lm-eval ### lm-eval
@@ -308,6 +287,9 @@ axolotl preprocess config.yml --cloud cloud_config.yml
# Train on cloud # Train on cloud
axolotl train config.yml --cloud cloud_config.yml axolotl train config.yml --cloud cloud_config.yml
# Train without accelerate on cloud
axolotl train config.yml --cloud cloud_config.yml --no-accelerate
# Run lm-eval on cloud # Run lm-eval on cloud
axolotl lm-eval config.yml --cloud cloud_config.yml axolotl lm-eval config.yml --cloud cloud_config.yml
``` ```

View File

@@ -212,11 +212,10 @@ Instead of passing `tools` via the system prompt, an alternative method would be
Tools need to follow [JSON schema](https://json-schema.org/learn/getting-started-step-by-step). Tools need to follow [JSON schema](https://json-schema.org/learn/getting-started-step-by-step).
::: :::
Example config for Llama4:
```yaml ```yaml
chat_template: llama4 chat_template: llama4
datasets: datasets:
- path: Nanobit/text-tools-2k-test - path: ...
type: chat_template type: chat_template
# field_tools: tools # default is `tools` # field_tools: tools # default is `tools`
``` ```

View File

@@ -136,7 +136,3 @@ description: Frequently asked questions
> dynamic: false > dynamic: false
> mode: max-autotune-no-cudagraphs > mode: max-autotune-no-cudagraphs
> ``` > ```
**Q: `ValueError("Backward pass should have cleared tracker of all tensors")`
> A: This may happen due to edge cases in using the modern OffloadActivations context manager for CUDA streams. If you encounter this error, you may have success using the naive implementation with `offload_activations: legacy` in your YAML.

View File

@@ -124,13 +124,10 @@ For providers supporting Docker:
- Use `axolotlai/axolotl-cloud:main-latest` - Use `axolotlai/axolotl-cloud:main-latest`
- Available on: - Available on:
- [RunPod](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz) - [Latitude.sh](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c)
- [Vast.ai](https://cloud.vast.ai?ref_id=62897&template_id=bdd4a49fa8bce926defc99471864cace&utm_source=axolotl&utm_medium=partner&utm_campaign=template_launch_july2025&utm_content=docs_link) - [JarvisLabs.ai](https://jarvislabs.ai/templates/axolotl)
- [PRIME Intellect](https://app.primeintellect.ai/dashboard/create-cluster?image=axolotl&location=Cheapest&security=Cheapest&show_spot=true) - [RunPod](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)
- [Modal](https://www.modal.com?utm_source=github&utm_medium=github&utm_campaign=axolotl) - [Novita](https://novita.ai/gpus-console?templateId=311)
- [Novita](https://novita.ai/gpus-console?templateId=311)
- [JarvisLabs.ai](https://jarvislabs.ai/templates/axolotl)
- [Latitude.sh](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c)
### Google Colab {#sec-colab} ### Google Colab {#sec-colab}

View File

@@ -69,19 +69,11 @@ export NCCL_BUFFSIZE=2097152
Run the following on each node: Run the following on each node:
### Option 1: New Axolotl CLI with launcher args (Recommended)
```bash
axolotl train config.yaml --launcher torchrun -- --nnodes $num_nodes --nproc_per_node $gpu_per_node --rdzv_id $rdzv_id --rdzv_backend c10d --rdzv_endpoint "$head_node_ip:$head_node_port"
```
### Option 2: Direct torchrun (Legacy)
```bash ```bash
torchrun --nnodes $num_nodes --nproc_per_node $gpu_per_node --rdzv_id $rdzv_id --rdzv_backend c10d --rdzv_endpoint "$head_node_ip:$head_node_port" -m axolotl.cli.train config.yaml torchrun --nnodes $num_nodes --nproc_per_node $gpu_per_node --rdzv_id $rdzv_id --rdzv_backend c10d --rdzv_endpoint "$head_node_ip:$head_node_port" -m axolotl.cli.train config.yaml
``` ```
Please make sure to substitute the placeholder variables: Please make sure to substitute the placeholder variables.
- `num_nodes`: Number of nodes (containing GPUs) - `num_nodes`: Number of nodes (containing GPUs)
- `gpu_per_node`: Number of gpus per node - `gpu_per_node`: Number of gpus per node
@@ -89,6 +81,8 @@ Please make sure to substitute the placeholder variables:
- `head_node_port`: Port of the head node (make sure other machines can connect to this. Default 29400) - `head_node_port`: Port of the head node (make sure other machines can connect to this. Default 29400)
- `rdzv_id`: A unique job ID that is used by the job across nodes. - `rdzv_id`: A unique job ID that is used by the job across nodes.
The new CLI approach (Option 1) is recommended as it provides consistent argument handling and works seamlessly with other Axolotl CLI features. ::: {.callout-note}
You need to call `axolotl.cli.train` instead of `axolotl train` as the latter calls accelerate under the hood
:::
More info on the available configs can be found on the Pytorch docs [here](https://pytorch.org/docs/stable/elastic/run.html) More info on the available configs can be found on the Pytorch docs [here](https://pytorch.org/docs/stable/elastic/run.html)

View File

@@ -1,108 +0,0 @@
---
title: "N-D Parallelism (Beta)"
---
Axolotl enables training models at scale by composing different parallelism techniques. This is essential when:
- A model's weights are too large to fit on a single GPU's memory.
- A model's activations, especially with very long contexts, are too large for a single GPU.
- You want to accelerate training by using multiple GPUs or nodes.
or combinations of the above!
## Core Concepts
Parallelism strategies can be combined. The key is understanding how each one divides the workload. PyTorch's `DeviceMesh` is the modern way to manage these combinations, creating a logical grid of your GPUs and assigning different parallel strategies to different dimensions of the grid.
### Data Parallelism {#sec-dp}
Data Parallelism focuses on splitting the global data batch across GPUs.
- Distributed Data Parallel (DDP): The classic approach. The full model is replicated on every GPU. Each GPU processes a different slice of the data batch. Gradients are then averaged across all GPUs after the backward pass to keep the models synchronized. This can substantially improve data throughput compared to single-device training, but requires that each GPU is able to hold the entire model, its gradients, and optimizer states.
- [Fully Sharded Data Parallel (FSDP)](multi-gpu.qmd#fully-sharded-data-parallel-(fsdp)): A highly memory-efficient form of data parallelism (inspired by DeepSpeed's ZeRO). Instead of replicating the model, FSDP shards the model's *parameters, gradients, and optimizer states* across the GPUs in the data-parallel group. During computation, each GPU receives the specific parameters it needs via an `all_gather` operation just before they are used, and they can be discarded immediately after (`reshard-after-forward`).
- FSDP maps to ZeRO stages:
- ZeRO-2 (`reshard_after_forward=False`): Shards gradients and optimizer states. Model weights are replicated on each GPU.
- ZeRO-3 (`reshard_after_forward=True`): Shards gradients, optimizer states, AND model parameters. This provides the most memory savings at the cost of more communication (re-gathering parameters for both forward and backward passes).
### [Experimental] Tensor Parallelism (TP) {#sec-tp}
Also known as "horizontal model parallelism," as described in the [Megatron-LM paper](https://arxiv.org/pdf/1909.08053.pdf). Instead of splitting the batch, TP splits the model's layers themselves across GPUs.
- How it works: For a linear layer `Y = XA`, the weight matrix `A` is split column-wise (`A = [A_1, A_2]`). The computation becomes `Y_1 = XA_1` and `Y_2 = XA_2`, which can happen in parallel on different GPUs. The final output `Y` is simply the concatenation of `Y_1` and `Y_2`. Check [this comment](https://github.com/huggingface/transformers/issues/10321#issuecomment-783543530) for more detailed info.
- Requirement: TP involves frequent, small communications within a forward/backward pass. It requires a very fast interconnect between GPUs (e.g., NVLink) and is typically not recommended across different nodes.
### Context Parallelism (CP) {#sec-cp}
Context Parallelism, also called [Sequence Parallelism](sequence_parallelism.qmd), addresses the memory bottleneck from long sequences. The input sequence itself is split along the sequence length dimension and distributed across GPUs.
- How it works: If you have a sequence of 8192 tokens and a `context_parallel_size` of 4, each GPU will only handle a chunk of 2048 tokens.
- The Challenge: Attention is not local; every token needs to "attend to" every other token. Splitting the sequence breaks this.
- The Solution (`ring-flash-attention`): An efficient communication protocol is used. To compute attention for its local sequence chunk, each GPU passes its Key-Value (KV) cache to its neighbor in a "ring." After `N-1` steps, every GPU has seen the KV-cache from all other GPUs, allowing it to compute the correct attention values for its chunk. This is implemented using the highly optimized `flash-attention` kernel at each step.
### Hybrid Sharding Data Parallel (HSDP) {#sec-hsdp}
HSDP is a 2D strategy that intelligently combines FSDP and DDP, typically for multi-node training.
- Intra-Node (within a machine): Use FSDP. This is efficient because GPUs on the same node have fast interconnects (NVLink), making the `all_gather` operations for sharded parameters fast.
- Inter-Node (across machines): Use DDP. The gradient synchronization between nodes is less frequent than FSDP's parameter gathering, making it a better fit for the slower node-to-node network (e.g., Ethernet/Infiniband).
- Example: With 2 nodes of 8 GPUs each (16 total), you could have `dp_shard_size=8` (FSDP within each node) and `dp_replicate_size=2` (DDP across the two nodes).
## Usage
```yaml
# FSDP config. See https://docs.axolotl.ai/docs/multi-gpu.html#sec-fsdp
fsdp_version: 2
fsdp_config:
# ...
# The number of GPUs to shard the model parameters across (FSDP dimension).
dp_shard_size: 4
# The number of times to replicate the sharded model (DDP dimension).
dp_replicate_size: 2
# Number of GPUs for Tensor Parallelism.
tensor_parallel_size: 1 # (default is 1, no TP)
# Number of GPUs for Context/Sequence Parallelism.
context_parallel_size: 1 # (default is 1, no CP)
```
Note: We recommend FSDP. DeepSpeed is only compatible with `tensor_parallel_size`.
## Examples
::: {.callout-tip}
See our example configs [here](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/distributed-parallel).
:::
1. HSDP on 2 nodes with 4 GPUs each (8 GPUs total):
- You want FSDP within each node and DDP across nodes.
- Set `dp_shard_size: 4` and `dp_replicate_size: 2`.
2. FSDP + TP on a single 8-GPU node:
- You want to split the model across 4 GPUs using FSDP, and further split each layer across 2 GPUs with TP.
- Set `dp_shard_size: 4` and `tensor_parallel_size: 2`.
3. FSDP + CP on a single 8-GPU node for long context:
- You want to shard the model across all 8 GPUs and also split the sequence length across all 8 GPUs.
- Set `dp_shard_size: 8` and `context_parallel_size: 8`. Note: this means the data parallel group and context parallel group are the same. A more common setup might be to shard across a smaller group.
## Support Matrix
This matrix describes how different parallelism methods can be combined in Axolotl.
| Combination | `dp_replicate_size` | `dp_shard_size` | `tp_size` | `cp_size` | Status & Notes |
| --- | :---: | :---: |:---:|:---:|---|
| **FSDP** (ZeRO-3) | 1 | >1 | 1 | 1 | ✅ Fully supported. Shards model across all GPUs. |
| **HSDP** | >1 | >1 | 1 | 1 | ✅ Fully supported. FSDP intra-node, DDP inter-node. |
| **FSDP + TP** | 1 | >1 | >1 | 1 | ✅ **2D Parallelism**. Shards the model across a `dp_shard` group, and TP-splits layers within the `tp` group. |
| **HSDP + TP** | >1 | >1 | >1 | 1 | ✅ **3D Parallelism**. A powerful but complex combination. |
| **FSDP + CP** | 1 | >1 | 1 | >1 | ✅ **2D Parallelism**. Combines FSDP with context parallelism. |
| **FSDP + TP + CP**| 1 | >1 | >1| >1| ✅ **3D Parallelism**. Another advanced combination. |
| DDP + TP/CP | >1 | 1 | >1 | >1 | ❌ **Not Supported**. The `ParallelismConfig` explicitly prevents this, as composing pure DDP with TP or CP is currently not supported. You should use FSDP + TP/CP instead (`dp_shard_size > 1`). |
| Just TP / CP | 1 | 1 | >1 | >1 | ✅ Supported. Useful for inference or when the model fits on one GPU but context is too long. |
- `tp_size` refers to `tensor_parallel_size`
- `cp_size` refers to `context_parallel_size`

View File

@@ -1,129 +0,0 @@
---
title: Optimizers
description: Configuring optimizers
---
## Overview
Axolotl supports all optimizers supported by [transformers OptimizerNames](https://github.com/huggingface/transformers/blob/51f94ea06d19a6308c61bbb4dc97c40aabd12bad/src/transformers/training_args.py#L142-L187)
Here is a list of optimizers supported by transformers as of `v4.54.0`:
- `adamw_torch`
- `adamw_torch_fused`
- `adamw_torch_xla`
- `adamw_torch_npu_fused`
- `adamw_apex_fused`
- `adafactor`
- `adamw_anyprecision`
- `adamw_torch_4bit`
- `adamw_torch_8bit`
- `ademamix`
- `sgd`
- `adagrad`
- `adamw_bnb_8bit`
- `adamw_8bit` # alias for adamw_bnb_8bit
- `ademamix_8bit`
- `lion_8bit`
- `lion_32bit`
- `paged_adamw_32bit`
- `paged_adamw_8bit`
- `paged_ademamix_32bit`
- `paged_ademamix_8bit`
- `paged_lion_32bit`
- `paged_lion_8bit`
- `rmsprop`
- `rmsprop_bnb`
- `rmsprop_bnb_8bit`
- `rmsprop_bnb_32bit`
- `galore_adamw`
- `galore_adamw_8bit`
- `galore_adafactor`
- `galore_adamw_layerwise`
- `galore_adamw_8bit_layerwise`
- `galore_adafactor_layerwise`
- `lomo`
- `adalomo`
- `grokadamw`
- `schedule_free_radam`
- `schedule_free_adamw`
- `schedule_free_sgd`
- `apollo_adamw`
- `apollo_adamw_layerwise`
- `stable_adamw`
## Custom Optimizers
Enable custom optimizers by passing a string to the `optimizer` argument. Each optimizer will receive beta and epsilon args, however, some may accept additional args which are detailed below.
### optimi_adamw
```yaml
optimizer: optimi_adamw
```
### ao_adamw_4bit
Deprecated: Please use `adamw_torch_4bit`.
### ao_adamw_8bit
Deprecated: Please use `adamw_torch_8bit`.
### ao_adamw_fp8
```yaml
optimizer: ao_adamw_fp8
```
### adopt_adamw
GitHub: [https://github.com/iShohei220/adopt](https://github.com/iShohei220/adopt)
Paper: [https://arxiv.org/abs/2411.02853](https://arxiv.org/abs/2411.02853)
```yaml
optimizer: adopt_adamw
```
### came_pytorch
GitHub: [https://github.com/yangluo7/CAME/tree/master](https://github.com/yangluo7/CAME/tree/master)
Paper: [https://arxiv.org/abs/2307.02047](https://arxiv.org/abs/2307.02047)
```yaml
optimizer: came_pytorch
# optional args (defaults below)
adam_beta1: 0.9
adam_beta2: 0.999
adam_beta3: 0.9999
adam_epsilon: 1e-30
adam_epsilon2: 1e-16
```
### muon
Blog: [https://kellerjordan.github.io/posts/muon/](https://kellerjordan.github.io/posts/muon/)
Paper: [https://arxiv.org/abs/2502.16982v1](https://arxiv.org/abs/2502.16982v1)
```yaml
optimizer: muon
```
### dion
Microsoft's Dion (DIstributed OrthoNormalization) optimizer is a scalable and communication-efficient
orthonormalizing optimizer that uses low-rank approximations to reduce gradient communication.
GitHub: [https://github.com/microsoft/dion](https://github.com/microsoft/dion)
Paper: [https://arxiv.org/pdf/2504.05295](https://arxiv.org/pdf/2504.05295)
Note: Implementation written for PyTorch 2.7+ for DTensor
```yaml
optimizer: dion
dion_lr: 0.01
dion_momentum: 0.95
lr: 0.00001 # learning rate for embeddings and parameters that fallback to AdamW
```

View File

@@ -22,7 +22,7 @@ To enable sequence parallelism, add the following to your configuration file:
```yaml ```yaml
# Set to a divisor (> 1) of the number of GPUs available # Set to a divisor (> 1) of the number of GPUs available
context_parallel_size: 4 # Split sequences across 4 GPUs sequence_parallel_degree: 4 # Split sequences across 4 GPUs
# Optional; strides across the key dimension. Larger values use more memory but should make training faster. # Optional; strides across the key dimension. Larger values use more memory but should make training faster.
heads_k_stride: 1 heads_k_stride: 1
# Optional; one of "varlen_llama3" or "batch_ring". Defaults to # Optional; one of "varlen_llama3" or "batch_ring". Defaults to
@@ -30,7 +30,7 @@ heads_k_stride: 1
ring_attn_func: ring_attn_func:
``` ```
The `context_parallel_size` should be a divisor of the total number of GPUs. For example: The `sequence_parallel_degree` should be a divisor of the total number of GPUs. For example:
- With 8 GPUs, valid values would be 2, 4, or 8 - With 8 GPUs, valid values would be 2, 4, or 8
- With 4 GPUs, valid values would be 2 or 4 - With 4 GPUs, valid values would be 2 or 4
@@ -66,7 +66,7 @@ sequence_len: 8192
... ...
context_parallel_size: 4 # Split each sequence into 4 parts, one per GPU sequence_parallel_degree: 4 # Split each sequence into 4 parts, one per GPU
# Optional; strides across the key dimension. Larger values use more memory but should make training faster. # Optional; strides across the key dimension. Larger values use more memory but should make training faster.
heads_k_stride: 1 heads_k_stride: 1
# Optional; one of "varlen_llama3" or "batch_ring". Defaults to # Optional; one of "varlen_llama3" or "batch_ring". Defaults to
@@ -89,12 +89,12 @@ Sequence parallelism is compatible with Axolotl's sample packing functionality.
## Effect on Batch Size ## Effect on Batch Size
When using sequence parallelism, your effective global batch size is **divided** by the `context_parallel_size`. This happens because: When using sequence parallelism, your effective global batch size is **divided** by the `sequence_parallel_degree`. This happens because:
- Each group of `context_parallel_size` GPUs works on the same batch (just different parts of each sequence) - Each group of `sequence_parallel_degree` GPUs works on the same batch (just different parts of each sequence)
- The number of batches processed per step decreases - The number of batches processed per step decreases
For example: For example:
- With 8 GPUs and no sequence parallelism: 8 different batches processed per step - With 8 GPUs and no sequence parallelism: 8 different batches processed per step
- With 8 GPUs and `context_parallel_size=4`: Only 2 different batches processed per step (each split across 4 GPUs) - With 8 GPUs and `sequence_parallel_degree=4`: Only 2 different batches processed per step (each split across 4 GPUs)
- If your per-GPU `micro_batch_size` is 2, the global batch size decreases from 16 to 4 - If your per-GPU `micro_batch_size` is 2, the global batch size decreases from 16 to 4

View File

@@ -1,9 +0,0 @@
# Arctic Long Sequence Training (ALST)
Artic Long Sequence Training (ALST) is a technique for training long context models using a variety of optimization
techniques. It is a combination of:
- TiledMLP: Leverage tiling over the sequence dimension on MLP layers to reduce memory usage
- Tiled Loss: Using optimized loss functions like Liger-Kernel or Cut Cross Entropy to reduce memory usage
- Activation Offloading: Offload activations to CPU RAM to reduce memory usage
For more information, you can check out the ALST paper [here](https://www.arxiv.org/abs/2506.13996).

View File

@@ -1,53 +0,0 @@
base_model: meta-llama/Llama-3.1-8B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
datasets:
- path: togethercomputer/Long-Data-Collections
type: completion
field: text
data_files:
- pretrain/rp_sub.jsonl.zst
- path: princeton-nlp/TextbookChapters
type: completion
field: chapter
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out
sequence_len: 500_000
min_sample_len: 200_000
sample_packing: true
tiled_mlp: true
context_parallel_size: 8
plugins:
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_8bit
lr_scheduler: cosine
learning_rate: 2e-5
bf16: auto
tf32: true
gradient_checkpointing: true
activation_offloading: legacy
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_steps: 100
saves_per_epoch: 1
evals_per_epoch: 2
weight_decay: 0.0
special_tokens:
pad_token: <|end_of_text|>
deepspeed: deepspeed_configs/zero3_bf16_cpuoffload_all.json
# save_first_step: true # uncomment this to validate checkpoint saving works with your config

View File

@@ -1,59 +0,0 @@
base_model: meta-llama/Llama-3.1-8B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
datasets:
- path: togethercomputer/Long-Data-Collections
type: completion
field: text
data_files:
- pretrain/rp_sub.jsonl.zst
- path: princeton-nlp/TextbookChapters
type: completion
field: chapter
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out
sequence_len: 500_000
min_sample_len: 200_000
sample_packing: true
tiled_mlp: true
context_parallel_size: 8
plugins:
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_8bit
lr_scheduler: cosine
learning_rate: 2e-5
bf16: auto
tf32: true
gradient_checkpointing: true
activation_offloading: legacy
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_steps: 100
saves_per_epoch: 1
evals_per_epoch: 2
weight_decay: 0.0
special_tokens:
pad_token: <|end_of_text|>
fsdp_version: 2
fsdp_config:
offload_params: false # offloading is currently not compatible with SP + torchao optimizer
state_dict_type: SHARDED_STATE_DICT
auto_wrap_policy: TRANSFORMER_BASED_WRAP
transformer_layer_cls_to_wrap: LlamaDecoderLayer
reshard_after_forward: true
# save_first_step: true # uncomment this to validate checkpoint saving works with your config

View File

@@ -1,53 +0,0 @@
# Finetune ArceeAI's AFM with Axolotl
[Arcee Foundation Models (AFM)](https://huggingface.co/collections/arcee-ai/afm-45b-68823397c351603014963473) are a family of 4.5B parameter open weight models trained by Arcee.ai.
This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.
Thanks to the team at Arcee.ai for using Axolotl in supervised fine-tuning the AFM model.
## Getting started
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from main as AFM is only on nightly or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).
Here is an example of how to install from main for pip:
```bash
# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
git clone https://github.com/axolotl-ai-cloud/axolotl.git
cd axolotl
pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation -e '.[flash-attn]'
```
2. Run the finetuning example:
```bash
axolotl train examples/arcee/afm-4.5b-qlora.yaml
```
This config uses about 7.8GiB VRAM.
Let us know how it goes. Happy finetuning! 🚀
### TIPS
- For inference, the official Arcee.ai team recommends `top_p: 0.95`, `temperature: 0.5`, `top_k: 50`, and `repeat_penalty: 1.1`.
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
## Optimization Guides
- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
## Related Resources
- [AFM Blog](https://docs.arcee.ai/arcee-foundation-models/introduction-to-arcee-foundation-models)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)

View File

@@ -1,64 +0,0 @@
base_model: arcee-ai/AFM-4.5B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
plugins:
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
load_in_8bit: false
load_in_4bit: true
datasets:
- path: fozziethebeat/alpaca_messages_2k_test
type: chat_template
dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out
adapter: qlora
lora_model_dir:
sequence_len: 2048
sample_packing: true
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
- gate_proj
- down_proj
- up_proj
- q_proj
- v_proj
- k_proj
- o_proj
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
# save_first_step: true # uncomment this to validate checkpoint saving works with your config

View File

@@ -66,7 +66,7 @@ flash_optimum:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_ratio: 0.1 warmup_steps: 32
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
save_total_limit: save_total_limit:

View File

@@ -43,7 +43,7 @@ xformers_attention: true
flash_attention: flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -47,7 +47,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -47,7 +47,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -47,7 +47,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -54,7 +54,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: evals_per_epoch:
saves_per_epoch: 1 saves_per_epoch: 1

View File

@@ -57,7 +57,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: evals_per_epoch:
saves_per_epoch: 1 saves_per_epoch: 1

View File

@@ -41,7 +41,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: evals_per_epoch:
saves_per_epoch: 1 saves_per_epoch: 1

View File

@@ -51,7 +51,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 1 evals_per_epoch: 1
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -47,7 +47,7 @@ xformers_attention: true
flash_attention: flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_ratio: 0.1 warmup_steps: 40
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -77,7 +77,7 @@ xformers_attention: true
flash_attention: flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.000001 weight_decay: 0.000001

View File

@@ -44,7 +44,7 @@ xformers_attention: true
flash_attention: flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_ratio: 0.1 warmup_steps: 40
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -40,7 +40,7 @@ xformers_attention: true
flash_attention: flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -41,7 +41,7 @@ xformers_attention: true
flash_attention: flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_ratio: 0.1 warmup_steps: 20
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -42,7 +42,7 @@ logging_steps: 5
flash_attention: flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_ratio: 0.1 warmup_steps: 20
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0001 weight_decay: 0.0001

View File

@@ -42,7 +42,7 @@ logging_steps: 1
flash_attention: true flash_attention: true
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_ratio: 0.1 warmup_steps: 20
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -50,7 +50,7 @@ logging_steps: 1
flash_attention: true flash_attention: true
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_ratio: 0.1 warmup_steps: 20
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -43,7 +43,7 @@ logging_steps: 1
flash_attention: true flash_attention: true
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_ratio: 0.1 warmup_steps: 20
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -49,7 +49,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: flash_attention:
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -49,7 +49,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: flash_attention:
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -45,7 +45,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -43,7 +43,7 @@ logging_steps: 5
flash_attention: flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_ratio: 0.1 warmup_steps: 20
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0001 weight_decay: 0.0001

View File

@@ -41,7 +41,7 @@ logging_steps: 1
flash_attention: flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_ratio: 0.1 warmup_steps: 20
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0 weight_decay: 0

View File

@@ -47,9 +47,10 @@ logging_steps: 1
flash_attention: true flash_attention: true
flash_attn_cross_entropy: false flash_attn_cross_entropy: false
flash_attn_rms_norm: true flash_attn_rms_norm: true
flash_attn_fuse_qkv: false
flash_attn_fuse_mlp: true flash_attn_fuse_mlp: true
warmup_ratio: 0.1 warmup_steps: 100
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1

View File

@@ -51,7 +51,7 @@ flash_attention: true
flash_attn_cross_entropy: false flash_attn_cross_entropy: false
flash_attn_rms_norm: true flash_attn_rms_norm: true
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 20
evals_per_epoch: 4 evals_per_epoch: 4
eval_steps: eval_steps:
saves_per_epoch: 4 saves_per_epoch: 4

View File

@@ -49,7 +49,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: false flash_attention: false
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 0 evals_per_epoch: 0
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -47,7 +47,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -38,7 +38,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: evals_per_epoch:
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -49,7 +49,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -75,7 +75,7 @@ xformers_attention: true
flash_attention: flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -20,7 +20,7 @@ special_tokens:
datasets: datasets:
- path: mhenrichsen/alpaca_2k_test - path: mhenrichsen/alpaca_2k_test
type: alpaca type: alpaca
warmup_ratio: 0.1 warmup_steps: 10
# Iterations # Iterations
num_epochs: 1 num_epochs: 1

View File

@@ -40,7 +40,7 @@
"%%capture\n", "%%capture\n",
"# This step can take ~5-10 minutes to install dependencies\n", "# This step can take ~5-10 minutes to install dependencies\n",
"!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n", "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
"!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@0ee9ee8\"" "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@631d646\""
] ]
}, },
{ {

View File

@@ -51,7 +51,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 1 evals_per_epoch: 1
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -51,7 +51,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 1 evals_per_epoch: 1
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -37,7 +37,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 100
evals_per_epoch: 2 evals_per_epoch: 2
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -61,7 +61,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 100
evals_per_epoch: 2 evals_per_epoch: 2
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -10,14 +10,17 @@ Thanks to the team at MistralAI for giving us early access to prepare for this r
## Getting started ## Getting started
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from main as Devstral is only on nightly or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).
Here is an example of how to install from pip: Here is an example of how to install from main for pip:
```bash ```bash
# Ensure you have Pytorch installed (Pytorch 2.6.0 min) # Ensure you have Pytorch installed (Pytorch 2.6.0+)
git clone https://github.com/axolotl-ai-cloud/axolotl.git
cd axolotl
pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0' pip3 install --no-build-isolation -e '.[flash-attn]'
``` ```
2. Run the finetuning example: 2. Run the finetuning example:

View File

@@ -1,52 +0,0 @@
# ND Parallelism Examples
This directory contains example configurations for training models using ND Parallelism in Axolotl. These examples demonstrate how to compose different parallelism strategies (FSDP, TP, CP, HSDP) for efficient multi-GPU training.
## Quick Start
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
2. Run the command below:
```bash
# Train Qwen3 8B with FSDP + TP + CP on a single 8-GPU node
axolotl train examples/distributed-parallel/qwen3-8b-fsdp-tp-cp.yaml
# Train Llama 3.1 8B with HSDP + TP on 2 nodes (16 GPUs total)
axolotl train examples/distributed-parallel/llama-3_1-8b-hsdp-tp.yaml
```
## Example Configurations
### Single Node (8 GPUs)
**Qwen3 8B with FSDP + TP + CP** ([qwen3-8b-fsdp-tp-cp.yaml](./qwen3-8b-fsdp-tp-cp.yaml))
- Uses all 3 parallelism dimensions on a single node
- Ideal for: when model weights, activations, and/or context are too large to fit on single GPU
```yaml
dp_shard_size: 2 # FSDP across 2 GPUs
tensor_parallel_size: 2 # TP across 2 GPUs
context_parallel_size: 2 # CP across 2 GPUs
# Total: 2 × 2 × 2 = 8 GPUs
```
### Multi-Node
**Llama 3.1 8B with HSDP + TP** ([llama-3_1-8b-hsdp-tp.yaml](./llama-3_1-8b-hsdp-tp.yaml))
- FSDP & TP within nodes, DDP across nodes to minimize inter-node communication
- Ideal for: Scaling to multiple nodes while maintaining training efficiency
```yaml
dp_shard_size: 4 # FSDP within each 4-GPU group
tensor_parallel_size: 2 # TP within each node
dp_replicate_size: 2 # DDP across 2 groups
# Total: (4 × 2) × 2 = 16 GPUs (2 nodes)
```
## Learn More
- [ND Parallelism Documentation](https://docs.axolotl.ai/docs/nd_parallelism.html)
- [Blog: Accelerate ND-Parallel Guide](https://huggingface.co/blog/accelerate-nd-parallel)
- [Multi-GPU Training Guide](https://docs.axolotl.ai/docs/multi-gpu.html)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)

View File

@@ -1,47 +0,0 @@
base_model: meta-llama/Llama-3.1-8B
plugins:
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
dp_shard_size: 4
dp_replicate_size: 2
tensor_parallel_size: 2
# context_parallel_size: 2
dataset_prepared_path: last_run_prepared
special_tokens:
pad_token: <|end_of_text|>
fsdp_version: 2
fsdp_config:
offload_params: false
state_dict_type: FULL_STATE_DICT
auto_wrap_policy: TRANSFORMER_BASED_WRAP
transformer_layer_cls_to_wrap: LlamaDecoderLayer
reshard_after_forward: true
datasets:
- path: tatsu-lab/alpaca
type: alpaca
output_dir: ./outputs/ndp-out/
sequence_len: 2048
sample_packing: true
flash_attention: true
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 2
optimizer: adamw_torch_fused
lr_scheduler: constant_with_warmup
learning_rate: 2e-6
bf16: true
tf32: true
logging_steps: 1
saves_per_epoch: 1
warmup_ratio: 0.1

View File

@@ -1,46 +0,0 @@
base_model: Qwen/Qwen3-8B
plugins:
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
dp_shard_size: 2
# dp_replicate_size: 1
context_parallel_size: 2
tensor_parallel_size: 2
dataset_prepared_path: last_run_prepared
fsdp_version: 2
fsdp_config:
offload_params: false
state_dict_type: FULL_STATE_DICT
auto_wrap_policy: TRANSFORMER_BASED_WRAP
transformer_layer_cls_to_wrap: Qwen3DecoderLayer
reshard_after_forward: true
datasets:
- path: tatsu-lab/alpaca
type: alpaca
output_dir: ./outputs/ndp-out/
sequence_len: 8192
sample_packing: true
flash_attention: true
gradient_accumulation_steps: 1
micro_batch_size: 1 # must be 1 when using context parallel
num_epochs: 2
optimizer: adamw_torch_fused
lr_scheduler: constant_with_warmup
learning_rate: 2e-6
bf16: true
tf32: true
logging_steps: 1
saves_per_epoch: 1
warmup_ratio: 0.1
special_tokens:

View File

@@ -1,62 +1,19 @@
# Finetune Gemma-3n with Axolotl # Gemma-3n
Gemma-3n is a family of multimodal models from Google found on [HuggingFace](https://huggingface.co/collections/google/gemma-3n-685065323f5984ef315c93f4). This guide shows how to fine-tune it with Axolotl. ## Requirements
## Getting started In addition to Axolotl's requirements, Gemma-3n requires
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). ```
pip3 install timm
Here is an example of how to install from pip:
```bash
# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
``` ```
2. In addition to Axolotl's requirements, Gemma-3n requires: If you will load audio datasets, please also install
```bash ```
pip3 install timm==1.0.17 pip3 install librosa
# for loading audio data
pip3 install librosa==0.11.0
``` ```
3. Run the finetuning example: ## Usage
```bash See example configs and the [multimodal doc](https://docs.axolotl.ai/docs/multimodal.html).
# text only
axolotl train examples/gemma3n/gemma-3n-e2b-qlora.yml
# text + vision
axolotl train examples/gemma3n/gemma-3n-e2b-vision-qlora.yml
# text + vision + audio
axolotl train examples/gemma3n/gemma-3n-e2b-vision-audio-qlora.yml
```
Let us know how it goes. Happy finetuning! 🚀
WARNING: The loss and grad norm will be much higher than normal. We suspect this to be inherent to the model as of the moment. If anyone would like to submit a fix for this, we are happy to take a look.
### TIPS
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
- The multimodal dataset format follows the OpenAI multi-content Messages format as seen [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format).
## Optimization Guides
- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
## Related Resources
- [Gemma 3n Blog](https://ai.google.dev/gemma/docs/gemma-3n)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)

View File

@@ -34,6 +34,8 @@ eot_tokens:
datasets: datasets:
- path: Nanobit/text-vision-audio-2k-test - path: Nanobit/text-vision-audio-2k-test
type: chat_template type: chat_template
data_files:
- dataset.jsonl
dataset_prepared_path: dataset_prepared_path:
val_set_size: 0.01 val_set_size: 0.01
output_dir: ./outputs/out output_dir: ./outputs/out

View File

@@ -55,7 +55,7 @@ flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3 loss_watchdog_patience: 3
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 1 evals_per_epoch: 1
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -1,74 +0,0 @@
# Finetune OpenAI's GPT-OSS with Axolotl
[GPT-OSS](https://huggingface.co/collections/openai/gpt-oss-68911959590a1634ba11c7a4) are a family of open-weight MoE models trained by OpenAI, released in August 2025. There are two variants: 20B and 120B.
This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.
## Getting started
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
Here is an example of how to install from pip:
```bash
# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
```
2. Choose one of the following configs below for training the 20B model. (for 120B, see [below](#training-120b))
```bash
# LoRA SFT linear layers (1x48GB @ ~44GiB)
axolotl train examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml
# FFT SFT with offloading (2x24GB @ ~21GiB/GPU)
axolotl train examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml
# FFT SFT (8x48GB @ ~36GiB/GPU or 4x80GB @ ~46GiB/GPU)
axolotl train examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml
```
Note: Memory usage taken from `device_mem_reserved(gib)` from logs.
### Training 120B
On 8xH100s
```bash
# FFT SFT with offloading (8x80GB @ ~49GiB/GPU)
axolotl train examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml
```
### Tool use
GPT-OSS has a comprehensive tool understanding. Axolotl supports tool calling datasets for Supervised Fine-tuning.
Here is an example dataset config:
```yaml
datasets:
- path: Nanobit/text-tools-2k-test
type: chat_template
```
See [Nanobit/text-tools-2k-test](https://huggingface.co/datasets/Nanobit/text-tools-2k-test) for the sample dataset.
Refer to [our docs](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#using-tool-use) for more info.
### TIPS
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
## Optimization Guides
- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
## Related Resources
- [GPT-OSS Blog](https://openai.com/index/introducing-gpt-oss/)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)

View File

@@ -1,67 +0,0 @@
# the original mxfp4 quantized model is not supported with FSDP cpu_ram_efficient_loading
# FSDP cpu_ram_efficient_loading is used to reduce the initial CPU memory usage when loading the model
base_model: axolotl-ai-co/gpt-oss-120b-dequantized
use_kernels: false
dp_shard_size: 16 # requires 2x8xH100 nodes
plugins:
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
experimental_skip_move_to_device: true # prevent OOM by NOT putting model to GPU before sharding
datasets:
- path: HuggingFaceH4/Multilingual-Thinking
type: chat_template
field_thinking: thinking
template_thinking_key: thinking
dataset_prepared_path: last_run_prepared
val_set_size: 0
output_dir: ./outputs/gpt-oss-out/
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 2
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_fused # 8bit optimizers do not work with FSDP2 offload
lr_scheduler: constant_with_warmup
learning_rate: 2e-5
bf16: true
tf32: true
flash_attention: true
attn_implementation: kernels-community/vllm-flash-attn3
gradient_checkpointing: true
activation_offloading: true
logging_steps: 1
saves_per_epoch: 1
warmup_ratio: 0.03
special_tokens:
eot_tokens:
- "<|end|>"
fsdp_version: 2
fsdp_config:
offload_params: true
state_dict_type: SHARDED_STATE_DICT
auto_wrap_policy: TRANSFORMER_BASED_WRAP
transformer_layer_cls_to_wrap: GptOssDecoderLayer
reshard_after_forward: true
cpu_ram_efficient_loading: true

View File

@@ -1,58 +0,0 @@
base_model: openai/gpt-oss-20b
use_kernels: false
model_quantization_config: Mxfp4Config
model_quantization_config_kwargs:
dequantize: true
plugins:
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
experimental_skip_move_to_device: true # prevent OOM by NOT putting model to GPU before sharding
datasets:
- path: HuggingFaceH4/Multilingual-Thinking
type: chat_template
field_thinking: thinking
template_thinking_key: thinking
dataset_prepared_path: last_run_prepared
val_set_size: 0
output_dir: ./outputs/gpt-oss-out/
sequence_len: 4096
sample_packing: true
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 2
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_8bit
lr_scheduler: constant_with_warmup
learning_rate: 2e-5
bf16: true
tf32: true
flash_attention: true
attn_implementation: kernels-community/vllm-flash-attn3
gradient_checkpointing: true
activation_offloading: true
logging_steps: 1
saves_per_epoch: 1
warmup_ratio: 0.03
special_tokens:
eot_tokens:
- "<|end|>"
# choose the zero3 configuration that best fits your system capabilities
deepspeed: deepspeed_configs/zero3_bf16.json

View File

@@ -1,68 +0,0 @@
base_model: openai/gpt-oss-20b
use_kernels: true
model_quantization_config: Mxfp4Config
model_quantization_config_kwargs:
dequantize: true
plugins:
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
experimental_skip_move_to_device: true # prevent OOM by NOT putting model to GPU before sharding
datasets:
- path: HuggingFaceH4/Multilingual-Thinking
type: chat_template
field_thinking: thinking
template_thinking_key: thinking
dataset_prepared_path: last_run_prepared
val_set_size: 0
output_dir: ./outputs/gpt-oss-out/
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 2
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_fused # 8bit optimizers do not work with FSDP2 offload
lr_scheduler: constant_with_warmup
learning_rate: 2e-5
bf16: true
tf32: true
flash_attention: true
attn_implementation: kernels-community/vllm-flash-attn3
gradient_checkpointing: true
activation_offloading: true
logging_steps: 1
saves_per_epoch: 1
warmup_ratio: 0.03
special_tokens:
eot_tokens:
- "<|end|>"
fsdp_version: 2
fsdp_config:
offload_params: true
state_dict_type: SHARDED_STATE_DICT
auto_wrap_policy: TRANSFORMER_BASED_WRAP
transformer_layer_cls_to_wrap: GptOssDecoderLayer
reshard_after_forward: true
# cpu_ram_efficient_loading: true
# cpu_ram_efficient_loading cannot be used with MXFP4 model quantization.
# It can only be used with a dequantized model like `axolotl-ai-co/gpt-oss-120b-dequantized`

View File

@@ -1,64 +0,0 @@
base_model: openai/gpt-oss-20b
use_kernels: false
model_quantization_config: Mxfp4Config
model_quantization_config_kwargs:
dequantize: true
plugins:
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
experimental_skip_move_to_device: true # prevent OOM by NOT putting model to GPU before sharding
datasets:
- path: HuggingFaceH4/Multilingual-Thinking
type: chat_template
field_thinking: thinking
template_thinking_key: thinking
dataset_prepared_path: last_run_prepared
val_set_size: 0
output_dir: ./outputs/gpt-oss-out/
sequence_len: 4096
sample_packing: true
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 2
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_8bit
lr_scheduler: constant_with_warmup
learning_rate: 2e-5
bf16: true
tf32: true
flash_attention: true
attn_implementation: kernels-community/vllm-flash-attn3
gradient_checkpointing: true
activation_offloading: true
logging_steps: 1
saves_per_epoch: 1
warmup_ratio: 0.03
special_tokens:
eot_tokens:
- "<|end|>"
fsdp_version: 2
fsdp_config:
offload_params: false
state_dict_type: SHARDED_STATE_DICT
auto_wrap_policy: TRANSFORMER_BASED_WRAP
transformer_layer_cls_to_wrap: GptOssDecoderLayer
reshard_after_forward: true
# cpu_ram_efficient_loading: true

View File

@@ -1,67 +0,0 @@
base_model: openai/gpt-oss-20b
use_kernels: true
model_quantization_config: Mxfp4Config
model_quantization_config_kwargs:
dequantize: true
plugins:
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
experimental_skip_move_to_device: true # prevent OOM by not putting model to GPU before sharding
datasets:
- path: HuggingFaceH4/Multilingual-Thinking
type: chat_template
field_thinking: thinking
template_thinking_key: thinking
dataset_prepared_path: last_run_prepared
val_set_size: 0
output_dir: ./outputs/gpt-oss-out/
sequence_len: 4096
sample_packing: true
adapter: lora
lora_r: 8
lora_alpha: 16
lora_dropout: 0.0 # dropout not supported when using LoRA over expert parameters
lora_target_linear: true
# TODO: not supported for now, see peft#2710
#lora_target_parameters: # target the experts in the last two layers
# - "22._checkpoint_wrapped_module.mlp.experts.gate_up_proj"
# - "22._checkpoint_wrapped_module.mlp.experts.down_proj"
# - "23._checkpoint_wrapped_module.mlp.experts.gate_up_proj"
# - "23._checkpoint_wrapped_module.mlp.experts.down_proj"
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 8
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_8bit
lr_scheduler: constant_with_warmup
learning_rate: 2e-4
bf16: true
tf32: true
flash_attention: true
attn_implementation: kernels-community/vllm-flash-attn3
gradient_checkpointing: true
activation_offloading: true
logging_steps: 1
saves_per_epoch: 1
warmup_ratio: 0.1
special_tokens:
eot_tokens:
- "<|end|>"

View File

@@ -49,7 +49,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: evals_per_epoch:
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: evals_per_epoch:
saves_per_epoch: 1 saves_per_epoch: 1

View File

@@ -47,7 +47,7 @@ gradient_checkpointing_kwargs:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 1 evals_per_epoch: 1
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -45,9 +45,10 @@ logging_steps: 1
flash_attention: true flash_attention: true
flash_attn_cross_entropy: false flash_attn_cross_entropy: false
flash_attn_rms_norm: true flash_attn_rms_norm: true
flash_attn_fuse_qkv: false
flash_attn_fuse_mlp: true flash_attn_fuse_mlp: true
warmup_ratio: 0.1 warmup_steps: 100
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1

View File

@@ -56,7 +56,7 @@ logging_steps: 1
flash_attention: flash_attention:
sdp_attention: sdp_attention:
flash_optimum: flash_optimum:
warmup_ratio: 0.1 warmup_steps: 100
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -49,9 +49,10 @@ logging_steps: 1
flash_attention: true flash_attention: true
flash_attn_cross_entropy: false flash_attn_cross_entropy: false
flash_attn_rms_norm: true flash_attn_rms_norm: true
flash_attn_fuse_qkv: false
flash_attn_fuse_mlp: true flash_attn_fuse_mlp: true
warmup_ratio: 0.1 warmup_steps: 100
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -47,7 +47,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -47,7 +47,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -50,7 +50,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -25,12 +25,9 @@ lora_alpha: 16
lora_dropout: 0.05 lora_dropout: 0.05
lora_target_linear: true lora_target_linear: true
relora: true relora_steps: 150
relora_prune_ratio: 0.9 relora_warmup_steps: 10
relora_cpu_offload: false relora_cpu_offload: false
jagged_restart_steps: 150
jagged_restart_warmup_steps: 10
jagged_restart_anneal_steps: false
wandb_project: wandb_project:
wandb_entity: wandb_entity:
@@ -53,7 +50,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -58,7 +58,7 @@ logging_steps: 1
evals_per_epoch: 1 evals_per_epoch: 1
saves_per_epoch: 1 saves_per_epoch: 1
warmup_ratio: 0.1 warmup_steps: 10
weight_decay: 0.0 weight_decay: 0.0
fsdp: fsdp:
- full_shard - full_shard

View File

@@ -51,7 +51,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 100
evals_per_epoch: 2 evals_per_epoch: 2
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -36,7 +36,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 100
evals_per_epoch: 2 evals_per_epoch: 2
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -67,7 +67,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_ratio: 0.1 warmup_steps: 10
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

Some files were not shown because too many files have changed in this diff Show More