Compare commits


9 Commits

Author            SHA1        Message                                                      Date
Salman Mohammadi  bc2bc688d8  update fsdp2 patch                                           2025-07-23 16:53:03 +01:00
Wing Lian         b3c04dd9fe  workaround for fsdp2 optimizer save failures                 2025-07-23 09:38:57 -04:00
Wing Lian         972c719d38  use latest transformers on main with fix                     2025-07-23 09:22:36 -04:00
Wing Lian         2c1cb8b300  fix for accelerator state getting reset and missing schema  2025-07-23 08:43:34 -04:00
Wing Lian         cca207eec4  handle none checks                                           2025-07-22 21:21:45 -04:00
Wing Lian         9a2da4d9f0  update tp validation                                         2025-07-22 21:20:57 -04:00
Wing Lian         8fe4758e94  make sure to return data for validation                      2025-07-22 21:18:39 -04:00
Wing Lian         8c641fdcb4  handle tp load                                               2025-07-22 21:17:27 -04:00
Wing Lian         5c74bebfd0  use new upstream branches for nd-parallelism                 2025-07-22 21:12:22 -04:00
184 changed files with 1175 additions and 1987 deletions

View File

@@ -17,7 +17,7 @@ on:
jobs:
build-base:
if: ${{ github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }}
if: github.repository_owner == 'axolotl-ai-cloud'
timeout-minutes: 480
# this job needs to be run on self-hosted GPU runners...
runs-on: ubuntu-latest-m
@@ -108,7 +108,7 @@ jobs:
PYTORCH_VERSION=${{ matrix.pytorch }}
TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }}
build-base-uv:
if: ${{ github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }}
if: github.repository_owner == 'axolotl-ai-cloud'
timeout-minutes: 480
runs-on: ubuntu-latest-m
strategy:

View File

@@ -3,7 +3,6 @@ on:
# check on PRs, and manual triggers
merge_group:
pull_request:
types: [opened, synchronize, reopened, ready_for_review]
paths:
- '**.py'
- 'requirements.txt'
@@ -17,7 +16,6 @@ jobs:
pre-commit:
name: pre-commit
runs-on: ubuntu-latest
if: ${{ !github.event.pull_request.draft }}
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5

View File

@@ -21,7 +21,7 @@ concurrency:
jobs:
test-axolotl-multigpu:
if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }}
if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
strategy:
fail-fast: false
matrix:
@@ -37,14 +37,14 @@ jobs:
cuda_version: 12.6.3
python_version: "3.11"
pytorch: 2.7.0
axolotl_extras:
axolotl_extras: vllm
num_gpus: 2
nightly_build: "true"
- cuda: 126
cuda_version: 12.6.3
python_version: "3.11"
pytorch: 2.7.1
axolotl_extras: vllm
axolotl_extras:
num_gpus: 2
nightly_build: "true"
runs-on: [self-hosted, modal]

View File

@@ -2,7 +2,7 @@ name: Preview
on:
workflow_dispatch:
pull_request:
types: [opened, synchronize, reopened, ready_for_review]
types: [opened, synchronize, reopened]
# Run the workflow only when one of these files changes
paths:
@@ -25,7 +25,6 @@ permissions:
jobs:
preview:
runs-on: ubuntu-latest
if: ${{ !github.event.pull_request.draft }}
steps:
- name: Check out repository
uses: actions/checkout@v4
@@ -53,7 +52,6 @@ jobs:
- name: Netlify Publish
uses: nwtgck/actions-netlify@v3.0
if: ${{ secrets.NETLIFY_AUTH_TOKEN != '' }}
id: netlify
with:
publish-dir: './_site'
@@ -68,7 +66,7 @@ jobs:
NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }}
- name: Update PR with preview link
if: ${{ steps.netlify.outcome == 'success' && secrets.NETLIFY_AUTH_TOKEN != '' }}
if: ${{ steps.netlify.outcome == 'success' }}
uses: marocchino/sticky-pull-request-comment@v2
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -13,7 +13,6 @@ on:
- 'cicd/cicd.sh'
- 'cicd/Dockerfile.jinja'
pull_request:
types: [opened, synchronize, reopened, ready_for_review]
paths:
- '**.py'
- 'requirements.txt'
@@ -35,7 +34,6 @@ jobs:
pre-commit:
name: pre-commit
runs-on: ubuntu-latest
if: ${{ !github.event.pull_request.draft }}
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
@@ -49,7 +47,6 @@ jobs:
pytest:
name: PyTest
runs-on: ubuntu-latest
if: ${{ !github.event.pull_request.draft }}
# needs: [preload-cache]
strategy:
fail-fast: false
@@ -124,7 +121,6 @@ jobs:
pytest-sdist:
name: PyTest from Source Dist
runs-on: ubuntu-latest
if: ${{ !github.event.pull_request.draft }}
strategy:
fail-fast: false
matrix:
@@ -189,7 +185,7 @@ jobs:
docker-e2e-tests-1st:
# Run this job first as a gate for running the remainder of the test matrix
if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' && !github.event.pull_request.draft }}
if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
# this job needs to be run on self-hosted GPU runners...
runs-on: [self-hosted, modal]
timeout-minutes: 120
@@ -239,7 +235,7 @@ jobs:
modal run cicd.e2e_tests
docker-e2e-tests:
if: ${{ github.repository_owner == 'axolotl-ai-cloud' && !github.event.pull_request.draft }}
if: github.repository_owner == 'axolotl-ai-cloud'
# this job needs to be run on self-hosted GPU runners...
runs-on: [self-hosted, modal]
timeout-minutes: 120
@@ -293,7 +289,6 @@ jobs:
runs-on: [self-hosted, modal]
timeout-minutes: 90
needs: [docker-e2e-tests]
if: ${{ !github.event.pull_request.draft }}
strategy:
fail-fast: false

View File

@@ -119,15 +119,14 @@ datasets:
## Dataset Processing
| Option | Default | Description |
| --------------------------------- | -------------------------- | ----------------------------------- |
| `dataset_prepared_path` | `"data/last_run_prepared"` | Path for prepared dataset |
| `push_dataset_to_hub` | `""` | Push dataset to HF hub |
| `dataset_processes` | `4` | Number of preprocessing processes |
| `dataset_keep_in_memory` | `false` | Keep dataset in memory |
| `shuffle_merged_datasets` | `true` | Shuffle merged datasets |
| `shuffle_before_merging_datasets` | `false` | Shuffle each dataset before merging |
| `dataset_exact_deduplication` | `true` | Deduplicate datasets |
| Option | Default | Description |
| ----------------------------- | -------------------------- | --------------------------------- |
| `dataset_prepared_path` | `"data/last_run_prepared"` | Path for prepared dataset |
| `push_dataset_to_hub` | `""` | Push dataset to HF hub |
| `dataset_processes` | `4` | Number of preprocessing processes |
| `dataset_keep_in_memory` | `false` | Keep dataset in memory |
| `shuffle_merged_datasets` | `true` | Shuffle merged datasets |
| `dataset_exact_deduplication` | `true` | Deduplicate datasets |
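As a quick sketch, these options look like the following in a config file (the values here simply restate the documented defaults from the table above):
```yaml
dataset_prepared_path: data/last_run_prepared
push_dataset_to_hub:            # e.g. your-org/your-dataset; leave empty to disable
dataset_processes: 4
dataset_keep_in_memory: false
shuffle_merged_datasets: true
dataset_exact_deduplication: true
```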
## LoRA Configuration

View File

@@ -25,8 +25,6 @@
## 🎉 Latest Updates
- 2025/07: Voxtral with mistral-common tokenizer support has been integrated in Axolotl. Read the [docs](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/voxtral)!
- 2025/07: TiledMLP support for single-GPU to multi-GPU training with DDP, DeepSpeed, and FSDP has been added to enable Arctic Long Sequence Training (ALST). See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/alst) for using ALST with Axolotl!
- 2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral) to start training your own Magistral models with Axolotl!
- 2025/05: Quantization Aware Training (QAT) support has been added to Axolotl. Explore the [docs](https://docs.axolotl.ai/docs/qat.html) to learn more!
- 2025/04: Llama 4 support has been added in Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/llama-4) to start training your own Llama 4 models with Axolotl's linearized version!
@@ -81,20 +79,6 @@ docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest
Other installation approaches are described [here](https://docs.axolotl.ai/docs/installation.html).
#### Cloud Providers
<details>
- [RunPod](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)
- [Vast.ai](https://cloud.vast.ai?ref_id=62897&template_id=bdd4a49fa8bce926defc99471864cace&utm_source=github&utm_medium=developer_community&utm_campaign=template_launch_axolotl&utm_content=readme)
- [PRIME Intellect](https://app.primeintellect.ai/dashboard/create-cluster?image=axolotl&location=Cheapest&security=Cheapest&show_spot=true)
- [Modal](https://www.modal.com?utm_source=github&utm_medium=github&utm_campaign=axolotl)
- [Novita](https://novita.ai/gpus-console?templateId=311)
- [JarvisLabs.ai](https://jarvislabs.ai/templates/axolotl)
- [Latitude.sh](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c)
</details>
### Your First Fine-tune
```bash
@@ -136,6 +120,12 @@ Contributions are welcome! Please see our [Contributing Guide](https://github.co
## ❤️ Sponsors
Thank you to our sponsors who help make Axolotl possible:
- [Modal](https://www.modal.com?utm_source=github&utm_medium=github&utm_campaign=axolotl) - Modal lets you run jobs in the cloud by just writing a few lines of Python. Customers use Modal to deploy Gen AI models at large scale, fine-tune large language models, run protein folding simulations, and much more.
Interested in sponsoring? Contact us at [wing@axolotl.ai](mailto:wing@axolotl.ai)
## 📜 License

View File

@@ -19,7 +19,5 @@ pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/patched/ \
--cov-append \
--cov-report=xml:multigpu-coverage.xml
# Upload coverage to Codecov if CODECOV_TOKEN is available
if [ -n "$CODECOV_TOKEN" ]; then
codecov upload-process -t "${CODECOV_TOKEN}" -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION} || true
fi
# Upload coverage to Codecov
codecov upload-process -t "${CODECOV_TOKEN}" -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION} || true

View File

@@ -9,15 +9,13 @@ ENV HF_HUB_ENABLE_HF_TRANSFER="1"
EXPOSE 8888
EXPOSE 22
COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh
COPY scripts/cloud-entrypoint-term.sh /root/cloud-entrypoint.sh
COPY scripts/motd /etc/motd
RUN pip install jupyterlab notebook ipywidgets && \
jupyter lab clean
RUN apt update && \
apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm && \
rm -rf /var/cache/apt/archives && \
rm -rf /var/lib/apt/lists/* && \
RUN apt install --yes --no-install-recommends openssh-server tmux sudo && \
pip3 install -U --no-cache-dir grpcio ray[default]==2.9.3 && \
mkdir -p ~/.ssh && \
chmod 700 ~/.ssh && \
printf "[ ! -z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \

View File

@@ -136,7 +136,3 @@ description: Frequently asked questions
> dynamic: false
> mode: max-autotune-no-cudagraphs
> ```
**Q: `ValueError("Backward pass should have cleared tracker of all tensors")`**
> A: This may happen due to edge cases in using the modern OffloadActivations context manager for CUDA streams. If you encounter this error, you may have success using the naive implementation with `activation_offloading: legacy` in your YAML.
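> A minimal sketch of that change in your config:
> ```yaml
> activation_offloading: legacy   # fall back to the naive offloading implementation
> ```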

View File

@@ -124,13 +124,10 @@ For providers supporting Docker:
- Use `axolotlai/axolotl-cloud:main-latest`
- Available on:
- [RunPod](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)
- [Vast.ai](https://cloud.vast.ai?ref_id=62897&template_id=bdd4a49fa8bce926defc99471864cace&utm_source=axolotl&utm_medium=partner&utm_campaign=template_launch_july2025&utm_content=docs_link)
- [PRIME Intellect](https://app.primeintellect.ai/dashboard/create-cluster?image=axolotl&location=Cheapest&security=Cheapest&show_spot=true)
- [Modal](https://www.modal.com?utm_source=github&utm_medium=github&utm_campaign=axolotl)
- [Novita](https://novita.ai/gpus-console?templateId=311)
- [JarvisLabs.ai](https://jarvislabs.ai/templates/axolotl)
- [Latitude.sh](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c)
- [Latitude.sh](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c)
- [JarvisLabs.ai](https://jarvislabs.ai/templates/axolotl)
- [RunPod](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)
- [Novita](https://novita.ai/gpus-console?templateId=311)
### Google Colab {#sec-colab}

View File

@@ -22,7 +22,7 @@ To enable sequence parallelism, add the following to your configuration file:
```yaml
# Set to a divisor (> 1) of the number of GPUs available
sequence_parallel_degree: 4 # Split sequences across 4 GPUs
context_parallel_size: 4 # Split sequences across 4 GPUs
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
heads_k_stride: 1
# Optional; one of "varlen_llama3" or "batch_ring". Defaults to
@@ -30,7 +30,7 @@ heads_k_stride: 1
ring_attn_func:
```
The `sequence_parallel_degree` should be a divisor of the total number of GPUs. For example:
The `context_parallel_size` should be a divisor of the total number of GPUs. For example:
- With 8 GPUs, valid values would be 2, 4, or 8
- With 4 GPUs, valid values would be 2 or 4
@@ -66,7 +66,7 @@ sequence_len: 8192
...
sequence_parallel_degree: 4 # Split each sequence into 4 parts, one per GPU
context_parallel_size: 4 # Split each sequence into 4 parts, one per GPU
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
heads_k_stride: 1
# Optional; one of "varlen_llama3" or "batch_ring". Defaults to
@@ -89,12 +89,12 @@ Sequence parallelism is compatible with Axolotl's sample packing functionality.
## Effect on Batch Size
When using sequence parallelism, your effective global batch size is **divided** by the `sequence_parallel_degree`. This happens because:
When using sequence parallelism, your effective global batch size is **divided** by the `context_parallel_size`. This happens because:
- Each group of `sequence_parallel_degree` GPUs works on the same batch (just different parts of each sequence)
- Each group of `context_parallel_size` GPUs works on the same batch (just different parts of each sequence)
- The number of batches processed per step decreases
For example:
- With 8 GPUs and no sequence parallelism: 8 different batches processed per step
- With 8 GPUs and `sequence_parallel_degree=4`: Only 2 different batches processed per step (each split across 4 GPUs)
- With 8 GPUs and `context_parallel_size=4`: Only 2 different batches processed per step (each split across 4 GPUs)
- If your per-GPU `micro_batch_size` is 2, the global batch size decreases from 16 to 4
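To make the arithmetic concrete, here is a sketch using the numbers from the example above (an 8-GPU run is assumed for illustration):
```yaml
# 8 GPUs total (assumed for illustration)
micro_batch_size: 2        # per-GPU micro batch
context_parallel_size: 4   # 8 GPUs / 4 = 2 sequence-parallel groups
# effective global batch size = micro_batch_size * (num_gpus / context_parallel_size)
#                             = 2 * (8 / 4) = 4   (vs. 2 * 8 = 16 without sequence parallelism)
```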

View File

@@ -1,9 +0,0 @@
# Arctic Long Sequence Training (ALST)
Arctic Long Sequence Training (ALST) is a technique for training long-context models using a variety of optimization
techniques. It is a combination of:
- TiledMLP: tiles MLP computation over the sequence dimension to reduce memory usage
- Tiled Loss: uses optimized loss functions like Liger-Kernel or Cut Cross Entropy to reduce memory usage
- Activation Offloading: offloads activations to CPU RAM to reduce memory usage
For more information, you can check out the ALST paper [here](https://www.arxiv.org/abs/2506.13996).
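A minimal sketch of the ALST-related settings, mirroring the example configs below (supporting options such as the optimizer and parallelism degree are omitted):
```yaml
tiled_mlp: true                  # TiledMLP: tile MLP computation over the sequence dimension
activation_offloading: legacy    # offload activations to CPU RAM
plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin  # tiled loss
```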

View File

@@ -1,53 +0,0 @@
base_model: meta-llama/Llama-3.1-8B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
datasets:
- path: togethercomputer/Long-Data-Collections
type: completion
field: text
data_files:
- pretrain/rp_sub.jsonl.zst
- path: princeton-nlp/TextbookChapters
type: completion
field: chapter
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out
sequence_len: 500_000
min_sample_len: 200_000
sample_packing: true
tiled_mlp: true
sequence_parallel_degree: 8
plugins:
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_8bit
lr_scheduler: cosine
learning_rate: 2e-5
bf16: auto
tf32: true
gradient_checkpointing: true
activation_offloading: legacy
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_steps: 100
saves_per_epoch: 1
evals_per_epoch: 2
weight_decay: 0.0
special_tokens:
pad_token: <|end_of_text|>
deepspeed: deepspeed_configs/zero3_bf16_cpuoffload_all.json
# save_first_step: true # uncomment this to validate checkpoint saving works with your config

View File

@@ -1,59 +0,0 @@
base_model: meta-llama/Llama-3.1-8B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
datasets:
- path: togethercomputer/Long-Data-Collections
type: completion
field: text
data_files:
- pretrain/rp_sub.jsonl.zst
- path: princeton-nlp/TextbookChapters
type: completion
field: chapter
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out
sequence_len: 500_000
min_sample_len: 200_000
sample_packing: true
tiled_mlp: true
context_parallel_size: 8
plugins:
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_8bit
lr_scheduler: cosine
learning_rate: 2e-5
bf16: auto
tf32: true
gradient_checkpointing: true
activation_offloading: legacy
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_steps: 100
saves_per_epoch: 1
evals_per_epoch: 2
weight_decay: 0.0
special_tokens:
pad_token: <|end_of_text|>
fsdp_version: 2
fsdp_config:
offload_params: false # offloading is currently not compatible with SP + torchao optimizer
state_dict_type: SHARDED_STATE_DICT
auto_wrap_policy: TRANSFORMER_BASED_WRAP
transformer_layer_cls_to_wrap: LlamaDecoderLayer
reshard_after_forward: true
# save_first_step: true # uncomment this to validate checkpoint saving works with your config

View File

@@ -66,7 +66,7 @@ flash_optimum:
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1
warmup_steps: 32
evals_per_epoch: 4
saves_per_epoch: 1
save_total_limit:

View File

@@ -43,7 +43,7 @@ xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.1

View File

@@ -47,7 +47,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -47,7 +47,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -47,7 +47,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -54,7 +54,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch:
saves_per_epoch: 1

View File

@@ -57,7 +57,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch:
saves_per_epoch: 1

View File

@@ -41,7 +41,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch:
saves_per_epoch: 1

View File

@@ -51,7 +51,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -47,7 +47,7 @@ xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1
warmup_steps: 40
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -77,7 +77,7 @@ xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.000001

View File

@@ -44,7 +44,7 @@ xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1
warmup_steps: 40
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -40,7 +40,7 @@ xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.1

View File

@@ -41,7 +41,7 @@ xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1
warmup_steps: 20
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.1

View File

@@ -42,7 +42,7 @@ logging_steps: 5
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1
warmup_steps: 20
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0001

View File

@@ -42,7 +42,7 @@ logging_steps: 1
flash_attention: true
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1
warmup_steps: 20
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.1

View File

@@ -50,7 +50,7 @@ logging_steps: 1
flash_attention: true
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1
warmup_steps: 20
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.1

View File

@@ -43,7 +43,7 @@ logging_steps: 1
flash_attention: true
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1
warmup_steps: 20
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.1

View File

@@ -49,7 +49,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention:
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -49,7 +49,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention:
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -45,7 +45,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -43,7 +43,7 @@ logging_steps: 5
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1
warmup_steps: 20
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0001

View File

@@ -41,7 +41,7 @@ logging_steps: 1
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1
warmup_steps: 20
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0

View File

@@ -50,7 +50,7 @@ flash_attn_rms_norm: true
flash_attn_fuse_qkv: false
flash_attn_fuse_mlp: true
warmup_ratio: 0.1
warmup_steps: 100
evals_per_epoch: 4
saves_per_epoch: 1

View File

@@ -51,7 +51,7 @@ flash_attention: true
flash_attn_cross_entropy: false
flash_attn_rms_norm: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 20
evals_per_epoch: 4
eval_steps:
saves_per_epoch: 4

View File

@@ -49,7 +49,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: false
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 0
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -47,7 +47,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -38,7 +38,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch:
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -49,7 +49,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -75,7 +75,7 @@ xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -20,7 +20,7 @@ special_tokens:
datasets:
- path: mhenrichsen/alpaca_2k_test
type: alpaca
warmup_ratio: 0.1
warmup_steps: 10
# Iterations
num_epochs: 1

View File

@@ -40,7 +40,7 @@
"%%capture\n",
"# This step can take ~5-10 minutes to install dependencies\n",
"!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
"!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@010c3ac3f1e725098961832830303eeb4142dd88\""
"!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@631d646\""
]
},
{

View File

@@ -51,7 +51,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -51,7 +51,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -37,7 +37,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 100
evals_per_epoch: 2
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -61,7 +61,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 100
evals_per_epoch: 2
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -1,65 +1,19 @@
# Finetune Gemma-3n with Axolotl
# Gemma-3n
Gemma-3n is a family of multimodal models from Google found on [HuggingFace](https://huggingface.co/collections/google/gemma-3n-685065323f5984ef315c93f4). This guide shows how to fine-tune it with Axolotl.
## Requirements
## Getting started
In addition to Axolotl's requirements, Gemma-3n requires
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from `main`, as Gemma-3n support is only available in nightly builds, or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).
Here is an example of how to install from `main` with pip:
```bash
# Ensure you have Pytorch installed (Pytorch 2.6.0 min recommended)
git clone https://github.com/axolotl-ai-cloud/axolotl.git
cd axolotl
pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation -e '.[flash-attn]'
```
pip3 install timm
```
2. In addition to Axolotl's requirements, Gemma-3n requires:
If you plan to load audio datasets, please also install:
```bash
pip3 install timm==1.0.17
# for loading audio data
pip3 install librosa==0.11.0
```
pip3 install librosa
```
3. Run the finetuning example:
## Usage
```bash
# text only
axolotl train examples/gemma3n/gemma-3n-e2b-qlora.yml
# text + vision
axolotl train examples/gemma3n/gemma-3n-e2b-vision-qlora.yml
# text + vision + audio
axolotl train examples/gemma3n/gemma-3n-e2b-vision-audio-qlora.yml
```
Let us know how it goes. Happy finetuning! 🚀
WARNING: The loss and grad norm will be much higher than normal. We suspect this is currently inherent to the model. If anyone would like to submit a fix for this, we are happy to take a look.
### TIPS
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
- The multimodal dataset format follows the OpenAI multi-content Messages format as seen [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format).
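A hypothetical JSONL row in that multi-content format (field names are assumed from the OpenAI-style format linked above; check the multimodal docs for the exact keys Axolotl expects):
```json
{"messages": [
  {"role": "user", "content": [
    {"type": "image", "path": "/data/images/cat.jpg"},
    {"type": "text", "text": "What animal is in this image?"}
  ]},
  {"role": "assistant", "content": [
    {"type": "text", "text": "A cat sitting on a windowsill."}
  ]}
]}
```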
## Optimization Guides
- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
## Related Resources
- [Gemma 3n Blog](https://ai.google.dev/gemma/docs/gemma-3n)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
See example configs and the [multimodal doc](https://docs.axolotl.ai/docs/multimodal.html).

View File

@@ -34,6 +34,8 @@ eot_tokens:
datasets:
- path: Nanobit/text-vision-audio-2k-test
type: chat_template
data_files:
- dataset.jsonl
dataset_prepared_path:
val_set_size: 0.01
output_dir: ./outputs/out

View File

@@ -55,7 +55,7 @@ flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -49,7 +49,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch:
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch:
saves_per_epoch: 1

View File

@@ -47,7 +47,7 @@ gradient_checkpointing_kwargs:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ flash_attn_rms_norm: true
flash_attn_fuse_qkv: false
flash_attn_fuse_mlp: true
warmup_ratio: 0.1
warmup_steps: 100
evals_per_epoch: 4
saves_per_epoch: 1

View File

@@ -56,7 +56,7 @@ logging_steps: 1
flash_attention:
sdp_attention:
flash_optimum:
warmup_ratio: 0.1
warmup_steps: 100
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.1

View File

@@ -52,7 +52,7 @@ flash_attn_rms_norm: true
flash_attn_fuse_qkv: false
flash_attn_fuse_mlp: true
warmup_ratio: 0.1
warmup_steps: 100
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.1

View File

@@ -47,7 +47,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -47,7 +47,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -50,7 +50,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -26,7 +26,7 @@ lora_dropout: 0.05
lora_target_linear: true
relora_steps: 150
relora_warmup_ratio: 0.1
relora_warmup_steps: 10
relora_cpu_offload: false
wandb_project:
@@ -50,7 +50,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -58,7 +58,7 @@ logging_steps: 1
evals_per_epoch: 1
saves_per_epoch: 1
warmup_ratio: 0.1
warmup_steps: 10
weight_decay: 0.0
fsdp:
- full_shard

View File

@@ -51,7 +51,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 100
evals_per_epoch: 2
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -36,7 +36,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 100
evals_per_epoch: 2
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -67,7 +67,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -58,7 +58,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -79,7 +79,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -55,7 +55,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -59,7 +59,7 @@ flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -53,7 +53,7 @@ flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1

View File

@@ -57,7 +57,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -54,7 +54,7 @@ flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -51,7 +51,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -55,7 +55,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 20
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -56,7 +56,7 @@ flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -41,7 +41,7 @@ gradient_checkpointing_kwargs:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -50,7 +50,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -47,7 +47,7 @@ logging_steps: 1
xformers_attention:
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 100
evals_per_epoch: 2
eval_table_size:
saves_per_epoch: 1

View File

@@ -66,7 +66,7 @@ gradient_checkpointing: offload
gradient_checkpointing_kwargs:
use_reentrant: false
warmup_ratio: 0.1
warmup_steps: 20
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
@@ -84,7 +84,7 @@ fsdp_config:
fsdp_state_dict_type: FULL_STATE_DICT
fsdp_sharding_strategy: FULL_SHARD
special_tokens:
pad_token: <|finetune_right_pad|>
pad_token: <|finetune_right_pad_id|>
eos_token: <|eot|>
# save_first_step: true # uncomment this to validate checkpoint saving works with your config

View File

@@ -69,7 +69,7 @@ tf32: true
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 100
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
@@ -88,7 +88,7 @@ fsdp_config:
fsdp_sharding_strategy: FULL_SHARD
fsdp_activation_checkpointing: true
special_tokens:
pad_token: <|finetune_right_pad|>
pad_token: <|finetune_right_pad_id|>
eos_token: <|eot|>
# save_first_step: true # uncomment this to validate checkpoint saving works with your config

View File

@@ -76,12 +76,12 @@ gradient_checkpointing: offload
gradient_checkpointing_kwargs:
use_reentrant: false
warmup_ratio: 0.1
warmup_steps: 20
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
pad_token: <|finetune_right_pad|>
pad_token: <|finetune_right_pad_id|>
eos_token: <|eot|>
# save_first_step: true # uncomment this to validate checkpoint saving works with your config

View File

@@ -65,7 +65,7 @@ tf32: true
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
warmup_steps: 100
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
@@ -84,7 +84,7 @@ fsdp_config:
fsdp_sharding_strategy: FULL_SHARD
fsdp_activation_checkpointing: true
special_tokens:
pad_token: <|finetune_right_pad|>
pad_token: <|finetune_right_pad_id|>
eos_token: <|eot|>
# save_first_step: true # uncomment this to validate checkpoint saving works with your config

View File

@@ -64,7 +64,7 @@ flex_attn_compile_kwargs:
dynamic: false
mode: max-autotune-no-cudagraphs
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
@@ -82,7 +82,7 @@ fsdp_config:
fsdp_reshard_after_forward: true
fsdp_activation_checkpointing: true
special_tokens:
pad_token: <|finetune_right_pad|>
pad_token: <|finetune_right_pad_id|>
eos_token: <|eot|>
# save_first_step: true # uncomment this to validate checkpoint saving works with your config

View File

@@ -74,13 +74,13 @@ gradient_checkpointing_kwargs:
use_reentrant: false
logging_steps: 1
warmup_ratio: 0.1
warmup_steps: 20
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
pad_token: <|finetune_right_pad|>
pad_token: <|finetune_right_pad_id|>
eos_token: <|eot|>
# save_first_step: true # uncomment this to validate checkpoint saving works with your config

View File

@@ -67,7 +67,7 @@ flex_attn_compile_kwargs:
dynamic: false
mode: max-autotune-no-cudagraphs
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
@@ -85,7 +85,7 @@ fsdp_config:
fsdp_reshard_after_forward: true
fsdp_activation_checkpointing: true
special_tokens:
pad_token: <|finetune_right_pad|>
pad_token: <|finetune_right_pad_id|>
eos_token: <|eot|>
# save_first_step: true # uncomment this to validate checkpoint saving works with your config

View File

@@ -1,6 +1,6 @@
# Finetune Magistral Small with Axolotl
Magistral Small is a 24B parameter open-source model from MistralAI found on HuggingFace at [2506](https://huggingface.co/mistralai/Magistral-Small-2506) and [2507](https://huggingface.co/mistralai/Magistral-Small-2507) (see [Thinking](#thinking)). This guide shows how to fine-tune it with Axolotl on multi-turn conversations with proper masking.
Magistral Small is a 24B parameter open-source model from MistralAI found on [HuggingFace](https://huggingface.co/mistralai/Magistral-Small-2506). This guide shows how to fine-tune it with Axolotl on multi-turn conversations with proper masking.
MistralAI has also released a proprietary medium-sized version called Magistral Medium.
@@ -13,7 +13,7 @@ Thanks to the team at MistralAI for giving us early access to prepare for this r
Here is an example of how to install from `main` with pip:
```bash
# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
# Ensure you have Pytorch installed (Pytorch 2.6.0 recommended)
git clone https://github.com/axolotl-ai-cloud/axolotl.git
cd axolotl
@@ -31,37 +31,12 @@ This config uses about 24GB VRAM.
Let us know how it goes. Happy finetuning! 🚀
### Thinking
MistralAI has released their [2507](https://huggingface.co/mistralai/Magistral-Small-2507) model with thinking capabilities. The model requires the multi-content dataset format with support for an extra `thinking` content type within system and assistant messages.
Example format:
```json
{
  "messages": [
    {"role": "system", "content": [{"type": "text", "text": "{SYSTEM_PROMPT}"}]},
    {"role": "user", "content": [{"type": "text", "text": "..."}]},
    {"role": "assistant", "content": [{"type": "thinking", "thinking": "..."}, {"type": "text", "text": "..."}]}
  ]
}
```
Example config: `./magistral-small-think-qlora.yaml`.
The `thinking` section also supports an optional arg `closed: bool` (`True` default) which controls adding the closing `[/THINK]` tag.
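For instance, an assistant message that leaves the thinking block open might look like this (a sketch; placing `closed` inside the thinking entry is assumed from the description above):
```json
{"role": "assistant", "content": [{"type": "thinking", "thinking": "...", "closed": false}, {"type": "text", "text": "..."}]}
```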
Limitations:
- You cannot mix `content: str` with `content: list[dict]`, as `datasets.load_dataset` may complain about different types for the `content` key.
- This mode does not work with custom `train_detail` and `training` at the moment.
### TIPS
- We recommend adding the same or a similar system prompt to the one the model was tuned with. You can find it in the model repo in the file `SYSTEM_PROMPT.txt`.
- For inference, the official MistralAI team recommends `top_p: 0.95` and `temperature: 0.7` with `max_tokens: 40960`.
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
## Optimization Guides

View File

@@ -6,9 +6,6 @@ tokenizer_use_mistral_common: true
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
plugins:
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
load_in_8bit: false
load_in_4bit: true

View File

@@ -6,9 +6,6 @@ tokenizer_use_mistral_common: true
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
plugins:
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
load_in_8bit: false
load_in_4bit: true

View File

@@ -1,68 +0,0 @@
base_model: mistralai/Magistral-Small-2507
# Enable to use mistral-common tokenizer
tokenizer_use_mistral_common: true
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
plugins:
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
load_in_8bit: false
load_in_4bit: true
datasets:
- path: Nanobit/text-think-2k-test
type: chat_template
dataset_prepared_path: last_run_prepared
val_set_size: 0
output_dir: ./outputs/lora-out
adapter: qlora
lora_model_dir:
sequence_len: 2048
sample_packing: true
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
- gate_proj
- down_proj
- up_proj
- q_proj
- v_proj
- k_proj
- o_proj
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
# save_first_step: true # uncomment this to validate checkpoint saving works with your config

View File

@@ -41,7 +41,7 @@ resume_from_checkpoint:
logging_steps: 1
flash_attention:
warmup_ratio: 0.1
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

Some files were not shown because too many files have changed in this diff.