Compare commits

...

9 Commits

Author SHA1 Message Date
NanoCode012
08c8f3f22f fix: total tokens and defaults in config 2025-12-02 21:38:10 +07:00
NanoCode012
76f0fe2621 fix: steps not allowed fractional 2025-12-02 21:30:15 +07:00
Yohan Na
c6ddcdd06a feat: add exaone4 chat template and update enums (#3279)
* feat: add exaone4 chat template and update enums

* fix: handle first message as system or tools in exaone4 chat template

* Update src/axolotl/utils/chat_templates/templates/exaone4.jinja

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>

* fix: lint

---------

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
Co-authored-by: NanoCode012 <nano@axolotl.ai>
2025-12-01 15:52:45 +07:00
github-actions[bot]
7fb6a947d9 chore: update pre-commit hooks (#3287)
Co-authored-by: SalmanMohammadi <25081738+SalmanMohammadi@users.noreply.github.com>
2025-12-01 15:03:14 +07:00
NanoCode012
b234532d9f Feat: add peft_ensure_weight_tying (#3278)
* feat: upgrade peft to 0.18.0

* feat: add peft_ensure_weight_tying

* fix: default

* chore: adjust kwarg per feedback
2025-11-28 18:54:48 +07:00
VED
8990ca3205 fix: removed unused "scikit-learn==1.4.2" (#3277)
Co-authored-by: Ved <ved.work2024@gmail.com>
2025-11-24 13:48:53 +07:00
NanoCode012
006f226270 Feat: add Olmo3 (BC with Olmo and Olmo2) (#3275)
* feat: update cce to include olmo family

* chore: update docs following feedback

* feat: add olmo3 config

* fix: clarify 3 methods

* chore: add olmo to readme
2025-11-24 10:21:31 +07:00
Wing Lian
0b635e69c5 build docker images for 2.9.x (#3273) 2025-11-20 09:26:24 -05:00
Wing Lian
0d27e14e45 Torch 2.9.1 base images (#3268)
* update torch 2.9.1 base images

* update base dockerfile image check
2025-11-20 09:04:37 -05:00
22 changed files with 322 additions and 46 deletions

View File

@@ -57,14 +57,14 @@ jobs:
           cuda_version: 12.8.1
           cudnn_version: ""
           python_version: "3.11"
-          pytorch: 2.9.0
+          pytorch: 2.9.1
           torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
           dockerfile: "Dockerfile-base"
         - cuda: "130"
           cuda_version: 13.0.0
           cudnn_version: ""
           python_version: "3.11"
-          pytorch: 2.9.0
+          pytorch: 2.9.1
           torch_cuda_arch_list: "9.0+PTX"
           dockerfile: "Dockerfile-base"
         # - cuda: "128"
@@ -146,14 +146,14 @@ jobs:
           cuda_version: 12.8.1
           cudnn_version: ""
           python_version: "3.11"
-          pytorch: 2.9.0
+          pytorch: 2.9.1
           torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
           dockerfile: "Dockerfile-uv-base"
         - cuda: "130"
           cuda_version: 13.0.0
           cudnn_version: ""
           python_version: "3.11"
-          pytorch: 2.9.0
+          pytorch: 2.9.1
           torch_cuda_arch_list: "9.0+PTX"
           dockerfile: "Dockerfile-uv-base"
     steps:

View File

@@ -36,6 +36,16 @@ jobs:
           pytorch: 2.8.0
           axolotl_extras:
           is_latest: true
+        - cuda: 128
+          cuda_version: 12.8.1
+          python_version: "3.11"
+          pytorch: 2.9.0
+          axolotl_extras:
+        - cuda: 128
+          cuda_version: 12.8.1
+          python_version: "3.11"
+          pytorch: 2.9.1
+          axolotl_extras:
     runs-on: axolotl-gpu-runner
     steps:
       - name: Checkout
@@ -109,6 +119,16 @@ jobs:
           pytorch: 2.8.0
           axolotl_extras:
           is_latest: true
+        - cuda: 128
+          cuda_version: 12.8.1
+          python_version: "3.11"
+          pytorch: 2.9.0
+          axolotl_extras:
+        - cuda: 128
+          cuda_version: 12.8.1
+          python_version: "3.11"
+          pytorch: 2.9.1
+          axolotl_extras:
     runs-on: axolotl-gpu-runner
     steps:
       - name: Checkout

View File

@@ -11,13 +11,13 @@ repos:
   - id: no-commit-to-branch
     args: ['--branch', 'main']
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.14.3
+  rev: v0.14.7
   hooks:
   - id: ruff
     args: [--fix]
   - id: ruff-format
 - repo: https://github.com/pre-commit/mirrors-mypy
-  rev: v1.18.2
+  rev: v1.19.0
   hooks:
   - id: mypy
     additional_dependencies:
@@ -26,7 +26,7 @@ repos:
     'pydantic>=2.5.3',
   ]
 - repo: https://github.com/PyCQA/bandit
-  rev: 1.8.6
+  rev: 1.9.2
   hooks:
   - id: bandit
     args: [

View File

@@ -29,6 +29,7 @@
 ## 🎉 Latest Updates
 
+- 2025/11: Axolotl now includes support for [Olmo3](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/olmo3).
 - 2025/10: New model support has been added in Axolotl for: [Qwen3 Next](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/qwen3-next), [Qwen2.5-vl, Qwen3-vl](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen2_5-vl), [Qwen3, Qwen3MoE](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen3), [Granite 4](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/granite4), [HunYuan](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/hunyuan), [Magistral 2509](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral#vision), [Apertus](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/apertus), and [Seed-OSS](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/seed-oss).
 - 2025/09: Axolotl now has text diffusion training. Read more [here](https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/integrations/diffusion).
 - 2025/08: QAT has been updated to include NVFP4 support. See [PR](https://github.com/axolotl-ai-cloud/axolotl/pull/3107).

View File

@@ -51,7 +51,7 @@ RUN git lfs install --skip-repo && \
     pip3 install -U --no-cache-dir pydantic==1.10.10 && \
     pip3 cache purge
-RUN if [ "$PYTORCH_VERSION" = "2.9.0" ] && [ "$CUDA" = "128" ] ; then \
+RUN if [ "$PYTORCH_VERSION" = "2.9.1" ] && [ "$CUDA" = "128" ] ; then \
     wget https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.4.17/flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
     pip3 install --no-cache-dir flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
     rm flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \

View File

@@ -4,7 +4,7 @@ format:
   html:
     toc: true
     toc-depth: 3
-    number-sections: true
+    # number-sections: true
     code-tools: true
 execute:
   enabled: false
@@ -14,12 +14,18 @@ This guide covers advanced training configurations for multi-GPU setups using Ax
 ## Overview {#sec-overview}
 
-Axolotl supports several methods for multi-GPU training:
-
-- DeepSpeed (recommended)
-- FSDP (Fully Sharded Data Parallel)
-- Sequence parallelism
-- FSDP + QLoRA
+When training on multiple GPUs, Axolotl supports 3 sharding/parallelism strategies. Additionally, you can layer specific optimization features on top of that strategy.
+You generally cannot combine these strategies; they are mutually exclusive.
+
+1. **DeepSpeed**: Powerful optimization library, supports ZeRO stages 1-3.
+2. **FSDP (Fully Sharded Data Parallel)**: PyTorch's native sharding implementation (Recommended).
+3. **DDP (Distributed Data Parallel)**: PyTorch's native parallelism implementation (Default if neither of the above are selected).
+
+These features can often be combined with the strategies above:
+
+* **Sequence Parallelism**: Splits long sequences across GPUs (Compatible with DDP, DeepSpeed, and FSDP).
+* **FSDP + QLoRA**: Combines 4-bit quantization with FSDP (Specific to FSDP).
 
 ## DeepSpeed {#sec-deepspeed}
@@ -65,12 +71,18 @@ Start from Stage 1 -> Stage 2 -> Stage 3.
 ## Fully Sharded Data Parallel (FSDP) {#sec-fsdp}
 
+FSDP allows you to shard model parameters, gradients, and optimizer states across data parallel workers.
+
 ::: {.callout-note}
 FSDP2 is recommended for new users. FSDP1 is deprecated and will be removed in an upcoming release of Axolotl.
 :::
 
+### FSDP + QLoRA {#sec-fsdp-qlora}
+
+For combining FSDP with QLoRA, see our [dedicated guide](fsdp_qlora.qmd).
+
 ### Migrating from FSDP1 to FSDP2 {#sec-migrate-fsdp1-fsdp2}
 
 To migrate your config from FSDP1 to FSDP2, you must use the `fsdp_version` top-level config field to specify the FSDP version, and
@@ -145,10 +157,6 @@ single sequence causes OOM errors during model training.
 See our [dedicated guide](sequence_parallelism.qmd) for more information.
 
-### FSDP + QLoRA {#sec-fsdp-qlora}
-
-For combining FSDP with QLoRA, see our [dedicated guide](fsdp_qlora.qmd).
-
 ## Performance Optimization {#sec-performance}
 
 ### Liger Kernel Integration {#sec-liger}
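As a concrete reading of the reworked overview in this file, a minimal config sketch for selecting one sharding strategy might look like the following. Only the `fsdp_version` field is confirmed by this diff (see the FSDP1→FSDP2 migration note); the nested `fsdp_config` keys are illustrative assumptions and may differ across Axolotl versions.

```yaml
# Sketch only: choose exactly one sharding strategy per run.
# `fsdp_version` is referenced in the migration section of this diff;
# the nested keys below are assumptions, not confirmed by this changeset.
fsdp_version: 2
fsdp_config:
  offload_params: false             # assumed key: keep params on GPU
  state_dict_type: FULL_STATE_DICT  # assumed key: checkpoint format

# Omitting both `fsdp_config` and `deepspeed` falls back to DDP,
# per the overview text above.
```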

View File

@@ -40,7 +40,7 @@
     "%%capture\n",
     "# This step can take ~5-10 minutes to install dependencies\n",
     "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
-    "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@8a1a0ec\""
+    "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@5eff953\""
    ]
   },
   {

View File

@@ -30,7 +30,7 @@ eval_sample_packing: true
 gradient_accumulation_steps: 4
 micro_batch_size: 4
 num_epochs: 1
-warmup_steps: 0.1
+warmup_ratio: 0.1
 optimizer: adamw_8bit
 lr_scheduler: cosine
@@ -44,7 +44,7 @@ resume_from_checkpoint:
 sdp_attention: true
 logging_steps: 1
-save_strategy: best
+save_strategy: epoch
 eval_strategy: epoch
 special_tokens:
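The `warmup_steps: 0.1` → `warmup_ratio: 0.1` change above corresponds to the "fix: steps not allowed fractional" commit: `warmup_steps` counts optimizer steps and must be a whole number, while `warmup_ratio` expresses a fraction of total training steps. A sketch of the two valid forms (values illustrative):

```yaml
# Use one or the other, not both:
warmup_steps: 100    # absolute number of warmup steps; integers only
# warmup_ratio: 0.1  # fraction of total training steps spent warming up
```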

examples/olmo3/README.md (new file, 46 lines)
View File

@@ -0,0 +1,46 @@
# Finetune Allenai's Olmo 3 with Axolotl
[Olmo 3](https://huggingface.co/collections/allenai/olmo-3) is a family of open-source 7B and 32B models trained by the Allen Institute for Artificial Intelligence.
This guide shows how to fine-tune them with Axolotl using multi-turn conversations and proper masking.
## Getting started
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
Here is an example of how to install from pip:
```bash
# Ensure you have a compatible version of Pytorch installed
pip3 install packaging setuptools wheel ninja
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
# Install Cut Cross Entropy
python scripts/cutcrossentropy_install.py | sh
```
2. Run the finetuning example:
```bash
axolotl train examples/olmo3/olmo3-7b-qlora.yaml
```
Let us know how it goes. Happy finetuning! 🚀
### TIPS
- The example config can be re-used for Olmo and Olmo 2.
- You can run a full finetuning by removing `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
## Optimization Guides
Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).
## Related Resources
- [Olmo 3 Blog](https://allenai.org/blog/olmo3)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)

View File

@@ -0,0 +1,64 @@
base_model: allenai/Olmo-3-7B-Instruct-SFT
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
plugins:
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
load_in_8bit: false
load_in_4bit: true
datasets:
- path: fozziethebeat/alpaca_messages_2k_test
type: chat_template
dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out
adapter: qlora
lora_model_dir:
sequence_len: 2048
sample_packing: true
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
- gate_proj
- down_proj
- up_proj
- q_proj
- v_proj
- k_proj
- o_proj
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
# save_first_step: true # uncomment this to validate checkpoint saving works with your config

View File

@@ -6,21 +6,17 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
 ## Getting started
 
-1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from main as Seed-OSS is only on nightly or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).
+1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
 
-Here is an example of how to install from main for pip:
+Here is an example of how to install from pip:
 
-```bash
-# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
-git clone https://github.com/axolotl-ai-cloud/axolotl.git
-cd axolotl
-pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation -e '.[flash-attn]'
-
-# Install Cut Cross Entropy
-python scripts/cutcrossentropy_install.py | sh
-```
+```bash
+# Ensure you have a compatible version of Pytorch installed
+pip3 install packaging setuptools wheel ninja
+pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+
+# Install Cut Cross Entropy
+python scripts/cutcrossentropy_install.py | sh
+```
 
 2. Run the finetuning example:
@@ -41,9 +37,7 @@ Let us know how it goes. Happy finetuning! 🚀
 ## Optimization Guides
 
-- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
-- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
-- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
+Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).
 
 ## Related Resources

View File

@@ -37,9 +37,7 @@ This guide shows how to fine-tune SmolVLM2 models with Axolotl.
 ## Optimization Guides
 
-- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
-- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
-- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
+Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).
 
 ## Related Resources

View File

@@ -11,7 +11,7 @@ liger-kernel==0.6.3
 packaging==23.2
 huggingface_hub>=0.36.0
-peft>=0.17.1
+peft>=0.18.0
 tokenizers>=0.22.1
 transformers==4.57.1
 accelerate==1.11.0
@@ -42,7 +42,6 @@ numpy>=2.2.6
 # qlora things
 evaluate==0.4.1
 scipy
-scikit-learn==1.4.2
 nvidia-ml-py==12.560.30
 art
 tensorboard

View File

@@ -29,5 +29,5 @@ UV_PREFIX = "uv " if USE_UV else ""
 print(
     UNINSTALL_PREFIX
-    + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@8a1a0ec"'
+    + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@5eff953"'
 )

View File

@@ -631,7 +631,9 @@ class AxolotlTrainer(
             logs["tokens_per_second_per_gpu"] = round(
                 self.state.last_tokens_per_second.item() / self.args.logging_steps, 2
             )
-            logs["total_tokens"] = int(self.state.total_tokens.item())
+            if hasattr(self.state, "total_tokens"):
+                logs["total_tokens"] = int(self.state.total_tokens.item())
 
         del self._stored_metrics[train_eval]

View File

@@ -19,7 +19,7 @@ python scripts/cutcrossentropy_install.py | sh
 - If you are installing from pip
 
 ```bash
-pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@8a1a0ec"
+pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@5eff953"
 ```
 
 ## Usage
@@ -65,6 +65,9 @@ plugins:
 - mistral3
 - mixtral
 - mllama
+- olmo
+- olmo2
+- olmo3
 - phi
 - phi3
 - phi4_multimodal
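For reference, the plugin whose supported-model list gains the Olmo entries above is enabled through the `plugins` config key, as the olmo3 example config earlier in this changeset shows; the base model value below is copied from that config:

```yaml
plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

# Any model family from the supported list above, e.g. the Olmo 3 SFT model
# used in this changeset's example config:
base_model: allenai/Olmo-3-7B-Instruct-SFT
```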

View File

@@ -35,7 +35,7 @@ LOG = get_logger(__name__)
 
 _CCE_INSTALL_MESSAGE = (
     "Please install Axolotl's fork of cut_cross_entropy with transformers support using "
-    '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@8a1a0ec"`'
+    '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@5eff953"`'
 )

View File

@@ -102,6 +102,8 @@ def load_lora(
         lora_config_kwargs["layer_replication"] = cfg.peft_layer_replication
     if cfg.peft_trainable_token_indices:
         lora_config_kwargs["trainable_token_indices"] = cfg.peft_trainable_token_indices
+    if cfg.peft_ensure_weight_tying is not None:
+        lora_config_kwargs["ensure_weight_tying"] = cfg.peft_ensure_weight_tying
 
     # Determine the correct PEFT task type
     model_cls = type(model).__name__
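A minimal sketch of driving the new kwarg from a user config, based on the `peft_ensure_weight_tying` field added later in this changeset (the surrounding adapter settings are illustrative):

```yaml
adapter: lora
lora_r: 32
lora_alpha: 16

# Added in this changeset: ask PEFT to tie adapter weights when the base
# model ties its embeddings (see huggingface/peft issue #2864).
peft_ensure_weight_tying: true
```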

View File

@@ -49,6 +49,9 @@ SUPPORTED_MULTIPACK_MODEL_TYPES = [
     "seed_oss",
     "lfm2",
     "lfm2_moe",
+    "olmo",
+    "olmo2",
+    "olmo3",
 ]
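These `SUPPORTED_MULTIPACK_MODEL_TYPES` additions let the Olmo family use sample packing; the olmo3 example config in this changeset enables it with:

```yaml
sample_packing: true
sequence_len: 2048
```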

View File

@@ -0,0 +1,126 @@
{%- if not skip_think is defined %}
{%- set skip_think = true %}
{%- endif %}
{%- set role_indicators = {
'user': '[|user|]\n',
'assistant': '[|assistant|]\n',
'system': '[|system|]\n',
'tool': '[|tool|]\n'
} %}
{%- set end_of_turn = '[|endofturn|]\n' %}
{%- macro available_tools(tools) %}
{{- "# Available Tools" }}
{{- "\nYou can use none, one, or multiple of the following tools by calling them as functions to help with the users query." }}
{{- "\nHere are the tools available to you in JSON format within <tool> and </tool> tags:\n" }}
{%- for tool in tools %}
{{- "<tool>" }}
{{- tool | tojson(ensure_ascii=False) | safe }}
{{- "</tool>\n" }}
{%- endfor %}
{{- "\nFor each function call you want to make, return a JSON object with function name and arguments within <tool_call> and </tool_call> tags, like:" }}
{{- "\n<tool_call>{\"name\": function_1_name, \"arguments\": {argument_1_name: argument_1_value, argument_2_name: argument_2_value}}</tool_call>" }}
{{- "\n<tool_call>{\"name\": function_2_name, \"arguments\": {...}}</tool_call>\n..." }}
{{- "\nNote that if no argument name is specified for a tool, you can just print the argument value directly, without the argument name or JSON formatting." }}
{%- endmacro %}
{%- set ns = namespace(last_query_index = messages|length - 1) %}
{%- for message in messages %}
{%- if message.role == "user" and message.content is string %}
{%- set ns.last_query_index = loop.index0 -%}
{%- endif %}
{%- endfor %}
{%- for i in range(messages | length) %}
{%- set msg = messages[i] %}
{%- set role = msg.role %}
{%- if role not in role_indicators %}
{{- raise_exception('Unknown role: ' ~ role) }}
{%- endif %}
{# ---- Case A: If the first message is "system", handle it here alone (without continue) ---- #}
{%- if i == 0 and role == 'system' %}
{{- role_indicators['system'] }}
{{- msg.content }}
{%- if tools is defined and tools %}
{{- "\n\n" }}{{- available_tools(tools) }}
{%- endif %}
{{- end_of_turn -}}
{%- else %}
{# ---- Case B: If the first message is tools instead of system, inject the system tools preamble ---- #}
{%- if i == 0 and tools is defined and tools %}
{{- role_indicators['system'] }}
{{- available_tools(tools) }}
{{- end_of_turn -}}
{%- endif %}
{%- endif %}
{%- if role == 'assistant' %}
{{- role_indicators['assistant'] }}
{%- if msg.content %}
{%- if "</think>" in msg.content %}
{%- set content = msg.content.split('</think>')[-1].strip() %}
{%- set reasoning_content = msg.content.split('</think>')[0].strip() %}
{%- if reasoning_content.startswith("<think>") %}
{%- set reasoning_content = reasoning_content[7:].strip() %}
{%- endif %}
{%- else %}
{%- set content = msg.content %}
{%- endif %}
{%- if msg.reasoning_content %}
{%- set reasoning_content = msg.reasoning_content %}
{%- endif %}
{%- if (not skip_think and loop.last) and reasoning_content is defined %}
{{- "<think>\n" }}
{{- reasoning_content}}
{{- "\n</think>\n\n" }}
{%- else %}
{{- "<think>\n\n</think>\n\n" }}
{%- endif %}
{{- content }}
{%- endif %}
{%- if msg.tool_calls %}
{%- if msg.content %}
{{- "\n" }}
{%- else %}
{{- "<think>\n\n</think>\n\n" }}
{%- endif %}
{%- for tool_call in msg.tool_calls %}
{%- if tool_call.function is defined %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{%- if tool_call.arguments is defined %}
{%- set arguments = tool_call.arguments %}
{%- elif tool_call.parameters is defined %}
{%- set arguments = tool_call.parameters %}
{%- else %}
{{- raise_exception('arguments or parameters are mandatory: ' ~ tool_call) }}
{%- endif %}
{{- "<tool_call>" }}{"name": "{{- tool_call.name }}", "arguments": {{ arguments | tojson(ensure_ascii=False) | safe }}}{{- "</tool_call>" }}
{%- if not loop.last %}
{{- "\n" }}
{%- endif %}
{%- endfor %}
{%- endif %}
{{- end_of_turn -}}
{%- elif role == "tool" %}
{%- if i == 0 or messages[i - 1].role != "tool" %}
{{- role_indicators['tool'] }}
{%- endif %}
{%- if msg.content is defined %}
{{- "<tool_result>" }}{"result": {{ msg.content | tojson(ensure_ascii=False) | safe }}}{{- "</tool_result>" }}
{%- endif %}
{%- if loop.last or messages[i + 1].role != "tool" %}
{{- end_of_turn -}}
{%- else %}
{{- "\n" }}
{%- endif %}
{%- else %}
{{- role_indicators[role] }}
{{- msg.content }}
{{- end_of_turn -}}
{%- endif %}
{% endfor %}
{%- if add_generation_prompt %}
{{- role_indicators['assistant'] }}
{%- if enable_thinking is defined and enable_thinking is true %}
{{- "<think>\n" }}
{%- else %}
{{- "<think>\n\n</think>\n\n" }}
{%- endif %}
{%- endif %}

View File

@@ -58,6 +58,7 @@ class ChatTemplate(str, Enum):
     falcon_h1 = "falcon_h1"
     tokenizer_default = "tokenizer_default"
     exaone = "exaone"
+    exaone4 = "exaone4"
     metharme = "metharme"
     pixtral = "pixtral"
     llava = "llava"
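With the new enum value above, a dataset config can select the EXAONE 4 template; a sketch (the dataset path is hypothetical, while the `type: chat_template` pattern matches the olmo3 example config in this changeset):

```yaml
chat_template: exaone4
datasets:
  - path: your-org/your-dataset   # hypothetical dataset
    type: chat_template
```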

View File

@@ -100,6 +100,15 @@ class LoraConfig(BaseModel):
             )
         },
     )
+    peft_ensure_weight_tying: bool | None = Field(
+        default=None,
+        json_schema_extra={
+            "description": (
+                "Whether to tie adapter weights for tied model weights. "
+                "See https://github.com/huggingface/peft/issues/2864"
+            )
+        },
+    )
     qlora_sharded_model_loading: bool | None = Field(
         default=False,