Qwen3.5-MoE example config with lora_target_modules regex (#3515) [skip ci]
* lora target modules with regex * updates * fsdp for non moe * update wording * chore: cleanup and lint * chore: cleanup docs from merge --------- Co-authored-by: NanoCode012 <nano@axolotl.ai>
This commit is contained in:
84
examples/qwen3.5/122b-a10b-moe-qlora-fsdp.yaml
Normal file
84
examples/qwen3.5/122b-a10b-moe-qlora-fsdp.yaml
Normal file
@@ -0,0 +1,84 @@
|
|||||||
|
base_model: Qwen/Qwen3.5-122B-A10B
|
||||||
|
|
||||||
|
plugins:
|
||||||
|
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||||
|
strict: false
|
||||||
|
|
||||||
|
chat_template: qwen3_5
|
||||||
|
datasets:
|
||||||
|
- path: mlabonne/FineTome-100k
|
||||||
|
type: chat_template
|
||||||
|
split: train[:20%]
|
||||||
|
field_messages: conversations
|
||||||
|
message_property_mappings:
|
||||||
|
role: from
|
||||||
|
content: value
|
||||||
|
val_set_size: 0.0
|
||||||
|
output_dir: ./outputs/out
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
|
||||||
|
sequence_len: 2048
|
||||||
|
sample_packing: true
|
||||||
|
|
||||||
|
load_in_4bit: true
|
||||||
|
quantize_moe_experts: true
|
||||||
|
adapter: qlora
|
||||||
|
lora_r: 16
|
||||||
|
lora_alpha: 32
|
||||||
|
lora_dropout: 0
|
||||||
|
lora_target_modules:
|
||||||
|
- q_proj
|
||||||
|
- k_proj
|
||||||
|
- v_proj
|
||||||
|
- o_proj
|
||||||
|
# Regex matching to target shared experts too
|
||||||
|
# lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj'
|
||||||
|
|
||||||
|
# Target experts
|
||||||
|
# lora_target_parameters:
|
||||||
|
# - mlp.experts.gate_up_proj
|
||||||
|
# - mlp.experts.down_proj
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_name:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 2
|
||||||
|
micro_batch_size: 1
|
||||||
|
num_epochs: 1
|
||||||
|
optimizer: adamw_torch_4bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 0.0002
|
||||||
|
|
||||||
|
bf16: auto
|
||||||
|
tf32: true
|
||||||
|
|
||||||
|
lora_mlp_kernel: false
|
||||||
|
lora_qkv_kernel: false
|
||||||
|
lora_o_kernel: false
|
||||||
|
|
||||||
|
gradient_checkpointing: true
|
||||||
|
gradient_checkpointing_kwargs:
|
||||||
|
use_reentrant: false
|
||||||
|
resume_from_checkpoint:
|
||||||
|
logging_steps: 1
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
warmup_ratio: 0.1
|
||||||
|
evals_per_epoch: 4
|
||||||
|
saves_per_epoch: 1
|
||||||
|
weight_decay: 0.0
|
||||||
|
special_tokens:
|
||||||
|
|
||||||
|
fsdp_config:
|
||||||
|
fsdp_version: 2
|
||||||
|
offload_params: true
|
||||||
|
cpu_ram_efficient_loading: false
|
||||||
|
auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||||
|
transformer_layer_cls_to_wrap: Qwen3_5MoeDecoderLayer
|
||||||
|
state_dict_type: FULL_STATE_DICT
|
||||||
|
sharding_strategy: FULL_SHARD
|
||||||
|
reshard_after_forward: true
|
||||||
|
activation_checkpointing: true
|
||||||
@@ -32,7 +32,11 @@ lora_target_modules:
|
|||||||
- v_proj
|
- v_proj
|
||||||
- o_proj
|
- o_proj
|
||||||
|
|
||||||
#lora_target_parameters:
|
# Regex matching to target shared experts too
|
||||||
|
# lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj'
|
||||||
|
|
||||||
|
# Target experts
|
||||||
|
# lora_target_parameters:
|
||||||
# - mlp.experts.gate_up_proj
|
# - mlp.experts.gate_up_proj
|
||||||
# - mlp.experts.down_proj
|
# - mlp.experts.down_proj
|
||||||
|
|
||||||
@@ -52,7 +56,6 @@ learning_rate: 0.0002
|
|||||||
bf16: auto
|
bf16: auto
|
||||||
tf32: true
|
tf32: true
|
||||||
|
|
||||||
|
|
||||||
lora_mlp_kernel: false
|
lora_mlp_kernel: false
|
||||||
lora_qkv_kernel: false
|
lora_qkv_kernel: false
|
||||||
lora_o_kernel: false
|
lora_o_kernel: false
|
||||||
|
|||||||
81
examples/qwen3.5/27b-qlora-fsdp.yaml
Normal file
81
examples/qwen3.5/27b-qlora-fsdp.yaml
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
base_model: Qwen/Qwen3.5-27B
|
||||||
|
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
|
plugins:
|
||||||
|
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||||
|
strict: false
|
||||||
|
|
||||||
|
chat_template: qwen3_5
|
||||||
|
datasets:
|
||||||
|
- path: mlabonne/FineTome-100k
|
||||||
|
type: chat_template
|
||||||
|
split: train[:20%]
|
||||||
|
field_messages: conversations
|
||||||
|
message_property_mappings:
|
||||||
|
role: from
|
||||||
|
content: value
|
||||||
|
val_set_size: 0.0
|
||||||
|
output_dir: ./outputs/out
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
|
||||||
|
sequence_len: 2048
|
||||||
|
sample_packing: true
|
||||||
|
|
||||||
|
load_in_4bit: true
|
||||||
|
adapter: qlora
|
||||||
|
lora_r: 16
|
||||||
|
lora_alpha: 32
|
||||||
|
lora_target_modules:
|
||||||
|
- q_proj
|
||||||
|
- k_proj
|
||||||
|
- v_proj
|
||||||
|
- o_proj
|
||||||
|
- down_proj
|
||||||
|
- up_proj
|
||||||
|
# Uncomment below to also target the linear attention projections.
|
||||||
|
# These use separate in_proj_qkv / in_proj_z / out_proj (Qwen3.5-specific).
|
||||||
|
# - linear_attn.in_proj_qkv
|
||||||
|
# - linear_attn.in_proj_z
|
||||||
|
# - linear_attn.out_proj
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_name:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 2
|
||||||
|
micro_batch_size: 1
|
||||||
|
num_epochs: 1
|
||||||
|
optimizer: adamw_torch_4bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 0.0002
|
||||||
|
|
||||||
|
bf16: auto
|
||||||
|
tf32: true
|
||||||
|
|
||||||
|
gradient_checkpointing: true
|
||||||
|
gradient_checkpointing_kwargs:
|
||||||
|
use_reentrant: false
|
||||||
|
resume_from_checkpoint:
|
||||||
|
logging_steps: 1
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
warmup_ratio: 0.1
|
||||||
|
evals_per_epoch: 4
|
||||||
|
saves_per_epoch: 1
|
||||||
|
weight_decay: 0.0
|
||||||
|
special_tokens:
|
||||||
|
|
||||||
|
fsdp_config:
|
||||||
|
fsdp_version: 2
|
||||||
|
offload_params: false
|
||||||
|
cpu_ram_efficient_loading: false
|
||||||
|
auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||||
|
transformer_layer_cls_to_wrap: Qwen3_5DecoderLayer
|
||||||
|
state_dict_type: FULL_STATE_DICT
|
||||||
|
sharding_strategy: FULL_SHARD
|
||||||
|
reshard_after_forward: true
|
||||||
|
activation_checkpointing: true
|
||||||
@@ -1,9 +1,7 @@
|
|||||||
base_model: Qwen/Qwen3.5-27B
|
base_model: Qwen/Qwen3.5-27B
|
||||||
|
|
||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
# Note: Qwen3.5 is an early-fusion VLM (image+text). This config fine-tunes
|
|
||||||
# the text-only path. For multimodal (image+text) fine-tuning, add image
|
|
||||||
# columns to your dataset following axolotl's multimodal dataset format.
|
|
||||||
|
|
||||||
plugins:
|
plugins:
|
||||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||||
|
|||||||
85
examples/qwen3.5/35b-a3b-moe-qlora-fsdp.yaml
Normal file
85
examples/qwen3.5/35b-a3b-moe-qlora-fsdp.yaml
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
base_model: Qwen/Qwen3.5-35B-A3B
|
||||||
|
|
||||||
|
plugins:
|
||||||
|
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||||
|
strict: false
|
||||||
|
|
||||||
|
chat_template: qwen3_5
|
||||||
|
datasets:
|
||||||
|
- path: mlabonne/FineTome-100k
|
||||||
|
type: chat_template
|
||||||
|
split: train[:20%]
|
||||||
|
field_messages: conversations
|
||||||
|
message_property_mappings:
|
||||||
|
role: from
|
||||||
|
content: value
|
||||||
|
val_set_size: 0.0
|
||||||
|
output_dir: ./outputs/out
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
|
||||||
|
sequence_len: 2048
|
||||||
|
sample_packing: true
|
||||||
|
|
||||||
|
load_in_4bit: true
|
||||||
|
quantize_moe_experts: true
|
||||||
|
adapter: qlora
|
||||||
|
lora_r: 16
|
||||||
|
lora_alpha: 32
|
||||||
|
lora_dropout: 0
|
||||||
|
lora_target_modules:
|
||||||
|
- q_proj
|
||||||
|
- k_proj
|
||||||
|
- v_proj
|
||||||
|
- o_proj
|
||||||
|
|
||||||
|
# Regex matching to target shared experts too
|
||||||
|
# lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj'
|
||||||
|
|
||||||
|
# Target experts
|
||||||
|
# lora_target_parameters:
|
||||||
|
# - mlp.experts.gate_up_proj
|
||||||
|
# - mlp.experts.down_proj
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_name:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 2
|
||||||
|
micro_batch_size: 1
|
||||||
|
num_epochs: 1
|
||||||
|
optimizer: adamw_torch_4bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 0.0002
|
||||||
|
|
||||||
|
bf16: auto
|
||||||
|
tf32: true
|
||||||
|
|
||||||
|
lora_mlp_kernel: false
|
||||||
|
lora_qkv_kernel: false
|
||||||
|
lora_o_kernel: false
|
||||||
|
|
||||||
|
gradient_checkpointing: true
|
||||||
|
gradient_checkpointing_kwargs:
|
||||||
|
use_reentrant: false
|
||||||
|
resume_from_checkpoint:
|
||||||
|
logging_steps: 1
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
warmup_ratio: 0.1
|
||||||
|
evals_per_epoch: 4
|
||||||
|
saves_per_epoch: 1
|
||||||
|
weight_decay: 0.0
|
||||||
|
special_tokens:
|
||||||
|
|
||||||
|
fsdp_config:
|
||||||
|
fsdp_version: 2
|
||||||
|
offload_params: true
|
||||||
|
cpu_ram_efficient_loading: false
|
||||||
|
auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||||
|
transformer_layer_cls_to_wrap: Qwen3_5MoeDecoderLayer
|
||||||
|
state_dict_type: FULL_STATE_DICT
|
||||||
|
sharding_strategy: FULL_SHARD
|
||||||
|
reshard_after_forward: true
|
||||||
|
activation_checkpointing: true
|
||||||
@@ -32,7 +32,11 @@ lora_target_modules:
|
|||||||
- v_proj
|
- v_proj
|
||||||
- o_proj
|
- o_proj
|
||||||
|
|
||||||
#lora_target_parameters:
|
# Regex matching to target shared experts too
|
||||||
|
# lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj'
|
||||||
|
|
||||||
|
# Target experts
|
||||||
|
# lora_target_parameters:
|
||||||
# - mlp.experts.gate_up_proj
|
# - mlp.experts.gate_up_proj
|
||||||
# - mlp.experts.down_proj
|
# - mlp.experts.down_proj
|
||||||
|
|
||||||
|
|||||||
@@ -26,8 +26,6 @@ lora_r: 32
|
|||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
# Targets the language model attention and MLP layers.
|
# Targets the language model attention and MLP layers.
|
||||||
# Qwen3.5 is early-fusion: all layers (including those seeing vision tokens) share
|
|
||||||
# the same transformer stack, so standard attention targets work for both modalities.
|
|
||||||
lora_target_modules:
|
lora_target_modules:
|
||||||
- q_proj
|
- q_proj
|
||||||
- k_proj
|
- k_proj
|
||||||
|
|||||||
@@ -2,20 +2,6 @@
|
|||||||
|
|
||||||
[Qwen3.5](https://huggingface.co/collections/Qwen/qwen35) is a hybrid architecture model series combining Gated DeltaNet linear attention with standard Transformer attention. All Qwen3.5 models are early-fusion vision-language models: dense variants use `Qwen3_5ForConditionalGeneration` and MoE variants use `Qwen3_5MoeForConditionalGeneration`.
|
[Qwen3.5](https://huggingface.co/collections/Qwen/qwen35) is a hybrid architecture model series combining Gated DeltaNet linear attention with standard Transformer attention. All Qwen3.5 models are early-fusion vision-language models: dense variants use `Qwen3_5ForConditionalGeneration` and MoE variants use `Qwen3_5MoeForConditionalGeneration`.
|
||||||
|
|
||||||
Vision and text tokens are processed through the same transformer stack. The configs below train on text-only data unless noted otherwise. See `9b-lora-vision.yaml` for a multimodal example.
|
|
||||||
|
|
||||||
Available configs:
|
|
||||||
|
|
||||||
| Config | Model | Type | Peak VRAM |
|
|
||||||
|---|---|---|---|
|
|
||||||
| `27b-qlora.yaml` | Qwen3.5-27B | Dense VLM, text-only QLoRA | ~47 GiB |
|
|
||||||
| `27b-fft.yaml` | Qwen3.5-27B | Dense VLM, text-only FFT (vision frozen) | ~53 GiB |
|
|
||||||
| `35b-a3b-moe-qlora.yaml` | Qwen3.5-35B-A3B | MoE, text-only QLoRA | — |
|
|
||||||
| `122b-a10b-moe-qlora.yaml` | Qwen3.5-122B-A10B | MoE, text-only QLoRA | — |
|
|
||||||
| `9b-lora-vision.yaml` | Qwen3.5-9B | Vision+text LoRA, single GPU | — |
|
|
||||||
| `9b-fft-vision.yaml` | Qwen3.5-9B | Vision+text FFT, single GPU | ~61 GiB |
|
|
||||||
|
|
||||||
|
|
||||||
## Getting started
|
## Getting started
|
||||||
|
|
||||||
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
|
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
|
||||||
@@ -23,43 +9,69 @@ Available configs:
|
|||||||
2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.
|
2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.
|
||||||
|
|
||||||
3. Install FLA for sample packing support with the Gated DeltaNet linear attention layers:
|
3. Install FLA for sample packing support with the Gated DeltaNet linear attention layers:
|
||||||
```bash
|
```bash
|
||||||
pip3 uninstall -y causal-conv1d && pip3 install flash-linear-attention==0.4.1
|
pip3 uninstall -y causal-conv1d && pip3 install flash-linear-attention==0.4.1
|
||||||
|
```
|
||||||
|
> FLA is required when `sample_packing: true`. Without it, training raises a `RuntimeError` on packed sequences. Vision configs use `sample_packing: false` so FLA is optional there.
|
||||||
|
|
||||||
|
4. Pick any config from the table below and run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
axolotl train examples/qwen3.5/<config>.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
Available configs:
|
||||||
|
|
||||||
|
| Config | Model | Type | Peak VRAM |
|
||||||
|
|---|---|---|---|
|
||||||
|
| `9b-lora-vision.yaml` | Qwen3.5-9B | Vision+text LoRA, single GPU | — |
|
||||||
|
| `9b-fft-vision.yaml` | Qwen3.5-9B | Vision+text FFT, single GPU | ~61 GiB |
|
||||||
|
| `27b-qlora.yaml` | Qwen3.5-27B | Dense, text-only QLoRA | ~47 GiB |
|
||||||
|
| `27b-fft.yaml` | Qwen3.5-27B | Dense, text-only FFT (vision frozen) | ~53 GiB |
|
||||||
|
| `27b-qlora-fsdp.yaml` | Qwen3.5-27B | Dense, text-only QLoRA + FSDP2 | — |
|
||||||
|
| `35b-a3b-moe-qlora.yaml` | Qwen3.5-35B-A3B | MoE, text-only QLoRA | — |
|
||||||
|
| `35b-a3b-moe-qlora-fsdp.yaml` | Qwen3.5-35B-A3B | MoE, text-only QLoRA + FSDP2 | — |
|
||||||
|
| `122b-a10b-moe-qlora.yaml` | Qwen3.5-122B-A10B | MoE, text-only QLoRA | — |
|
||||||
|
| `122b-a10b-moe-qlora-fsdp.yaml` | Qwen3.5-122B-A10B | MoE, text-only QLoRA + FSDP2 | — |
|
||||||
|
|
||||||
|
### Gated DeltaNet Linear Attention
|
||||||
|
|
||||||
|
Qwen3.5 interleaves standard attention with Gated DeltaNet linear attention layers. To apply LoRA to the linear attention layers as well, add the following entries to `lora_target_modules`:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
lora_target_modules:
|
||||||
|
# ... standard projections ...
|
||||||
|
- linear_attn.in_proj_qkv
|
||||||
|
- linear_attn.in_proj_z
|
||||||
|
- linear_attn.out_proj
|
||||||
```
|
```
|
||||||
> FLA is required when `sample_packing: true`. Without it, training raises a `RuntimeError` on packed sequences. Vision configs use `sample_packing: false` so FLA is optional there.
|
|
||||||
|
|
||||||
4. Run a finetuning example:
|
### Routed Experts (MoE)
|
||||||
|
|
||||||
```bash
|
To apply LoRA to routed expert parameters, add `lora_target_parameters`:
|
||||||
# Dense 27B text-only (QLoRA, ~47 GiB VRAM with sample packing)
|
|
||||||
axolotl train examples/qwen3.5/27b-qlora.yaml
|
|
||||||
|
|
||||||
# Dense 27B text-only FFT with vision encoder frozen (~53 GiB, single 80 GiB GPU)
|
```yaml
|
||||||
axolotl train examples/qwen3.5/27b-fft.yaml
|
lora_target_parameters:
|
||||||
|
- mlp.experts.gate_up_proj
|
||||||
|
- mlp.experts.down_proj
|
||||||
|
# - mlp.gate.weight # router
|
||||||
|
```
|
||||||
|
|
||||||
# MoE 35B-A3B text-only (QLoRA)
|
### Shared Experts (MoE)
|
||||||
axolotl train examples/qwen3.5/35b-a3b-moe-qlora.yaml
|
|
||||||
|
|
||||||
# MoE 122B-A10B text-only (QLoRA)
|
Routed experts and shared experts both have `gate_up_proj`/`down_proj`, so a plain module name in `lora_target_modules` would match both. Use a regex to target only attention and shared expert projections, while `lora_target_parameters` above handles routed experts separately:
|
||||||
axolotl train examples/qwen3.5/122b-a10b-moe-qlora.yaml
|
|
||||||
|
|
||||||
# 9B vision+text (LoRA, multimodal dataset)
|
|
||||||
axolotl train examples/qwen3.5/9b-lora-vision.yaml
|
|
||||||
|
|
||||||
# 9B vision+text FFT, single 80 GiB GPU (~61 GiB peak)
|
|
||||||
axolotl train examples/qwen3.5/9b-fft-vision.yaml
|
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj'
|
||||||
```
|
```
|
||||||
|
|
||||||
### TIPS
|
### TIPS
|
||||||
|
|
||||||
- For inference, you can experiment with `temperature: 0.7`, `top_p: 0.8`, `top_k: 20`, and `min_p: 0`.
|
- For inference hyperparameters, please see the respective model card.
|
||||||
- For **text-only FFT** on 27B, use `27b-fft.yaml` which sets `unfrozen_parameters` to freeze the vision encoder (`model.visual.*`) — this avoids wasting optimizer state on parameters that receive no gradient from text-only data.
|
|
||||||
- You can run a full finetuning of smaller configs by removing `adapter: qlora` and `load_in_4bit: true`. See [Multi-GPU](#optimization-guides) below.
|
- You can run a full finetuning of smaller configs by removing `adapter: qlora` and `load_in_4bit: true`. See [Multi-GPU](#optimization-guides) below.
|
||||||
- Read more on loading your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
|
- Read more on loading your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
|
||||||
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
|
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
|
||||||
- For **multimodal** finetuning, set `processor_type: AutoProcessor`, `skip_prepare_dataset: true`, and `remove_unused_columns: false` as shown in `9b-lora-vision.yaml`.
|
- For **multimodal** finetuning, set `processor_type: AutoProcessor`, `skip_prepare_dataset: true`, and `remove_unused_columns: false` as shown in `9b-lora-vision.yaml`.
|
||||||
- The Gated DeltaNet linear attention layers (`linear_attn.*`) can optionally be added to `lora_target_modules` — they are commented out by default.
|
|
||||||
|
|
||||||
## Optimization Guides
|
## Optimization Guides
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user