Compare commits


1 commit

Wing Lian · 747dafe5b2 · Add Llama4 maverick examples · 2025-04-09 08:27:46 -04:00
185 changed files with 383 additions and 1574 deletions

View File

@@ -1,14 +0,0 @@
[run]
source = axolotl
omit =
    */tests/*
    setup.py

[report]
exclude_lines =
    pragma: no cover
    def __repr__
    raise NotImplementedError
    if __name__ == .__main__.:
    pass
    raise ImportError
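For context on what this deleted file did: coverage.py reads `.coveragerc` automatically, with `[run]` controlling what gets measured and `[report]` filtering lines out of the report. A minimal sketch of how it would have been exercised locally (the invocation is illustrative, not part of this diff):

```bash
# pytest-cov picks up .coveragerc from the repository root
pip install pytest pytest-cov
pytest tests/ --cov=axolotl --cov-report=term-missing
```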

View File

@@ -29,7 +29,7 @@ jobs:
           cuda_version: 12.4.1
           python_version: "3.11"
           pytorch: 2.6.0
-          axolotl_extras: vllm
+          axolotl_extras:
           is_latest: true
     runs-on: axolotl-gpu-runner
     steps:

View File

@@ -102,16 +102,9 @@ jobs:
       - name: Run tests
         run: |
-          pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/ --cov=axolotl --cov-report=xml
-          pytest -v tests/patched/ --cov=axolotl --cov-append --cov-report=xml
-          pytest -v tests/cli/ --cov=axolotl --cov-append --cov-report=xml
+          pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
+          pytest -v tests/patched/
+          pytest -v tests/cli/
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v5
-        with:
-          files: ./coverage.xml
-          flags: unittests,pytorch-${{ matrix.pytorch_version }}
-          fail_ci_if_error: false
       - name: cleanup pip cache
         run: |
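The removed steps relied on pytest-cov's append mode, where several pytest invocations accumulate into a single `coverage.xml` that the Codecov action then uploads. A minimal sketch of that pattern (test paths illustrative):

```bash
pytest tests/unit/ --cov=axolotl --cov-report=xml               # first run writes coverage data
pytest tests/more/ --cov=axolotl --cov-append --cov-report=xml  # later runs append to it
```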

CNAME
View File

@@ -1 +0,0 @@
docs.axolotl.ai

View File

@@ -9,7 +9,6 @@
 <p align="center">
     <img src="https://img.shields.io/github/license/axolotl-ai-cloud/axolotl.svg?color=blue" alt="GitHub License">
     <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests.yml/badge.svg" alt="tests">
-    <a href="https://codecov.io/gh/axolotl-ai-cloud/axolotl"><img src="https://codecov.io/gh/axolotl-ai-cloud/axolotl/branch/main/graph/badge.svg" alt="codecov"></a>
     <a href="https://github.com/axolotl-ai-cloud/axolotl/releases"><img src="https://img.shields.io/github/release/axolotl-ai-cloud/axolotl.svg" alt="Releases"></a>
     <br/>
     <a href="https://github.com/axolotl-ai-cloud/axolotl/graphs/contributors"><img src="https://img.shields.io/github/contributors-anon/axolotl-ai-cloud/axolotl?color=yellow&style=flat-square" alt="contributors" style="height: 20px;"></a>

@@ -64,7 +63,7 @@ axolotl fetch examples
 axolotl fetch deepspeed_configs  # OPTIONAL
 ```

-Other installation approaches are described [here](https://docs.axolotl.ai/docs/installation.html).
+Other installation approaches are described [here](https://axolotl-ai-cloud.github.io/axolotl/docs/installation.html).

 ### Your First Fine-tune

@@ -79,7 +78,7 @@ axolotl fetch examples --dest path/to/folder
 axolotl train examples/llama-3/lora-1b.yml
 ```

-That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/getting-started.html) for a more detailed walkthrough.
+That's it! Check out our [Getting Started Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/getting-started.html) for a more detailed walkthrough.

 ## ✨ Key Features

@@ -92,20 +91,20 @@ That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/ge
 ## 📚 Documentation

-- [Installation Options](https://docs.axolotl.ai/docs/installation.html) - Detailed setup instructions for different environments
-- [Configuration Guide](https://docs.axolotl.ai/docs/config.html) - Full configuration options and examples
-- [Dataset Guide](https://docs.axolotl.ai/docs/dataset-formats/) - Supported formats and how to use them
-- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
-- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
-- [Multipacking](https://docs.axolotl.ai/docs/multipack.html)
-- [API Reference](https://docs.axolotl.ai/docs/api/) - Auto-generated code documentation
-- [FAQ](https://docs.axolotl.ai/docs/faq.html) - Frequently asked questions
+- [Installation Options](https://axolotl-ai-cloud.github.io/axolotl/docs/installation.html) - Detailed setup instructions for different environments
+- [Configuration Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/config.html) - Full configuration options and examples
+- [Dataset Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/) - Supported formats and how to use them
+- [Multi-GPU Training](https://axolotl-ai-cloud.github.io/axolotl/docs/multi-gpu.html)
+- [Multi-Node Training](https://axolotl-ai-cloud.github.io/axolotl/docs/multi-node.html)
+- [Multipacking](https://axolotl-ai-cloud.github.io/axolotl/docs/multipack.html)
+- [API Reference](https://axolotl-ai-cloud.github.io/axolotl/docs/api/) - Auto-generated code documentation
+- [FAQ](https://axolotl-ai-cloud.github.io/axolotl/docs/faq.html) - Frequently asked questions

 ## 🤝 Getting Help

 - Join our [Discord community](https://discord.gg/HhrNrHJPRb) for support
 - Check out our [Examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/) directory
-- Read our [Debugging Guide](https://docs.axolotl.ai/docs/debugging.html)
+- Read our [Debugging Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/debugging.html)
 - Need dedicated support? Please contact [wing@axolotl.ai](mailto:wing@axolotl.ai) for options

 ## 🌟 Contributing

View File

@@ -3,59 +3,10 @@ set -e
 python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"

-# Run unit tests with initial coverage report
-pytest -v --durations=10 -n8 \
-    --ignore=tests/e2e/ \
-    --ignore=tests/patched/ \
-    --ignore=tests/cli \
-    /workspace/axolotl/tests/ \
-    --cov=axolotl \
-    --cov-report=xml:coverage.xml
-
-# Run lora kernels tests with coverage append
-pytest -v --durations=10 \
-    /workspace/axolotl/tests/e2e/patched/lora_kernels \
-    --cov=axolotl \
-    --cov-append
-
-# Run patched tests excluding lora kernels with coverage append
-pytest -v --durations=10 \
-    --ignore=tests/e2e/patched/lora_kernels \
-    /workspace/axolotl/tests/e2e/patched \
-    --cov=axolotl \
-    --cov-append
-
-# Run solo tests with coverage append
-pytest -v --durations=10 -n1 \
-    /workspace/axolotl/tests/e2e/solo/ \
-    --cov=axolotl \
-    --cov-append
-
-# Run integration tests with coverage append
-pytest -v --durations=10 \
-    /workspace/axolotl/tests/e2e/integrations/ \
-    --cov=axolotl \
-    --cov-append
-
-pytest -v --durations=10 /workspace/axolotl/tests/cli \
-    --cov=axolotl \
-    --cov-append
-
-# Run remaining e2e tests with coverage append and final report
-pytest -v --durations=10 \
-    --ignore=tests/e2e/solo/ \
-    --ignore=tests/e2e/patched/ \
-    --ignore=tests/e2e/multigpu/ \
-    --ignore=tests/e2e/integrations/ \
-    --ignore=tests/cli \
-    /workspace/axolotl/tests/e2e/ \
-    --cov=axolotl \
-    --cov-append \
-    --cov-report=xml:coverage.xml
-
-# Upload coverage to Codecov
-if [ -f e2e-coverage.xml ]; then
-    codecov -f e2e-coverage.xml -F e2e,pytorch-${PYTORCH_VERSION}
-else
-    echo "Coverage file not found. Coverage report may have failed."
-fi
+pytest -v --durations=10 -n8 --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli /workspace/axolotl/tests/
+pytest -v --durations=10 /workspace/axolotl/tests/e2e/patched/lora_kernels  # running these with the other patches causes a failure
+pytest -v --durations=10 --ignore=tests/e2e/patched/lora_kernels /workspace/axolotl/tests/e2e/patched
+pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/solo/
+pytest -v --durations=10 /workspace/axolotl/tests/e2e/integrations/
+pytest -v --durations=10 /workspace/axolotl/tests/cli
+pytest -v --durations=10 --ignore=tests/e2e/solo/ --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ --ignore=tests/e2e/integrations/ --ignore=tests/cli /workspace/axolotl/tests/e2e/

View File

@@ -4,22 +4,3 @@ set -e
 # only run one test at a time so as not to OOM the GPU
 pytest -v --durations=10 -n2 /workspace/axolotl/tests/e2e/multigpu/ --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/
 pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/solo/
-
-# Only run two tests at a time to avoid OOM on GPU (with coverage collection)
-pytest -v -n2 \
-    --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ \
-    /workspace/axolotl/tests/e2e/multigpu/ \
-    --cov=axolotl \
-    --cov-report=xml:multigpu-coverage.xml
-
-pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/solo/ \
-    --cov=axolotl \
-    --cov-append \
-    --cov-report=xml:multigpu-coverage.xml
-
-# Upload coverage to Codecov
-if [ -f multigpu-coverage.xml ]; then
-    codecov -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION}
-else
-    echo "Coverage file not found. Coverage report may have failed."
-fi

View File

@@ -1,51 +0,0 @@
codecov:
  require_ci_to_pass: yes

coverage:
  precision: 2
  round: down
  range: "70...100"

  status:
    project:
      default:
        # basic
        target: auto
        threshold: 0%
        base: auto
        # advanced
        branches: null
        if_no_uploads: error
        if_not_found: success
        if_ci_failed: error
        only_pulls: false
        flags: null
        paths: null
    patch:
      default:
        # basic
        target: auto
        threshold: 0%
        base: auto
        # advanced
        branches: null
        if_no_uploads: error
        if_not_found: success
        if_ci_failed: error
        only_pulls: false
        flags: null
        paths: null

parsers:
  gcov:
    branch_detection:
      conditional: yes
      loop: yes
      method: no
      macro: no

comment:
  layout: "reach,diff,flags,files,footer"
  behavior: default
  require_changes: no
  require_base: no
  require_head: yes

View File

@@ -90,7 +90,7 @@ lora_on_cpu: true
 # List[str]. Add plugins to extend the pipeline.
 # See `src/axolotl/integrations` for the available plugins or doc below for more details.
-# https://docs.axolotl.ai/docs/custom_integrations.html
+# https://axolotl-ai-cloud.github.io/axolotl/docs/custom_integrations.html
 plugins:
   # - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

@@ -394,7 +394,7 @@ lora_fan_in_fan_out: false
 # Apply custom LoRA autograd functions and activation function Triton kernels for
 # speed and memory savings
-# See: https://docs.axolotl.ai/docs/lora_optims.html
+# See: https://axolotl-ai-cloud.github.io/axolotl/docs/lora_optims.html
 lora_mlp_kernel: true
 lora_qkv_kernel: true
 lora_o_kernel: true

@@ -688,14 +688,11 @@ ddp_broadcast_buffers:
 # Use in long context training to prevent OOM when sequences cannot fit into a single GPU's VRAM.
 # E.g., if 4 GPUs are available, set this value to 2 to split each sequence into two equal-sized
 # subsequences, or set to 4 to split into four equal-sized subsequences.
-# See https://docs.axolotl.ai/docs/sequence_parallelism.html for more details.
+# See https://axolotl-ai-cloud.github.io/axolotl/docs/sequence_parallelism.html for more details.
 sequence_parallel_degree:
 # Optional; strides across the key dimension. Larger values use more memory but should make training faster.
 # Must evenly divide the number of KV heads in your model.
 heads_k_stride: 1
-# One of "varlen_llama3", "batch_ring", "batch_zigzag", "batch_stripe". Defaults to "varlen_llama3"
-# in the sample packing case, and "batch_ring" in the non-sample packing case.
-ring_attn_func:
 # Path to torch distx for optim 'adamw_anyprecision'
 torchdistx_path:
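The LoRA kernel switches documented above are plain booleans alongside the usual adapter settings. A minimal sketch of how they combine in a config (adapter values illustrative):

```yaml
adapter: lora
lora_r: 32
lora_alpha: 16
# Triton kernel optimizations described above
lora_mlp_kernel: true
lora_qkv_kernel: true
lora_o_kernel: true
```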

View File

@@ -457,7 +457,10 @@ datasets:
     type: alpaca
 ```

-Axolotl supports many kinds of instruction datasets. All of them can be found in the [Instruction Dataset Documentation](inst_tune.qmd) with their respective type and sample row format.
+Axolotl supports many kinds of instruction datasets. All of them can be found here (https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/inst_tune.html) with their respective type and sample row format.
+
+Reference: [Instruction Dataset Documentation](inst_tune.qmd).

 #### Custom Instruct Prompt Format
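For reference, a `datasets` entry of the shape shown in this hunk looks like the following (dataset path illustrative):

```yaml
datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
```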

View File

@@ -36,9 +36,6 @@ deepspeed: deepspeed_configs/zero1.json
 ### Usage {#sec-deepspeed-usage}

 ```{.bash}
-# Fetch deepspeed configs (if not already present)
-axolotl fetch deepspeed_configs
-
 # Passing arg via config
 axolotl train config.yml

@@ -51,20 +48,10 @@ axolotl train config.yml --deepspeed deepspeed_configs/zero1.json
 We provide default configurations for:

 - ZeRO Stage 1 (`zero1.json`)
-- ZeRO Stage 1 with torch compile (`zero1_torch_compile.json`)
 - ZeRO Stage 2 (`zero2.json`)
 - ZeRO Stage 3 (`zero3.json`)
-- ZeRO Stage 3 with bf16 (`zero3_bf16.json`)
-- ZeRO Stage 3 with bf16 and CPU offload params (`zero3_bf16_cpuoffload_params.json`)
-- ZeRO Stage 3 with bf16 and CPU offload params and optimizer (`zero3_bf16_cpuoffload_all.json`)

-::: {.callout-tip}
-Choose the configuration that offloads the least amount to memory while still being able to fit in VRAM for best performance.
-Start from Stage 1 -> Stage 2 -> Stage 3.
-:::
+Choose based on your memory requirements and performance needs.

 ## FSDP {#sec-fsdp}
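Both routes shown in the usage snippet resolve to the same trainer setting; a short sketch (config names illustrative):

```bash
# via config: set `deepspeed: deepspeed_configs/zero2.json` in config.yml, then
axolotl train config.yml

# or override on the command line
axolotl train config.yml --deepspeed deepspeed_configs/zero2.json
```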

View File

@@ -530,7 +530,7 @@ trl:
 ```

 ```bash
-CUDA_VISIBLE_DEVICES=2,3 axolotl vllm-serve grpo.yaml
+CUDA_VISIBLE_DEVICES=2,3 axolotl vllm_serve grpo.yaml
 ```

 Your `vLLM` instance will now attempt to spin up, and it's time to kick off training using our remaining two GPUs. In another terminal, execute:
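The training command itself falls outside this hunk; a plausible sketch of launching it on the remaining two GPUs (device indices assumed from the text above):

```bash
CUDA_VISIBLE_DEVICES=0,1 axolotl train grpo.yaml
```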

View File

@@ -27,9 +27,6 @@ To enable sequence parallelism, add the following to your configuration file:
 sequence_parallel_degree: 4  # Split sequences across 4 GPUs
 # Optional; strides across the key dimension. Larger values use more memory but should make training faster.
 heads_k_stride: 1
-# Optional; one of "varlen_llama3", "batch_ring", "batch_zigzag", "batch_stripe". Defaults to
-# "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise.
-ring_attn_func:
 ```

 The `sequence_parallel_degree` should be a divisor of the total number of GPUs. For example:
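As a concrete illustration of the divisor rule (GPU counts assumed): with 8 GPUs, valid degrees include 2, 4, and 8; a degree of 4 forms two sequence-parallel groups, each splitting a sequence across 4 GPUs.

```yaml
# hypothetical 8-GPU setup
sequence_parallel_degree: 4  # 8 / 4 = 2 groups; each sequence is split across 4 GPUs
heads_k_stride: 1            # must also divide the model's KV head count
```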

View File

@@ -8,6 +8,7 @@ tokenizer_type: GPT2Tokenizer
 trust_remote_code: true
 tokenizer_use_fast: true
 tokenizer_legacy: true
+strict: false
 push_dataset_to_hub:
 hf_use_auth_token: true
 datasets:

View File

@@ -4,6 +4,7 @@ base_model: cerebras/Cerebras-GPT-1.3B
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned

View File

@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer
 load_in_8bit: true
 load_in_4bit: false
+strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test

View File

@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test

View File

@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer
 load_in_8bit: true
 load_in_4bit: false
+strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test

View File

@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test

View File

@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer
 load_in_8bit: true
 load_in_4bit: false
+strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test

View File

@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test

View File

@@ -4,6 +4,7 @@ tokenizer_type: AutoTokenizer
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 # huggingface repo
 chat_template: cohere

View File

@@ -3,6 +3,7 @@ base_model: LnL-AI/dbrx-base-converted-v2
 # hub_model_id: username/custom_model_name
 trust_remote_code: true
+strict: false
 datasets:
   - path: tatsu-lab/alpaca

View File

@@ -6,6 +6,7 @@ trust_remote_code: true
 load_in_8bit: true
 load_in_4bit: false
+strict: false
 datasets:
   - path: tatsu-lab/alpaca

View File

@@ -3,6 +3,7 @@ base_model: LnL-AI/dbrx-base-converted-v2
 # hub_model_id: username/custom_model_name
 trust_remote_code: true
+strict: false
 datasets:
   - path: tatsu-lab/alpaca

View File

@@ -1,58 +0,0 @@
base_model: agentica-org/DeepCoder-14B-Preview
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false
strict: false

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template
    field_messages: messages
    message_property_mappings:
      role: role
      content: content
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out

sequence_len: 4096
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true

adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_steps: 10
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

View File

@@ -1,58 +0,0 @@
base_model: deepcogito/cogito-v1-preview-llama-3B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false
strict: false

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template
    field_messages: messages
    message_property_mappings:
      role: role
      content: content
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out

sequence_len: 4096
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true

adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_steps: 10
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

View File

@@ -1,58 +0,0 @@
base_model: deepcogito/cogito-v1-preview-qwen-14B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false
strict: false

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template
    field_messages: messages
    message_property_mappings:
      role: role
      content: content
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out

sequence_len: 4096
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true

adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_steps: 10
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

View File

@@ -2,6 +2,7 @@ base_model: deepseek-ai/DeepSeek-V2-Lite
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 trust_remote_code: true
+strict: false
 datasets:
   - path: tatsu-lab/alpaca

View File

@@ -6,6 +6,7 @@ trust_remote_code: true
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 plugins:

View File

@@ -11,6 +11,7 @@ trust_remote_code: true
 load_in_8bit: true
 load_in_4bit: false
 gptq: false
+strict: false
 push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned

View File

@@ -15,6 +15,7 @@ load_in_8bit: false
 # enable 4bit for QLoRA
 load_in_4bit: true
 gptq: false
+strict: false
 push_dataset_to_hub:
 datasets:
   - path: QingyiSi/Alpaca-CoT

View File

@@ -8,6 +8,7 @@ tokenizer_type: AutoTokenizer
 # required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
 trust_remote_code: true
 gptq: false
+strict: false
 push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned

View File

@@ -8,6 +8,7 @@ tokenizer_type: AutoTokenizer
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 # huggingface repo
 datasets:

View File

@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 # huggingface repo
 chat_template: gemma

View File

@@ -5,6 +5,7 @@ num_labels: 1
 tokenizer_type: AutoTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
+strict: false
 reward_model: true
 chat_template: gemma

View File

@@ -10,6 +10,7 @@ ddp_find_unused_parameters: true
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 # huggingface repo
 chat_template: gemma3

View File

@@ -1,4 +1,5 @@
 base_model: google/gemma-3-4b-it
+strict: false
 load_in_4bit: true

View File

@@ -1,5 +1,6 @@
 base_model: google/gemma-3-4b-it
 processor_type: AutoProcessor
+strict: false
 load_in_4bit: true

View File

@@ -4,6 +4,7 @@ base_model: EleutherAI/gpt-j-6b
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned

View File

@@ -6,6 +6,7 @@ trust_remote_code: true
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test

View File

@@ -5,6 +5,7 @@ trust_remote_code: true
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test

View File

@@ -5,6 +5,7 @@ tokenizer_type: AutoTokenizer
 # hub_model_id: username/custom_model_name
 load_in_4bit: true
+strict: false
 use_tensorboard: true
 chat_template: jamba
 datasets:

View File

@@ -4,6 +4,7 @@ model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
+strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test

View File

@@ -10,6 +10,7 @@ gptq_disable_exllama: true
 tokenizer_use_fast: true
 tokenizer_legacy: true
+strict: false
 push_dataset_to_hub:
 hf_use_auth_token: true
 datasets:

View File

@@ -4,6 +4,7 @@ model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
+strict: false
 datasets:
   - path: teknium/GPT4-LLM-Cleaned

View File

@@ -4,6 +4,7 @@ model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
+strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test

View File

@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer
 load_in_8bit: true
 load_in_4bit: false
+strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test

View File

@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 datasets:
   - path: yahma/alpaca-cleaned

View File

@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test

View File

@@ -5,6 +5,7 @@ tokenizer_type: LlamaTokenizer
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 datasets:
   - path: teknium/GPT4-LLM-Cleaned

View File

@@ -4,6 +4,7 @@ processor_type: AutoProcessor
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
+strict: false
 # these 3 lines are needed for now to handle vision chat templates w images
 skip_prepare_dataset: true

View File

@@ -9,6 +9,7 @@ liger_rms_norm: true
 liger_glu_activation: true
 liger_fused_linear_cross_entropy: true
+strict: false
 chat_template: llama3
 datasets:

View File

@@ -1,6 +1,7 @@
 base_model: NousResearch/Meta-Llama-3.1-8B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
+strict: false
 datasets:
   - path: tatsu-lab/alpaca

View File

@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
 load_in_8bit: true
 load_in_4bit: false
+strict: false
 chat_template: llama3
 rl: dpo

View File

@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
 load_in_8bit: true
 load_in_4bit: false
+strict: false
 chat_template: llama3
 datasets:

View File

@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
 load_in_8bit: true
 load_in_4bit: false
+strict: false
 chat_template: llama3
 rl: dpo

View File

@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
 load_in_8bit: true
 load_in_4bit: false
+strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test

View File

@@ -1,6 +1,7 @@
 base_model: NousResearch/Llama-3.2-1B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
+strict: false
 datasets:
   - path: teknium/GPT4-LLM-Cleaned

View File

@@ -1,6 +1,7 @@
 base_model: NousResearch/Llama-3.2-1B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
+strict: false
 datasets:
   - path: teknium/GPT4-LLM-Cleaned

View File

@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
 load_in_8bit: true
 load_in_4bit: false
+strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test

View File

@@ -1,6 +1,7 @@
 base_model: NousResearch/Llama-3.2-1B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
+strict: false
 datasets:
   - path: teknium/GPT4-LLM-Cleaned

View File

@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
 load_in_8bit: true
 load_in_4bit: false
+strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test

View File

@@ -4,6 +4,7 @@ base_model: meta-llama/Llama-3.2-1B
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 rl: kto
 rl_beta: 0.5

View File

@@ -4,6 +4,7 @@ base_model: NousResearch/Llama-3.2-1B
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 datasets:
   - path: teknium/GPT4-LLM-Cleaned

View File

@@ -5,6 +5,7 @@ tokenizer_type: AutoTokenizer
 # hub_model_id: username/custom_model_name
 load_in_4bit: true
+strict: false
 datasets:
   - path: tatsu-lab/alpaca

View File

@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer # PreTrainedTokenizerFast
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 datasets:
   - path: tatsu-lab/alpaca

View File

@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 datasets:
   - path: aaditya/alpaca_subset_1

View File

@@ -1,28 +1,16 @@
 # Llama 4 by Meta AI

-## Flash Attention vs Flex Attention
-
-While Flash Attention support is "enabled" for Llama 4, the upstream implementation is not correct, and use of Flex Attention is recommended instead.

 ## Available Examples

 ### Llama 4 Scout 17Bx16Experts (109B)

-- [Multi-Modal/Vision QLoRA w/ FSDP1](./scout-vision-qlora-fsdp.yaml)
-- [Text Single GPU (H100) QLoRA](./scout-qlora-single-h100.yaml)
-- [Text Multi GPU QLoRA w/ FSDP1](./scout-qlora-fsdp1.yaml)
-
-Flex Attention
+Our Single H100 implementation for Llama 4 Scout uses only 68.5GB VRAM for post-training with 4k context length @ 546 tokens/second. [WandB logs here](https://wandb.ai/axolotl-ai/llama4-sft/runs/zic56rhd)

 - [Text Single GPU (H100) QLoRA](./scout-qlora-single-h100-flex.yaml)
 - [Text Multi GPU QLoRA w/ FSDP2](./scout-qlora-flexattn-fsdp2.yaml)

-[//]: # (Flash Attention &#40;Do not use&#41;)
-[//]: # (- [Multi-Modal/Vision QLoRA w/ FSDP1]&#40;./scout-vision-qlora-fsdp.yaml&#41;)
-[//]: # (- [Text Single GPU &#40;H100&#41; QLoRA]&#40;./scout-qlora-single-h100.yaml&#41;)
-[//]: # (- [Text Multi GPU QLoRA w/ FSDP1]&#40;./scout-qlora-fsdp1.yaml&#41;)
-
-Our Single H100 implementation for Llama 4 Scout uses only 64.5GB VRAM for post-training with 4k context length @ 519 tokens/second. [WandB logs here](https://wandb.ai/axolotl-ai/llama4-flexattn-qlora/runs/wpie7dkj)
-Multi-GPU (4xH100) for Llama 4 Scout uses 62.8GB VRAM/GPU @ 4k context length @ 280 tokens/second/GPU. [WandB logs here](https://wandb.ai/axolotl-ai/llama4-flexattn-qlora/runs/2lkezdj8)

 ### Llama 4 Maverick 17Bx128Experts (400B)

-Coming Soon
+- [Text Multi GPU QLoRA w/FSDP1](./maverick-qlora-fsdp1.yaml)
+
+Our 4xH100 implementation for Llama 4 Maverick uses 79.5GB VRAM/GPU for post-training with 4k context length @ 206 tokens/second. [WandB logs here.](https://wandb.ai/axolotl-ai/llama-sft/runs/siyvwuxc?nw=nwuserwinglian)
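For reference, these example configs launch like any other Axolotl config; a sketch assuming the examples have been fetched into the default location (the `examples/llama-4/` path is inferred from the relative links above):

```bash
axolotl fetch examples
axolotl train examples/llama-4/maverick-qlora-fsdp1.yaml
```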

View File

@@ -3,6 +3,7 @@ model_type: Llama4ForConditionalGeneration
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
+strict: false
 plugins:
   - axolotl.integrations.liger.LigerPlugin

View File

@@ -1,86 +0,0 @@
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
model_type: Llama4ForConditionalGeneration
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_glu_activation: true
liger_rms_norm: true
liger_layer_norm: true

llama4_linearized_experts: true
load_in_4bit: true
adapter: qlora

lora_r: 32
lora_alpha: 64
lora_target_modules:
  - self_attn.q_proj
  - self_attn.k_proj
  - self_attn.v_proj
  - self_attn.o_proj
  - shared_expert.gate_proj
  - shared_expert.up_proj
  - shared_expert.down_proj
  # - experts.gate_projs.[0-9]+$
  # - experts.up_projs.[0-9]+$
  # - experts.down_projs.[0-9]+$
lora_modules_to_save:
  # - lm_head
  # - embed_tokens

chat_template: llama4
datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true

gradient_accumulation_steps: 1
micro_batch_size: 2
num_epochs: 3

optimizer: adamw_torch_4bit
lr_scheduler: cosine
learning_rate: 1e-4

bf16: true
tf32: true

logging_steps: 1
flex_attention: true
flex_attn_compile_kwargs:
  dynamic: false
  mode: max-autotune-no-cudagraphs

warmup_steps: 10
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0

fsdp:
  - auto_wrap
  - full_shard
fsdp_config:
  fsdp_version: 2
  fsdp_offload_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
  fsdp_state_dict_type: SHARDED_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_reshard_after_forward: true
  fsdp_activation_checkpointing: true

special_tokens:
  pad_token: <|finetune_right_pad_id|>
  eos_token: <|eot|>

View File

@@ -3,6 +3,7 @@ model_type: Llama4ForConditionalGeneration
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
+strict: false
 # torch_compile: true
 plugins:

View File

@@ -1,85 +0,0 @@
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
model_type: Llama4ForConditionalGeneration
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.liger.LigerPlugin
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

liger_glu_activation: true
liger_rms_norm: true
liger_layer_norm: true
cut_cross_entropy: true

llama4_linearized_experts: true  # needed with custom linearized experts model
load_in_4bit: true
adapter: qlora

lora_r: 32
lora_alpha: 64
lora_target_modules:
  - self_attn.q_proj
  - self_attn.k_proj
  - self_attn.v_proj
  - self_attn.o_proj
  - shared_expert.gate_proj
  - shared_expert.up_proj
  - shared_expert.down_proj
  # - experts.gate_projs.[0-9]+$  # optionally train the moe experts
  # - experts.up_projs.[0-9]+$
  # - experts.down_projs.[0-9]+$
lora_modules_to_save:
  # - lm_head  # needed if modifying vocabulary
  # - embed_tokens

lora_mlp_kernel: true
lora_qkv_kernel: true
lora_o_kernel: true

chat_template: llama4
datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

sequence_len: 4096  # up to 8k will work on a single H100
sample_packing: true
pad_to_sequence_len: true

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1

optimizer: adamw_torch_4bit
lr_scheduler: cosine
learning_rate: 1e-4

bf16: true
tf32: true
torch_compile: true

flex_attention: true
flex_attn_compile_kwargs:
  dynamic: false
  mode: max-autotune-no-cudagraphs

gradient_checkpointing: offload
gradient_checkpointing_kwargs:
  use_reentrant: false

logging_steps: 1
warmup_steps: 20
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0

special_tokens:
  pad_token: <|finetune_right_pad_id|>
  eos_token: <|eot|>

View File

@@ -3,6 +3,7 @@ model_type: Llama4ForConditionalGeneration
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
+strict: false
 plugins:
   - axolotl.integrations.liger.LigerPlugin

View File

@@ -4,6 +4,7 @@ processor_type: Llama4Processor
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
+strict: false
 # these 3 lines are needed for now to handle vision chat templates w images
 skip_prepare_dataset: true

View File

@@ -1,89 +0,0 @@
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
model_type: Llama4ForConditionalGeneration
processor_type: Llama4Processor
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

sequence_len: 4096

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_glu_activation: true
liger_rms_norm: true
liger_layer_norm: true

llama4_linearized_experts: true  # use Axolotl's customized model
load_in_4bit: true
adapter: qlora

lora_r: 32
lora_alpha: 64
lora_target_modules:
  - self_attn.q_proj
  - self_attn.k_proj
  - self_attn.v_proj
  - self_attn.o_proj
  - shared_expert.gate_proj
  - shared_expert.up_proj
  - shared_expert.down_proj
  - vision_adapter.mlp.fc1
  - vision_adapter.mlp.fc2
  # - experts.gate_projs.[0-9]+$
  # - experts.up_projs.[0-9]+$
  # - experts.down_projs.[0-9]+$
lora_modules_to_save:
  - lm_head
  - embed_tokens

chat_template: llama4
datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
    type: chat_template
    split: train[:1%]
    field_messages: messages

dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1

optimizer: adamw_torch_4bit
lr_scheduler: cosine
learning_rate: 1e-4

bf16: true
tf32: true

logging_steps: 1
flex_attention: true
flex_attn_compile_kwargs:
  dynamic: false
  mode: max-autotune-no-cudagraphs

warmup_steps: 10
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0

fsdp:
  - auto_wrap
  - full_shard
fsdp_config:
  fsdp_version: 2
  fsdp_offload_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
  fsdp_state_dict_type: SHARDED_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_reshard_after_forward: true
  fsdp_activation_checkpointing: true

special_tokens:
  pad_token: <|finetune_right_pad_id|>
  eos_token: <|eot|>

View File

@@ -1,5 +1,6 @@
 base_model: llava-hf/llava-1.5-7b-hf
 processor_type: AutoProcessor
+strict: false
 # these 3 lines are needed for now to handle vision chat templates w images
 skip_prepare_dataset: true

View File

@@ -5,6 +5,7 @@ tokenizer_type: AutoTokenizer
 tokenizer_config: EleutherAI/gpt-neox-20b
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
+strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test

View File

@@ -6,6 +6,7 @@ tokenizer_type: LlamaTokenizer
 # hub_model_id: username/custom_model_name
 trust_remote_code: true
+strict: false
 unfrozen_parameters:
   - ^lm_head.weight$

View File

@@ -4,6 +4,7 @@ model_type: MistralForCausalLM
 tokenizer_type: LlamaTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
+strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test

View File

@@ -4,6 +4,7 @@ model_type: MistralForCausalLM
 tokenizer_type: LlamaTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
+strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test

View File

@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer
 load_in_8bit: true
 load_in_4bit: false
+strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test

View File

@@ -12,6 +12,7 @@ tokenizer_type: LlamaTokenizer
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 chat_template: chatml
 rl: dpo

View File

@@ -9,6 +9,7 @@ trust_remote_code: true
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 datasets:
   - path: tatsu-lab/alpaca

View File

@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 rl: orpo
 orpo_alpha: 0.1

View File

@@ -1,5 +1,6 @@
 base_model: mistralai/Mistral-Small-3.1-24B-Instruct-2503
 processor_type: AutoProcessor
+strict: false
 load_in_8bit: true

View File

@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 datasets:
   - path: tatsu-lab/alpaca

View File

@@ -9,6 +9,7 @@ trust_remote_code: true
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 datasets:
   - path: tatsu-lab/alpaca

View File

@@ -9,6 +9,7 @@ trust_remote_code: true
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 datasets:
   - path: tatsu-lab/alpaca

View File

@@ -6,6 +6,7 @@ tokenizer_type: LlamaTokenizer
 # hub_model_id: username/custom_model_name
 trust_remote_code: true
+strict: false
 unfrozen_parameters:
   - ^lm_head.weight$

View File

@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test

View File

@@ -4,6 +4,7 @@ model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
+strict: false
 push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned

View File

@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer
 load_in_8bit: true
 load_in_4bit: false
+strict: false
 push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned

View File

@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned

View File

@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
 load_in_8bit: true
 load_in_4bit: false
+strict: false
 chat_template: phi_3
 datasets:

View File

@@ -4,6 +4,7 @@ model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
+strict: false
 datasets:
   - path: garage-bAInd/Open-Platypus

View File

@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
 load_in_8bit: false
 load_in_4bit: true
+strict: false
 datasets:
   - path: garage-bAInd/Open-Platypus

View File

@@ -4,6 +4,7 @@ model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
+strict: false
 datasets:
   - path: garage-bAInd/Open-Platypus

View File

@@ -4,6 +4,7 @@ model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
+strict: false
 datasets:
   - path: mhenrichsen/alpaca_2k_test

View File

@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
 # hub_model_id: username/custom_model_name
 chat_template: phi_3
+strict: false
 datasets:
   - path: garage-bAInd/Open-Platypus

Some files were not shown because too many files have changed in this diff.