c6da9b9e92
Update SETUP_MIAAI.md: add bare Ubuntu rebuild section (driver, packages, Ollama)
activeblue/main
tocmo0nlord
2026-05-13 21:33:02 +00:00
c7c4885369
Update SETUP_MIAAI.md: pre-training checklist, Ollama stop/start, verify script, corrected training time
tocmo0nlord
2026-05-13 21:19:15 +00:00
981a13e110
Update human_chat_qlora.yml: working config for RTX 5080 (seq_len 2048, qlora, chat_template)
tocmo0nlord
2026-05-13 18:59:19 +00:00
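A minimal sketch of what the working settings in human_chat_qlora.yml likely look like, using standard Axolotl keys. Only seq_len 2048, QLoRA, and chat_template are confirmed by the commit message; every other key and value below is an assumption for a 16 GB RTX 5080:

    base_model: meta-llama/Llama-3.1-8B-Instruct  # assumed from the "Llama 3.1 8B" commit further down
    load_in_4bit: true
    adapter: qlora                 # confirmed by the commit message
    lora_r: 32                     # illustrative rank/alpha
    lora_alpha: 16
    lora_target_linear: true
    sequence_len: 2048             # confirmed by the commit message
    chat_template: llama3          # chat_template per the commit message; llama3 assumed
    datasets:
      - path: Open-Orca/SlimOrca   # per the SlimOrca commit further down
        type: chat_template
    micro_batch_size: 1            # conservative values assumed for 16 GB VRAM
    gradient_accumulation_steps: 8
    gradient_checkpointing: true
    bf16: true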
74f2263ac7
Update SETUP_MIAAI.md: bitsandbytes sm_120 patch, OOM fixes, working training config
tocmo0nlord
2026-05-13 18:58:51 +00:00
8693a1f61b
fix Dockerfile-base-next: cuda 12.8.2, miniforge, sm_120
tocmo0nlord
2026-05-13 14:37:01 +00:00
71c6a56e7a
switch to HQQ quantization to bypass bitsandbytes sm_120 issue
tocmo0nlord
2026-05-13 13:55:52 +00:00
38adf5cd37
add trust_remote_code, explicit bfloat16 and bnb dtype settings
tocmo0nlord
2026-05-13 13:32:46 +00:00
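The dtype settings this commit describes, sketched with Axolotl's bnb_config_kwargs passthrough to BitsAndBytesConfig; the exact keys in the repo's config are not visible in this log, so treat the bnb block as illustrative:

    trust_remote_code: true
    bf16: true                          # explicit bfloat16, per the commit message
    bnb_config_kwargs:                  # forwarded to bitsandbytes' BitsAndBytesConfig
      bnb_4bit_compute_dtype: bfloat16  # the "bnb dtype" setting named in the message
      bnb_4bit_quant_type: nf4          # assumed default
      bnb_4bit_use_double_quant: true   # assumed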
3f29fa017b
replace Capybara with SlimOrca (compatible ShareGPT format)
tocmo0nlord
2026-05-13 12:58:29 +00:00
c02a76f132
fix field_messages mapping for Capybara/OpenHermes ShareGPT format
tocmo0nlord
2026-05-13 12:56:03 +00:00
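ShareGPT-style datasets (Capybara, OpenHermes) store turns as {from, value} pairs rather than {role, content}, so the loader needs an explicit field mapping. A sketch using Axolotl's chat-template dataset keys; the field names follow the usual ShareGPT convention and are not read from the repo:

    datasets:
      - path: LDJnr/Capybara          # illustrative dataset id
        type: chat_template
        field_messages: conversations # ShareGPT's list-of-turns field
        message_field_role: from      # maps ShareGPT "from" to role
        message_field_content: value  # maps ShareGPT "value" to content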
b9ceebfe7e
fix deprecated type:sharegpt and flash_attention config keys
tocmo0nlord
2026-05-13 12:52:25 +00:00
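For context, a before/after sketch of the deprecations this commit fixes; the replacement attention key matches the canonical attn_implementation migration further down this log, and the dataset id is illustrative:

    # before (deprecated keys)
    datasets:
      - path: LDJnr/Capybara
        type: sharegpt
    flash_attention: true

    # after
    datasets:
      - path: LDJnr/Capybara
        type: chat_template
    attn_implementation: flash_attention_2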
e9a3fd483f
add human-like QLoRA training config for Llama 3.1 8B
tocmo0nlord
2026-05-13 12:50:35 +00:00
eadd15c960
note MAX_JOBS for flash-attn compile speed
tocmo0nlord
2026-05-13 04:45:21 +00:00
396ce4a9dd
add miaai environment setup guide
tocmo0nlord
2026-05-13 04:16:03 +00:00
b7ec06b8a1
Add optional Axolotl MoRA/ReMoRA integration (#3647) [skip ci]
Wing Lian
2026-05-12 07:19:55 -04:00
e2f01de0e8
Fix Axolotl ReLoRA optimizer reset scope (#3646)
Wing Lian
2026-05-09 17:52:35 -04:00
5352d41d32
feat: systemic multimodal assistant-only loss masking + `cfg.role_boundaries` (#3625)
thad0ctor
2026-05-05 08:25:39 -07:00
c15f6cffe2
fix: FSDP FULL_STATE_DICT OOM from memory leak (#3635)
VED
2026-05-05 20:52:35 +05:30
e4032fc90f
Refactor separate attention flags with attn_implementation and capability/concern feature flags (#3602)
Wing Lian
2026-05-05 10:15:18 -04:00
6136ae627b
Fix: add bitnet config (#3636)
Younes B
2026-04-30 20:30:56 +04:00
e662972a29
Feat: Add bitnet integration (#3634)
Younes B
2026-04-30 19:25:02 +04:00
ebbd7fa847
feat: Add Mistral Medium 3.5 (#3633)
NanoCode012
2026-04-29 22:46:51 +07:00
ac77da96da
use smaller pretrained models for ci (#3620) [skip ci]
Wing Lian
2026-04-27 13:22:56 -04:00
993db05b3a
fix losses
smol-ci
Wing Lian
2026-04-25 08:44:37 +00:00
464de78f6d
regroup attn_implementation tests by feature concern
attn-implementation-refactor
Wing Lian
2026-04-25 08:57:56 +00:00
7a41b47d22
drop "Phase 2" naming from attn-implementation tests
Wing Lian
2026-04-25 08:54:14 +00:00
6886def92c
fix duplicate attn_implementation in gpt-oss yamls and flaky caplog tests
Wing Lian
2026-04-25 08:53:28 +00:00
1b9520cc8b
more train steps
Wing Lian
2026-04-25 02:17:48 +00:00
79255ffdd7
Built site for gh-pages
gh-pages
Quarto GHA Workflow Runner
2026-04-24 09:09:53 +00:00
798c8fba89
chore: update docker docs (#3623)
main
NanoCode012
2026-04-24 16:03:21 +07:00
17fc747f99
fix: docker build failing (#3622)
NanoCode012
2026-04-24 14:23:09 +07:00
f77408a3d0
fix tests
Wing Lian
2026-04-23 23:47:28 +00:00
aeca18a8b0
remove dead gemma4 branch in _set_attention_config
Wing Lian
2026-04-23 22:22:56 +00:00
434a484fe9
update doc snippets + reject gemma4-hybrid with non-FA2 backend
Wing Lian
2026-04-23 22:18:02 +00:00
39226623d2
migrate example configs to canonical attn_implementation
Wing Lian
2026-04-23 22:15:07 +00:00
2d64d009d8
expand attention tests + rewrite docs
Wing Lian
2026-04-23 21:30:20 +00:00
a0d24bcc19
migrate remaining consumers to canonical attn_implementation
Wing Lian
2026-04-23 21:26:18 +00:00
bce65e3332
move attention-dependent validators to mode=after
Wing Lian
2026-04-23 21:23:11 +00:00
2579c496d5
make attn_implementation the single source of truth
Wing Lian
2026-04-23 21:17:10 +00:00
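The end state of this series: per-backend booleans collapse into one canonical key, and capability flags are derived from it during config normalization (see the "compute attn capability flags in normalizer" commit below). flash_attention appears earlier in this log as a legacy flag; sdp_attention is assumed:

    # before: independent booleans, mutually exclusive only by convention
    flash_attention: true
    sdp_attention: false

    # after: one canonical key; capability flags are computed from it
    attn_implementation: flash_attention_2   # or sdpa, eager, flex_attention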
35d43fe141
compute attn capability flags in normalizer instead of properties
Wing Lian
2026-04-12 23:46:44 -04:00
ff5d6393c8
replace legacy attention boolean flags with capability properties
Wing Lian
2026-04-12 22:01:09 -04:00
aee8c75d64
refactor attention handling
Wing Lian
2026-04-01 16:57:34 +00:00
c4f986874d
chore: lint
Wing Lian
2026-04-01 16:08:43 +00:00
28e89a5c16
upgrade to torchao 0.17.0
Wing Lian
2026-04-01 16:04:55 +00:00
98e18d59d2
larger torch cuda arch list coverage for older cuda
torch-211-base
Wing Lian
2026-04-23 15:05:01 -04:00
462135acfb
add pytorch 2.11 to base images
Wing Lian
2026-04-23 14:57:49 -04:00
8495c79fb1
properly handle kernels repo type
kernelize-scattermoe-lora
Wing Lian
2026-04-23 14:56:16 -04:00
901f2356bc
dpo collation/padding (#3601) [skip ci]
Wing Lian
2026-04-23 14:49:52 -04:00
5db4272f69
more steps for loss check
Wing Lian
2026-04-23 18:43:18 +00:00
431888c1de
use smaller pretrained models for ci
Wing Lian
2026-04-23 13:51:01 +00:00
ac48bfadba
Built site for gh-pages
Quarto GHA Workflow Runner
2026-04-23 04:33:48 +00:00
1bf65c500e
feat: add processor_kwargs YAML field forwarded to from_pretrained (#3612)
thad0ctor
2026-04-22 21:26:34 -07:00
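Per the commit message, the new field is forwarded verbatim to the processor's from_pretrained call; a sketch for a vision model, where the specific kwargs are illustrative Qwen-VL-style image-resolution arguments, not taken from the PR:

    processor_kwargs:      # forwarded to from_pretrained, per the commit message
      min_pixels: 200704   # illustrative: 256 * 28 * 28
      max_pixels: 1003520  # illustrative: 1280 * 28 * 28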
bcbe049c21
Feat: add support for datasets with messages field saved as str (#3607)
brightwind26
2026-04-23 08:25:48 +04:00
90090fa9e8
DPO: support loss types (#3566)
Andrew Wu
2026-04-23 05:25:28 +01:00
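A sketch of selecting a DPO loss variant; rl: dpo is Axolotl's existing RL switch, but the loss-type key name and its values are assumed here (TRL's DPO losses include sigmoid, hinge, and ipo):

    rl: dpo
    dpo_loss: ipo    # key name and values assumed; e.g. sigmoid | hinge | ipo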
c7ad3c8e22
Built site for gh-pages
Quarto GHA Workflow Runner
2026-04-22 13:12:17 +00:00
7420fd4de6
fix async prefetch with nemogym (#3606)
Wing Lian
2026-04-22 09:05:46 -04:00
918d02d7a9
Built site for gh-pages
Quarto GHA Workflow Runner
2026-04-22 05:21:50 +00:00
05113bc91a
train on remote compute using Tinker-compatible APIs (#3614)
Wing Lian
2026-04-22 01:14:41 -04:00
9a0d3016df
first pass at build and deploy scattermoe-lora kernel
Wing Lian
2026-04-22 01:10:01 -04:00
70b4b68acf
Built site for gh-pages
Quarto GHA Workflow Runner
2026-04-21 21:56:08 +00:00
e562e149ce
fix: [gemma4] VRAM leak in hybrid FA2+SDPA (hybrid attention) path under activation check… (#3611)
thad0ctor
2026-04-21 14:49:58 -07:00
f18c2bb1f8
Built site for gh-pages
Quarto GHA Workflow Runner
2026-04-21 14:23:11 +00:00
9de5b76336
feat: move to uv first (#3545)
NanoCode012
2026-04-21 21:16:03 +07:00
d17ed89a3c
add missing file
swe-rebench-rl-rebase
Wing Lian
2026-04-21 08:44:01 -04:00
02e4f2350d
fixes for scattermoe from latest peft upgrade
Wing Lian
2026-04-21 08:00:16 -04:00
cec99c4133
fix test dims
vllm-0191
Wing Lian
2026-04-21 00:44:26 +00:00
4195605ab2
fix test dims
Wing Lian
2026-04-21 00:44:26 +00:00
37acb28d02
fix einsum dims
Wing Lian
2026-04-20 23:09:47 +00:00
d248242490
support for vllm 0.19.1
Wing Lian
2026-04-19 18:09:46 -04:00
4a5281e61a
Fix shape
Wing Lian
2026-04-19 01:53:05 +00:00
a892d8cce1
chore: lint
Wing Lian
2026-04-17 17:48:26 +00:00
78de2919a6
tiled mlp fix for gemma4
Wing Lian
2026-04-16 13:24:41 +00:00
4696e9911f
Built site for gh-pages
Quarto GHA Workflow Runner
2026-04-15 13:33:49 +00:00
28283ff373
revert shared_kv_states workaround with transformers 5.5.4
Wing Lian
2026-04-15 13:32:59 +00:00
dc16859983
[gemma4] fix fused RMSNorm+RoPE on hybrid attention models
Wing Lian
2026-04-15 12:59:00 +00:00
d4e9cf2eec
lint
Wing Lian
2026-04-14 17:26:00 -04:00
53391a10d7
vllm-serve-lora: add /v1/completions route + worker pipe lock
Wing Lian
2026-04-14 15:52:02 +00:00
7617b951a8
make _maybe_sync_vllm_weights actually fire in sync mode
Wing Lian
2026-04-13 18:30:16 +00:00
e993ed5208
retry head-server probe with longer timeout
Wing Lian
2026-04-13 18:29:55 +00:00
69f165b39b
probe vLLM weight-sync routes and select transport per server
Wing Lian
2026-04-13 18:29:45 +00:00
80a97f192b
validate batch shape against num_generations at config time
Wing Lian
2026-04-13 18:29:22 +00:00
323da791eb
bump transformers to 5.5.4 and trl to latest 1.1.0 (#3603)
Wing Lian
2026-04-15 09:27:03 -04:00
6867872c76
Built site for gh-pages
Quarto GHA Workflow Runner
2026-04-13 00:59:30 +00:00
6990478163
fix: rename model to adapter_model for fsdp sharded final model (#3585)
NanoCode012
2026-04-13 07:51:30 +07:00
63a58cfec1
feat: support excess_length_strategy for RL trainers (#3578) [skip ci]
ゆり
2026-04-13 08:51:10 +08:00
3985ec2f67
feat: add FineGrainedFP8Config support for model quantization (#3587) [skip ci]
madScientist10
2026-04-13 03:50:37 +03:00
a44edda6d7
Skip redundant evaluation when resuming from checkpoint (#3575) [skip ci]
Joaquin Hui
2026-04-13 01:50:15 +01:00
afd0657e08
Built site for gh-pages
Quarto GHA Workflow Runner
2026-04-12 15:04:55 +00:00
66c3e5a3fd
better handling of DoRA merge on Conv layers in Qwen 3.5 (#3599)
Wing Lian
2026-04-12 10:57:45 -04:00
6a6a6329a0
Built site for gh-pages
Quarto GHA Workflow Runner
2026-04-12 14:37:16 +00:00
b8358aa5ab
[gemma4] use mixed Flash Attention and SDPA and add fused RMSNorm+RoPE Triton kernels (#3598)
Wing Lian
2026-04-12 10:29:55 -04:00
e079cf16a2
qwen3_5.jinja: handle list content on system messages (#3595) [skip ci]
Joaquin Hui
2026-04-12 05:58:58 +01:00
e2f69828d2
[fix][fsdp2] clone sharded param so original full-size shard can be gc'ed (#3597) [skip ci]
Wing Lian
2026-04-11 20:22:35 -04:00
122b50bad6
pre-cache the eot token ids rather than computing them on each iteration (#3594) [skip ci]
Wing Lian
2026-04-11 20:05:21 -04:00
21b0c220c4
Built site for gh-pages
Quarto GHA Workflow Runner
2026-04-10 21:15:42 +00:00
e77a185e86
upgrade transformers to v5.5.3 (#3593)
Wing Lian
2026-04-10 17:08:14 -04:00
d76f8c505c
Built site for gh-pages
Quarto GHA Workflow Runner
2026-04-10 20:53:46 +00:00
29fa4dedbb
Gemma4 fixes and profiler (#3591)
Wing Lian
2026-04-10 16:46:17 -04:00
af1d4c8e78
Built site for gh-pages
Quarto GHA Workflow Runner
2026-04-10 18:18:53 +00:00
315cdeede9
handle trainable/masked spans in content and reasoning content (#3592)
Wing Lian
2026-04-10 14:11:10 -04:00
e7a6a5b529
fix: move warning after we've set any overrides (#3589) [skip ci]
NanoCode012
2026-04-11 00:00:47 +07:00