axolotl

tocmo0nlord/axolotl

Fork 0

Commit Graph

Select branches

Hide Pull Requests

1947fix

1991test

20230920-btlm

20231212-fixes

20240216-updates

20240307-updates

20240404-lisa-determinism

3181

4bit-optimizers

775-option-to-drop-vs-truncate-on-rows-longer-than-context-length

NanoCode012-patch-1

accelerator-args-builder

activation-offloading-torchtune

activations

activeblue/main

async-grpo-patched-v2

attention_enum

attn-implementation-refactor

attn-patches

autodoc

autogptq-tests

axolotl-ci-hf

base-model-readme-update

benchmark-callbacks-next

bursteratom-doc-faq-update

chat-dataset-tool

chat-template-granite

chore/docstring-distributed

cj_tokenizer_default_prompt_template

cli-cloud-modal-math-hard

cli-refactor

codecov-pulls-only

coderabbitai/docstrings/3e51a68

coderabbitai/docstrings/QVUilv72ojQNaYsCLVNpUpfo2rK1ZU5x90oPNXYz0ZfsWzWSHca36pjgaU5JOtZOA4gNjbjVYxShdRmkm7fGSlW

coderabbitai/docstrings/b234532

colab-misc-fixes

colab-misc-fixes-test

completion-json

compute-perplexity-metrics

cp-sdpa

cuda-12.8.1

custom-modeling

custom-trainer-cls

datasets-351

datasets-refactor

debug-hf-home-cache

deepspeed-low-cpu-mem

deepspeed_0_14_4

destroy-pg

dev-base

device-mesh

devstral-support

dft

diff-transformer

diffusion-custom-loss

diffusion-custom-models

diffusion-next-token-trainer

djsaunde-patch-1

docker-base-nvcr-pytorch

docker-cleanup-20231029

docs-lint-20250212

dpo-spawn-fix

dump-config

dynamic-sft

e2e-fsdp-trainer

embeddings-resize

enable_tp

eos-hell

exp-expand-len

fa-261

fa-check

fa3-hopper

feat/beautiful-readme

feat/glm45

feat/glmflash-other

feat/liger-deepseekv3

feat/linearize

feat/lmeval-baseten

feat/phi_35_vision

feat/pref_liger

feat/soap-optim-v2

feat/spaces-ui

feat/torchao-qlora

feat/wizard

feat_hqq

feature/attn-patches

feature/enable-huggingface-dataset-revision

feature/relora-rebased

fix-ddp_find_unused_parameters

fix-l3-lora

fix-merge-lint-issue

fix-preview

fix/cce-linear

fix/cp-waste

fix/diffusion

fix/doc-key

fix/dpo-labels

fix/eval-accu

fix/gemma3-text-only

fix/gemma3n-text-attention

fix/granite-speech

fix/hpc-root

fix/issue-1-build-deps

fix/issue-2-flash-attn-install

fix/issue-3-telemetry-whitelist

fix/issue-4-deepspeed-optional

fix/issue-5-8-docs

fix/issue-6-default-attention

fix/issue-7-hf-token-check

fix/kd-trainer-num-items

fix/merge-lora-fp32

fix/replace_jackllama

fix/rl-trainer-arg

fix/vllm-version

fix/xformers

fix_kto

fixtypo

flan-no-bos

flash-attn-2_5_5

flash-attn-fix-patches-wo-sample-packing

flex_patching_update

flx_attn_support

fp8

fsdp-defaults

fsdp-fft

fsdp-fix

fsdp-qdora

fsdp2

fsdp2_fp32

fused-mlp-ez

gh-pages

grouped_lr_squashed

grpo-path

grpo-path-v2

grpo-ref-model-cleanup

grpo_liger

hamelsmu-patch-1

hf-trainer-refactor

hymba_multipack2

ia3-peft

iterable-optional

jagged-restart-lr-scheduler-v3

kd-fix-20250519-v2

kd-logits-view

kd-logprob-data

kd-trainer

kd-trainer-2

kd-trainer-pre

kd-trainer-rebased

kd-trainer-v2

kd-trainer-zscore

keep_in_memory

kernelize-scattermoe-lora

kto_fix

kwargs-refactor

latent-space

lhl-moe-aux-loss-free

liger-063

liger-065

liger-dpo

lisa

llama-4-examples

llama-4-z3

llama-dropout

llama-flash-attn-fix

llama-multipack

llama4

llama4-patches

llava

llava-train

llmcompressor-sft

llmcompressor-sft-v2

llmcompressor-sft-wing

lora-fsdp2-doc

lora-kernels-deepspeed

lora-kernels-doc-fix

lora-quant-state-offset

lora_bf16

lora_kernels_fsdp

main

main-base

map-dataset-fetcher-fix

maverick-example

merge-lora-on-complete

merge-lora-tests

merged-2554

mistral-support

mixtral_optimized

mixtral_swiglu

mm2

mm3

mm_mc_chat

modal-upgrade-builder

model-loader-refactor

moekernels

mora

multi-gpu-state

multipack

multipack-dpo

multipack-pretraining

muon-validation

nca-pair

nd_parallel

neft-v2

no-bos-tokens-packing

no-seq-len

no-zero-ds-train

offload-activations-disk

olmo-no-position_ids

online-topk-kd

openorca

openorca-fix-mask

openorca-v2

optimizer-checkpoint

optimizer-compile

optimizers-refactor

packing-attn-limit-fa2-rebased

patch_lora_post_model_load

peft-update

phi-moe

pixtral_integration

pre-commit-update

preprocess_grpo-fix

pretrain-dataset

print_venv

pytest-each-flakey

pytest-skip-s2

q-galore

quantize-ptq-cli

quartodoc

quartodoc-fix

rala

rala-v2

reentrant-w-offloading

refactor-flash-attention

relaxed-recursive-transformers

release-0.10.x

release-0.8.x

release-v0.11.x

release-v0.12.x

release-v0.13.x

release-v0.9.x

remove-gptq-warn

revert-2332-fix_sample_packing

revert-2906-checkpoint-on-step-1

revert-multipack-changes

rl-trainers-sp

runpod-sls

sac

sageattention

save_only_model

scatter_moe

scatter_moe_eric

scattermoe-lora-optim-dtypestest

scattermoe-nanotron

sdpa-cp

sdpa-multipack

seq-parallel-ring

sequence-parallelism

shampoo

shampoo-low_bit

shared-prepared-ci

sharegpt-batched

sharegpt-field-conversations

smaller-rand-model

smol-ci

soap-optim

sp-fix-masking

sp-restore-buffers

sp-rl

sp-rl-v3

split-batches-sizes

sppo

squash_position_ids

ssmi-main

stable

streaming

streaming-on-the-fly-preprocess

streaming-remote-dataset

streaming-v2

swe-rebench-rl-rebase

telemetry

telemetry-opt-in

tensor-parallel

tensorboard-loss-check

testingci

textui

tinyllama-example

tool-mpm

topk-logprobs-triton

torch-211-base

torch_tensor_parallel

tp_support

train-refactor

transformers-4511

transformers-4513

transformers-4573

transformers-4_47_0_v2

transformers-fsdp-check

transformers-itl-refactor

tui

unsloth_modules

update-examples-llama3-ez

update-lgpl

update-vllm

upgrade-liger-test

upgrade-torchao-0.15

upgrade-trl-v0.12.0_2

upgrade_liger-tr4.46.1

uv-first

uv-fixup

vendor-moe

version-dev

vllm-0191

wait-distributed-close

weight-scale-norm

xformers-wo-packing

yayi2

zero3-8bit-lora

v0.1.0

v0.10.0

v0.10.1

v0.11.0

v0.11.0.post1

v0.12.0

v0.12.1

v0.12.2

v0.13.0

v0.13.1

v0.13.2

v0.14.0

v0.15.0

v0.16.0

v0.16.1

v0.2.0

v0.2.1

v0.3.0

v0.4.0

v0.5.0

v0.5.1

v0.5.1.post1

v0.5.2

v0.6.0

v0.7.0

v0.7.1

v0.8.0

v0.8.1

v0.9.0

v0.9.1

v0.9.1.post1

v0.9.2

7037e3c836 deepseekv2 liger support (#1878) Aman Gupta Karmani 2024-08-27 20:52:40 -07:00
467464f045 Built site for gh-pages Quarto GHA Workflow Runner 2024-08-28 03:09:14 +00:00
c1a61ae23c fix liger plugin load issues (#1876) Aman Gupta Karmani 2024-08-27 20:08:26 -07:00
86c72380b9 Built site for gh-pages Quarto GHA Workflow Runner 2024-08-28 00:23:33 +00:00
159b8b9a74 monkey-patch transformers to simplify monkey-patching modeling code (#1877) Aman Gupta Karmani 2024-08-27 17:22:26 -07:00
969b16f88f Built site for gh-pages Quarto GHA Workflow Runner 2024-08-27 17:40:18 +00:00
1e43660701 Sample pack trust remote code v2 (#1873) Wing Lian 2024-08-27 13:39:24 -04:00
058d8c1e09 Built site for gh-pages Quarto GHA Workflow Runner 2024-08-27 17:04:04 +00:00
f6362d2a05 Add Liger Kernal support for Qwen2 (#1871) Chiwan Park 2024-08-28 02:03:16 +09:00
efeaa00bb4 Update docs/dataset-formats/conversation.qmd Chirag Jain 2024-08-27 19:08:54 +05:30
8a84408fc7 Address review comments and add docs Chirag Jain 2024-08-27 04:25:44 +05:30
4805f3ca0a Merge branch 'main' of https://github.com/OpenAccess-AI-Collective/axolotl into cj_tokenizer_default_prompt_template Chirag Jain 2024-08-27 02:35:58 +05:30
68cbda17a0 Built site for gh-pages Quarto GHA Workflow Runner 2024-08-26 19:51:21 +00:00
17af1d7081 clear cuda cache to help with memory leak/creep (#1858) Wing Lian 2024-08-26 15:50:26 -04:00
482fe96a4a Built site for gh-pages Quarto GHA Workflow Runner 2024-08-26 16:57:11 +00:00
2dac1edf72 Fix drop_long_seq bug due to truncation in prompt tokenization strategies when using chat_template (#1867) Chiwan Park 2024-08-27 01:56:12 +09:00
0454b587e1 Built site for gh-pages Quarto GHA Workflow Runner 2024-08-26 16:01:59 +00:00
6819c12cee update specturm authors (#1869) Wing Lian 2024-08-26 12:00:36 -04:00
3e8056b954 Built site for gh-pages Quarto GHA Workflow Runner 2024-08-25 21:54:53 +00:00
8e29bdefdd Spectrum plugin (#1866) Wing Lian 2024-08-25 17:54:02 -04:00
5229c2a73d Built site for gh-pages Quarto GHA Workflow Runner 2024-08-25 16:32:29 +00:00
f245964f22 better handling of llama-3 tool rolw (#1782) Wing Lian 2024-08-25 12:31:40 -04:00
cb03ff72ab Built site for gh-pages Quarto GHA Workflow Runner 2024-08-24 00:23:59 +00:00
22f4eafa55 simplify logic (#1856) Wing Lian 2024-08-23 20:23:08 -04:00
4f3b5cad43 Built site for gh-pages Quarto GHA Workflow Runner 2024-08-23 21:00:52 +00:00
77a4b9cda2 change up import to prevent AttributeError (#1863) Wing Lian 2024-08-23 17:00:01 -04:00
dd0b858185 Built site for gh-pages Quarto GHA Workflow Runner 2024-08-23 18:35:00 +00:00
810ecd4e81 add liger to readme (#1865) Wing Lian 2024-08-23 14:34:03 -04:00
f5da86dfa8 Built site for gh-pages Quarto GHA Workflow Runner 2024-08-23 16:38:42 +00:00
da0d581a8c add liger example (#1864) Wing Lian 2024-08-23 12:37:50 -04:00
7b598a4ed2 Built site for gh-pages Quarto GHA Workflow Runner 2024-08-23 16:22:49 +00:00
1f686c576c Liger Kernel integration (#1861) Wing Lian 2024-08-23 12:21:51 -04:00
395feebea5 Built site for gh-pages Quarto GHA Workflow Runner 2024-08-23 16:19:50 +00:00
e8ff5d5738 don't mess with bnb since it needs compiled wheels (#1859) Wing Lian 2024-08-23 12:18:47 -04:00
9bf03a6d1e Built site for gh-pages Quarto GHA Workflow Runner 2024-08-23 15:41:18 +00:00
328fd4b3b7 add axolotl community license (#1862) Wing Lian 2024-08-23 11:40:21 -04:00
756a34f0fe wip for tp Wing Lian 2024-08-23 10:57:57 -04:00
198f7cd893 2d parallel llama fsdp Wing Lian 2024-08-23 00:02:14 -04:00
5b15816cf4 drop valueerror as this was from when 4bit required gptq remove-gptq-warn Wing Lian 2024-08-22 19:16:32 -04:00
8ee30f5954 Merge branch 'main' into cj_tokenizer_default_prompt_template Chirag Jain 2024-08-23 03:44:25 +05:30
f1873957ec Built site for gh-pages Quarto GHA Workflow Runner 2024-08-22 20:40:19 +00:00
fefa95e350 most model types now support flash attention 2 regardless of multipack support (#1854) Wing Lian 2024-08-22 16:39:23 -04:00
0b35889407 Built site for gh-pages Quarto GHA Workflow Runner 2024-08-22 17:14:21 +00:00
b33dc07a77 rename nightly test and add badge (#1853) Wing Lian 2024-08-22 13:13:33 -04:00
a4310969e8 Built site for gh-pages Quarto GHA Workflow Runner 2024-08-22 17:11:47 +00:00
dcbff16983 run nightly ci builds against upstream main (#1851) Wing Lian 2024-08-22 13:10:54 -04:00
2f8037fee6 ensure that the hftrainer deepspeed config is set before the trainer class is ever init'ed (#1850) [skip ci] Wing Lian 2024-08-22 13:10:40 -04:00
edb70f789c Built site for gh-pages Quarto GHA Workflow Runner 2024-08-22 15:48:28 +00:00
de4ea2d1f2 docs: minor syntax highlight fix (#1839) Aman Gupta Karmani 2024-08-22 08:47:34 -07:00
7ed92e61c2 fix: prompt phi (#1845) [skip ci] JohanWork 2024-08-22 17:46:57 +02:00
9caa3eb699 make the train_on_eos default to turn so all eos tokens are treated the same (#1847) [skip ci] Wing Lian 2024-08-22 11:45:37 -04:00
5b0b774e38 ensure that the bias is also in the correct dtype (#1848) [skip ci] Wing Lian 2024-08-22 11:45:00 -04:00
c3fc529bfc numpy 2.1.0 was released, but incompatible with numba (#1849) [skip ci] Wing Lian 2024-08-22 11:44:45 -04:00
957c956f89 rename jamba example (#1846) [skip ci] Gal Cohen (galco) 2024-08-22 16:22:55 +03:00
d2cf919e4f Built site for gh-pages Quarto GHA Workflow Runner 2024-08-21 17:38:39 +00:00
e85d954f52 Built site for gh-pages Quarto GHA Workflow Runner 2024-08-21 17:38:13 +00:00
f07802f9fa examples: fix tiny-llama pretrain yml syntax (#1840) Aman Gupta Karmani 2024-08-21 10:37:51 -07:00
8169df172a Built site for gh-pages Quarto GHA Workflow Runner 2024-08-21 17:37:47 +00:00
9f917245f6 feat: add jamba chat_template (#1843) Gal Cohen (galco) 2024-08-21 20:37:17 +03:00
649c19aba3 pretrain: fix with sample_packing=false (#1841) Aman Gupta Karmani 2024-08-21 10:36:51 -07:00
e074063ac7 Built site for gh-pages Quarto GHA Workflow Runner 2024-08-20 16:42:40 +00:00
5aac4bc284 fix: dont change quant storage dtype in case of fsdp (#1837) Gal Cohen (galco) 2024-08-20 19:41:48 +03:00
bf4795c2a9 Built site for gh-pages Quarto GHA Workflow Runner 2024-08-19 19:00:17 +00:00
e29931259b optionally save the final FSDP model as a sharded state dict (#1828) Wing Lian 2024-08-19 14:59:24 -04:00
6ef76f1ace remove custom mistral template Chirag Jain 2024-08-19 15:56:47 +05:30
2e758aed6f Merge branch 'main' into cj_tokenizer_default_prompt_template Chirag Jain 2024-08-19 15:52:04 +05:30
9c922ade89 Built site for gh-pages Quarto GHA Workflow Runner 2024-08-17 01:32:52 +00:00
b1d2921222 add validation to prevent 8bit lora finetuning on H100s (#1827) Wing Lian 2024-08-16 21:32:00 -04:00
1f09f48d8f fixed small typo fixtypo sunny 2024-08-16 11:27:56 -04:00
b44546df6f fixedtypo sunny 2024-08-16 11:22:50 -04:00
967fbf8152 fixedtypo Sunny 2024-08-16 11:17:52 -04:00
c144a1ae65 fixed small typo Sunny 2024-08-16 10:57:42 -04:00
0206b57aa9 Built site for gh-pages Quarto GHA Workflow Runner 2024-08-16 14:42:54 +00:00
803fed3e90 update sklearn versrion, torch compile env vars, don't worry about failure on preprocess load model (#1821) Wing Lian 2024-08-16 10:41:51 -04:00
7b994feb40 Built site for gh-pages Quarto GHA Workflow Runner 2024-08-16 11:52:14 +00:00
68a3c7678a fix: parse model_kwargs (#1825) NanoCode012 2024-08-16 20:51:19 +09:00
e834e52dbf Built site for gh-pages Quarto GHA Workflow Runner 2024-08-14 13:47:40 +00:00
f18925fb4b fix: parse eager_attention (#1824) NanoCode012 2024-08-14 22:46:46 +09:00
21a2302538 Merge branch 'main' into cj_tokenizer_default_prompt_template Chirag Jain 2024-08-12 10:24:02 +05:30
2b890ead05 fsdp fft loading on meta device fsdp-fft Wing Lian 2024-08-11 22:18:04 -04:00
53791c1c2e Built site for gh-pages Quarto GHA Workflow Runner 2024-08-11 20:28:30 +00:00
1853d6021d bump hf dependencies (#1823) Wing Lian 2024-08-11 16:27:41 -04:00
a04e1ec370 Built site for gh-pages Quarto GHA Workflow Runner 2024-08-09 15:51:20 +00:00
0801f239cc fix the incorrect max_length for chat template (#1818) Chiwan Park 2024-08-10 00:50:31 +09:00
54392ac8a6 Attempt to run multigpu in PR CI for now to ensure it works (#1815) [skip ci] Wing Lian 2024-08-09 11:50:13 -04:00
3e2b269d06 update tinyllama to use final instead of checkpoints (#1820) [skip ci] Wing Lian 2024-08-09 10:58:19 -04:00
5ee4b7325f fix z3 leaf configuration when not using lists (#1817) [skip ci] Wing Lian 2024-08-09 10:54:52 -04:00
c7b0bee57b Built site for gh-pages Quarto GHA Workflow Runner 2024-08-06 19:26:43 +00:00
70978467a0 skip no commit to main on ci (#1814) Wing Lian 2024-08-06 15:25:54 -04:00
89f382a13a Merge branch 'main' into cj_tokenizer_default_prompt_template Chirag Jain 2024-08-06 21:23:14 +05:30
6674f532ac Built site for gh-pages Quarto GHA Workflow Runner 2024-08-06 14:32:54 +00:00
850f999a76 update peft and transformers (#1811) Wing Lian 2024-08-06 10:32:05 -04:00
c56e0a79a5 logging improvements (#1808) [skip ci] Wing Lian 2024-08-06 10:31:50 -04:00
35d5e59d78 set z3 leaf for deepseek v2 (#1809) [skip ci] Wing Lian 2024-08-06 09:30:46 -04:00
fbbeb4fee0 remove un-necessary zero-first guard as it's already only called in a parent fn (#1810) [skip ci] Wing Lian 2024-08-06 09:29:23 -04:00
fde80ab9ff Built site for gh-pages Quarto GHA Workflow Runner 2024-08-05 17:13:16 +00:00
ecdda006de One cycle lr (#1803) Wing Lian 2024-08-05 13:12:05 -04:00
b7665c26c8 Update conversation.qmd (#1788) [skip ci] Ben Feuer 2024-08-05 12:44:26 -04:00
d60b8e681c Built site for gh-pages Quarto GHA Workflow Runner 2024-08-05 16:43:37 +00:00
cb023c70db Update instruct-lora-8b.yml (#1789) [skip ci] Aaditya Ura (looking for PhD Fall’24) 2024-08-05 22:13:20 +05:30

... 36 37 38 39 40 ...