axolotl

tocmo0nlord/axolotl

Fork 0

Commit Graph

Select branches

Hide Pull Requests

1947fix

1991test

20230920-btlm

20231212-fixes

20240216-updates

20240307-updates

20240404-lisa-determinism

3181

4bit-optimizers

775-option-to-drop-vs-truncate-on-rows-longer-than-context-length

NanoCode012-patch-1

accelerator-args-builder

activation-offloading-torchtune

activations

activeblue/main

async-grpo-patched-v2

attention_enum

attn-implementation-refactor

attn-patches

autodoc

autogptq-tests

axolotl-ci-hf

base-model-readme-update

benchmark-callbacks-next

bursteratom-doc-faq-update

chat-dataset-tool

chat-template-granite

chore/docstring-distributed

cj_tokenizer_default_prompt_template

cli-cloud-modal-math-hard

cli-refactor

codecov-pulls-only

coderabbitai/docstrings/3e51a68

coderabbitai/docstrings/QVUilv72ojQNaYsCLVNpUpfo2rK1ZU5x90oPNXYz0ZfsWzWSHca36pjgaU5JOtZOA4gNjbjVYxShdRmkm7fGSlW

coderabbitai/docstrings/b234532

colab-misc-fixes

colab-misc-fixes-test

completion-json

compute-perplexity-metrics

cp-sdpa

cuda-12.8.1

custom-modeling

custom-trainer-cls

datasets-351

datasets-refactor

debug-hf-home-cache

deepspeed-low-cpu-mem

deepspeed_0_14_4

destroy-pg

dev-base

device-mesh

devstral-support

dft

diff-transformer

diffusion-custom-loss

diffusion-custom-models

diffusion-next-token-trainer

djsaunde-patch-1

docker-base-nvcr-pytorch

docker-cleanup-20231029

docs-lint-20250212

dpo-spawn-fix

dump-config

dynamic-sft

e2e-fsdp-trainer

embeddings-resize

enable_tp

eos-hell

exp-expand-len

fa-261

fa-check

fa3-hopper

feat/beautiful-readme

feat/glm45

feat/glmflash-other

feat/liger-deepseekv3

feat/linearize

feat/lmeval-baseten

feat/phi_35_vision

feat/pref_liger

feat/soap-optim-v2

feat/spaces-ui

feat/torchao-qlora

feat/wizard

feat_hqq

feature/attn-patches

feature/enable-huggingface-dataset-revision

feature/relora-rebased

fix-ddp_find_unused_parameters

fix-l3-lora

fix-merge-lint-issue

fix-preview

fix/cce-linear

fix/cp-waste

fix/diffusion

fix/doc-key

fix/dpo-labels

fix/eval-accu

fix/gemma3-text-only

fix/gemma3n-text-attention

fix/granite-speech

fix/hpc-root

fix/issue-1-build-deps

fix/issue-2-flash-attn-install

fix/issue-3-telemetry-whitelist

fix/issue-4-deepspeed-optional

fix/issue-5-8-docs

fix/issue-6-default-attention

fix/issue-7-hf-token-check

fix/kd-trainer-num-items

fix/merge-lora-fp32

fix/replace_jackllama

fix/rl-trainer-arg

fix/vllm-version

fix/xformers

fix_kto

fixtypo

flan-no-bos

flash-attn-2_5_5

flash-attn-fix-patches-wo-sample-packing

flex_patching_update

flx_attn_support

fp8

fsdp-defaults

fsdp-fft

fsdp-fix

fsdp-qdora

fsdp2

fsdp2_fp32

fused-mlp-ez

gh-pages

grouped_lr_squashed

grpo-path

grpo-path-v2

grpo-ref-model-cleanup

grpo_liger

hamelsmu-patch-1

hf-trainer-refactor

hymba_multipack2

ia3-peft

iterable-optional

jagged-restart-lr-scheduler-v3

kd-fix-20250519-v2

kd-logits-view

kd-logprob-data

kd-trainer

kd-trainer-2

kd-trainer-pre

kd-trainer-rebased

kd-trainer-v2

kd-trainer-zscore

keep_in_memory

kernelize-scattermoe-lora

kto_fix

kwargs-refactor

latent-space

lhl-moe-aux-loss-free

liger-063

liger-065

liger-dpo

lisa

llama-4-examples

llama-4-z3

llama-dropout

llama-flash-attn-fix

llama-multipack

llama4

llama4-patches

llava

llava-train

llmcompressor-sft

llmcompressor-sft-v2

llmcompressor-sft-wing

lora-fsdp2-doc

lora-kernels-deepspeed

lora-kernels-doc-fix

lora-quant-state-offset

lora_bf16

lora_kernels_fsdp

main

main-base

map-dataset-fetcher-fix

maverick-example

merge-lora-on-complete

merge-lora-tests

merged-2554

mistral-support

mixtral_optimized

mixtral_swiglu

mm2

mm3

mm_mc_chat

modal-upgrade-builder

model-loader-refactor

moekernels

mora

multi-gpu-state

multipack

multipack-dpo

multipack-pretraining

muon-validation

nca-pair

nd_parallel

neft-v2

no-bos-tokens-packing

no-seq-len

no-zero-ds-train

offload-activations-disk

olmo-no-position_ids

online-topk-kd

openorca

openorca-fix-mask

openorca-v2

optimizer-checkpoint

optimizer-compile

optimizers-refactor

packing-attn-limit-fa2-rebased

patch_lora_post_model_load

peft-update

phi-moe

pixtral_integration

pre-commit-update

preprocess_grpo-fix

pretrain-dataset

print_venv

pytest-each-flakey

pytest-skip-s2

q-galore

quantize-ptq-cli

quartodoc

quartodoc-fix

rala

rala-v2

reentrant-w-offloading

refactor-flash-attention

relaxed-recursive-transformers

release-0.10.x

release-0.8.x

release-v0.11.x

release-v0.12.x

release-v0.13.x

release-v0.9.x

remove-gptq-warn

revert-2332-fix_sample_packing

revert-2906-checkpoint-on-step-1

revert-multipack-changes

rl-trainers-sp

runpod-sls

sac

sageattention

save_only_model

scatter_moe

scatter_moe_eric

scattermoe-lora-optim-dtypestest

scattermoe-nanotron

sdpa-cp

sdpa-multipack

seq-parallel-ring

sequence-parallelism

shampoo

shampoo-low_bit

shared-prepared-ci

sharegpt-batched

sharegpt-field-conversations

smaller-rand-model

smol-ci

soap-optim

sp-fix-masking

sp-restore-buffers

sp-rl

sp-rl-v3

split-batches-sizes

sppo

squash_position_ids

ssmi-main

stable

streaming

streaming-on-the-fly-preprocess

streaming-remote-dataset

streaming-v2

swe-rebench-rl-rebase

telemetry

telemetry-opt-in

tensor-parallel

tensorboard-loss-check

testingci

textui

tinyllama-example

tool-mpm

topk-logprobs-triton

torch-211-base

torch_tensor_parallel

tp_support

train-refactor

transformers-4511

transformers-4513

transformers-4573

transformers-4_47_0_v2

transformers-fsdp-check

transformers-itl-refactor

tui

unsloth_modules

update-examples-llama3-ez

update-lgpl

update-vllm

upgrade-liger-test

upgrade-torchao-0.15

upgrade-trl-v0.12.0_2

upgrade_liger-tr4.46.1

uv-first

uv-fixup

vendor-moe

version-dev

vllm-0191

wait-distributed-close

weight-scale-norm

xformers-wo-packing

yayi2

zero3-8bit-lora

v0.1.0

v0.10.0

v0.10.1

v0.11.0

v0.11.0.post1

v0.12.0

v0.12.1

v0.12.2

v0.13.0

v0.13.1

v0.13.2

v0.14.0

v0.15.0

v0.16.0

v0.16.1

v0.2.0

v0.2.1

v0.3.0

v0.4.0

v0.5.0

v0.5.1

v0.5.1.post1

v0.5.2

v0.6.0

v0.7.0

v0.7.1

v0.8.0

v0.8.1

v0.9.0

v0.9.1

v0.9.1.post1

v0.9.2

9923b72649 Fix: eval table conflict with eval_sample_packing (#769) NanoCode012 2023-10-23 01:18:12 +09:00
21cf09b608 remove lora fused packing test (#758) Wing Lian 2023-10-21 22:59:35 -04:00
15d3a654bf Implement fused modules (#747) Casper 2023-10-21 22:08:25 +02:00
a21935f07a add to docs (#703) Wing Lian 2023-10-19 21:32:30 -04:00
8966a6f566 chore: bump transformers to v4.34.1 to fix tokenizer issue (#745) NanoCode012 2023-10-20 09:18:22 +09:00
e4d1585c4e Fix DeepSpeed Zero 3 Saving (#709) Motoki Wu 2023-10-19 16:18:24 -07:00
70157ccb8f add a latest tag for regular axolotl image, cleanup extraneous print statement (#746) Wing Lian 2023-10-19 12:28:29 -04:00
d0b534292f Add e2e test for ia3 ft ia3-peft Wing Lian 2023-10-19 09:27:55 -04:00
3a99495b05 improve: Enhance code readability of prompt_tokenizers.py (#707) seungduk.kim.2304 2023-10-19 21:12:17 +09:00
0bd89b38c6 migrate lora_ to peft_ Wing Lian 2023-09-28 11:58:23 -04:00
481ef187a5 update README for IA3 peft Wing Lian 2023-09-19 19:10:21 -04:00
d645b19fcf include task type for ia3 config Wing Lian 2023-09-19 19:08:10 -04:00
203369411e consolidate as peft_model_dir Wing Lian 2023-09-19 19:02:14 -04:00
ba85308720 Update src/axolotl/utils/models.py Wing Lian 2023-09-19 17:57:07 -04:00
998763bade ia3 keeps casting to float32, handle it here for now Wing Lian 2023-09-18 19:50:08 -04:00
c8e42a0f4f fix load_in_8bit check Wing Lian 2023-09-18 18:51:56 -04:00
1da328eb9a prepare ia3 for 8bit Wing Lian 2023-09-18 18:49:44 -04:00
2d7cccfc8e add ia3 peft support Wing Lian 2023-09-18 18:40:33 -04:00
440c3ab527 Fix(model): Linear detected and added to target module with rope linear (#738) NanoCode012 2023-10-19 11:13:20 +09:00
992d57f20a catch ConnectionError when checking dataset from HuggingFace (#743) Napuh 2023-10-19 04:11:54 +02:00
91a016f410 badge (#739) mhenrichsen 2023-10-18 16:21:34 +02:00
a045db0214 Mistral: Sliding Window Attention with Flash Attention and Sample Packing (#732) Casper 2023-10-16 21:13:46 +02:00
e1b214c62b Clarify custom format example (#729) Casper 2023-10-14 15:28:12 +02:00
3553172e3c fixes for alpaca w chatml, and don't include attention_mask w mistral for flash attention (#728) Wing Lian 2023-10-14 09:27:07 -04:00
080612219b use even if not using sample packing neft-v2 Wing Lian 2023-10-13 17:54:35 -04:00
f95858d369 alternate impl of NEFT Wing Lian 2023-10-13 17:45:24 -04:00
7f2027d93f tweak for xformers install w pytorch 2.1.0 (#727) Wing Lian 2023-10-13 15:21:17 -04:00
8d288a2ad4 workaround for installing xformers w torch 2.1.0 (#725) Wing Lian 2023-10-13 11:19:30 -04:00
f30afe4544 misc sharegpt fixes (#723) Wing Lian 2023-10-13 11:04:39 -04:00
bfbdba8614 pin xformers >= 0.0.22 (#724) Wing Lian 2023-10-13 10:27:56 -04:00
3bd9528390 add noisy embedding (#721) Maxime 2023-10-13 16:00:42 +02:00
2aa1f71464 fix pytorch 2.1.0 build, add multipack docs (#722) Wing Lian 2023-10-13 08:57:28 -04:00
1c412c7e9d improve handling of the prepared ds path and other cfg defaults (#701) Wing Lian 2023-10-13 07:46:07 -04:00
490923fb78 Save Axolotl config as WandB artifact (#716) Jan Philipp Harries 2023-10-11 13:28:12 +02:00
5855dded3d fix(doc): update default doc according to arg (#714) NanoCode012 2023-10-10 21:51:56 +09:00
ace70b33c6 Fix: lowercase True values in config (#713) atgctg 2023-10-10 14:32:20 +02:00
11c48c5e03 fix(doc): Add note on inference w sample packing (#712) NanoCode012 2023-10-10 21:08:17 +09:00
295b2662e1 Get qlora mistral-7b fine tuning working on a single 4090 (#708) lukemarsden 2023-10-10 07:14:23 +01:00
77c84e02fd Update README with some explanations (#700) seungduk.kim.2304 2023-10-09 02:37:54 +09:00
f91db198f3 fix unneeded space (#699) mhenrichsen 2023-10-07 20:19:25 +02:00
7f2618b5f4 add docker images for pytorch 2.10 (#697) Wing Lian 2023-10-07 12:23:31 -04:00
aca0398315 apex not needed as amp is part of pytorch (#696) Wing Lian 2023-10-07 12:20:45 -04:00
29b8f46aed Merge pull request #693 from OpenAccess-AI-Collective/update-mistral-example mhenrichsen 2023-10-07 11:04:58 +02:00
83a950bb87 lint mhenrichsen 2023-10-07 11:04:35 +02:00
de87ea68f6 fix multiline for docker (#694) Wing Lian 2023-10-06 22:38:15 -04:00
4c8ddf2c6f new lr, sample pack mhenrichsen 2023-10-06 22:58:13 +02:00
b4d84d56d5 support for batched sharegpt tokenization to skip bad data sharegpt-batched Wing Lian 2023-10-06 15:03:07 -04:00
669f1d052c Fix: Higher vram usage for mistral and sample_packing (#691) NanoCode012 2023-10-07 01:33:43 +09:00
d4a88e4eca Adding qlora config for Mistral (#675) Abhishek Mishra 2023-10-06 17:35:56 +05:30
2d60ba3a6e flash_attention + sample packing for stablelm 3b (#671) Wing Lian 2023-10-05 16:03:43 -04:00
eb480dfd68 Fix: ValueError when FA + Mistral when padding_side=right (#681) NanoCode012 2023-10-06 04:12:54 +09:00
133e676bcc Feat: Set WORKDIR to /workspace/axolotl (#679) NanoCode012 2023-10-06 04:09:14 +09:00
69fac9a020 Fix: Future deprecation warning with use_auth_token (#680) NanoCode012 2023-10-06 03:56:18 +09:00
e0b7eeabfd Fix(tokenizer): Set rstrip,lstrip,norm to False (#678) NanoCode012 2023-10-06 03:50:49 +09:00
43856c0a39 Fix(version): Update FA to work with Mistral SWA (#673) NanoCode012 2023-10-04 21:32:19 +09:00
e62d5901b5 chore: Clean up repetitive model kwargs (#670) NanoCode012 2023-10-04 20:41:26 +09:00
697c50d408 Feat: Allow usage of native Mistral FA when no sample_packing (#669) NanoCode012 2023-10-04 20:40:47 +09:00
90e0d673f7 Feat: Add config yaml to section for reprod in bug-report.yaml (#667) NanoCode012 2023-10-03 23:38:42 +09:00
2642caedf2 refactor to set eval_batch_size earlier if unset, so we can warn if mismatched (#662) Wing Lian 2023-10-02 21:08:07 -04:00
f34648c8b9 remove patch fix for phi (#664) Wing Lian 2023-10-02 21:07:41 -04:00
e50a64e85e prepared dataset caching, other misc fixes (#665) Wing Lian 2023-10-02 21:07:24 -04:00
f4868d733c make sure we also run CI tests when requirements.txt changes (#663) Wing Lian 2023-10-02 08:43:40 -04:00
a7e56d83c2 removed duplicate on requirements.txt (#661) Napuh 2023-10-02 14:40:05 +02:00
5b0bc48fbc add mistral e2e tests (#649) Wing Lian 2023-09-29 00:22:40 -04:00
9ec20777ba Make dataset_processes configurable (#651) Kyle Corbitt 2023-09-28 21:22:22 -07:00
590d6032fd Fix bug when using pretokenized datasets (#652) ich 2023-09-29 04:54:10 +02:00
409ca0f21c add support for defined train split (#654) Wing Lian 2023-09-28 20:14:14 -04:00
8662e8ffe8 don't strip the prompt for check since we don't strip to tokenize anymore (#650) Wing Lian 2023-09-28 12:21:51 -04:00
b2edaaeff6 fix for flash attn w mistral w/o sammple packing (#648) Wing Lian 2023-09-28 10:57:37 -04:00
b88f51512a Update mistral/README.md (#647) Adarsh Shirawalmath 2023-09-28 19:54:56 +05:30
eb41f76f92 Feat: Add example for Mistral (#644) NanoCode012 2023-09-28 20:15:00 +09:00
383f88d7a7 Fix(cfg): Add validation for save_strategy and eval_strategy (#633) NanoCode012 2023-09-28 10:14:41 +09:00
b6ab8aad62 Mistral flash attn packing (#646) Wing Lian 2023-09-27 18:41:00 -04:00
85b0be2ba7 Warn users to login to HuggingFace (#645) Napuh 2023-09-27 23:43:35 +02:00
8fe0e633d2 Fix bug in dataset loading (#284) Ethan Smith 2023-09-27 10:41:31 -07:00
d1236f2c41 Correct typos in datasets.py (#639) Felix Yan 2023-09-27 19:12:10 +03:00
895f0a0723 skip some flash attn patches unless explicitly enabled (#643) Wing Lian 2023-09-27 12:11:07 -04:00
e7d3e2dbb6 use fastchat conversations template (#578) Wing Lian 2023-09-27 12:10:45 -04:00
60c7c48c97 update for recent transformers updates (#636) Wing Lian 2023-09-27 12:10:32 -04:00
e8cbf50be6 attention_mask not needed for training (#642) Wing Lian 2023-09-27 11:12:08 -04:00
d887ad86c3 eval_table isn't quite stable enough to be in default llama configs (#637) Wing Lian 2023-09-26 10:13:20 -04:00
19a600a8b8 Feat: Add support for upstream FA2 (#626) NanoCode012 2023-09-26 22:53:28 +09:00
7771498eae add guassian dropout support llama-dropout Wing Lian 2023-09-25 14:50:39 -04:00
5e5296a77c Added quotes to the pip install -e command to fix an incompatibility with shells that do glob expansion like zsh (#632) Fernando Tarin Morales 2023-09-26 00:50:14 +09:00
f3d939016a Merge pull request #629 from OpenAccess-AI-Collective/chore/-change-default-model mhenrichsen 2023-09-25 09:32:01 +02:00
cfbce020e9 Fix: Fail bf16 check when running on cpu during merge (#631) NanoCode012 2023-09-25 13:48:18 +09:00
4fecbfe5e1 default model changed mhenrichsen 2023-09-24 18:52:53 +02:00
67b9888630 Feat(doc): Add eval_sample_packing to doc (#625) NanoCode012 2023-09-23 13:11:27 +09:00
923eb91304 tweak: improve base builder for smaller layers (#500) Maxime 2023-09-22 22:17:50 +02:00
a363604dcf better handling and logging of empty sharegpt turns (#603) Wing Lian 2023-09-22 16:13:42 -04:00
501958bb6f create a model card with axolotl badge (#624) Wing Lian 2023-09-22 16:13:26 -04:00
c25ba7939b update README w deepspeed info (#605) Wing Lian 2023-09-22 00:15:52 -04:00
d5f8589021 chore(callback): Remove old peft saving code (#510) NanoCode012 2023-09-22 12:31:33 +09:00
03e59077a0 misc fixes to add gptq tests (#621) Wing Lian 2023-09-21 21:52:12 -04:00
97d3776ce6 split completion text to sequence_len (#616) Wing Lian 2023-09-21 21:51:25 -04:00
2844eb22b6 run eval on the first step to get a baseline (#617) Wing Lian 2023-09-21 21:51:09 -04:00
e85d2eb06b let MAX_JOBS use the default since we're not resource constrained on our self-hosted runners (#427) Wing Lian 2023-09-21 20:36:30 -04:00
196ff1181e skip the gpu memory checks if the device is set to 'auto' (#609) Wing Lian 2023-09-21 15:20:31 -04:00
92512c390b ignore wandb to resolve isort headaches (#619) Wing Lian 2023-09-21 11:50:09 -04:00
2fe95cdcc1 fix distributed devices (#612) Maxime 2023-09-21 15:11:34 +02:00

... 45 46 47 48 49 ...