axolotl

tocmo0nlord/axolotl

Fork 0

Commit Graph

Select branches

Hide Pull Requests

1947fix

1991test

20230920-btlm

20231212-fixes

20240216-updates

20240307-updates

20240404-lisa-determinism

3181

4bit-optimizers

775-option-to-drop-vs-truncate-on-rows-longer-than-context-length

NanoCode012-patch-1

accelerator-args-builder

activation-offloading-torchtune

activations

activeblue/main

async-grpo-patched-v2

attention_enum

attn-implementation-refactor

attn-patches

autodoc

autogptq-tests

axolotl-ci-hf

base-model-readme-update

benchmark-callbacks-next

bursteratom-doc-faq-update

chat-dataset-tool

chat-template-granite

chore/docstring-distributed

cj_tokenizer_default_prompt_template

cli-cloud-modal-math-hard

cli-refactor

codecov-pulls-only

coderabbitai/docstrings/3e51a68

coderabbitai/docstrings/QVUilv72ojQNaYsCLVNpUpfo2rK1ZU5x90oPNXYz0ZfsWzWSHca36pjgaU5JOtZOA4gNjbjVYxShdRmkm7fGSlW

coderabbitai/docstrings/b234532

colab-misc-fixes

colab-misc-fixes-test

completion-json

compute-perplexity-metrics

cp-sdpa

cuda-12.8.1

custom-modeling

custom-trainer-cls

datasets-351

datasets-refactor

debug-hf-home-cache

deepspeed-low-cpu-mem

deepspeed_0_14_4

destroy-pg

dev-base

device-mesh

devstral-support

dft

diff-transformer

diffusion-custom-loss

diffusion-custom-models

diffusion-next-token-trainer

djsaunde-patch-1

docker-base-nvcr-pytorch

docker-cleanup-20231029

docs-lint-20250212

dpo-spawn-fix

dump-config

dynamic-sft

e2e-fsdp-trainer

embeddings-resize

enable_tp

eos-hell

exp-expand-len

fa-261

fa-check

fa3-hopper

feat/beautiful-readme

feat/glm45

feat/glmflash-other

feat/liger-deepseekv3

feat/linearize

feat/lmeval-baseten

feat/phi_35_vision

feat/pref_liger

feat/soap-optim-v2

feat/spaces-ui

feat/torchao-qlora

feat/wizard

feat_hqq

feature/attn-patches

feature/enable-huggingface-dataset-revision

feature/relora-rebased

fix-ddp_find_unused_parameters

fix-l3-lora

fix-merge-lint-issue

fix-preview

fix/cce-linear

fix/cp-waste

fix/diffusion

fix/doc-key

fix/dpo-labels

fix/eval-accu

fix/gemma3-text-only

fix/gemma3n-text-attention

fix/granite-speech

fix/hpc-root

fix/issue-1-build-deps

fix/issue-2-flash-attn-install

fix/issue-3-telemetry-whitelist

fix/issue-4-deepspeed-optional

fix/issue-5-8-docs

fix/issue-6-default-attention

fix/issue-7-hf-token-check

fix/kd-trainer-num-items

fix/merge-lora-fp32

fix/replace_jackllama

fix/rl-trainer-arg

fix/vllm-version

fix/xformers

fix_kto

fixtypo

flan-no-bos

flash-attn-2_5_5

flash-attn-fix-patches-wo-sample-packing

flex_patching_update

flx_attn_support

fp8

fsdp-defaults

fsdp-fft

fsdp-fix

fsdp-qdora

fsdp2

fsdp2_fp32

fused-mlp-ez

gh-pages

grouped_lr_squashed

grpo-path

grpo-path-v2

grpo-ref-model-cleanup

grpo_liger

hamelsmu-patch-1

hf-trainer-refactor

hymba_multipack2

ia3-peft

iterable-optional

jagged-restart-lr-scheduler-v3

kd-fix-20250519-v2

kd-logits-view

kd-logprob-data

kd-trainer

kd-trainer-2

kd-trainer-pre

kd-trainer-rebased

kd-trainer-v2

kd-trainer-zscore

keep_in_memory

kernelize-scattermoe-lora

kto_fix

kwargs-refactor

latent-space

lhl-moe-aux-loss-free

liger-063

liger-065

liger-dpo

lisa

llama-4-examples

llama-4-z3

llama-dropout

llama-flash-attn-fix

llama-multipack

llama4

llama4-patches

llava

llava-train

llmcompressor-sft

llmcompressor-sft-v2

llmcompressor-sft-wing

lora-fsdp2-doc

lora-kernels-deepspeed

lora-kernels-doc-fix

lora-quant-state-offset

lora_bf16

lora_kernels_fsdp

main

main-base

map-dataset-fetcher-fix

maverick-example

merge-lora-on-complete

merge-lora-tests

merged-2554

mistral-support

mixtral_optimized

mixtral_swiglu

mm2

mm3

mm_mc_chat

modal-upgrade-builder

model-loader-refactor

moekernels

mora

multi-gpu-state

multipack

multipack-dpo

multipack-pretraining

muon-validation

nca-pair

nd_parallel

neft-v2

no-bos-tokens-packing

no-seq-len

no-zero-ds-train

offload-activations-disk

olmo-no-position_ids

online-topk-kd

openorca

openorca-fix-mask

openorca-v2

optimizer-checkpoint

optimizer-compile

optimizers-refactor

packing-attn-limit-fa2-rebased

patch_lora_post_model_load

peft-update

phi-moe

pixtral_integration

pre-commit-update

preprocess_grpo-fix

pretrain-dataset

print_venv

pytest-each-flakey

pytest-skip-s2

q-galore

quantize-ptq-cli

quartodoc

quartodoc-fix

rala

rala-v2

reentrant-w-offloading

refactor-flash-attention

relaxed-recursive-transformers

release-0.10.x

release-0.8.x

release-v0.11.x

release-v0.12.x

release-v0.13.x

release-v0.9.x

remove-gptq-warn

revert-2332-fix_sample_packing

revert-2906-checkpoint-on-step-1

revert-multipack-changes

rl-trainers-sp

runpod-sls

sac

sageattention

save_only_model

scatter_moe

scatter_moe_eric

scattermoe-lora-optim-dtypestest

scattermoe-nanotron

sdpa-cp

sdpa-multipack

seq-parallel-ring

sequence-parallelism

shampoo

shampoo-low_bit

shared-prepared-ci

sharegpt-batched

sharegpt-field-conversations

smaller-rand-model

smol-ci

soap-optim

sp-fix-masking

sp-restore-buffers

sp-rl

sp-rl-v3

split-batches-sizes

sppo

squash_position_ids

ssmi-main

stable

streaming

streaming-on-the-fly-preprocess

streaming-remote-dataset

streaming-v2

swe-rebench-rl-rebase

telemetry

telemetry-opt-in

tensor-parallel

tensorboard-loss-check

testingci

textui

tinyllama-example

tool-mpm

topk-logprobs-triton

torch-211-base

torch_tensor_parallel

tp_support

train-refactor

transformers-4511

transformers-4513

transformers-4573

transformers-4_47_0_v2

transformers-fsdp-check

transformers-itl-refactor

tui

unsloth_modules

update-examples-llama3-ez

update-lgpl

update-vllm

upgrade-liger-test

upgrade-torchao-0.15

upgrade-trl-v0.12.0_2

upgrade_liger-tr4.46.1

uv-first

uv-fixup

vendor-moe

version-dev

vllm-0191

wait-distributed-close

weight-scale-norm

xformers-wo-packing

yayi2

zero3-8bit-lora

v0.1.0

v0.10.0

v0.10.1

v0.11.0

v0.11.0.post1

v0.12.0

v0.12.1

v0.12.2

v0.13.0

v0.13.1

v0.13.2

v0.14.0

v0.15.0

v0.16.0

v0.16.1

v0.2.0

v0.2.1

v0.3.0

v0.4.0

v0.5.0

v0.5.1

v0.5.1.post1

v0.5.2

v0.6.0

v0.7.0

v0.7.1

v0.8.0

v0.8.1

v0.9.0

v0.9.1

v0.9.1.post1

v0.9.2

bf289123e9 Replace with flash modules Casper 2023-12-07 19:48:40 +01:00
5302d2d534 Reworked Flash Attention ops: xentropy, rmsnorm Casper 2023-12-07 17:26:06 +01:00
4f9b172c47 Remove FP32 cast unsloth_modules Casper 2023-12-07 16:28:25 +01:00
06ae39200b Pin flash-attn to 2.3.3 (#919) Casper 2023-12-07 07:36:52 +01:00
8671ed5a0c Fix import Casper Hansen 2023-12-06 20:26:31 +00:00
538c004080 Fix shapes Casper Hansen 2023-12-06 20:26:25 +00:00
add3b139ed Mistral with fast cross entropy Casper 2023-12-06 20:17:42 +01:00
a581e9f8f6 feat: add check for quantized model (#913) NanoCode012 2023-12-05 01:20:06 +09:00
992e742cdc Support device_map=sequential & max_memory config parameters (#903) Bryan Thornbury 2023-12-04 06:29:21 -08:00
a1da39cd48 Feat(wandb): Refactor to be more flexible (#767) NanoCode012 2023-12-04 22:17:25 +09:00
58ec8b1113 feature: loss watchdog for terminating training runs that are failing (#899) kallewoof 2023-12-04 21:54:34 +09:00
476a205cea Remove learning rate scheduler in deepspeed config to avoid conflict (#909) Haoxiang Wang 2023-12-04 02:17:38 -08:00
a6fefa8885 Initial refactor [untested] Casper 2023-11-30 22:30:28 +01:00
3e3229e2d9 fix for qwen w lora (#906) Wing Lian 2023-11-30 12:45:50 -05:00
1d21aa6b0a ensure merged model matches the training dtype (#902) Wing Lian 2023-11-29 09:55:19 -05:00
71b7ea3c05 Determine FSDP/deepspeed settings on device select. (#883) kallewoof 2023-11-29 22:36:35 +09:00
a48dbf6561 fix: remove FA for qwen examples (#900) NanoCode012 2023-11-27 21:23:54 +09:00
effb281b24 wip for multipack pretraining multipack-pretraining Wing Lian 2023-11-25 17:12:20 -05:00
6a4562ac08 update datasets version to cut down the warnings due to pyarrow arg change (#897) Wing Lian 2023-11-25 16:30:00 -05:00
da154e6d56 support for json data as completion completion-json Wing Lian 2023-11-25 16:05:04 -05:00
1115c501b8 Feat: Add Qwen (#894) NanoCode012 2023-11-26 00:05:01 +09:00
7ee3c4cacb fix: warning should not show if eval_batch_size not provided (#896) NanoCode012 2023-11-25 16:04:00 +09:00
fb12895a17 Feat: Add warmup_ratio (#893) NanoCode012 2023-11-25 12:15:43 +09:00
9fc29e082b chore(doc): Add info on changing role in sharegpt (#886) NanoCode012 2023-11-22 15:32:50 +09:00
575a082aae fix: revert local dir dataset load (#878) NanoCode012 2023-11-18 22:50:41 +09:00
ddf815022a Install from git url (#874) Mark Saroufim 2023-11-17 09:50:51 -08:00
9bf854e59c Phi update 202311 (#876) Wing Lian 2023-11-17 12:47:17 -05:00
797f3dd1de don't train if eval split is too small (#873) Wing Lian 2023-11-16 11:35:42 -05:00
0de1457189 try #2: pin hf transformers and accelerate to latest release, don't reinstall pytorch (#867) Wing Lian 2023-11-16 10:42:36 -05:00
9084879861 tinyllama tinyllama-example mhenrichsen 2023-11-16 13:36:01 +00:00
3cc67d2cdd Feat: Add dataset loading from S3, GCS (#765) NanoCode012 2023-11-16 14:33:58 +09:00
1bc11868eb allow overriding of model_config parameters from the YML (#853) Wing Lian 2023-11-15 23:47:08 -05:00
b3a61e8ce2 add e2e tests for checking functionality of resume from checkpoint (#865) Wing Lian 2023-11-15 23:05:55 -05:00
8a8d1c4023 make docker command more robust (#861) Wing Lian 2023-11-15 23:03:54 -05:00
332984db18 lint fix that didn't get caught by linter (#866) Wing Lian 2023-11-15 14:36:40 -05:00
48630f5b34 Update data.py for signature generation (#851) MilesQLi 2023-11-15 11:12:32 -08:00
b33c1d55a2 Docs: add instructions to 1-click launching on public clouds (#862) Zongheng Yang 2023-11-15 11:11:27 -08:00
0c2a630326 multipack len should use max, not min (#863) Wing Lian 2023-11-15 12:52:32 -05:00
db8a8afcba adds llama and mistral dropout support (#858) Wing Lian 2023-11-15 12:28:50 -05:00
14706504e3 various bugfixes (#856) Wing Lian 2023-11-15 12:23:18 -05:00
501b4d1379 chore(doc): Separate section on runpod (#860) NanoCode012 2023-11-16 01:06:51 +09:00
306fe19c54 feat(doc): add more info on train_on_split (#855) NanoCode012 2023-11-15 23:42:26 +09:00
614cff4107 include the suffix modified string in ascii art (#852) Fabian Preiß 2023-11-15 13:12:28 +01:00
1a6309c8a6 cleanup the old multipack dataloader (#841) Wing Lian 2023-11-12 05:39:09 -05:00
8836986a92 support for fp8 fp8 Wing Lian 2023-11-10 02:35:19 -05:00
105d0b350b Pin optimum package (#838) Bryan Thornbury 2023-11-09 19:36:15 -08:00
f544ab2bed don't compile deepspeed or bitsandbytes from source (#837) Wing Lian 2023-11-08 19:49:55 -05:00
641e6f7e51 multipack w batch sampler (#795) Wing Lian 2023-11-07 20:27:40 -05:00
6dc68a653f use temp_dir kwarg instead Wing Lian 2023-11-06 07:31:46 -05:00
7de6a5639c missing dunder-init Wing Lian 2023-11-06 07:23:31 -05:00
c74f045ba7 chore: lint Wing Lian 2023-11-05 19:49:31 -05:00
0402d19759 make sure to cleanup tmp output_dir for e2e tests Wing Lian 2023-11-05 08:02:31 -05:00
b2430ce670 use accelerate logging for zero/main loggin only Wing Lian 2023-11-06 07:27:42 -05:00
4c834bf25d cleanup verbosity a bit Wing Lian 2023-10-28 14:56:50 -04:00
8056ecd30e add deepspeed-kernels dependency for deepspeed>=0.12.0 (#827) Fabian Preiß 2023-11-05 13:52:56 +01:00
738a057674 Feat: Added Gradio support (#812) Jason Stillerman 2023-11-04 23:59:22 -04:00
cdc71f73c8 update table for rwkv4 support, fix process count for dataset (#822) Wing Lian 2023-11-04 23:45:44 -04:00
6459ac7357 fix: pin autogptq (#818) NanoCode012 2023-11-03 23:14:55 +09:00
964d858da0 fix model parallel (#816) Wing Lian 2023-11-02 21:34:22 -04:00
87e8f13056 repalce linear layers for qlora as well as add peft tensor-parallel Wing Lian 2023-11-01 22:31:02 -04:00
026172eaa8 remove unused code, support adapter for tensor parallel Wing Lian 2023-11-01 20:31:51 -04:00
b3689f73e3 chore: lint Wing Lian 2023-11-01 20:25:10 -04:00
c4664ba8ee tp fixes Wing Lian 2023-11-01 18:50:18 -04:00
75e4fc2825 wip more tp fixes Wing Lian 2023-11-01 01:45:36 -04:00
e13c2fd6b1 getting better Wing Lian 2023-09-08 10:46:11 -04:00
8a21e14a21 load to cpu first Wing Lian 2023-09-08 02:22:29 -04:00
9c52a83403 load model faster w low_cpu_mem_usage Wing Lian 2023-09-08 02:06:12 -04:00
fb8ee37ca6 wip tp Wing Lian 2023-09-08 01:58:15 -04:00
65f3a4f703 tensor-parallel support Wing Lian 2023-09-08 00:47:01 -04:00
10388a8daf fix(tokenizer): update log order after update (#806) NanoCode012 2023-10-31 13:21:20 +09:00
b52e61a574 pretrain fixes for mm llava Wing Lian 2023-10-30 11:03:55 -04:00
9f7e8a971d feat(doc): add dummyoptim faq fix (#802) NanoCode012 2023-10-29 23:06:06 +09:00
637ed095a0 fix(config): Set eos/bos to tokenizer if different (#801) NanoCode012 2023-10-29 21:32:37 +09:00
53f93f67bb fix to set training args so projector properly saves Wing Lian 2023-10-29 06:08:38 -04:00
ef95ea2977 additional args for parity, fix to properly save projector during pretrain Wing Lian 2023-10-29 05:12:34 -04:00
db86e32cf4 enable hf trasfer and add unzip to image docker-cleanup-20231029 Wing Lian 2023-10-29 04:53:14 -04:00
827ec3d274 refactor neft patch to be more re-usable similar to trl's impl (#796) Wing Lian 2023-10-29 04:33:13 -04:00
1321608dc4 add docs and tweak yml Wing Lian 2023-10-28 13:07:59 -04:00
8b79ff0e94 fix eval_steps to be a sane default (#797) Wing Lian 2023-10-27 22:36:30 -04:00
0800885e2f Update to adapt to sharegpt datasets with "assistant" rather than "gp… (#774) MilesQLi 2023-10-27 22:00:16 -04:00
d3193beac3 Fix Deepspeed Zero3 Config (#791) Teknium 2023-10-27 18:57:02 -07:00
2e71ff03a6 Add docker advanced instruction to README (#792) Aleksa Gordić 2023-10-27 14:24:04 +01:00
facc49f32b GitBook: No commit message chanvichetvong 2023-10-26 15:11:00 +00:00
e50ab072e2 Create preprocess CLI (#785) Casper 2023-10-26 15:35:42 +02:00
05bd6f1122 Threaded MultipackDistributedDataloader with prefetched samples (#759) Casper 2023-10-26 07:49:52 +02:00
7ff30c4033 wip Wing Lian 2023-10-25 09:19:19 -04:00
faa46fbcf8 fix code for llava parity, add llama yml Wing Lian 2023-10-24 09:45:47 -04:00
20aa4b57d2 chore(readme): Improve documentation on conversation field (#782) NanoCode012 2023-10-24 12:52:32 +09:00
11d1d607db chore: refactor truthy check and fix mypy (#780) NanoCode012 2023-10-24 12:28:40 +09:00
fdc3e4d505 more fixes to try to get mm working Wing Lian 2023-10-23 23:15:33 -04:00
b885169229 handle load_model splat llava-train Wing Lian 2023-10-23 21:55:05 -04:00
ab9d12ce34 handle dataset loading for multimodal Wing Lian 2023-10-23 21:44:07 -04:00
866774737b WIP llaval support Wing Lian 2023-10-23 20:29:49 -04:00
6c81c61bc4 refactor setup trainer so we can add more hooks (#773) Wing Lian 2023-10-23 17:38:41 -04:00
9b43e7ea15 disable eval table w sample packing in examples (#778) Wing Lian 2023-10-23 09:18:44 -04:00
2d8def68dc simplify by removing duplicate base_model_config (#772) Wing Lian 2023-10-23 01:42:38 -04:00
44c9d0151a Fix: Warn when fullfinetune without adapter (#770) NanoCode012 2023-10-23 04:41:43 +09:00
ca84cca2c0 convert exponential notation lr to floats (#771) Wing Lian 2023-10-22 15:37:03 -04:00
32eeeb5b64 Hotfix for not saving correctly (#762) Casper 2023-10-22 19:22:32 +02:00
afedc470bd Fix: Cannot tokenize with bf16 and on cpu (#766) NanoCode012 2023-10-23 01:32:26 +09:00

... 44 45 46 47 48 ...