axolotl

tocmo0nlord/axolotl

Fork 0

Commit Graph

Select branches

Hide Pull Requests

1947fix

1991test

20230920-btlm

20231212-fixes

20240216-updates

20240307-updates

20240404-lisa-determinism

3181

4bit-optimizers

775-option-to-drop-vs-truncate-on-rows-longer-than-context-length

NanoCode012-patch-1

accelerator-args-builder

activation-offloading-torchtune

activations

activeblue/main

async-grpo-patched-v2

attention_enum

attn-implementation-refactor

attn-patches

autodoc

autogptq-tests

axolotl-ci-hf

base-model-readme-update

benchmark-callbacks-next

bursteratom-doc-faq-update

chat-dataset-tool

chat-template-granite

chore/docstring-distributed

cj_tokenizer_default_prompt_template

cli-cloud-modal-math-hard

cli-refactor

codecov-pulls-only

coderabbitai/docstrings/3e51a68

coderabbitai/docstrings/QVUilv72ojQNaYsCLVNpUpfo2rK1ZU5x90oPNXYz0ZfsWzWSHca36pjgaU5JOtZOA4gNjbjVYxShdRmkm7fGSlW

coderabbitai/docstrings/b234532

colab-misc-fixes

colab-misc-fixes-test

completion-json

compute-perplexity-metrics

cp-sdpa

cuda-12.8.1

custom-modeling

custom-trainer-cls

datasets-351

datasets-refactor

debug-hf-home-cache

deepspeed-low-cpu-mem

deepspeed_0_14_4

destroy-pg

dev-base

device-mesh

devstral-support

dft

diff-transformer

diffusion-custom-loss

diffusion-custom-models

diffusion-next-token-trainer

djsaunde-patch-1

docker-base-nvcr-pytorch

docker-cleanup-20231029

docs-lint-20250212

dpo-spawn-fix

dump-config

dynamic-sft

e2e-fsdp-trainer

embeddings-resize

enable_tp

eos-hell

exp-expand-len

fa-261

fa-check

fa3-hopper

feat/beautiful-readme

feat/glm45

feat/glmflash-other

feat/liger-deepseekv3

feat/linearize

feat/lmeval-baseten

feat/phi_35_vision

feat/pref_liger

feat/soap-optim-v2

feat/spaces-ui

feat/torchao-qlora

feat/wizard

feat_hqq

feature/attn-patches

feature/enable-huggingface-dataset-revision

feature/relora-rebased

fix-ddp_find_unused_parameters

fix-l3-lora

fix-merge-lint-issue

fix-preview

fix/cce-linear

fix/cp-waste

fix/diffusion

fix/doc-key

fix/dpo-labels

fix/eval-accu

fix/gemma3-text-only

fix/gemma3n-text-attention

fix/granite-speech

fix/hpc-root

fix/issue-1-build-deps

fix/issue-2-flash-attn-install

fix/issue-3-telemetry-whitelist

fix/issue-4-deepspeed-optional

fix/issue-5-8-docs

fix/issue-6-default-attention

fix/issue-7-hf-token-check

fix/kd-trainer-num-items

fix/merge-lora-fp32

fix/replace_jackllama

fix/rl-trainer-arg

fix/vllm-version

fix/xformers

fix_kto

fixtypo

flan-no-bos

flash-attn-2_5_5

flash-attn-fix-patches-wo-sample-packing

flex_patching_update

flx_attn_support

fp8

fsdp-defaults

fsdp-fft

fsdp-fix

fsdp-qdora

fsdp2

fsdp2_fp32

fused-mlp-ez

gh-pages

grouped_lr_squashed

grpo-path

grpo-path-v2

grpo-ref-model-cleanup

grpo_liger

hamelsmu-patch-1

hf-trainer-refactor

hymba_multipack2

ia3-peft

iterable-optional

jagged-restart-lr-scheduler-v3

kd-fix-20250519-v2

kd-logits-view

kd-logprob-data

kd-trainer

kd-trainer-2

kd-trainer-pre

kd-trainer-rebased

kd-trainer-v2

kd-trainer-zscore

keep_in_memory

kernelize-scattermoe-lora

kto_fix

kwargs-refactor

latent-space

lhl-moe-aux-loss-free

liger-063

liger-065

liger-dpo

lisa

llama-4-examples

llama-4-z3

llama-dropout

llama-flash-attn-fix

llama-multipack

llama4

llama4-patches

llava

llava-train

llmcompressor-sft

llmcompressor-sft-v2

llmcompressor-sft-wing

lora-fsdp2-doc

lora-kernels-deepspeed

lora-kernels-doc-fix

lora-quant-state-offset

lora_bf16

lora_kernels_fsdp

main

main-base

map-dataset-fetcher-fix

maverick-example

merge-lora-on-complete

merge-lora-tests

merged-2554

mistral-support

mixtral_optimized

mixtral_swiglu

mm2

mm3

mm_mc_chat

modal-upgrade-builder

model-loader-refactor

moekernels

mora

multi-gpu-state

multipack

multipack-dpo

multipack-pretraining

muon-validation

nca-pair

nd_parallel

neft-v2

no-bos-tokens-packing

no-seq-len

no-zero-ds-train

offload-activations-disk

olmo-no-position_ids

online-topk-kd

openorca

openorca-fix-mask

openorca-v2

optimizer-checkpoint

optimizer-compile

optimizers-refactor

packing-attn-limit-fa2-rebased

patch_lora_post_model_load

peft-update

phi-moe

pixtral_integration

pre-commit-update

preprocess_grpo-fix

pretrain-dataset

print_venv

pytest-each-flakey

pytest-skip-s2

q-galore

quantize-ptq-cli

quartodoc

quartodoc-fix

rala

rala-v2

reentrant-w-offloading

refactor-flash-attention

relaxed-recursive-transformers

release-0.10.x

release-0.8.x

release-v0.11.x

release-v0.12.x

release-v0.13.x

release-v0.9.x

remove-gptq-warn

revert-2332-fix_sample_packing

revert-2906-checkpoint-on-step-1

revert-multipack-changes

rl-trainers-sp

runpod-sls

sac

sageattention

save_only_model

scatter_moe

scatter_moe_eric

scattermoe-lora-optim-dtypestest

scattermoe-nanotron

sdpa-cp

sdpa-multipack

seq-parallel-ring

sequence-parallelism

shampoo

shampoo-low_bit

shared-prepared-ci

sharegpt-batched

sharegpt-field-conversations

smaller-rand-model

smol-ci

soap-optim

sp-fix-masking

sp-restore-buffers

sp-rl

sp-rl-v3

split-batches-sizes

sppo

squash_position_ids

ssmi-main

stable

streaming

streaming-on-the-fly-preprocess

streaming-remote-dataset

streaming-v2

swe-rebench-rl-rebase

telemetry

telemetry-opt-in

tensor-parallel

tensorboard-loss-check

testingci

textui

tinyllama-example

tool-mpm

topk-logprobs-triton

torch-211-base

torch_tensor_parallel

tp_support

train-refactor

transformers-4511

transformers-4513

transformers-4573

transformers-4_47_0_v2

transformers-fsdp-check

transformers-itl-refactor

tui

unsloth_modules

update-examples-llama3-ez

update-lgpl

update-vllm

upgrade-liger-test

upgrade-torchao-0.15

upgrade-trl-v0.12.0_2

upgrade_liger-tr4.46.1

uv-first

uv-fixup

vendor-moe

version-dev

vllm-0191

wait-distributed-close

weight-scale-norm

xformers-wo-packing

yayi2

zero3-8bit-lora

v0.1.0

v0.10.0

v0.10.1

v0.11.0

v0.11.0.post1

v0.12.0

v0.12.1

v0.12.2

v0.13.0

v0.13.1

v0.13.2

v0.14.0

v0.15.0

v0.16.0

v0.16.1

v0.2.0

v0.2.1

v0.3.0

v0.4.0

v0.5.0

v0.5.1

v0.5.1.post1

v0.5.2

v0.6.0

v0.7.0

v0.7.1

v0.8.0

v0.8.1

v0.9.0

v0.9.1

v0.9.1.post1

v0.9.2

c1382e79b6 Create multi-node.md (#613) Maxime 2023-09-21 04:02:16 +02:00
5d931cc042 Only run tests when a change to python files is made (#614) Maxime 2023-09-21 04:02:04 +02:00
ca476d7f8e don't load the actual model when pre-loading to load modeling code 20230920-btlm Wing Lian 2023-09-20 13:37:32 -04:00
ec0958f4f8 Update requirements.txt (#610) Javier 2023-09-20 14:40:49 +02:00
faecff9798 support to disable exllama for gptq (#604) Wing Lian 2023-09-19 17:51:08 -04:00
aa656e04bd Delete duplicate lines (#606) bofeng huang 2023-09-19 22:40:05 +02:00
b53e77775b update dockerfile to not build evoformer since it fails the build (#607) Wing Lian 2023-09-19 16:28:29 -04:00
674c57692d more sane defaults for openllama 3b used for quickstarts (#602) Wing Lian 2023-09-19 09:15:10 -04:00
1eebbd09c3 improve handling for empty text on the tokenization step (#502) Wing Lian 2023-09-19 08:09:56 -04:00
62a774140b Fix for check with cfg and merge_lora (#600) Wing Lian 2023-09-18 21:14:32 -04:00
31b9e0c6e8 minor tweaks to simplify (#597) Wing Lian 2023-09-18 11:45:44 -04:00
6b9b229356 btlm and falcon monkey patches for flash attn (#566) Wing Lian 2023-09-17 13:49:18 -04:00
131afdbd89 add bf16 check (#587) Wing Lian 2023-09-17 13:49:03 -04:00
00dce35fb2 Feat(data): Allow loading local csv and text (#594) NanoCode012 2023-09-18 00:32:27 +09:00
b15b19eb8d gather/broadcast the max value of the packing efficiency automatically (#463) Wing Lian 2023-09-17 11:08:18 -04:00
ab534d75ba don't add position_ids for evals (#591) Wing Lian 2023-09-16 16:11:57 -04:00
21ec195c9f optionally configure sample packing for evals (#589) Wing Lian 2023-09-16 00:09:48 -04:00
62eaee7649 make phi training work with Loras (#588) Wing Lian 2023-09-15 20:51:55 -04:00
be75668400 set fsdp state dict (#584) Jan Philipp Harries 2023-09-15 23:47:36 +02:00
aeec7c4688 pop block_cls since it's not an actual kwarg Wing Lian 2023-09-15 15:54:06 -04:00
360788296a don't resize embeddings if it's already large enough (#577) Wing Lian 2023-09-15 15:47:09 -04:00
12a2dbbc2c Support Sample packing for phi arch (#586) Wing Lian 2023-09-15 15:46:54 -04:00
3a2edc85c3 Feat(doc): Add features to doc (#583) NanoCode012 2023-09-16 01:14:15 +09:00
f7a22632d7 support custom field for completion from yml (#580) Wing Lian 2023-09-15 07:48:21 -04:00
1aa400721e Fix Codellama examples (#582) Doan Minh Phuong 2023-09-15 15:19:13 +07:00
8dcd40ac78 prevent cli functions from getting fired on import (#581) Wing Lian 2023-09-15 04:03:32 -04:00
a5a625f47e update support matrix with btlm and phi (#579) Wing Lian 2023-09-15 02:46:15 -04:00
861cecac2a refactor scripts/finetune.py into new cli modules (#550) Wing Lian 2023-09-15 01:43:52 -04:00
1078d3eae7 E2e passing tests (#576) Wing Lian 2023-09-15 01:03:49 -04:00
24146733db E2e device cuda (#575) Wing Lian 2023-09-14 22:49:27 -04:00
9218ebecd2 e2e testing (#574) Wing Lian 2023-09-14 21:56:11 -04:00
228420972e Phi examples (#569) Wing Lian 2023-09-14 11:17:47 -04:00
c6d870b91d mypy wandb ignore (#572) Wing Lian 2023-09-14 11:17:30 -04:00
115795079d remove columns after tokenizing for pretraining (#571) Wing Lian 2023-09-14 11:08:22 -04:00
3b18c963cc set auto for other params that hf trainer sets for ds. include zero1 json (#570) Wing Lian 2023-09-14 11:04:37 -04:00
3fbde762ab fix save_steps so it doesn't get duplicated (#567) Wing Lian 2023-09-13 20:40:33 -04:00
f6060a664e Model parallel (#538) Wing Lian 2023-09-13 11:45:30 -04:00
a4e1bb6606 let hf trainer handle torch compile (#516) Wing Lian 2023-09-13 11:42:12 -04:00
36e53c7442 improve how we setup eval/save strategies and steps (#547) Wing Lian 2023-09-13 11:37:23 -04:00
e7aa7b1a1e gracefully handle length feature used for group by (#565) Wing Lian 2023-09-13 11:23:30 -04:00
e5bb22a56b add optimization for group-by-len (#563) Wing Lian 2023-09-13 10:57:12 -04:00
fdb777bc06 check for the existence of the default accelerate config that can create headaches (#561) Wing Lian 2023-09-13 10:38:28 -04:00
bf0804447c fix wandb so mypy doesn't complain (#562) Wing Lian 2023-09-13 10:36:16 -04:00
5b67ea98a6 Add training callback to send predictions to WandB table (#521) Glavin Wiechert 2023-09-13 10:51:08 -03:00
2f586d18db Fix pretraining with iterable/streaming Dataset (#556) Jan Philipp Harries 2023-09-13 06:16:40 +02:00
9845c5e12d document that packaging needs to be installed before flash-attn (#559) Wing Lian 2023-09-12 12:18:30 -04:00
772cd870d4 fix the sed command to replace the version w the tag v0.3.0 Wing Lian 2023-09-11 13:44:19 -04:00
6c5fbe6223 add long_description for pypi push (#555) Wing Lian 2023-09-11 13:34:29 -04:00
bcbc9597e9 replace tags, build dist for pypi publish (#553) Wing Lian 2023-09-11 13:25:41 -04:00
6d57f2f0f0 ergonomic update to optimizer config doc (#548) The Objective Dad 2023-09-11 11:35:45 -05:00
20ed4c1f9e pypi on tag push (#552) Wing Lian 2023-09-11 10:33:42 -04:00
c5dedb17ad remove with section, doesn't seem to work (#551) Wing Lian 2023-09-11 10:27:17 -04:00
b56503d423 publish to pypi workflow on tagged release (#549) Wing Lian 2023-09-11 09:44:47 -04:00
a94f9cb99e fix for quant config from model (#540) Wing Lian 2023-09-10 12:40:52 -04:00
c1921c9acb Update requirements.txt (#543) dongxiaolong 2023-09-09 04:07:11 +08:00
0b4cf5bc8c workaround for md5 variations (#533) Wing Lian 2023-09-08 16:01:05 -04:00
78ee2cdab2 add git environment variables to compose: avoid checkout failure error 128 on build (#534) SlapDrone 2023-09-08 21:59:49 +02:00
34c0a86a11 update readme to point to direct link to runpod template, cleanup install instrucitons (#532) Wing Lian 2023-09-08 11:58:54 -04:00
5e2d8a42d9 Adding NCCL Timeout Guide (#536) The Objective Dad 2023-09-08 10:57:47 -05:00
e30f1e3cf7 Early stopping metric (#537) Wing Lian 2023-09-08 11:57:02 -04:00
343714972b recommend padding when using sample packing (#531) Wing Lian 2023-09-06 17:00:21 -04:00
245c5c41e2 log rank too (#527) Wing Lian 2023-09-06 08:37:51 -04:00
a546ca2813 misc fixes/improvements (#513) Wing Lian 2023-09-05 16:40:13 -04:00
881d333b84 wip for new datasets abstractions datasets-refactor Wing Lian 2023-09-03 22:21:40 -04:00
3355706e22 Add support for GPTQ using native transformers/peft (#468) Wing Lian 2023-09-05 12:43:22 -04:00
daa4faca12 Merge pull request #520 from bdashore3/sharegpt-fixes mhenrichsen 2023-09-05 09:02:55 +02:00
fc8766e502 reorg a bit Aman Karmani 2023-09-05 02:21:24 +00:00
72a6fe1c1f use flash_attn rmsnorm when available (#526) Aman Gupta Karmani 2023-09-04 19:44:51 -04:00
5fe30b1497 use flash_attn xentropy when available (#525) Aman Gupta Karmani 2023-09-04 17:49:16 -04:00
44454ae4c4 move is_llama_derived_model into normalize_config (#524) Aman Gupta Karmani 2023-09-04 00:19:03 -04:00
09f154397e No gather single gpu (#523) Wing Lian 2023-09-03 23:24:28 -04:00
83d904a27d fix the context manager call multi-gpu-state Wing Lian 2023-09-03 22:49:28 -04:00
5e4a760ad8 start to swap out for accelerate partial state Wing Lian 2023-09-03 22:41:29 -04:00
995557bdf3 Prompters: ShareGPT: Allow for custom system prompts kingbri 2023-09-01 13:48:33 -04:00
0026fcc3df remove torch install for now autogptq-tests Wing Lian 2023-09-01 08:15:22 -07:00
1991946c5a fix: bad dtype for full finetune (#504) Maxime 2023-09-01 16:11:45 +02:00
f51c9c56c6 Fix(doc): Inform Windows users to use WSL/docker (#518) NanoCode012 2023-09-01 16:08:21 +09:00
7710e81f50 log supervised token count (#448) Wing Lian 2023-08-31 15:45:23 -07:00
48434bec54 Debug tokenization output: Add ability to output text only (no tokens), and/or specify num samples to see (#511) Tom Jobbins 2023-08-31 22:26:52 +01:00
396a7a74fc Added advanced DDP args (#515) Jan Philipp Harries 2023-08-31 19:37:47 +02:00
b21e4a20fe split train from other cli options (#503) Wing Lian 2023-08-30 22:01:47 -07:00
42f9642792 Changed Bench Eval to report metrics correctly by split. Added total accuracy and renamed previously used bench_accuracy to bench_average_accuracy. (#512) Alpay Ariyak 2023-08-31 01:00:50 -04:00
53ce90d21e add sync_model_states parameter to fix resume from checkpoint with fsdp fsdp-defaults Wing Lian 2023-08-15 01:01:01 -04:00
c56b450cf5 drop empty tokenized rows too (#509) Wing Lian 2023-08-30 06:55:26 -07:00
1e07c162f1 set zero3 optimizer betas to auto so they inherit from HF trainer config (#507) Aman Gupta Karmani 2023-08-30 08:10:33 -04:00
b448c77148 address pr feedback Wing Lian 2023-08-29 22:45:22 -07:00
76576323df add eval benchmark callback (#441) Wing Lian 2023-08-29 13:24:19 -07:00
c820d04669 gptq doesn't play well with sample packing Wing Lian 2023-08-29 12:15:31 -07:00
588cd65a64 fix setup.py to use extra index url Wing Lian 2023-08-23 22:08:11 -04:00
caa80e891d don't need explicit peft install for tests Wing Lian 2023-08-23 21:41:42 -04:00
ac37753aa2 remove old gptq docker Wing Lian 2023-08-23 21:38:45 -04:00
a29560004b more tweaks and add yml Wing Lian 2023-08-23 21:35:12 -04:00
1deb767fe8 auto gptq support Wing Lian 2023-08-23 21:16:48 -04:00
548787daae customizable ascii art (#506) Wing Lian 2023-08-29 10:13:42 -07:00
c3de28942c fix for gather across multiple gpus benchmark-callbacks-next Wing Lian 2023-08-29 06:57:28 -07:00
5ac3392075 support for datasets with multiple names (#480) Wing Lian 2023-08-29 06:18:17 -07:00
e356b297cb remove --force-reinstall from Dockerfile to ensure correct pytorch version (#492) Aman Gupta Karmani 2023-08-29 09:17:51 -04:00
48c56470d0 Fix(doc): Clarify no amp to full yaml docs (#496) NanoCode012 2023-08-29 22:17:37 +09:00
36b2e1cfee tweak: use default config file when only one file is present (#501) Maxime 2023-08-29 15:17:10 +02:00
125cccb786 Refactor train cfg cli (#499) Wing Lian 2023-08-29 05:37:53 -07:00

... 46 47 48 49 50 ...