axolotl

tocmo0nlord/axolotl

Fork 0

Commit Graph

Select branches

Hide Pull Requests

1947fix

1991test

20230920-btlm

20231212-fixes

20240216-updates

20240307-updates

20240404-lisa-determinism

3181

4bit-optimizers

775-option-to-drop-vs-truncate-on-rows-longer-than-context-length

NanoCode012-patch-1

accelerator-args-builder

activation-offloading-torchtune

activations

activeblue/main

async-grpo-patched-v2

attention_enum

attn-implementation-refactor

attn-patches

autodoc

autogptq-tests

axolotl-ci-hf

base-model-readme-update

benchmark-callbacks-next

bursteratom-doc-faq-update

chat-dataset-tool

chat-template-granite

chore/docstring-distributed

cj_tokenizer_default_prompt_template

cli-cloud-modal-math-hard

cli-refactor

codecov-pulls-only

coderabbitai/docstrings/3e51a68

coderabbitai/docstrings/QVUilv72ojQNaYsCLVNpUpfo2rK1ZU5x90oPNXYz0ZfsWzWSHca36pjgaU5JOtZOA4gNjbjVYxShdRmkm7fGSlW

coderabbitai/docstrings/b234532

colab-misc-fixes

colab-misc-fixes-test

completion-json

compute-perplexity-metrics

cp-sdpa

cuda-12.8.1

custom-modeling

custom-trainer-cls

datasets-351

datasets-refactor

debug-hf-home-cache

deepspeed-low-cpu-mem

deepspeed_0_14_4

destroy-pg

dev-base

device-mesh

devstral-support

dft

diff-transformer

diffusion-custom-loss

diffusion-custom-models

diffusion-next-token-trainer

djsaunde-patch-1

docker-base-nvcr-pytorch

docker-cleanup-20231029

docs-lint-20250212

dpo-spawn-fix

dump-config

dynamic-sft

e2e-fsdp-trainer

embeddings-resize

enable_tp

eos-hell

exp-expand-len

fa-261

fa-check

fa3-hopper

feat/beautiful-readme

feat/glm45

feat/glmflash-other

feat/liger-deepseekv3

feat/linearize

feat/lmeval-baseten

feat/phi_35_vision

feat/pref_liger

feat/soap-optim-v2

feat/spaces-ui

feat/torchao-qlora

feat/wizard

feat_hqq

feature/attn-patches

feature/enable-huggingface-dataset-revision

feature/relora-rebased

fix-ddp_find_unused_parameters

fix-l3-lora

fix-merge-lint-issue

fix-preview

fix/cce-linear

fix/cp-waste

fix/diffusion

fix/doc-key

fix/dpo-labels

fix/eval-accu

fix/gemma3-text-only

fix/gemma3n-text-attention

fix/granite-speech

fix/hpc-root

fix/issue-1-build-deps

fix/issue-2-flash-attn-install

fix/issue-3-telemetry-whitelist

fix/issue-4-deepspeed-optional

fix/issue-5-8-docs

fix/issue-6-default-attention

fix/issue-7-hf-token-check

fix/kd-trainer-num-items

fix/merge-lora-fp32

fix/replace_jackllama

fix/rl-trainer-arg

fix/vllm-version

fix/xformers

fix_kto

fixtypo

flan-no-bos

flash-attn-2_5_5

flash-attn-fix-patches-wo-sample-packing

flex_patching_update

flx_attn_support

fp8

fsdp-defaults

fsdp-fft

fsdp-fix

fsdp-qdora

fsdp2

fsdp2_fp32

fused-mlp-ez

gh-pages

grouped_lr_squashed

grpo-path

grpo-path-v2

grpo-ref-model-cleanup

grpo_liger

hamelsmu-patch-1

hf-trainer-refactor

hymba_multipack2

ia3-peft

iterable-optional

jagged-restart-lr-scheduler-v3

kd-fix-20250519-v2

kd-logits-view

kd-logprob-data

kd-trainer

kd-trainer-2

kd-trainer-pre

kd-trainer-rebased

kd-trainer-v2

kd-trainer-zscore

keep_in_memory

kernelize-scattermoe-lora

kto_fix

kwargs-refactor

latent-space

lhl-moe-aux-loss-free

liger-063

liger-065

liger-dpo

lisa

llama-4-examples

llama-4-z3

llama-dropout

llama-flash-attn-fix

llama-multipack

llama4

llama4-patches

llava

llava-train

llmcompressor-sft

llmcompressor-sft-v2

llmcompressor-sft-wing

lora-fsdp2-doc

lora-kernels-deepspeed

lora-kernels-doc-fix

lora-quant-state-offset

lora_bf16

lora_kernels_fsdp

main

main-base

map-dataset-fetcher-fix

maverick-example

merge-lora-on-complete

merge-lora-tests

merged-2554

mistral-support

mixtral_optimized

mixtral_swiglu

mm2

mm3

mm_mc_chat

modal-upgrade-builder

model-loader-refactor

moekernels

mora

multi-gpu-state

multipack

multipack-dpo

multipack-pretraining

muon-validation

nca-pair

nd_parallel

neft-v2

no-bos-tokens-packing

no-seq-len

no-zero-ds-train

offload-activations-disk

olmo-no-position_ids

online-topk-kd

openorca

openorca-fix-mask

openorca-v2

optimizer-checkpoint

optimizer-compile

optimizers-refactor

packing-attn-limit-fa2-rebased

patch_lora_post_model_load

peft-update

phi-moe

pixtral_integration

pre-commit-update

preprocess_grpo-fix

pretrain-dataset

print_venv

pytest-each-flakey

pytest-skip-s2

q-galore

quantize-ptq-cli

quartodoc

quartodoc-fix

rala

rala-v2

reentrant-w-offloading

refactor-flash-attention

relaxed-recursive-transformers

release-0.10.x

release-0.8.x

release-v0.11.x

release-v0.12.x

release-v0.13.x

release-v0.9.x

remove-gptq-warn

revert-2332-fix_sample_packing

revert-2906-checkpoint-on-step-1

revert-multipack-changes

rl-trainers-sp

runpod-sls

sac

sageattention

save_only_model

scatter_moe

scatter_moe_eric

scattermoe-lora-optim-dtypestest

scattermoe-nanotron

sdpa-cp

sdpa-multipack

seq-parallel-ring

sequence-parallelism

shampoo

shampoo-low_bit

shared-prepared-ci

sharegpt-batched

sharegpt-field-conversations

smaller-rand-model

smol-ci

soap-optim

sp-fix-masking

sp-restore-buffers

sp-rl

sp-rl-v3

split-batches-sizes

sppo

squash_position_ids

ssmi-main

stable

streaming

streaming-on-the-fly-preprocess

streaming-remote-dataset

streaming-v2

swe-rebench-rl-rebase

telemetry

telemetry-opt-in

tensor-parallel

tensorboard-loss-check

testingci

textui

tinyllama-example

tool-mpm

topk-logprobs-triton

torch-211-base

torch_tensor_parallel

tp_support

train-refactor

transformers-4511

transformers-4513

transformers-4573

transformers-4_47_0_v2

transformers-fsdp-check

transformers-itl-refactor

tui

unsloth_modules

update-examples-llama3-ez

update-lgpl

update-vllm

upgrade-liger-test

upgrade-torchao-0.15

upgrade-trl-v0.12.0_2

upgrade_liger-tr4.46.1

uv-first

uv-fixup

vendor-moe

version-dev

vllm-0191

wait-distributed-close

weight-scale-norm

xformers-wo-packing

yayi2

zero3-8bit-lora

v0.1.0

v0.10.0

v0.10.1

v0.11.0

v0.11.0.post1

v0.12.0

v0.12.1

v0.12.2

v0.13.0

v0.13.1

v0.13.2

v0.14.0

v0.15.0

v0.16.0

v0.16.1

v0.2.0

v0.2.1

v0.3.0

v0.4.0

v0.5.0

v0.5.1

v0.5.1.post1

v0.5.2

v0.6.0

v0.7.0

v0.7.1

v0.8.0

v0.8.1

v0.9.0

v0.9.1

v0.9.1.post1

v0.9.2

bd8cab49c9 update path to align with fsdp example mhenrichsen 2023-08-15 19:51:58 +02:00
c01015f33f Fix(config): Update handling of deepspeed config (#404) NanoCode012 2023-08-16 01:22:43 +09:00
72fe3f8e3d Fix(docs): Update flash attn requirements (#409) NanoCode012 2023-08-15 22:40:52 +09:00
47961fdb8b update docs for tokenizer_legacy (#401) Wing Lian 2023-08-15 09:34:42 -04:00
7ad37cb6d7 Fix(template): Remove iPhone/android from Issue template (#407) NanoCode012 2023-08-15 22:32:51 +09:00
29241cf1e4 Ax art (#405) Wing Lian 2023-08-15 08:34:30 -04:00
31db0ecce4 add templates, CoC and contributing guide (#126) lightningRalf 2023-08-15 13:41:05 +02:00
da10af03e9 fix eval steps and strategy (#403) Wing Lian 2023-08-15 07:28:50 -04:00
85cf4f8e2c better handling of empty input ids when tokenizing (#395) Wing Lian 2023-08-15 01:09:59 -04:00
2e22404d2d add utils.data.prepare_dataset Aman Karmani 2023-08-15 04:15:55 +00:00
be294fd605 Feat(doc): Add how to save by epochs (#396) NanoCode012 2023-08-15 13:24:25 +09:00
fc2d6be96d use context manager to run things on rank0 before others (#397) Wing Lian 2023-08-15 00:10:47 -04:00
31079cd5fd smart resize embeddings embeddings-resize Wing Lian 2023-08-07 10:15:10 -04:00
1687be6a35 don't use mask expansion for inference (#392) Wing Lian 2023-08-14 20:52:54 -04:00
41ecb451c2 Feat(doc): Add max_steps to readme (#389) NanoCode012 2023-08-15 00:34:22 +09:00
3c2ad00d07 Feat(config): add max steps (#387) Gabriel Puliatti 2023-08-14 10:19:29 -05:00
5d48a10548 Added "epoch" evaluation_strategy (#388) florian peyron 2023-08-14 16:59:23 +02:00
73a0b6ead5 Feat(config): Add hub_strategy (#386) NanoCode012 2023-08-14 20:12:55 +09:00
63fdb5a7fb Error msg for sharegpt if conv has less than 2 msg (#379) florian peyron 2023-08-14 10:40:40 +02:00
fdffef5940 new llama-2 default settings (#370) mhenrichsen 2023-08-14 10:39:09 +02:00
919246fbc1 don't pass rope_scaling kwarg if it's None (#383) Wing Lian 2023-08-13 18:57:38 -04:00
ffac902c1b bump flash-attn to 2.0.4 for the base docker image (#382) Wing Lian 2023-08-13 17:55:04 -04:00
15f6e57eaa Fix crash when running without CUDA Charles Goddard 2023-08-13 13:19:48 -07:00
956a177678 speed up flash-attn inference feature/attn-patches Aman Karmani 2023-08-13 18:03:38 +00:00
747e84d3bb update flash-attn patch for 70B/GQA and inference using helper from flash-attn tests Aman Karmani 2023-08-13 15:41:44 +00:00
c45a786039 sync xformers patch to follow shared format and be diffable Aman Karmani 2023-08-13 15:41:06 +00:00
70e6c28121 split sdp attn into its own patch Aman Karmani 2023-08-13 15:40:43 +00:00
729c299256 Feat(doc): Improve sharegpt doc (#378) NanoCode012 2023-08-14 00:36:00 +09:00
86a91e260b save tokenizer before training starts (#380) Wing Lian 2023-08-13 11:28:58 -04:00
094fc2c6e6 try to detect accelerate and only use device_map=None in that case (#373) Aman Gupta Karmani 2023-08-12 21:32:07 -07:00
2dafa730ef Create FUNDING.yml Wing Lian 2023-08-13 00:30:34 -04:00
343ac84e5a fix check for flash attn branching (#377) Wing Lian 2023-08-12 22:48:08 -04:00
0c967279ce remove unnecessary local variable Aman Karmani 2023-08-13 01:58:39 +00:00
efb3b2c95e simplify load_tokenizer Aman Karmani 2023-08-13 01:33:38 +00:00
7b55fe6419 improve GPU logging to break out pytorch cache and system mem Aman Karmani 2023-08-13 01:50:32 +00:00
e029ab34ea quiet noise from llama tokenizer by setting pad token earlier Aman Karmani 2023-08-13 01:30:54 +00:00
8cec513447 extract module for working with cfg Aman Karmani 2023-08-13 01:22:20 +00:00
a13e45d548 fix DefaultDict.__or__ Aman Karmani 2023-08-10 03:56:50 +00:00
1afbd8af2d Fix logic errors feature/relora-rebased Charles Goddard 2023-07-25 16:19:53 -07:00
b4f2eea2ed Remove redundant assert Charles Goddard 2023-07-24 23:23:24 -07:00
bbf88b02c1 Fix saving logic Charles Goddard 2023-07-24 22:14:16 -07:00
64a8e04430 Remove local config Charles Goddard 2023-07-24 21:11:52 -07:00
c8f7213bc6 Add CPU offload Charles Goddard 2023-07-24 21:07:36 -07:00
b57238ecec Experimental ReLoRA (+qlora) implementation Charles Goddard 2023-07-24 09:53:27 -07:00
918f1b0dfb revert previous change and build ax images w docker on gpu (#371) Wing Lian 2023-08-12 20:23:00 -04:00
c3fde36ada attempt to run non-base docker builds on regular cpu hosts (#369) Wing Lian 2023-08-12 19:07:38 -04:00
2bb0b78975 Attention mask and position id fixes for packing (#285) Wing Lian 2023-08-12 15:14:56 -04:00
a276c9c88d Fix(save): Save as safetensors (#363) NanoCode012 2023-08-13 01:22:52 +09:00
7019509daa Add wandb_entity to wandb options, update example configs, update README (#361) Morgan McGuire 2023-08-12 17:17:11 +01:00
96bd6ae1c4 Fix(model loading): Warn when model revision is passed to gptq (#364) NanoCode012 2023-08-13 01:16:59 +09:00
e37d9358e6 Fix(message): Improve error message for bad format (#365) NanoCode012 2023-08-13 01:16:18 +09:00
b5212068ac Feat: Add rope scaling (#343) NanoCode012 2023-08-13 00:50:15 +09:00
289d5c403d feat(merge): save tokenizer on merge (#362) NanoCode012 2023-08-13 00:18:10 +09:00
35c8b90306 Merge pull request #355 from tmm1/bitsandbytes-fixes Aman Gupta Karmani 2023-08-11 15:15:38 -07:00
64af21bcb2 set env vars trainer needs for FSDP packing-attn-limit-fa2-rebased Wing Lian 2023-08-11 08:37:33 -04:00
6b5cf8b5ea optimize length reducer from 9m -> <5sec Wing Lian 2023-08-11 08:30:30 -04:00
fae6ed8092 Update README.md on pretraining_dataset (#360) NanoCode012 2023-08-11 12:17:07 +09:00
94d03c8402 Clarify pre-tokenize before multigpu (#359) NanoCode012 2023-08-11 11:27:42 +09:00
79500f358a need to pass total num tokens to trainer too Wing Lian 2023-08-10 19:08:23 -04:00
7e977a9b68 optimization if total_num_tokens is already known Wing Lian 2023-08-10 19:02:28 -04:00
ac4b700daa optimization if total_num_tokens is already known Wing Lian 2023-08-10 19:01:17 -04:00
2565c2f259 async batching for multipack Wing Lian 2023-08-10 18:28:15 -04:00
a07f432d9c calculate cum seq lens with pos_ids instead of mask, simplify packing params, fix distributed barrier Wing Lian 2023-08-10 17:16:01 -04:00
11ddccb80f Merge pull request #356 from tmm1/load_model-args Aman Gupta Karmani 2023-08-09 18:24:34 -07:00
964312199e Merge pull request #354 from tmm1/gpu-util Aman Gupta Karmani 2023-08-09 15:44:18 -07:00
718102271f simplify load_model signature Aman Karmani 2023-08-09 22:35:33 +00:00
f5c11f8262 Merge pull request #350 from tmm1/group-len-false-examples Aman Gupta Karmani 2023-08-09 14:48:48 -07:00
fce40aab23 bump to latest bitsandbytes release with major bug fixes Aman Karmani 2023-08-09 21:47:11 +00:00
9c314101d5 use newer pynvml package Aman Karmani 2023-08-09 21:06:28 +00:00
e303d64728 log GPU memory usage Aman Karmani 2023-08-09 08:10:37 +00:00
57d9bf711c let's not cleanup the cached datasets Wing Lian 2023-08-08 21:27:55 -04:00
26983a1974 fix sampler to prevent overfit w new epochs Wing Lian 2023-08-08 15:34:18 -04:00
1b8747e319 use custom distributed checks Wing Lian 2023-08-08 13:35:04 -04:00
035b3c760c add numba to requirements. Wing Lian 2023-08-08 10:55:29 -04:00
17abbd59e1 previous accelerate is still most performant Wing Lian 2023-08-08 09:46:01 -04:00
6ec76ddb4c fix steps calculation Wing Lian 2023-08-08 05:13:21 -04:00
21d307b15b fix counts by accounting for num devices Wing Lian 2023-08-08 04:13:10 -04:00
58e9dee204 fixes and go back to distributed sampler since batch sampler won't work Wing Lian 2023-08-08 03:49:29 -04:00
4f7c04bae0 more fixes and optimizations Wing Lian 2023-08-08 03:16:00 -04:00
1162b93b6b filter w multiple cpus Wing Lian 2023-08-08 00:50:56 -04:00
21f445d763 more packing and dataset optimizations and fixes Wing Lian 2023-08-08 00:45:24 -04:00
b4d1d22782 note pattern when using groups Aman Karmani 2023-08-07 16:18:42 -07:00
229b9165aa fix test and pylint checks Wing Lian 2023-08-07 09:36:29 -04:00
394a65f11f add unit tests for cum seq lens, add ability to build cu_seq_lens from positional ids, fix prompt test Wing Lian 2023-08-06 17:33:40 -04:00
c70dae63cc add chatml Wing Lian 2023-08-05 22:41:47 -04:00
7712955b35 fix chatml system prompt for openorca, legacy tokenizer opts Wing Lian 2023-08-04 13:57:17 -04:00
f93f0017cd fix flash-attn, xformers, packing, support chatml Wing Lian 2023-08-04 10:09:16 -04:00
0b01da0713 properly calculate max len Wing Lian 2023-08-03 16:12:04 -04:00
b2f7bc7ccd use cumulative seq len with var len flash attn v2 w packing Wing Lian 2023-08-03 15:50:13 -04:00
b8905e2a91 sample_packing_seq_len_multiplier config Wing Lian 2023-08-03 08:24:33 -04:00
7e1edc662a make sure the chunk size is an int Wing Lian 2023-08-03 00:27:33 -04:00
98c9bc69de seq_len_multiple for packing Wing Lian 2023-08-02 23:20:19 -04:00
8378335dc9 limit packing to sequences of max seq len Wing Lian 2023-08-02 22:07:40 -04:00
bdd34c7400 weighted CEL fixes Wing Lian 2023-08-02 21:36:39 -04:00
c6cc54c7d9 weighted CE losses Wing Lian 2023-08-02 15:57:00 -04:00
83f7362480 don't split batches when packing Wing Lian 2023-08-02 08:26:49 -04:00
958d423e7c only process eval dataset for packing if not None Wing Lian 2023-07-30 22:55:17 -04:00
e74eab6e73 add a test for the mask expansion for sequence packing Wing Lian 2023-07-28 12:10:15 -04:00
487abfc769 pass sample packing efficiency to training args Wing Lian 2023-07-26 00:06:28 -04:00
2bee646e85 fix step calc for packing Wing Lian 2023-07-25 23:52:34 -04:00

... 48 49 50 51 52 ...