axolotl

tocmo0nlord/axolotl

Fork 0

Commit Graph

Select branches

Hide Pull Requests

1947fix

1991test

20230920-btlm

20231212-fixes

20240216-updates

20240307-updates

20240404-lisa-determinism

3181

4bit-optimizers

775-option-to-drop-vs-truncate-on-rows-longer-than-context-length

NanoCode012-patch-1

accelerator-args-builder

activation-offloading-torchtune

activations

activeblue/main

async-grpo-patched-v2

attention_enum

attn-implementation-refactor

attn-patches

autodoc

autogptq-tests

axolotl-ci-hf

base-model-readme-update

benchmark-callbacks-next

bursteratom-doc-faq-update

chat-dataset-tool

chat-template-granite

chore/docstring-distributed

cj_tokenizer_default_prompt_template

cli-cloud-modal-math-hard

cli-refactor

codecov-pulls-only

coderabbitai/docstrings/3e51a68

coderabbitai/docstrings/QVUilv72ojQNaYsCLVNpUpfo2rK1ZU5x90oPNXYz0ZfsWzWSHca36pjgaU5JOtZOA4gNjbjVYxShdRmkm7fGSlW

coderabbitai/docstrings/b234532

colab-misc-fixes

colab-misc-fixes-test

completion-json

compute-perplexity-metrics

cp-sdpa

cuda-12.8.1

custom-modeling

custom-trainer-cls

datasets-351

datasets-refactor

debug-hf-home-cache

deepspeed-low-cpu-mem

deepspeed_0_14_4

destroy-pg

dev-base

device-mesh

devstral-support

dft

diff-transformer

diffusion-custom-loss

diffusion-custom-models

diffusion-next-token-trainer

djsaunde-patch-1

docker-base-nvcr-pytorch

docker-cleanup-20231029

docs-lint-20250212

dpo-spawn-fix

dump-config

dynamic-sft

e2e-fsdp-trainer

embeddings-resize

enable_tp

eos-hell

exp-expand-len

fa-261

fa-check

fa3-hopper

feat/beautiful-readme

feat/glm45

feat/glmflash-other

feat/liger-deepseekv3

feat/linearize

feat/lmeval-baseten

feat/phi_35_vision

feat/pref_liger

feat/soap-optim-v2

feat/spaces-ui

feat/torchao-qlora

feat/wizard

feat_hqq

feature/attn-patches

feature/enable-huggingface-dataset-revision

feature/relora-rebased

fix-ddp_find_unused_parameters

fix-l3-lora

fix-merge-lint-issue

fix-preview

fix/cce-linear

fix/cp-waste

fix/diffusion

fix/doc-key

fix/dpo-labels

fix/eval-accu

fix/gemma3-text-only

fix/gemma3n-text-attention

fix/granite-speech

fix/hpc-root

fix/issue-1-build-deps

fix/issue-2-flash-attn-install

fix/issue-3-telemetry-whitelist

fix/issue-4-deepspeed-optional

fix/issue-5-8-docs

fix/issue-6-default-attention

fix/issue-7-hf-token-check

fix/kd-trainer-num-items

fix/merge-lora-fp32

fix/replace_jackllama

fix/rl-trainer-arg

fix/vllm-version

fix/xformers

fix_kto

fixtypo

flan-no-bos

flash-attn-2_5_5

flash-attn-fix-patches-wo-sample-packing

flex_patching_update

flx_attn_support

fp8

fsdp-defaults

fsdp-fft

fsdp-fix

fsdp-qdora

fsdp2

fsdp2_fp32

fused-mlp-ez

gh-pages

grouped_lr_squashed

grpo-path

grpo-path-v2

grpo-ref-model-cleanup

grpo_liger

hamelsmu-patch-1

hf-trainer-refactor

hymba_multipack2

ia3-peft

iterable-optional

jagged-restart-lr-scheduler-v3

kd-fix-20250519-v2

kd-logits-view

kd-logprob-data

kd-trainer

kd-trainer-2

kd-trainer-pre

kd-trainer-rebased

kd-trainer-v2

kd-trainer-zscore

keep_in_memory

kernelize-scattermoe-lora

kto_fix

kwargs-refactor

latent-space

lhl-moe-aux-loss-free

liger-063

liger-065

liger-dpo

lisa

llama-4-examples

llama-4-z3

llama-dropout

llama-flash-attn-fix

llama-multipack

llama4

llama4-patches

llava

llava-train

llmcompressor-sft

llmcompressor-sft-v2

llmcompressor-sft-wing

lora-fsdp2-doc

lora-kernels-deepspeed

lora-kernels-doc-fix

lora-quant-state-offset

lora_bf16

lora_kernels_fsdp

main

main-base

map-dataset-fetcher-fix

maverick-example

merge-lora-on-complete

merge-lora-tests

merged-2554

mistral-support

mixtral_optimized

mixtral_swiglu

mm2

mm3

mm_mc_chat

modal-upgrade-builder

model-loader-refactor

moekernels

mora

multi-gpu-state

multipack

multipack-dpo

multipack-pretraining

muon-validation

nca-pair

nd_parallel

neft-v2

no-bos-tokens-packing

no-seq-len

no-zero-ds-train

offload-activations-disk

olmo-no-position_ids

online-topk-kd

openorca

openorca-fix-mask

openorca-v2

optimizer-checkpoint

optimizer-compile

optimizers-refactor

packing-attn-limit-fa2-rebased

patch_lora_post_model_load

peft-update

phi-moe

pixtral_integration

pre-commit-update

preprocess_grpo-fix

pretrain-dataset

print_venv

pytest-each-flakey

pytest-skip-s2

q-galore

quantize-ptq-cli

quartodoc

quartodoc-fix

rala

rala-v2

reentrant-w-offloading

refactor-flash-attention

relaxed-recursive-transformers

release-0.10.x

release-0.8.x

release-v0.11.x

release-v0.12.x

release-v0.13.x

release-v0.9.x

remove-gptq-warn

revert-2332-fix_sample_packing

revert-2906-checkpoint-on-step-1

revert-multipack-changes

rl-trainers-sp

runpod-sls

sac

sageattention

save_only_model

scatter_moe

scatter_moe_eric

scattermoe-lora-optim-dtypestest

scattermoe-nanotron

sdpa-cp

sdpa-multipack

seq-parallel-ring

sequence-parallelism

shampoo

shampoo-low_bit

shared-prepared-ci

sharegpt-batched

sharegpt-field-conversations

smaller-rand-model

smol-ci

soap-optim

sp-fix-masking

sp-restore-buffers

sp-rl

sp-rl-v3

split-batches-sizes

sppo

squash_position_ids

ssmi-main

stable

streaming

streaming-on-the-fly-preprocess

streaming-remote-dataset

streaming-v2

swe-rebench-rl-rebase

telemetry

telemetry-opt-in

tensor-parallel

tensorboard-loss-check

testingci

textui

tinyllama-example

tool-mpm

topk-logprobs-triton

torch-211-base

torch_tensor_parallel

tp_support

train-refactor

transformers-4511

transformers-4513

transformers-4573

transformers-4_47_0_v2

transformers-fsdp-check

transformers-itl-refactor

tui

unsloth_modules

update-examples-llama3-ez

update-lgpl

update-vllm

upgrade-liger-test

upgrade-torchao-0.15

upgrade-trl-v0.12.0_2

upgrade_liger-tr4.46.1

uv-first

uv-fixup

vendor-moe

version-dev

vllm-0191

wait-distributed-close

weight-scale-norm

xformers-wo-packing

yayi2

zero3-8bit-lora

v0.1.0

v0.10.0

v0.10.1

v0.11.0

v0.11.0.post1

v0.12.0

v0.12.1

v0.12.2

v0.13.0

v0.13.1

v0.13.2

v0.14.0

v0.15.0

v0.16.0

v0.16.1

v0.2.0

v0.2.1

v0.3.0

v0.4.0

v0.5.0

v0.5.1

v0.5.1.post1

v0.5.2

v0.6.0

v0.7.0

v0.7.1

v0.8.0

v0.8.1

v0.9.0

v0.9.1

v0.9.1.post1

v0.9.2

945f2e5029 better handling so that all devices have the same dataloader len Wing Lian 2023-07-25 22:18:34 -04:00
daed942fe9 fix rounding of len of batches to int Wing Lian 2023-07-25 10:29:49 -04:00
df3eb645da better handling of variance in multipack dataloader length and trainer hanging when it runs out of data Wing Lian 2023-07-25 10:22:05 -04:00
32fed7039d optimized expand mask fn Wing Lian 2023-07-24 17:11:02 -04:00
7d7b5ebd71 more fixes for 4k and optimizations Wing Lian 2023-07-23 23:05:02 -04:00
4b7ad9927f validation for sample packing and doc Wing Lian 2023-07-22 03:35:06 -04:00
fedcf5a089 Update src/axolotl/utils/dataloader.py Wing Lian 2023-07-22 03:11:20 -04:00
2f2974196d fix for position_ids w packing Wing Lian 2023-07-21 20:31:54 -04:00
2e295c9f94 use accelerator prepare for dataloader Wing Lian 2023-07-19 22:58:16 -04:00
4ab9ab79fd use distributed sampler, avoid accelerate prepare Wing Lian 2023-07-19 12:16:19 -04:00
b02484a83e more fixes for sample packing Wing Lian 2023-07-18 22:27:37 -04:00
58045f0816 more fixes, position_ids seems broken Wing Lian 2023-07-18 16:47:08 -04:00
66774011c4 est total tokens, fix field loop Wing Lian 2023-07-18 11:30:07 -04:00
41d4992029 more fixes for dataloader integration Wing Lian 2023-07-18 10:50:40 -04:00
762f1b08db add position_ids back Wing Lian 2023-07-18 01:50:41 -04:00
3aba4c5d7c use multi pack dataloader w random sampler Wing Lian 2023-07-17 23:44:14 -04:00
ffd96839cf don't move masks to cpu Wing Lian 2023-07-17 11:08:43 -04:00
ef9bf7ad73 fix expand mask for multiple batch items, make sure we pad position_ids Wing Lian 2023-07-17 06:17:28 -04:00
4964b0d345 set position ids and use block diagonal attn mask Wing Lian 2023-07-17 01:56:32 -04:00
36b0e30a9d fix attetion mask with packing Wing Lian 2023-07-15 10:38:01 -04:00
9f99104038 update comment for group_by_length Aman Karmani 2023-08-07 01:04:56 -07:00
36fefcf94b set group_by_length to false in examples Aman Karmani 2023-08-06 23:59:09 -07:00
176b888a63 ensure enable_input_require_grads is called on model before getting the peft model (#345) Wing Lian 2023-08-06 18:13:10 -04:00
3392270544 experimental llama 2 chat support (#296) Jan Philipp Harries 2023-08-06 23:40:52 +02:00
bb53a165f5 add a basic ds zero3 config (#347) Wing Lian 2023-08-06 17:19:51 -04:00
10405b9995 Update XFormers Attention Monkeypatch to handle Llama-2 70B (GQA) (#339) ssmi153 2023-08-07 03:09:04 +12:00
9793faf6dc pre-commit formatting fixes ssmi-main Wing Lian 2023-08-05 22:46:02 -04:00
c93655c0a3 Added Orca Mini prompt strategy (#263) Jan Philipp Harries 2023-08-05 20:16:41 +02:00
64852ae15a Whitespace bug fix ssmi153 2023-08-05 15:08:44 +12:00
1fed74b1d9 Catch configs without pretraining_tp ssmi153 2023-08-05 11:45:12 +12:00
a300a4db1d Fix XFormers attention for Llama-2 70B (GQA) ssmi153 2023-08-05 11:01:44 +12:00
fe285430bc optimize the iteration when tokenizeing large datasets (#332) Wing Lian 2023-08-04 12:12:05 -04:00
0d2e34f056 Merge pull request #336 from tmm1/flash-attn Aman Gupta Karmani 2023-08-03 16:25:30 -07:00
b56a6c0101 Merge pull request #337 from tmm1/readme-fix Aman Gupta Karmani 2023-08-03 15:14:17 -07:00
2eda9e02a9 fix typo Aman Karmani 2023-08-03 21:04:12 +00:00
78b9efb7f4 scope flash-attn+qlora fix correctly, scope to llama, add comment Aman Karmani 2023-08-03 19:19:39 +00:00
312a9fad07 move flash-attn monkey patch alongside the others Aman Karmani 2023-08-03 17:20:49 +00:00
58d665943e python 3.10 and 3.11 both work fine, as does pytorch 2.1.0.dev Aman Karmani 2023-08-03 16:47:25 +00:00
cc7e80026e there is no configs folder Aman Karmani 2023-08-03 16:31:37 +00:00
dc71d8872a feat/llama-2 examples (#319) mhenrichsen 2023-08-03 12:22:48 +02:00
248bf90f89 ensure flash-attn fixes happen in both adapter/lora modes, and use torch_dtype Aman Karmani 2023-08-02 20:15:03 +00:00
77085ea24e qlora w flash attention fixes (#333) Wing Lian 2023-08-01 23:26:16 -04:00
db2a3586f3 add peft install back since it doesn't get installed by setup.py (#331) Wing Lian 2023-07-31 16:31:53 -04:00
6c9a87c8ee pin accelerate so it works with llama2 (#330) Wing Lian 2023-07-30 22:20:06 -04:00
894cba09f3 fix FSDP save of final model (#329) Wing Lian 2023-07-30 21:46:44 -04:00
41a4d15d43 update README for updated docker images (#328) Wing Lian 2023-07-28 16:50:03 -04:00
2c37bf6c21 Prune cuda117 (#327) Wing Lian 2023-07-26 16:27:49 -04:00
9f69c4d8c1 latest HEAD of accelerate causes 0 loss immediately w FSDP (#321) Wing Lian 2023-07-24 11:23:56 -04:00
3d4984b9a5 update prompts for open orca to match the paper (#317) Wing Lian 2023-07-22 13:49:11 -04:00
ff7f18d1ed disable gh cache for first step of docker builds too Wing Lian 2023-07-22 11:46:37 -04:00
cf62cfd661 add runpod envs to .bashrc, fix bnb env (#316) Wing Lian 2023-07-22 10:09:38 -04:00
c5df969262 don't use the gha cache w docker Wing Lian 2023-07-22 08:46:21 -04:00
40a53ff181 Merge pull request #307 from OpenAccess-AI-Collective/xgen-user-sharegpt-tokens Wing Lian 2023-07-22 04:10:38 -04:00
dcdec44347 Merge pull request #306 from ethanhs/xgen Wing Lian 2023-07-22 04:10:18 -04:00
3ffb018a4c Merge pull request #313 from OpenAccess-AI-Collective/tokenizer-llama2-embeddings Wing Lian 2023-07-22 04:09:59 -04:00
a94f2eecb1 Merge pull request #299 from OpenAccess-AI-Collective/flash-attention-2 Wing Lian 2023-07-22 04:07:48 -04:00
1066751358 don't resize embeddings to multiples of 32x by default Wing Lian 2023-07-22 01:52:38 -04:00
1b63bf13bc Merge pull request #308 from OpenAccess-AI-Collective/apache2-license Wing Lian 2023-07-21 09:50:14 -04:00
5cce2a42ff add apache 2.0 license Wing Lian 2023-07-21 09:49:29 -04:00
2a428e8014 better handling since xgen tokenizer breaks with convert_tokens_to_ids Wing Lian 2023-07-21 09:24:11 -04:00
cdf85fdbd5 pin flash attention 2 to the fix for backwards pass Wing Lian 2023-07-21 08:18:53 -04:00
9b790d359b flash attention 2 Wing Lian 2023-07-20 00:00:49 -04:00
38811434e6 Add XGen info to README and example config Ethan Smith 2023-07-21 00:44:50 -07:00
06c61d6f13 Merge pull request #304 from OpenAccess-AI-Collective/NanoCode012-patch-1 NanoCode012 2023-07-21 13:39:45 +09:00
262dc29df2 Merge pull request #300 from OpenAccess-AI-Collective/pytorch-201 Wing Lian 2023-07-21 00:28:38 -04:00
165907fddb Fix(readme): Improve wording for push model NanoCode012 2023-07-21 11:28:35 +09:00
a032c9f452 fix sdp attention to use the flash/mem-efficient context manaager Wing Lian 2023-07-20 01:05:48 -04:00
b06d3e3645 explicitly pin flash attention 1 to v1.0.9 Wing Lian 2023-07-20 01:02:08 -04:00
c58034d48c use pytorch 2.0.1 Wing Lian 2023-07-20 00:47:13 -04:00
28fd429bcf Merge pull request #293 from NanoCode012/fix/tokenize-speed NanoCode012 2023-07-19 11:02:04 +09:00
45ac7c4f88 feat: use multi-core NanoCode012 2023-07-19 10:16:54 +09:00
edd6980dd9 Merge pull request #289 from OpenAccess-AI-Collective/hf_transfer Wing Lian 2023-07-17 15:08:06 -04:00
dc6d25124d Merge pull request #288 from OpenAccess-AI-Collective/NanoCode012-patch-1 Wing Lian 2023-07-17 14:46:43 -04:00
6dd2e7d671 add hf_transfer to requirements for faster hf upload Wing Lian 2023-07-17 14:44:48 -04:00
b64f411849 fix(readme): remove accelerate config NanoCode012 2023-07-18 01:31:02 +09:00
03a59c1ed4 Merge pull request #287 from OpenAccess-AI-Collective/dataclass-fix Wing Lian 2023-07-17 06:09:23 -04:00
ebaec3c406 fix axolotl training args dataclass annotation Wing Lian 2023-07-17 04:57:02 -04:00
73e70e3996 Merge pull request #286 from OpenAccess-AI-Collective/logging-docker-fixes Wing Lian 2023-07-17 04:26:39 -04:00
d75adb9835 misc fixes Wing Lian 2023-07-17 03:00:27 -04:00
02224668c3 Merge pull request #283 from OpenAccess-AI-Collective/docker-git-fetch Wing Lian 2023-07-17 02:17:00 -04:00
f162f3c7cc set transformers cache env var in docker image Wing Lian 2023-07-16 23:03:54 -04:00
eca3531329 git fetch fix for docker Wing Lian 2023-07-16 22:25:05 -04:00
6f16c4569d Merge pull request #276 from theobjectivedad/logging_enhancement Wing Lian 2023-07-16 17:04:52 -04:00
0bd09c077d Merge pull request #280 from teknium1/main Wing Lian 2023-07-16 16:08:58 -04:00
469c08c9ba Merge pull request #279 from NanoCode012/feat/multi-gpu-readme Wing Lian 2023-07-16 16:08:37 -04:00
334af625d0 Merge pull request #277 from cg123/dataset-name Wing Lian 2023-07-16 16:08:15 -04:00
273b3a3aa7 Update requirements.txt Teknium 2023-07-16 10:24:24 -07:00
3cdd8e4122 Add dataset name to all yaml options in README Charles Goddard 2023-07-15 13:17:37 -07:00
cf5ae6b649 Feat(readme): improve docs on multi-gpu NanoCode012 2023-07-16 01:07:27 +09:00
8028652b8f fix attetion mask with packing openorca-fix-mask Wing Lian 2023-07-15 10:38:01 -04:00
b1f4f7a34d Fixed pre-commit problems, fixed small bug in logging_config to handle LOG_LEVEL env var theobjectivedad 2023-07-15 12:29:35 +00:00
81d60e96f0 multipack sampler support from openchat multipack Wing Lian 2023-07-15 08:01:33 -04:00
83237b8445 Merge branch 'OpenAccess-AI-Collective:main' into logging_enhancement The Objective Dad 2023-07-15 06:16:04 -05:00
46032a1a1f Fix formatting mistake Charles Goddard 2023-07-14 20:57:27 -07:00
8bba64258e Add example of dataset with configuration name to README Charles Goddard 2023-07-14 20:46:21 -07:00
88089e8b32 Add ability to pass 'name' argument to load_dataset Charles Goddard 2023-07-14 16:46:39 -07:00
168a7a09cc Merge pull request #274 from OpenAccess-AI-Collective/NanoCode012-patch-2 NanoCode012 2023-07-14 23:15:47 +09:00
231031a0e1 Merge pull request #275 from NanoCode012/feat/safetensors NanoCode012 2023-07-14 23:07:26 +09:00
9234b75cb4 Update log message format, IMO this is easier to read. theobjectivedad 2023-07-14 07:36:21 -05:00
553a86b52c Adding logging enhancement theobjectivedad 2023-07-14 07:26:19 -05:00

... 49 50 51 52 53 ...