axolotl

tocmo0nlord/axolotl

Fork 0

Commit Graph

Select branches

Hide Pull Requests

1947fix

1991test

20230920-btlm

20231212-fixes

20240216-updates

20240307-updates

20240404-lisa-determinism

3181

4bit-optimizers

775-option-to-drop-vs-truncate-on-rows-longer-than-context-length

NanoCode012-patch-1

accelerator-args-builder

activation-offloading-torchtune

activations

activeblue/main

async-grpo-patched-v2

attention_enum

attn-implementation-refactor

attn-patches

autodoc

autogptq-tests

axolotl-ci-hf

base-model-readme-update

benchmark-callbacks-next

bursteratom-doc-faq-update

chat-dataset-tool

chat-template-granite

chore/docstring-distributed

cj_tokenizer_default_prompt_template

cli-cloud-modal-math-hard

cli-refactor

codecov-pulls-only

coderabbitai/docstrings/3e51a68

coderabbitai/docstrings/QVUilv72ojQNaYsCLVNpUpfo2rK1ZU5x90oPNXYz0ZfsWzWSHca36pjgaU5JOtZOA4gNjbjVYxShdRmkm7fGSlW

coderabbitai/docstrings/b234532

colab-misc-fixes

colab-misc-fixes-test

completion-json

compute-perplexity-metrics

cp-sdpa

cuda-12.8.1

custom-modeling

custom-trainer-cls

datasets-351

datasets-refactor

debug-hf-home-cache

deepspeed-low-cpu-mem

deepspeed_0_14_4

destroy-pg

dev-base

device-mesh

devstral-support

dft

diff-transformer

diffusion-custom-loss

diffusion-custom-models

diffusion-next-token-trainer

djsaunde-patch-1

docker-base-nvcr-pytorch

docker-cleanup-20231029

docs-lint-20250212

dpo-spawn-fix

dump-config

dynamic-sft

e2e-fsdp-trainer

embeddings-resize

enable_tp

eos-hell

exp-expand-len

fa-261

fa-check

fa3-hopper

feat/beautiful-readme

feat/glm45

feat/glmflash-other

feat/liger-deepseekv3

feat/linearize

feat/lmeval-baseten

feat/phi_35_vision

feat/pref_liger

feat/soap-optim-v2

feat/spaces-ui

feat/torchao-qlora

feat/wizard

feat_hqq

feature/attn-patches

feature/enable-huggingface-dataset-revision

feature/relora-rebased

fix-ddp_find_unused_parameters

fix-l3-lora

fix-merge-lint-issue

fix-preview

fix/cce-linear

fix/cp-waste

fix/diffusion

fix/doc-key

fix/dpo-labels

fix/eval-accu

fix/gemma3-text-only

fix/gemma3n-text-attention

fix/granite-speech

fix/hpc-root

fix/issue-1-build-deps

fix/issue-2-flash-attn-install

fix/issue-3-telemetry-whitelist

fix/issue-4-deepspeed-optional

fix/issue-5-8-docs

fix/issue-6-default-attention

fix/issue-7-hf-token-check

fix/kd-trainer-num-items

fix/merge-lora-fp32

fix/replace_jackllama

fix/rl-trainer-arg

fix/vllm-version

fix/xformers

fix_kto

fixtypo

flan-no-bos

flash-attn-2_5_5

flash-attn-fix-patches-wo-sample-packing

flex_patching_update

flx_attn_support

fp8

fsdp-defaults

fsdp-fft

fsdp-fix

fsdp-qdora

fsdp2

fsdp2_fp32

fused-mlp-ez

gh-pages

grouped_lr_squashed

grpo-path

grpo-path-v2

grpo-ref-model-cleanup

grpo_liger

hamelsmu-patch-1

hf-trainer-refactor

hymba_multipack2

ia3-peft

iterable-optional

jagged-restart-lr-scheduler-v3

kd-fix-20250519-v2

kd-logits-view

kd-logprob-data

kd-trainer

kd-trainer-2

kd-trainer-pre

kd-trainer-rebased

kd-trainer-v2

kd-trainer-zscore

keep_in_memory

kernelize-scattermoe-lora

kto_fix

kwargs-refactor

latent-space

lhl-moe-aux-loss-free

liger-063

liger-065

liger-dpo

lisa

llama-4-examples

llama-4-z3

llama-dropout

llama-flash-attn-fix

llama-multipack

llama4

llama4-patches

llava

llava-train

llmcompressor-sft

llmcompressor-sft-v2

llmcompressor-sft-wing

lora-fsdp2-doc

lora-kernels-deepspeed

lora-kernels-doc-fix

lora-quant-state-offset

lora_bf16

lora_kernels_fsdp

main

main-base

map-dataset-fetcher-fix

maverick-example

merge-lora-on-complete

merge-lora-tests

merged-2554

mistral-support

mixtral_optimized

mixtral_swiglu

mm2

mm3

mm_mc_chat

modal-upgrade-builder

model-loader-refactor

moekernels

mora

multi-gpu-state

multipack

multipack-dpo

multipack-pretraining

muon-validation

nca-pair

nd_parallel

neft-v2

no-bos-tokens-packing

no-seq-len

no-zero-ds-train

offload-activations-disk

olmo-no-position_ids

online-topk-kd

openorca

openorca-fix-mask

openorca-v2

optimizer-checkpoint

optimizer-compile

optimizers-refactor

packing-attn-limit-fa2-rebased

patch_lora_post_model_load

peft-update

phi-moe

pixtral_integration

pre-commit-update

preprocess_grpo-fix

pretrain-dataset

print_venv

pytest-each-flakey

pytest-skip-s2

q-galore

quantize-ptq-cli

quartodoc

quartodoc-fix

rala

rala-v2

reentrant-w-offloading

refactor-flash-attention

relaxed-recursive-transformers

release-0.10.x

release-0.8.x

release-v0.11.x

release-v0.12.x

release-v0.13.x

release-v0.9.x

remove-gptq-warn

revert-2332-fix_sample_packing

revert-2906-checkpoint-on-step-1

revert-multipack-changes

rl-trainers-sp

runpod-sls

sac

sageattention

save_only_model

scatter_moe

scatter_moe_eric

scattermoe-lora-optim-dtypestest

scattermoe-nanotron

sdpa-cp

sdpa-multipack

seq-parallel-ring

sequence-parallelism

shampoo

shampoo-low_bit

shared-prepared-ci

sharegpt-batched

sharegpt-field-conversations

smaller-rand-model

smol-ci

soap-optim

sp-fix-masking

sp-restore-buffers

sp-rl

sp-rl-v3

split-batches-sizes

sppo

squash_position_ids

ssmi-main

stable

streaming

streaming-on-the-fly-preprocess

streaming-remote-dataset

streaming-v2

swe-rebench-rl-rebase

telemetry

telemetry-opt-in

tensor-parallel

tensorboard-loss-check

testingci

textui

tinyllama-example

tool-mpm

topk-logprobs-triton

torch-211-base

torch_tensor_parallel

tp_support

train-refactor

transformers-4511

transformers-4513

transformers-4573

transformers-4_47_0_v2

transformers-fsdp-check

transformers-itl-refactor

tui

unsloth_modules

update-examples-llama3-ez

update-lgpl

update-vllm

upgrade-liger-test

upgrade-torchao-0.15

upgrade-trl-v0.12.0_2

upgrade_liger-tr4.46.1

uv-first

uv-fixup

vendor-moe

version-dev

vllm-0191

wait-distributed-close

weight-scale-norm

xformers-wo-packing

yayi2

zero3-8bit-lora

v0.1.0

v0.10.0

v0.10.1

v0.11.0

v0.11.0.post1

v0.12.0

v0.12.1

v0.12.2

v0.13.0

v0.13.1

v0.13.2

v0.14.0

v0.15.0

v0.16.0

v0.16.1

v0.2.0

v0.2.1

v0.3.0

v0.4.0

v0.5.0

v0.5.1

v0.5.1.post1

v0.5.2

v0.6.0

v0.7.0

v0.7.1

v0.8.0

v0.8.1

v0.9.0

v0.9.1

v0.9.1.post1

v0.9.2

fd55bc87e2 use math.ceil instead of round /cc #498 Aman Karmani 2023-08-29 01:03:41 +00:00
8e197f6fb4 pad_to_worst_case_seq_len boolean, for testing memory limits (#498) Birch-san 2023-08-28 23:47:16 +01:00
45848a9285 gather benchmarks from all ranks Wing Lian 2023-08-28 11:29:59 -04:00
267b7b24e5 simplify linear layer locator Aman Karmani 2023-08-28 13:10:26 +00:00
d6cea18034 improve support for customized dataset for bench evals Wing Lian 2023-08-28 06:03:53 -04:00
606846e0a5 missing transformers import Wing Lian 2023-08-28 05:43:19 -04:00
a6c9223114 more fixes Wing Lian 2023-08-25 22:46:28 -04:00
8b16ecd448 updated dataset Wing Lian 2023-08-25 21:59:09 -04:00
f5db88a10d fixes Wing Lian 2023-08-25 21:49:04 -04:00
99d844f215 benchmark callback has its own dataloader and collator Wing Lian 2023-08-25 20:38:43 -04:00
aefd4d74fa better handling when no subjects Wing Lian 2023-08-21 17:22:06 -04:00
24b0e93235 dataset handling and aggregate across benchmark Wing Lian 2023-08-21 16:56:40 -04:00
2455254b92 more fixes Wing Lian 2023-08-21 04:58:54 -04:00
918e040601 rename mmlu to bench Wing Lian 2023-08-21 04:38:51 -04:00
ef062d8fcb more fixes Wing Lian 2023-08-21 04:31:15 -04:00
d4c8b66f3d fix elif and add better messaging Wing Lian 2023-08-21 04:05:41 -04:00
64e9824d3e fix the data file Wing Lian 2023-08-21 04:00:22 -04:00
1134654c98 sample benchmarks, ensure we drop long samples Wing Lian 2023-08-21 03:55:06 -04:00
2fc756c289 fix mmlu evals Wing Lian 2023-08-21 03:34:16 -04:00
943b84c490 another callback fix for collator max len attribute Wing Lian 2023-08-19 21:36:24 -04:00
6f166464d8 include metrics in callback Wing Lian 2023-08-19 21:26:32 -04:00
e3b07402a7 make sure to define all the explicit positional args Wing Lian 2023-08-19 21:20:10 -04:00
8d3c8a3eab default to mmlu-zs Wing Lian 2023-08-19 18:58:35 -04:00
c30120e684 use hf dataset for mmlu evals Wing Lian 2023-08-19 18:57:46 -04:00
9aed60fa54 add mmlu callback Wing Lian 2023-08-19 18:26:19 -04:00
98bf76e236 fsdp requires params be the same type too (#493) Wing Lian 2023-08-28 04:33:50 -04:00
4c37bd0b54 Fix(tokenizer): Make sure to add pad for CodeLlamaTokenizer (#489) NanoCode012 2023-08-28 09:39:10 +09:00
f144e98a32 Merge pull request #485 from maximegmd/patch-4 Aman Gupta Karmani 2023-08-27 16:27:47 -04:00
3a011ea1ef fix condition and add logging Aman Karmani 2023-08-27 20:09:26 +00:00
1f613e5aa7 Merge branch 'main' into patch-4 Aman Karmani 2023-08-27 19:57:34 +00:00
f319b0bc67 rename var and reformat Aman Karmani 2023-08-27 19:55:11 +00:00
7fd662dd89 Update src/axolotl/utils/models.py Maxime 2023-08-27 21:01:43 +02:00
9e699683d7 Update src/axolotl/utils/models.py Maxime 2023-08-27 21:01:37 +02:00
35130711d6 Feat(cfg): Add code-llama configs for all sizes (#479) mhenrichsen 2023-08-27 03:20:17 +02:00
3fc9006298 Feat(deepspeed): Add zero2 config (#476) mhenrichsen 2023-08-27 03:10:33 +02:00
ad8be435ad Feat(doc): Update eval_steps doc (#487) NanoCode012 2023-08-27 10:09:09 +09:00
fe4d6baf92 Add example Llama 2 ReLoRA config (#471) Charles O. Goddard 2023-08-26 18:08:34 -07:00
f31301063d Merge pull request #486 from OpenAccess-AI-Collective/adam-bnb-simpler Aman Gupta Karmani 2023-08-26 20:44:19 -04:00
868530c39c let transformers handle adamw_bnb_8bit Aman Karmani 2023-08-26 21:40:12 +00:00
d03887fad5 ignore: address pr review Maxime 2023-08-26 22:45:45 +02:00
17605b85d8 fix: inference did not move the model to the correct device (#483) Maxime 2023-08-26 22:40:56 +02:00
a184549e4c ignore: linter Maxime 2023-08-26 22:36:14 +02:00
f311df9462 fix: finetune model inference needs the dtype fix to work with flash-attn Maxime 2023-08-26 22:34:11 +02:00
c500d02517 Fix missing 'packaging' wheel (#482) Maxime 2023-08-26 18:02:15 +02:00
31f3e71764 fix checkpints on multigpu (#481) Wing Lian 2023-08-26 12:00:03 -04:00
56c4a94caf Merge pull request #484 from OpenAccess-AI-Collective/reqs Aman Gupta Karmani 2023-08-26 11:13:41 -04:00
c29117a0d7 allow newer deps Aman Karmani 2023-08-26 15:06:05 +00:00
0b7ba57ec4 fix types w lora (#478) Wing Lian 2023-08-25 02:03:24 -04:00
71bd06243c Fix(tokenizer): Fix condition to add pad token (#477) NanoCode012 2023-08-25 14:30:50 +09:00
cb9797ef5a improve llama pad token handling (#475) Wing Lian 2023-08-24 13:20:35 -04:00
bde3c5a478 ReLoRA implementation (with quantization) (#322) Charles O. Goddard 2023-08-23 20:07:18 -07:00
55c23c7bcb Fix(doc): Clarify config (#466) NanoCode012 2023-08-24 00:56:01 +09:00
c69faee7a7 workaround so training doesn't hang when packed dataloader batches aren't even (#461) Wing Lian 2023-08-23 10:39:11 -04:00
9aaa4b8ced set model merge dtype based on cfg merge-lora-on-complete Wing Lian 2023-08-23 03:55:44 -04:00
8be7da8999 push merged lora to hf Wing Lian 2023-08-22 23:31:12 -04:00
53e739f11e deduplicate code Wing Lian 2023-08-22 23:17:44 -04:00
f20c8deff1 merge lora on train completion Wing Lian 2023-08-22 23:15:09 -04:00
d5dcf9c350 fix test fixture b/c hf trainer tokenization changed (#464) Wing Lian 2023-08-23 04:04:49 -04:00
f4746507f6 feat: add Metharme prompt strategy (#446) TearGosling 2023-08-21 21:21:45 -05:00
96deb6bd67 recast loralayer, norm, lmhead + embed token weights per original qlora (#393) Wing Lian 2023-08-21 18:41:12 -04:00
50682a3c06 always drop samples that are too long (#452) Wing Lian 2023-08-21 16:43:33 -04:00
5a1985ba24 set env var for FSDP layer to wrap (#453) Wing Lian 2023-08-21 16:43:22 -04:00
5e9c6afa10 Merge pull request #451 from OpenAccess-AI-Collective/eval-is-causal Aman Gupta Karmani 2023-08-21 10:43:46 -07:00
a213d9972a fix eval regression caused in 13f7efaf74 Aman Karmani 2023-08-21 10:40:06 -07:00
fbf49a4770 is_causal fix for evals? Wing Lian 2023-08-21 10:36:26 -04:00
58cf7e7fed add missing positional arg (#450) Wing Lian 2023-08-21 04:10:19 -04:00
04a42b6db1 feat(docs): improve user customized prompts (#443) NanoCode012 2023-08-21 12:59:43 +09:00
919f4cac90 feat(doc): add pillow to lambda instructions (#445) NanoCode012 2023-08-21 12:59:23 +09:00
ee262818ef fix evals (#447) Wing Lian 2023-08-20 23:39:42 -04:00
9d629d8bff gracefully handle empty input (#442) Wing Lian 2023-08-20 09:18:18 -04:00
d2e7f27240 support user defined prompters, pretokenized datasets in config, local parquet, local arrow files (#348) Wing Lian 2023-08-20 09:17:49 -04:00
d21318dfb9 docs(readme): add cd axolotl (#440) Philpax 2023-08-20 01:14:05 +02:00
f733d0f31e disable eval using multipack for now (#437) Wing Lian 2023-08-19 10:35:04 -04:00
008505c8ae fix comma, not a tuple (#436) Wing Lian 2023-08-19 00:57:40 -04:00
b3f5e00ff5 use save_strategy from config if available (#434) Wing Lian 2023-08-18 20:28:23 -04:00
5247c5004e set env for FSDP offload params (#433) Wing Lian 2023-08-18 20:28:09 -04:00
cf6654769a flash attn pip install (#426) mhenrichsen 2023-08-19 01:00:27 +02:00
06edf175ac standardize attn hijack patches (#381) Aman Gupta Karmani 2023-08-18 09:54:16 -07:00
0a228479b3 adds color (#425) mhenrichsen 2023-08-18 16:59:43 +02:00
82e111aba9 remove extra accelearate in requirements (#430) Wing Lian 2023-08-18 10:56:14 -04:00
cf00e20270 experiment w latent space latent-space Wing Lian 2023-08-18 05:47:26 -04:00
587dbbfc02 fix the patch to work properly and work with FSDP attn-patches Wing Lian 2023-08-17 20:33:56 -04:00
6c306d9186 enable eval dataloader using multipack again Wing Lian 2023-08-17 18:07:02 -04:00
7565fb9d63 update forwards so we only calculate cu_seqlens once Wing Lian 2023-08-17 18:02:41 -04:00
a6b737d5ff copy LlamaModel.forward and LlamaDecoderLayer.forward into monkeypatch Wing Lian 2023-08-17 18:01:46 -04:00
cf95b57c0a fix patch to check position ids and don't use multipack for evals Wing Lian 2023-08-17 17:14:29 -04:00
13f7efaf74 speed up flash-attn inference Aman Karmani 2023-08-13 18:03:38 +00:00
d773384f74 update flash-attn patch for 70B/GQA and inference using helper from flash-attn tests Aman Karmani 2023-08-13 15:41:44 +00:00
985dcbc051 sync xformers patch to follow shared format and be diffable Aman Karmani 2023-08-13 15:41:06 +00:00
5d0b27e5a1 split sdp attn into its own patch Aman Karmani 2023-08-13 15:40:43 +00:00
8cace80175 fix fixture for new tokenizer handling in transformers (#428) Wing Lian 2023-08-17 17:01:52 -04:00
1b7e8604bb fix orca prompts (#422) Wing Lian 2023-08-16 11:21:03 -04:00
3d1f203b62 Fix(docs): Remove gptq+lora and fix xformer compat list (#423) NanoCode012 2023-08-16 13:56:48 +09:00
d3d6fd6ae6 just resort to tags ans use main-latest (#424) Wing Lian 2023-08-16 00:39:57 -04:00
b7449a997f Fix(template): Inform to place stack trace to Issue (#417) NanoCode012 2023-08-16 11:55:48 +09:00
5f80b3560b use inputs for image rather than outputs for docker metadata (#420) Wing Lian 2023-08-15 18:26:59 -04:00
24959091d7 hopefully improve the README (#419) Wing Lian 2023-08-15 15:30:53 -04:00
7af816699e tag with latest as well for axolotl-runpod (#418) Wing Lian 2023-08-15 15:30:41 -04:00
f806e86a6e Merge pull request #413 from mhenrichsen/chore/update-deepseed-config mhenrichsen 2023-08-15 20:08:23 +02:00
2b990eb628 Feat(doc): Add lr_quadratic_warmup to readme (#412) NanoCode012 2023-08-16 02:55:48 +09:00

... 47 48 49 50 51 ...