Compare commits

..

61 Commits

Author SHA1 Message Date
Wing Lian
4a0ab11fcf chore: lint 2025-01-13 14:05:56 -05:00
Wing Lian
73b6b0a580 chore: lint 2025-01-13 13:56:16 -05:00
Wing Lian
9db5072407 make sure to use tensorboard to capture loss for checks 2025-01-13 13:56:16 -05:00
Wing Lian
42d3e36a6f fix adapter model check 2025-01-13 13:56:15 -05:00
Wing Lian
b12d93bedf make sure to use the correct tokenizer 2025-01-13 13:56:15 -05:00
Wing Lian
08ec9c0e5b make sure to set tokenizer from l3 70b and save safetensors 2025-01-13 13:56:15 -05:00
Wing Lian
9abac55f92 lower lr 2025-01-13 13:56:15 -05:00
Wing Lian
800e7fa41e set lora_dropout explicitly 2025-01-13 13:56:15 -05:00
Wing Lian
5a1c1b82d4 make the kd e2e fit in vram for ci and add lora version 2025-01-13 13:56:15 -05:00
Wing Lian
efb3f70d38 rename test files so it gets picked up 2025-01-13 13:56:15 -05:00
Wing Lian
58d9896777 linting 2025-01-13 13:56:15 -05:00
Wing Lian
f7963083b8 add kd trainer e2e test 2025-01-13 13:56:15 -05:00
Wing Lian
f0b6581f8c reward model doesn't work well with batched 2025-01-13 13:56:15 -05:00
Wing Lian
27bb21c459 improve check for batched 2025-01-13 13:56:15 -05:00
Wing Lian
74d98ca6d8 fix reward trainer calls for tokenization 2025-01-13 13:56:14 -05:00
Wing Lian
ec4dfb02c8 reward can use same batch check 2025-01-13 13:56:14 -05:00
Wing Lian
28ef5e8d5a tweak check for batched prompt data 2025-01-13 13:56:14 -05:00
Wing Lian
5ed2823855 ensure that batch vs single is done properly 2025-01-13 13:56:14 -05:00
Wing Lian
fb0775d264 improve iterable support 2025-01-13 13:56:12 -05:00
Wing Lian
7cd0a317cb support streaming for processing sft datasts? 2025-01-13 13:41:36 -05:00
Wing Lian
1cc3a2d16c make loss torch script compat 2025-01-13 13:41:36 -05:00
Wing Lian
287d2ca8d5 kd sample packing 2025-01-13 13:41:36 -05:00
Wing Lian
03b86df506 be a bit pickier about loading dynamic prompt strategies 2025-01-13 13:41:36 -05:00
Wing Lian
2ed4246949 more info on preprocess for kd and fix import 2025-01-13 13:41:35 -05:00
Wing Lian
35bc2e2d3f remove duplicate code 2025-01-13 13:41:35 -05:00
Wing Lian
94f1094805 add copyrights 2025-01-13 13:41:35 -05:00
Wing Lian
a0070bf94e increase logging around loading plugins 2025-01-13 13:41:35 -05:00
Wing Lian
2ee2ffd834 make plugin setup concise 2025-01-13 13:41:35 -05:00
Wing Lian
723b0a2dee remove moved class from import 2025-01-13 13:41:35 -05:00
Wing Lian
327739c9e3 move more things to kd plugin 2025-01-13 13:41:35 -05:00
Wing Lian
8aafe142f2 refactor kd chat template loader 2025-01-13 13:41:35 -05:00
Wing Lian
a0d6d8895e support for custom trainer classes from plugins 2025-01-13 13:41:34 -05:00
Wing Lian
55b33cc44d handle token/logprob shifting 2025-01-13 13:41:34 -05:00
Wing Lian
69ed25e82c remove references to triton kd for now 2025-01-13 13:41:34 -05:00
Wing Lian
2ea8b7e518 add license block 2025-01-13 13:41:34 -05:00
Wing Lian
aa081e0e76 refactor so we can easily add new loss functions 2025-01-13 13:41:34 -05:00
Wing Lian
3f97ec45fb chore: lint 2025-01-13 13:41:34 -05:00
Wing Lian
7b5a24b0d2 var naming and add todo 2025-01-13 13:41:34 -05:00
Wing Lian
4ddd089d0a fix kd loss so it's causal (fixes repeating tokens) 2025-01-13 13:41:34 -05:00
Wing Lian
b88128d067 use kd_alpha in the correct loss method 2025-01-13 13:41:32 -05:00
Wing Lian
2e6422a711 hash for temperature too 2025-01-13 13:40:19 -05:00
Wing Lian
6ad809287b better rescaling for temperatures 2025-01-13 13:40:19 -05:00
Wing Lian
e376e00386 don't use triton for now 2025-01-13 13:40:19 -05:00
Wing Lian
23d7ae6caa fix kwarg 2025-01-13 13:40:19 -05:00
Wing Lian
19638590d5 v3 2025-01-13 13:40:18 -05:00
Wing Lian
73f5b83431 no torch.tensor 2025-01-13 13:40:18 -05:00
Wing Lian
9b1164b841 no log etc 2025-01-13 13:40:18 -05:00
Wing Lian
5a7d6f6175 no torch.exp inside triton kernel 2025-01-13 13:40:18 -05:00
Wing Lian
a803c3d3ee v2 trial 2025-01-13 13:40:18 -05:00
Wing Lian
48ccf55752 no where support 2025-01-13 13:40:18 -05:00
Wing Lian
bc3326a808 triton wip 2025-01-13 13:40:18 -05:00
Wing Lian
cf8174db75 chore: lint 2025-01-13 13:40:18 -05:00
Wing Lian
222dc27410 make sure to multiply against the correct loss 2025-01-13 13:40:18 -05:00
Wing Lian
1107f1f603 cross entropy loss coefficient during KD 2025-01-13 13:40:18 -05:00
Wing Lian
1c603da96a flipped the slice 2025-01-13 13:40:17 -05:00
Wing Lian
283faf3909 make it work 2025-01-13 13:40:17 -05:00
Wing Lian
472f7048e5 handle padding/collation for KD datasets 2025-01-13 13:40:17 -05:00
Wing Lian
3d1e2dcef4 make batch smaller 2025-01-13 13:40:17 -05:00
Wing Lian
9e218fbcfd filter bad rows 2025-01-13 13:40:17 -05:00
Wing Lian
11caf52529 KD dataset loading and KD with logprobs 2025-01-13 13:40:17 -05:00
Wing Lian
17ba9dcfdb refactor trainer to prevent circular dependencies later
fix loader default
2025-01-13 13:40:17 -05:00
8 changed files with 23 additions and 35 deletions

View File

@@ -11,7 +11,7 @@ from datasets import Dataset
import axolotl.monkeypatch.data.batch_dataset_fetcher # pylint: disable=unused-import # noqa: F401
from axolotl.cli.args import PreprocessCliArgs, TrainerCliArgs
from axolotl.utils.data import prepare_dataset
from axolotl.utils.data.rl import load_prepare_preference_datasets
from axolotl.utils.data.rl import load_prepare_dpo_datasets
from axolotl.utils.dict import DictDefault
from axolotl.utils.models import load_processor, load_tokenizer
from axolotl.utils.tokenization import check_dataset_labels
@@ -109,9 +109,9 @@ def load_preference_datasets(
cli_args: Union[PreprocessCliArgs, TrainerCliArgs],
) -> TrainDatasetMeta:
"""
Loads one or more training or evaluation datasets for RL training using paired
preference data, calling `axolotl.utils.data.rl.load_prepare_preference_datasets`.
Optionally, logs out debug information.
Loads one or more training or evaluation datasets for DPO training, calling
`axolotl.utils.data.rl.load_prepare_dpo_datasets`. Optionally, logs out debug
information.
Args:
cfg: Dictionary mapping `axolotl` config keys to values.
@@ -121,7 +121,7 @@ def load_preference_datasets(
Dataclass with fields for training and evaluation datasets and the computed
`total_num_steps`.
"""
train_dataset, eval_dataset = load_prepare_preference_datasets(cfg)
train_dataset, eval_dataset = load_prepare_dpo_datasets(cfg)
total_num_steps = int(
math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
)

View File

@@ -5,7 +5,7 @@ from axolotl.utils.data.pretraining import ( # noqa: F401
encode_pretraining,
wrap_pretraining_dataset,
)
from axolotl.utils.data.rl import load_prepare_preference_datasets # noqa: F401
from axolotl.utils.data.rl import load_prepare_dpo_datasets # noqa: F401
from axolotl.utils.data.sft import ( # noqa: F401
get_dataset_wrapper,
load_prepare_datasets,

View File

@@ -18,13 +18,10 @@ LOG = logging.getLogger("axolotl")
def encode_pretraining(
tokenizer: PreTrainedTokenizerBase,
max_tokens: int,
examples: Dict[str, List],
text_column: str = "text",
tokenizer: PreTrainedTokenizerBase, max_tokens: int, examples: Dict[str, List]
) -> Dict[str, List]:
res = tokenizer(
examples[text_column],
examples["text"],
truncation=True,
max_length=max_tokens - 2,
add_special_tokens=True,
@@ -199,12 +196,7 @@ def wrap_pretraining_dataset(
# set this to 1 so downstream data_loader doesn't try to increase the batch again
cfg.micro_batch_size = 1
else:
encode = functools.partial(
encode_pretraining,
tokenizer,
max_tokens,
text_column=cfg.pretraining_dataset[0].text_column or "text",
)
encode = functools.partial(encode_pretraining, tokenizer, max_tokens)
if cfg.shuffle_merged_datasets:
dataset = dataset.shuffle(seed=seed, buffer_size=buffer_size)

View File

@@ -115,7 +115,7 @@ def drop_long_rl_seq(
raise ValueError("Unknown RL type")
def load_prepare_preference_datasets(cfg):
def load_prepare_dpo_datasets(cfg):
def load_split(dataset_cfgs, _cfg):
split_datasets: List[Any] = []
for i, ds_cfg in enumerate(dataset_cfgs):

View File

@@ -1057,7 +1057,7 @@ class ModelLoader:
)
if (
hasattr(self.model, "get_input_embeddings")
and self.model.get_input_embeddings().num_embeddings != embeddings_len
and self.model.get_input_embeddings().num_embeddings < embeddings_len
):
resize_kwargs = {}
if self.cfg.mean_resizing_embeddings is not None:

View File

@@ -4,8 +4,7 @@ E2E tests for llama pretrain
import logging
import os
import pytest
import unittest
from axolotl.cli.args import TrainerCliArgs
from axolotl.common.datasets import load_datasets
@@ -13,22 +12,19 @@ from axolotl.train import train
from axolotl.utils.config import normalize_config
from axolotl.utils.dict import DictDefault
from .utils import check_model_output_exists
from .utils import check_model_output_exists, with_temp_dir
LOG = logging.getLogger("axolotl.tests.e2e")
os.environ["WANDB_DISABLED"] = "true"
class TestPretrainLlama:
class TestPretrainLlama(unittest.TestCase):
"""
Test case for Llama models w pretraining
"""
@pytest.mark.parametrize(
"sample_packing",
[True, False],
)
def test_pretrain(self, temp_dir, sample_packing):
@with_temp_dir
def test_pretrain_w_sample_packing(self, temp_dir):
# pylint: disable=duplicate-code
cfg = DictDefault(
{
@@ -36,7 +32,7 @@ class TestPretrainLlama:
"tokenizer_type": "LlamaTokenizer",
"flash_attention": True,
"sequence_len": 1024,
"sample_packing": sample_packing,
"sample_packing": True,
"special_tokens": {
"unk_token": "<unk>",
"bos_token": "<s>",

View File

@@ -17,7 +17,7 @@ from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
from axolotl.utils.data import load_tokenized_prepared_datasets
from axolotl.utils.data.rl import load_prepare_preference_datasets
from axolotl.utils.data.rl import load_prepare_dpo_datasets
from axolotl.utils.dict import DictDefault
@@ -280,7 +280,7 @@ class TestDatasetPreparation(unittest.TestCase):
}
)
train_dataset, _ = load_prepare_preference_datasets(cfg)
train_dataset, _ = load_prepare_dpo_datasets(cfg)
assert len(train_dataset) == 1800
assert "conversation" in train_dataset.features
@@ -329,7 +329,7 @@ class TestDatasetPreparation(unittest.TestCase):
}
)
train_dataset, _ = load_prepare_preference_datasets(cfg)
train_dataset, _ = load_prepare_dpo_datasets(cfg)
assert len(train_dataset) == 1800
assert "conversation" in train_dataset.features

View File

@@ -12,7 +12,7 @@ from datasets import Dataset
from transformers import AutoTokenizer
from axolotl.utils.data import prepare_dataset
from axolotl.utils.data.rl import load_prepare_preference_datasets
from axolotl.utils.data.rl import load_prepare_dpo_datasets
from axolotl.utils.data.utils import deduplicate_and_log_datasets
from axolotl.utils.dict import DictDefault
from axolotl.utils.models import load_processor, load_tokenizer
@@ -236,7 +236,7 @@ class TestDeduplicateRLDataset(unittest.TestCase):
"""Verify that loading with deduplication removes duplicates."""
# Load the dataset using the deduplication setting
train_dataset, _ = load_prepare_preference_datasets(self.cfg)
train_dataset, _ = load_prepare_dpo_datasets(self.cfg)
# Verify that the dataset has been deduplicated
assert len(train_dataset) == 1800, "Dataset was not properly deduplicated"
@@ -245,7 +245,7 @@ class TestDeduplicateRLDataset(unittest.TestCase):
"""Verify that loading without deduplication retains duplicates."""
self.cfg.dataset_exact_deduplication = False
# Load the dataset without deduplication
train_dataset, _ = load_prepare_preference_datasets(self.cfg)
train_dataset, _ = load_prepare_dpo_datasets(self.cfg)
# Verify that the dataset retains duplicates
assert (