chore: lint

make sure to use tensorboard to capture loss for checks
2025-01-13 14:05:56 -05:00 · 2025-01-13 13:56:16 -05:00 · 2025-01-13 13:56:16 -05:00 · 2025-01-13 13:56:15 -05:00 · 2025-01-13 13:56:15 -05:00 · 2025-01-13 13:56:15 -05:00
8 changed files with 23 additions and 35 deletions
--- a/src/axolotl/common/datasets.py
+++ b/src/axolotl/common/datasets.py
@@ -11,7 +11,7 @@ from datasets import Dataset
 import axolotl.monkeypatch.data.batch_dataset_fetcher  # pylint: disable=unused-import  # noqa: F401
 from axolotl.cli.args import PreprocessCliArgs, TrainerCliArgs
 from axolotl.utils.data import prepare_dataset
-from axolotl.utils.data.rl import load_prepare_preference_datasets
+from axolotl.utils.data.rl import load_prepare_dpo_datasets
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.models import load_processor, load_tokenizer
 from axolotl.utils.tokenization import check_dataset_labels
@@ -109,9 +109,9 @@ def load_preference_datasets(
    cli_args: Union[PreprocessCliArgs, TrainerCliArgs],
 ) -> TrainDatasetMeta:
    """
-    Loads one or more training or evaluation datasets for RL training using paired
-    preference data, calling `axolotl.utils.data.rl.load_prepare_preference_datasets`.
-    Optionally, logs out debug information.
+    Loads one or more training or evaluation datasets for DPO training, calling
+    `axolotl.utils.data.rl.load_prepare_dpo_datasets`. Optionally, logs out debug
+    information.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
@@ -121,7 +121,7 @@ def load_preference_datasets(
        Dataclass with fields for training and evaluation datasets and the computed
        `total_num_steps`.
    """
-    train_dataset, eval_dataset = load_prepare_preference_datasets(cfg)
+    train_dataset, eval_dataset = load_prepare_dpo_datasets(cfg)
    total_num_steps = int(
        math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
    )
--- a/src/axolotl/utils/data/__init__.py
+++ b/src/axolotl/utils/data/__init__.py
@@ -5,7 +5,7 @@ from axolotl.utils.data.pretraining import (  # noqa: F401
    encode_pretraining,
    wrap_pretraining_dataset,
 )
-from axolotl.utils.data.rl import load_prepare_preference_datasets  # noqa: F401
+from axolotl.utils.data.rl import load_prepare_dpo_datasets  # noqa: F401
 from axolotl.utils.data.sft import (  # noqa: F401
    get_dataset_wrapper,
    load_prepare_datasets,
--- a/src/axolotl/utils/data/pretraining.py
+++ b/src/axolotl/utils/data/pretraining.py
@@ -18,13 +18,10 @@ LOG = logging.getLogger("axolotl")


 def encode_pretraining(
-    tokenizer: PreTrainedTokenizerBase,
-    max_tokens: int,
-    examples: Dict[str, List],
-    text_column: str = "text",
+    tokenizer: PreTrainedTokenizerBase, max_tokens: int, examples: Dict[str, List]
 ) -> Dict[str, List]:
    res = tokenizer(
-        examples[text_column],
+        examples["text"],
        truncation=True,
        max_length=max_tokens - 2,
        add_special_tokens=True,
@@ -199,12 +196,7 @@ def wrap_pretraining_dataset(
        # set this to 1 so downstream data_loader doesn't try to increase the batch again
        cfg.micro_batch_size = 1
    else:
-        encode = functools.partial(
-            encode_pretraining,
-            tokenizer,
-            max_tokens,
-            text_column=cfg.pretraining_dataset[0].text_column or "text",
-        )
+        encode = functools.partial(encode_pretraining, tokenizer, max_tokens)

    if cfg.shuffle_merged_datasets:
        dataset = dataset.shuffle(seed=seed, buffer_size=buffer_size)
--- a/src/axolotl/utils/data/rl.py
+++ b/src/axolotl/utils/data/rl.py
@@ -115,7 +115,7 @@ def drop_long_rl_seq(
    raise ValueError("Unknown RL type")


-def load_prepare_preference_datasets(cfg):
+def load_prepare_dpo_datasets(cfg):
    def load_split(dataset_cfgs, _cfg):
        split_datasets: List[Any] = []
        for i, ds_cfg in enumerate(dataset_cfgs):
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -1057,7 +1057,7 @@ class ModelLoader:
        )
        if (
            hasattr(self.model, "get_input_embeddings")
-            and self.model.get_input_embeddings().num_embeddings != embeddings_len
+            and self.model.get_input_embeddings().num_embeddings < embeddings_len
        ):
            resize_kwargs = {}
            if self.cfg.mean_resizing_embeddings is not None:
--- a/tests/e2e/test_llama_pretrain.py
+++ b/tests/e2e/test_llama_pretrain.py
@@ -4,8 +4,7 @@ E2E tests for llama pretrain

 import logging
 import os
-
-import pytest
+import unittest

 from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
@@ -13,22 +12,19 @@ from axolotl.train import train
 from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault

-from .utils import check_model_output_exists
+from .utils import check_model_output_exists, with_temp_dir

 LOG = logging.getLogger("axolotl.tests.e2e")
 os.environ["WANDB_DISABLED"] = "true"


-class TestPretrainLlama:
+class TestPretrainLlama(unittest.TestCase):
    """
    Test case for Llama models w pretraining
    """

-    @pytest.mark.parametrize(
-        "sample_packing",
-        [True, False],
-    )
-    def test_pretrain(self, temp_dir, sample_packing):
+    @with_temp_dir
+    def test_pretrain_w_sample_packing(self, temp_dir):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
@@ -36,7 +32,7 @@ class TestPretrainLlama:
                "tokenizer_type": "LlamaTokenizer",
                "flash_attention": True,
                "sequence_len": 1024,
-                "sample_packing": sample_packing,
+                "sample_packing": True,
                "special_tokens": {
                    "unk_token": "<unk>",
                    "bos_token": "<s>",
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -17,7 +17,7 @@ from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer

 from axolotl.utils.data import load_tokenized_prepared_datasets
-from axolotl.utils.data.rl import load_prepare_preference_datasets
+from axolotl.utils.data.rl import load_prepare_dpo_datasets
 from axolotl.utils.dict import DictDefault


@@ -280,7 +280,7 @@ class TestDatasetPreparation(unittest.TestCase):
            }
        )

-        train_dataset, _ = load_prepare_preference_datasets(cfg)
+        train_dataset, _ = load_prepare_dpo_datasets(cfg)

        assert len(train_dataset) == 1800
        assert "conversation" in train_dataset.features
@@ -329,7 +329,7 @@ class TestDatasetPreparation(unittest.TestCase):
            }
        )

-        train_dataset, _ = load_prepare_preference_datasets(cfg)
+        train_dataset, _ = load_prepare_dpo_datasets(cfg)

        assert len(train_dataset) == 1800
        assert "conversation" in train_dataset.features
--- a/tests/test_exact_deduplication.py
+++ b/tests/test_exact_deduplication.py
@@ -12,7 +12,7 @@ from datasets import Dataset
 from transformers import AutoTokenizer

 from axolotl.utils.data import prepare_dataset
-from axolotl.utils.data.rl import load_prepare_preference_datasets
+from axolotl.utils.data.rl import load_prepare_dpo_datasets
 from axolotl.utils.data.utils import deduplicate_and_log_datasets
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.models import load_processor, load_tokenizer
@@ -236,7 +236,7 @@ class TestDeduplicateRLDataset(unittest.TestCase):
        """Verify that loading with deduplication removes duplicates."""

        # Load the dataset using the deduplication setting
-        train_dataset, _ = load_prepare_preference_datasets(self.cfg)
+        train_dataset, _ = load_prepare_dpo_datasets(self.cfg)

        # Verify that the dataset has been deduplicated
        assert len(train_dataset) == 1800, "Dataset was not properly deduplicated"
@@ -245,7 +245,7 @@ class TestDeduplicateRLDataset(unittest.TestCase):
        """Verify that loading without deduplication retains duplicates."""
        self.cfg.dataset_exact_deduplication = False
        # Load the dataset without deduplication
-        train_dataset, _ = load_prepare_preference_datasets(self.cfg)
+        train_dataset, _ = load_prepare_dpo_datasets(self.cfg)

        # Verify that the dataset retains duplicates
        assert (
Author	SHA1	Message	Date
Wing Lian	4a0ab11fcf	chore: lint	2025-01-13 14:05:56 -05:00
Wing Lian	73b6b0a580	chore: lint	2025-01-13 13:56:16 -05:00
Wing Lian	9db5072407	make sure to use tensorboard to capture loss for checks	2025-01-13 13:56:16 -05:00
Wing Lian	42d3e36a6f	fix adapter model check	2025-01-13 13:56:15 -05:00
Wing Lian	b12d93bedf	make sure to use the correct tokenizer	2025-01-13 13:56:15 -05:00
Wing Lian	08ec9c0e5b	make sure to set tokenizer from l3 70b and save safetensors	2025-01-13 13:56:15 -05:00
Wing Lian	9abac55f92	lower lr	2025-01-13 13:56:15 -05:00
Wing Lian	800e7fa41e	set lora_dropout explicitly	2025-01-13 13:56:15 -05:00
Wing Lian	5a1c1b82d4	make the kd e2e fit in vram for ci and add lora version	2025-01-13 13:56:15 -05:00
Wing Lian	efb3f70d38	rename test files so it gets picked up	2025-01-13 13:56:15 -05:00
Wing Lian	58d9896777	linting	2025-01-13 13:56:15 -05:00
Wing Lian	f7963083b8	add kd trainer e2e test	2025-01-13 13:56:15 -05:00
Wing Lian	f0b6581f8c	reward model doesn't work well with batched	2025-01-13 13:56:15 -05:00
Wing Lian	27bb21c459	improve check for batched	2025-01-13 13:56:15 -05:00
Wing Lian	74d98ca6d8	fix reward trainer calls for tokenization	2025-01-13 13:56:14 -05:00
Wing Lian	ec4dfb02c8	reward can use same batch check	2025-01-13 13:56:14 -05:00
Wing Lian	28ef5e8d5a	tweak check for batched prompt data	2025-01-13 13:56:14 -05:00
Wing Lian	5ed2823855	ensure that batch vs single is done properly	2025-01-13 13:56:14 -05:00
Wing Lian	fb0775d264	improve iterable support	2025-01-13 13:56:12 -05:00
Wing Lian	7cd0a317cb	support streaming for processing sft datasets?	2025-01-13 13:41:36 -05:00
Wing Lian	1cc3a2d16c	make loss torch script compat	2025-01-13 13:41:36 -05:00
Wing Lian	287d2ca8d5	kd sample packing	2025-01-13 13:41:36 -05:00
Wing Lian	03b86df506	be a bit pickier about loading dynamic prompt strategies	2025-01-13 13:41:36 -05:00
Wing Lian	2ed4246949	more info on preprocess for kd and fix import	2025-01-13 13:41:35 -05:00
Wing Lian	35bc2e2d3f	remove duplicate code	2025-01-13 13:41:35 -05:00
Wing Lian	94f1094805	add copyrights	2025-01-13 13:41:35 -05:00
Wing Lian	a0070bf94e	increase logging around loading plugins	2025-01-13 13:41:35 -05:00
Wing Lian	2ee2ffd834	make plugin setup concise	2025-01-13 13:41:35 -05:00
Wing Lian	723b0a2dee	remove moved class from import	2025-01-13 13:41:35 -05:00
Wing Lian	327739c9e3	move more things to kd plugin	2025-01-13 13:41:35 -05:00
Wing Lian	8aafe142f2	refactor kd chat template loader	2025-01-13 13:41:35 -05:00
Wing Lian	a0d6d8895e	support for custom trainer classes from plugins	2025-01-13 13:41:34 -05:00
Wing Lian	55b33cc44d	handle token/logprob shifting	2025-01-13 13:41:34 -05:00
Wing Lian	69ed25e82c	remove references to triton kd for now	2025-01-13 13:41:34 -05:00
Wing Lian	2ea8b7e518	add license block	2025-01-13 13:41:34 -05:00
Wing Lian	aa081e0e76	refactor so we can easily add new loss functions	2025-01-13 13:41:34 -05:00
Wing Lian	3f97ec45fb	chore: lint	2025-01-13 13:41:34 -05:00
Wing Lian	7b5a24b0d2	var naming and add todo	2025-01-13 13:41:34 -05:00
Wing Lian	4ddd089d0a	fix kd loss so it's causal (fixes repeating tokens)	2025-01-13 13:41:34 -05:00
Wing Lian	b88128d067	use kd_alpha in the correct loss method	2025-01-13 13:41:32 -05:00
Wing Lian	2e6422a711	hash for temperature too	2025-01-13 13:40:19 -05:00
Wing Lian	6ad809287b	better rescaling for temperatures	2025-01-13 13:40:19 -05:00
Wing Lian	e376e00386	don't use triton for now	2025-01-13 13:40:19 -05:00
Wing Lian	23d7ae6caa	fix kwarg	2025-01-13 13:40:19 -05:00
Wing Lian	19638590d5	v3	2025-01-13 13:40:18 -05:00
Wing Lian	73f5b83431	no torch.tensor	2025-01-13 13:40:18 -05:00
Wing Lian	9b1164b841	no log etc	2025-01-13 13:40:18 -05:00
Wing Lian	5a7d6f6175	no torch.exp inside triton kernel	2025-01-13 13:40:18 -05:00
Wing Lian	a803c3d3ee	v2 trial	2025-01-13 13:40:18 -05:00
Wing Lian	48ccf55752	no where support	2025-01-13 13:40:18 -05:00
Wing Lian	bc3326a808	triton wip	2025-01-13 13:40:18 -05:00
Wing Lian	cf8174db75	chore: lint	2025-01-13 13:40:18 -05:00
Wing Lian	222dc27410	make sure to multiply against the correct loss	2025-01-13 13:40:18 -05:00
Wing Lian	1107f1f603	cross entropy loss coefficient during KD	2025-01-13 13:40:18 -05:00
Wing Lian	1c603da96a	flipped the slice	2025-01-13 13:40:17 -05:00
Wing Lian	283faf3909	make it work	2025-01-13 13:40:17 -05:00
Wing Lian	472f7048e5	handle padding/collation for KD datasets	2025-01-13 13:40:17 -05:00
Wing Lian	3d1e2dcef4	make batch smaller	2025-01-13 13:40:17 -05:00
Wing Lian	9e218fbcfd	filter bad rows	2025-01-13 13:40:17 -05:00
Wing Lian	11caf52529	KD dataset loading and KD with logprobs	2025-01-13 13:40:17 -05:00
Wing Lian	17ba9dcfdb	refactor trainer to prevent circular dependencies later fix loader default	2025-01-13 13:40:17 -05:00