fix

review comments
make sure the batch dataset patcher for multipack is always loaded when handling datasets
2025-01-13 17:25:12 +00:00 · 2025-01-13 17:20:10 +00:00 · 2025-01-13 17:19:06 +00:00 · 2025-01-13 17:19:06 +00:00 · 2025-01-13 17:19:06 +00:00 · 2025-01-13 17:19:06 +00:00
9 changed files with 24 additions and 36 deletions
--- a/scripts/chat_datasets.py
+++ b/scripts/chat_datasets.py
@@ -30,7 +30,7 @@ def parse_dataset(dataset=None, split="train"):
        )
    ds_cfg["field_messages"] = field_messages

-    message_fields = features[field_messages][0].keys()
+    message_fields = features["conversations"][0].keys()
    message_field_role = None
    for key in ["from", "role"]:
        if key in message_fields:
--- a/src/axolotl/common/datasets.py
+++ b/src/axolotl/common/datasets.py
@@ -11,7 +11,7 @@ from datasets import Dataset
 import axolotl.monkeypatch.data.batch_dataset_fetcher  # pylint: disable=unused-import  # noqa: F401
 from axolotl.cli.args import PreprocessCliArgs, TrainerCliArgs
 from axolotl.utils.data import prepare_dataset
-from axolotl.utils.data.rl import load_prepare_preference_datasets
+from axolotl.utils.data.rl import load_prepare_dpo_datasets
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.models import load_processor, load_tokenizer
 from axolotl.utils.tokenization import check_dataset_labels
@@ -103,9 +103,9 @@ def load_preference_datasets(
    cli_args: Union[PreprocessCliArgs, TrainerCliArgs],
 ) -> TrainDatasetMeta:
    """
-    Loads one or more training or evaluation datasets for RL training using paired
-    preference data, calling `axolotl.utils.data.rl.load_prepare_preference_datasets`.
-    Optionally, logs out debug information.
+    Loads one or more training or evaluation datasets for DPO training, calling
+    `axolotl.utils.data.rl.load_prepare_dpo_datasets`. Optionally, logs out debug
+    information.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
@@ -115,7 +115,7 @@ def load_preference_datasets(
        Dataclass with fields for training and evaluation datasets and the computed
        `total_num_steps`.
    """
-    train_dataset, eval_dataset = load_prepare_preference_datasets(cfg)
+    train_dataset, eval_dataset = load_prepare_dpo_datasets(cfg)
    total_num_steps = int(
        math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
    )
--- a/src/axolotl/utils/data/__init__.py
+++ b/src/axolotl/utils/data/__init__.py
@@ -5,7 +5,7 @@ from axolotl.utils.data.pretraining import (  # noqa: F401
    encode_pretraining,
    wrap_pretraining_dataset,
 )
-from axolotl.utils.data.rl import load_prepare_preference_datasets  # noqa: F401
+from axolotl.utils.data.rl import load_prepare_dpo_datasets  # noqa: F401
 from axolotl.utils.data.sft import (  # noqa: F401
    get_dataset_wrapper,
    load_prepare_datasets,
--- a/src/axolotl/utils/data/pretraining.py
+++ b/src/axolotl/utils/data/pretraining.py
@@ -18,13 +18,10 @@ LOG = logging.getLogger("axolotl")


 def encode_pretraining(
-    tokenizer: PreTrainedTokenizerBase,
-    max_tokens: int,
-    examples: Dict[str, List],
-    text_column: str = "text",
+    tokenizer: PreTrainedTokenizerBase, max_tokens: int, examples: Dict[str, List]
 ) -> Dict[str, List]:
    res = tokenizer(
-        examples[text_column],
+        examples["text"],
        truncation=True,
        max_length=max_tokens - 2,
        add_special_tokens=True,
@@ -199,12 +196,7 @@ def wrap_pretraining_dataset(
        # set this to 1 so downstream data_loader doesn't try to increase the batch again
        cfg.micro_batch_size = 1
    else:
-        encode = functools.partial(
-            encode_pretraining,
-            tokenizer,
-            max_tokens,
-            text_column=cfg.pretraining_dataset[0].text_column or "text",
-        )
+        encode = functools.partial(encode_pretraining, tokenizer, max_tokens)

    if cfg.shuffle_merged_datasets:
        dataset = dataset.shuffle(seed=seed, buffer_size=buffer_size)
--- a/src/axolotl/utils/data/rl.py
+++ b/src/axolotl/utils/data/rl.py
@@ -115,7 +115,7 @@ def drop_long_rl_seq(
    raise ValueError("Unknown RL type")


-def load_prepare_preference_datasets(cfg):
+def load_prepare_dpo_datasets(cfg):
    def load_split(dataset_cfgs, _cfg):
        split_datasets: List[Any] = []
        for i, ds_cfg in enumerate(dataset_cfgs):
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -1057,7 +1057,7 @@ class ModelLoader:
        )
        if (
            hasattr(self.model, "get_input_embeddings")
-            and self.model.get_input_embeddings().num_embeddings != embeddings_len
+            and self.model.get_input_embeddings().num_embeddings < embeddings_len
        ):
            resize_kwargs = {}
            if self.cfg.mean_resizing_embeddings is not None:
--- a/tests/e2e/test_llama_pretrain.py
+++ b/tests/e2e/test_llama_pretrain.py
@@ -4,8 +4,7 @@ E2E tests for llama pretrain

 import logging
 import os
-
-import pytest
+import unittest

 from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
@@ -13,22 +12,19 @@ from axolotl.train import train
 from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault

-from .utils import check_model_output_exists
+from .utils import check_model_output_exists, with_temp_dir

 LOG = logging.getLogger("axolotl.tests.e2e")
 os.environ["WANDB_DISABLED"] = "true"


-class TestPretrainLlama:
+class TestPretrainLlama(unittest.TestCase):
    """
    Test case for Llama models w pretraining
    """

-    @pytest.mark.parametrize(
-        "sample_packing",
-        [True, False],
-    )
-    def test_pretrain(self, temp_dir, sample_packing):
+    @with_temp_dir
+    def test_pretrain_w_sample_packing(self, temp_dir):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
@@ -36,7 +32,7 @@ class TestPretrainLlama:
                "tokenizer_type": "LlamaTokenizer",
                "flash_attention": True,
                "sequence_len": 1024,
-                "sample_packing": sample_packing,
+                "sample_packing": True,
                "special_tokens": {
                    "unk_token": "<unk>",
                    "bos_token": "<s>",
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -17,7 +17,7 @@ from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer

 from axolotl.utils.data import load_tokenized_prepared_datasets
-from axolotl.utils.data.rl import load_prepare_preference_datasets
+from axolotl.utils.data.rl import load_prepare_dpo_datasets
 from axolotl.utils.dict import DictDefault


@@ -280,7 +280,7 @@ class TestDatasetPreparation(unittest.TestCase):
            }
        )

-        train_dataset, _ = load_prepare_preference_datasets(cfg)
+        train_dataset, _ = load_prepare_dpo_datasets(cfg)

        assert len(train_dataset) == 1800
        assert "conversation" in train_dataset.features
@@ -329,7 +329,7 @@ class TestDatasetPreparation(unittest.TestCase):
            }
        )

-        train_dataset, _ = load_prepare_preference_datasets(cfg)
+        train_dataset, _ = load_prepare_dpo_datasets(cfg)

        assert len(train_dataset) == 1800
        assert "conversation" in train_dataset.features
--- a/tests/test_exact_deduplication.py
+++ b/tests/test_exact_deduplication.py
@@ -12,7 +12,7 @@ from datasets import Dataset
 from transformers import AutoTokenizer

 from axolotl.utils.data import prepare_dataset
-from axolotl.utils.data.rl import load_prepare_preference_datasets
+from axolotl.utils.data.rl import load_prepare_dpo_datasets
 from axolotl.utils.data.utils import deduplicate_and_log_datasets
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.models import load_processor, load_tokenizer
@@ -236,7 +236,7 @@ class TestDeduplicateRLDataset(unittest.TestCase):
        """Verify that loading with deduplication removes duplicates."""

        # Load the dataset using the deduplication setting
-        train_dataset, _ = load_prepare_preference_datasets(self.cfg)
+        train_dataset, _ = load_prepare_dpo_datasets(self.cfg)

        # Verify that the dataset has been deduplicated
        assert len(train_dataset) == 1800, "Dataset was not properly deduplicated"
@@ -245,7 +245,7 @@ class TestDeduplicateRLDataset(unittest.TestCase):
        """Verify that loading without deduplication retains duplicates."""
        self.cfg.dataset_exact_deduplication = False
        # Load the dataset without deduplication
-        train_dataset, _ = load_prepare_preference_datasets(self.cfg)
+        train_dataset, _ = load_prepare_dpo_datasets(self.cfg)

        # Verify that the dataset retains duplicates
        assert (
Author	SHA1	Message	Date
Dan Saunders	a030dad657	fix	2025-01-13 17:25:12 +00:00
Dan Saunders	3b82fc36ec	review comments	2025-01-13 17:20:10 +00:00
Wing Lian	18a36b31ef	make sure the batch dataset patcher for multipack is always loaded when handling datasets	2025-01-13 17:19:06 +00:00
Dan Saunders	705e7dc270	typing fixes	2025-01-13 17:19:06 +00:00
Dan Saunders	c9e37496cb	Fix	2025-01-13 17:19:06 +00:00
Dan Saunders	210c58a4db	fix	2025-01-13 17:19:06 +00:00
Dan Saunders	5ff1322f32	review comments	2025-01-13 17:19:06 +00:00
Dan Saunders	2b7b37413d	pytest fixes	2025-01-13 17:19:06 +00:00
Dan Saunders	6e72baf287	continued cleanup and documentation	2025-01-13 17:19:02 +00:00
Dan Saunders	929ee15cc3	remove finetune.py script	2025-01-13 17:05:38 +00:00
Dan Saunders	773c3b51cd	Adding documentation and continuing cleanup (in progress)	2025-01-13 17:05:38 +00:00
Dan Saunders	324c533adb	cleanup and (partial) docs	2025-01-13 17:05:38 +00:00
Dan Saunders	6f80d1d670	fix	2025-01-13 17:05:38 +00:00
Dan Saunders	541f9b39ff	CLI init refactor	2025-01-13 17:05:38 +00:00