Compare commits

..

14 Commits

Author SHA1 Message Date
Dan Saunders
a030dad657 fix 2025-01-13 17:25:12 +00:00
Dan Saunders
3b82fc36ec review comments 2025-01-13 17:20:10 +00:00
Wing Lian
18a36b31ef make sure the batch dataset patcher for multipack is always loaded when handling datasets 2025-01-13 17:19:06 +00:00
Dan Saunders
705e7dc270 typing fixes 2025-01-13 17:19:06 +00:00
Dan Saunders
c9e37496cb Fix 2025-01-13 17:19:06 +00:00
Dan Saunders
210c58a4db fix 2025-01-13 17:19:06 +00:00
Dan Saunders
5ff1322f32 review comments 2025-01-13 17:19:06 +00:00
Dan Saunders
2b7b37413d pytest fixes 2025-01-13 17:19:06 +00:00
Dan Saunders
6e72baf287 continued cleanup and documentation 2025-01-13 17:19:02 +00:00
Dan Saunders
929ee15cc3 remove finetune.py script 2025-01-13 17:05:38 +00:00
Dan Saunders
773c3b51cd Adding documentation and continuing cleanup (in progress) 2025-01-13 17:05:38 +00:00
Dan Saunders
324c533adb cleanup and (partial) docs 2025-01-13 17:05:38 +00:00
Dan Saunders
6f80d1d670 fix 2025-01-13 17:05:38 +00:00
Dan Saunders
541f9b39ff CLI init refactor 2025-01-13 17:05:38 +00:00
20 changed files with 54 additions and 150 deletions

View File

@@ -6,6 +6,5 @@ python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"
pytest -v --durations=10 -n8 --ignore=tests/e2e/ --ignore=tests/patched/ /workspace/axolotl/tests/ pytest -v --durations=10 -n8 --ignore=tests/e2e/ --ignore=tests/patched/ /workspace/axolotl/tests/
# pytest -v --durations=10 -n8 --dist loadfile /workspace/axolotl/tests/patched/ # pytest -v --durations=10 -n8 --dist loadfile /workspace/axolotl/tests/patched/
pytest -v --durations=10 /workspace/axolotl/tests/e2e/patched/ pytest -v --durations=10 /workspace/axolotl/tests/e2e/patched/
pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/solo/
pytest -v --durations=10 /workspace/axolotl/tests/e2e/integrations/ pytest -v --durations=10 /workspace/axolotl/tests/e2e/integrations/
pytest -v --durations=10 --ignore=tests/e2e/solo/ --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ --ignore=tests/e2e/integrations/ /workspace/axolotl/tests/e2e/ pytest -v --durations=10 --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ --ignore=tests/e2e/integrations/ /workspace/axolotl/tests/e2e/

View File

@@ -244,8 +244,6 @@ total_num_tokens:
sample_packing_group_size: 100000 sample_packing_group_size: 100000
# The number of samples which can be packed into one sequence. Increase if using a large sequence_len with many short samples. # The number of samples which can be packed into one sequence. Increase if using a large sequence_len with many short samples.
sample_packing_bin_size: 200 sample_packing_bin_size: 200
# whether to concatenate samples during pretraining
pretraining_sample_concatenation:
# Use batch flattening for speedups when not using sample_packing # Use batch flattening for speedups when not using sample_packing
batch_flattening: batch_flattening:

View File

@@ -13,9 +13,9 @@ liger-kernel==0.5.2
packaging==23.2 packaging==23.2
peft==0.14.0 peft==0.14.0
transformers @ git+https://github.com/huggingface/transformers.git@mueller-trainer-refactor transformers==4.47.1
tokenizers>=0.21.0 tokenizers>=0.21.0
accelerate==1.3.0 accelerate==1.2.1
datasets==3.2.0 datasets==3.2.0
deepspeed==0.16.1 deepspeed==0.16.1
trl==0.13.0 trl==0.13.0

View File

@@ -11,7 +11,7 @@ from datasets import Dataset
import axolotl.monkeypatch.data.batch_dataset_fetcher # pylint: disable=unused-import # noqa: F401 import axolotl.monkeypatch.data.batch_dataset_fetcher # pylint: disable=unused-import # noqa: F401
from axolotl.cli.args import PreprocessCliArgs, TrainerCliArgs from axolotl.cli.args import PreprocessCliArgs, TrainerCliArgs
from axolotl.utils.data import prepare_dataset from axolotl.utils.data import prepare_dataset
from axolotl.utils.data.rl import load_prepare_preference_datasets from axolotl.utils.data.rl import load_prepare_dpo_datasets
from axolotl.utils.dict import DictDefault from axolotl.utils.dict import DictDefault
from axolotl.utils.models import load_processor, load_tokenizer from axolotl.utils.models import load_processor, load_tokenizer
from axolotl.utils.tokenization import check_dataset_labels from axolotl.utils.tokenization import check_dataset_labels
@@ -103,9 +103,9 @@ def load_preference_datasets(
cli_args: Union[PreprocessCliArgs, TrainerCliArgs], cli_args: Union[PreprocessCliArgs, TrainerCliArgs],
) -> TrainDatasetMeta: ) -> TrainDatasetMeta:
""" """
Loads one or more training or evaluation datasets for RL training using paired Loads one or more training or evaluation datasets for DPO training, calling
preference data, calling `axolotl.utils.data.rl.load_prepare_preference_datasets`. `axolotl.utils.data.rl.load_prepare_dpo_datasets`. Optionally, logs out debug
Optionally, logs out debug information. information.
Args: Args:
cfg: Dictionary mapping `axolotl` config keys to values. cfg: Dictionary mapping `axolotl` config keys to values.
@@ -115,7 +115,7 @@ def load_preference_datasets(
Dataclass with fields for training and evaluation datasets and the computed Dataclass with fields for training and evaluation datasets and the computed
`total_num_steps`. `total_num_steps`.
""" """
train_dataset, eval_dataset = load_prepare_preference_datasets(cfg) train_dataset, eval_dataset = load_prepare_dpo_datasets(cfg)
total_num_steps = int( total_num_steps = int(
math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size) math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
) )

View File

@@ -1877,8 +1877,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
self, training_args: AxolotlTrainingArguments, is_eval=False, **kwargs self, training_args: AxolotlTrainingArguments, is_eval=False, **kwargs
): ):
if training_args.pretraining: if training_args.pretraining:
if self.cfg.pretraining_sample_concatenation is False:
return DataCollatorForSeq2Seq(self.tokenizer, **kwargs)
return None return None
if self.cfg.model_config_type == "mamba": if self.cfg.model_config_type == "mamba":

View File

@@ -14,85 +14,15 @@ LOG = logging.getLogger("axolotl.monkeypatch.trainer_grad_accum")
ORIGINAL_CONTEXT_CODE = """ ORIGINAL_CONTEXT_CODE = """
with self.compute_loss_context_manager(): with self.compute_loss_context_manager():
if self.model_accepts_loss_kwargs:
loss = self.compute_loss(model, inputs)
else:
loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
del inputs
if (
self.args.torch_empty_cache_steps is not None
and self.state.global_step % self.args.torch_empty_cache_steps == 0
):
if is_torch_xpu_available():
torch.xpu.empty_cache()
elif is_torch_mlu_available():
torch.mlu.empty_cache()
elif is_torch_musa_available():
torch.musa.empty_cache()
elif is_torch_npu_available():
torch.npu.empty_cache()
elif is_torch_mps_available(min_version="2.0"):
torch.mps.empty_cache()
else:
torch.cuda.empty_cache()
kwargs = {}
# For LOMO optimizers you need to explicitly use the learning rate
if self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]:
kwargs["learning_rate"] = self._get_learning_rate()
if self.args.n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu parallel training
if self.use_apex:
with amp.scale_loss(loss, self.optimizer) as scaled_loss:
scaled_loss.backward()
else:
# Finally we need to normalize the loss for reporting
if num_items_in_batch is None:
loss = loss / self.args.gradient_accumulation_steps
""" """
PATCHED_CONTEXT_CODE = """ PATCHED_CONTEXT_CODE = """
with self.compute_loss_context_manager(): with self.compute_loss_context_manager():
if self.model_accepts_loss_kwargs:
loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
del inputs
if (
self.args.torch_empty_cache_steps is not None
and self.state.global_step % self.args.torch_empty_cache_steps == 0
):
if is_torch_xpu_available():
torch.xpu.empty_cache()
elif is_torch_mlu_available():
torch.mlu.empty_cache()
elif is_torch_musa_available():
torch.musa.empty_cache()
elif is_torch_npu_available():
torch.npu.empty_cache()
elif is_torch_mps_available(min_version="2.0"):
torch.mps.empty_cache()
else: else:
torch.cuda.empty_cache() loss = self.compute_loss(model, inputs)
kwargs = {}
# For LOMO optimizers you need to explicitly use the learning rate
if self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]:
kwargs["learning_rate"] = self._get_learning_rate()
if self.args.n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu parallel training
if self.use_apex:
with amp.scale_loss(loss, self.optimizer) as scaled_loss:
scaled_loss.backward()
else:
# Finally we need to normalize the loss for reporting
if not self.model_accepts_loss_kwargs and self.compute_loss_func is None:
loss = loss / self.args.gradient_accumulation_steps
""" """
ORIGINAL_LLAMA_FCLM_CODE = """ ORIGINAL_LLAMA_FCLM_CODE = """

View File

@@ -706,12 +706,6 @@ class AxolotlInputConfig(
pad_to_sequence_len: Optional[bool] = None pad_to_sequence_len: Optional[bool] = None
curriculum_sampling: Optional[bool] = None curriculum_sampling: Optional[bool] = None
multipack_real_batches: Optional[bool] = None multipack_real_batches: Optional[bool] = None
pretraining_sample_concatenation: Optional[bool] = Field(
default=None,
json_schema_extra={
"description": "whether to soft pack/concatenate samples during pretraining",
},
)
batch_flattening: Optional[Union[Literal["auto"], bool]] = None batch_flattening: Optional[Union[Literal["auto"], bool]] = None

View File

@@ -5,7 +5,7 @@ from axolotl.utils.data.pretraining import ( # noqa: F401
encode_pretraining, encode_pretraining,
wrap_pretraining_dataset, wrap_pretraining_dataset,
) )
from axolotl.utils.data.rl import load_prepare_preference_datasets # noqa: F401 from axolotl.utils.data.rl import load_prepare_dpo_datasets # noqa: F401
from axolotl.utils.data.sft import ( # noqa: F401 from axolotl.utils.data.sft import ( # noqa: F401
get_dataset_wrapper, get_dataset_wrapper,
load_prepare_datasets, load_prepare_datasets,

View File

@@ -18,14 +18,10 @@ LOG = logging.getLogger("axolotl")
def encode_pretraining( def encode_pretraining(
tokenizer: PreTrainedTokenizerBase, tokenizer: PreTrainedTokenizerBase, max_tokens: int, examples: Dict[str, List]
max_tokens: int,
examples: Dict[str, List],
text_column: str = "text",
concatenate: bool = True,
) -> Dict[str, List]: ) -> Dict[str, List]:
res = tokenizer( res = tokenizer(
examples[text_column], examples["text"],
truncation=True, truncation=True,
max_length=max_tokens - 2, max_length=max_tokens - 2,
add_special_tokens=True, add_special_tokens=True,
@@ -34,13 +30,6 @@ def encode_pretraining(
input_ids = [torch.tensor(seq) for seq in res["input_ids"]] input_ids = [torch.tensor(seq) for seq in res["input_ids"]]
targets = [torch.tensor(seq) for seq in res["input_ids"]] targets = [torch.tensor(seq) for seq in res["input_ids"]]
attention_mask = [torch.tensor(seq) for seq in res["attention_mask"]] attention_mask = [torch.tensor(seq) for seq in res["attention_mask"]]
if not concatenate:
return {
"input_ids": [seq.tolist() for seq in input_ids],
"labels": [seq.tolist() for seq in targets],
"attention_mask": [seq.tolist() for seq in attention_mask],
}
new_input_ids = [] new_input_ids = []
new_labels = [] new_labels = []
new_attention_mask = [] new_attention_mask = []
@@ -207,13 +196,7 @@ def wrap_pretraining_dataset(
# set this to 1 so downstream data_loader doesn't try to increase the batch again # set this to 1 so downstream data_loader doesn't try to increase the batch again
cfg.micro_batch_size = 1 cfg.micro_batch_size = 1
else: else:
encode = functools.partial( encode = functools.partial(encode_pretraining, tokenizer, max_tokens)
encode_pretraining,
tokenizer,
max_tokens,
text_column=cfg.pretraining_dataset[0].text_column or "text",
concatenate=cfg.pretraining_sample_concatenation is True,
)
if cfg.shuffle_merged_datasets: if cfg.shuffle_merged_datasets:
dataset = dataset.shuffle(seed=seed, buffer_size=buffer_size) dataset = dataset.shuffle(seed=seed, buffer_size=buffer_size)

View File

@@ -115,7 +115,7 @@ def drop_long_rl_seq(
raise ValueError("Unknown RL type") raise ValueError("Unknown RL type")
def load_prepare_preference_datasets(cfg): def load_prepare_dpo_datasets(cfg):
def load_split(dataset_cfgs, _cfg): def load_split(dataset_cfgs, _cfg):
split_datasets: List[Any] = [] split_datasets: List[Any] = []
for i, ds_cfg in enumerate(dataset_cfgs): for i, ds_cfg in enumerate(dataset_cfgs):

View File

@@ -386,15 +386,16 @@ class ModelLoader:
if self.cfg.flash_attention: if self.cfg.flash_attention:
self.patch_attention() self.patch_attention()
# if self.cfg.model_config_type == "llama": if self.cfg.model_config_type == "llama":
# from axolotl.monkeypatch.trainer_grad_accum import ( # patch_forward_for_ga, from axolotl.monkeypatch.trainer_grad_accum import (
# patch_flash_attention_forward, patch_flash_attention_forward,
# patch_training_step_for_ga, patch_forward_for_ga,
# ) patch_training_step_for_ga,
# )
# patch_flash_attention_forward()
# # patch_forward_for_ga() patch_flash_attention_forward()
# patch_training_step_for_ga() patch_forward_for_ga()
patch_training_step_for_ga()
if self.cfg.sample_packing and self.cfg.s2_attention: if self.cfg.sample_packing and self.cfg.s2_attention:
raise ValueError( raise ValueError(
@@ -1056,7 +1057,7 @@ class ModelLoader:
) )
if ( if (
hasattr(self.model, "get_input_embeddings") hasattr(self.model, "get_input_embeddings")
and self.model.get_input_embeddings().num_embeddings != embeddings_len and self.model.get_input_embeddings().num_embeddings < embeddings_len
): ):
resize_kwargs = {} resize_kwargs = {}
if self.cfg.mean_resizing_embeddings is not None: if self.cfg.mean_resizing_embeddings is not None:

View File

@@ -102,5 +102,9 @@ class TestMixtral(unittest.TestCase):
cli_args = TrainerCliArgs() cli_args = TrainerCliArgs()
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
train(cfg=cfg, dataset_meta=dataset_meta) model, _ = train(cfg=cfg, dataset_meta=dataset_meta)
assert (
"MixtralFlashAttention2"
in model.model.layers[0].self_attn.__class__.__name__
)
check_model_output_exists(temp_dir, cfg) check_model_output_exists(temp_dir, cfg)

View File

@@ -49,7 +49,12 @@ class TestModelPatches(unittest.TestCase):
) )
normalize_config(cfg) normalize_config(cfg)
tokenizer = load_tokenizer(cfg) tokenizer = load_tokenizer(cfg)
load_model(cfg, tokenizer, inference=False) model, _ = load_model(cfg, tokenizer, inference=False)
assert (
"MixtralFlashAttention2"
in model.model.layers[0].self_attn.__class__.__name__
)
@with_temp_dir @with_temp_dir
def test_mistral_multipack(self, temp_dir): def test_mistral_multipack(self, temp_dir):

View File

@@ -3,6 +3,8 @@ import unittest
import pytest import pytest
from axolotl.monkeypatch.unsloth_ import check_self_attn_is_patchable
@pytest.mark.skip( @pytest.mark.skip(
reason="Unsloth integration will be broken going into latest transformers" reason="Unsloth integration will be broken going into latest transformers"
@@ -11,8 +13,6 @@ class TestUnslothIntegration(unittest.TestCase):
"""Unsloth monkeypatch integration tests.""" """Unsloth monkeypatch integration tests."""
def test_is_self_attn_patchable(self): def test_is_self_attn_patchable(self):
from axolotl.monkeypatch.unsloth_ import check_self_attn_is_patchable
# ensures the current version of transformers has loss code that matches our patching code # ensures the current version of transformers has loss code that matches our patching code
self.assertTrue( self.assertTrue(
check_self_attn_is_patchable(), check_self_attn_is_patchable(),

View File

@@ -4,8 +4,7 @@ E2E tests for llama pretrain
import logging import logging
import os import os
import unittest
import pytest
from axolotl.cli.args import TrainerCliArgs from axolotl.cli.args import TrainerCliArgs
from axolotl.common.datasets import load_datasets from axolotl.common.datasets import load_datasets
@@ -13,22 +12,19 @@ from axolotl.train import train
from axolotl.utils.config import normalize_config from axolotl.utils.config import normalize_config
from axolotl.utils.dict import DictDefault from axolotl.utils.dict import DictDefault
from .utils import check_model_output_exists from .utils import check_model_output_exists, with_temp_dir
LOG = logging.getLogger("axolotl.tests.e2e") LOG = logging.getLogger("axolotl.tests.e2e")
os.environ["WANDB_DISABLED"] = "true" os.environ["WANDB_DISABLED"] = "true"
class TestPretrainLlama: class TestPretrainLlama(unittest.TestCase):
""" """
Test case for Llama models w pretraining Test case for Llama models w pretraining
""" """
@pytest.mark.parametrize( @with_temp_dir
"sample_packing", def test_pretrain_w_sample_packing(self, temp_dir):
[True, False],
)
def test_pretrain(self, temp_dir, sample_packing):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
cfg = DictDefault( cfg = DictDefault(
{ {
@@ -36,7 +32,7 @@ class TestPretrainLlama:
"tokenizer_type": "LlamaTokenizer", "tokenizer_type": "LlamaTokenizer",
"flash_attention": True, "flash_attention": True,
"sequence_len": 1024, "sequence_len": 1024,
"sample_packing": sample_packing, "sample_packing": True,
"special_tokens": { "special_tokens": {
"unk_token": "<unk>", "unk_token": "<unk>",
"bos_token": "<s>", "bos_token": "<s>",

View File

@@ -13,7 +13,7 @@ from axolotl.train import train
from axolotl.utils.config import normalize_config from axolotl.utils.config import normalize_config
from axolotl.utils.dict import DictDefault from axolotl.utils.dict import DictDefault
from ..utils import check_model_output_exists, check_tensorboard, with_temp_dir from .utils import check_model_output_exists, check_tensorboard, with_temp_dir
LOG = logging.getLogger("axolotl.tests.e2e") LOG = logging.getLogger("axolotl.tests.e2e")
os.environ["WANDB_DISABLED"] = "true" os.environ["WANDB_DISABLED"] = "true"

View File

@@ -1,8 +1,6 @@
""""Test module for checking whether the Hugging Face Transformers is working as expected.""" """"Test module for checking whether the Hugging Face Transformers is working as expected."""
import unittest import unittest
import pytest
from axolotl.monkeypatch.trainer_grad_accum import ( from axolotl.monkeypatch.trainer_grad_accum import (
check_forward_is_patchable, check_forward_is_patchable,
check_training_step_is_patchable, check_training_step_is_patchable,
@@ -12,7 +10,6 @@ from axolotl.monkeypatch.trainer_grad_accum import (
class TestTrainerGAIntegration(unittest.TestCase): class TestTrainerGAIntegration(unittest.TestCase):
"""llama monkeypatch integration tests.""" """llama monkeypatch integration tests."""
@pytest.mark.skip("may not be needed for latest transformers version")
def test_train_step_patchable(self): def test_train_step_patchable(self):
# ensures the current version of transformers has loss code that matches our patching code # ensures the current version of transformers has loss code that matches our patching code
self.assertTrue( self.assertTrue(
@@ -20,7 +17,6 @@ class TestTrainerGAIntegration(unittest.TestCase):
"HF transformers Trainer.training_step has changed and isn't patchable", "HF transformers Trainer.training_step has changed and isn't patchable",
) )
@pytest.mark.skip("may not be needed for latest transformers version")
def test_model_forward_patchable(self): def test_model_forward_patchable(self):
# ensures the current version of transformers has loss code that matches our patching code # ensures the current version of transformers has loss code that matches our patching code
self.assertTrue( self.assertTrue(

View File

@@ -17,7 +17,7 @@ from huggingface_hub import snapshot_download
from transformers import AutoTokenizer from transformers import AutoTokenizer
from axolotl.utils.data import load_tokenized_prepared_datasets from axolotl.utils.data import load_tokenized_prepared_datasets
from axolotl.utils.data.rl import load_prepare_preference_datasets from axolotl.utils.data.rl import load_prepare_dpo_datasets
from axolotl.utils.dict import DictDefault from axolotl.utils.dict import DictDefault
@@ -280,7 +280,7 @@ class TestDatasetPreparation(unittest.TestCase):
} }
) )
train_dataset, _ = load_prepare_preference_datasets(cfg) train_dataset, _ = load_prepare_dpo_datasets(cfg)
assert len(train_dataset) == 1800 assert len(train_dataset) == 1800
assert "conversation" in train_dataset.features assert "conversation" in train_dataset.features
@@ -329,7 +329,7 @@ class TestDatasetPreparation(unittest.TestCase):
} }
) )
train_dataset, _ = load_prepare_preference_datasets(cfg) train_dataset, _ = load_prepare_dpo_datasets(cfg)
assert len(train_dataset) == 1800 assert len(train_dataset) == 1800
assert "conversation" in train_dataset.features assert "conversation" in train_dataset.features

View File

@@ -12,7 +12,7 @@ from datasets import Dataset
from transformers import AutoTokenizer from transformers import AutoTokenizer
from axolotl.utils.data import prepare_dataset from axolotl.utils.data import prepare_dataset
from axolotl.utils.data.rl import load_prepare_preference_datasets from axolotl.utils.data.rl import load_prepare_dpo_datasets
from axolotl.utils.data.utils import deduplicate_and_log_datasets from axolotl.utils.data.utils import deduplicate_and_log_datasets
from axolotl.utils.dict import DictDefault from axolotl.utils.dict import DictDefault
from axolotl.utils.models import load_processor, load_tokenizer from axolotl.utils.models import load_processor, load_tokenizer
@@ -236,7 +236,7 @@ class TestDeduplicateRLDataset(unittest.TestCase):
"""Verify that loading with deduplication removes duplicates.""" """Verify that loading with deduplication removes duplicates."""
# Load the dataset using the deduplication setting # Load the dataset using the deduplication setting
train_dataset, _ = load_prepare_preference_datasets(self.cfg) train_dataset, _ = load_prepare_dpo_datasets(self.cfg)
# Verify that the dataset has been deduplicated # Verify that the dataset has been deduplicated
assert len(train_dataset) == 1800, "Dataset was not properly deduplicated" assert len(train_dataset) == 1800, "Dataset was not properly deduplicated"
@@ -245,7 +245,7 @@ class TestDeduplicateRLDataset(unittest.TestCase):
"""Verify that loading without deduplication retains duplicates.""" """Verify that loading without deduplication retains duplicates."""
self.cfg.dataset_exact_deduplication = False self.cfg.dataset_exact_deduplication = False
# Load the dataset without deduplication # Load the dataset without deduplication
train_dataset, _ = load_prepare_preference_datasets(self.cfg) train_dataset, _ = load_prepare_dpo_datasets(self.cfg)
# Verify that the dataset retains duplicates # Verify that the dataset retains duplicates
assert ( assert (