Address review comments; update docstrings

2025-03-19 17:35:09 +00:00
parent a26985c53c
commit 2f0b4626b9
5 changed files with 35 additions and 35 deletions
--- a/tests/test_exact_deduplication.py
+++ b/tests/test_exact_deduplication.py
@@ -12,6 +12,7 @@ from constants import ALPACA_MESSAGES_CONFIG_REVISION, SPECIAL_TOKENS
 from datasets import Dataset
 from transformers import AutoTokenizer

+from axolotl.utils.config import normalize_config
 from axolotl.utils.data import prepare_dataset
 from axolotl.utils.data.rl import load_prepare_preference_datasets
 from axolotl.utils.data.utils import deduplicate_and_log_datasets
@@ -262,6 +263,7 @@ class TestDeduplicateNonRL(unittest.TestCase):
        self.tokenizer.add_special_tokens(SPECIAL_TOKENS)
        self.cfg_1 = DictDefault(
            {
+                "base_model": "huggyllama/llama-7b",
                "tokenizer_config": "huggyllama/llama-7b",
                "sequence_len": 1024,
                "dataset_exact_deduplication": True,
@@ -280,9 +282,9 @@ class TestDeduplicateNonRL(unittest.TestCase):
                "batch_size": 10,
                "micro_batch_size": 10,
                "num_epochs": 1,
-                "sequence_parallel_degree": 1,
            }
        )
+        normalize_config(self.cfg_1)

    def test_prepare_dataset_with_deduplication_train(self):
        """Verify that prepare_dataset function processes the dataset correctly with deduplication."""