Merge branch 'main' into print_venv

Fix: do not call preprocess in multimodal or pretraining case (#2861 )
* fix: let users know to not call preprocess for vision mode * fix: improve ux for pretraining dataset and skip prepare ds * feat: add info to doc * Update src/axolotl/cli/preprocess.py following comment Co-authored-by: salman <salman.mohammadi@outlook.com> --------- Co-authored-by: salman <salman.mohammadi@outlook.com>
2025-07-07 10:01:00 +01:00 · 2025-07-06 21:55:33 -04:00 · 2025-07-06 21:55:09 -04:00 · 2025-07-06 21:20:41 -04:00 · 2025-07-04 12:44:49 +01:00 · 2025-07-04 12:40:58 +01:00
9 changed files with 313 additions and 299 deletions
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -22,9 +22,11 @@ RUN apt-get update \
    && mkdir /root/.conda \
    && bash Miniconda3-latest-Linux-x86_64.sh -b \
    && rm -f Miniconda3-latest-Linux-x86_64.sh \
-    && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
+    && conda create -n "axolotl-py${PYTHON_VERSION}" python="${PYTHON_VERSION}" \
+    && conda init bash \
+    && echo "conda activate axolotl-py${PYTHON_VERSION}" >> ~/.bashrc

-ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
+ENV PATH="/root/miniconda3/envs/axolotl-py${PYTHON_VERSION}/bin:${PATH}"

 WORKDIR /workspace

--- a/docker/Dockerfile-base-next
+++ b/docker/Dockerfile-base-next
@@ -22,9 +22,11 @@ RUN apt-get update \
    && mkdir /root/.conda \
    && bash Miniconda3-latest-Linux-x86_64.sh -b \
    && rm -f Miniconda3-latest-Linux-x86_64.sh \
-    && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
+    && conda create -n "axolotl-py${PYTHON_VERSION}" python="${PYTHON_VERSION}" \
+    && conda init bash \
+    && echo "conda activate axolotl-py${PYTHON_VERSION}" >> ~/.bashrc

-ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
+ENV PATH="/root/miniconda3/envs/axolotl-py${PYTHON_VERSION}/bin:${PATH}"

 WORKDIR /workspace

--- a/docker/Dockerfile-base-nightly
+++ b/docker/Dockerfile-base-nightly
@@ -22,9 +22,11 @@ RUN apt-get update \
    && mkdir /root/.conda \
    && bash Miniconda3-latest-Linux-x86_64.sh -b \
    && rm -f Miniconda3-latest-Linux-x86_64.sh \
-    && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
+    && conda create -n "axolotl-py${PYTHON_VERSION}" python="${PYTHON_VERSION}" \
+    && conda init bash \
+    && echo "conda activate axolotl-py${PYTHON_VERSION}" >> ~/.bashrc

-ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
+ENV PATH="/root/miniconda3/envs/axolotl-py${PYTHON_VERSION}/bin:${PATH}"

 WORKDIR /workspace

--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -51,6 +51,10 @@ description: Frequently asked questions
 >   pad_token: "..."
 > ```

+**Q: `IterableDataset error` or `KeyError: 'input_ids'` when using `preprocess` CLI**
+
+> A: This happens when you run the `preprocess` CLI with `pretraining_dataset:` or `skip_prepare_dataset: true` set, respectively. Run the `axolotl train` CLI directly instead, as these datasets are prepared on demand and do not need a separate preprocessing step.
+
 ### Chat templates

 **Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`**
--- a/src/axolotl/cli/preprocess.py
+++ b/src/axolotl/cli/preprocess.py
@@ -35,6 +35,12 @@ def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None:
    check_accelerate_default_config()
    check_user_token()

+    for key in ["skip_prepare_dataset", "pretraining_dataset"]:
+        if cfg.get(key):
+            raise ValueError(
+                f"You have set `{key}:`. `preprocess` is not needed. Run the `axolotl train` CLI directly instead."
+            )
+
    if not cfg.dataset_prepared_path:
        msg = (
            Fore.RED
--- a/src/axolotl/utils/data/shared.py
+++ b/src/axolotl/utils/data/shared.py
@@ -526,8 +526,9 @@ def merge_datasets(datasets: list[Dataset], cfg: DictDefault) -> Dataset:
    if len(datasets) == 1:
        ds = datasets[0]

-        # Do not shuffle if curriculum sampling is enabled
-        if cfg.curriculum_sampling:
+        # Do not shuffle if curriculum sampling is enabled or
+        # shuffle_merged_datasets is disabled
+        if cfg.curriculum_sampling or not cfg.shuffle_merged_datasets:
            return ds

        return ds.shuffle(seed=cfg.seed)
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -609,6 +609,9 @@ def prepare_opinionated_env(cfg):
    if cfg.qlora_sharded_model_loading:
        # model loading is forked after the tokenizer
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
+    if cfg.sample_packing:
+        # multipack parallel packing sampler defaults to using fork
+        os.environ["TOKENIZERS_PARALLELISM"] = "false"


 def setup_trainer(
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -10,7 +10,7 @@ import shutil
 import sys
 import tempfile
 import time
-from pathlib import Path
+from pathlib import Path, PosixPath
 from typing import Generator

 import datasets
@@ -423,13 +423,9 @@ def temp_dir() -> Generator[str, None, None]:
    shutil.rmtree(_temp_dir)


-@pytest.fixture(scope="module")
-def module_temp_dir() -> Generator[str, None, None]:
-    # Create a temporary directory
-    _temp_dir = tempfile.mkdtemp()
-    yield _temp_dir
-    # Clean up the directory after the test
-    shutil.rmtree(_temp_dir)
+@pytest.fixture(scope="function", autouse=True)
+def unique_triton_cache_dir(temp_dir: str | PosixPath) -> None:
+    os.environ["TRITON_CACHE_DIR"] = str(temp_dir) + "/.triton/cache"


@pytest.fixture(scope="function", autouse=True)
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -2,8 +2,6 @@
 E2E tests for multigpu lora tinyllama
 """

-# pylint: disable=redefined-outer-name
-
 from pathlib import Path

 import pytest
@@ -27,60 +25,6 @@ def download_model():
    snapshot_download("HuggingFaceTB/SmolLM2-135M")


-@pytest.fixture(scope="module")
-def sft_base_cfg():
-    cfg = DictDefault(
-        base_model="HuggingFaceTB/SmolLM2-135M",
-        tokenizer_config="HuggingFaceTB/SmolLM2-135M",  # this has to be manually set since we haven't done validation
-        sequence_len=1024,
-        special_tokens={
-            "pad_token": "<|endoftext|>",
-        },
-        datasets=[
-            {
-                "path": "tatsu-lab/alpaca",
-                "type": "alpaca",
-                "split": "train[:10%]",
-            },
-        ],
-        val_set_size=0.1,
-        sample_packing=True,
-        flash_attention=True,
-        learning_rate=0.00001,
-        optimizer="adamw_8bit",
-        seed=42,
-        # these need to be set since we aren't running schema validation
-        micro_batch_size=2,
-        gradient_accumulation_steps=1,
-    )
-
-    return cfg
-
-
-@pytest.fixture(scope="module", name="sft_prepared_dataset_alpaca_cfg")
-def sft_prepared_dataset_alpaca_cfg(module_temp_dir, sft_base_cfg):
-    dataset_prepared_path = module_temp_dir + "/last_run_prepared"
-    cfg = sft_base_cfg | DictDefault(
-        dataset_prepared_path=dataset_prepared_path,
-    )
-
-    Path(module_temp_dir).mkdir(parents=True, exist_ok=True)
-    with open(Path(module_temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
-        fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
-
-    execute_subprocess_async(
-        [
-            "axolotl",
-            "preprocess",
-            str(Path(module_temp_dir) / "config.yaml"),
-        ]
-    )
-
-    # unset flash attention since we have some flex attention tests too
-    cfg.flash_attention = None
-    return cfg
-
-
 def transformers_version_eq(required_version):
    return version.parse(transformers.__version__) == version.parse(required_version)

@@ -153,36 +97,45 @@ class TestMultiGPULlama:
        "gradient_accumulation_steps",
        [1, 2],
    )
-    def test_lora_ddp_packed(
-        self, temp_dir, sft_prepared_dataset_alpaca_cfg, gradient_accumulation_steps
-    ):
+    def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps):
        # pylint: disable=duplicate-code
-        cfg = (
-            DictDefault(
-                {
-                    "eval_sample_packing": False,
-                    "pad_to_sequence_len": True,
-                    "adapter": "lora",
-                    "lora_r": 8,
-                    "lora_alpha": 16,
-                    "lora_dropout": 0.05,
-                    "lora_target_linear": True,
-                    "val_set_size": 0.05,
-                    "num_epochs": 1,
-                    "max_steps": 2,
-                    "micro_batch_size": 1,
-                    "gradient_accumulation_steps": gradient_accumulation_steps,
-                    # "gradient_checkpointing": True,
-                    "output_dir": temp_dir,
-                    "learning_rate": 0.00001,
-                    "optimizer": "adamw_8bit",
-                    "lr_scheduler": "cosine",
-                    "flash_attention": True,
-                    "use_tensorboard": True,
-                    "bf16": True,
-                }
-            )
-            | sft_prepared_dataset_alpaca_cfg
+        cfg = DictDefault(
+            {
+                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "sequence_len": 2048,
+                "sample_packing": True,
+                "eval_sample_packing": False,
+                "pad_to_sequence_len": True,
+                "adapter": "lora",
+                "lora_r": 8,
+                "lora_alpha": 16,
+                "lora_dropout": 0.05,
+                "lora_target_linear": True,
+                "val_set_size": 0.05,
+                "special_tokens": {
+                    "pad_token": "<|endoftext|>",
+                },
+                "datasets": [
+                    {
+                        "path": "tatsu-lab/alpaca",
+                        "type": "alpaca",
+                        "split": "train[:20%]",
+                    },
+                ],
+                "num_epochs": 1,
+                "max_steps": 2,
+                "micro_batch_size": 1,
+                "gradient_accumulation_steps": gradient_accumulation_steps,
+                # "gradient_checkpointing": True,
+                "output_dir": temp_dir,
+                "dataset_prepared_path": temp_dir + "/last_run_prepared",
+                "learning_rate": 0.00001,
+                "optimizer": "adamw_8bit",
+                "lr_scheduler": "cosine",
+                "flash_attention": True,
+                "use_tensorboard": True,
+                "bf16": True,
+            }
        )

        # write cfg to yaml file
@@ -432,50 +385,59 @@ class TestMultiGPULlama:
        )

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
        )

    @pytest.mark.parametrize(
        "fsdp_state_dict_type",
        ["FULL_STATE_DICT", "SHARDED_STATE_DICT"],
    )
-    def test_fsdp_packed(
-        self, temp_dir, sft_prepared_dataset_alpaca_cfg, fsdp_state_dict_type
-    ):
+    def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type):
        # pylint: disable=duplicate-code
-        cfg = (
-            DictDefault(
-                {
-                    "pad_to_sequence_len": True,
-                    "num_epochs": 1,
-                    "max_steps": 2,
-                    "micro_batch_size": 2,
-                    "gradient_accumulation_steps": 2,
-                    # "gradient_checkpointing": True,
-                    "output_dir": temp_dir,
-                    "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                    "learning_rate": 0.00001,
-                    "optimizer": "adamw_torch_fused",
-                    "lr_scheduler": "cosine",
-                    "flash_attention": True,
-                    "fsdp": [
-                        "full_shard",
-                        "auto_wrap",
-                    ],
-                    "fsdp_config": {
-                        "fsdp_limit_all_gathers": True,
-                        "fsdp_offload_params": False,
-                        "fsdp_sync_module_states": True,
-                        "fsdp_use_orig_params": False,
-                        "fsdp_cpu_ram_efficient_loading": False,
-                        "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
-                        "fsdp_state_dict_type": fsdp_state_dict_type,
-                        "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
+        cfg = DictDefault(
+            {
+                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "sample_packing": True,
+                "pad_to_sequence_len": True,
+                "sequence_len": 1024,
+                "val_set_size": 0.05,
+                "special_tokens": {
+                    "pad_token": "<|endoftext|>",
+                },
+                "datasets": [
+                    {
+                        "path": "tatsu-lab/alpaca",
+                        "type": "alpaca",
+                        "split": "train[:10%]",
                    },
-                    "use_tensorboard": True,
-                }
-            )
-            | sft_prepared_dataset_alpaca_cfg
+                ],
+                "num_epochs": 1,
+                "max_steps": 2,
+                "micro_batch_size": 2,
+                "gradient_accumulation_steps": 2,
+                # "gradient_checkpointing": True,
+                "output_dir": temp_dir,
+                "dataset_prepared_path": temp_dir + "/last_run_prepared",
+                "learning_rate": 0.00001,
+                "optimizer": "adamw_torch_fused",
+                "lr_scheduler": "cosine",
+                "flash_attention": True,
+                "fsdp": [
+                    "full_shard",
+                    "auto_wrap",
+                ],
+                "fsdp_config": {
+                    "fsdp_limit_all_gathers": True,
+                    "fsdp_offload_params": False,
+                    "fsdp_sync_module_states": True,
+                    "fsdp_use_orig_params": False,
+                    "fsdp_cpu_ram_efficient_loading": False,
+                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
+                    "fsdp_state_dict_type": fsdp_state_dict_type,
+                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
+                },
+                "use_tensorboard": True,
+            }
        )

        # write cfg to yaml file
@@ -496,7 +458,7 @@ class TestMultiGPULlama:
        )

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.4, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
        )

    @require_torch_2_6_0
@@ -509,43 +471,51 @@ class TestMultiGPULlama:
        [True, False],
    )
    def test_fsdp2_packed(
-        self,
-        temp_dir,
-        sft_prepared_dataset_alpaca_cfg,
-        attention_backend,
-        fsdp_reshard_after_forward,
+        self, temp_dir, attention_backend, fsdp_reshard_after_forward
    ):
        # pylint: disable=duplicate-code
-        cfg = (
-            DictDefault(
-                {
-                    "pad_to_sequence_len": True,
-                    "num_epochs": 1,
-                    "max_steps": 2,
-                    "micro_batch_size": 4,
-                    "gradient_accumulation_steps": 2,
-                    "gradient_checkpointing": True,
-                    "output_dir": temp_dir,
-                    "learning_rate": 0.00001,
-                    "optimizer": "adamw_torch_8bit",
-                    "lr_scheduler": "cosine",
-                    "fsdp": [
-                        "auto_wrap",
-                    ],
-                    "fsdp_config": {
-                        "fsdp_version": 2,
-                        # "fsdp_forward_prefetch": True,  # not yet implemented in accelerate
-                        "fsdp_offload_params": False,
-                        "fsdp_cpu_ram_efficient_loading": False,
-                        "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
-                        "fsdp_state_dict_type": "SHARDED_STATE_DICT",
-                        "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
-                        "fsdp_reshard_after_forward": fsdp_reshard_after_forward,
+        cfg = DictDefault(
+            {
+                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "sample_packing": True,
+                "pad_to_sequence_len": True,
+                "sequence_len": 2048,
+                "val_set_size": 0.1,
+                "special_tokens": {
+                    "pad_token": "<|endoftext|>",
+                },
+                "datasets": [
+                    {
+                        "path": "tatsu-lab/alpaca",
+                        "type": "alpaca",
+                        "split": "train[:10%]",
                    },
-                    "use_tensorboard": True,
-                }
-            )
-            | sft_prepared_dataset_alpaca_cfg
+                ],
+                "num_epochs": 1,
+                "max_steps": 2,
+                "micro_batch_size": 4,
+                "gradient_accumulation_steps": 2,
+                "gradient_checkpointing": True,
+                "output_dir": temp_dir,
+                "dataset_prepared_path": temp_dir + "/last_run_prepared",
+                "learning_rate": 0.00001,
+                "optimizer": "adamw_torch_8bit",
+                "lr_scheduler": "cosine",
+                "fsdp": [
+                    "auto_wrap",
+                ],
+                "fsdp_config": {
+                    "fsdp_version": 2,
+                    # "fsdp_forward_prefetch": True,  # not yet implemented in accelerate
+                    "fsdp_offload_params": False,
+                    "fsdp_cpu_ram_efficient_loading": False,
+                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
+                    "fsdp_state_dict_type": "SHARDED_STATE_DICT",
+                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
+                    "fsdp_reshard_after_forward": fsdp_reshard_after_forward,
+                },
+                "use_tensorboard": True,
+            }
        )
        if attention_backend == "flash":
            cfg.flash_attention = True
@@ -573,55 +543,64 @@ class TestMultiGPULlama:
            temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
        )

-    def test_fsdp_qlora_prequant_packed(
-        self, temp_dir, sft_prepared_dataset_alpaca_cfg
-    ):
+    def test_fsdp_qlora_prequant_packed(self, temp_dir):
        # pylint: disable=duplicate-code
-        cfg = (
-            DictDefault(
-                {
-                    "base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16",
-                    "adapter": "qlora",
-                    "mean_resizing_embeddings": True,
-                    "load_in_4bit": True,
-                    "lora_r": 8,
-                    "lora_alpha": 16,
-                    "lora_dropout": 0.05,
-                    "lora_target_linear": True,
-                    # "lora_modules_to_save": [
-                    #     "embed_tokens",
-                    #     "lm_head",
-                    # ],
-                    "eval_sample_packing": False,
-                    "pad_to_sequence_len": True,
-                    "num_epochs": 1,
-                    "max_steps": 2,
-                    "micro_batch_size": 2,
-                    "gradient_accumulation_steps": 2,
-                    # "gradient_checkpointing": True,
-                    "output_dir": temp_dir,
-                    "learning_rate": 0.00001,
-                    "optimizer": "adamw_torch_fused",
-                    "lr_scheduler": "cosine",
-                    "flash_attention": True,
-                    "fsdp": [
-                        "full_shard",
-                        "auto_wrap",
-                    ],
-                    "fsdp_config": {
-                        "fsdp_limit_all_gathers": True,
-                        "fsdp_offload_params": False,
-                        "fsdp_sync_module_states": True,
-                        "fsdp_use_orig_params": False,
-                        "fsdp_cpu_ram_efficient_loading": True,
-                        "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
-                        "fsdp_state_dict_type": "SHARDED_STATE_DICT",
-                        "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
+        cfg = DictDefault(
+            {
+                "base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16",
+                "adapter": "qlora",
+                "mean_resizing_embeddings": True,
+                "load_in_4bit": True,
+                "lora_r": 8,
+                "lora_alpha": 16,
+                "lora_dropout": 0.05,
+                "lora_target_linear": True,
+                # "lora_modules_to_save": [
+                #     "embed_tokens",
+                #     "lm_head",
+                # ],
+                "sample_packing": True,
+                "eval_sample_packing": False,
+                "pad_to_sequence_len": True,
+                "sequence_len": 1024,
+                "val_set_size": 0.01,
+                "special_tokens": {
+                    "pad_token": "<|endoftext|>",
+                },
+                "datasets": [
+                    {
+                        "path": "tatsu-lab/alpaca",
+                        "type": "alpaca",
+                        "split": "train[:10%]",
                    },
-                    "use_tensorboard": True,
-                }
-            )
-            | sft_prepared_dataset_alpaca_cfg
+                ],
+                "num_epochs": 1,
+                "max_steps": 2,
+                "micro_batch_size": 2,
+                "gradient_accumulation_steps": 2,
+                # "gradient_checkpointing": True,
+                "output_dir": temp_dir,
+                "dataset_prepared_path": temp_dir + "/last_run_prepared",
+                "learning_rate": 0.00001,
+                "optimizer": "adamw_torch_fused",
+                "lr_scheduler": "cosine",
+                "flash_attention": True,
+                "fsdp": [
+                    "full_shard",
+                    "auto_wrap",
+                ],
+                "fsdp_config": {
+                    "fsdp_limit_all_gathers": True,
+                    "fsdp_offload_params": False,
+                    "fsdp_sync_module_states": True,
+                    "fsdp_use_orig_params": False,
+                    "fsdp_cpu_ram_efficient_loading": True,
+                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
+                    "fsdp_state_dict_type": "SHARDED_STATE_DICT",
+                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
+                },
+                "use_tensorboard": True,
+            }
        )

        # write cfg to yaml file
@@ -662,12 +641,7 @@ class TestMultiGPULlama:
        [True, False],
    )
    def test_ds_zero3_packed(
-        self,
-        temp_dir,
-        sft_prepared_dataset_alpaca_cfg,
-        gradient_accumulation_steps,
-        deepspeed,
-        qlora,
+        self, temp_dir, gradient_accumulation_steps, deepspeed, qlora
    ):
        # pylint: disable=duplicate-code
        if qlora:
@@ -681,25 +655,37 @@ class TestMultiGPULlama:
            }
        else:
            adapter = {}
-        cfg = (
-            DictDefault(
-                {
-                    "pad_to_sequence_len": True,
-                    "num_epochs": 1,
-                    "max_steps": 2,
-                    "micro_batch_size": 1,
-                    "gradient_accumulation_steps": gradient_accumulation_steps,
-                    "output_dir": temp_dir,
-                    "learning_rate": 0.00001,
-                    "optimizer": "adamw_torch_fused",
-                    "lr_scheduler": "cosine",
-                    "flash_attention": True,
-                    "deepspeed": str(AXOLOTL_ROOT / deepspeed),
-                    "use_tensorboard": True,
-                    **adapter,
-                }
-            )
-            | sft_prepared_dataset_alpaca_cfg
+        cfg = DictDefault(
+            {
+                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "sample_packing": True,
+                "pad_to_sequence_len": True,
+                "sequence_len": 1024,
+                "val_set_size": 0.05,
+                "special_tokens": {
+                    "pad_token": "<|endoftext|>",
+                },
+                "datasets": [
+                    {
+                        "path": "tatsu-lab/alpaca",
+                        "type": "alpaca",
+                        "split": "train[:10%]",
+                    },
+                ],
+                "num_epochs": 1,
+                "max_steps": 2,
+                "micro_batch_size": 1,
+                "gradient_accumulation_steps": gradient_accumulation_steps,
+                "output_dir": temp_dir,
+                "dataset_prepared_path": temp_dir + "/last_run_prepared",
+                "learning_rate": 0.00001,
+                "optimizer": "adamw_torch_fused",
+                "lr_scheduler": "cosine",
+                "flash_attention": True,
+                "deepspeed": str(AXOLOTL_ROOT / deepspeed),
+                "use_tensorboard": True,
+                **adapter,
+            }
        )

        # write cfg to yaml file
@@ -720,7 +706,7 @@ class TestMultiGPULlama:
        )

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/train_loss", 2.4, "Train Loss (%s) is too high"
        )

    @pytest.mark.parametrize(
@@ -731,13 +717,7 @@ class TestMultiGPULlama:
        "qlora",
        [True, False],
    )
-    def test_ds_zero2_packed(
-        self,
-        temp_dir,
-        sft_prepared_dataset_alpaca_cfg,
-        gradient_accumulation_steps,
-        qlora,
-    ):
+    def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps, qlora):
        # pylint: disable=duplicate-code
        if qlora:
            adapter = {
@@ -750,25 +730,37 @@ class TestMultiGPULlama:
            }
        else:
            adapter = {}
-        cfg = (
-            DictDefault(
-                {
-                    "pad_to_sequence_len": True,
-                    "num_epochs": 1,
-                    "max_steps": 2,
-                    "micro_batch_size": 1,
-                    "gradient_accumulation_steps": gradient_accumulation_steps,
-                    "output_dir": temp_dir,
-                    "learning_rate": 0.00001,
-                    "optimizer": "adamw_torch_fused",
-                    "lr_scheduler": "cosine",
-                    "flash_attention": True,
-                    "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
-                    "use_tensorboard": True,
-                    **adapter,
-                }
-            )
-            | sft_prepared_dataset_alpaca_cfg
+        cfg = DictDefault(
+            {
+                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "sample_packing": True,
+                "pad_to_sequence_len": True,
+                "sequence_len": 1024,
+                "val_set_size": 0.01,
+                "special_tokens": {
+                    "pad_token": "<|endoftext|>",
+                },
+                "datasets": [
+                    {
+                        "path": "tatsu-lab/alpaca",
+                        "type": "alpaca",
+                        "split": "train[:10%]",
+                    },
+                ],
+                "num_epochs": 1,
+                "max_steps": 2,
+                "micro_batch_size": 1,
+                "gradient_accumulation_steps": gradient_accumulation_steps,
+                "output_dir": temp_dir,
+                "dataset_prepared_path": temp_dir + "/last_run_prepared",
+                "learning_rate": 0.00001,
+                "optimizer": "adamw_torch_fused",
+                "lr_scheduler": "cosine",
+                "flash_attention": True,
+                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
+                "use_tensorboard": True,
+                **adapter,
+            }
        )

        # write cfg to yaml file
@@ -789,7 +781,7 @@ class TestMultiGPULlama:
        )

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
        )

    @pytest.mark.parametrize(
@@ -800,13 +792,7 @@ class TestMultiGPULlama:
        "qlora",
        [True, False],
    )
-    def test_ds_zero1_packed(
-        self,
-        temp_dir,
-        sft_prepared_dataset_alpaca_cfg,
-        gradient_accumulation_steps,
-        qlora,
-    ):
+    def test_ds_zero1_packed(self, temp_dir, gradient_accumulation_steps, qlora):
        # pylint: disable=duplicate-code
        if qlora:
            adapter = {
@@ -819,25 +805,37 @@ class TestMultiGPULlama:
            }
        else:
            adapter = {}
-        cfg = (
-            DictDefault(
-                {
-                    "pad_to_sequence_len": True,
-                    "num_epochs": 1,
-                    "max_steps": 2,
-                    "micro_batch_size": 1,
-                    "gradient_accumulation_steps": gradient_accumulation_steps,
-                    "output_dir": temp_dir,
-                    "learning_rate": 0.00001,
-                    "optimizer": "adamw_torch_fused",
-                    "lr_scheduler": "cosine",
-                    "flash_attention": True,
-                    "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
-                    "use_tensorboard": True,
-                    **adapter,
-                }
-            )
-            | sft_prepared_dataset_alpaca_cfg
+        cfg = DictDefault(
+            {
+                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "sample_packing": True,
+                "pad_to_sequence_len": True,
+                "sequence_len": 1024,
+                "val_set_size": 0.01,
+                "special_tokens": {
+                    "pad_token": "<|endoftext|>",
+                },
+                "datasets": [
+                    {
+                        "path": "tatsu-lab/alpaca",
+                        "type": "alpaca",
+                        "split": "train[:10%]",
+                    },
+                ],
+                "num_epochs": 1,
+                "max_steps": 2,
+                "micro_batch_size": 1,
+                "gradient_accumulation_steps": gradient_accumulation_steps,
+                "output_dir": temp_dir,
+                "dataset_prepared_path": temp_dir + "/last_run_prepared",
+                "learning_rate": 0.00001,
+                "optimizer": "adamw_torch_fused",
+                "lr_scheduler": "cosine",
+                "flash_attention": True,
+                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
+                "use_tensorboard": True,
+                **adapter,
+            }
        )

        # write cfg to yaml file
@@ -858,7 +856,7 @@ class TestMultiGPULlama:
        )

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
        )

    @pytest.mark.skip(
Author	SHA1	Message	Date
salman	454eea049f	Merge branch 'main' into print_venv	2025-07-07 10:01:00 +01:00
NanoCode012	5a961ecadf	Fix: do not call preprocess in multimodal or pretraining case (#2861 ) * fix: let users know to not call preprocess for vision mode * fix: improve ux for pretraining dataset and skip prepare ds * feat: add info to doc * Update src/axolotl/cli/preprocess.py following comment Co-authored-by: salman <salman.mohammadi@outlook.com> --------- Co-authored-by: salman <salman.mohammadi@outlook.com>	2025-07-06 21:55:33 -04:00
Wing Lian	b37ddf9778	don't use tokenizer parallelism when using packing (#2862 ) [skip ci]	2025-07-06 21:55:09 -04:00
Wing Lian	bf38e507fb	respect shuffle_merged_datasets for single dataset too (#2866 ) [skip ci] * respect shuffle_merged_datasets for single dataset too * update inline comment for behavior Co-authored-by: NanoCode012 <nano@axolotl.ai> --------- Co-authored-by: NanoCode012 <nano@axolotl.ai>	2025-07-06 21:20:41 -04:00
Salman Mohammadi	d00bd99279	Merge branch 'print_venv' of github.com:axolotl-ai-cloud/axolotl into print_venv	2025-07-04 12:44:49 +01:00
Salman Mohammadi	2b41bfe9eb	reverting	2025-07-04 12:40:58 +01:00
salman	5bbbd599b4	Merge branch 'main' into print_venv	2025-07-04 12:36:13 +01:00
Salman Mohammadi	26c782183d	merging commands	2025-07-04 12:35:20 +01:00
Salman Mohammadi	8065fed126	adding venv to prompt	2025-07-02 15:27:42 +01:00