Compare commits

...

9 Commits

Author SHA1 Message Date
salman
454eea049f Merge branch 'main' into print_venv 2025-07-07 10:01:00 +01:00
NanoCode012
5a961ecadf Fix: do not call preprocess in multimodal or pretraining case (#2861)
* fix: let users know to not call preprocess for vision mode

* fix: improve ux for pretraining dataset and skip prepare ds

* feat: add info to doc

* Update src/axolotl/cli/preprocess.py following comment

Co-authored-by: salman <salman.mohammadi@outlook.com>

---------

Co-authored-by: salman <salman.mohammadi@outlook.com>
2025-07-06 21:55:33 -04:00
Wing Lian
b37ddf9778 don't use tokenizer parallelism when using packing (#2862) [skip ci] 2025-07-06 21:55:09 -04:00
Wing Lian
bf38e507fb respect shuffle_merged_datasets for single dataset too (#2866) [skip ci]
* respect shuffle_merged_datasets for single dataset too

* update inline comment for behavior

Co-authored-by: NanoCode012 <nano@axolotl.ai>

---------

Co-authored-by: NanoCode012 <nano@axolotl.ai>
2025-07-06 21:20:41 -04:00
Salman Mohammadi
d00bd99279 Merge branch 'print_venv' of github.com:axolotl-ai-cloud/axolotl into print_venv 2025-07-04 12:44:49 +01:00
Salman Mohammadi
2b41bfe9eb reverting 2025-07-04 12:40:58 +01:00
salman
5bbbd599b4 Merge branch 'main' into print_venv 2025-07-04 12:36:13 +01:00
Salman Mohammadi
26c782183d merging commands 2025-07-04 12:35:20 +01:00
Salman Mohammadi
8065fed126 adding venv to prompt 2025-07-02 15:27:42 +01:00
7 changed files with 28 additions and 8 deletions

View File

@@ -22,9 +22,11 @@ RUN apt-get update \
&& mkdir /root/.conda \
&& bash Miniconda3-latest-Linux-x86_64.sh -b \
&& rm -f Miniconda3-latest-Linux-x86_64.sh \
&& conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
&& conda create -n "axolotl-py${PYTHON_VERSION}" python="${PYTHON_VERSION}" \
&& conda init bash \
&& echo "conda activate axolotl-py${PYTHON_VERSION}" >> ~/.bashrc
ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
ENV PATH="/root/miniconda3/envs/axolotl-py${PYTHON_VERSION}/bin:${PATH}"
WORKDIR /workspace

View File

@@ -22,9 +22,11 @@ RUN apt-get update \
&& mkdir /root/.conda \
&& bash Miniconda3-latest-Linux-x86_64.sh -b \
&& rm -f Miniconda3-latest-Linux-x86_64.sh \
&& conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
&& conda create -n "axolotl-py${PYTHON_VERSION}" python="${PYTHON_VERSION}" \
&& conda init bash \
&& echo "conda activate axolotl-py${PYTHON_VERSION}" >> ~/.bashrc
ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
ENV PATH="/root/miniconda3/envs/axolotl-py${PYTHON_VERSION}/bin:${PATH}"
WORKDIR /workspace

View File

@@ -22,9 +22,11 @@ RUN apt-get update \
&& mkdir /root/.conda \
&& bash Miniconda3-latest-Linux-x86_64.sh -b \
&& rm -f Miniconda3-latest-Linux-x86_64.sh \
&& conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
&& conda create -n "axolotl-py${PYTHON_VERSION}" python="${PYTHON_VERSION}" \
&& conda init bash \
&& echo "conda activate axolotl-py${PYTHON_VERSION}" >> ~/.bashrc
ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
ENV PATH="/root/miniconda3/envs/axolotl-py${PYTHON_VERSION}/bin:${PATH}"
WORKDIR /workspace

View File

@@ -51,6 +51,10 @@ description: Frequently asked questions
> pad_token: "..."
> ```
**Q: `IterableDataset error` or `KeyError: 'input_ids'` when using `preprocess` CLI**
> A: This is because you may be using the `preprocess` CLI with `pretraining_dataset:` or `skip_prepare_dataset: true` respectively. Please use the `axolotl train` CLI directly instead, as these datasets are prepared on demand.
### Chat templates
**Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`**

View File

@@ -35,6 +35,12 @@ def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None:
check_accelerate_default_config()
check_user_token()
for key in ["skip_prepare_dataset", "pretraining_dataset"]:
        if cfg.get(key):
raise ValueError(
f"You have set `{key}:`. `preprocess` is not needed. Run the `axolotl train` CLI directly instead."
)
if not cfg.dataset_prepared_path:
msg = (
Fore.RED

View File

@@ -526,8 +526,9 @@ def merge_datasets(datasets: list[Dataset], cfg: DictDefault) -> Dataset:
if len(datasets) == 1:
ds = datasets[0]
# Do not shuffle if curriculum sampling is enabled
if cfg.curriculum_sampling:
# Do not shuffle if curriculum sampling is enabled or
# shuffle_merged_datasets is disabled
if cfg.curriculum_sampling or not cfg.shuffle_merged_datasets:
return ds
return ds.shuffle(seed=cfg.seed)

View File

@@ -609,6 +609,9 @@ def prepare_opinionated_env(cfg):
if cfg.qlora_sharded_model_loading:
# model loading is forked after the tokenizer
os.environ["TOKENIZERS_PARALLELISM"] = "false"
if cfg.sample_packing:
# multipack parallel packing sampler defaults to using fork
os.environ["TOKENIZERS_PARALLELISM"] = "false"
def setup_trainer(