diff --git a/README.md b/README.md index 0a2c64e35..55689696e 100644 --- a/README.md +++ b/README.md @@ -155,7 +155,7 @@ use_cpu: false - Once you start your runpod, and SSH into it: ```shell export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX" -source <(curl -s https://raw.githubusercontent.com/winglian/axolotl/main/scripts/setup-runpod.sh) +source <(curl -s https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/dev/scripts/setup-runpod.sh) ``` - Once the setup script completes diff --git a/scripts/setup-runpod.sh b/scripts/setup-runpod.sh index 660df086f..f1389de3d 100644 --- a/scripts/setup-runpod.sh +++ b/scripts/setup-runpod.sh @@ -29,14 +29,14 @@ fi # install flash-attn and deepspeed from pre-built wheels for this specific container b/c these take forever to install mkdir -p /workspace/wheels cd /workspace/wheels -curl -L -O https://github.com/winglian/axolotl/raw/wheels/wheels/deepspeed-0.9.2%2B7ddc3b01-cp38-cp38-linux_x86_64.whl -curl -L -O https://github.com/winglian/axolotl/raw/wheels/wheels/flash_attn-1.0.4-cp38-cp38-linux_x86_64.whl +curl -L -O https://github.com/OpenAccess-AI-Collective/axolotl/raw/wheels/wheels/deepspeed-0.9.2%2B7ddc3b01-cp38-cp38-linux_x86_64.whl +curl -L -O https://github.com/OpenAccess-AI-Collective/axolotl/raw/wheels/wheels/flash_attn-1.0.4-cp38-cp38-linux_x86_64.whl pip install deepspeed-0.9.2%2B7ddc3b01-cp38-cp38-linux_x86_64.whl pip install flash_attn-1.0.4-cp38-cp38-linux_x86_64.whl pip install "peft @ git+https://github.com/huggingface/peft.git@main" --force-reinstall --no-dependencies cd /workspace/ -git clone https://github.com/winglian/axolotl.git +git clone https://github.com/OpenAccess-AI-Collective/axolotl.git cd axolotl pip install -e .[int4] mkdir -p ~/.cache/huggingface/accelerate/ diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index 98fc00faf..306213b19 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -198,6 +198,18 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path): ) dataset = Dataset.from_list([_ for _ in constant_len_dataset]) + # filter out bad data + dataset = Dataset.from_list( + [ + d + for d in dataset + if len(d["input_ids"]) < cfg.sequence_len + and len(d["input_ids"]) > 0 + and len(d["input_ids"]) == len(d["attention_mask"]) + and len(d["input_ids"]) == len(d["labels"]) + ] + ) + if cfg.local_rank == 0: logging.info( f"Saving packed prepared dataset to disk... {prepared_ds_path}" @@ -208,18 +220,6 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path): tokenizer, cfg, default_dataset_prepared_path ) - # filter out bad data - dataset = Dataset.from_list( - [ - d - for d in dataset - if len(d["input_ids"]) < cfg.sequence_len - and len(d["input_ids"]) > 0 - and len(d["input_ids"]) == len(d["attention_mask"]) - and len(d["input_ids"]) == len(d["labels"]) - ] - ) - if cfg.dataset_shard_num and cfg.dataset_shard_idx is not None: logging.info( f"Using index #{cfg.dataset_shard_idx} of {cfg.dataset_shard_num} shards"