move filter to before saving so it doesn't happen every time, update runpod manual script
This commit is contained in:
@@ -155,7 +155,7 @@ use_cpu: false
|
|||||||
- Once you start your runpod, and SSH into it:
|
- Once you start your runpod, and SSH into it:
|
||||||
```shell
|
```shell
|
||||||
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
|
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
|
||||||
source <(curl -s https://raw.githubusercontent.com/winglian/axolotl/main/scripts/setup-runpod.sh)
|
source <(curl -s https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/dev/scripts/setup-runpod.sh)
|
||||||
```
|
```
|
||||||
|
|
||||||
- Once the setup script completes
|
- Once the setup script completes
|
||||||
|
|||||||
@@ -29,14 +29,14 @@ fi
|
|||||||
# install flash-attn and deepspeed from pre-built wheels for this specific container b/c these take forever to install
|
# install flash-attn and deepspeed from pre-built wheels for this specific container b/c these take forever to install
|
||||||
mkdir -p /workspace/wheels
|
mkdir -p /workspace/wheels
|
||||||
cd /workspace/wheels
|
cd /workspace/wheels
|
||||||
curl -L -O https://github.com/winglian/axolotl/raw/wheels/wheels/deepspeed-0.9.2%2B7ddc3b01-cp38-cp38-linux_x86_64.whl
|
curl -L -O https://github.com/OpenAccess-AI-Collective/axolotl/raw/wheels/wheels/deepspeed-0.9.2%2B7ddc3b01-cp38-cp38-linux_x86_64.whl
|
||||||
curl -L -O https://github.com/winglian/axolotl/raw/wheels/wheels/flash_attn-1.0.4-cp38-cp38-linux_x86_64.whl
|
curl -L -O https://github.com/OpenAccess-AI-Collective/axolotl/raw/wheels/wheels/flash_attn-1.0.4-cp38-cp38-linux_x86_64.whl
|
||||||
pip install deepspeed-0.9.2%2B7ddc3b01-cp38-cp38-linux_x86_64.whl
|
pip install deepspeed-0.9.2%2B7ddc3b01-cp38-cp38-linux_x86_64.whl
|
||||||
pip install flash_attn-1.0.4-cp38-cp38-linux_x86_64.whl
|
pip install flash_attn-1.0.4-cp38-cp38-linux_x86_64.whl
|
||||||
pip install "peft @ git+https://github.com/huggingface/peft.git@main" --force-reinstall --no-dependencies
|
pip install "peft @ git+https://github.com/huggingface/peft.git@main" --force-reinstall --no-dependencies
|
||||||
|
|
||||||
cd /workspace/
|
cd /workspace/
|
||||||
git clone https://github.com/winglian/axolotl.git
|
git clone https://github.com/OpenAccess-AI-Collective/axolotl.git
|
||||||
cd axolotl
|
cd axolotl
|
||||||
pip install -e .[int4]
|
pip install -e .[int4]
|
||||||
mkdir -p ~/.cache/huggingface/accelerate/
|
mkdir -p ~/.cache/huggingface/accelerate/
|
||||||
|
|||||||
@@ -198,6 +198,18 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
|
|||||||
)
|
)
|
||||||
dataset = Dataset.from_list([_ for _ in constant_len_dataset])
|
dataset = Dataset.from_list([_ for _ in constant_len_dataset])
|
||||||
|
|
||||||
|
# filter out bad data
|
||||||
|
dataset = Dataset.from_list(
|
||||||
|
[
|
||||||
|
d
|
||||||
|
for d in dataset
|
||||||
|
if len(d["input_ids"]) < cfg.sequence_len
|
||||||
|
and len(d["input_ids"]) > 0
|
||||||
|
and len(d["input_ids"]) == len(d["attention_mask"])
|
||||||
|
and len(d["input_ids"]) == len(d["labels"])
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
if cfg.local_rank == 0:
|
if cfg.local_rank == 0:
|
||||||
logging.info(
|
logging.info(
|
||||||
f"Saving packed prepared dataset to disk... {prepared_ds_path}"
|
f"Saving packed prepared dataset to disk... {prepared_ds_path}"
|
||||||
@@ -208,18 +220,6 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
|
|||||||
tokenizer, cfg, default_dataset_prepared_path
|
tokenizer, cfg, default_dataset_prepared_path
|
||||||
)
|
)
|
||||||
|
|
||||||
# filter out bad data
|
|
||||||
dataset = Dataset.from_list(
|
|
||||||
[
|
|
||||||
d
|
|
||||||
for d in dataset
|
|
||||||
if len(d["input_ids"]) < cfg.sequence_len
|
|
||||||
and len(d["input_ids"]) > 0
|
|
||||||
and len(d["input_ids"]) == len(d["attention_mask"])
|
|
||||||
and len(d["input_ids"]) == len(d["labels"])
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
if cfg.dataset_shard_num and cfg.dataset_shard_idx is not None:
|
if cfg.dataset_shard_num and cfg.dataset_shard_idx is not None:
|
||||||
logging.info(
|
logging.info(
|
||||||
f"Using index #{cfg.dataset_shard_idx} of {cfg.dataset_shard_num} shards"
|
f"Using index #{cfg.dataset_shard_idx} of {cfg.dataset_shard_num} shards"
|
||||||
|
|||||||
Reference in New Issue
Block a user