diff --git a/README.md b/README.md
index 0a2c64e35..55689696e 100644
--- a/README.md
+++ b/README.md
@@ -155,7 +155,7 @@ use_cpu: false
 - Once you start your runpod, and SSH into it:
 ```shell
 export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
-source <(curl -s https://raw.githubusercontent.com/winglian/axolotl/main/scripts/setup-runpod.sh)
+source <(curl -s https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/dev/scripts/setup-runpod.sh)
 ```
 
 - Once the setup script completes
diff --git a/scripts/setup-runpod.sh b/scripts/setup-runpod.sh
index 660df086f..f1389de3d 100644
--- a/scripts/setup-runpod.sh
+++ b/scripts/setup-runpod.sh
@@ -29,14 +29,14 @@ fi
 # install flash-attn and deepspeed from pre-built wheels for this specific container b/c these take forever to install
 mkdir -p /workspace/wheels
 cd /workspace/wheels
-curl -L -O https://github.com/winglian/axolotl/raw/wheels/wheels/deepspeed-0.9.2%2B7ddc3b01-cp38-cp38-linux_x86_64.whl
-curl -L -O https://github.com/winglian/axolotl/raw/wheels/wheels/flash_attn-1.0.4-cp38-cp38-linux_x86_64.whl
+curl -L -O https://github.com/OpenAccess-AI-Collective/axolotl/raw/wheels/wheels/deepspeed-0.9.2%2B7ddc3b01-cp38-cp38-linux_x86_64.whl
+curl -L -O https://github.com/OpenAccess-AI-Collective/axolotl/raw/wheels/wheels/flash_attn-1.0.4-cp38-cp38-linux_x86_64.whl
 pip install deepspeed-0.9.2%2B7ddc3b01-cp38-cp38-linux_x86_64.whl
 pip install flash_attn-1.0.4-cp38-cp38-linux_x86_64.whl
 pip install "peft @ git+https://github.com/huggingface/peft.git@main" --force-reinstall --no-dependencies
 
 cd /workspace/
-git clone https://github.com/winglian/axolotl.git
+git clone https://github.com/OpenAccess-AI-Collective/axolotl.git
 cd axolotl
 pip install -e .[int4]
 mkdir -p ~/.cache/huggingface/accelerate/
diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py
index 98fc00faf..306213b19 100644
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -198,6 +198,18 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
             )
             dataset = Dataset.from_list([_ for _ in constant_len_dataset])
 
+            # filter out bad data: empty, >= sequence_len, or mismatched field lengths
+            dataset = Dataset.from_list(
+                [
+                    d
+                    for d in dataset
+                    if len(d["input_ids"]) < cfg.sequence_len
+                       and len(d["input_ids"]) > 0
+                       and len(d["input_ids"]) == len(d["attention_mask"])
+                       and len(d["input_ids"]) == len(d["labels"])
+                ]
+            )
+
             if cfg.local_rank == 0:
                 logging.info(
                     f"Saving packed prepared dataset to disk... {prepared_ds_path}"
@@ -208,18 +220,6 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
             tokenizer, cfg, default_dataset_prepared_path
         )
 
-    # filter out bad data
-    dataset = Dataset.from_list(
-        [
-            d
-            for d in dataset
-            if len(d["input_ids"]) < cfg.sequence_len
-               and len(d["input_ids"]) > 0
-               and len(d["input_ids"]) == len(d["attention_mask"])
-               and len(d["input_ids"]) == len(d["labels"])
-        ]
-    )
-
     if cfg.dataset_shard_num and cfg.dataset_shard_idx is not None:
         logging.info(
             f"Using index #{cfg.dataset_shard_idx} of {cfg.dataset_shard_num} shards"