From a1f9850b91c34cdb5d819351fdcc36f9bbf9221f Mon Sep 17 00:00:00 2001
From: NanoCode012
Date: Mon, 29 May 2023 22:26:26 +0900
Subject: [PATCH] Fix security issue or ignore false positives

---
 scripts/finetune.py              | 4 ++--
 src/axolotl/prompt_tokenizers.py | 8 ++++----
 src/axolotl/utils/data.py        | 8 ++++----
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/scripts/finetune.py b/scripts/finetune.py
index 4716744b2..6c42b3061 100644
--- a/scripts/finetune.py
+++ b/scripts/finetune.py
@@ -136,7 +136,7 @@ def train(
 
     # load the config from the yaml file
     with open(config, encoding="utf-8") as file:
-        cfg: DictDefault = DictDefault(yaml.load(file, Loader=yaml.Loader))
+        cfg: DictDefault = DictDefault(yaml.safe_load(file))
     # if there are any options passed in the cli, if it is something that seems valid from the yaml,
     # then overwrite the value
     cfg_keys = cfg.keys()
@@ -185,7 +185,7 @@ def train(
         logging.info("check_dataset_labels...")
         check_dataset_labels(
             train_dataset.select(
-                [random.randrange(0, len(train_dataset) - 1) for i in range(5)]
+                [random.randrange(0, len(train_dataset) - 1) for _ in range(5)]  # nosec
             ),
             tokenizer,
         )
diff --git a/src/axolotl/prompt_tokenizers.py b/src/axolotl/prompt_tokenizers.py
index 3acae91b8..582c35ebd 100644
--- a/src/axolotl/prompt_tokenizers.py
+++ b/src/axolotl/prompt_tokenizers.py
@@ -11,10 +11,10 @@ from transformers import PreTrainedTokenizer
 from axolotl.prompters import IGNORE_TOKEN_ID
 
 IGNORE_INDEX = -100
-LLAMA_DEFAULT_PAD_TOKEN = "[PAD]"
-LLAMA_DEFAULT_EOS_TOKEN = "</s>"
-LLAMA_DEFAULT_BOS_TOKEN = "<s>"
-LLAMA_DEFAULT_UNK_TOKEN = "<unk>"
+LLAMA_DEFAULT_PAD_TOKEN = "[PAD]"  # nosec
+LLAMA_DEFAULT_EOS_TOKEN = "</s>"  # nosec
+LLAMA_DEFAULT_BOS_TOKEN = "<s>"  # nosec
+LLAMA_DEFAULT_UNK_TOKEN = "<unk>"  # nosec
 
 
 class InvalidDataException(Exception):
diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py
index c505cccfa..9534323de 100644
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -40,7 +40,7 @@ def load_tokenized_prepared_datasets(
 ) -> DatasetDict:
     tokenizer_name = tokenizer.__class__.__name__
     ds_hash = str(
-        md5(
+        md5(  # nosec
             (
                 str(cfg.sequence_len)
                 + "@"
@@ -66,7 +66,7 @@
                 use_auth_token=use_auth_token,
             )
             dataset = dataset["train"]
-    except Exception:  # pylint: disable=broad-except
+    except Exception:  # pylint: disable=broad-except  # nosec
         pass
 
     if dataset:
@@ -272,7 +272,7 @@ def load_prepare_datasets(
         # see if we can go ahead and load the stacked dataset
         seed = f"@{str(cfg.seed)}" if cfg.seed else ""
         ds_hash = str(
-            md5(
+            md5(  # nosec
                 (
                     str(cfg.sequence_len)
                     + "@"
@@ -304,7 +304,7 @@
                     use_auth_token=use_auth_token,
                 )
                 dataset = dataset["train"]
-        except Exception:  # pylint: disable=broad-except
+        except Exception:  # pylint: disable=broad-except  # nosec
             pass
 
         if dataset:
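
Notes for reviewers (appended after the diff, not part of the patch):

On the yaml.safe_load change: yaml.load with Loader=yaml.Loader resolves
Python-specific tags such as !!python/object/apply, so a crafted config
YAML can execute arbitrary code at parse time, while safe_load only
constructs plain scalars, lists, and dicts. A minimal sketch of the
difference (the PAYLOAD string is illustrative, not from the repo):

import yaml

# A tag like this instructs the full Loader to call os.system while
# parsing; loading an untrusted config this way would execute it.
PAYLOAD = '!!python/object/apply:os.system ["echo pwned"]'

print(yaml.safe_load("sequence_len: 2048"))  # {'sequence_len': 2048}

try:
    yaml.safe_load(PAYLOAD)  # safe_load refuses python/* tags
except yaml.YAMLError as err:
    print("rejected:", type(err).__name__)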
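
On the md5(  # nosec markers: Bandit flags md5 as a weak hash, but here
it only derives a cache key for prepared datasets, nothing
security-sensitive, so the finding is a false positive. On Python 3.9+
an alternative is to declare that intent in code; a sketch of the idea,
not what this patch does (dataset_cache_key is a hypothetical helper):

from hashlib import md5

def dataset_cache_key(*parts: str) -> str:
    """Derive a non-cryptographic cache key from config fields."""
    raw = "@".join(parts).encode("utf-8")
    # usedforsecurity=False (Python 3.9+) documents that md5 is only
    # bucketing cache entries; newer Bandit releases skip B324 for it.
    return md5(raw, usedforsecurity=False).hexdigest()

print(dataset_cache_key("2048", "LlamaTokenizer", "datasets"))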
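
On the random.randrange(...)  # nosec: Bandit's B311 warns against the
random module for security purposes, but this call only picks rows for
the check_dataset_labels debug printout. As an aside, randrange(0, n - 1)
samples with replacement and can never return the last row; a sketch of
a variant that avoids both, offered as an observation rather than part
of this patch (pick_debug_rows is hypothetical):

import random

def pick_debug_rows(n_rows: int, k: int = 5) -> list:
    # random.sample draws k distinct indices from the full range
    # 0..n_rows-1, unlike randrange(0, n_rows - 1), which can repeat
    # indices and excludes the final row.
    return random.sample(range(n_rows), k=min(k, n_rows))

print(pick_debug_rows(100))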