From a1f9850b91c34cdb5d819351fdcc36f9bbf9221f Mon Sep 17 00:00:00 2001
From: NanoCode012
Date: Mon, 29 May 2023 22:26:26 +0900
Subject: [PATCH] Fix security issue or ignore false positives

---
 scripts/finetune.py              | 4 ++--
 src/axolotl/prompt_tokenizers.py | 8 ++++----
 src/axolotl/utils/data.py        | 8 ++++----
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/scripts/finetune.py b/scripts/finetune.py
index 4716744b2..6c42b3061 100644
--- a/scripts/finetune.py
+++ b/scripts/finetune.py
@@ -136,7 +136,7 @@ def train(
 
     # load the config from the yaml file
     with open(config, encoding="utf-8") as file:
-        cfg: DictDefault = DictDefault(yaml.load(file, Loader=yaml.Loader))
+        cfg: DictDefault = DictDefault(yaml.safe_load(file))
     # if there are any options passed in the cli, if it is something that seems valid from the yaml,
     # then overwrite the value
     cfg_keys = cfg.keys()
@@ -185,7 +185,7 @@ def train(
         logging.info("check_dataset_labels...")
         check_dataset_labels(
             train_dataset.select(
-                [random.randrange(0, len(train_dataset) - 1) for i in range(5)]
+                [random.randrange(0, len(train_dataset) - 1) for _ in range(5)]  # nosec
             ),
             tokenizer,
         )
diff --git a/src/axolotl/prompt_tokenizers.py b/src/axolotl/prompt_tokenizers.py
index 3acae91b8..582c35ebd 100644
--- a/src/axolotl/prompt_tokenizers.py
+++ b/src/axolotl/prompt_tokenizers.py
@@ -11,10 +11,10 @@ from transformers import PreTrainedTokenizer
 from axolotl.prompters import IGNORE_TOKEN_ID
 
 IGNORE_INDEX = -100
-LLAMA_DEFAULT_PAD_TOKEN = "[PAD]"
-LLAMA_DEFAULT_EOS_TOKEN = "</s>"
-LLAMA_DEFAULT_BOS_TOKEN = "<s>"
-LLAMA_DEFAULT_UNK_TOKEN = "<unk>"
+LLAMA_DEFAULT_PAD_TOKEN = "[PAD]"  # nosec
+LLAMA_DEFAULT_EOS_TOKEN = "</s>"  # nosec
+LLAMA_DEFAULT_BOS_TOKEN = "<s>"  # nosec
+LLAMA_DEFAULT_UNK_TOKEN = "<unk>"  # nosec
 
 
 class InvalidDataException(Exception):
diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py
index c505cccfa..9534323de 100644
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -40,7 +40,7 @@ def load_tokenized_prepared_datasets(
 ) -> DatasetDict:
     tokenizer_name = tokenizer.__class__.__name__
     ds_hash = str(
-        md5(
+        md5(  # nosec
             (
                 str(cfg.sequence_len)
                 + "@"
@@ -66,7 +66,7 @@
                 use_auth_token=use_auth_token,
             )
             dataset = dataset["train"]
-    except Exception:  # pylint: disable=broad-except
+    except Exception:  # pylint: disable=broad-except  # nosec
         pass
 
     if dataset:
@@ -272,7 +272,7 @@ def load_prepare_datasets(
         # see if we can go ahead and load the stacked dataset
         seed = f"@{str(cfg.seed)}" if cfg.seed else ""
         ds_hash = str(
-            md5(
+            md5(  # nosec
                 (
                     str(cfg.sequence_len)
                     + "@"
@@ -304,7 +304,7 @@
                     use_auth_token=use_auth_token,
                 )
                 dataset = dataset["train"]
-        except Exception:  # pylint: disable=broad-except
+        except Exception:  # pylint: disable=broad-except  # nosec
             pass
 
         if dataset:
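
Notes for reviewers (appended after the diff, not part of the patch):

On the yaml.safe_load change: yaml.load with Loader=yaml.Loader resolves
Python-specific tags such as !!python/object/apply, so a crafted config
YAML can execute arbitrary code at parse time, while safe_load only
constructs plain scalars, lists, and dicts. A minimal sketch of the
difference (the PAYLOAD string is illustrative, not from the repo):

import yaml

# A tag like this instructs the full Loader to call os.system while
# parsing; loading an untrusted config this way would execute it.
PAYLOAD = '!!python/object/apply:os.system ["echo pwned"]'

print(yaml.safe_load("sequence_len: 2048"))  # {'sequence_len': 2048}

try:
    yaml.safe_load(PAYLOAD)  # safe_load refuses python/* tags
except yaml.YAMLError as err:
    print("rejected:", type(err).__name__)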
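
On the md5(  # nosec markers: Bandit flags md5 as a weak hash, but here
it only derives a cache key for prepared datasets, nothing
security-sensitive, so the finding is a false positive. On Python 3.9+
an alternative is to declare that intent in code; a sketch of the idea,
not what this patch does (dataset_cache_key is a hypothetical helper):

from hashlib import md5

def dataset_cache_key(*parts: str) -> str:
    """Derive a non-cryptographic cache key from config fields."""
    raw = "@".join(parts).encode("utf-8")
    # usedforsecurity=False (Python 3.9+) documents that md5 is only
    # bucketing cache entries; newer Bandit releases skip B324 for it.
    return md5(raw, usedforsecurity=False).hexdigest()

print(dataset_cache_key("2048", "LlamaTokenizer", "datasets"))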
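
On the random.randrange(...)  # nosec: Bandit's B311 warns against the
random module for security purposes, but this call only picks rows for
the check_dataset_labels debug printout. As an aside, randrange(0, n - 1)
samples with replacement and can never return the last row; a sketch of
a variant that avoids both, offered as an observation rather than part
of this patch (pick_debug_rows is hypothetical):

import random

def pick_debug_rows(n_rows: int, k: int = 5) -> list:
    # random.sample draws k distinct indices from the full range
    # 0..n_rows-1, unlike randrange(0, n_rows - 1), which can repeat
    # indices and excludes the final row.
    return random.sample(range(n_rows), k=min(k, n_rows))

print(pick_debug_rows(100))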