Fix security issue or ignore false positives
This commit is contained in:
@@ -136,7 +136,7 @@ def train(
|
|||||||
|
|
||||||
# load the config from the yaml file
|
# load the config from the yaml file
|
||||||
with open(config, encoding="utf-8") as file:
|
with open(config, encoding="utf-8") as file:
|
||||||
cfg: DictDefault = DictDefault(yaml.load(file, Loader=yaml.Loader))
|
cfg: DictDefault = DictDefault(yaml.safe_load(file))
|
||||||
# for each option passed in on the CLI, if it looks like a valid key from the yaml,
|
# for each option passed in on the CLI, if it looks like a valid key from the yaml,
|
||||||
# then overwrite the value
|
# then overwrite the value
|
||||||
cfg_keys = cfg.keys()
|
cfg_keys = cfg.keys()
|
||||||
@@ -185,7 +185,7 @@ def train(
|
|||||||
logging.info("check_dataset_labels...")
|
logging.info("check_dataset_labels...")
|
||||||
check_dataset_labels(
|
check_dataset_labels(
|
||||||
train_dataset.select(
|
train_dataset.select(
|
||||||
[random.randrange(0, len(train_dataset) - 1) for i in range(5)]
|
[random.randrange(0, len(train_dataset) - 1) for _ in range(5)] # nosec
|
||||||
),
|
),
|
||||||
tokenizer,
|
tokenizer,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -11,10 +11,10 @@ from transformers import PreTrainedTokenizer
|
|||||||
from axolotl.prompters import IGNORE_TOKEN_ID
|
from axolotl.prompters import IGNORE_TOKEN_ID
|
||||||
|
|
||||||
IGNORE_INDEX = -100
|
IGNORE_INDEX = -100
|
||||||
LLAMA_DEFAULT_PAD_TOKEN = "[PAD]"
|
LLAMA_DEFAULT_PAD_TOKEN = "[PAD]" # nosec
|
||||||
LLAMA_DEFAULT_EOS_TOKEN = "</s>"
|
LLAMA_DEFAULT_EOS_TOKEN = "</s>" # nosec
|
||||||
LLAMA_DEFAULT_BOS_TOKEN = "<s>"
|
LLAMA_DEFAULT_BOS_TOKEN = "<s>" # nosec
|
||||||
LLAMA_DEFAULT_UNK_TOKEN = "<unk>"
|
LLAMA_DEFAULT_UNK_TOKEN = "<unk>" # nosec
|
||||||
|
|
||||||
|
|
||||||
class InvalidDataException(Exception):
|
class InvalidDataException(Exception):
|
||||||
|
|||||||
@@ -40,7 +40,7 @@ def load_tokenized_prepared_datasets(
|
|||||||
) -> DatasetDict:
|
) -> DatasetDict:
|
||||||
tokenizer_name = tokenizer.__class__.__name__
|
tokenizer_name = tokenizer.__class__.__name__
|
||||||
ds_hash = str(
|
ds_hash = str(
|
||||||
md5(
|
md5( # nosec
|
||||||
(
|
(
|
||||||
str(cfg.sequence_len)
|
str(cfg.sequence_len)
|
||||||
+ "@"
|
+ "@"
|
||||||
@@ -66,7 +66,7 @@ def load_tokenized_prepared_datasets(
|
|||||||
use_auth_token=use_auth_token,
|
use_auth_token=use_auth_token,
|
||||||
)
|
)
|
||||||
dataset = dataset["train"]
|
dataset = dataset["train"]
|
||||||
except Exception: # pylint: disable=broad-except
|
except Exception: # pylint: disable=broad-except # nosec
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if dataset:
|
if dataset:
|
||||||
@@ -272,7 +272,7 @@ def load_prepare_datasets(
|
|||||||
# see if we can go ahead and load the stacked dataset
|
# see if we can go ahead and load the stacked dataset
|
||||||
seed = f"@{str(cfg.seed)}" if cfg.seed else ""
|
seed = f"@{str(cfg.seed)}" if cfg.seed else ""
|
||||||
ds_hash = str(
|
ds_hash = str(
|
||||||
md5(
|
md5( # nosec
|
||||||
(
|
(
|
||||||
str(cfg.sequence_len)
|
str(cfg.sequence_len)
|
||||||
+ "@"
|
+ "@"
|
||||||
@@ -304,7 +304,7 @@ def load_prepare_datasets(
|
|||||||
use_auth_token=use_auth_token,
|
use_auth_token=use_auth_token,
|
||||||
)
|
)
|
||||||
dataset = dataset["train"]
|
dataset = dataset["train"]
|
||||||
except Exception: # pylint: disable=broad-except
|
except Exception: # pylint: disable=broad-except # nosec
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if dataset:
|
if dataset:
|
||||||
|
|||||||
Reference in New Issue
Block a user