Vram fix attempt (#1164) [skip ci]
* revert order of filter/drop_long step and handle calc for max_input_len only during preprocessing
* revert some changes to preparing for packing to allow more flexibility
* prepare dataset for packing during pre-processing step
* prepare dataset hash based on sample packing too
* enclose none check
* just cast straight to string for ds hash
This commit is contained in:
@@ -116,6 +116,12 @@ def load_tokenized_prepared_datasets(
|
|||||||
(
|
(
|
||||||
str(cfg.sequence_len)
|
str(cfg.sequence_len)
|
||||||
+ "@"
|
+ "@"
|
||||||
|
+ str(cfg.sample_packing)
|
||||||
|
+ "@"
|
||||||
|
+ str(cfg.eval_sample_packing)
|
||||||
|
+ "@"
|
||||||
|
+ str(cfg.group_by_length)
|
||||||
|
+ "@"
|
||||||
+ "|".join(
|
+ "|".join(
|
||||||
sorted(
|
sorted(
|
||||||
[
|
[
|
||||||
@@ -162,7 +168,7 @@ def load_tokenized_prepared_datasets(
|
|||||||
LOG.info("Loading raw datasets...")
|
LOG.info("Loading raw datasets...")
|
||||||
if not cfg.is_preprocess:
|
if not cfg.is_preprocess:
|
||||||
LOG.warning(
|
LOG.warning(
|
||||||
"Processing datasets during training can lead to VRAM instability. Please pre-process your dataset"
|
"Processing datasets during training can lead to VRAM instability. Please pre-process your dataset."
|
||||||
)
|
)
|
||||||
|
|
||||||
if cfg.seed:
|
if cfg.seed:
|
||||||
|
|||||||
@@ -7,11 +7,11 @@ import numpy as np
|
|||||||
def get_dataset_lengths(dataset):
|
def get_dataset_lengths(dataset):
|
||||||
if "length" in dataset.data.column_names:
|
if "length" in dataset.data.column_names:
|
||||||
lengths = np.array(dataset.data.column("length"))
|
lengths = np.array(dataset.data.column("length"))
|
||||||
|
elif "position_ids" in dataset.data.column_names:
|
||||||
|
position_ids = dataset.data.column("position_ids")
|
||||||
|
lengths = np.array([x[-1] + 1 for x in position_ids])
|
||||||
else:
|
else:
|
||||||
lengths = (
|
input_ids = dataset.data.column("input_ids")
|
||||||
dataset.data.column("position_ids")
|
lengths = np.vectorize(len)(np.array(input_ids, dtype=object))
|
||||||
.to_pandas()
|
return lengths
|
||||||
.apply(lambda x: x[-1] + 1)
|
|
||||||
.values
|
|
||||||
)
|
|
||||||
return lengths
|
return lengths
|
||||||
|
|||||||
@@ -109,6 +109,33 @@ def drop_long_seq(sample, sequence_len=2048):
|
|||||||
def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
|
def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
|
||||||
drop_long = partial(drop_long_seq, sequence_len=cfg.sequence_len)
|
drop_long = partial(drop_long_seq, sequence_len=cfg.sequence_len)
|
||||||
with zero_first(is_main_process()):
|
with zero_first(is_main_process()):
|
||||||
|
if cfg.is_preprocess:
|
||||||
|
max_input_len = np.max(get_dataset_lengths(train_dataset))
|
||||||
|
LOG.debug(f"max_input_len: {max_input_len}", main_process_only=True)
|
||||||
|
|
||||||
|
# Phi doesn't want the attention_mask feature when training
|
||||||
|
if (
|
||||||
|
"CodeGenTokenizer" in tokenizer.__class__.__name__
|
||||||
|
or (cfg.is_mistral_derived_model and cfg.flash_attention)
|
||||||
|
or cfg.model_config_type == "mamba"
|
||||||
|
):
|
||||||
|
LOG.info("dropping attention_mask column")
|
||||||
|
train_dataset = train_dataset.remove_columns("attention_mask")
|
||||||
|
if eval_dataset:
|
||||||
|
eval_dataset = eval_dataset.remove_columns("attention_mask")
|
||||||
|
|
||||||
|
train_dataset = train_dataset.filter(
|
||||||
|
drop_long,
|
||||||
|
num_proc=cfg.dataset_processes,
|
||||||
|
load_from_cache_file=not cfg.is_preprocess,
|
||||||
|
)
|
||||||
|
if eval_dataset:
|
||||||
|
eval_dataset = eval_dataset.filter(
|
||||||
|
drop_long,
|
||||||
|
num_proc=cfg.dataset_processes,
|
||||||
|
load_from_cache_file=not cfg.is_preprocess,
|
||||||
|
)
|
||||||
|
|
||||||
if cfg.group_by_length:
|
if cfg.group_by_length:
|
||||||
train_dataset = train_dataset.map(
|
train_dataset = train_dataset.map(
|
||||||
add_length,
|
add_length,
|
||||||
@@ -130,33 +157,6 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
|
|||||||
load_from_cache_file=not cfg.is_preprocess,
|
load_from_cache_file=not cfg.is_preprocess,
|
||||||
)
|
)
|
||||||
|
|
||||||
if cfg.group_by_length or cfg.sample_packing:
|
|
||||||
max_input_len = np.max(get_dataset_lengths(train_dataset))
|
|
||||||
LOG.debug(f"max_input_len: {max_input_len}", main_process_only=True)
|
|
||||||
|
|
||||||
train_dataset = train_dataset.filter(
|
|
||||||
drop_long,
|
|
||||||
num_proc=cfg.dataset_processes,
|
|
||||||
load_from_cache_file=not cfg.is_preprocess,
|
|
||||||
)
|
|
||||||
if eval_dataset:
|
|
||||||
eval_dataset = eval_dataset.filter(
|
|
||||||
drop_long,
|
|
||||||
num_proc=cfg.dataset_processes,
|
|
||||||
load_from_cache_file=not cfg.is_preprocess,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Phi doesn't want the attention_mask feature when training
|
|
||||||
if (
|
|
||||||
"CodeGenTokenizer" in tokenizer.__class__.__name__
|
|
||||||
or (cfg.is_mistral_derived_model and cfg.flash_attention)
|
|
||||||
or cfg.model_config_type == "mamba"
|
|
||||||
):
|
|
||||||
LOG.info("dropping attention_mask column")
|
|
||||||
train_dataset = train_dataset.remove_columns("attention_mask")
|
|
||||||
if eval_dataset:
|
|
||||||
eval_dataset = eval_dataset.remove_columns("attention_mask")
|
|
||||||
|
|
||||||
return train_dataset, eval_dataset
|
return train_dataset, eval_dataset
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user