Add desc to map/filter (#1162)
* Add desc to map/filter
* update descriptions

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
This commit is contained in:
@@ -410,7 +410,10 @@ def load_rl_datasets(
|
|||||||
for i, data_set in enumerate(train_datasets):
|
for i, data_set in enumerate(train_datasets):
|
||||||
_type = cfg.datasets[i]["type"]
|
_type = cfg.datasets[i]["type"]
|
||||||
ds_type_fn = locals()[_type]
|
ds_type_fn = locals()[_type]
|
||||||
train_datasets[i] = data_set.map(ds_type_fn)
|
train_datasets[i] = data_set.map(
|
||||||
|
ds_type_fn,
|
||||||
|
desc="Mapping RL Dataset",
|
||||||
|
)
|
||||||
train_dataset = concatenate_datasets(train_datasets)
|
train_dataset = concatenate_datasets(train_datasets)
|
||||||
|
|
||||||
# eval_dataset = eval_dataset.map(intel_apply_chatml)
|
# eval_dataset = eval_dataset.map(intel_apply_chatml)
|
||||||
|
|||||||
@@ -57,6 +57,7 @@ class TokenizedPromptDataset(Dataset):
|
|||||||
num_proc=num_proc,
|
num_proc=num_proc,
|
||||||
remove_columns=features,
|
remove_columns=features,
|
||||||
keep_in_memory=self.keep_in_memory,
|
keep_in_memory=self.keep_in_memory,
|
||||||
|
desc="Tokenizing Prompts",
|
||||||
**map_kwargs,
|
**map_kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -792,6 +792,7 @@ def load_pretraining_dataset(path, tokenizer, cfg, name=None, max_tokens=2048, s
|
|||||||
# remove all the existing columns after mapping since they end up having
|
# remove all the existing columns after mapping since they end up having
|
||||||
# a different length than the encoded/tokenized column
|
# a different length than the encoded/tokenized column
|
||||||
remove_columns=dataset.features.keys(),
|
remove_columns=dataset.features.keys(),
|
||||||
|
desc="Encoding Pretraining",
|
||||||
)
|
)
|
||||||
return dataset
|
return dataset
|
||||||
|
|
||||||
|
|||||||
@@ -134,12 +134,14 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
|
|||||||
drop_long,
|
drop_long,
|
||||||
num_proc=cfg.dataset_processes,
|
num_proc=cfg.dataset_processes,
|
||||||
load_from_cache_file=not cfg.is_preprocess,
|
load_from_cache_file=not cfg.is_preprocess,
|
||||||
|
desc="Dropping Long Sequences",
|
||||||
)
|
)
|
||||||
if eval_dataset:
|
if eval_dataset:
|
||||||
eval_dataset = eval_dataset.filter(
|
eval_dataset = eval_dataset.filter(
|
||||||
drop_long,
|
drop_long,
|
||||||
num_proc=cfg.dataset_processes,
|
num_proc=cfg.dataset_processes,
|
||||||
load_from_cache_file=not cfg.is_preprocess,
|
load_from_cache_file=not cfg.is_preprocess,
|
||||||
|
desc="Dropping Long Sequences",
|
||||||
)
|
)
|
||||||
|
|
||||||
if cfg.group_by_length:
|
if cfg.group_by_length:
|
||||||
@@ -147,6 +149,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
|
|||||||
add_length,
|
add_length,
|
||||||
num_proc=cfg.dataset_processes,
|
num_proc=cfg.dataset_processes,
|
||||||
load_from_cache_file=not cfg.is_preprocess,
|
load_from_cache_file=not cfg.is_preprocess,
|
||||||
|
desc="Group By Length",
|
||||||
)
|
)
|
||||||
|
|
||||||
if cfg.sample_packing:
|
if cfg.sample_packing:
|
||||||
@@ -154,6 +157,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
|
|||||||
add_position_ids,
|
add_position_ids,
|
||||||
num_proc=cfg.dataset_processes,
|
num_proc=cfg.dataset_processes,
|
||||||
load_from_cache_file=not cfg.is_preprocess,
|
load_from_cache_file=not cfg.is_preprocess,
|
||||||
|
desc="Add position_id column (Sample Packing)",
|
||||||
)
|
)
|
||||||
if cfg.eval_sample_packing is not False:
|
if cfg.eval_sample_packing is not False:
|
||||||
if eval_dataset:
|
if eval_dataset:
|
||||||
@@ -161,6 +165,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
|
|||||||
add_position_ids,
|
add_position_ids,
|
||||||
num_proc=cfg.dataset_processes,
|
num_proc=cfg.dataset_processes,
|
||||||
load_from_cache_file=not cfg.is_preprocess,
|
load_from_cache_file=not cfg.is_preprocess,
|
||||||
|
desc="Add position_id column (Sample Packing)",
|
||||||
)
|
)
|
||||||
|
|
||||||
return train_dataset, eval_dataset
|
return train_dataset, eval_dataset
|
||||||
@@ -169,9 +174,13 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
|
|||||||
def process_pretraining_datasets_for_packing(train_dataset, sequence_len):
|
def process_pretraining_datasets_for_packing(train_dataset, sequence_len):
|
||||||
drop_long = partial(drop_long_seq, sequence_len=sequence_len)
|
drop_long = partial(drop_long_seq, sequence_len=sequence_len)
|
||||||
|
|
||||||
train_dataset = train_dataset.filter(drop_long)
|
train_dataset = train_dataset.filter(
|
||||||
|
drop_long,
|
||||||
|
desc="Dropping Long Sequences",
|
||||||
|
)
|
||||||
train_dataset = train_dataset.map(
|
train_dataset = train_dataset.map(
|
||||||
add_position_ids,
|
add_position_ids,
|
||||||
|
desc="Add position_id column (Pretraining Sample Packing)",
|
||||||
)
|
)
|
||||||
return train_dataset
|
return train_dataset
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user