Add desc to map/filter (#1162)

* Add desc to map/filter

* Update descriptions

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
This commit is contained in:
Casper
2024-01-23 03:30:53 +01:00
committed by GitHub
parent cda52dc32b
commit 684038111e
4 changed files with 16 additions and 2 deletions

View File

@@ -410,7 +410,10 @@ def load_rl_datasets(
for i, data_set in enumerate(train_datasets): for i, data_set in enumerate(train_datasets):
_type = cfg.datasets[i]["type"] _type = cfg.datasets[i]["type"]
ds_type_fn = locals()[_type] ds_type_fn = locals()[_type]
train_datasets[i] = data_set.map(ds_type_fn) train_datasets[i] = data_set.map(
ds_type_fn,
desc="Mapping RL Dataset",
)
train_dataset = concatenate_datasets(train_datasets) train_dataset = concatenate_datasets(train_datasets)
# eval_dataset = eval_dataset.map(intel_apply_chatml) # eval_dataset = eval_dataset.map(intel_apply_chatml)

View File

@@ -57,6 +57,7 @@ class TokenizedPromptDataset(Dataset):
num_proc=num_proc, num_proc=num_proc,
remove_columns=features, remove_columns=features,
keep_in_memory=self.keep_in_memory, keep_in_memory=self.keep_in_memory,
desc="Tokenizing Prompts",
**map_kwargs, **map_kwargs,
) )

View File

@@ -792,6 +792,7 @@ def load_pretraining_dataset(path, tokenizer, cfg, name=None, max_tokens=2048, s
# remove all the existing columns after mapping since they end up having # remove all the existing columns after mapping since they end up having
# a different length than the encoded/tokenized column # a different length than the encoded/tokenized column
remove_columns=dataset.features.keys(), remove_columns=dataset.features.keys(),
desc="Encoding Pretraining",
) )
return dataset return dataset

View File

@@ -134,12 +134,14 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
drop_long, drop_long,
num_proc=cfg.dataset_processes, num_proc=cfg.dataset_processes,
load_from_cache_file=not cfg.is_preprocess, load_from_cache_file=not cfg.is_preprocess,
desc="Dropping Long Sequences",
) )
if eval_dataset: if eval_dataset:
eval_dataset = eval_dataset.filter( eval_dataset = eval_dataset.filter(
drop_long, drop_long,
num_proc=cfg.dataset_processes, num_proc=cfg.dataset_processes,
load_from_cache_file=not cfg.is_preprocess, load_from_cache_file=not cfg.is_preprocess,
desc="Dropping Long Sequences",
) )
if cfg.group_by_length: if cfg.group_by_length:
@@ -147,6 +149,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
add_length, add_length,
num_proc=cfg.dataset_processes, num_proc=cfg.dataset_processes,
load_from_cache_file=not cfg.is_preprocess, load_from_cache_file=not cfg.is_preprocess,
desc="Group By Length",
) )
if cfg.sample_packing: if cfg.sample_packing:
@@ -154,6 +157,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
add_position_ids, add_position_ids,
num_proc=cfg.dataset_processes, num_proc=cfg.dataset_processes,
load_from_cache_file=not cfg.is_preprocess, load_from_cache_file=not cfg.is_preprocess,
desc="Add position_id column (Sample Packing)",
) )
if cfg.eval_sample_packing is not False: if cfg.eval_sample_packing is not False:
if eval_dataset: if eval_dataset:
@@ -161,6 +165,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
add_position_ids, add_position_ids,
num_proc=cfg.dataset_processes, num_proc=cfg.dataset_processes,
load_from_cache_file=not cfg.is_preprocess, load_from_cache_file=not cfg.is_preprocess,
desc="Add position_id column (Sample Packing)",
) )
return train_dataset, eval_dataset return train_dataset, eval_dataset
@@ -169,9 +174,13 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
def process_pretraining_datasets_for_packing(train_dataset, sequence_len): def process_pretraining_datasets_for_packing(train_dataset, sequence_len):
drop_long = partial(drop_long_seq, sequence_len=sequence_len) drop_long = partial(drop_long_seq, sequence_len=sequence_len)
train_dataset = train_dataset.filter(drop_long) train_dataset = train_dataset.filter(
drop_long,
desc="Dropping Long Sequences",
)
train_dataset = train_dataset.map( train_dataset = train_dataset.map(
add_position_ids, add_position_ids,
desc="Add position_id column (Pretraining Sample Packing)",
) )
return train_dataset return train_dataset