Fix falcon tokenization step (#1441) [skip ci]

* Fix falcon tokenization step

* chore: lint

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
This commit is contained in:
Far El
2024-03-26 15:19:34 -04:00
committed by GitHub
parent c19d060a74
commit bcdc9b1601

View File

@@ -124,9 +124,10 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
eval_dataset = eval_dataset.remove_columns("attention_mask")
if cfg.model_config_type == "falcon":
LOG.info("dropping token_type_ids column")
train_dataset = train_dataset.remove_columns("token_type_ids")
if eval_dataset:
LOG.info("dropping token_type_ids column if it exists")
if "token_type_ids" in train_dataset.column_names:
train_dataset = train_dataset.remove_columns("token_type_ids")
if eval_dataset and "token_type_ids" in eval_dataset.column_names:
eval_dataset = eval_dataset.remove_columns("token_type_ids")
train_dataset = train_dataset.filter(