Update data.py for signature generation (#851)
* Update data.py

  A change of conversation formatting type should also trigger regeneration of the preprocessed dataset, so the conversation type must be part of the signature.

* chore: lint

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
This commit is contained in:
@@ -99,7 +99,12 @@ def load_tokenized_prepared_datasets(
|
||||
str(cfg.sequence_len)
|
||||
+ "@"
|
||||
+ "|".join(
|
||||
sorted([f"{d.path}:{d.type}:{d.shards}" for d in cfg.datasets])
|
||||
sorted(
|
||||
[
|
||||
f"{d.path}:{d.type}:{d.shards}:{d.conversation}"
|
||||
for d in cfg.datasets
|
||||
]
|
||||
)
|
||||
)
|
||||
+ "|"
|
||||
+ tokenizer_name
|
||||
|
||||
Reference in New Issue
Block a user