Update data.py for signature generation (#851)
* Update data.py

  Changing the conversation formatting type should also trigger regeneration of the preprocessed dataset, so it should be part of the signature.

* chore: lint

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
This commit is contained in:
@@ -99,7 +99,12 @@ def load_tokenized_prepared_datasets(
     str(cfg.sequence_len)
     + "@"
     + "|".join(
-        sorted([f"{d.path}:{d.type}:{d.shards}" for d in cfg.datasets])
+        sorted(
+            [
+                f"{d.path}:{d.type}:{d.shards}:{d.conversation}"
+                for d in cfg.datasets
+            ]
+        )
     )
     + "|"
     + tokenizer_name
Reference in New Issue
Block a user