dataset handling and aggregate across benchmark
This commit is contained in:
@@ -125,13 +125,25 @@ def bench_eval_callback_factory(trainer, tokenizer):
|
|||||||
]
|
]
|
||||||
bench_split = "eval"
|
bench_split = "eval"
|
||||||
if trainer.args.bench_dataset == "sampled":
|
if trainer.args.bench_dataset == "sampled":
|
||||||
|
|
||||||
|
def transform_subject(example):
|
||||||
|
# Split on ':' and trim whitespace
|
||||||
|
parts = example["subject"].split(":")
|
||||||
|
first_part = parts[0].strip().lower() # Lowercase the first part
|
||||||
|
second_part = (
|
||||||
|
parts[1].strip().replace("-", "_")
|
||||||
|
) # Replace hyphens with underscores
|
||||||
|
|
||||||
|
# Return the transformed values
|
||||||
|
return {"name": first_part, "subject": second_part}
|
||||||
|
|
||||||
bench_dataset = load_dataset(
|
bench_dataset = load_dataset(
|
||||||
"pharaouk/dharma-1",
|
"pharaouk/dharma-1",
|
||||||
data_files={
|
data_files={
|
||||||
"eval": "dharma_eval.json",
|
"eval": "dharma_eval.json",
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
# bench_dataset = bench_dataset.remove_columns("subject")
|
bench_dataset["eval"] = bench_dataset["eval"].map(transform_subject)
|
||||||
elif trainer.args.bench_dataset == "mmlu-zs":
|
elif trainer.args.bench_dataset == "mmlu-zs":
|
||||||
bench_dataset = load_dataset(
|
bench_dataset = load_dataset(
|
||||||
"openaccess-ai-collective/mmlu-evals",
|
"openaccess-ai-collective/mmlu-evals",
|
||||||
@@ -228,21 +240,21 @@ def bench_eval_callback_factory(trainer, tokenizer):
|
|||||||
loss_bench += loss.item()
|
loss_bench += loss.item()
|
||||||
# Extract results by subject.
|
# Extract results by subject.
|
||||||
results = {"bench_loss": loss_bench / len(data_loader)}
|
results = {"bench_loss": loss_bench / len(data_loader)}
|
||||||
subject = bench_dataset["subject"]
|
bench_name = bench_dataset["name"]
|
||||||
subjects: dict = {s: {"refs": [], "preds": []} for s in set(subject)}
|
bench_names: dict = {s: {"refs": [], "preds": []} for s in set(bench_name)}
|
||||||
for s, p, r in zip(subject, preds, refs): # pylint: disable=invalid-name
|
for s, p, r in zip(bench_name, preds, refs): # pylint: disable=invalid-name
|
||||||
subjects[s]["preds"].append(p)
|
bench_names[s]["preds"].append(p)
|
||||||
subjects[s]["refs"].append(r)
|
bench_names[s]["refs"].append(r)
|
||||||
subject_scores = []
|
bench_scores = []
|
||||||
for subject in subjects:
|
for bench_name in bench_names:
|
||||||
subject_score = accuracy.compute(
|
bench_score = accuracy.compute(
|
||||||
references=subjects[subject]["refs"],
|
references=bench_names[bench_name]["refs"],
|
||||||
predictions=subjects[subject]["preds"],
|
predictions=bench_names[bench_name]["preds"],
|
||||||
)["accuracy"]
|
)["accuracy"]
|
||||||
if not pd.isna(subject_score):
|
if not pd.isna(bench_score):
|
||||||
results[f"bench_{bench_split}_accuracy_{subject}"] = subject_score
|
results[f"bench_{bench_split}_accuracy_{bench_name}"] = bench_score
|
||||||
subject_scores.append(subject_score)
|
bench_scores.append(bench_score)
|
||||||
results[f"bench_{bench_split}_accuracy"] = np.mean(subject_scores)
|
results[f"bench_{bench_split}_accuracy"] = np.mean(bench_scores)
|
||||||
trainer.log(results)
|
trainer.log(results)
|
||||||
trainer.data_collator.max_length = source_max_len
|
trainer.data_collator.max_length = source_max_len
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user