import json
import metrics
import argparse
import numpy as np
import multiprocessing
from tqdm import trange
import signal, functools
import re, os, sys, random, time
from fraction import Fraction
from data_processing.answer_extraction import *
from functools import lru_cache
from eval.eval_script import *

MAX_INT = sys.maxsize
INVALID_ANS = "[Invalid]"
INF = 1e9

__all__ = [
    "check_equal",
    "check_equal_without_timeout",
    "numberic_compare",
    "Evaluator",
]

@lru_cache(maxsize=1000000)
def check_equal_without_timeout(ans_1, ans_2):
    # Memoized answer-equivalence check; math_equal comes from eval.eval_script.
    return math_equal(ans_1, ans_2)

def check_equal(ans_1, ans_2, cache_dict=None):
    try:
        if cache_dict is not None:
            key = str(ans_1) + "<##>" + str(ans_2)
            if key in cache_dict:
                return cache_dict[key]
            print("Miss")  # cache miss; fall through to the memoized check
        return check_equal_without_timeout(ans_1, ans_2)
    except TimeoutError:
        # A comparison that times out is treated as "not equal".
        return False

def numberic_compare(ai, aj, ci, cj, cache_dict=None):
    # Compare two extracted answers; the raw completions ci/cj are ignored here.
    return check_equal(ai, aj, cache_dict)

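# The cache handed to check_equal is a plain mapping from "ans1<##>ans2" keys
# to booleans (the `cache_file` argument used further below is such a dict,
# despite its name). A minimal sketch of pre-warming one; the "1/2" vs "0.5"
# pair is a hypothetical example:
#
#   cache = {}
#   cache["1/2" + "<##>" + "0.5"] = True
#   check_equal("1/2", "0.5", cache)  # answered from the cache; math_equal is not called
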
def prep_evaluator(
    predicts, completions, perplexities, answer, equal_func, check_equal
):
    m = len(predicts)

    # Find the highest mean log-prob and how many candidates attain it.
    max_perplexity = -INF
    max_perplexity_count = 0.0
    for i in range(m):
        if perplexities[i] > max_perplexity:
            max_perplexity = perplexities[i]
            max_perplexity_count = 0.0
        if perplexities[i] >= max_perplexity:
            max_perplexity_count += 1.0

    # Score only the candidate(s) at the maximum; ties split the credit evenly.
    correct, answers = 0, []
    for i in range(m):
        ans_i = predicts[i]
        answers.append([ans_i, np.exp(perplexities[i]), check_equal(ans_i, answer)])
        if perplexities[i] < max_perplexity:
            continue
        if check_equal(ans_i, answer):
            correct += 1.0 / max_perplexity_count

    return correct, answers

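# A worked sanity check for prep_evaluator (hypothetical inputs; the real
# pipeline feeds it model outputs). With mean log-probs [-0.5, -0.2, -0.2],
# two candidates tie at the maximum, so the single correct one among them
# earns 1/2 credit:
#
#   correct, answers = prep_evaluator(
#       predicts=["4", "5", "4"],
#       completions=["...", "...", "..."],
#       perplexities=[-0.5, -0.2, -0.2],
#       answer="4",
#       equal_func=numberic_compare,
#       check_equal=check_equal,
#   )
#   # correct == 0.5; answers holds [answer, exp(logprob), is_correct] per candidate
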
class Evaluator:
    def __init__(self):
        self.name = "Perplexity"

    def process(self, json_file, cache_file, equal_func, evaluator, K, seed=0):
        results = json_file
        n = len(results["predict"])
        m = len(results["predict"][0])
        # Sub-sample K of the m candidates per question, deterministically per seed.
        indices = list(range(m))
        random.seed(seed)
        random.shuffle(indices)
        indices = indices[:K]

        # Bind the shared cache (if any) into the comparison callbacks.
        if cache_file is not None:
            def cache_equal_func(ai, aj, ci, cj):
                return equal_func(ai, aj, ci, cj, cache_file)

            def cache_check_equal(ai, aj):
                return check_equal(ai, aj, cache_file)
        else:
            cache_equal_func = equal_func
            cache_check_equal = check_equal

        predicts, completions, perplexities, answers = [], [], [], []
        for i in range(n):
            predicts.append([results["predict"][i][j] for j in indices])
            completions.append([results["completion"][i][j] for j in indices])
            perplexities.append([results["mean_logprob"][i][j] for j in indices])
            answers.append(results["answer"][i])
        n = len(predicts)

        start_time = time.time()
        outputs = []
        for idx in trange(n):
            res = evaluator(
                predicts[idx],
                completions[idx],
                perplexities[idx],
                answers[idx],
                cache_equal_func,
                cache_check_equal,
            )
            outputs.append(res)
        print(f"Running time (single-process mode, seed #{seed}): {time.time() - start_time:.2f}s")

        # Aggregate the per-candidate [answer, probability, correctness] triples
        # into calibration and accuracy metrics.
        maximum, max_bins = metrics.compute_maximum_metrics([x[1] for x in outputs])
        average, avg_bins = metrics.compute_average_metrics([x[1] for x in outputs])
        accs = np.mean([x[0] for x in outputs])
        return accs * 100.0, maximum, average, max_bins, avg_bins

    def worker(self, args):
        # One evaluation run for a single seed (used by the multiprocessing pool).
        json_file, cache_file, K, seed = args
        acc, maximum, average, max_bins, avg_bins = self.process(
            json_file=json_file,
            cache_file=cache_file,
            equal_func=numberic_compare,
            evaluator=prep_evaluator,
            K=K,
            seed=seed,
        )
        return acc, maximum, average

    def solve(self, json_file, cache_file=None, repeats=10, K=128):
        # Repeat the sub-sampled evaluation across seeds and report mean ± std.
        with multiprocessing.Pool() as pool:
            results = pool.map(self.worker, [(json_file, cache_file, K, seed) for seed in range(repeats)])
        accs, maxs, _ = zip(*results)
        accs, maxs = np.array(accs), np.array(maxs)
        return {
            "Accuracy": f"{accs.mean():.2f} ± {accs.std():.2f}",
            "ECE": f"{maxs[:, 0].mean() * 100.0:.2f} ± {maxs[:, 0].std() * 100.0:.2f}",
        }

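
# Example driver, given as a sketch: the layout of the results dict (parallel
# per-question lists of candidate predictions, completions, and mean log-probs,
# plus one gold answer per question) is inferred from `process`; the toy values
# below are hypothetical. Note that `solve` expects this dict, not a file path.
if __name__ == "__main__":
    toy_results = {
        "predict": [["4", "5", "4"], ["7", "7", "8"]],
        "completion": [["..."] * 3, ["..."] * 3],
        "mean_logprob": [[-0.5, -0.2, -0.2], [-0.1, -0.3, -0.9]],
        "answer": ["4", "7"],
    }
    print(Evaluator().solve(toy_results, repeats=2, K=3))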