Fix missing evals
app.py (CHANGED)
@@ -10,7 +10,7 @@ DESCRIPTION = f"""
 Evaluation of H4 and community models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
 """
 
-BENCHMARKS_TO_SKIP = ["math", "mini_math", "aimo_math_integer_lvl4-5"]
+BENCHMARKS_TO_SKIP = ["math", "mini_math", "aimo_math_integer_lvl4-5", "mini_math_v2"]
 
 
 def get_leaderboard_df():
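
Note: this hunk adds "mini_math_v2" to BENCHMARKS_TO_SKIP. The loop that applies this list is outside the diff, so the sketch below is only an illustration of how such a skip list is typically consumed; the `should_skip` helper, the exact-match rule, and the sample task names are assumptions, not the actual code in app.py.

# Hedged sketch: one plausible way a skip list like BENCHMARKS_TO_SKIP is applied.
BENCHMARKS_TO_SKIP = ["math", "mini_math", "aimo_math_integer_lvl4-5", "mini_math_v2"]

def should_skip(task: str) -> bool:
    # Assumption: a task is skipped when its lowercased name appears in the list.
    return task.lower() in BENCHMARKS_TO_SKIP

tasks = ["truthfulqa", "mmlu", "mini_math_v2", "bbh"]
print([t for t in tasks if not should_skip(t)])  # ['truthfulqa', 'mmlu', 'bbh']
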
@@ -48,27 +48,34 @@ def get_leaderboard_df():
             # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
             if task.lower() == "truthfulqa":
                 value = data["results"][first_result_key]["truthfulqa_mc2"]
+                df.loc[model_revision, task] = float(value)
             # IFEval has several metrics but we report just the prompt-loose-acc one
             elif task.lower() == "ifeval":
                 value = data["results"][first_result_key]["prompt_level_loose_acc"]
+                df.loc[model_revision, task] = float(value)
             # MMLU has several metrics but we report just the average one
             elif task.lower() == "mmlu":
                 value = [v["acc"] for k, v in data["results"].items() if "_average" in k.lower()][0]
+                df.loc[model_revision, task] = float(value)
             # HellaSwag and ARC reports acc_norm
             elif task.lower() in ["hellaswag", "arc"]:
                 value = data["results"][first_result_key]["acc_norm"]
+                df.loc[model_revision, task] = float(value)
             # BBH has several metrics but we report just the average one
             elif task.lower() == "bbh":
                 if "all" in data["results"]:
                     value = data["results"]["all"]["acc"]
                 else:
                     value = -100
+                df.loc[model_revision, task] = float(value)
             # AGIEval reports acc_norm
             elif task.lower() == "agieval":
                 value = data["results"]["all"]["acc_norm"]
+                df.loc[model_revision, task] = float(value)
             # MATH reports qem
-            elif task.lower() in ["
+            elif task.lower() in ["aimo_kaggle", "math_deepseek_cot", "math_deepseek_rl_cot"]:
                 value = data["results"]["all"]["qem"]
+                df.loc[model_revision, task] = float(value)
             # For mini_math we report 5 metrics, one for each level and store each one as a separate row in the dataframe
             elif task.lower() in ["mini_math_v2"]:
                 for k, v in data["results"].items():