import gradio as gr
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
INTRODUCTION_TEXT,
LLM_BENCHMARKS_TEXT,
TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
BENCHMARK_COLS,
COLS,
AutoEvalColumn,
fields,
)
from src.envs import (
API,
EVAL_REQUESTS_PATH,
EVAL_RESULTS_PATH,
REPO_ID,
TOKEN,
)
from src.populate import get_leaderboard_df, get_model_info_df, get_merged_df
def restart_space():
    """Restart the Space named by ``REPO_ID`` via ``API``, authenticating with ``TOKEN``."""
    API.restart_space(token=TOKEN, repo_id=REPO_ID)
def _load_leaderboard(csv_suffix):
    """Load the leaderboard CSV found at ``EVAL_RESULTS_PATH + csv_suffix``."""
    return get_leaderboard_df(EVAL_RESULTS_PATH + csv_suffix, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)


# One dataframe per leaderboard view, each loaded once when the module is imported.
LEADERBOARD_DF = _load_leaderboard("/leaderboards/BOOM_leaderboard.csv")
LEADERBOARD_DF_DOMAIN = _load_leaderboard("/leaderboards/BOOM_domain_leaderboard.csv")
LEADERBOARD_DF_METRIC_TYPE = _load_leaderboard("/leaderboards/BOOM_metric_type_leaderboard.csv")
LEADERBOARD_DF_TERM = _load_leaderboard("/leaderboards/BOOM_term_leaderboard.csv")
LEADERBOARD_DF_BOOMLET = _load_leaderboard("/leaderboards/BOOMLET_leaderboard.csv")

# Model metadata that init_leaderboard merges into every leaderboard view.
model_info_df = get_model_info_df(EVAL_RESULTS_PATH)
# (
# finished_eval_queue_df,
# running_eval_queue_df,
# pending_eval_queue_df,
# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
def init_leaderboard(dataframe, model_info_df):
    """Build the read-only Gradio table for one leaderboard view.

    The leaderboard is merged with the model metadata, sorted, reordered so
    that the model-type symbol and the model name come first, stripped of
    hidden columns and finally wrapped in a ``gr.Dataframe``.

    Args:
        dataframe: Leaderboard data; must be neither ``None`` nor empty.
        model_info_df: Model metadata handed to ``get_merged_df``.

    Returns:
        A non-interactive ``gr.Dataframe`` with search/filter enabled.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or empty.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    table = get_merged_df(dataframe, model_info_df)

    # Ascending by "Rank" when available, otherwise by the first CRPS column (if any).
    if "Rank" in table.columns:
        table = table.sort_values(by=["Rank"], ascending=True)
    else:
        crps_columns = [name for name in table.columns if "CRPS" in name]
        if crps_columns:
            table = table.sort_values(by=crps_columns[0], ascending=True)

    # Symbol and model name lead; every other column follows alphabetically.
    leading = [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]
    trailing = sorted(name for name in table.columns if name not in leading)
    table = table[leading + trailing]

    column_fields = list(fields(AutoEvalColumn))

    # Hidden columns are dropped; names absent from the table are ignored.
    hidden = [field.name for field in column_fields if field.hidden]
    table = table.drop(columns=hidden, errors="ignore")

    # Columns unknown to AutoEvalColumn are displayed as numbers.
    type_by_column = {field.name: field.type for field in column_fields}
    datatypes = [type_by_column.get(name, "number") for name in table.columns]

    # Fixed widths for the two leading columns, a uniform width for the rest.
    extra_columns = len(table.columns) - 2
    return gr.Dataframe(
        value=table,
        datatype=datatypes,
        show_search="filter",  # search box plus per-column filtering
        column_widths=[40, 180] + [160] * extra_columns,
        wrap=True,
        interactive=False,
        max_height=600,
    )
# Assemble the UI: title and introduction, one tab per leaderboard view plus
# an "About" tab, and a collapsible citation box.
# NOTE(review): the emoji in the tab/accordion labels were mis-encoded in the
# reviewed copy (which also split several string literals across two lines);
# they are restored here as 🏅 / 📝 / 📙 — confirm against the repository.
# NOTE(review): the citation row is placed at the Blocks level, below the
# tabs; the reviewed copy had no indentation — confirm the intended nesting.
demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Overall", elem_id="boom-benchmark-tab-table", id=0):
            leaderboard = init_leaderboard(LEADERBOARD_DF, model_info_df)

        with gr.TabItem("🏅 By Domain", elem_id="boom-benchmark-tab-table", id=1):
            leaderboard = init_leaderboard(LEADERBOARD_DF_DOMAIN, model_info_df)

        with gr.TabItem("🏅 By Metric Type", elem_id="boom-benchmark-tab-table", id=2):
            leaderboard = init_leaderboard(LEADERBOARD_DF_METRIC_TYPE, model_info_df)

        with gr.TabItem("🏅 By Forecast Horizon", elem_id="boom-benchmark-tab-table", id=3):
            leaderboard = init_leaderboard(LEADERBOARD_DF_TERM, model_info_df)

        with gr.TabItem("🏅 BOOMLET", elem_id="boom-benchmark-tab-table", id=4):
            leaderboard = init_leaderboard(LEADERBOARD_DF_BOOMLET, model_info_df)

        with gr.TabItem("📝 About", elem_id="boom-benchmark-tab-table", id=5):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
            )

# Queue memory leak fixed in Gradio 6.11+, no need for scheduled restarts
demo.queue(default_concurrency_limit=40).launch(css=custom_css)
|