import gradio as gr
import time
from apscheduler.schedulers.background import BackgroundScheduler
import threading

import globals
from utils.io import initialize_models_providers_file, save_results, load_results, load_models_providers, get_results_table, load_models_providers_str, get_summary_stats
from utils.jobs import run_single_job, launch_jobs, update_job_statuses, relaunch_failed_jobs
from typing import List, Optional


def status_monitor() -> None:
    """Background thread to monitor job statuses."""
    while True:
        update_job_statuses()
        time.sleep(240)  # Check every 4 minutes


def daily_checkpoint() -> None:
    """Daily checkpoint - save current state."""
    print("Daily checkpoint - saving current state")
    save_results()


# Create Gradio interface
def create_app() -> gr.Blocks:
    with gr.Blocks(title="Inference Provider Testing Dashboard") as demo:
        with gr.Tab("Main"):
            gr.Markdown("# Inference Provider Testing Dashboard")
            gr.Markdown("Launch and monitor evaluation jobs for multiple models and providers.")

            # Simplified action buttons - only essential ones
            with gr.Row():
                launch_btn = gr.Button("Launch All Jobs", variant="primary", scale=2)
                relaunch_failed_btn = gr.Button("Relaunch Failed", variant="stop", scale=1)

            output = gr.Textbox(label="Status", interactive=False)

            # Accordion for viewing/editing models/providers list and initialization
            with gr.Accordion("Models/Providers Configuration", open=False):
                init_btn = gr.Button("Fetch and Initialize Models/Providers", variant="secondary")
                models_providers_display = gr.Code(
                    label="Current Models and Providers",
                    value=load_models_providers_str(),
                    interactive=False,
                )

            # Summary statistics
            summary_stats = gr.Markdown(value=get_summary_stats())

            with gr.Row():
                with gr.Column():
                    gr.Markdown("## Job Results")
                    results_table = gr.Dataframe(
                        value=get_results_table(),
                        interactive=True,
                        show_search="search",
                        show_copy_button=True,
                        show_fullscreen_button=True,
                        wrap=True,
                        static_columns=list(range(9)),
                        datatype=["str", "str", "str", "str", "str", "str", "str", "str", "html", "str"],
                        elem_id="results_table"
                    )

            # Event handlers
            init_btn.click(
                fn=initialize_models_providers_file,
                outputs=[output, models_providers_display]
            )

            def launch_and_update():
                """Launch jobs and return updated table and stats."""
                result = launch_jobs()
                return result, get_results_table(), get_summary_stats()

            def relaunch_and_update():
                """Relaunch failed jobs and return updated table and stats."""
                result = relaunch_failed_jobs()
                return result, get_results_table(), get_summary_stats()

            launch_btn.click(
                fn=launch_and_update,
                outputs=[output, results_table, summary_stats]
            )

            relaunch_failed_btn.click(
                fn=relaunch_and_update,
                outputs=[output, results_table, summary_stats]
            )

            # Handle dataframe cell selection for relaunch
            def handle_table_select(evt: gr.SelectData):
                """Handle when a cell in the results table is clicked."""
                print(f"[Relaunch] Cell selected - Row: {evt.index[0]}, Col: {evt.index[1]}, Value: {evt.value}")
                # If we selected a "rerun" cell, we relaunch a job
                if evt.index[1] == 9:
                    # Get the full row data from the dataframe
                    df = get_results_table()
                    row_data = df.data.iloc[evt.index[0]]
                    model = row_data['Model']
                    provider = row_data['Provider']
                    print(f"[Relaunch] Relaunching job - Model: {model}, Provider: {provider}")
                    run_single_job(model, provider, globals.TASKS)
                    # Save after individual relaunch
                    save_results()
                # Then update the table and stats
                return get_results_table(), get_summary_stats()

            results_table.select(
                fn=handle_table_select,
                inputs=[],
                outputs=[results_table, summary_stats]
            )

            # Auto-refresh table and stats every 30 seconds
            def auto_refresh():
                """Auto-refresh table and summary stats."""
                return get_results_table(), get_summary_stats()

            demo.load(
                fn=auto_refresh,
                inputs=[],
                outputs=[results_table, summary_stats],
                every=30
            )

        with gr.Tab("About"):
            gr.Markdown("""
            In this demo, we run 10 samples of 3 evaluations: ifeval (instruction following), gsm_plus (grade school math problems, less contaminated than gsm8k) and gpqa, diamond subset (knowledge), with `lighteval`, `inference-providers` and `jobs`.
            The "status" column indicates whether the evaluation failed completely (usually because the provider was down or because we were rate limited).

            To run any of these locally, you can use the following:
            ```python
            from huggingface_hub import run_job, inspect_job, whoami

            job = run_job(
                image="hf.co/spaces/OpenEvals/EvalsOnTheHub",
                command=[
                    "lighteval", "endpoint", "inference-providers",
                    "model_name=MODEL,provider=PROVIDER",
                    "extended|ifeval|0,lighteval|gsm_plus|0,lighteval|gpqa:diamond|0",
                    "--max-samples", "10",
                    "--push-to-hub", "--save-details",
                    "--results-org", "YOURORG"
                ],
                namespace="huggingface",
                secrets={"HF_TOKEN": YOURTOKEN},
                token=YOURTOKEN
            )
            ```
            """)

    return demo


if __name__ == "__main__":
    # Load previous results
    load_results()

    print("Starting Inference Provider Testing Dashboard")

    # Start status monitor thread
    monitor_thread = threading.Thread(target=status_monitor, daemon=True)
    monitor_thread.start()
    print("Job status monitor started")

    # Start APScheduler for daily checkpoint
    scheduler = BackgroundScheduler()
    scheduler.add_job(daily_checkpoint, 'cron', hour=0, minute=0)  # Run at midnight
    scheduler.start()
    print("Daily checkpoint scheduler started (saves at 00:00)")

    # Create and launch the Gradio interface
    demo = create_app()
    demo.launch(server_name="0.0.0.0", server_port=7860)