Spaces:

OpenEvals
/

InferenceProviderTesting

Running

App Files Community

Clémentine committed on Nov 4

Commit

6e44082

1 Parent(s): 31c57c2

vc running 4 runs

Browse files

Files changed (2) hide show

app.py +12 -12
globals.py +2 -1

app.py CHANGED Viewed

@@ -4,7 +4,7 @@ from apscheduler.schedulers.background import BackgroundScheduler
 import threading
 import globals
 from utils.io import initialize_models_providers_file, save_results, load_results, load_models_providers, get_results_table, load_models_providers_str, get_summary_stats
-from utils.jobs import run_single_job, launch_jobs, update_job_statuses, relaunch_failed_jobs
 from typing import List, Optional
@@ -65,24 +65,24 @@ def create_app() -> gr.Blocks:
                         show_copy_button=True,
                         show_fullscreen_button=True,
                         wrap=True,
-                        static_columns=list(range(9)),
-                        datatype=["str", "str", "str", "str", "str", "str", "str", "str", "html", "str"],
                         elem_id="results_table"
                     )
             # Event handlers
             def launch_single_and_update(model: str, provider: str):
-                """Launch a single job and return updated table and stats."""
                 if not model or not provider:
                     return "❌ Please provide both model and provider", get_results_table(), get_summary_stats()
-                job_id = run_single_job(model, provider, globals.TASKS)
-                if job_id == -1:
-                    return "❌ Failed to launch job (may already be running)", get_results_table(), get_summary_stats()
                 save_results()
-                return f"✅ Launched job for {model} on {provider} (ID: {job_id})", get_results_table(), get_summary_stats()
             launch_single_btn.click(
                 fn=launch_single_and_update,
@@ -116,17 +116,17 @@ def create_app() -> gr.Blocks:
                 print(f"[Relaunch] Cell selected - Row: {evt.index[0]}, Col: {evt.index[1]}, Value: {evt.value}")
                 # If we selected a "rerun" cell, we relaunch a job
-                if evt.index[1] == 9:
                     # Get the full row data from the dataframe
                     df = get_results_table()
                     row_data = df.data.iloc[evt.index[0]]
                     model = row_data['Model']
                     provider = row_data['Provider']
-                    print(f"[Relaunch] Relaunching job - Model: {model}, Provider: {provider}")
-                    run_single_job(model, provider, globals.TASKS)
-                    # Save after individual relaunch
                     save_results()
                 # Then update the table and stats

 import threading
 import globals
 from utils.io import initialize_models_providers_file, save_results, load_results, load_models_providers, get_results_table, load_models_providers_str, get_summary_stats
+from utils.jobs import run_single_job, run_multiple_jobs, launch_jobs, update_job_statuses, relaunch_failed_jobs
 from typing import List, Optional
                         show_copy_button=True,
                         show_fullscreen_button=True,
                         wrap=True,
+                        static_columns=list(range(11)),
+                        datatype=["str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "html", "str"],
                         elem_id="results_table"
                     )
             # Event handlers
             def launch_single_and_update(model: str, provider: str):
+                """Launch multiple jobs for a model-provider combination and return updated table and stats."""
                 if not model or not provider:
                     return "❌ Please provide both model and provider", get_results_table(), get_summary_stats()
+                job_ids = run_multiple_jobs(model, provider, globals.TASKS)
+                if not job_ids:
+                    return "❌ Failed to launch jobs (may already be running)", get_results_table(), get_summary_stats()
                 save_results()
+                return f"✅ Launched {len(job_ids)} jobs for {model} on {provider}", get_results_table(), get_summary_stats()
             launch_single_btn.click(
                 fn=launch_single_and_update,
                 print(f"[Relaunch] Cell selected - Row: {evt.index[0]}, Col: {evt.index[1]}, Value: {evt.value}")
                 # If we selected a "rerun" cell, we relaunch the jobs for that row
+                if evt.index[1] == 11:
                     # Get the full row data from the dataframe
                     df = get_results_table()
                     row_data = df.data.iloc[evt.index[0]]
                     model = row_data['Model']
                     provider = row_data['Provider']
+                    print(f"[Relaunch] Relaunching {globals.NUM_RUNS_PER_JOB} jobs - Model: {model}, Provider: {provider}")
+                    run_multiple_jobs(model, provider, globals.TASKS)
+                    # Save after relaunch
                     save_results()
                 # Then update the table and stats

globals.py CHANGED Viewed

@@ -4,7 +4,7 @@ import threading
 from typing import Dict, Any, Optional
 # Type definition for job result entries
-JobResult = Dict[str, Any]  # {model, provider, last_run, status, current_score, previous_score, job_id, start_time, duration, completed_at}
 # Global variables to track jobs
 job_results: Dict[str, JobResult] = {}  # {model_provider_key: JobResult}
@@ -12,6 +12,7 @@ results_lock: threading.Lock = threading.Lock()
 # Configuration
 NUM_MODELS_RUN: int = 100
 RESULTS_DATASET_NAME: str = "IPTesting/inference-provider-test-results"
 LOCAL_CONFIG_FILE: str = "/home/user/app/model_providers.txt"
 TASKS: str = "extended|ifeval|0,lighteval|gsm_plus|0,lighteval|gpqa:diamond|0"

 from typing import Dict, Any, Optional
 # Type definition for job result entries
+JobResult = Dict[str, Any]  # {model, provider, last_run, status, current_score, previous_score, job_id, start_time, duration, completed_at, runs: [{job_id, score, status, start_time, duration, completed_at}]}
 # Global variables to track jobs
 job_results: Dict[str, JobResult] = {}  # {model_provider_key: JobResult}
 # Configuration
 NUM_MODELS_RUN: int = 100
+NUM_RUNS_PER_JOB: int = 4  # Number of times to run each job for variance reduction
 RESULTS_DATASET_NAME: str = "IPTesting/inference-provider-test-results"
 LOCAL_CONFIG_FILE: str = "/home/user/app/model_providers.txt"
 TASKS: str = "extended|ifeval|0,lighteval|gsm_plus|0,lighteval|gpqa:diamond|0"