Clémentine
committed on
Commit
·
6e44082
1
Parent(s):
31c57c2
vc running 4 runs
Browse files
- app.py +12 -12
- globals.py +2 -1
app.py
CHANGED
|
@@ -4,7 +4,7 @@ from apscheduler.schedulers.background import BackgroundScheduler
|
|
| 4 |
import threading
|
| 5 |
import globals
|
| 6 |
from utils.io import initialize_models_providers_file, save_results, load_results, load_models_providers, get_results_table, load_models_providers_str, get_summary_stats
|
| 7 |
-
from utils.jobs import run_single_job, launch_jobs, update_job_statuses, relaunch_failed_jobs
|
| 8 |
from typing import List, Optional
|
| 9 |
|
| 10 |
|
|
@@ -65,24 +65,24 @@ def create_app() -> gr.Blocks:
|
|
| 65 |
show_copy_button=True,
|
| 66 |
show_fullscreen_button=True,
|
| 67 |
wrap=True,
|
| 68 |
-
static_columns=list(range(
|
| 69 |
-
datatype=["str", "str", "str", "str", "str", "str", "str", "str", "html", "str"],
|
| 70 |
elem_id="results_table"
|
| 71 |
)
|
| 72 |
|
| 73 |
|
| 74 |
# Event handlers
|
| 75 |
def launch_single_and_update(model: str, provider: str):
|
| 76 |
-
"""Launch a
|
| 77 |
if not model or not provider:
|
| 78 |
return "❌ Please provide both model and provider", get_results_table(), get_summary_stats()
|
| 79 |
|
| 80 |
-
|
| 81 |
-
if
|
| 82 |
-
return "❌ Failed to launch
|
| 83 |
|
| 84 |
save_results()
|
| 85 |
-
return f"✅ Launched
|
| 86 |
|
| 87 |
launch_single_btn.click(
|
| 88 |
fn=launch_single_and_update,
|
|
@@ -116,17 +116,17 @@ def create_app() -> gr.Blocks:
|
|
| 116 |
print(f"[Relaunch] Cell selected - Row: {evt.index[0]}, Col: {evt.index[1]}, Value: {evt.value}")
|
| 117 |
|
| 118 |
# If we selected a "rerun" cell, we relaunch a job
|
| 119 |
-
if evt.index[1] ==
|
| 120 |
# Get the full row data from the dataframe
|
| 121 |
df = get_results_table()
|
| 122 |
row_data = df.data.iloc[evt.index[0]]
|
| 123 |
|
| 124 |
model = row_data['Model']
|
| 125 |
provider = row_data['Provider']
|
| 126 |
-
print(f"[Relaunch] Relaunching
|
| 127 |
|
| 128 |
-
|
| 129 |
-
# Save after
|
| 130 |
save_results()
|
| 131 |
|
| 132 |
# Then update the table and stats
|
|
|
|
| 4 |
import threading
|
| 5 |
import globals
|
| 6 |
from utils.io import initialize_models_providers_file, save_results, load_results, load_models_providers, get_results_table, load_models_providers_str, get_summary_stats
|
| 7 |
+
from utils.jobs import run_single_job, run_multiple_jobs, launch_jobs, update_job_statuses, relaunch_failed_jobs
|
| 8 |
from typing import List, Optional
|
| 9 |
|
| 10 |
|
|
|
|
| 65 |
show_copy_button=True,
|
| 66 |
show_fullscreen_button=True,
|
| 67 |
wrap=True,
|
| 68 |
+
static_columns=list(range(11)),
|
| 69 |
+
datatype=["str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "html", "str"],
|
| 70 |
elem_id="results_table"
|
| 71 |
)
|
| 72 |
|
| 73 |
|
| 74 |
# Event handlers
|
| 75 |
def launch_single_and_update(model: str, provider: str):
|
| 76 |
+
"""Launch multiple jobs for a model-provider combination and return updated table and stats."""
|
| 77 |
if not model or not provider:
|
| 78 |
return "❌ Please provide both model and provider", get_results_table(), get_summary_stats()
|
| 79 |
|
| 80 |
+
job_ids = run_multiple_jobs(model, provider, globals.TASKS)
|
| 81 |
+
if not job_ids:
|
| 82 |
+
return "❌ Failed to launch jobs (may already be running)", get_results_table(), get_summary_stats()
|
| 83 |
|
| 84 |
save_results()
|
| 85 |
+
return f"✅ Launched {len(job_ids)} jobs for {model} on {provider}", get_results_table(), get_summary_stats()
|
| 86 |
|
| 87 |
launch_single_btn.click(
|
| 88 |
fn=launch_single_and_update,
|
|
|
|
| 116 |
print(f"[Relaunch] Cell selected - Row: {evt.index[0]}, Col: {evt.index[1]}, Value: {evt.value}")
|
| 117 |
|
| 118 |
# If we selected a "rerun" cell, we relaunch a job
|
| 119 |
+
if evt.index[1] == 11:
|
| 120 |
# Get the full row data from the dataframe
|
| 121 |
df = get_results_table()
|
| 122 |
row_data = df.data.iloc[evt.index[0]]
|
| 123 |
|
| 124 |
model = row_data['Model']
|
| 125 |
provider = row_data['Provider']
|
| 126 |
+
print(f"[Relaunch] Relaunching {globals.NUM_RUNS_PER_JOB} jobs - Model: {model}, Provider: {provider}")
|
| 127 |
|
| 128 |
+
run_multiple_jobs(model, provider, globals.TASKS)
|
| 129 |
+
# Save after relaunch
|
| 130 |
save_results()
|
| 131 |
|
| 132 |
# Then update the table and stats
|
globals.py
CHANGED
|
@@ -4,7 +4,7 @@ import threading
|
|
| 4 |
from typing import Dict, Any, Optional
|
| 5 |
|
| 6 |
# Type definition for job result entries
|
| 7 |
-
JobResult = Dict[str, Any] # {model, provider, last_run, status, current_score, previous_score, job_id, start_time, duration, completed_at}
|
| 8 |
|
| 9 |
# Global variables to track jobs
|
| 10 |
job_results: Dict[str, JobResult] = {} # {model_provider_key: JobResult}
|
|
@@ -12,6 +12,7 @@ results_lock: threading.Lock = threading.Lock()
|
|
| 12 |
|
| 13 |
# Configuration
|
| 14 |
NUM_MODELS_RUN: int = 100
|
|
|
|
| 15 |
RESULTS_DATASET_NAME: str = "IPTesting/inference-provider-test-results"
|
| 16 |
LOCAL_CONFIG_FILE: str = "/home/user/app/model_providers.txt"
|
| 17 |
TASKS: str = "extended|ifeval|0,lighteval|gsm_plus|0,lighteval|gpqa:diamond|0"
|
|
|
|
| 4 |
from typing import Dict, Any, Optional
|
| 5 |
|
| 6 |
# Type definition for job result entries
|
| 7 |
+
JobResult = Dict[str, Any] # {model, provider, last_run, status, current_score, previous_score, job_id, start_time, duration, completed_at, runs: [{job_id, score, status, start_time, duration, completed_at}]}
|
| 8 |
|
| 9 |
# Global variables to track jobs
|
| 10 |
job_results: Dict[str, JobResult] = {} # {model_provider_key: JobResult}
|
|
|
|
| 12 |
|
| 13 |
# Configuration
|
| 14 |
NUM_MODELS_RUN: int = 100
|
| 15 |
+
NUM_RUNS_PER_JOB: int = 4 # Number of times to run each job for variance reduction
|
| 16 |
RESULTS_DATASET_NAME: str = "IPTesting/inference-provider-test-results"
|
| 17 |
LOCAL_CONFIG_FILE: str = "/home/user/app/model_providers.txt"
|
| 18 |
TASKS: str = "extended|ifeval|0,lighteval|gsm_plus|0,lighteval|gpqa:diamond|0"
|