import json
import os
import re
import traceback
from collections import defaultdict
from datetime import datetime, timezone, timedelta

import backoff
import duckdb
import requests
import requests.exceptions
from dotenv import load_dotenv
from huggingface_hub import HfApi, hf_hub_download
from huggingface_hub.errors import HfHubHTTPError

load_dotenv(override=True)


# =============================================================================
# Configuration
# =============================================================================

SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
BASE_DIR = os.path.dirname(SCRIPT_DIR)

AGENTS_REPO = "SWE-Arena/bot_data"
AGENTS_REPO_LOCAL_PATH = os.path.join(BASE_DIR, "bot_data")
DUCKDB_CACHE_FILE = os.path.join(SCRIPT_DIR, "cache.duckdb")
GHARCHIVE_DATA_LOCAL_PATH = os.path.join(BASE_DIR, "gharchive/data")
LEADERBOARD_FILENAME = f"{os.getenv('COMPOSE_PROJECT_NAME')}.json"
LEADERBOARD_REPO = "SWE-Arena/leaderboard_data"
LEADERBOARD_TIME_FRAME_DAYS = 180
LONGSTANDING_GAP_DAYS = 30

# GitHub organizations whose repositories are scanned for wanted issues
# and discussions.
TRACKED_ORGS = [
    "apache",
    "github",
    "huggingface",
]

# Issue labels that mark an issue as "patch wanted".
PATCH_WANTED_LABELS = [
    "bug",
    "enhancement",
]

# Timeout (seconds) for git sync operations.
GIT_SYNC_TIMEOUT = 300

# Days of GHArchive data processed per DuckDB batch.
BATCH_SIZE_DAYS = 1

# Maximum retry attempts for HuggingFace Hub and HTTP operations.
MAX_RETRIES = 5


# =============================================================================
# JSONL and date utilities
# =============================================================================

def load_jsonl(filename):
    """Load JSONL file and return list of dictionaries."""
    if not os.path.exists(filename):
        return []

    data = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    print(f"Warning: Skipping invalid JSON line: {e}")
    return data


def save_jsonl(filename, data):
    """Save list of dictionaries to JSONL file."""
    with open(filename, 'w', encoding='utf-8') as f:
        for item in data:
            f.write(json.dumps(item) + '\n')
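
# A minimal usage sketch (hypothetical file name):
#   save_jsonl('issues.jsonl', [{'url': 'https://github.com/org/repo/issues/1'}])
#   load_jsonl('issues.jsonl')  # -> [{'url': 'https://github.com/org/repo/issues/1'}]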


def normalize_date_format(date_string):
    """Convert date strings or datetime objects to standardized ISO 8601 format with Z suffix."""
    if not date_string or date_string == 'N/A':
        return 'N/A'

    try:
        if isinstance(date_string, datetime):
            dt = date_string
            if dt.tzinfo is not None:
                dt = dt.astimezone(timezone.utc)
            return dt.strftime('%Y-%m-%dT%H:%M:%SZ')

        # Collapse internal whitespace and use 'T' as the date/time separator.
        date_string = re.sub(r'\s+', ' ', date_string.strip())
        date_string = date_string.replace(' ', 'T')

        # Complete short UTC offsets such as '+05' to '+05:00' so that
        # datetime.fromisoformat() accepts them (timestamps only, not bare dates).
        if 'T' in date_string and len(date_string) >= 3:
            if date_string[-3:-2] in ('+', '-') and ':' not in date_string[-3:]:
                date_string = date_string + ':00'

        dt = datetime.fromisoformat(date_string.replace('Z', '+00:00'))
        if dt.tzinfo is not None:
            dt = dt.astimezone(timezone.utc)
        return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
    except Exception as e:
        print(f"Warning: Could not parse date '{date_string}': {e}")
        return date_string
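
# Example: normalize_date_format('2024-01-15 10:30:00+05') -> '2024-01-15T05:30:00Z'
# (whitespace collapsed, short offset completed, then converted to UTC).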


def get_hf_token():
    """Get HuggingFace token from environment variables."""
    token = os.getenv('HF_TOKEN')
    if not token:
        print("Warning: HF_TOKEN not found in environment variables")
    return token


# =============================================================================
# GHArchive download
# =============================================================================

def download_file(url):
    """Download a GHArchive file with a single attempt."""
    filename = url.split("/")[-1]
    filepath = os.path.join(GHARCHIVE_DATA_LOCAL_PATH, filename)

    if os.path.exists(filepath):
        return True

    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        with open(filepath, "wb") as f:
            f.write(response.content)
        return True
    except Exception as e:
        print(f" ⚠ {filename}: {e}")
        return False
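
# GHArchive publishes one gzipped JSON file per UTC hour, e.g.
# https://data.gharchive.org/2024-01-15-0.json.gz (the hour is not zero-padded).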


def download_all_gharchive_data():
    """Download all GHArchive data files for the last LEADERBOARD_TIME_FRAME_DAYS."""
    os.makedirs(GHARCHIVE_DATA_LOCAL_PATH, exist_ok=True)

    end_date = datetime.now(timezone.utc)
    start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)

    urls = []
    current_date = start_date
    while current_date <= end_date:
        date_str = current_date.strftime("%Y-%m-%d")
        for hour in range(24):
            urls.append(f"https://data.gharchive.org/{date_str}-{hour}.json.gz")
        current_date += timedelta(days=1)

    # Hours of the current day that are still in the future will 404; the
    # caller treats the overall result as best-effort.
    success = True
    for url in urls:
        if not download_file(url):
            success = False

    return success


# =============================================================================
# Retry helpers
# =============================================================================

def is_retryable_error(e):
    """Check if exception is retryable (rate limit or timeout error)."""
    if isinstance(e, HfHubHTTPError):
        if e.response is not None and e.response.status_code == 429:
            return True

    if isinstance(e, (requests.exceptions.Timeout,
                      requests.exceptions.ReadTimeout,
                      requests.exceptions.ConnectTimeout)):
        return True

    # Fall back to string matching for timeout errors raised elsewhere.
    error_str = str(e).lower()
    if 'timeout' in error_str or 'timed out' in error_str:
        return True

    return False
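
# Example: is_retryable_error(requests.exceptions.ConnectTimeout()) -> True,
# while a plain ValueError('bad input') -> False (no retry).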


# Shared backoff settings for all Hub/network wrappers. Catching Exception is
# safe here because ``giveup`` aborts immediately for anything that
# is_retryable_error() does not recognize as transient.
RETRY_KWARGS = dict(
    max_tries=MAX_RETRIES,
    base=300,
    max_value=3600,
    giveup=lambda e: not is_retryable_error(e),
    on_backoff=lambda details: print(
        f" {details['exception']} error. Retrying in {details['wait']/60:.1f} minutes "
        f"({details['wait']:.0f}s) - attempt {details['tries']}/{MAX_RETRIES}..."
    ),
)


@backoff.on_exception(backoff.expo, Exception, **RETRY_KWARGS)
def list_repo_files_with_backoff(api, **kwargs):
    """Wrapper for api.list_repo_files() with exponential backoff."""
    return api.list_repo_files(**kwargs)


@backoff.on_exception(backoff.expo, Exception, **RETRY_KWARGS)
def hf_hub_download_with_backoff(**kwargs):
    """Wrapper for hf_hub_download() with exponential backoff."""
    return hf_hub_download(**kwargs)


@backoff.on_exception(backoff.expo, Exception, **RETRY_KWARGS)
def upload_file_with_backoff(api, **kwargs):
    """Wrapper for api.upload_file() with exponential backoff."""
    return api.upload_file(**kwargs)


@backoff.on_exception(backoff.expo, Exception, **RETRY_KWARGS)
def upload_folder_with_backoff(api, **kwargs):
    """Wrapper for api.upload_folder() with exponential backoff."""
    return api.upload_folder(**kwargs)
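
# Usage sketch (assumes a valid HF token in the environment):
#   api = HfApi(token=get_hf_token())
#   files = list_repo_files_with_backoff(api, repo_id=LEADERBOARD_REPO, repo_type="dataset")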


def get_duckdb_connection():
    """
    Initialize a DuckDB connection with reduced memory settings.
    Uses a persistent database file and automatically removes it when a lock
    conflict from a previous crashed run is detected.
    """
    try:
        conn = duckdb.connect(DUCKDB_CACHE_FILE)
    except Exception as e:
        # A stale lock blocks the connection; remove the cache file and retry once.
        error_msg = str(e)
        if "lock" in error_msg.lower() or "conflicting" in error_msg.lower():
            print(f" ⚠ Lock conflict detected, removing {DUCKDB_CACHE_FILE}...")
            if os.path.exists(DUCKDB_CACHE_FILE):
                os.remove(DUCKDB_CACHE_FILE)
            print(" ✓ Cache file removed, retrying connection...")
            conn = duckdb.connect(DUCKDB_CACHE_FILE)
        else:
            raise

    # Resource limits for the large GHArchive scans.
    conn.execute("SET threads TO 6;")
    conn.execute("SET max_memory = '50GB';")
    conn.execute("SET temp_directory = '/tmp/duckdb_temp';")

    # Trade insertion order for throughput and cache parsed objects.
    conn.execute("SET preserve_insertion_order = false;")
    conn.execute("SET enable_object_cache = true;")

    return conn
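
# Typical lifecycle: conn = get_duckdb_connection(); ... conn.close().
# The cache file itself is deleted at the end of mine_all_agents().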


def generate_file_path_patterns(start_date, end_date, data_dir=GHARCHIVE_DATA_LOCAL_PATH):
    """Generate file path patterns for GHArchive data in date range (only existing files)."""
    file_patterns = []
    missing_dates = set()

    current_date = start_date.replace(hour=0, minute=0, second=0, microsecond=0)
    end_day = end_date.replace(hour=0, minute=0, second=0, microsecond=0)

    while current_date <= end_day:
        date_has_files = False
        for hour in range(24):
            pattern = os.path.join(data_dir, f"{current_date.strftime('%Y-%m-%d')}-{hour}.json.gz")
            if os.path.exists(pattern):
                file_patterns.append(pattern)
                date_has_files = True

        if not date_has_files:
            missing_dates.add(current_date.strftime('%Y-%m-%d'))

        current_date += timedelta(days=1)

    if missing_dates:
        print(f" ⚠ Skipping {len(missing_dates)} date(s) with no data")

    return file_patterns
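
# Example: a single fully-downloaded day yields 24 paths,
#   .../gharchive/data/2024-01-15-0.json.gz ... .../gharchive/data/2024-01-15-23.json.gz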


# =============================================================================
# Streaming metadata mining
# =============================================================================

def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
    """
    Fetch issue and discussion metadata using streaming batch processing over:
    - IssuesEvent, IssueCommentEvent (for assistant-assigned issues AND wanted issues)
    - PullRequestEvent (for wanted issue tracking)
    - DiscussionEvent (for discussion tracking)

    Args:
        conn: DuckDB connection instance
        identifiers: List of GitHub usernames/bot identifiers
        start_date: Start datetime (timezone-aware)
        end_date: End datetime (timezone-aware)

    Returns:
        Dictionary with five keys:
        - 'agent_issues': {agent_id: [issue_metadata]} for assistant-assigned issues
        - 'wanted_open': [open_wanted_issues] for long-standing open issues
        - 'wanted_resolved': {agent_id: [resolved_wanted]} for resolved wanted issues
        - 'wanted_in_progress': {agent_id: [in_progress_wanted]} for wanted issues with unmerged PRs
        - 'agent_discussions': {agent_id: [discussion_metadata]} for assistant discussions
    """
    identifier_set = set(identifiers)
    identifier_list = ', '.join([f"'{ident}'" for ident in identifiers])
    tracked_orgs_list = ', '.join([f"'{org}'" for org in TRACKED_ORGS])

    # Assistant-assigned issues, deduplicated per assistant by URL.
    agent_issues = defaultdict(list)
    agent_issue_urls = defaultdict(set)

    # Wanted-issue state accumulated across batches.
    all_issues = {}
    issue_to_prs = defaultdict(set)
    pr_creators = {}
    pr_merged_at = {}

    # Discussions grouped per assistant.
    discussions_by_agent = defaultdict(list)

    # Process the window in fixed-size batches to bound memory usage.
    total_days = (end_date - start_date).days
    total_batches = (total_days // BATCH_SIZE_DAYS) + 1

    current_date = start_date
    batch_num = 0

    print(f" Streaming {total_batches} batches of {BATCH_SIZE_DAYS}-day intervals...")

    while current_date <= end_date:
        batch_num += 1
        batch_end = min(current_date + timedelta(days=BATCH_SIZE_DAYS - 1), end_date)

        file_patterns = generate_file_path_patterns(current_date, batch_end)

        if not file_patterns:
            print(f" Batch {batch_num}/{total_batches}: {current_date.date()} to {batch_end.date()} - NO DATA")
            current_date = batch_end + timedelta(days=1)
            continue

        print(f" Batch {batch_num}/{total_batches}: {current_date.date()} to {batch_end.date()} ({len(file_patterns)} files)... ", end="", flush=True)

        file_patterns_sql = '[' + ', '.join([f"'{fp}'" for fp in file_patterns]) + ']'

        try:
            unified_query = f"""
                SELECT
                    type,
                    json_extract_string(repo, '$.name') as repo_name,
                    json_extract_string(repo, '$.url') as repo_url,
                    -- Issue fields
                    json_extract_string(payload, '$.issue.html_url') as issue_url,
                    json_extract_string(payload, '$.issue.title') as issue_title,
                    json_extract_string(payload, '$.issue.number') as issue_number,
                    json_extract_string(payload, '$.issue.created_at') as issue_created_at,
                    json_extract_string(payload, '$.issue.closed_at') as issue_closed_at,
                    json_extract(payload, '$.issue.labels') as issue_labels,
                    json_extract_string(payload, '$.issue.pull_request') as is_pull_request,
                    json_extract_string(payload, '$.issue.state_reason') as issue_state_reason,
                    -- Actor/assignee fields for assistant assignment
                    json_extract_string(payload, '$.issue.user.login') as issue_creator,
                    json_extract_string(payload, '$.issue.assignee.login') as issue_assignee,
                    json_extract(payload, '$.issue.assignees') as issue_assignees,
                    json_extract_string(payload, '$.comment.user.login') as commenter,
                    -- PR fields - simplified with COALESCE
                    COALESCE(
                        json_extract_string(payload, '$.issue.html_url'),
                        json_extract_string(payload, '$.pull_request.html_url')
                    ) as pr_url,
                    COALESCE(
                        json_extract_string(payload, '$.issue.user.login'),
                        json_extract_string(payload, '$.pull_request.user.login')
                    ) as pr_creator,
                    COALESCE(
                        json_extract_string(payload, '$.issue.pull_request.merged_at'),
                        json_extract_string(payload, '$.pull_request.merged_at')
                    ) as pr_merged_at,
                    COALESCE(
                        json_extract_string(payload, '$.issue.body'),
                        json_extract_string(payload, '$.pull_request.body')
                    ) as pr_body,
                    -- Discussion fields
                    json_extract_string(payload, '$.discussion.html_url') as discussion_url,
                    json_extract_string(payload, '$.discussion.user.login') as discussion_creator,
                    json_extract_string(payload, '$.discussion.created_at') as discussion_created_at,
                    json_extract_string(payload, '$.discussion.answer_chosen_at') as discussion_closed_at,
                    json_extract_string(payload, '$.discussion.state_reason') as discussion_state_reason,
                    json_extract_string(payload, '$.action') as action
                FROM read_json(
                    {file_patterns_sql},
                    union_by_name=true,
                    filename=true,
                    compression='gzip',
                    format='newline_delimited',
                    ignore_errors=true
                )
                WHERE
                    type IN ('IssuesEvent', 'IssueCommentEvent', 'PullRequestEvent', 'DiscussionEvent')
                    AND (
                        -- Assistant-assigned issues: assistant is creator, assignee, or commenter
                        (type = 'IssuesEvent' AND (
                            json_extract_string(payload, '$.issue.user.login') IN ({identifier_list})
                            OR json_extract_string(payload, '$.issue.assignee.login') IN ({identifier_list})
                            OR SPLIT_PART(json_extract_string(repo, '$.name'), '/', 1) IN ({tracked_orgs_list})
                        ))
                        -- Issue comments: assistant is commenter OR tracked org
                        OR (type = 'IssueCommentEvent' AND (
                            json_extract_string(payload, '$.comment.user.login') IN ({identifier_list})
                            OR SPLIT_PART(json_extract_string(repo, '$.name'), '/', 1) IN ({tracked_orgs_list})
                        ))
                        -- PRs: assistant is creator OR tracked org (for wanted issue tracking)
                        OR (type = 'PullRequestEvent' AND (
                            json_extract_string(payload, '$.pull_request.user.login') IN ({identifier_list})
                            OR SPLIT_PART(json_extract_string(repo, '$.name'), '/', 1) IN ({tracked_orgs_list})
                        ))
                        -- Discussions: assistant is creator AND tracked org
                        OR (type = 'DiscussionEvent'
                            AND json_extract_string(payload, '$.discussion.user.login') IN ({identifier_list})
                            AND SPLIT_PART(json_extract_string(repo, '$.name'), '/', 1) IN ({tracked_orgs_list})
                        )
                    )
            """

            all_results = conn.execute(unified_query).fetchall()
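
            # Column index map for rows returned by the unified query:
            #   0 type, 1 repo_name, 2 repo_url,
            #   3 issue_url, 4 issue_title, 5 issue_number, 6 issue_created_at,
            #   7 issue_closed_at, 8 issue_labels, 9 is_pull_request, 10 issue_state_reason,
            #   11 issue_creator, 12 issue_assignee, 13 issue_assignees, 14 commenter,
            #   15 pr_url, 16 pr_creator, 17 pr_merged_at, 18 pr_body,
            #   19 discussion_url, 20 discussion_creator, 21 discussion_created_at,
            #   22 discussion_closed_at, 23 discussion_state_reason, 24 action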

            issue_events = []
            pr_events = []
            discussion_events = []
            agent_issue_events = []

            for row in all_results:
                event_type = row[0]
                is_pr = row[9]

                if event_type in ('IssuesEvent', 'IssueCommentEvent'):
                    if not is_pr:
                        # Plain issue: check whether an assistant is involved.
                        issue_creator = row[11]
                        issue_assignee = row[12]
                        issue_assignees_json = row[13]
                        commenter = row[14]

                        agent_identifier = None

                        if event_type == 'IssuesEvent':
                            # Creator or single assignee match first.
                            if issue_creator in identifier_set:
                                agent_identifier = issue_creator
                            elif issue_assignee in identifier_set:
                                agent_identifier = issue_assignee
                            else:
                                # Otherwise scan the assignees array.
                                try:
                                    if issue_assignees_json:
                                        if isinstance(issue_assignees_json, str):
                                            assignees_data = json.loads(issue_assignees_json)
                                        else:
                                            assignees_data = issue_assignees_json

                                        if isinstance(assignees_data, list):
                                            for assignee_obj in assignees_data:
                                                if isinstance(assignee_obj, dict):
                                                    assignee_login = assignee_obj.get('login')
                                                    if assignee_login in identifier_set:
                                                        agent_identifier = assignee_login
                                                        break
                                except (json.JSONDecodeError, TypeError):
                                    pass

                        elif event_type == 'IssueCommentEvent':
                            if commenter in identifier_set:
                                agent_identifier = commenter

                        if agent_identifier:
                            agent_issue_events.append((row, agent_identifier))

                        # Every plain issue also feeds wanted-issue tracking.
                        issue_events.append(row)
                    else:
                        # IssueCommentEvent on a PR: treat as PR activity.
                        pr_events.append(row)

                elif event_type == 'PullRequestEvent':
                    pr_events.append(row)

                elif event_type == 'DiscussionEvent':
                    discussion_events.append(row)

            # Build assistant-assigned issue metadata.
            for row, agent_identifier in agent_issue_events:
                repo_url = row[2]
                issue_url = row[3]
                created_at = row[6]
                closed_at = row[7]
                state_reason = row[10]

                if not issue_url or not agent_identifier:
                    continue

                # Reconstruct the issue URL when the payload lacks an html_url.
                if repo_url and '/issues/' not in issue_url:
                    issue_number = row[5]
                    full_url = f"{repo_url.replace('api.github.com/repos/', 'github.com/')}/issues/{issue_number}"
                else:
                    full_url = issue_url

                # Only keep issues created inside the leaderboard window.
                if created_at:
                    try:
                        created_dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
                        if created_dt < start_date:
                            continue
                    except Exception:
                        continue

                # Deduplicate per assistant across batches.
                if full_url in agent_issue_urls[agent_identifier]:
                    continue

                agent_issue_urls[agent_identifier].add(full_url)

                issue_metadata = {
                    'url': full_url,
                    'created_at': normalize_date_format(created_at),
                    'closed_at': normalize_date_format(closed_at) if closed_at else None,
                    'state_reason': state_reason,
                }

                agent_issues[agent_identifier].append(issue_metadata)

            # Track wanted issues in tracked orgs.
            for row in issue_events:
                repo_name = row[1]
                issue_url = row[3]
                title = row[4]
                issue_number = row[5]
                created_at = row[6]
                closed_at = row[7]
                labels_json = row[8]

                if not issue_url or not repo_name:
                    continue

                parts = repo_name.split('/')
                if len(parts) != 2:
                    continue
                org = parts[0]

                if org not in TRACKED_ORGS:
                    continue

                # Parse the labels array defensively.
                try:
                    if isinstance(labels_json, str):
                        labels_data = json.loads(labels_json)
                    else:
                        labels_data = labels_json

                    if not isinstance(labels_data, list):
                        label_names = []
                    else:
                        label_names = [label.get('name', '').lower() for label in labels_data if isinstance(label, dict)]

                except (json.JSONDecodeError, TypeError):
                    label_names = []

                normalized_closed_at = normalize_date_format(closed_at) if closed_at else None
                state = 'closed' if (normalized_closed_at and normalized_closed_at != 'N/A') else 'open'

                # Later events for the same issue overwrite earlier state.
                all_issues[issue_url] = {
                    'url': issue_url,
                    'repo': repo_name,
                    'title': title,
                    'number': issue_number,
                    'state': state,
                    'created_at': normalize_date_format(created_at),
                    'closed_at': normalized_closed_at,
                    'labels': label_names
                }

            # Link PRs to the issues they reference.
            for row in pr_events:
                pr_url = row[15]
                pr_creator = row[16]
                merged_at = row[17]
                pr_body = row[18]

                if not pr_url or not pr_creator:
                    continue

                pr_creators[pr_url] = pr_creator
                pr_merged_at[pr_url] = merged_at

                if pr_body:
                    # Match full issue URLs and short '#123' references,
                    # e.g. 'Fixes https://github.com/org/repo/issues/42' or 'Fixes #42'.
                    issue_refs = re.findall(r'(?:https?://github\.com/[\w-]+/[\w-]+/issues/\d+)|(?:#\d+)', pr_body, re.IGNORECASE)

                    for ref in issue_refs:
                        if ref.startswith('#'):
                            # Resolve '#123' against the PR's own repository.
                            pr_parts = pr_url.split('/')
                            if len(pr_parts) >= 5:
                                org = pr_parts[-4]
                                repo = pr_parts[-3]
                                issue_num = ref[1:]
                                issue_url = f"https://github.com/{org}/{repo}/issues/{issue_num}"
                                issue_to_prs[issue_url].add(pr_url)
                        else:
                            issue_to_prs[ref].add(pr_url)

            # Build discussion metadata for assistant-created discussions.
            for row in discussion_events:
                repo_name = row[1]
                discussion_url = row[19]
                discussion_creator = row[20]
                discussion_created_at = row[21]
                discussion_closed_at = row[22]
                discussion_state_reason = row[23]

                if not discussion_url or not repo_name:
                    continue

                parts = repo_name.split('/')
                if len(parts) != 2:
                    continue
                org = parts[0]

                if org not in TRACKED_ORGS:
                    continue

                # Only keep discussions created inside the leaderboard window.
                if discussion_created_at:
                    try:
                        created_dt = datetime.fromisoformat(discussion_created_at.replace('Z', '+00:00'))
                        if created_dt < start_date:
                            continue
                    except Exception:
                        continue

                if discussion_creator not in identifier_set:
                    continue

                # A discussion counts as resolved once an answer is chosen or
                # its state_reason marks it as answered.
                is_resolved = False
                if discussion_closed_at:
                    is_resolved = True
                elif discussion_state_reason and 'answered' in discussion_state_reason.lower():
                    is_resolved = True

                discussion_meta = {
                    'url': discussion_url,
                    'repo': repo_name,
                    'created_at': normalize_date_format(discussion_created_at),
                    'closed_at': normalize_date_format(discussion_closed_at) if discussion_closed_at else None,
                    'state': 'resolved' if is_resolved else 'open',
                    'state_reason': discussion_state_reason
                }

                discussions_by_agent[discussion_creator].append(discussion_meta)

            print(f"✓ {len(agent_issue_events)} assistant issues, {len(issue_events)} wanted issues, {len(pr_events)} PRs, {len(discussion_events)} discussions")

        except Exception as e:
            print(f"\n ✗ Batch {batch_num} error: {str(e)}")
            traceback.print_exc()

        # Advance to the next batch window.
        current_date = batch_end + timedelta(days=1)

    print(f"\n Post-processing {len(all_issues)} wanted issues...")

    wanted_open = []
    wanted_resolved = defaultdict(list)
    wanted_in_progress = defaultdict(list)
    current_time = datetime.now(timezone.utc)

    for issue_url, issue_meta in all_issues.items():
        # Only issues with at least one linked PR are candidates.
        linked_prs = issue_to_prs.get(issue_url, set())
        if not linked_prs:
            continue

        # Partition the assistants that opened linked PRs by merge status.
        agents_with_merged_pr = set()
        agents_with_unmerged_pr = set()

        for pr_url in linked_prs:
            pr_creator = pr_creators.get(pr_url)
            if pr_creator not in identifier_set:
                continue

            merged_at = pr_merged_at.get(pr_url)
            if merged_at:
                agents_with_merged_pr.add(pr_creator)
            else:
                agents_with_unmerged_pr.add(pr_creator)

        # Skip issues with no assistant-authored PRs at all.
        if not agents_with_merged_pr and not agents_with_unmerged_pr:
            continue

        # Require at least one patch-wanted label (substring match).
        issue_labels = issue_meta.get('labels', [])
        has_patch_label = False
        for issue_label in issue_labels:
            for wanted_label in PATCH_WANTED_LABELS:
                if wanted_label.lower() in issue_label:
                    has_patch_label = True
                    break
            if has_patch_label:
                break

        if not has_patch_label:
            continue

        created_at_str = issue_meta.get('created_at')
        if not created_at_str or created_at_str == 'N/A':
            continue

        try:
            created_dt = datetime.fromisoformat(created_at_str.replace('Z', '+00:00'))
        except Exception:
            continue

        if issue_meta['state'] == 'open':
            # Open issues must be long-standing to count as wanted.
            days_open = (current_time - created_dt).days
            if days_open < LONGSTANDING_GAP_DAYS:
                continue

            wanted_open.append(issue_meta)

            # Assistants with only unmerged PRs are "in progress" on this issue.
            for agent_id in agents_with_unmerged_pr - agents_with_merged_pr:
                wanted_in_progress[agent_id].append(issue_meta)

        elif issue_meta['state'] == 'closed':
            closed_at_str = issue_meta.get('closed_at')
            if not closed_at_str or closed_at_str == 'N/A':
                continue

            try:
                closed_dt = datetime.fromisoformat(closed_at_str.replace('Z', '+00:00'))
                days_open = (closed_dt - created_dt).days

                # Credit assistants whose merged PR closed a long-standing issue
                # within the leaderboard window.
                if start_date <= closed_dt <= end_date and days_open >= LONGSTANDING_GAP_DAYS:
                    for agent_id in agents_with_merged_pr:
                        wanted_resolved[agent_id].append(issue_meta)
            except Exception:
                pass

    print(f" ✓ Found {sum(len(issues) for issues in agent_issues.values())} assistant-assigned issues across {len(agent_issues)} assistants")
    print(f" ✓ Found {len(wanted_open)} long-standing open wanted issues")
    print(f" ✓ Found {sum(len(issues) for issues in wanted_resolved.values())} resolved wanted issues across {len(wanted_resolved)} assistants")
    print(f" ✓ Found {sum(len(issues) for issues in wanted_in_progress.values())} in-progress wanted issues across {len(wanted_in_progress)} assistants")
    print(f" ✓ Found {sum(len(discussions) for discussions in discussions_by_agent.values())} discussions across {len(discussions_by_agent)} assistants")

    return {
        'agent_issues': dict(agent_issues),
        'wanted_open': wanted_open,
        'wanted_resolved': dict(wanted_resolved),
        'wanted_in_progress': dict(wanted_in_progress),
        'agent_discussions': dict(discussions_by_agent)
    }
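
# Shape sketch of the returned structure (values illustrative):
#   {'agent_issues': {'some-bot': [{'url': ..., 'created_at': ..., 'closed_at': ..., 'state_reason': ...}]},
#    'wanted_open': [...], 'wanted_resolved': {...}, 'wanted_in_progress': {...},
#    'agent_discussions': {...}}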


def load_agents_from_hf():
    """
    Load all assistant metadata JSON files from the local git repository.
    """
    assistants = []

    if not os.path.exists(AGENTS_REPO_LOCAL_PATH):
        raise FileNotFoundError(f"Local repository not found at {AGENTS_REPO_LOCAL_PATH}")

    files_processed = 0
    print(f" Loading assistant metadata from {AGENTS_REPO_LOCAL_PATH}...")

    for root, dirs, files in os.walk(AGENTS_REPO_LOCAL_PATH):
        # Prune .git so os.walk does not descend into it.
        dirs[:] = [d for d in dirs if d != '.git']

        for filename in files:
            if not filename.endswith('.json'):
                continue

            files_processed += 1
            file_path = os.path.join(root, filename)

            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    agent_data = json.load(f)

                # Only active assistants are ranked.
                if agent_data.get('status') != 'active':
                    continue

                # The file name (sans extension) is the GitHub identifier.
                github_identifier = os.path.splitext(filename)[0]
                agent_data['github_identifier'] = github_identifier

                assistants.append(agent_data)

            except Exception as e:
                print(f" ⚠ Error loading {filename}: {str(e)}")
                continue

    print(f" ✓ Loaded {len(assistants)} active assistants (from {files_processed} total files)")
    return assistants


def calculate_issue_stats_from_metadata(metadata_list):
    """Calculate statistics from a list of issue metadata."""
    total_issues = len(metadata_list)
    closed = sum(1 for issue_meta in metadata_list if issue_meta.get('closed_at'))
    resolved = sum(1 for issue_meta in metadata_list
                   if issue_meta.get('state_reason') == 'completed')

    # Resolved rate is measured against closed issues, not all issues.
    resolved_rate = (resolved / closed * 100) if closed > 0 else 0

    return {
        'total_issues': total_issues,
        'closed_issues': closed,
        'resolved_issues': resolved,
        'resolved_rate': round(resolved_rate, 2),
    }
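
# Example: two closed issues, one with state_reason == 'completed'
#   -> {'total_issues': 2, 'closed_issues': 2, 'resolved_issues': 1, 'resolved_rate': 50.0}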


def calculate_monthly_metrics_by_issues(all_metadata_dict, assistants):
    """Calculate monthly metrics for all assistants for visualization."""
    identifier_to_name = {assistant.get('github_identifier'): assistant.get('name') for assistant in assistants if assistant.get('github_identifier')}

    if not all_metadata_dict:
        return {'assistants': [], 'months': [], 'data': {}}

    # Bucket each assistant's issues by creation month.
    agent_month_data = defaultdict(lambda: defaultdict(list))

    for agent_identifier, metadata_list in all_metadata_dict.items():
        for issue_meta in metadata_list:
            created_at = issue_meta.get('created_at')

            if not created_at:
                continue

            agent_name = identifier_to_name.get(agent_identifier, agent_identifier)

            try:
                dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
                month_key = f"{dt.year}-{dt.month:02d}"
                agent_month_data[agent_name][month_key].append(issue_meta)
            except Exception as e:
                print(f"Warning: Could not parse date '{created_at}': {e}")
                continue

    all_months = set()
    for agent_data in agent_month_data.values():
        all_months.update(agent_data.keys())
    months = sorted(all_months)

    result_data = {}
    for agent_name, month_dict in agent_month_data.items():
        resolved_rates = []
        total_issues_list = []
        resolved_issues_list = []
        closed_issues_list = []

        for month in months:
            issues_in_month = month_dict.get(month, [])

            resolved_count = sum(1 for issue in issues_in_month if issue.get('state_reason') == 'completed')
            closed_count = sum(1 for issue in issues_in_month if issue.get('closed_at'))
            total_count = len(issues_in_month)

            # As above: resolved rate is relative to closed issues in the month.
            resolved_rate = (resolved_count / closed_count * 100) if closed_count > 0 else 0

            resolved_rates.append(resolved_rate)
            total_issues_list.append(total_count)
            resolved_issues_list.append(resolved_count)
            closed_issues_list.append(closed_count)

        result_data[agent_name] = {
            'resolved_rates': resolved_rates,
            'total_issues': total_issues_list,
            'resolved_issues': resolved_issues_list,
            'closed_issues': closed_issues_list
        }

    agents_list = sorted(agent_month_data.keys())

    return {
        'assistants': agents_list,
        'months': months,
        'data': result_data
    }
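
# Output shape sketch (values illustrative):
#   {'assistants': ['Bot A'], 'months': ['2024-01'],
#    'data': {'Bot A': {'resolved_rates': [50.0], 'total_issues': [4],
#                       'resolved_issues': [1], 'closed_issues': [2]}}}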


def calculate_discussion_stats_from_metadata(metadata_list):
    """Calculate statistics from a list of discussion metadata."""
    total_discussions = len(metadata_list)
    resolved = sum(1 for discussion_meta in metadata_list if discussion_meta.get('state') == 'resolved')

    # Unlike issues, the discussion rate is measured against all discussions.
    resolved_rate = (resolved / total_discussions * 100) if total_discussions > 0 else 0

    return {
        'total_discussions': total_discussions,
        'resolved_discussions': resolved,
        'discussion_resolved_rate': round(resolved_rate, 2),
    }
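
# Example: three discussions, one resolved
#   -> {'total_discussions': 3, 'resolved_discussions': 1, 'discussion_resolved_rate': 33.33}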


def calculate_monthly_metrics_by_discussions(all_discussions_dict, assistants):
    """Calculate monthly metrics for discussions for all assistants for visualization."""
    identifier_to_name = {assistant.get('github_identifier'): assistant.get('name') for assistant in assistants if assistant.get('github_identifier')}

    if not all_discussions_dict:
        return {'assistants': [], 'months': [], 'data': {}}

    agent_month_data = defaultdict(lambda: defaultdict(list))

    for agent_identifier, metadata_list in all_discussions_dict.items():
        for discussion_meta in metadata_list:
            created_at = discussion_meta.get('created_at')

            if not created_at:
                continue

            agent_name = identifier_to_name.get(agent_identifier, agent_identifier)

            try:
                dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
                month_key = f"{dt.year}-{dt.month:02d}"
                agent_month_data[agent_name][month_key].append(discussion_meta)
            except Exception as e:
                print(f"Warning: Could not parse discussion date '{created_at}': {e}")
                continue

    all_months = set()
    for agent_data in agent_month_data.values():
        all_months.update(agent_data.keys())
    months = sorted(all_months)

    result_data = {}
    for agent_name, month_dict in agent_month_data.items():
        resolved_rates = []
        total_discussions_list = []
        resolved_discussions_list = []

        for month in months:
            discussions_in_month = month_dict.get(month, [])

            resolved_count = sum(1 for discussion in discussions_in_month if discussion.get('state') == 'resolved')
            total_count = len(discussions_in_month)

            resolved_rate = (resolved_count / total_count * 100) if total_count > 0 else 0

            resolved_rates.append(resolved_rate)
            total_discussions_list.append(total_count)
            resolved_discussions_list.append(resolved_count)

        result_data[agent_name] = {
            'resolved_rates': resolved_rates,
            'total_discussions': total_discussions_list,
            'resolved_discussions': resolved_discussions_list
        }

    agents_list = sorted(agent_month_data.keys())

    return {
        'assistants': agents_list,
        'months': months,
        'data': result_data
    }


def calculate_monthly_metrics_by_wanted_issues(wanted_resolved_dict, wanted_in_progress_dict, assistants):
    """Calculate monthly metrics for wanted issues for all assistants for visualization."""
    identifier_to_name = {assistant.get('github_identifier'): assistant.get('name') for assistant in assistants if assistant.get('github_identifier')}

    if not wanted_resolved_dict and not wanted_in_progress_dict:
        return {'assistants': [], 'months': [], 'data': {}}

    # Clamp the reported months to the leaderboard window.
    current_time = datetime.now(timezone.utc)
    start_date = current_time - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
    start_month_key = f"{start_date.year}-{start_date.month:02d}"
    current_month_key = f"{current_time.year}-{current_time.month:02d}"

    agent_month_data = defaultdict(lambda: defaultdict(lambda: {'resolved': 0, 'in_progress': 0}))

    # Resolved wanted issues are bucketed by the month they were closed.
    for agent_identifier, metadata_list in (wanted_resolved_dict or {}).items():
        for issue_meta in metadata_list:
            closed_at = issue_meta.get('closed_at')

            if not closed_at or closed_at == 'N/A':
                continue

            agent_name = identifier_to_name.get(agent_identifier, agent_identifier)

            try:
                dt = datetime.fromisoformat(closed_at.replace('Z', '+00:00'))

                if dt < start_date:
                    continue

                month_key = f"{dt.year}-{dt.month:02d}"
                agent_month_data[agent_name][month_key]['resolved'] += 1
            except Exception as e:
                print(f"Warning: Could not parse wanted issue date '{closed_at}': {e}")
                continue

    # In-progress wanted issues are all attributed to the current month.
    for agent_identifier, metadata_list in (wanted_in_progress_dict or {}).items():
        agent_name = identifier_to_name.get(agent_identifier, agent_identifier)
        agent_month_data[agent_name][current_month_key]['in_progress'] += len(metadata_list)

    all_months = set()
    for agent_data in agent_month_data.values():
        all_months.update(agent_data.keys())

    months = sorted([m for m in all_months if m >= start_month_key])

    result_data = {}
    for agent_name, month_dict in agent_month_data.items():
        resolved_rates = []
        total_wanted_list = []
        resolved_wanted_list = []

        for month in months:
            month_data = month_dict.get(month, {'resolved': 0, 'in_progress': 0})
            resolved_count = month_data['resolved']
            in_progress_count = month_data['in_progress']
            total_count = resolved_count + in_progress_count

            resolved_rate = (resolved_count / total_count * 100) if total_count > 0 else 0

            resolved_rates.append(resolved_rate)
            total_wanted_list.append(total_count)
            resolved_wanted_list.append(resolved_count)

        result_data[agent_name] = {
            'resolved_rates': resolved_rates,
            'total_wanted': total_wanted_list,
            'resolved_wanted': resolved_wanted_list
        }

    agents_list = sorted(agent_month_data.keys())

    return {
        'assistants': agents_list,
        'months': months,
        'data': result_data
    }


def construct_leaderboard_from_metadata(all_metadata_dict, assistants, wanted_resolved_dict=None, wanted_in_progress_dict=None, discussions_dict=None):
    """Construct leaderboard from in-memory issue metadata and discussion metadata.

    Args:
        all_metadata_dict: Dictionary mapping assistant ID to list of issue metadata (assistant-assigned issues)
        assistants: List of assistant metadata
        wanted_resolved_dict: Optional dictionary mapping assistant ID to list of resolved wanted issues
        wanted_in_progress_dict: Optional dictionary mapping assistant ID to list of in-progress wanted issues
        discussions_dict: Optional dictionary mapping assistant ID to list of discussion metadata
    """
    if not assistants:
        print("Error: No assistants found")
        return {}

    if wanted_resolved_dict is None:
        wanted_resolved_dict = {}

    if wanted_in_progress_dict is None:
        wanted_in_progress_dict = {}

    if discussions_dict is None:
        discussions_dict = {}

    cache_dict = {}

    for assistant in assistants:
        identifier = assistant.get('github_identifier')
        agent_name = assistant.get('name', 'Unknown')

        bot_data = all_metadata_dict.get(identifier, [])
        stats = calculate_issue_stats_from_metadata(bot_data)

        # Wanted-issue counters and rate.
        resolved_wanted = len(wanted_resolved_dict.get(identifier, []))
        in_progress_wanted = len(wanted_in_progress_dict.get(identifier, []))
        total_wanted = resolved_wanted + in_progress_wanted
        wanted_resolved_rate = (resolved_wanted / total_wanted * 100) if total_wanted > 0 else 0.0

        # Discussion counters and rate.
        discussion_metadata = discussions_dict.get(identifier, [])
        discussion_stats = calculate_discussion_stats_from_metadata(discussion_metadata)

        cache_dict[identifier] = {
            'name': agent_name,
            'website': assistant.get('website', 'N/A'),
            'github_identifier': identifier,
            **stats,
            'total_wanted_issues': total_wanted,
            'resolved_wanted_issues': resolved_wanted,
            'wanted_issue_resolved_rate': round(wanted_resolved_rate, 2),
            **discussion_stats
        }

    return cache_dict
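
# Each leaderboard entry merges the three stat groups, e.g. (illustrative):
#   {'name': 'Bot A', 'website': 'N/A', 'github_identifier': 'bot-a',
#    'total_issues': ..., 'resolved_rate': ...,
#    'total_wanted_issues': ..., 'wanted_issue_resolved_rate': ...,
#    'total_discussions': ..., 'discussion_resolved_rate': ...}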


def save_leaderboard_data_to_hf(leaderboard_dict, issue_monthly_metrics, wanted_issues=None, discussion_monthly_metrics=None, wanted_issue_monthly_metrics=None):
    """Save leaderboard data, monthly metrics, wanted issues, and discussion metrics to HuggingFace dataset."""
    try:
        token = get_hf_token()
        if not token:
            raise Exception("No HuggingFace token found")

        api = HfApi(token=token)

        if wanted_issues is None:
            wanted_issues = []

        combined_data = {
            'metadata': {
                'last_updated': datetime.now(timezone.utc).isoformat(),
                'leaderboard_time_frame_days': LEADERBOARD_TIME_FRAME_DAYS,
                'longstanding_gap_days': LONGSTANDING_GAP_DAYS,
                'tracked_orgs': TRACKED_ORGS,
                'patch_wanted_labels': PATCH_WANTED_LABELS
            },
            'leaderboard': leaderboard_dict,
            'issue_monthly_metrics': issue_monthly_metrics,
            'wanted_issues': wanted_issues,
            'wanted_issue_monthly_metrics': wanted_issue_monthly_metrics,
            'discussion_monthly_metrics': discussion_monthly_metrics
        }

        with open(LEADERBOARD_FILENAME, 'w') as f:
            json.dump(combined_data, f, indent=2)

        # Upload, then remove the local copy regardless of the outcome.
        try:
            upload_file_with_backoff(
                api=api,
                path_or_fileobj=LEADERBOARD_FILENAME,
                path_in_repo=LEADERBOARD_FILENAME,
                repo_id=LEADERBOARD_REPO,
                repo_type="dataset"
            )
            return True
        finally:
            if os.path.exists(LEADERBOARD_FILENAME):
                os.remove(LEADERBOARD_FILENAME)

    except Exception as e:
        print(f"Error saving leaderboard data: {str(e)}")
        traceback.print_exc()
        return False


# =============================================================================
# Entry point
# =============================================================================

def mine_all_agents():
    """
    Mine issue metadata for all assistants using streaming batch processing.
    Downloads GHArchive data, then runs batch-based DuckDB queries.
    """
    print("\n[1/4] Downloading GHArchive data...")

    if not download_all_gharchive_data():
        print("Warning: Download had errors, continuing with available data...")

    print("\n[2/4] Loading assistant metadata...")

    assistants = load_agents_from_hf()
    if not assistants:
        print("Error: No assistants found")
        return

    identifiers = [assistant['github_identifier'] for assistant in assistants if assistant.get('github_identifier')]
    if not identifiers:
        print("Error: No valid assistant identifiers found")
        return

    print(f"\n[3/4] Mining issue metadata ({len(identifiers)} assistants, {LEADERBOARD_TIME_FRAME_DAYS} days)...")

    try:
        conn = get_duckdb_connection()
    except Exception as e:
        print(f"Failed to initialize DuckDB connection: {str(e)}")
        return

    current_time = datetime.now(timezone.utc)
    end_date = current_time.replace(hour=0, minute=0, second=0, microsecond=0)
    start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)

    try:
        results = fetch_all_metadata_streaming(
            conn, identifiers, start_date, end_date
        )

        agent_issues = results['agent_issues']
        wanted_open = results['wanted_open']
        wanted_resolved = results['wanted_resolved']
        wanted_in_progress = results['wanted_in_progress']
        agent_discussions = results['agent_discussions']
    except Exception as e:
        print(f"Error during DuckDB fetch: {str(e)}")
        traceback.print_exc()
        return
    finally:
        conn.close()

    print("\n[4/4] Saving leaderboard...")

    try:
        leaderboard_dict = construct_leaderboard_from_metadata(
            agent_issues, assistants, wanted_resolved, wanted_in_progress, agent_discussions
        )
        issue_monthly_metrics = calculate_monthly_metrics_by_issues(agent_issues, assistants)
        discussion_monthly_metrics = calculate_monthly_metrics_by_discussions(
            agent_discussions, assistants
        )
        wanted_issue_monthly_metrics = calculate_monthly_metrics_by_wanted_issues(
            wanted_resolved, wanted_in_progress, assistants
        )
        save_leaderboard_data_to_hf(
            leaderboard_dict, issue_monthly_metrics, wanted_open, discussion_monthly_metrics, wanted_issue_monthly_metrics
        )
    except Exception as e:
        print(f"Error saving leaderboard: {str(e)}")
        traceback.print_exc()
    finally:
        # The DuckDB cache is only useful within a single run; reclaim the space.
        if os.path.exists(DUCKDB_CACHE_FILE):
            try:
                os.remove(DUCKDB_CACHE_FILE)
                print(f" ✓ Cache file removed: {DUCKDB_CACHE_FILE}")
            except Exception as e:
                print(f" ⚠ Failed to remove cache file: {str(e)}")


if __name__ == "__main__":
    mine_all_agents()