# -*- coding: utf-8 -*-
"""
Wheel-Based Accuracy Benchmark - Similarity-aware emotion evaluation

Features:
- Uses the Emotion Wheel for similarity scoring
- Exact match = 1.0, same category = 0.8, adjacent = 0.5, distant = 0.2, opposite = 0.0
- Shows the detected emotion for each test
- Calculates a weighted accuracy based on similarity scores
- Distinguishes contradictory detections from near-miss mistakes

Version: 3.0.0
"""
import sys
import os
import time
from typing import Dict, List, Tuple, Any, Optional
from dataclasses import dataclass, field
from collections import defaultdict
from datetime import datetime

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from avatar.emotion_wheel import EmotionWheel, get_emotion_wheel, EmotionCategory
@dataclass
class WheelTestResult:
    """Result for a single test with wheel-based scoring"""
    emotion: str
    text: str
    expected_category: str
    detected_emotion: str
    detected_category: Optional[str]
    similarity_score: float
    relationship: str  # "exact", "same_category", "adjacent", "distant", "opposite"
    inference_time_ms: float
    emoji: str
@dataclass
class WheelBenchmarkResults:
    """Aggregated wheel-based benchmark results"""
    total_tests: int = 0
    # Weighted accuracy (using similarity scores)
    weighted_accuracy: float = 0.0
    # Traditional accuracy
    exact_matches: int = 0
    exact_accuracy: float = 0.0
    # Similarity-based metrics
    same_category_matches: int = 0
    adjacent_matches: int = 0
    distant_matches: int = 0
    opposite_matches: int = 0  # These are the real failures
    # Acceptable = exact + same_category + adjacent
    acceptable_accuracy: float = 0.0
    # Contradiction rate (opposite emotions)
    contradiction_rate: float = 0.0
    # Per-emotion results
    emotion_results: Dict[str, List[WheelTestResult]] = field(default_factory=dict)
    emotion_scores: Dict[str, float] = field(default_factory=dict)
    # Category performance
    category_accuracy: Dict[str, float] = field(default_factory=dict)
    # Timing
    avg_inference_time_ms: float = 0.0
    # Failed emotions (>50% opposite detections)
    failed_emotions: List[str] = field(default_factory=list)
    # Confusion matrix by category
    category_confusion: Dict[str, Dict[str, int]] = field(default_factory=dict)
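
# Illustrative arithmetic for the metrics above (hypothetical counts, not measured data):
# with 10 tests scoring 6 exact, 2 same-category, 1 adjacent, and 1 opposite,
#   weighted_accuracy   = (6*1.0 + 2*0.8 + 1*0.5 + 1*0.0) / 10 = 0.81
#   exact_accuracy      = 6 / 10 = 0.60
#   acceptable_accuracy = (6 + 2 + 1) / 10 = 0.90
#   contradiction_rate  = 1 / 10 = 0.10
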
class WheelBenchmark:
    """
    Wheel-based emotion benchmark with similarity scoring

    Uses EmotionWheel to determine:
    - Exact matches (same emotion)
    - Same category (similar emotions)
    - Adjacent category (related emotions)
    - Opposite category (contradictory emotions - failures)
    """
    def __init__(self, analyzer, emoji_mapper):
        """
        Initialize benchmark

        Args:
            analyzer: Sentiment analyzer instance
            emoji_mapper: EmojiMapper instance
        """
        self.analyzer = analyzer
        self.mapper = emoji_mapper
        self.wheel = get_emotion_wheel()

    def _extract_emotion(self, result: Dict[str, Any]) -> str:
        """Extract the emotion label from an analyzer result"""
        # Try the possible keys in order of preference
        for key in ["label", "emotion", "detected_emotion"]:
            if key in result:
                return result[key].lower()
        return "neutral"
    def run_single_test(self, text: str, expected_emotion: str, expected_category: str) -> WheelTestResult:
        """Run a single test with wheel-based scoring"""
        start_time = time.perf_counter()
        result = self.analyzer.analyze(text)
        end_time = time.perf_counter()
        inference_time_ms = (end_time - start_time) * 1000

        detected_emotion = self._extract_emotion(result)
        detected_category = self.wheel.get_category(detected_emotion)
        detected_cat_name = detected_category.value if detected_category else "unknown"

        # Get similarity score and relationship
        score, relationship = self.wheel.get_similarity_score(expected_emotion, detected_emotion)

        # Get emoji
        emoji = self.mapper.get_emoji(detected_emotion)

        return WheelTestResult(
            emotion=expected_emotion,
            text=text,
            expected_category=expected_category,
            detected_emotion=detected_emotion,
            detected_category=detected_cat_name,
            similarity_score=score,
            relationship=relationship,
            inference_time_ms=inference_time_ms,
            emoji=emoji,
        )
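
    # Hypothetical single-test sketch (the text, emotion, and category names here are
    # illustrative, not taken from the real test suite):
    #   bench = WheelBenchmark(analyzer, mapper)
    #   r = bench.run_single_test("I can't stop smiling today!", "joy", "happiness")
    #   # r.relationship is one of "exact", "same_category", "adjacent", "distant", "opposite"
    #   print(r.detected_emotion, r.relationship, r.similarity_score, r.emoji)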
    def run_benchmark(self, test_data: Dict[str, List[Tuple[str, str]]]) -> WheelBenchmarkResults:
        """
        Run the full benchmark with wheel-based scoring

        Args:
            test_data: Dict mapping emotion -> [(text, category), ...]

        Returns:
            WheelBenchmarkResults with similarity-aware metrics
        """
        results = WheelBenchmarkResults()
        results.emotion_results = defaultdict(list)
        results.category_confusion = defaultdict(lambda: defaultdict(int))

        all_times = []
        all_scores = []

        for emotion, test_cases in test_data.items():
            emotion_scores = []
            for text, expected_category in test_cases:
                test_result = self.run_single_test(text, emotion, expected_category)
                results.emotion_results[emotion].append(test_result)
                results.total_tests += 1

                all_times.append(test_result.inference_time_ms)
                all_scores.append(test_result.similarity_score)
                emotion_scores.append(test_result.similarity_score)

                # Count by relationship type
                if test_result.relationship == "exact":
                    results.exact_matches += 1
                elif test_result.relationship == "same_category":
                    results.same_category_matches += 1
                elif test_result.relationship == "adjacent":
                    results.adjacent_matches += 1
                elif test_result.relationship == "opposite":
                    results.opposite_matches += 1
                else:
                    results.distant_matches += 1

                # Update the category confusion matrix
                results.category_confusion[expected_category][test_result.detected_category] += 1

            # Calculate the per-emotion score
            if emotion_scores:
                avg_score = sum(emotion_scores) / len(emotion_scores)
                results.emotion_scores[emotion] = avg_score

                # Check whether the emotion failed (majority of detections opposite)
                opposite_count = sum(1 for r in results.emotion_results[emotion]
                                     if r.relationship == "opposite")
                if opposite_count > len(emotion_scores) / 2:
                    results.failed_emotions.append(emotion)

        # Calculate overall metrics
        if results.total_tests > 0:
            results.weighted_accuracy = sum(all_scores) / len(all_scores)
            results.exact_accuracy = results.exact_matches / results.total_tests
            acceptable = results.exact_matches + results.same_category_matches + results.adjacent_matches
            results.acceptable_accuracy = acceptable / results.total_tests
            results.contradiction_rate = results.opposite_matches / results.total_tests

        # Calculate per-category accuracy
        for category in set(tc[1] for cases in test_data.values() for tc in cases):
            category_tests = [
                r for emotion_results in results.emotion_results.values()
                for r in emotion_results if r.expected_category == category
            ]
            if category_tests:
                category_score = sum(r.similarity_score for r in category_tests) / len(category_tests)
                results.category_accuracy[category] = category_score

        # Calculate timing
        if all_times:
            results.avg_inference_time_ms = sum(all_times) / len(all_times)

        return results
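
    # Hypothetical test_data sketch matching the expected shape (real runs use
    # EmotionTestSuiteV3.EMOTION_TEST_DATA; these texts and category names are made up):
    #   test_data = {
    #       "joy":   [("Best day ever!", "happiness"), ("I'm delighted.", "happiness")],
    #       "anger": [("This is infuriating.", "hostility")],
    #   }
    #   results = bench.run_benchmark(test_data)
    #   print(f"weighted={results.weighted_accuracy:.1%} contradictions={results.contradiction_rate:.1%}")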
    def generate_detailed_report(self, results: WheelBenchmarkResults) -> str:
        """Generate a detailed report showing detected vs expected emotions"""
        lines = [
            "╔══════════════════════════════════════════════════════════════════════════════╗",
            "║ WHEEL-BASED EMOTION RECOGNITION EVALUATION REPORT ║",
            "╠══════════════════════════════════════════════════════════════════════════════╣",
            f"║ Total Tests: {results.total_tests:<10} ║",
            f"║ Weighted Accuracy: {results.weighted_accuracy:.1%} ║",
            f"║ Exact Match Rate: {results.exact_accuracy:.1%} ║",
            f"║ Acceptable Rate: {results.acceptable_accuracy:.1%} (exact + similar + adjacent) ║",
            f"║ Contradiction Rate: {results.contradiction_rate:.1%} (opposite emotions) ║",
            "╚══════════════════════════════════════════════════════════════════════════════╝",
            "",
            "╔══════════════════════════════════════════════════════════════════════════════╗",
            "║ SCORING BREAKDOWN ║",
            "╠══════════════════════════════════════════════════════════════════════════════╣",
            f"║ ✅ Exact Matches: {results.exact_matches:4} ({results.exact_matches/results.total_tests*100:5.1f}%) - Score: 1.0 ║",
            f"║ 🟢 Same Category: {results.same_category_matches:4} ({results.same_category_matches/results.total_tests*100:5.1f}%) - Score: 0.8 ║",
            f"║ 🟡 Adjacent Category: {results.adjacent_matches:4} ({results.adjacent_matches/results.total_tests*100:5.1f}%) - Score: 0.5 ║",
            f"║ 🟠 Distant Category: {results.distant_matches:4} ({results.distant_matches/results.total_tests*100:5.1f}%) - Score: 0.2 ║",
            f"║ ❌ Opposite (FAIL): {results.opposite_matches:4} ({results.opposite_matches/results.total_tests*100:5.1f}%) - Score: 0.0 ║",
            "╚══════════════════════════════════════════════════════════════════════════════╝",
            "",
        ]

        # Category performance
        lines.extend([
            "╔══════════════════════════════════════════════════════════════════════════════╗",
            "║ CATEGORY PERFORMANCE ║",
            "╠══════════════════════════════════════════════════════════════════════════════╣",
        ])
        for category, score in sorted(results.category_accuracy.items(), key=lambda x: -x[1]):
            bar = "█" * int(score * 30) + "░" * (30 - int(score * 30))
            lines.append(f"║ {category:<15} {bar} {score:.1%} ║")
        lines.append("╚══════════════════════════════════════════════════════════════════════════════╝")
        lines.append("")

        # Show some example results for each relationship type
        lines.extend([
            "╔══════════════════════════════════════════════════════════════════════════════╗",
            "║ SAMPLE RESULTS (Expected → Detected) ║",
            "╠══════════════════════════════════════════════════════════════════════════════╣",
        ])
        # Collect a few examples from each relationship type
        examples = {"exact": [], "same_category": [], "adjacent": [], "opposite": []}
        for emotion_results in results.emotion_results.values():
            for r in emotion_results:
                if r.relationship in examples and len(examples[r.relationship]) < 3:
                    examples[r.relationship].append(r)

        for rel_type, emoji_prefix in [("exact", "✅"), ("same_category", "🟢"),
                                       ("adjacent", "🟡"), ("opposite", "❌")]:
            if examples[rel_type]:
                lines.append(f"║ {emoji_prefix} {rel_type.upper()}: ║")
                for ex in examples[rel_type]:
                    text_short = ex.text[:35] + "..." if len(ex.text) > 35 else ex.text
                    lines.append(f"║ '{text_short:<38}' ║")
                    lines.append(f"║ Expected: {ex.emotion:<12} → Detected: {ex.detected_emotion:<12} {ex.emoji} ║")
        lines.append("╚══════════════════════════════════════════════════════════════════════════════╝")

        # Failed emotions
        if results.failed_emotions:
            lines.extend([
                "",
                "╔══════════════════════════════════════════════════════════════════════════════╗",
                "║ ⚠️ FAILED EMOTIONS (>50% contradictory detections) ║",
                "╠══════════════════════════════════════════════════════════════════════════════╣",
            ])
            for em in results.failed_emotions[:10]:
                score = results.emotion_scores.get(em, 0)
                lines.append(f"║ ❌ {em:<25} Score: {score:.2f} ║")
            lines.append("╚══════════════════════════════════════════════════════════════════════════════╝")

        return "\n".join(lines)
    def generate_emotion_detail_table(self, results: WheelBenchmarkResults) -> str:
        """Generate a detailed markdown table covering all emotions"""
        lines = [
            "# Emotion Recognition Detail Report",
            "",
            "| Emotion | Score | Exact | Similar | Adjacent | Opposite | Status |",
            "|---------|-------|-------|---------|----------|----------|--------|",
        ]
        for emotion in sorted(results.emotion_scores.keys()):
            score = results.emotion_scores[emotion]
            emotion_tests = results.emotion_results[emotion]

            exact = sum(1 for r in emotion_tests if r.relationship == "exact")
            same = sum(1 for r in emotion_tests if r.relationship == "same_category")
            adj = sum(1 for r in emotion_tests if r.relationship == "adjacent")
            opp = sum(1 for r in emotion_tests if r.relationship == "opposite")
            total = len(emotion_tests)

            if score >= 0.8:
                status = "✅ PASS"
            elif score >= 0.5:
                status = "⚠️ OK"
            else:
                status = "❌ FAIL"

            lines.append(
                f"| {emotion:<15} | {score:.2f} | "
                f"{exact}/{total} | {same}/{total} | {adj}/{total} | {opp}/{total} | {status} |"
            )
        return "\n".join(lines)
def run_wheel_evaluation():
    """Run the wheel-based evaluation"""
    print("=" * 80)
    print("WHEEL-BASED EMOTION RECOGNITION EVALUATION")
    print("=" * 80)
    print()

    # Import components
    from avatar.sentiment_multi_emotion import MultiEmotionAnalyzer
    from avatar.sentiment_emoji_map import EmojiMapper
    from evaluation.emotion_test_suite_v3 import EmotionTestSuiteV3

    # Initialize
    print("Loading components...")
    analyzer = MultiEmotionAnalyzer()
    mapper = EmojiMapper()
    suite = EmotionTestSuiteV3()
    benchmark = WheelBenchmark(analyzer, mapper)

    print(f"Test Suite V{suite.VERSION}")
    print(f"Emotions: {suite.get_emotion_count()}, Tests: {suite.get_test_count()}")
    print()

    # Run benchmark
    print("Running benchmark...")
    start = time.time()
    results = benchmark.run_benchmark(suite.EMOTION_TEST_DATA)
    elapsed = time.time() - start
    print(f"Completed in {elapsed:.2f}s")
    print()

    # Show report
    print(benchmark.generate_detailed_report(results))

    # Save detailed markdown report
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    report_dir = "evaluation/reports"
    os.makedirs(report_dir, exist_ok=True)

    # Save emotion detail table
    detail_path = os.path.join(report_dir, f"wheel_evaluation_{timestamp}.md")
    with open(detail_path, "w", encoding="utf-8") as f:
        f.write("# Wheel-Based Emotion Evaluation Report\n\n")
        f.write(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        f.write("## Summary\n\n")
        f.write(f"- **Total Tests:** {results.total_tests}\n")
        f.write(f"- **Weighted Accuracy:** {results.weighted_accuracy:.1%}\n")
        f.write(f"- **Exact Match Rate:** {results.exact_accuracy:.1%}\n")
        f.write(f"- **Acceptable Rate:** {results.acceptable_accuracy:.1%}\n")
        f.write(f"- **Contradiction Rate:** {results.contradiction_rate:.1%}\n\n")
        f.write("## Scoring Breakdown\n\n")
        f.write("| Relationship | Count | Percentage | Score |\n")
        f.write("|--------------|-------|------------|-------|\n")
        f.write(f"| ✅ Exact | {results.exact_matches} | {results.exact_matches/results.total_tests*100:.1f}% | 1.0 |\n")
        f.write(f"| 🟢 Same Category | {results.same_category_matches} | {results.same_category_matches/results.total_tests*100:.1f}% | 0.8 |\n")
        f.write(f"| 🟡 Adjacent | {results.adjacent_matches} | {results.adjacent_matches/results.total_tests*100:.1f}% | 0.5 |\n")
        f.write(f"| 🟠 Distant | {results.distant_matches} | {results.distant_matches/results.total_tests*100:.1f}% | 0.2 |\n")
        f.write(f"| ❌ Opposite | {results.opposite_matches} | {results.opposite_matches/results.total_tests*100:.1f}% | 0.0 |\n\n")
        f.write(benchmark.generate_emotion_detail_table(results))

    print(f"\n📄 Detailed report saved to: {detail_path}")
    return results


if __name__ == "__main__":
    run_wheel_evaluation()
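
# Minimal smoke-test sketch (hypothetical stubs, not part of the benchmark): any object
# exposing analyze(text) -> dict with a "label"/"emotion" key can stand in for the real
# analyzer, which is useful for exercising the scoring pipeline without loading a model.
# The emotion and category names below are illustrative only.
#
#   class _StubAnalyzer:
#       def analyze(self, text):
#           return {"label": "joy", "score": 1.0}
#
#   class _StubMapper:
#       def get_emoji(self, emotion):
#           return "🙂"
#
#   stub_results = WheelBenchmark(_StubAnalyzer(), _StubMapper()).run_benchmark(
#       {"joy": [("Best day ever!", "happiness")]}
#   )
#   print(stub_results.weighted_accuracy)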