# Emoji-AI-Avatar/evaluation/wheel_benchmark.py
# -*- coding: utf-8 -*-
"""
Wheel-Based Accuracy Benchmark - Similarity-aware emotion evaluation
Features:
- Uses Emotion Wheel for similarity scoring
- Exact match = 1.0, Same category = 0.8, Adjacent = 0.5, Distant = 0.2, Opposite = 0.0
- Shows detected emotion for each test
- Calculates weighted accuracy based on similarity
- Identifies contradictory detections vs similar mistakes
Version: 3.0.0
"""
import sys
import os
import time
from typing import Dict, List, Tuple, Any, Optional
from dataclasses import dataclass, field
from collections import defaultdict
from datetime import datetime
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from avatar.emotion_wheel import EmotionWheel, get_emotion_wheel, EmotionCategory
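# EmotionWheel surface used in this module (inferred from the call sites below;
# see avatar/emotion_wheel.py for the authoritative signatures):
#   wheel.get_category(emotion)                    -> EmotionCategory or None
#   wheel.get_similarity_score(expected, detected) -> (score: float, relationship: str)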
@dataclass
class WheelTestResult:
"""Result for a single test with wheel-based scoring"""
emotion: str
text: str
expected_category: str
detected_emotion: str
detected_category: Optional[str]
similarity_score: float
relationship: str # "exact", "same_category", "adjacent", "distant", "opposite"
inference_time_ms: float
emoji: str
@dataclass
class WheelBenchmarkResults:
"""Aggregated wheel-based benchmark results"""
total_tests: int = 0
# Weighted accuracy (using similarity scores)
weighted_accuracy: float = 0.0
# Traditional accuracy
exact_matches: int = 0
exact_accuracy: float = 0.0
# Similarity-based metrics
same_category_matches: int = 0
adjacent_matches: int = 0
distant_matches: int = 0
opposite_matches: int = 0 # These are the real failures
# Acceptable = exact + same_category + adjacent
acceptable_accuracy: float = 0.0
# Contradiction rate (opposite emotions)
contradiction_rate: float = 0.0
# Per-emotion results
emotion_results: Dict[str, List[WheelTestResult]] = field(default_factory=dict)
emotion_scores: Dict[str, float] = field(default_factory=dict)
# Category performance
category_accuracy: Dict[str, float] = field(default_factory=dict)
# Timing
avg_inference_time_ms: float = 0.0
# Failed emotions (>50% opposite)
failed_emotions: List[str] = field(default_factory=list)
# Confusion matrix by category
category_confusion: Dict[str, Dict[str, int]] = field(default_factory=dict)
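# Worked example for the aggregate metrics (10 tests with illustrative counts of
# 6 exact, 2 same_category, 1 adjacent, 1 opposite):
#   exact_accuracy      = 6 / 10                                = 0.60
#   acceptable_accuracy = (6 + 2 + 1) / 10                      = 0.90
#   contradiction_rate  = 1 / 10                                = 0.10
#   weighted_accuracy   = (6*1.0 + 2*0.8 + 1*0.5 + 1*0.0) / 10  = 0.81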
class WheelBenchmark:
"""
Wheel-based emotion benchmark with similarity scoring
Uses EmotionWheel to determine:
- Exact matches (same emotion)
- Same category (similar emotions)
- Adjacent category (related emotions)
- Opposite category (contradictory emotions - failures)
"""
def __init__(self, analyzer, emoji_mapper):
"""
Initialize benchmark
Args:
analyzer: Sentiment analyzer instance
emoji_mapper: EmojiMapper instance
"""
self.analyzer = analyzer
self.mapper = emoji_mapper
self.wheel = get_emotion_wheel()
def _extract_emotion(self, result: Dict[str, Any]) -> str:
"""Extract emotion label from analyzer result"""
# Try different possible keys
for key in ["label", "emotion", "detected_emotion"]:
if key in result:
return result[key].lower()
return "neutral"
def run_single_test(self, text: str, expected_emotion: str, expected_category: str) -> WheelTestResult:
"""Run single test with wheel-based scoring"""
start_time = time.perf_counter()
result = self.analyzer.analyze(text)
end_time = time.perf_counter()
inference_time_ms = (end_time - start_time) * 1000
detected_emotion = self._extract_emotion(result)
detected_category = self.wheel.get_category(detected_emotion)
detected_cat_name = detected_category.value if detected_category else "unknown"
# Get similarity score and relationship
score, relationship = self.wheel.get_similarity_score(expected_emotion, detected_emotion)
# Get emoji
emoji = self.mapper.get_emoji(detected_emotion)
return WheelTestResult(
emotion=expected_emotion,
text=text,
expected_category=expected_category,
detected_emotion=detected_emotion,
detected_category=detected_cat_name,
similarity_score=score,
relationship=relationship,
inference_time_ms=inference_time_ms,
emoji=emoji
)
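# Example of a single scored test (all values, including the category names and
# emoji, are illustrative only):
#   WheelTestResult(emotion="joy", text="Best day ever!", expected_category="happiness",
#                   detected_emotion="excitement", detected_category="happiness",
#                   similarity_score=0.8, relationship="same_category",
#                   inference_time_ms=12.3, emoji="😊")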
def run_benchmark(self, test_data: Dict[str, List[Tuple[str, str]]]) -> WheelBenchmarkResults:
"""
Run full benchmark with wheel-based scoring
Args:
test_data: Dict mapping emotion -> [(text, category), ...]
Returns:
WheelBenchmarkResults with similarity-aware metrics
"""
results = WheelBenchmarkResults()
results.emotion_results = defaultdict(list)
results.category_confusion = defaultdict(lambda: defaultdict(int))
all_times = []
all_scores = []
for emotion, test_cases in test_data.items():
emotion_scores = []
for text, expected_category in test_cases:
test_result = self.run_single_test(text, emotion, expected_category)
results.emotion_results[emotion].append(test_result)
results.total_tests += 1
all_times.append(test_result.inference_time_ms)
all_scores.append(test_result.similarity_score)
emotion_scores.append(test_result.similarity_score)
# Count by relationship type
if test_result.relationship == "exact":
results.exact_matches += 1
elif test_result.relationship == "same_category":
results.same_category_matches += 1
elif test_result.relationship == "adjacent":
results.adjacent_matches += 1
elif test_result.relationship == "opposite":
results.opposite_matches += 1
else:
results.distant_matches += 1
# Update category confusion matrix
results.category_confusion[expected_category][test_result.detected_category] += 1
# Calculate per-emotion score
if emotion_scores:
avg_score = sum(emotion_scores) / len(emotion_scores)
results.emotion_scores[emotion] = avg_score
# Check if emotion failed (majority opposite)
opposite_count = sum(1 for r in results.emotion_results[emotion]
if r.relationship == "opposite")
if opposite_count > len(emotion_scores) / 2:
results.failed_emotions.append(emotion)
# Calculate overall metrics
if results.total_tests > 0:
results.weighted_accuracy = sum(all_scores) / len(all_scores)
results.exact_accuracy = results.exact_matches / results.total_tests
acceptable = results.exact_matches + results.same_category_matches + results.adjacent_matches
results.acceptable_accuracy = acceptable / results.total_tests
results.contradiction_rate = results.opposite_matches / results.total_tests
# Calculate category accuracy
for category in set(tc[1] for cases in test_data.values() for tc in cases):
category_tests = [
r for emotion_results in results.emotion_results.values()
for r in emotion_results if r.expected_category == category
]
if category_tests:
category_score = sum(r.similarity_score for r in category_tests) / len(category_tests)
results.category_accuracy[category] = category_score
# Calculate timing
if all_times:
results.avg_inference_time_ms = sum(all_times) / len(all_times)
return results
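# Expected test_data shape for run_benchmark (emotions, texts, and categories are
# illustrative; the real suite lives in evaluation/emotion_test_suite_v3.py):
#   {
#       "joy":   [("I'm so happy today!", "happiness"), ...],
#       "anger": [("This makes my blood boil.", "anger"), ...],
#   }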
def generate_detailed_report(self, results: WheelBenchmarkResults) -> str:
"""Generate detailed report showing detected vs expected"""
lines = [
"╔══════════════════════════════════════════════════════════════════════════════╗",
"β•‘ WHEEL-BASED EMOTION RECOGNITION EVALUATION REPORT β•‘",
"╠══════════════════════════════════════════════════════════════════════════════╣",
f"β•‘ Total Tests: {results.total_tests:<10} β•‘",
f"β•‘ Weighted Accuracy: {results.weighted_accuracy:.1%} β•‘",
f"β•‘ Exact Match Rate: {results.exact_accuracy:.1%} β•‘",
f"β•‘ Acceptable Rate: {results.acceptable_accuracy:.1%} (exact + similar + adjacent) β•‘",
f"β•‘ Contradiction Rate: {results.contradiction_rate:.1%} (opposite emotions) β•‘",
"β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•",
"",
"β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”",
"β”‚ SCORING BREAKDOWN β”‚",
"β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€",
f"β”‚ βœ… Exact Matches: {results.exact_matches:4} ({results.exact_matches/results.total_tests*100:5.1f}%) - Score: 1.0 β”‚",
f"β”‚ 🟒 Same Category: {results.same_category_matches:4} ({results.same_category_matches/results.total_tests*100:5.1f}%) - Score: 0.8 β”‚",
f"β”‚ 🟑 Adjacent Category: {results.adjacent_matches:4} ({results.adjacent_matches/results.total_tests*100:5.1f}%) - Score: 0.5 β”‚",
f"β”‚ 🟠 Distant Category: {results.distant_matches:4} ({results.distant_matches/results.total_tests*100:5.1f}%) - Score: 0.2 β”‚",
f"β”‚ ❌ Opposite (FAIL): {results.opposite_matches:4} ({results.opposite_matches/results.total_tests*100:5.1f}%) - Score: 0.0 β”‚",
"β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜",
"",
]
# Category performance
lines.extend([
"β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”",
"β”‚ CATEGORY PERFORMANCE β”‚",
"β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€",
])
for category, score in sorted(results.category_accuracy.items(), key=lambda x: -x[1]):
bar = "β–ˆ" * int(score * 30) + "β–‘" * (30 - int(score * 30))
lines.append(f"β”‚ {category:<15} {bar} {score:.1%} β”‚")
lines.append("β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜")
lines.append("")
# Show some example results for each category
lines.extend([
"β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”",
"β”‚ SAMPLE RESULTS (Expected β†’ Detected) β”‚",
"β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€",
])
# Show a few examples from each relationship type
examples = {"exact": [], "same_category": [], "adjacent": [], "opposite": []}
for emotion_results in results.emotion_results.values():
for r in emotion_results:
if r.relationship in examples and len(examples[r.relationship]) < 3:
examples[r.relationship].append(r)
for rel_type, emoji_prefix in [("exact", "βœ…"), ("same_category", "🟒"),
("adjacent", "🟑"), ("opposite", "❌")]:
if examples[rel_type]:
lines.append("β”‚" + f" {emoji_prefix} {rel_type.upper()}:".ljust(78) + "β”‚")
for ex in examples[rel_type]:
text_short = (ex.text[:35] + "...") if len(ex.text) > 35 else ex.text
lines.append("β”‚" + f"   '{text_short:<38}'".ljust(78) + "β”‚")
lines.append("β”‚" + f"   Expected: {ex.emotion:<12} β†’ Detected: {ex.detected_emotion:<12} {ex.emoji}".ljust(78) + "β”‚")
lines.append("β””" + "─" * 78 + "β”˜")
# Failed emotions
if results.failed_emotions:
lines.extend([
"",
"β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”",
"β”‚ ⚠️ FAILED EMOTIONS (>50% contradictory detections) β”‚",
"β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€",
])
for em in results.failed_emotions[:10]:
score = results.emotion_scores.get(em, 0)
lines.append(f"β”‚ ❌ {em:<25} Score: {score:.2f} β”‚")
lines.append("β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜")
return "\n".join(lines)
def generate_emotion_detail_table(self, results: WheelBenchmarkResults) -> str:
"""Generate detailed table for all emotions"""
lines = [
"# Emotion Recognition Detail Report",
"",
"| Emotion | Score | Exact | Similar | Adjacent | Opposite | Status |",
"|---------|-------|-------|---------|----------|----------|--------|",
]
for emotion in sorted(results.emotion_scores.keys()):
score = results.emotion_scores[emotion]
emotion_tests = results.emotion_results[emotion]
exact = sum(1 for r in emotion_tests if r.relationship == "exact")
same = sum(1 for r in emotion_tests if r.relationship == "same_category")
adj = sum(1 for r in emotion_tests if r.relationship == "adjacent")
opp = sum(1 for r in emotion_tests if r.relationship == "opposite")
total = len(emotion_tests)
if score >= 0.8:
status = "βœ… PASS"
elif score >= 0.5:
status = "⚠️ OK"
else:
status = "❌ FAIL"
lines.append(
f"| {emotion:<15} | {score:.2f} | "
f"{exact}/{total} | {same}/{total} | {adj}/{total} | {opp}/{total} | {status} |"
)
return "\n".join(lines)
def run_wheel_evaluation():
"""Run the wheel-based evaluation"""
print("=" * 80)
print("WHEEL-BASED EMOTION RECOGNITION EVALUATION")
print("=" * 80)
print()
# Import components
from avatar.sentiment_multi_emotion import MultiEmotionAnalyzer
from avatar.sentiment_emoji_map import EmojiMapper
from evaluation.emotion_test_suite_v3 import EmotionTestSuiteV3
# Initialize
print("Loading components...")
analyzer = MultiEmotionAnalyzer()
mapper = EmojiMapper()
suite = EmotionTestSuiteV3()
benchmark = WheelBenchmark(analyzer, mapper)
print(f"Test Suite V{suite.VERSION}")
print(f"Emotions: {suite.get_emotion_count()}, Tests: {suite.get_test_count()}")
print()
# Run benchmark
print("Running benchmark...")
start = time.time()
results = benchmark.run_benchmark(suite.EMOTION_TEST_DATA)
elapsed = time.time() - start
print(f"Completed in {elapsed:.2f}s")
print()
# Show report
print(benchmark.generate_detailed_report(results))
# Save detailed markdown report
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
report_dir = "evaluation/reports"
os.makedirs(report_dir, exist_ok=True)
# Save emotion detail table
detail_path = os.path.join(report_dir, f"wheel_evaluation_{timestamp}.md")
with open(detail_path, "w", encoding="utf-8") as f:
f.write(f"# Wheel-Based Emotion Evaluation Report\n\n")
f.write(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
f.write(f"## Summary\n\n")
f.write(f"- **Total Tests:** {results.total_tests}\n")
f.write(f"- **Weighted Accuracy:** {results.weighted_accuracy:.1%}\n")
f.write(f"- **Exact Match Rate:** {results.exact_accuracy:.1%}\n")
f.write(f"- **Acceptable Rate:** {results.acceptable_accuracy:.1%}\n")
f.write(f"- **Contradiction Rate:** {results.contradiction_rate:.1%}\n\n")
f.write(f"## Scoring Breakdown\n\n")
f.write(f"| Relationship | Count | Percentage | Score |\n")
f.write(f"|--------------|-------|------------|-------|\n")
f.write(f"| βœ… Exact | {results.exact_matches} | {results.exact_matches/results.total_tests*100:.1f}% | 1.0 |\n")
f.write(f"| 🟒 Same Category | {results.same_category_matches} | {results.same_category_matches/results.total_tests*100:.1f}% | 0.8 |\n")
f.write(f"| 🟑 Adjacent | {results.adjacent_matches} | {results.adjacent_matches/results.total_tests*100:.1f}% | 0.5 |\n")
f.write(f"| 🟠 Distant | {results.distant_matches} | {results.distant_matches/results.total_tests*100:.1f}% | 0.2 |\n")
f.write(f"| ❌ Opposite | {results.opposite_matches} | {results.opposite_matches/results.total_tests*100:.1f}% | 0.0 |\n\n")
f.write(benchmark.generate_emotion_detail_table(results))
print(f"\nπŸ“„ Detailed report saved to: {detail_path}")
return results
if __name__ == "__main__":
run_wheel_evaluation()