# Emoji-AI-Avatar/evaluation/wheel_benchmark.py
# -*- coding: utf-8 -*-
"""
Wheel-Based Accuracy Benchmark - Similarity-aware emotion evaluation
Features:
- Uses Emotion Wheel for similarity scoring
- Exact match = 1.0, Same category = 0.8, Adjacent = 0.5, Distant = 0.2, Opposite = 0.0
- Shows detected emotion for each test
- Calculates weighted accuracy based on similarity
- Identifies contradictory detections vs similar mistakes
Version: 3.0.0
"""
import sys
import os
import time
from typing import Dict, List, Tuple, Any, Optional
from dataclasses import dataclass, field
from collections import defaultdict
from datetime import datetime
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from avatar.emotion_wheel import EmotionWheel, get_emotion_wheel, EmotionCategory
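# EmotionWheel surface used in this module (inferred from the call sites below;
# see avatar/emotion_wheel.py for the authoritative signatures):
#   wheel.get_category(emotion)                    -> EmotionCategory or None
#   wheel.get_similarity_score(expected, detected) -> (score: float, relationship: str)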
@dataclass
class WheelTestResult:
"""Result for a single test with wheel-based scoring"""
emotion: str
text: str
expected_category: str
detected_emotion: str
detected_category: Optional[str]
similarity_score: float
relationship: str # "exact", "same_category", "adjacent", "distant", "opposite"
inference_time_ms: float
emoji: str
@dataclass
class WheelBenchmarkResults:
"""Aggregated wheel-based benchmark results"""
total_tests: int = 0
# Weighted accuracy (using similarity scores)
weighted_accuracy: float = 0.0
# Traditional accuracy
exact_matches: int = 0
exact_accuracy: float = 0.0
# Similarity-based metrics
same_category_matches: int = 0
adjacent_matches: int = 0
distant_matches: int = 0
opposite_matches: int = 0 # These are the real failures
# Acceptable = exact + same_category + adjacent
acceptable_accuracy: float = 0.0
# Contradiction rate (opposite emotions)
contradiction_rate: float = 0.0
# Per-emotion results
emotion_results: Dict[str, List[WheelTestResult]] = field(default_factory=dict)
emotion_scores: Dict[str, float] = field(default_factory=dict)
# Category performance
category_accuracy: Dict[str, float] = field(default_factory=dict)
# Timing
avg_inference_time_ms: float = 0.0
# Failed emotions (>50% opposite)
failed_emotions: List[str] = field(default_factory=list)
# Confusion matrix by category
category_confusion: Dict[str, Dict[str, int]] = field(default_factory=dict)
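# Worked example for the aggregate metrics (10 tests with illustrative counts of
# 6 exact, 2 same_category, 1 adjacent, 1 opposite):
#   exact_accuracy      = 6 / 10                                = 0.60
#   acceptable_accuracy = (6 + 2 + 1) / 10                      = 0.90
#   contradiction_rate  = 1 / 10                                = 0.10
#   weighted_accuracy   = (6*1.0 + 2*0.8 + 1*0.5 + 1*0.0) / 10  = 0.81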
class WheelBenchmark:
"""
Wheel-based emotion benchmark with similarity scoring
Uses EmotionWheel to determine:
- Exact matches (same emotion)
- Same category (similar emotions)
- Adjacent category (related emotions)
- Opposite category (contradictory emotions - failures)
"""
def __init__(self, analyzer, emoji_mapper):
"""
Initialize benchmark
Args:
analyzer: Sentiment analyzer instance
emoji_mapper: EmojiMapper instance
"""
self.analyzer = analyzer
self.mapper = emoji_mapper
self.wheel = get_emotion_wheel()
def _extract_emotion(self, result: Dict[str, Any]) -> str:
"""Extract emotion label from analyzer result"""
# Try different possible keys
for key in ["label", "emotion", "detected_emotion"]:
if key in result:
return result[key].lower()
return "neutral"
def run_single_test(self, text: str, expected_emotion: str, expected_category: str) -> WheelTestResult:
"""Run single test with wheel-based scoring"""
start_time = time.perf_counter()
result = self.analyzer.analyze(text)
end_time = time.perf_counter()
inference_time_ms = (end_time - start_time) * 1000
detected_emotion = self._extract_emotion(result)
detected_category = self.wheel.get_category(detected_emotion)
detected_cat_name = detected_category.value if detected_category else "unknown"
# Get similarity score and relationship
score, relationship = self.wheel.get_similarity_score(expected_emotion, detected_emotion)
# Get emoji
emoji = self.mapper.get_emoji(detected_emotion)
return WheelTestResult(
emotion=expected_emotion,
text=text,
expected_category=expected_category,
detected_emotion=detected_emotion,
detected_category=detected_cat_name,
similarity_score=score,
relationship=relationship,
inference_time_ms=inference_time_ms,
emoji=emoji
)
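# Example of a single scored test (all values, including the category names and
# emoji, are illustrative only):
#   WheelTestResult(emotion="joy", text="Best day ever!", expected_category="happiness",
#                   detected_emotion="excitement", detected_category="happiness",
#                   similarity_score=0.8, relationship="same_category",
#                   inference_time_ms=12.3, emoji="😊")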
def run_benchmark(self, test_data: Dict[str, List[Tuple[str, str]]]) -> WheelBenchmarkResults:
"""
Run full benchmark with wheel-based scoring
Args:
test_data: Dict mapping emotion -> [(text, category), ...]
Returns:
WheelBenchmarkResults with similarity-aware metrics
"""
results = WheelBenchmarkResults()
results.emotion_results = defaultdict(list)
results.category_confusion = defaultdict(lambda: defaultdict(int))
all_times = []
all_scores = []
for emotion, test_cases in test_data.items():
emotion_scores = []
for text, expected_category in test_cases:
test_result = self.run_single_test(text, emotion, expected_category)
results.emotion_results[emotion].append(test_result)
results.total_tests += 1
all_times.append(test_result.inference_time_ms)
all_scores.append(test_result.similarity_score)
emotion_scores.append(test_result.similarity_score)
# Count by relationship type
if test_result.relationship == "exact":
results.exact_matches += 1
elif test_result.relationship == "same_category":
results.same_category_matches += 1
elif test_result.relationship == "adjacent":
results.adjacent_matches += 1
elif test_result.relationship == "opposite":
results.opposite_matches += 1
else:
results.distant_matches += 1
# Update category confusion matrix
results.category_confusion[expected_category][test_result.detected_category] += 1
# Calculate per-emotion score
if emotion_scores:
avg_score = sum(emotion_scores) / len(emotion_scores)
results.emotion_scores[emotion] = avg_score
# Check if emotion failed (majority opposite)
opposite_count = sum(1 for r in results.emotion_results[emotion]
if r.relationship == "opposite")
if opposite_count > len(emotion_scores) / 2:
results.failed_emotions.append(emotion)
# Calculate overall metrics
if results.total_tests > 0:
results.weighted_accuracy = sum(all_scores) / len(all_scores)
results.exact_accuracy = results.exact_matches / results.total_tests
acceptable = results.exact_matches + results.same_category_matches + results.adjacent_matches
results.acceptable_accuracy = acceptable / results.total_tests
results.contradiction_rate = results.opposite_matches / results.total_tests
# Calculate category accuracy
for category in set(tc[1] for cases in test_data.values() for tc in cases):
category_tests = [
r for emotion_results in results.emotion_results.values()
for r in emotion_results if r.expected_category == category
]
if category_tests:
category_score = sum(r.similarity_score for r in category_tests) / len(category_tests)
results.category_accuracy[category] = category_score
# Calculate timing
if all_times:
results.avg_inference_time_ms = sum(all_times) / len(all_times)
return results
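# Expected test_data shape for run_benchmark (emotions, texts, and categories are
# illustrative; the real suite lives in evaluation/emotion_test_suite_v3.py):
#   {
#       "joy":   [("I'm so happy today!", "happiness"), ...],
#       "anger": [("This makes my blood boil.", "anger"), ...],
#   }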
def generate_detailed_report(self, results: WheelBenchmarkResults) -> str:
"""Generate detailed report showing detected vs expected"""
lines = [
"╔══════════════════════════════════════════════════════════════════════════════╗",
"β•‘ WHEEL-BASED EMOTION RECOGNITION EVALUATION REPORT β•‘",
"╠══════════════════════════════════════════════════════════════════════════════╣",
f"β•‘ Total Tests: {results.total_tests:<10} β•‘",
f"β•‘ Weighted Accuracy: {results.weighted_accuracy:.1%} β•‘",
f"β•‘ Exact Match Rate: {results.exact_accuracy:.1%} β•‘",
f"β•‘ Acceptable Rate: {results.acceptable_accuracy:.1%} (exact + similar + adjacent) β•‘",
f"β•‘ Contradiction Rate: {results.contradiction_rate:.1%} (opposite emotions) β•‘",
"β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•",
"",
"β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”",
"β”‚ SCORING BREAKDOWN β”‚",
"β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€",
f"β”‚ βœ… Exact Matches: {results.exact_matches:4} ({results.exact_matches/results.total_tests*100:5.1f}%) - Score: 1.0 β”‚",
f"β”‚ 🟒 Same Category: {results.same_category_matches:4} ({results.same_category_matches/results.total_tests*100:5.1f}%) - Score: 0.8 β”‚",
f"β”‚ 🟑 Adjacent Category: {results.adjacent_matches:4} ({results.adjacent_matches/results.total_tests*100:5.1f}%) - Score: 0.5 β”‚",
f"β”‚ 🟠 Distant Category: {results.distant_matches:4} ({results.distant_matches/results.total_tests*100:5.1f}%) - Score: 0.2 β”‚",
f"β”‚ ❌ Opposite (FAIL): {results.opposite_matches:4} ({results.opposite_matches/results.total_tests*100:5.1f}%) - Score: 0.0 β”‚",
"β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜",
"",
]
# Category performance
lines.extend([
"β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”",
"β”‚ CATEGORY PERFORMANCE β”‚",
"β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€",
])
for category, score in sorted(results.category_accuracy.items(), key=lambda x: -x[1]):
bar = "β–ˆ" * int(score * 30) + "β–‘" * (30 - int(score * 30))
lines.append(f"β”‚ {category:<15} {bar} {score:.1%} β”‚")
lines.append("β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜")
lines.append("")
# Show some example results for each category
lines.extend([
"β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”",
"β”‚ SAMPLE RESULTS (Expected β†’ Detected) β”‚",
"β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€",
])
# Show a few examples from each relationship type
examples = {"exact": [], "same_category": [], "adjacent": [], "opposite": []}
for emotion_results in results.emotion_results.values():
for r in emotion_results:
if r.relationship in examples and len(examples[r.relationship]) < 3:
examples[r.relationship].append(r)
for rel_type, emoji_prefix in [("exact", "βœ…"), ("same_category", "🟒"),
("adjacent", "🟑"), ("opposite", "❌")]:
if examples[rel_type]:
lines.append("β”‚" + f" {emoji_prefix} {rel_type.upper()}:".ljust(78) + "β”‚")
for ex in examples[rel_type]:
text_short = (ex.text[:35] + "...") if len(ex.text) > 35 else ex.text
lines.append("β”‚" + f"   '{text_short:<38}'".ljust(78) + "β”‚")
lines.append("β”‚" + f"   Expected: {ex.emotion:<12} β†’ Detected: {ex.detected_emotion:<12} {ex.emoji}".ljust(78) + "β”‚")
lines.append("β””" + "─" * 78 + "β”˜")
# Failed emotions
if results.failed_emotions:
lines.extend([
"",
"β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”",
"β”‚ ⚠️ FAILED EMOTIONS (>50% contradictory detections) β”‚",
"β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€",
])
for em in results.failed_emotions[:10]:
score = results.emotion_scores.get(em, 0)
lines.append(f"β”‚ ❌ {em:<25} Score: {score:.2f} β”‚")
lines.append("β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜")
return "\n".join(lines)
def generate_emotion_detail_table(self, results: WheelBenchmarkResults) -> str:
"""Generate detailed table for all emotions"""
lines = [
"# Emotion Recognition Detail Report",
"",
"| Emotion | Score | Exact | Similar | Adjacent | Opposite | Status |",
"|---------|-------|-------|---------|----------|----------|--------|",
]
for emotion in sorted(results.emotion_scores.keys()):
score = results.emotion_scores[emotion]
emotion_tests = results.emotion_results[emotion]
exact = sum(1 for r in emotion_tests if r.relationship == "exact")
same = sum(1 for r in emotion_tests if r.relationship == "same_category")
adj = sum(1 for r in emotion_tests if r.relationship == "adjacent")
opp = sum(1 for r in emotion_tests if r.relationship == "opposite")
total = len(emotion_tests)
if score >= 0.8:
status = "βœ… PASS"
elif score >= 0.5:
status = "⚠️ OK"
else:
status = "❌ FAIL"
lines.append(
f"| {emotion:<15} | {score:.2f} | "
f"{exact}/{total} | {same}/{total} | {adj}/{total} | {opp}/{total} | {status} |"
)
return "\n".join(lines)
def run_wheel_evaluation():
"""Run the wheel-based evaluation"""
print("=" * 80)
print("WHEEL-BASED EMOTION RECOGNITION EVALUATION")
print("=" * 80)
print()
# Import components
from avatar.sentiment_multi_emotion import MultiEmotionAnalyzer
from avatar.sentiment_emoji_map import EmojiMapper
from evaluation.emotion_test_suite_v3 import EmotionTestSuiteV3
# Initialize
print("Loading components...")
analyzer = MultiEmotionAnalyzer()
mapper = EmojiMapper()
suite = EmotionTestSuiteV3()
benchmark = WheelBenchmark(analyzer, mapper)
print(f"Test Suite V{suite.VERSION}")
print(f"Emotions: {suite.get_emotion_count()}, Tests: {suite.get_test_count()}")
print()
# Run benchmark
print("Running benchmark...")
start = time.time()
results = benchmark.run_benchmark(suite.EMOTION_TEST_DATA)
elapsed = time.time() - start
print(f"Completed in {elapsed:.2f}s")
print()
# Show report
print(benchmark.generate_detailed_report(results))
# Save detailed markdown report
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
report_dir = "evaluation/reports"
os.makedirs(report_dir, exist_ok=True)
# Save emotion detail table
detail_path = os.path.join(report_dir, f"wheel_evaluation_{timestamp}.md")
with open(detail_path, "w", encoding="utf-8") as f:
f.write(f"# Wheel-Based Emotion Evaluation Report\n\n")
f.write(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
f.write(f"## Summary\n\n")
f.write(f"- **Total Tests:** {results.total_tests}\n")
f.write(f"- **Weighted Accuracy:** {results.weighted_accuracy:.1%}\n")
f.write(f"- **Exact Match Rate:** {results.exact_accuracy:.1%}\n")
f.write(f"- **Acceptable Rate:** {results.acceptable_accuracy:.1%}\n")
f.write(f"- **Contradiction Rate:** {results.contradiction_rate:.1%}\n\n")
f.write(f"## Scoring Breakdown\n\n")
f.write(f"| Relationship | Count | Percentage | Score |\n")
f.write(f"|--------------|-------|------------|-------|\n")
f.write(f"| βœ… Exact | {results.exact_matches} | {results.exact_matches/results.total_tests*100:.1f}% | 1.0 |\n")
f.write(f"| 🟒 Same Category | {results.same_category_matches} | {results.same_category_matches/results.total_tests*100:.1f}% | 0.8 |\n")
f.write(f"| 🟑 Adjacent | {results.adjacent_matches} | {results.adjacent_matches/results.total_tests*100:.1f}% | 0.5 |\n")
f.write(f"| 🟠 Distant | {results.distant_matches} | {results.distant_matches/results.total_tests*100:.1f}% | 0.2 |\n")
f.write(f"| ❌ Opposite | {results.opposite_matches} | {results.opposite_matches/results.total_tests*100:.1f}% | 0.0 |\n\n")
f.write(benchmark.generate_emotion_detail_table(results))
print(f"\nπŸ“„ Detailed report saved to: {detail_path}")
return results
if __name__ == "__main__":
run_wheel_evaluation()