# -*- coding: utf-8 -*-
"""
Accuracy Benchmark - Speed and accuracy measurement for sentiment analysis
Measures:
- Detection speed (ms per text)
- Accuracy per emotion
- Overall accuracy by polarity
- Confusion matrix
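
Typical usage (a sketch; assumes the SentimentAnalyzer and EmojiMapper classes
imported in the __main__ block at the bottom of this file):

    analyzer = SentimentAnalyzer()
    mapper = EmojiMapper()
    benchmark = AccuracyBenchmark(analyzer, mapper)
    results = benchmark.run_benchmark(test_data)
    print(benchmark.get_accuracy_report(results))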
"""
import time
from typing import Dict, List, Tuple
from dataclasses import dataclass, field
from collections import defaultdict
@dataclass
class EmotionResult:
"""Result for a single emotion test"""
emotion: str
text: str
expected_polarity: str
detected_label: str
detected_polarity: str
is_correct: bool
inference_time_ms: float
emoji: str
@dataclass
class BenchmarkResults:
"""Aggregated benchmark results"""
total_tests: int = 0
correct_tests: int = 0
accuracy: float = 0.0
avg_inference_time_ms: float = 0.0
min_inference_time_ms: float = 0.0
max_inference_time_ms: float = 0.0
median_inference_time_ms: float = 0.0
emotion_accuracy: Dict[str, float] = field(default_factory=dict)
emotion_results: Dict[str, List[EmotionResult]] = field(default_factory=dict)
failed_emotions: List[str] = field(default_factory=list)
confusion_matrix: Dict[str, Dict[str, int]] = field(default_factory=dict)
class AccuracyBenchmark:
"""
Benchmark sentiment analysis accuracy and speed
Tests all emotions and generates accuracy metrics
"""
# Map transformer output to polarity
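    # (labels not listed here fall back to "neutral"; see _get_polarity)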
LABEL_TO_POLARITY = {
"happiness": "positive",
"sadness": "negative",
"positive": "positive",
"negative": "negative",
"neutral": "neutral",
"joy": "positive",
"anger": "negative",
"fear": "negative",
"surprise": "positive",
"disgust": "negative",
}
def __init__(self, analyzer, emoji_mapper):
"""
Initialize benchmark with analyzer and mapper
Args:
analyzer: SentimentAnalyzer instance
emoji_mapper: EmojiMapper instance
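
        Both arguments are duck-typed: any objects exposing the methods used
        below will work. A minimal sketch with hypothetical stand-ins:

            class StubAnalyzer:
                def analyze(self, text):
                    # must return a dict with at least a "label" key;
                    # an optional "polarity" key is used when present
                    return {"label": "happiness"}

            class StubMapper:
                def get_emoji(self, label):
                    return "🙂"

            benchmark = AccuracyBenchmark(StubAnalyzer(), StubMapper())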
"""
self.analyzer = analyzer
self.emoji_mapper = emoji_mapper
def _get_polarity(self, label: str) -> str:
"""Map emotion label to polarity (positive/negative/neutral)"""
return self.LABEL_TO_POLARITY.get(label.lower(), "neutral")
def run_single_test(self, text: str, expected_polarity: str, emotion: str) -> EmotionResult:
"""Run a single test and return result"""
# Time the inference
start_time = time.perf_counter()
result = self.analyzer.analyze(text)
end_time = time.perf_counter()
inference_time_ms = (end_time - start_time) * 1000
detected_label = result.get("label", "neutral")
# Use polarity from result if available, otherwise infer from label
if "polarity" in result:
detected_polarity = result["polarity"]
else:
detected_polarity = self._get_polarity(detected_label)
# Get emoji
emoji = self.emoji_mapper.get_emoji(detected_label)
# Check correctness (polarity match)
is_correct = detected_polarity == expected_polarity
return EmotionResult(
emotion=emotion,
text=text,
expected_polarity=expected_polarity,
detected_label=detected_label,
detected_polarity=detected_polarity,
is_correct=is_correct,
inference_time_ms=inference_time_ms,
emoji=emoji
)
def run_benchmark(self, test_data: Dict[str, List[Tuple[str, str]]]) -> BenchmarkResults:
"""
Run full benchmark on test data
Args:
test_data: Dict mapping emotion -> [(text, expected_polarity), ...]
Returns:
BenchmarkResults with all metrics
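
        Example (same shape as the mini test in the __main__ block):

            test_data = {
                "happiness": [("I am happy", "positive")],
                "sadness": [("I am sad", "negative")],
            }
            results = benchmark.run_benchmark(test_data)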
"""
results = BenchmarkResults()
results.emotion_results = defaultdict(list)
results.confusion_matrix = defaultdict(lambda: defaultdict(int))
all_times = []
for emotion, test_cases in test_data.items():
emotion_correct = 0
emotion_total = 0
for text, expected_polarity in test_cases:
result = self.run_single_test(text, expected_polarity, emotion)
results.emotion_results[emotion].append(result)
results.total_tests += 1
emotion_total += 1
all_times.append(result.inference_time_ms)
# Update confusion matrix
results.confusion_matrix[expected_polarity][result.detected_polarity] += 1
if result.is_correct:
results.correct_tests += 1
emotion_correct += 1
# Calculate per-emotion accuracy
if emotion_total > 0:
emotion_acc = emotion_correct / emotion_total
results.emotion_accuracy[emotion] = emotion_acc
if emotion_acc < 0.5: # Less than 50% accuracy
results.failed_emotions.append(emotion)
# Calculate overall metrics
if results.total_tests > 0:
results.accuracy = results.correct_tests / results.total_tests
if all_times:
all_times_sorted = sorted(all_times)
results.avg_inference_time_ms = sum(all_times) / len(all_times)
results.min_inference_time_ms = min(all_times)
results.max_inference_time_ms = max(all_times)
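            # Median of the sorted times: middle value, or the mean of the
            # two middle values when the count is even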
mid = len(all_times_sorted) // 2
if len(all_times_sorted) % 2 == 0:
results.median_inference_time_ms = (all_times_sorted[mid - 1] + all_times_sorted[mid]) / 2
else:
results.median_inference_time_ms = all_times_sorted[mid]
return results
def get_accuracy_report(self, results: BenchmarkResults) -> str:
"""Generate human-readable accuracy report"""
lines = [
"=" * 70,
"SENTIMENT ANALYSIS ACCURACY BENCHMARK",
"=" * 70,
"",
f"Total Tests: {results.total_tests}",
f"Correct: {results.correct_tests}",
f"Overall Accuracy: {results.accuracy:.1%}",
f"Avg Inference Time: {results.avg_inference_time_ms:.2f} ms",
"",
"-" * 70,
"ACCURACY BY EMOTION (sorted by accuracy)",
"-" * 70,
]
# Sort emotions by accuracy
sorted_emotions = sorted(
results.emotion_accuracy.items(),
key=lambda x: x[1],
reverse=True
)
for emotion, acc in sorted_emotions:
status = "✓" if acc >= 0.5 else "✗"
lines.append(f"{status} {emotion:25} {acc:6.1%}")
lines.extend([
"",
"-" * 70,
"CONFUSION MATRIX (expected → detected)",
"-" * 70,
])
# Print confusion matrix
polarities = ["positive", "negative", "neutral"]
        header = " " * 9 + " ".join(f"{p:>10}" for p in polarities)
lines.append(header)
for expected in polarities:
row = f"{expected:>8} "
for detected in polarities:
                count = results.confusion_matrix.get(expected, {}).get(detected, 0)
row += f"{count:>10} "
lines.append(row)
if results.failed_emotions:
lines.extend([
"",
"-" * 70,
f"FAILED EMOTIONS (< 50% accuracy): {len(results.failed_emotions)}",
"-" * 70,
])
for em in results.failed_emotions:
lines.append(f" ✗ {em}")
lines.append("=" * 70)
return "\n".join(lines)
if __name__ == "__main__":
# Quick test
from avatar import SentimentAnalyzer, EmojiMapper
analyzer = SentimentAnalyzer()
mapper = EmojiMapper()
benchmark = AccuracyBenchmark(analyzer, mapper)
# Mini test
test_data = {
"happiness": [
("I am happy", "positive"),
("I am good", "positive"),
],
"sadness": [
("I am sad", "negative"),
("I feel terrible", "negative"),
],
}
results = benchmark.run_benchmark(test_data)
print(benchmark.get_accuracy_report(results))