# -*- coding: utf-8 -*-
"""
Accuracy Benchmark - Speed and accuracy measurement for sentiment analysis
Measures:
- Detection speed (ms per text)
- Accuracy per emotion
- Overall accuracy by polarity
- Confusion matrix
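
Typical usage (a sketch; assumes the SentimentAnalyzer and EmojiMapper classes
imported in the __main__ block at the bottom of this file):

    analyzer = SentimentAnalyzer()
    mapper = EmojiMapper()
    benchmark = AccuracyBenchmark(analyzer, mapper)
    results = benchmark.run_benchmark(test_data)
    print(benchmark.get_accuracy_report(results))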
"""
import time
from typing import Dict, List, Tuple
from dataclasses import dataclass, field
from collections import defaultdict
@dataclass
class EmotionResult:
"""Result for a single emotion test"""
emotion: str
text: str
expected_polarity: str
detected_label: str
detected_polarity: str
is_correct: bool
inference_time_ms: float
emoji: str
@dataclass
class BenchmarkResults:
"""Aggregated benchmark results"""
total_tests: int = 0
correct_tests: int = 0
accuracy: float = 0.0
avg_inference_time_ms: float = 0.0
min_inference_time_ms: float = 0.0
max_inference_time_ms: float = 0.0
median_inference_time_ms: float = 0.0
emotion_accuracy: Dict[str, float] = field(default_factory=dict)
emotion_results: Dict[str, List[EmotionResult]] = field(default_factory=dict)
failed_emotions: List[str] = field(default_factory=list)
confusion_matrix: Dict[str, Dict[str, int]] = field(default_factory=dict)
class AccuracyBenchmark:
"""
Benchmark sentiment analysis accuracy and speed
Tests all emotions and generates accuracy metrics
"""
# Map transformer output to polarity
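    # (labels not listed here fall back to "neutral"; see _get_polarity)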
LABEL_TO_POLARITY = {
"happiness": "positive",
"sadness": "negative",
"positive": "positive",
"negative": "negative",
"neutral": "neutral",
"joy": "positive",
"anger": "negative",
"fear": "negative",
"surprise": "positive",
"disgust": "negative",
}
def __init__(self, analyzer, emoji_mapper):
"""
Initialize benchmark with analyzer and mapper
Args:
analyzer: SentimentAnalyzer instance
emoji_mapper: EmojiMapper instance
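
        Both arguments are duck-typed: any objects exposing the methods used
        below will work. A minimal sketch with hypothetical stand-ins:

            class StubAnalyzer:
                def analyze(self, text):
                    # must return a dict with at least a "label" key;
                    # an optional "polarity" key is used when present
                    return {"label": "happiness"}

            class StubMapper:
                def get_emoji(self, label):
                    return "🙂"

            benchmark = AccuracyBenchmark(StubAnalyzer(), StubMapper())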
"""
self.analyzer = analyzer
self.emoji_mapper = emoji_mapper
def _get_polarity(self, label: str) -> str:
"""Map emotion label to polarity (positive/negative/neutral)"""
return self.LABEL_TO_POLARITY.get(label.lower(), "neutral")
def run_single_test(self, text: str, expected_polarity: str, emotion: str) -> EmotionResult:
"""Run a single test and return result"""
# Time the inference
start_time = time.perf_counter()
result = self.analyzer.analyze(text)
end_time = time.perf_counter()
inference_time_ms = (end_time - start_time) * 1000
detected_label = result.get("label", "neutral")
# Use polarity from result if available, otherwise infer from label
if "polarity" in result:
detected_polarity = result["polarity"]
else:
detected_polarity = self._get_polarity(detected_label)
# Get emoji
emoji = self.emoji_mapper.get_emoji(detected_label)
# Check correctness (polarity match)
is_correct = detected_polarity == expected_polarity
return EmotionResult(
emotion=emotion,
text=text,
expected_polarity=expected_polarity,
detected_label=detected_label,
detected_polarity=detected_polarity,
is_correct=is_correct,
inference_time_ms=inference_time_ms,
emoji=emoji
)
def run_benchmark(self, test_data: Dict[str, List[Tuple[str, str]]]) -> BenchmarkResults:
"""
Run full benchmark on test data
Args:
test_data: Dict mapping emotion -> [(text, expected_polarity), ...]
Returns:
BenchmarkResults with all metrics
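
        Example (same shape as the mini test in the __main__ block):

            test_data = {
                "happiness": [("I am happy", "positive")],
                "sadness": [("I am sad", "negative")],
            }
            results = benchmark.run_benchmark(test_data)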
"""
results = BenchmarkResults()
results.emotion_results = defaultdict(list)
results.confusion_matrix = defaultdict(lambda: defaultdict(int))
all_times = []
for emotion, test_cases in test_data.items():
emotion_correct = 0
emotion_total = 0
for text, expected_polarity in test_cases:
result = self.run_single_test(text, expected_polarity, emotion)
results.emotion_results[emotion].append(result)
results.total_tests += 1
emotion_total += 1
all_times.append(result.inference_time_ms)
# Update confusion matrix
results.confusion_matrix[expected_polarity][result.detected_polarity] += 1
if result.is_correct:
results.correct_tests += 1
emotion_correct += 1
# Calculate per-emotion accuracy
if emotion_total > 0:
emotion_acc = emotion_correct / emotion_total
results.emotion_accuracy[emotion] = emotion_acc
if emotion_acc < 0.5: # Less than 50% accuracy
results.failed_emotions.append(emotion)
# Calculate overall metrics
if results.total_tests > 0:
results.accuracy = results.correct_tests / results.total_tests
if all_times:
all_times_sorted = sorted(all_times)
results.avg_inference_time_ms = sum(all_times) / len(all_times)
results.min_inference_time_ms = min(all_times)
results.max_inference_time_ms = max(all_times)
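            # Median of the sorted times: middle value, or the mean of the
            # two middle values when the count is even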
mid = len(all_times_sorted) // 2
if len(all_times_sorted) % 2 == 0:
results.median_inference_time_ms = (all_times_sorted[mid - 1] + all_times_sorted[mid]) / 2
else:
results.median_inference_time_ms = all_times_sorted[mid]
return results
def get_accuracy_report(self, results: BenchmarkResults) -> str:
"""Generate human-readable accuracy report"""
lines = [
"=" * 70,
"SENTIMENT ANALYSIS ACCURACY BENCHMARK",
"=" * 70,
"",
f"Total Tests: {results.total_tests}",
f"Correct: {results.correct_tests}",
f"Overall Accuracy: {results.accuracy:.1%}",
f"Avg Inference Time: {results.avg_inference_time_ms:.2f} ms",
"",
"-" * 70,
"ACCURACY BY EMOTION (sorted by accuracy)",
"-" * 70,
]
# Sort emotions by accuracy
sorted_emotions = sorted(
results.emotion_accuracy.items(),
key=lambda x: x[1],
reverse=True
)
for emotion, acc in sorted_emotions:
status = "✓" if acc >= 0.5 else "✗"
lines.append(f"{status} {emotion:25} {acc:6.1%}")
lines.extend([
"",
"-" * 70,
"CONFUSION MATRIX (expected → detected)",
"-" * 70,
])
# Print confusion matrix
polarities = ["positive", "negative", "neutral"]
        header = " " * 9 + " ".join(f"{p:>10}" for p in polarities)
lines.append(header)
for expected in polarities:
row = f"{expected:>8} "
for detected in polarities:
                count = results.confusion_matrix.get(expected, {}).get(detected, 0)
row += f"{count:>10} "
lines.append(row)
if results.failed_emotions:
lines.extend([
"",
"-" * 70,
f"FAILED EMOTIONS (< 50% accuracy): {len(results.failed_emotions)}",
"-" * 70,
])
for em in results.failed_emotions:
lines.append(f" ✗ {em}")
lines.append("=" * 70)
return "\n".join(lines)
if __name__ == "__main__":
# Quick test
from avatar import SentimentAnalyzer, EmojiMapper
analyzer = SentimentAnalyzer()
mapper = EmojiMapper()
benchmark = AccuracyBenchmark(analyzer, mapper)
# Mini test
test_data = {
"happiness": [
("I am happy", "positive"),
("I am good", "positive"),
],
"sadness": [
("I am sad", "negative"),
("I feel terrible", "negative"),
],
}
results = benchmark.run_benchmark(test_data)
print(benchmark.get_accuracy_report(results))