# -*- coding: utf-8 -*-
"""
Accuracy Benchmark - Speed and accuracy measurement for sentiment analysis

Measures:
- Detection speed (ms per text)
- Accuracy per emotion
- Overall accuracy by polarity
- Confusion matrix
"""
import time
from typing import Dict, List, Any, Tuple
from dataclasses import dataclass, field
from collections import defaultdict


@dataclass
class EmotionResult:
    """Result for a single emotion test"""
    emotion: str
    text: str
    expected_polarity: str
    detected_label: str
    detected_polarity: str
    is_correct: bool
    inference_time_ms: float
    emoji: str


@dataclass
class BenchmarkResults:
    """Aggregated benchmark results"""
    total_tests: int = 0
    correct_tests: int = 0
    accuracy: float = 0.0
    avg_inference_time_ms: float = 0.0
    min_inference_time_ms: float = 0.0
    max_inference_time_ms: float = 0.0
    median_inference_time_ms: float = 0.0
    emotion_accuracy: Dict[str, float] = field(default_factory=dict)
    emotion_results: Dict[str, List[EmotionResult]] = field(default_factory=dict)
    failed_emotions: List[str] = field(default_factory=list)
    confusion_matrix: Dict[str, Dict[str, int]] = field(default_factory=dict)


class AccuracyBenchmark:
    """
    Benchmark sentiment analysis accuracy and speed

    Tests all emotions and generates accuracy metrics
    """

    # Map transformer output to polarity
    LABEL_TO_POLARITY = {
        "happiness": "positive",
        "sadness": "negative",
        "positive": "positive",
        "negative": "negative",
        "neutral": "neutral",
        "joy": "positive",
        "anger": "negative",
        "fear": "negative",
        "surprise": "positive",
        "disgust": "negative",
    }
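    # The mapping above collapses fine-grained emotion labels into the three
    # polarities the benchmark scores against; labels not listed here fall back
    # to "neutral" in _get_polarity().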

    def __init__(self, analyzer, emoji_mapper):
        """
        Initialize benchmark with analyzer and mapper

        Args:
            analyzer: SentimentAnalyzer instance
            emoji_mapper: EmojiMapper instance
        """
        self.analyzer = analyzer
        self.emoji_mapper = emoji_mapper
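
    # Note: the benchmark is duck-typed. It only assumes that
    # analyzer.analyze(text) returns a dict with at least a "label" key
    # (and optionally a "polarity" key), and that
    # emoji_mapper.get_emoji(label) returns a string.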

    def _get_polarity(self, label: str) -> str:
        """Map emotion label to polarity (positive/negative/neutral)"""
        return self.LABEL_TO_POLARITY.get(label.lower(), "neutral")

    def run_single_test(self, text: str, expected_polarity: str, emotion: str) -> EmotionResult:
        """Run a single test and return result"""
        # Time the inference
        start_time = time.perf_counter()
        result = self.analyzer.analyze(text)
        end_time = time.perf_counter()
        inference_time_ms = (end_time - start_time) * 1000

        detected_label = result.get("label", "neutral")

        # Use polarity from result if available, otherwise infer from label
        if "polarity" in result:
            detected_polarity = result["polarity"]
        else:
            detected_polarity = self._get_polarity(detected_label)

        # Get emoji
        emoji = self.emoji_mapper.get_emoji(detected_label)

        # Check correctness (polarity match)
        is_correct = detected_polarity == expected_polarity

        return EmotionResult(
            emotion=emotion,
            text=text,
            expected_polarity=expected_polarity,
            detected_label=detected_label,
            detected_polarity=detected_polarity,
            is_correct=is_correct,
            inference_time_ms=inference_time_ms,
            emoji=emoji,
        )
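
    # Example (hypothetical values, assuming a configured benchmark instance):
    #   r = benchmark.run_single_test("I love this", "positive", "happiness")
    #   r.is_correct          -> True when the detected polarity is "positive"
    #   r.inference_time_ms   -> wall-clock duration of analyzer.analyze() in ms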

    def run_benchmark(self, test_data: Dict[str, List[Tuple[str, str]]]) -> BenchmarkResults:
        """
        Run full benchmark on test data

        Args:
            test_data: Dict mapping emotion -> [(text, expected_polarity), ...]

        Returns:
            BenchmarkResults with all metrics
        """
        results = BenchmarkResults()
        results.emotion_results = defaultdict(list)
        results.confusion_matrix = defaultdict(lambda: defaultdict(int))
        all_times = []

        for emotion, test_cases in test_data.items():
            emotion_correct = 0
            emotion_total = 0

            for text, expected_polarity in test_cases:
                result = self.run_single_test(text, expected_polarity, emotion)
                results.emotion_results[emotion].append(result)
                results.total_tests += 1
                emotion_total += 1
                all_times.append(result.inference_time_ms)

                # Update confusion matrix
                results.confusion_matrix[expected_polarity][result.detected_polarity] += 1

                if result.is_correct:
                    results.correct_tests += 1
                    emotion_correct += 1

            # Calculate per-emotion accuracy
            if emotion_total > 0:
                emotion_acc = emotion_correct / emotion_total
                results.emotion_accuracy[emotion] = emotion_acc
                if emotion_acc < 0.5:  # Less than 50% accuracy
                    results.failed_emotions.append(emotion)

        # Calculate overall metrics
        if results.total_tests > 0:
            results.accuracy = results.correct_tests / results.total_tests

        if all_times:
            all_times_sorted = sorted(all_times)
            results.avg_inference_time_ms = sum(all_times) / len(all_times)
            results.min_inference_time_ms = min(all_times)
            results.max_inference_time_ms = max(all_times)
            mid = len(all_times_sorted) // 2
            if len(all_times_sorted) % 2 == 0:
                results.median_inference_time_ms = (all_times_sorted[mid - 1] + all_times_sorted[mid]) / 2
            else:
                results.median_inference_time_ms = all_times_sorted[mid]

        return results

    def get_accuracy_report(self, results: BenchmarkResults) -> str:
        """Generate human-readable accuracy report"""
        lines = [
            "=" * 70,
            "SENTIMENT ANALYSIS ACCURACY BENCHMARK",
            "=" * 70,
            "",
            f"Total Tests: {results.total_tests}",
            f"Correct: {results.correct_tests}",
            f"Overall Accuracy: {results.accuracy:.1%}",
            f"Avg Inference Time: {results.avg_inference_time_ms:.2f} ms",
            "",
            "-" * 70,
            "ACCURACY BY EMOTION (sorted by accuracy)",
            "-" * 70,
        ]

        # Sort emotions by accuracy
        sorted_emotions = sorted(
            results.emotion_accuracy.items(),
            key=lambda x: x[1],
            reverse=True
        )
        for emotion, acc in sorted_emotions:
            status = "✓" if acc >= 0.5 else "✗"
            lines.append(f"{status} {emotion:25} {acc:6.1%}")

        lines.extend([
            "",
            "-" * 70,
            "CONFUSION MATRIX (expected → detected)",
            "-" * 70,
        ])

        # Print confusion matrix
        polarities = ["positive", "negative", "neutral"]
        header = " " * 9 + " ".join(f"{p:>10}" for p in polarities)
        lines.append(header)
        for expected in polarities:
            row = f"{expected:>8} "
            for detected in polarities:
                count = results.confusion_matrix[expected][detected]
                row += f"{count:>10} "
            lines.append(row)

        if results.failed_emotions:
            lines.extend([
                "",
                "-" * 70,
                f"FAILED EMOTIONS (< 50% accuracy): {len(results.failed_emotions)}",
                "-" * 70,
            ])
            for em in results.failed_emotions:
                lines.append(f"  ✗ {em}")

        lines.append("=" * 70)
        return "\n".join(lines)


if __name__ == "__main__":
    # Quick test
    from avatar import SentimentAnalyzer, EmojiMapper

    analyzer = SentimentAnalyzer()
    mapper = EmojiMapper()
    benchmark = AccuracyBenchmark(analyzer, mapper)

    # Mini test
    test_data = {
        "happiness": [
            ("I am happy", "positive"),
            ("I am good", "positive"),
        ],
        "sadness": [
            ("I am sad", "negative"),
            ("I feel terrible", "negative"),
        ],
    }

    results = benchmark.run_benchmark(test_data)
    print(benchmark.get_accuracy_report(results))