# -*- coding: utf-8 -*-
"""
Accuracy Benchmark - Speed and accuracy measurement for sentiment analysis

Measures:
- Detection speed (ms per text)
- Accuracy per emotion
- Overall accuracy by polarity
- Confusion matrix
"""

import time
from typing import Dict, List, Tuple
from dataclasses import dataclass, field
from collections import defaultdict


@dataclass
class EmotionResult:
    """Result for a single emotion test"""
    emotion: str
    text: str
    expected_polarity: str
    detected_label: str
    detected_polarity: str
    is_correct: bool
    inference_time_ms: float
    emoji: str


@dataclass
class BenchmarkResults:
    """Aggregated benchmark results"""
    total_tests: int = 0
    correct_tests: int = 0
    accuracy: float = 0.0
    avg_inference_time_ms: float = 0.0
    min_inference_time_ms: float = 0.0
    max_inference_time_ms: float = 0.0
    median_inference_time_ms: float = 0.0
    emotion_accuracy: Dict[str, float] = field(default_factory=dict)
    emotion_results: Dict[str, List[EmotionResult]] = field(default_factory=dict)
    failed_emotions: List[str] = field(default_factory=list)
    confusion_matrix: Dict[str, Dict[str, int]] = field(default_factory=dict)


class AccuracyBenchmark:
    """
    Benchmark sentiment analysis accuracy and speed
    
    Tests all emotions and generates accuracy metrics
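
    Typical usage (a sketch; assumes the SentimentAnalyzer and EmojiMapper
    from the avatar package, as in the quick test at the bottom of this module):

        benchmark = AccuracyBenchmark(SentimentAnalyzer(), EmojiMapper())
        results = benchmark.run_benchmark(test_data)
        print(benchmark.get_accuracy_report(results))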
    """
    
    # Map transformer output to polarity
    LABEL_TO_POLARITY = {
        "happiness": "positive",
        "sadness": "negative",
        "positive": "positive",
        "negative": "negative",
        "neutral": "neutral",
        "joy": "positive",
        "anger": "negative",
        "fear": "negative",
        "surprise": "positive",
        "disgust": "negative",
    }
    
    def __init__(self, analyzer, emoji_mapper):
        """
        Initialize benchmark with analyzer and mapper
        
        Args:
            analyzer: SentimentAnalyzer instance
            emoji_mapper: EmojiMapper instance
        """
        self.analyzer = analyzer
        self.emoji_mapper = emoji_mapper
    
    def _get_polarity(self, label: str) -> str:
        """Map emotion label to polarity (positive/negative/neutral)"""
        return self.LABEL_TO_POLARITY.get(label.lower(), "neutral")
    
    def run_single_test(self, text: str, expected_polarity: str, emotion: str) -> EmotionResult:
        """Run a single test and return result"""
        # Time the inference
        start_time = time.perf_counter()
        result = self.analyzer.analyze(text)
        end_time = time.perf_counter()
        
        inference_time_ms = (end_time - start_time) * 1000
        
        detected_label = result.get("label", "neutral")
        
        # Use polarity from result if available, otherwise infer from label
        if "polarity" in result:
            detected_polarity = result["polarity"]
        else:
            detected_polarity = self._get_polarity(detected_label)
        
        # Get emoji
        emoji = self.emoji_mapper.get_emoji(detected_label)
        
        # Check correctness (polarity match)
        is_correct = detected_polarity == expected_polarity
        
        return EmotionResult(
            emotion=emotion,
            text=text,
            expected_polarity=expected_polarity,
            detected_label=detected_label,
            detected_polarity=detected_polarity,
            is_correct=is_correct,
            inference_time_ms=inference_time_ms,
            emoji=emoji
        )
    
    def run_benchmark(self, test_data: Dict[str, List[Tuple[str, str]]]) -> BenchmarkResults:
        """
        Run full benchmark on test data
        
        Args:
            test_data: Dict mapping emotion -> [(text, expected_polarity), ...]
            
        Returns:
            BenchmarkResults with all metrics
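
        Example test_data (same shape as the quick test in __main__;
        the texts and expected polarities are illustrative):

            test_data = {
                "happiness": [("I am happy", "positive")],
                "sadness": [("I am sad", "negative")],
            }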
        """
        results = BenchmarkResults()
        results.emotion_results = defaultdict(list)
        results.confusion_matrix = defaultdict(lambda: defaultdict(int))
        
        all_times = []
        
        for emotion, test_cases in test_data.items():
            emotion_correct = 0
            emotion_total = 0
            
            for text, expected_polarity in test_cases:
                result = self.run_single_test(text, expected_polarity, emotion)
                results.emotion_results[emotion].append(result)
                
                results.total_tests += 1
                emotion_total += 1
                all_times.append(result.inference_time_ms)
                
                # Update confusion matrix
                results.confusion_matrix[expected_polarity][result.detected_polarity] += 1
                
                if result.is_correct:
                    results.correct_tests += 1
                    emotion_correct += 1
            
            # Calculate per-emotion accuracy
            if emotion_total > 0:
                emotion_acc = emotion_correct / emotion_total
                results.emotion_accuracy[emotion] = emotion_acc
                
                if emotion_acc < 0.5:  # Less than 50% accuracy
                    results.failed_emotions.append(emotion)
        
        # Calculate overall metrics
        if results.total_tests > 0:
            results.accuracy = results.correct_tests / results.total_tests
        
        if all_times:
            all_times_sorted = sorted(all_times)
            results.avg_inference_time_ms = sum(all_times) / len(all_times)
            results.min_inference_time_ms = min(all_times)
            results.max_inference_time_ms = max(all_times)
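            # Median: mean of the two middle values when the count is even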
            mid = len(all_times_sorted) // 2
            if len(all_times_sorted) % 2 == 0:
                results.median_inference_time_ms = (all_times_sorted[mid - 1] + all_times_sorted[mid]) / 2
            else:
                results.median_inference_time_ms = all_times_sorted[mid]
        
        return results
    
    def get_accuracy_report(self, results: BenchmarkResults) -> str:
        """Generate human-readable accuracy report"""
        lines = [
            "=" * 70,
            "SENTIMENT ANALYSIS ACCURACY BENCHMARK",
            "=" * 70,
            "",
            f"Total Tests: {results.total_tests}",
            f"Correct: {results.correct_tests}",
            f"Overall Accuracy: {results.accuracy:.1%}",
            f"Avg Inference Time: {results.avg_inference_time_ms:.2f} ms",
            "",
            "-" * 70,
            "ACCURACY BY EMOTION (sorted by accuracy)",
            "-" * 70,
        ]
        
        # Sort emotions by accuracy
        sorted_emotions = sorted(
            results.emotion_accuracy.items(),
            key=lambda x: x[1],
            reverse=True
        )
        
        for emotion, acc in sorted_emotions:
            status = "✓" if acc >= 0.5 else "✗"
            lines.append(f"{status} {emotion:25} {acc:6.1%}")
        
        lines.extend([
            "",
            "-" * 70,
            "CONFUSION MATRIX (expected → detected)",
            "-" * 70,
        ])
        
        # Print confusion matrix
        polarities = ["positive", "negative", "neutral"]
        header = "         " + " ".join(f"{p:>10}" for p in polarities)
        lines.append(header)
        
        for expected in polarities:
            row = f"{expected:>8} "
            for detected in polarities:
                count = results.confusion_matrix[expected][detected]
                row += f"{count:>10} "
            lines.append(row)
        
        if results.failed_emotions:
            lines.extend([
                "",
                "-" * 70,
                f"FAILED EMOTIONS (< 50% accuracy): {len(results.failed_emotions)}",
                "-" * 70,
            ])
            for em in results.failed_emotions:
                lines.append(f"  ✗ {em}")
        
        lines.append("=" * 70)
        
        return "\n".join(lines)


if __name__ == "__main__":
    # Quick test
    from avatar import SentimentAnalyzer, EmojiMapper
    
    analyzer = SentimentAnalyzer()
    mapper = EmojiMapper()
    benchmark = AccuracyBenchmark(analyzer, mapper)
    
    # Mini test
    test_data = {
        "happiness": [
            ("I am happy", "positive"),
            ("I am good", "positive"),
        ],
        "sadness": [
            ("I am sad", "negative"),
            ("I feel terrible", "negative"),
        ],
    }
    
    results = benchmark.run_benchmark(test_data)
    print(benchmark.get_accuracy_report(results))