Spaces:

NLarchive
/

Emoji-AI-Avatar

Running

File size: 8,541 Bytes

25e624c

# -*- coding: utf-8 -*-
"""
Accuracy Benchmark - Speed and accuracy measurement for sentiment analysis

Measures:
- Detection speed (ms per text)
- Accuracy per emotion
- Overall accuracy by polarity
- Confusion matrix
"""

import time
from typing import Dict, List, Any, Tuple
from dataclasses import dataclass, field
from collections import defaultdict


@dataclass
class EmotionResult:
    """Outcome of running one test sentence through the analyzer.

    One instance is produced per test case by
    ``AccuracyBenchmark.run_single_test``.
    """
    # Emotion category (test-data key) the sentence was filed under.
    emotion: str
    # The input sentence that was analyzed.
    text: str
    # Polarity the test case expects the analyzer to report.
    expected_polarity: str
    # Raw label returned by the analyzer ("neutral" when the result has no label).
    detected_label: str
    # Polarity taken from the analyzer result, or inferred from the label.
    detected_polarity: str
    # True when detected_polarity equals expected_polarity.
    is_correct: bool
    # Wall-clock duration of the analyze() call, in milliseconds.
    inference_time_ms: float
    # Emoji the mapper returned for detected_label.
    emoji: str


@dataclass
class BenchmarkResults:
    """Aggregated metrics for one benchmark run.

    Filled in by ``AccuracyBenchmark.run_benchmark``; every field has a
    default so an empty run yields an all-zero / all-empty result.
    """
    # Number of test cases executed.
    total_tests: int = 0
    # Number of test cases whose detected polarity matched the expected one.
    correct_tests: int = 0
    # correct_tests / total_tests as a fraction in [0, 1] (0.0 for an empty run).
    accuracy: float = 0.0
    # Timing statistics over all analyze() calls, in milliseconds.
    avg_inference_time_ms: float = 0.0
    min_inference_time_ms: float = 0.0
    max_inference_time_ms: float = 0.0
    median_inference_time_ms: float = 0.0
    # emotion -> fraction of that emotion's test cases that were correct.
    emotion_accuracy: Dict[str, float] = field(default_factory=dict)
    # emotion -> individual EmotionResult objects, in execution order.
    emotion_results: Dict[str, List[EmotionResult]] = field(default_factory=dict)
    # Emotions whose accuracy fell below 50%.
    failed_emotions: List[str] = field(default_factory=list)
    # confusion_matrix[expected_polarity][detected_polarity] -> count.
    confusion_matrix: Dict[str, Dict[str, int]] = field(default_factory=dict)


class AccuracyBenchmark:
    """
    Benchmark sentiment analysis accuracy and speed

    Runs every test sentence through the analyzer, scores the detected
    polarity against the expected one and aggregates accuracy, timing and
    confusion-matrix statistics.
    """

    # Map transformer output to polarity
    LABEL_TO_POLARITY = {
        "happiness": "positive",
        "sadness": "negative",
        "positive": "positive",
        "negative": "negative",
        "neutral": "neutral",
        "joy": "positive",
        "anger": "negative",
        "fear": "negative",
        "surprise": "positive",
        "disgust": "negative",
    }

    # An emotion whose accuracy is below this fraction is reported as failed.
    FAIL_THRESHOLD = 0.5

    def __init__(self, analyzer, emoji_mapper):
        """
        Initialize benchmark with analyzer and mapper

        Args:
            analyzer: object exposing ``analyze(text)`` that returns a mapping
                with a ``"label"`` key and optionally a ``"polarity"`` key
            emoji_mapper: object exposing ``get_emoji(label)``
        """
        self.analyzer = analyzer
        self.emoji_mapper = emoji_mapper

    def _get_polarity(self, label: str) -> str:
        """
        Map emotion label to polarity (positive/negative/neutral)

        The lookup ignores case and surrounding whitespace. Unknown labels
        and non-string values (e.g. ``None``) map to ``"neutral"``.
        """
        if not isinstance(label, str):
            return "neutral"
        return self.LABEL_TO_POLARITY.get(label.strip().lower(), "neutral")

    def run_single_test(self, text: str, expected_polarity: str, emotion: str) -> "EmotionResult":
        """
        Run a single test and return result

        Args:
            text: sentence to analyze
            expected_polarity: polarity the analyzer is expected to report
            emotion: emotion category the sentence belongs to

        Returns:
            EmotionResult describing what was detected, whether the polarity
            matched, and how long the ``analyze`` call took.
        """
        # Time only the analyzer call so emoji lookup is not counted.
        start_time = time.perf_counter()
        result = self.analyzer.analyze(text)
        end_time = time.perf_counter()

        inference_time_ms = (end_time - start_time) * 1000

        # A missing *or empty/None* label is treated as neutral.
        detected_label = result.get("label") or "neutral"

        # Use polarity from result if available; a missing or empty/None
        # polarity falls back to inference from the label.
        detected_polarity = result.get("polarity") or self._get_polarity(detected_label)

        emoji = self.emoji_mapper.get_emoji(detected_label)

        # Correctness is judged on polarity only, not on the exact label.
        is_correct = detected_polarity == expected_polarity

        return EmotionResult(
            emotion=emotion,
            text=text,
            expected_polarity=expected_polarity,
            detected_label=detected_label,
            detected_polarity=detected_polarity,
            is_correct=is_correct,
            inference_time_ms=inference_time_ms,
            emoji=emoji
        )

    def run_benchmark(self, test_data: Dict[str, List[Tuple[str, str]]]) -> "BenchmarkResults":
        """
        Run full benchmark on test data

        Args:
            test_data: Dict mapping emotion -> [(text, expected_polarity), ...]

        Returns:
            BenchmarkResults with all metrics. ``emotion_results`` and
            ``confusion_matrix`` are defaultdicts. Emotions with an empty
            test list get no accuracy entry.
        """
        results = BenchmarkResults()
        results.emotion_results = defaultdict(list)
        results.confusion_matrix = defaultdict(lambda: defaultdict(int))

        all_times = []

        for emotion, test_cases in test_data.items():
            emotion_correct = 0
            emotion_total = 0

            for text, expected_polarity in test_cases:
                result = self.run_single_test(text, expected_polarity, emotion)
                results.emotion_results[emotion].append(result)

                results.total_tests += 1
                emotion_total += 1
                all_times.append(result.inference_time_ms)

                # Update confusion matrix
                results.confusion_matrix[expected_polarity][result.detected_polarity] += 1

                if result.is_correct:
                    results.correct_tests += 1
                    emotion_correct += 1

            # Calculate per-emotion accuracy (skipped for empty test lists
            # to avoid dividing by zero).
            if emotion_total > 0:
                emotion_acc = emotion_correct / emotion_total
                results.emotion_accuracy[emotion] = emotion_acc

                if emotion_acc < self.FAIL_THRESHOLD:
                    results.failed_emotions.append(emotion)

        # Calculate overall metrics
        if results.total_tests > 0:
            results.accuracy = results.correct_tests / results.total_tests

        if all_times:
            all_times_sorted = sorted(all_times)
            count = len(all_times_sorted)
            results.avg_inference_time_ms = sum(all_times_sorted) / count
            # The list is sorted, so the extremes are its two ends.
            results.min_inference_time_ms = all_times_sorted[0]
            results.max_inference_time_ms = all_times_sorted[-1]
            mid = count // 2
            if count % 2 == 0:
                # Even count: median is the mean of the two middle values.
                results.median_inference_time_ms = (all_times_sorted[mid - 1] + all_times_sorted[mid]) / 2
            else:
                results.median_inference_time_ms = all_times_sorted[mid]

        return results

    def get_accuracy_report(self, results: "BenchmarkResults") -> str:
        """
        Generate human-readable accuracy report

        Args:
            results: metrics as produced by ``run_benchmark``

        Returns:
            Multi-line report: summary, per-emotion accuracy (best first),
            the positive/negative/neutral confusion matrix and, if any, the
            list of failed emotions.

        The confusion matrix is read without modifying it, and missing rows
        or cells are shown as 0, so plain-dict matrices (including an empty
        one) are accepted.
        """
        lines = [
            "=" * 70,
            "SENTIMENT ANALYSIS ACCURACY BENCHMARK",
            "=" * 70,
            "",
            f"Total Tests: {results.total_tests}",
            f"Correct: {results.correct_tests}",
            f"Overall Accuracy: {results.accuracy:.1%}",
            f"Avg Inference Time: {results.avg_inference_time_ms:.2f} ms",
            "",
            "-" * 70,
            "ACCURACY BY EMOTION (sorted by accuracy)",
            "-" * 70,
        ]

        # Sort emotions by accuracy
        sorted_emotions = sorted(
            results.emotion_accuracy.items(),
            key=lambda x: x[1],
            reverse=True
        )

        for emotion, acc in sorted_emotions:
            status = "✓" if acc >= self.FAIL_THRESHOLD else "✗"
            lines.append(f"{status} {emotion:25} {acc:6.1%}")

        lines.extend([
            "",
            "-" * 70,
            "CONFUSION MATRIX (expected → detected)",
            "-" * 70,
        ])

        # Print confusion matrix
        polarities = ["positive", "negative", "neutral"]
        header = "         " + " ".join(f"{p:>10}" for p in polarities)
        lines.append(header)

        for expected in polarities:
            # .get() never triggers a defaultdict factory, so reading the
            # matrix cannot insert phantom rows/cells, and a plain dict with
            # missing keys does not raise KeyError.
            detected_counts = results.confusion_matrix.get(expected, {})
            row = f"{expected:>8} "
            for detected in polarities:
                count = detected_counts.get(detected, 0)
                row += f"{count:>10} "
            lines.append(row)

        if results.failed_emotions:
            lines.extend([
                "",
                "-" * 70,
                f"FAILED EMOTIONS (< {self.FAIL_THRESHOLD:.0%} accuracy): {len(results.failed_emotions)}",
                "-" * 70,
            ])
            for em in results.failed_emotions:
                lines.append(f"  ✗ {em}")

        lines.append("=" * 70)

        return "\n".join(lines)


if __name__ == "__main__":
    # Smoke run: benchmark a handful of sentences and print the report.
    from avatar import SentimentAnalyzer, EmojiMapper

    # Analyzer is constructed before the mapper (left-to-right argument
    # evaluation), then both are handed to the benchmark.
    bench = AccuracyBenchmark(SentimentAnalyzer(), EmojiMapper())

    # emotion -> [(sentence, expected polarity), ...]
    sample_cases = {
        "happiness": [(sentence, "positive") for sentence in ("I am happy", "I am good")],
        "sadness": [(sentence, "negative") for sentence in ("I am sad", "I feel terrible")],
    }

    report = bench.get_accuracy_report(bench.run_benchmark(sample_cases))
    print(report)