Spaces:

NLarchive
/

Emoji-AI-Avatar

Sleeping

File size: 6,769 Bytes

25e624c

# -*- coding: utf-8 -*-
"""
Run Comparative Evaluation - Compare models and test suites

Runs evaluations with:
1. Binary model (DistilBERT) + V1 test suite
2. Binary model (DistilBERT) + V2 test suite  
3. Multi-emotion model (RoBERTa) + V1 test suite
4. Multi-emotion model (RoBERTa) + V2 test suite

Generates comparison reports
"""

import sys
import os
import time
from datetime import datetime

# Add parent to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from avatar.sentiment_emoji_map import EmojiMapper
from evaluation.accuracy_benchmark import AccuracyBenchmark
from evaluation.report_generator import ReportGenerator


def _build_binary_analyzer():
    """Import and construct the binary sentiment analyzer.

    The import is done lazily so that a missing or broken dependency only
    affects the configurations that need this model.
    """
    from avatar.sentiment_transformer import SentimentAnalyzer as BinaryAnalyzer

    return BinaryAnalyzer()


def _build_multi_analyzer():
    """Import and construct the multi-emotion analyzer (lazy import)."""
    from avatar.sentiment_multi_emotion import MultiEmotionAnalyzer

    return MultiEmotionAnalyzer()


def _build_suite_v1():
    """Import and construct the V1 emotion test suite (lazy import)."""
    from evaluation.emotion_test_suite import EmotionTestSuite

    return EmotionTestSuite()


def _build_suite_v2():
    """Import and construct the V2 emotion test suite (lazy import)."""
    from evaluation.emotion_test_suite_v2 import EmotionTestSuiteV2

    return EmotionTestSuiteV2()


def _load_cached(cache, name, factory):
    """Return the component stored under *name*, building it on first request.

    Both outcomes of ``factory()`` are remembered in *cache*: a successfully
    built component is reused by later configurations, and a failure is
    re-raised with its original exception instead of being retried or
    surfacing later as an unrelated ``NameError``.

    Args:
        cache: Dict mapping component name to ``(component, error)`` pairs.
        name: Key identifying the component inside *cache*.
        factory: Zero-argument callable that builds the component.

    Returns:
        The built (or previously cached) component.

    Raises:
        Exception: Whatever ``factory()`` raised, on the first and on every
            subsequent request for the same *name*.
    """
    if name not in cache:
        try:
            cache[name] = (factory(), None)
        except Exception as exc:
            cache[name] = (None, exc)
    component, error = cache[name]
    if error is not None:
        raise error
    return component


def _run_configuration(get_analyzer, get_suite, mapper, hint=None):
    """Benchmark one analyzer/test-suite pair and print its progress lines.

    Args:
        get_analyzer: Zero-argument callable returning the analyzer.
        get_suite: Zero-argument callable returning the test suite.
        mapper: Emoji mapper handed to ``AccuracyBenchmark``.
        hint: Optional extra line printed after the error message when the
            configuration fails.

    Returns:
        The object returned by ``AccuracyBenchmark.run_benchmark``, or
        ``None`` when any step (loading, benchmarking, printing) raised.
    """
    # Best-effort by design: one broken configuration must not abort the
    # remaining ones, so any exception is reported and turned into ``None``.
    try:
        analyzer = get_analyzer()
        suite = get_suite()
        benchmark = AccuracyBenchmark(analyzer, mapper)

        print(f"   Emotions: {suite.get_emotion_count()}, Tests: {suite.get_test_count()}")

        # perf_counter is monotonic, so the elapsed time cannot be skewed by
        # wall-clock adjustments while the benchmark runs.
        start = time.perf_counter()
        result = benchmark.run_benchmark(suite.EMOTION_TEST_DATA)
        elapsed = time.perf_counter() - start

        print(f"   Accuracy: {result.accuracy:.1%}")
        print(f"   Time: {elapsed:.2f}s")
        print()
        return result
    except Exception as e:
        print(f"   ❌ Error: {e}")
        if hint:
            print(hint)
        return None


def run_comparative_evaluation():
    """Run evaluation comparing models and test suites.

    Benchmarks four configurations (binary / multi-emotion analyzer crossed
    with the V1 / V2 test suites), prints a comparison summary and the failed
    emotions per configuration, and saves one markdown report for every
    configuration that produced a result.

    Each analyzer and test suite is loaded independently and at most once, so
    a component that fails to load only disables the configurations that
    actually use it, and those configurations report the original load error.

    Returns:
        Dict with the keys ``"binary_v1"``, ``"binary_v2"``, ``"multi_v1"``
        and ``"multi_v2"``; each value is the benchmark result for that
        configuration, or ``None`` if the configuration failed.
    """
    banner = "=" * 80
    rule = "-" * 50

    print(banner)
    print("COMPARATIVE SENTIMENT ANALYSIS EVALUATION")
    print(banner)
    print()

    # Initialize common components
    mapper = EmojiMapper()
    report_gen = ReportGenerator(output_dir="evaluation/reports")

    builders = {
        "binary": _build_binary_analyzer,
        "multi": _build_multi_analyzer,
        "v1": _build_suite_v1,
        "v2": _build_suite_v2,
    }
    install_hint = "   (Install with: pip install transformers torch)"

    # (result key, progress title, summary label, analyzer, suite, error hint)
    plan = [
        ("binary_v1", "Binary Model + V1 Test Suite",
         "Binary + V1 Suite", "binary", "v1", None),
        ("binary_v2", "Binary Model + V2 Test Suite",
         "Binary + V2 Suite", "binary", "v2", None),
        ("multi_v1", "Multi-Emotion Model + V1 Test Suite",
         "Multi-Emotion + V1 Suite", "multi", "v1", install_hint),
        ("multi_v2", "Multi-Emotion Model + V2 Test Suite",
         "Multi-Emotion + V2 Suite", "multi", "v2", install_hint),
    ]

    cache = {}
    results = {}

    # ========================================
    # Run every configuration
    # ========================================
    for step, (key, title, _label, analyzer_name, suite_name, hint) in enumerate(plan, start=1):
        print(f"[{step}/{len(plan)}] {title}")
        print(rule)

        # Default arguments bind the current names; a plain closure would see
        # only the values of the last loop iteration.
        def get_analyzer(name=analyzer_name):
            return _load_cached(cache, name, builders[name])

        def get_suite(name=suite_name):
            return _load_cached(cache, name, builders[name])

        results[key] = _run_configuration(get_analyzer, get_suite, mapper, hint)

    # ========================================
    # Generate Comparison Report
    # ========================================
    print(banner)
    print("COMPARISON SUMMARY")
    print(banner)
    print()

    # Header, separator and rows share one set of widths so the table columns
    # always line up.
    widths = (26, 8, 8, 15)
    row_format = "| {:%d} | {:>%d} | {:>%d} | {:>%d} |" % widths
    print(row_format.format("Configuration", "Accuracy", "Avg Time", "Failed Emotions"))
    print("|" + "|".join("-" * (width + 2) for width in widths) + "|")

    configs = [(label, key) for key, _title, label, _a, _s, _h in plan]

    for name, key in configs:
        r = results.get(key)
        # Compare against None explicitly: a failed run is stored as None,
        # and the truthiness of a result object is not something to rely on.
        if r is not None:
            print(row_format.format(
                name,
                f"{r.accuracy:.1%}",
                f"{r.avg_inference_time_ms:.2f}ms",
                len(r.failed_emotions),
            ))
        else:
            print(row_format.format(name, "N/A", "N/A", "N/A"))

    print()

    # Show failed emotions comparison
    print("Failed Emotions by Configuration:")
    print(rule)

    for name, key in configs:
        r = results.get(key)
        if r is not None and r.failed_emotions:
            print(f"\n{name}:")
            for em in r.failed_emotions[:10]:  # Show first 10
                acc = r.emotion_accuracy.get(em, 0)
                print(f"   ❌ {em}: {acc:.1%}")

    # Save detailed reports
    print()
    print(banner)
    print("SAVING REPORTS")
    print(banner)

    # One timestamp for the whole run so all report files group together.
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    for name, key in configs:
        r = results.get(key)
        if r is not None:
            safe_name = key.replace("_", "-")
            md_path = report_gen.generate_markdown_report(
                r,
                filename=f"comparison_{safe_name}_{timestamp}.md",
            )
            print(f"   Saved: {md_path}")

    print()
    print(banner)
    print("EVALUATION COMPLETE")
    print(banner)

    return results


# Run the full comparison only when this file is executed as a script;
# importing the module does not start an evaluation. The results dictionary
# returned by run_comparative_evaluation() is discarded here.
if __name__ == "__main__":
    run_comparative_evaluation()