# -*- coding: utf-8 -*-
"""
Run Comparative Evaluation - Compare models and test suites

Runs evaluations with:
1. Binary model (DistilBERT) + V1 test suite
2. Binary model (DistilBERT) + V2 test suite
3. Multi-emotion model (RoBERTa) + V1 test suite
4. Multi-emotion model (RoBERTa) + V2 test suite

Generates comparison reports
"""
import sys
import os
import time
from datetime import datetime

# Add parent to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from avatar.sentiment_emoji_map import EmojiMapper
from evaluation.accuracy_benchmark import AccuracyBenchmark
from evaluation.report_generator import ReportGenerator


def run_comparative_evaluation():
    """Run evaluation comparing models and test suites"""
    print("=" * 80)
    print("COMPARATIVE SENTIMENT ANALYSIS EVALUATION")
    print("=" * 80)
    print()

    # Initialize common components
    mapper = EmojiMapper()
    report_gen = ReportGenerator(output_dir="evaluation/reports")

    results = {}

    # ========================================
    # Test 1: Binary Model + V1 Suite
    # ========================================
    print("[1/4] Binary Model + V1 Test Suite")
    print("-" * 50)
    try:
        from avatar.sentiment_transformer import SentimentAnalyzer as BinaryAnalyzer
        from evaluation.emotion_test_suite import EmotionTestSuite

        binary_analyzer = BinaryAnalyzer()
        suite_v1 = EmotionTestSuite()
        benchmark_v1 = AccuracyBenchmark(binary_analyzer, mapper)

        print(f" Emotions: {suite_v1.get_emotion_count()}, Tests: {suite_v1.get_test_count()}")

        start = time.time()
        results["binary_v1"] = benchmark_v1.run_benchmark(suite_v1.EMOTION_TEST_DATA)
        elapsed = time.time() - start

        print(f" Accuracy: {results['binary_v1'].accuracy:.1%}")
        print(f" Time: {elapsed:.2f}s")
        print()
    except Exception as e:
        print(f" ❌ Error: {e}")
        results["binary_v1"] = None

    # ========================================
    # Test 2: Binary Model + V2 Suite
    # ========================================
    print("[2/4] Binary Model + V2 Test Suite")
    print("-" * 50)
    try:
        from evaluation.emotion_test_suite_v2 import EmotionTestSuiteV2

        suite_v2 = EmotionTestSuiteV2()
        benchmark_v2 = AccuracyBenchmark(binary_analyzer, mapper)

        print(f" Emotions: {suite_v2.get_emotion_count()}, Tests: {suite_v2.get_test_count()}")

        start = time.time()
        results["binary_v2"] = benchmark_v2.run_benchmark(suite_v2.EMOTION_TEST_DATA)
        elapsed = time.time() - start

        print(f" Accuracy: {results['binary_v2'].accuracy:.1%}")
        print(f" Time: {elapsed:.2f}s")
        print()
    except Exception as e:
        print(f" ❌ Error: {e}")
        results["binary_v2"] = None

    # ========================================
    # Test 3: Multi-Emotion Model + V1 Suite
    # ========================================
    print("[3/4] Multi-Emotion Model + V1 Test Suite")
    print("-" * 50)
    try:
        from avatar.sentiment_multi_emotion import MultiEmotionAnalyzer

        multi_analyzer = MultiEmotionAnalyzer()
        benchmark_multi_v1 = AccuracyBenchmark(multi_analyzer, mapper)

        print(f" Emotions: {suite_v1.get_emotion_count()}, Tests: {suite_v1.get_test_count()}")

        start = time.time()
        results["multi_v1"] = benchmark_multi_v1.run_benchmark(suite_v1.EMOTION_TEST_DATA)
        elapsed = time.time() - start

        print(f" Accuracy: {results['multi_v1'].accuracy:.1%}")
        print(f" Time: {elapsed:.2f}s")
        print()
    except Exception as e:
        print(f" ❌ Error: {e}")
        print(" (Install with: pip install transformers torch)")
        results["multi_v1"] = None

    # ========================================
    # Test 4: Multi-Emotion Model + V2 Suite
    # ========================================
    print("[4/4] Multi-Emotion Model + V2 Test Suite")
    print("-" * 50)
    try:
        benchmark_multi_v2 = AccuracyBenchmark(multi_analyzer, mapper)

        print(f" Emotions: {suite_v2.get_emotion_count()}, Tests: {suite_v2.get_test_count()}")

        start = time.time()
        results["multi_v2"] = benchmark_multi_v2.run_benchmark(suite_v2.EMOTION_TEST_DATA)
        elapsed = time.time() - start

        print(f" Accuracy: {results['multi_v2'].accuracy:.1%}")
        print(f" Time: {elapsed:.2f}s")
        print()
    except Exception as e:
        print(f" ❌ Error: {e}")
        results["multi_v2"] = None

    # ========================================
    # Generate Comparison Report
    # ========================================
    print("=" * 80)
    print("COMPARISON SUMMARY")
    print("=" * 80)
    print()

    print("| Configuration             | Accuracy | Avg Time | Failed Emotions |")
    print("|---------------------------|----------|----------|-----------------|")

    configs = [
        ("Binary + V1 Suite", "binary_v1"),
        ("Binary + V2 Suite", "binary_v2"),
        ("Multi-Emotion + V1 Suite", "multi_v1"),
        ("Multi-Emotion + V2 Suite", "multi_v2"),
    ]

    for name, key in configs:
        r = results.get(key)
        if r:
            print(f"| {name:25} | {r.accuracy:7.1%} | {r.avg_inference_time_ms:6.2f}ms | {len(r.failed_emotions):15} |")
        else:
            print(f"| {name:25} | {'N/A':>7} | {'N/A':>8} | {'N/A':>15} |")

    print()

    # Show failed emotions comparison
    print("Failed Emotions by Configuration:")
    print("-" * 50)
    for name, key in configs:
        r = results.get(key)
        if r and r.failed_emotions:
            print(f"\n{name}:")
            for em in r.failed_emotions[:10]:  # Show first 10
                acc = r.emotion_accuracy.get(em, 0)
                print(f" ❌ {em}: {acc:.1%}")

    # Save detailed reports
    print()
    print("=" * 80)
    print("SAVING REPORTS")
    print("=" * 80)

    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    for name, key in configs:
        r = results.get(key)
        if r:
            safe_name = key.replace("_", "-")
            md_path = report_gen.generate_markdown_report(
                r, filename=f"comparison_{safe_name}_{timestamp}.md"
            )
            print(f" Saved: {md_path}")

    print()
    print("=" * 80)
    print("EVALUATION COMPLETE")
    print("=" * 80)

    return results


if __name__ == "__main__":
    run_comparative_evaluation()
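
# Usage sketch (the script's actual filename and on-disk location are not shown here,
# so the path below is an assumption; the sys.path insertion near the top expects this
# file to sit one directory below the project root, alongside the avatar/ and
# evaluation/ packages):
#
#   python evaluation/run_comparative_evaluation.py
#
# Markdown reports are written under evaluation/reports/ with names of the form
# comparison_<config>_<timestamp>.md.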