# -*- coding: utf-8 -*-
"""
Run Comparative Evaluation - Compare models and test suites

Runs evaluations with:
1. Binary model (DistilBERT) + V1 test suite
2. Binary model (DistilBERT) + V2 test suite
3. Multi-emotion model (RoBERTa) + V1 test suite
4. Multi-emotion model (RoBERTa) + V2 test suite

Generates comparison reports.
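
Usage (assumed invocation; the repository root is added to sys.path below, and
the multi-emotion runs additionally need `pip install transformers torch`):

    python evaluation/run_comparison.py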
"""
import sys
import os
import time
from datetime import datetime
# Add the repository root (parent of evaluation/) to sys.path so the avatar
# and evaluation packages resolve when this file is run directly as a script
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from avatar.sentiment_emoji_map import EmojiMapper
from evaluation.accuracy_benchmark import AccuracyBenchmark
from evaluation.report_generator import ReportGenerator


def run_comparative_evaluation():
"""Run evaluation comparing models and test suites"""
print("=" * 80)
print("COMPARATIVE SENTIMENT ANALYSIS EVALUATION")
print("=" * 80)
print()
# Initialize common components
mapper = EmojiMapper()
report_gen = ReportGenerator(output_dir="evaluation/reports")
results = {}
# ========================================
# Test 1: Binary Model + V1 Suite
# ========================================
print("[1/4] Binary Model + V1 Test Suite")
print("-" * 50)
    try:
        from avatar.sentiment_transformer import SentimentAnalyzer as BinaryAnalyzer
        from evaluation.emotion_test_suite import EmotionTestSuite
        binary_analyzer = BinaryAnalyzer()
        suite_v1 = EmotionTestSuite()
        benchmark_v1 = AccuracyBenchmark(binary_analyzer, mapper)
        print(f" Emotions: {suite_v1.get_emotion_count()}, Tests: {suite_v1.get_test_count()}")
        start = time.time()
        results["binary_v1"] = benchmark_v1.run_benchmark(suite_v1.EMOTION_TEST_DATA)
        elapsed = time.time() - start
        print(f" Accuracy: {results['binary_v1'].accuracy:.1%}")
        print(f" Time: {elapsed:.2f}s")
        print()
    except Exception as e:
        print(f" ❌ Error: {e}")
        results["binary_v1"] = None
    # ========================================
    # Test 2: Binary Model + V2 Suite
    # ========================================
    print("[2/4] Binary Model + V2 Test Suite")
    print("-" * 50)
    try:
        from evaluation.emotion_test_suite_v2 import EmotionTestSuiteV2
        suite_v2 = EmotionTestSuiteV2()
        benchmark_v2 = AccuracyBenchmark(binary_analyzer, mapper)
        print(f" Emotions: {suite_v2.get_emotion_count()}, Tests: {suite_v2.get_test_count()}")
        start = time.time()
        results["binary_v2"] = benchmark_v2.run_benchmark(suite_v2.EMOTION_TEST_DATA)
        elapsed = time.time() - start
        print(f" Accuracy: {results['binary_v2'].accuracy:.1%}")
        print(f" Time: {elapsed:.2f}s")
        print()
    except Exception as e:
        print(f" ❌ Error: {e}")
        results["binary_v2"] = None
    # ========================================
    # Test 3: Multi-Emotion Model + V1 Suite
    # ========================================
    print("[3/4] Multi-Emotion Model + V1 Test Suite")
    print("-" * 50)
    try:
        from avatar.sentiment_multi_emotion import MultiEmotionAnalyzer
        multi_analyzer = MultiEmotionAnalyzer()
        benchmark_multi_v1 = AccuracyBenchmark(multi_analyzer, mapper)
        print(f" Emotions: {suite_v1.get_emotion_count()}, Tests: {suite_v1.get_test_count()}")
        start = time.time()
        results["multi_v1"] = benchmark_multi_v1.run_benchmark(suite_v1.EMOTION_TEST_DATA)
        elapsed = time.time() - start
        print(f" Accuracy: {results['multi_v1'].accuracy:.1%}")
        print(f" Time: {elapsed:.2f}s")
        print()
    except Exception as e:
        print(f" ❌ Error: {e}")
        print(" (Install with: pip install transformers torch)")
        results["multi_v1"] = None
    # ========================================
    # Test 4: Multi-Emotion Model + V2 Suite
    # ========================================
    print("[4/4] Multi-Emotion Model + V2 Test Suite")
    print("-" * 50)
    try:
        benchmark_multi_v2 = AccuracyBenchmark(multi_analyzer, mapper)
        print(f" Emotions: {suite_v2.get_emotion_count()}, Tests: {suite_v2.get_test_count()}")
        start = time.time()
        results["multi_v2"] = benchmark_multi_v2.run_benchmark(suite_v2.EMOTION_TEST_DATA)
        elapsed = time.time() - start
        print(f" Accuracy: {results['multi_v2'].accuracy:.1%}")
        print(f" Time: {elapsed:.2f}s")
        print()
    except Exception as e:
        print(f" ❌ Error: {e}")
        results["multi_v2"] = None
    # ========================================
    # Generate Comparison Report
    # ========================================
    print("=" * 80)
    print("COMPARISON SUMMARY")
    print("=" * 80)
    print()
print("| Configuration | Accuracy | Avg Time | Failed Emotions |")
print("|---------------------------|----------|----------|-----------------|")
    configs = [
        ("Binary + V1 Suite", "binary_v1"),
        ("Binary + V2 Suite", "binary_v2"),
        ("Multi-Emotion + V1 Suite", "multi_v1"),
        ("Multi-Emotion + V2 Suite", "multi_v2"),
    ]
    for name, key in configs:
        r = results.get(key)
        if r:
            print(f"| {name:25} | {r.accuracy:7.1%} | {r.avg_inference_time_ms:6.2f}ms | {len(r.failed_emotions):15} |")
        else:
            print(f"| {name:25} | {'N/A':>7} | {'N/A':>8} | {'N/A':>15} |")
    print()
    # Show failed emotions comparison
    print("Failed Emotions by Configuration:")
    print("-" * 50)
    for name, key in configs:
        r = results.get(key)
        if r and r.failed_emotions:
            print(f"\n{name}:")
            for em in r.failed_emotions[:10]:  # Show first 10
                acc = r.emotion_accuracy.get(em, 0)
                print(f" ❌ {em}: {acc:.1%}")
    # Save detailed reports
    print()
    print("=" * 80)
    print("SAVING REPORTS")
    print("=" * 80)
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
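    # Write one Markdown report per successful configuration; the timestamp in
    # the filename keeps repeated runs from overwriting earlier reports.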
    for name, key in configs:
        r = results.get(key)
        if r:
            safe_name = key.replace("_", "-")
            md_path = report_gen.generate_markdown_report(
                r,
                filename=f"comparison_{safe_name}_{timestamp}.md"
            )
            print(f" Saved: {md_path}")
    print()
    print("=" * 80)
    print("EVALUATION COMPLETE")
    print("=" * 80)
    return results


if __name__ == "__main__":
    run_comparative_evaluation()