# -*- coding: utf-8 -*-
"""
Run Comparative Evaluation - Compare models and test suites

Runs evaluations with:
1. Binary model (DistilBERT) + V1 test suite
2. Binary model (DistilBERT) + V2 test suite
3. Multi-emotion model (RoBERTa) + V1 test suite
4. Multi-emotion model (RoBERTa) + V2 test suite

Generates comparison reports.
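
Usage (assumed invocation; the repository root is added to sys.path below, and
the multi-emotion runs additionally need `pip install transformers torch`):

    python evaluation/run_comparison.py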
"""
import sys
import os
import time
from datetime import datetime
# Add the repository root (parent of evaluation/) to sys.path so the avatar
# and evaluation packages resolve when this file is run directly as a script
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from avatar.sentiment_emoji_map import EmojiMapper
from evaluation.accuracy_benchmark import AccuracyBenchmark
from evaluation.report_generator import ReportGenerator


def run_comparative_evaluation():
"""Run evaluation comparing models and test suites"""
print("=" * 80)
print("COMPARATIVE SENTIMENT ANALYSIS EVALUATION")
print("=" * 80)
print()
# Initialize common components
mapper = EmojiMapper()
report_gen = ReportGenerator(output_dir="evaluation/reports")
results = {}
# ========================================
# Test 1: Binary Model + V1 Suite
# ========================================
print("[1/4] Binary Model + V1 Test Suite")
print("-" * 50)
    try:
        from avatar.sentiment_transformer import SentimentAnalyzer as BinaryAnalyzer
        from evaluation.emotion_test_suite import EmotionTestSuite
        binary_analyzer = BinaryAnalyzer()
        suite_v1 = EmotionTestSuite()
        benchmark_v1 = AccuracyBenchmark(binary_analyzer, mapper)
        print(f" Emotions: {suite_v1.get_emotion_count()}, Tests: {suite_v1.get_test_count()}")
        start = time.time()
        results["binary_v1"] = benchmark_v1.run_benchmark(suite_v1.EMOTION_TEST_DATA)
        elapsed = time.time() - start
        print(f" Accuracy: {results['binary_v1'].accuracy:.1%}")
        print(f" Time: {elapsed:.2f}s")
        print()
    except Exception as e:
        print(f" ❌ Error: {e}")
        results["binary_v1"] = None
    # ========================================
    # Test 2: Binary Model + V2 Suite
    # ========================================
    print("[2/4] Binary Model + V2 Test Suite")
    print("-" * 50)
    try:
        from evaluation.emotion_test_suite_v2 import EmotionTestSuiteV2
        suite_v2 = EmotionTestSuiteV2()
        benchmark_v2 = AccuracyBenchmark(binary_analyzer, mapper)
        print(f" Emotions: {suite_v2.get_emotion_count()}, Tests: {suite_v2.get_test_count()}")
        start = time.time()
        results["binary_v2"] = benchmark_v2.run_benchmark(suite_v2.EMOTION_TEST_DATA)
        elapsed = time.time() - start
        print(f" Accuracy: {results['binary_v2'].accuracy:.1%}")
        print(f" Time: {elapsed:.2f}s")
        print()
    except Exception as e:
        print(f" ❌ Error: {e}")
        results["binary_v2"] = None
    # ========================================
    # Test 3: Multi-Emotion Model + V1 Suite
    # ========================================
    print("[3/4] Multi-Emotion Model + V1 Test Suite")
    print("-" * 50)
    try:
        from avatar.sentiment_multi_emotion import MultiEmotionAnalyzer
        multi_analyzer = MultiEmotionAnalyzer()
        benchmark_multi_v1 = AccuracyBenchmark(multi_analyzer, mapper)
        print(f" Emotions: {suite_v1.get_emotion_count()}, Tests: {suite_v1.get_test_count()}")
        start = time.time()
        results["multi_v1"] = benchmark_multi_v1.run_benchmark(suite_v1.EMOTION_TEST_DATA)
        elapsed = time.time() - start
        print(f" Accuracy: {results['multi_v1'].accuracy:.1%}")
        print(f" Time: {elapsed:.2f}s")
        print()
    except Exception as e:
        print(f" ❌ Error: {e}")
        print(" (Install with: pip install transformers torch)")
        results["multi_v1"] = None
    # ========================================
    # Test 4: Multi-Emotion Model + V2 Suite
    # ========================================
    print("[4/4] Multi-Emotion Model + V2 Test Suite")
    print("-" * 50)
    try:
        benchmark_multi_v2 = AccuracyBenchmark(multi_analyzer, mapper)
        print(f" Emotions: {suite_v2.get_emotion_count()}, Tests: {suite_v2.get_test_count()}")
        start = time.time()
        results["multi_v2"] = benchmark_multi_v2.run_benchmark(suite_v2.EMOTION_TEST_DATA)
        elapsed = time.time() - start
        print(f" Accuracy: {results['multi_v2'].accuracy:.1%}")
        print(f" Time: {elapsed:.2f}s")
        print()
    except Exception as e:
        print(f" ❌ Error: {e}")
        results["multi_v2"] = None
    # ========================================
    # Generate Comparison Report
    # ========================================
    print("=" * 80)
    print("COMPARISON SUMMARY")
    print("=" * 80)
    print()
print("| Configuration | Accuracy | Avg Time | Failed Emotions |")
print("|---------------------------|----------|----------|-----------------|")
    configs = [
        ("Binary + V1 Suite", "binary_v1"),
        ("Binary + V2 Suite", "binary_v2"),
        ("Multi-Emotion + V1 Suite", "multi_v1"),
        ("Multi-Emotion + V2 Suite", "multi_v2"),
    ]
    for name, key in configs:
        r = results.get(key)
        if r:
            print(f"| {name:25} | {r.accuracy:7.1%} | {r.avg_inference_time_ms:6.2f}ms | {len(r.failed_emotions):15} |")
        else:
            print(f"| {name:25} | {'N/A':>7} | {'N/A':>8} | {'N/A':>15} |")
    print()
    # Show failed emotions comparison
    print("Failed Emotions by Configuration:")
    print("-" * 50)
    for name, key in configs:
        r = results.get(key)
        if r and r.failed_emotions:
            print(f"\n{name}:")
            for em in r.failed_emotions[:10]:  # Show first 10
                acc = r.emotion_accuracy.get(em, 0)
                print(f" ❌ {em}: {acc:.1%}")
    # Save detailed reports
    print()
    print("=" * 80)
    print("SAVING REPORTS")
    print("=" * 80)
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
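    # Write one Markdown report per successful configuration; the timestamp in
    # the filename keeps repeated runs from overwriting earlier reports.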
    for name, key in configs:
        r = results.get(key)
        if r:
            safe_name = key.replace("_", "-")
            md_path = report_gen.generate_markdown_report(
                r,
                filename=f"comparison_{safe_name}_{timestamp}.md"
            )
            print(f" Saved: {md_path}")
    print()
    print("=" * 80)
    print("EVALUATION COMPLETE")
    print("=" * 80)
    return results


if __name__ == "__main__":
    run_comparative_evaluation()