# -*- coding: utf-8 -*-
"""
Run Comparative Evaluation - Compare models and test suites

Runs evaluations with:
1. Binary model (DistilBERT) + V1 test suite
2. Binary model (DistilBERT) + V2 test suite
3. Multi-emotion model (RoBERTa) + V1 test suite
4. Multi-emotion model (RoBERTa) + V2 test suite

Generates comparison reports
"""
import sys
import os
import time
from datetime import datetime

# Add parent to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from avatar.sentiment_emoji_map import EmojiMapper
from evaluation.accuracy_benchmark import AccuracyBenchmark
from evaluation.report_generator import ReportGenerator
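
# NOTE: The avatar.* and evaluation.* imports resolve against the project root
# added to sys.path above; this script is assumed to live one level below the
# root (e.g. evaluation/run_comparative_evaluation.py).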

def run_comparative_evaluation():
    """Run evaluation comparing models and test suites"""
    print("=" * 80)
    print("COMPARATIVE SENTIMENT ANALYSIS EVALUATION")
    print("=" * 80)
    print()

    # Initialize common components
    mapper = EmojiMapper()
    report_gen = ReportGenerator(output_dir="evaluation/reports")

    results = {}

    # ========================================
    # Test 1: Binary Model + V1 Suite
    # ========================================
    print("[1/4] Binary Model + V1 Test Suite")
    print("-" * 50)
    try:
        from avatar.sentiment_transformer import SentimentAnalyzer as BinaryAnalyzer
        from evaluation.emotion_test_suite import EmotionTestSuite

        binary_analyzer = BinaryAnalyzer()
        suite_v1 = EmotionTestSuite()
        benchmark_v1 = AccuracyBenchmark(binary_analyzer, mapper)

        print(f" Emotions: {suite_v1.get_emotion_count()}, Tests: {suite_v1.get_test_count()}")

        start = time.time()
        results["binary_v1"] = benchmark_v1.run_benchmark(suite_v1.EMOTION_TEST_DATA)
        elapsed = time.time() - start

        print(f" Accuracy: {results['binary_v1'].accuracy:.1%}")
        print(f" Time: {elapsed:.2f}s")
        print()
    except Exception as e:
        print(f" ❌ Error: {e}")
        results["binary_v1"] = None
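
    # NOTE: Test 2 below reuses binary_analyzer created in Test 1. If Test 1
    # failed, the resulting NameError is caught and reported by Test 2's own
    # except block.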
    # ========================================
    # Test 2: Binary Model + V2 Suite
    # ========================================
    print("[2/4] Binary Model + V2 Test Suite")
    print("-" * 50)
    try:
        from evaluation.emotion_test_suite_v2 import EmotionTestSuiteV2

        suite_v2 = EmotionTestSuiteV2()
        benchmark_v2 = AccuracyBenchmark(binary_analyzer, mapper)

        print(f" Emotions: {suite_v2.get_emotion_count()}, Tests: {suite_v2.get_test_count()}")

        start = time.time()
        results["binary_v2"] = benchmark_v2.run_benchmark(suite_v2.EMOTION_TEST_DATA)
        elapsed = time.time() - start

        print(f" Accuracy: {results['binary_v2'].accuracy:.1%}")
        print(f" Time: {elapsed:.2f}s")
        print()
    except Exception as e:
        print(f" ❌ Error: {e}")
        results["binary_v2"] = None
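
    # NOTE: Test 3 loads the multi-emotion model but reuses suite_v1 from
    # Test 1, so a Test 1 failure also surfaces here as a caught NameError.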
    # ========================================
    # Test 3: Multi-Emotion Model + V1 Suite
    # ========================================
    print("[3/4] Multi-Emotion Model + V1 Test Suite")
    print("-" * 50)
    try:
        from avatar.sentiment_multi_emotion import MultiEmotionAnalyzer

        multi_analyzer = MultiEmotionAnalyzer()
        benchmark_multi_v1 = AccuracyBenchmark(multi_analyzer, mapper)

        print(f" Emotions: {suite_v1.get_emotion_count()}, Tests: {suite_v1.get_test_count()}")

        start = time.time()
        results["multi_v1"] = benchmark_multi_v1.run_benchmark(suite_v1.EMOTION_TEST_DATA)
        elapsed = time.time() - start

        print(f" Accuracy: {results['multi_v1'].accuracy:.1%}")
        print(f" Time: {elapsed:.2f}s")
        print()
    except Exception as e:
        print(f" ❌ Error: {e}")
        print(" (Install with: pip install transformers torch)")
        results["multi_v1"] = None
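
    # NOTE: Test 4 reuses multi_analyzer from Test 3 and suite_v2 from Test 2,
    # so it can only succeed if both of those steps completed.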
    # ========================================
    # Test 4: Multi-Emotion Model + V2 Suite
    # ========================================
    print("[4/4] Multi-Emotion Model + V2 Test Suite")
    print("-" * 50)
    try:
        benchmark_multi_v2 = AccuracyBenchmark(multi_analyzer, mapper)

        print(f" Emotions: {suite_v2.get_emotion_count()}, Tests: {suite_v2.get_test_count()}")

        start = time.time()
        results["multi_v2"] = benchmark_multi_v2.run_benchmark(suite_v2.EMOTION_TEST_DATA)
        elapsed = time.time() - start

        print(f" Accuracy: {results['multi_v2'].accuracy:.1%}")
        print(f" Time: {elapsed:.2f}s")
        print()
    except Exception as e:
        print(f" ❌ Error: {e}")
        results["multi_v2"] = None
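
    # The summary below reads .accuracy, .avg_inference_time_ms,
    # .failed_emotions, and .emotion_accuracy from each AccuracyBenchmark
    # result; missing results (None) are shown as N/A.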
    # ========================================
    # Generate Comparison Report
    # ========================================
    print("=" * 80)
    print("COMPARISON SUMMARY")
    print("=" * 80)
    print()

    print("| Configuration             | Accuracy | Avg Time | Failed Emotions |")
    print("|---------------------------|----------|----------|-----------------|")

    configs = [
        ("Binary + V1 Suite", "binary_v1"),
        ("Binary + V2 Suite", "binary_v2"),
        ("Multi-Emotion + V1 Suite", "multi_v1"),
        ("Multi-Emotion + V2 Suite", "multi_v2"),
    ]

    for name, key in configs:
        r = results.get(key)
        if r:
            print(f"| {name:25} | {r.accuracy:7.1%} | {r.avg_inference_time_ms:6.2f}ms | {len(r.failed_emotions):15} |")
        else:
            print(f"| {name:25} | {'N/A':>7} | {'N/A':>8} | {'N/A':>15} |")
    print()
    # Show failed emotions comparison
    print("Failed Emotions by Configuration:")
    print("-" * 50)
    for name, key in configs:
        r = results.get(key)
        if r and r.failed_emotions:
            print(f"\n{name}:")
            for em in r.failed_emotions[:10]:  # Show first 10
                acc = r.emotion_accuracy.get(em, 0)
                print(f" ❌ {em}: {acc:.1%}")

    # Save detailed reports
    print()
    print("=" * 80)
    print("SAVING REPORTS")
    print("=" * 80)

    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    for name, key in configs:
        r = results.get(key)
        if r:
            safe_name = key.replace("_", "-")
            md_path = report_gen.generate_markdown_report(
                r,
                filename=f"comparison_{safe_name}_{timestamp}.md"
            )
            print(f" Saved: {md_path}")

    print()
    print("=" * 80)
    print("EVALUATION COMPLETE")
    print("=" * 80)

    return results


if __name__ == "__main__":
    run_comparative_evaluation()
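
# To run directly (assuming the layout described above, from the project root):
#   python evaluation/run_comparative_evaluation.py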