# -*- coding: utf-8 -*-
"""
Run Evaluation - Execute full sentiment analysis evaluation
Runs all tests and generates comprehensive reports
"""
import sys
import os
import time
# Add the project root (the parent of this evaluation/ directory) to sys.path
# so the avatar and evaluation packages import correctly when run as a script
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from avatar import SentimentAnalyzer, EmojiMapper
from evaluation.emotion_test_suite import EmotionTestSuite
from evaluation.accuracy_benchmark import AccuracyBenchmark
from evaluation.live_stream_test import LiveStreamTest
from evaluation.report_generator import ReportGenerator


def run_full_evaluation():
    """Run complete evaluation and generate reports"""
    print("=" * 70)
    print("EMOJI AI AVATAR - SENTIMENT ANALYSIS EVALUATION")
    print("=" * 70)
    print()

    # Initialize components
    print("[1/6] Initializing components...")
    start_time = time.time()
    analyzer = SentimentAnalyzer()
    mapper = EmojiMapper()
    test_suite = EmotionTestSuite()
    benchmark = AccuracyBenchmark(analyzer, mapper)
    stream_test = LiveStreamTest(analyzer, mapper)
    report_gen = ReportGenerator()
    init_time = time.time() - start_time
    print(f" Components initialized in {init_time:.2f}s")
    print(f" Emotions to test: {test_suite.get_emotion_count()}")
    print(f" Test cases: {test_suite.get_test_count()}")
    print()

    # Run accuracy benchmark
    print("[2/6] Running accuracy benchmark...")
    benchmark_start = time.time()
    benchmark_results = benchmark.run_benchmark(test_suite.EMOTION_TEST_DATA)
    benchmark_time = time.time() - benchmark_start
    print(f" Benchmark completed in {benchmark_time:.2f}s")
    print(f" Accuracy: {benchmark_results.accuracy:.1%}")
    print()

    # Run live stream tests
    print("[3/6] Running live stream transition tests...")
    stream_start = time.time()
    transition_results = stream_test.run_all_transitions()
    stream_time = time.time() - stream_start
    correct_transitions = sum(1 for r in transition_results if r.transition_correct)
    trans_accuracy = correct_transitions / len(transition_results) if transition_results else 0
    print(f" Transitions completed in {stream_time:.2f}s")
    print(f" Transition accuracy: {trans_accuracy:.1%}")
    print()

    # Generate console summary
    print("[4/6] Generating console summary...")
    summary = report_gen.generate_summary_report(benchmark_results)
    print()
    print(summary)
    print()

    # Generate markdown report
    print("[5/6] Generating markdown report...")
    md_path = report_gen.generate_markdown_report(benchmark_results, transition_results)
    print(f" Saved to: {md_path}")
    print()

    # Generate JSON report
    print("[6/6] Generating JSON report...")
    json_path = report_gen.generate_json_report(benchmark_results, transition_results)
    print(f" Saved to: {json_path}")
    print()

    # Print live stream report
    print(stream_test.get_transition_report(transition_results))
    print()

    # Final summary
    total_time = time.time() - start_time
    print("=" * 70)
    print("EVALUATION COMPLETE")
    print("=" * 70)
    print(f"Total time: {total_time:.2f}s")
    print(f"Overall accuracy: {benchmark_results.accuracy:.1%}")
    print(f"Transition accuracy: {trans_accuracy:.1%}")
    print(f"Avg inference time: {benchmark_results.avg_inference_time_ms:.2f}ms")
    print()
    print("Reports saved to:")
    print(f" - {md_path}")
    print(f" - {json_path}")
    print("=" * 70)

    # Return pass/fail for CI/CD
    passing_threshold = 0.5  # 50% minimum accuracy
    if benchmark_results.accuracy >= passing_threshold:
        print("\n✅ EVALUATION PASSED")
        return 0
    else:
        print(f"\n❌ EVALUATION FAILED (accuracy below {passing_threshold:.0%})")
        return 1
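
# A minimal sketch of programmatic use (hypothetical caller, not part of this
# module): another script could import the entry point and reuse the exit code,
# e.g.
#
#   from evaluation.run_evaluation import run_full_evaluation
#   failed = bool(run_full_evaluation())
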
if __name__ == "__main__":
    exit_code = run_full_evaluation()
    sys.exit(exit_code)