# -*- coding: utf-8 -*-
"""
Run Evaluation - Execute full sentiment analysis evaluation

Runs all tests and generates comprehensive reports
"""

import sys
import os
import time

# Add parent directory to path so the avatar and evaluation packages resolve
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from avatar import SentimentAnalyzer, EmojiMapper
from evaluation.emotion_test_suite import EmotionTestSuite
from evaluation.accuracy_benchmark import AccuracyBenchmark
from evaluation.live_stream_test import LiveStreamTest
from evaluation.report_generator import ReportGenerator


def run_full_evaluation():
    """Run the complete evaluation and generate reports."""
    print("=" * 70)
    print("EMOJI AI AVATAR - SENTIMENT ANALYSIS EVALUATION")
    print("=" * 70)
    print()

    # Initialize components
    print("[1/6] Initializing components...")
    start_time = time.time()

    analyzer = SentimentAnalyzer()
    mapper = EmojiMapper()
    test_suite = EmotionTestSuite()
    benchmark = AccuracyBenchmark(analyzer, mapper)
    stream_test = LiveStreamTest(analyzer, mapper)
    report_gen = ReportGenerator()

    init_time = time.time() - start_time
    print(f"  Components initialized in {init_time:.2f}s")
    print(f"  Emotions to test: {test_suite.get_emotion_count()}")
    print(f"  Test cases: {test_suite.get_test_count()}")
    print()

    # Run accuracy benchmark
    print("[2/6] Running accuracy benchmark...")
    benchmark_start = time.time()
    benchmark_results = benchmark.run_benchmark(test_suite.EMOTION_TEST_DATA)
    benchmark_time = time.time() - benchmark_start
    print(f"  Benchmark completed in {benchmark_time:.2f}s")
    print(f"  Accuracy: {benchmark_results.accuracy:.1%}")
    print()

    # Run live stream transition tests
    print("[3/6] Running live stream transition tests...")
    stream_start = time.time()
    transition_results = stream_test.run_all_transitions()
    stream_time = time.time() - stream_start
    correct_transitions = sum(1 for r in transition_results if r.transition_correct)
    trans_accuracy = correct_transitions / len(transition_results) if transition_results else 0
    print(f"  Transitions completed in {stream_time:.2f}s")
    print(f"  Transition accuracy: {trans_accuracy:.1%}")
    print()

    # Generate console summary
    print("[4/6] Generating console summary...")
    summary = report_gen.generate_summary_report(benchmark_results)
    print()
    print(summary)
    print()

    # Generate markdown report
    print("[5/6] Generating markdown report...")
    md_path = report_gen.generate_markdown_report(benchmark_results, transition_results)
    print(f"  Saved to: {md_path}")
    print()

    # Generate JSON report
    print("[6/6] Generating JSON report...")
    json_path = report_gen.generate_json_report(benchmark_results, transition_results)
    print(f"  Saved to: {json_path}")
    print()

    # Print live stream report
    print(stream_test.get_transition_report(transition_results))
    print()

    # Final summary
    total_time = time.time() - start_time
    print("=" * 70)
    print("EVALUATION COMPLETE")
    print("=" * 70)
    print(f"Total time: {total_time:.2f}s")
    print(f"Overall accuracy: {benchmark_results.accuracy:.1%}")
    print(f"Transition accuracy: {trans_accuracy:.1%}")
    print(f"Avg inference time: {benchmark_results.avg_inference_time_ms:.2f}ms")
    print()
    print("Reports saved to:")
    print(f"  - {md_path}")
    print(f"  - {json_path}")
    print("=" * 70)

    # Return pass/fail exit code for CI/CD
    passing_threshold = 0.5  # 50% minimum accuracy
    if benchmark_results.accuracy >= passing_threshold:
        print("\n✅ EVALUATION PASSED")
        return 0
    else:
        print(f"\n❌ EVALUATION FAILED (accuracy below {passing_threshold:.0%})")
        return 1


if __name__ == "__main__":
    exit_code = run_full_evaluation()
    sys.exit(exit_code)
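
# Usage sketch: the exit code (0 = pass, 1 = fail) makes this script usable as
# a CI/CD gate. The file path and pipeline step below are illustrative
# assumptions, not part of the project's documented interface; this assumes the
# script lives at evaluation/run_evaluation.py under the project root.
#
#   $ python evaluation/run_evaluation.py
#   $ echo $?   # 0 when accuracy >= the 50% threshold, 1 otherwise
#
# Hypothetical GitHub Actions step using that exit code:
#   - name: Run sentiment evaluation
#     run: python evaluation/run_evaluation.py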