# -*- coding: utf-8 -*- """ Report Generator - Generate evaluation reports Creates markdown and JSON reports from evaluation results """ import json import os from datetime import datetime from typing import Dict, List, Any, Optional from dataclasses import asdict class ReportGenerator: """ Generate evaluation reports in multiple formats Creates markdown and JSON reports from benchmark results """ def __init__(self, output_dir: str = "evaluation/reports"): """ Initialize report generator Args: output_dir: Directory to save reports """ self.output_dir = output_dir os.makedirs(output_dir, exist_ok=True) def _get_timestamp(self) -> str: """Get formatted timestamp""" return datetime.now().strftime("%Y-%m-%d_%H-%M-%S") def generate_markdown_report( self, benchmark_results: Any, transition_results: Optional[List[Any]] = None, filename: Optional[str] = None ) -> str: """ Generate markdown report Args: benchmark_results: Results from AccuracyBenchmark transition_results: Optional results from LiveStreamTest filename: Optional filename (auto-generated if None) Returns: Path to generated report """ timestamp = self._get_timestamp() if filename is None: filename = f"evaluation_report_{timestamp}.md" filepath = os.path.join(self.output_dir, filename) lines = [ "# Emoji AI Avatar - Sentiment Analysis Evaluation Report", "", f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", "", "---", "", "## Executive Summary", "", f"- **Total Emotions Tested:** {benchmark_results.total_tests}", f"- **Correct Predictions:** {benchmark_results.correct_tests}", f"- **Overall Accuracy:** {benchmark_results.accuracy:.1%}", f"- **Average Inference Time:** {benchmark_results.avg_inference_time_ms:.2f} ms", "", ] # Accuracy by emotion lines.extend([ "## Accuracy by Emotion", "", "| Emotion | Accuracy | Samples | Status |", "|---------|----------|---------|--------|", ]) emotion_accuracy = benchmark_results.emotion_accuracy for emotion, acc in sorted(emotion_accuracy.items(), key=lambda x: -x[1]): status = "✅ PASS" if acc >= 0.5 else "❌ FAIL" # Count samples for this emotion samples = len(benchmark_results.emotion_results.get(emotion, [])) lines.append(f"| {emotion} | {acc:.1%} | {samples} | {status} |") lines.append("") # Timing statistics lines.extend([ "## Performance Metrics", "", "| Metric | Value |", "|--------|-------|", f"| Min Inference Time | {benchmark_results.min_inference_time_ms:.2f} ms |", f"| Max Inference Time | {benchmark_results.max_inference_time_ms:.2f} ms |", f"| Avg Inference Time | {benchmark_results.avg_inference_time_ms:.2f} ms |", f"| Median Inference Time | {benchmark_results.median_inference_time_ms:.2f} ms |", "", ]) # Confusion matrix summary lines.extend([ "## Confusion Analysis", "", "### Most Common Misclassifications", "", "| Expected | Predicted | Count |", "|----------|-----------|-------|", ]) confusion = benchmark_results.confusion_matrix # Flatten the nested dict for easier processing misclassifications = [] for expected, predicted_counts in confusion.items(): for predicted, count in predicted_counts.items(): if expected != predicted: misclassifications.append((expected, predicted, count)) misclassifications.sort(key=lambda x: -x[2]) for exp, pred, count in misclassifications[:10]: lines.append(f"| {exp} | {pred} | {count} |") lines.append("") # Transition test results if available if transition_results: correct_transitions = sum(1 for r in transition_results if r.transition_correct) total_transitions = len(transition_results) trans_accuracy = correct_transitions / total_transitions if total_transitions > 0 else 0 lines.extend([ "## Live Emotion Transition Tests", "", f"- **Total Transitions:** {total_transitions}", f"- **Correct Transitions:** {correct_transitions}", f"- **Transition Accuracy:** {trans_accuracy:.1%}", "", "### Transition Details", "", "| From Text | To Text | Expected | Detected | Status |", "|-----------|---------|----------|----------|--------|", ]) for r in transition_results: status = "✅" if r.transition_correct else "❌" from_short = r.from_text[:25] + "..." if len(r.from_text) > 25 else r.from_text to_short = r.to_text[:25] + "..." if len(r.to_text) > 25 else r.to_text lines.append( f"| {from_short} | {to_short} | {r.to_emotion} | " f"{r.final_detected_emotion} | {status} |" ) lines.append("") # Detailed test results lines.extend([ "## Detailed Test Results", "", "### Failed Tests", "", ]) # Collect all failed results failures = [] for emotion, results_list in benchmark_results.emotion_results.items(): for r in results_list: if not r.is_correct: failures.append(r) if failures: lines.extend([ "| Text | Expected | Detected | Time (ms) |", "|------|----------|----------|-----------|", ]) for r in failures[:50]: # Show first 50 failures text_short = r.text[:40] + "..." if len(r.text) > 40 else r.text lines.append(f"| {text_short} | {r.expected_polarity} | {r.detected_polarity} | {r.inference_time_ms:.2f} |") else: lines.append("*All tests passed!*") lines.extend([ "", "---", "", f"*Report generated by Emoji AI Avatar Evaluation Framework*", ]) with open(filepath, "w", encoding="utf-8") as f: f.write("\n".join(lines)) return filepath def generate_json_report( self, benchmark_results: Any, transition_results: Optional[List[Any]] = None, filename: Optional[str] = None ) -> str: """ Generate JSON report for CI/CD integration Args: benchmark_results: Results from AccuracyBenchmark transition_results: Optional results from LiveStreamTest filename: Optional filename (auto-generated if None) Returns: Path to generated report """ timestamp = self._get_timestamp() if filename is None: filename = f"evaluation_report_{timestamp}.json" filepath = os.path.join(self.output_dir, filename) # Build report data # Flatten the nested confusion matrix for JSON flat_confusion = {} for expected, predicted_counts in benchmark_results.confusion_matrix.items(): for predicted, count in predicted_counts.items(): flat_confusion[f"{expected}|{predicted}"] = count # Flatten emotion results for JSON all_results = [] for emotion, results_list in benchmark_results.emotion_results.items(): for r in results_list: all_results.append(asdict(r)) report = { "meta": { "timestamp": datetime.now().isoformat(), "version": "1.0.0", }, "summary": { "total_tests": benchmark_results.total_tests, "correct": benchmark_results.correct_tests, "accuracy": benchmark_results.accuracy, "avg_time_ms": benchmark_results.avg_inference_time_ms, "min_time_ms": benchmark_results.min_inference_time_ms, "max_time_ms": benchmark_results.max_inference_time_ms, "median_time_ms": benchmark_results.median_inference_time_ms, }, "emotion_accuracy": dict(benchmark_results.emotion_accuracy), "confusion_matrix": flat_confusion, "results": all_results, } if transition_results: correct_transitions = sum(1 for r in transition_results if r.transition_correct) report["transitions"] = { "total": len(transition_results), "correct": correct_transitions, "accuracy": correct_transitions / len(transition_results) if transition_results else 0, "details": [asdict(r) for r in transition_results], } with open(filepath, "w", encoding="utf-8") as f: json.dump(report, f, indent=2) return filepath def generate_summary_report(self, benchmark_results: Any) -> str: """ Generate a brief console summary Args: benchmark_results: Results from AccuracyBenchmark Returns: Summary string """ lines = [ "=" * 60, "SENTIMENT ANALYSIS EVALUATION SUMMARY", "=" * 60, "", f"Total Tests: {benchmark_results.total_tests}", f"Correct: {benchmark_results.correct_tests}", f"Accuracy: {benchmark_results.accuracy:.1%}", f"Avg Time: {benchmark_results.avg_inference_time_ms:.2f} ms", "", "-" * 60, "EMOTION BREAKDOWN (Top 10)", "-" * 60, ] # Top 10 by accuracy emotion_accuracy = benchmark_results.emotion_accuracy sorted_emotions = sorted(emotion_accuracy.items(), key=lambda x: -x[1])[:10] for emotion, acc in sorted_emotions: bar = "█" * int(acc * 20) + "░" * (20 - int(acc * 20)) lines.append(f"{emotion:20} {bar} {acc:.1%}") lines.extend([ "", "-" * 60, "LOWEST PERFORMERS (Bottom 5)", "-" * 60, ]) # Bottom 5 by accuracy bottom_emotions = sorted(emotion_accuracy.items(), key=lambda x: x[1])[:5] for emotion, acc in bottom_emotions: bar = "█" * int(acc * 20) + "░" * (20 - int(acc * 20)) lines.append(f"{emotion:20} {bar} {acc:.1%}") lines.append("=" * 60) return "\n".join(lines) if __name__ == "__main__": # Demo usage print("Report Generator - Use with AccuracyBenchmark results")