# -*- coding: utf-8 -*-
"""
Report Generator - Generate evaluation reports
Creates markdown and JSON reports from evaluation results
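
Example (illustrative sketch only; ``benchmark_results`` stands in for any
object exposing the attributes this module reads, such as the results object
produced by AccuracyBenchmark):

    generator = ReportGenerator(output_dir="evaluation/reports")
    md_path = generator.generate_markdown_report(benchmark_results)
    json_path = generator.generate_json_report(benchmark_results)
    print(generator.generate_summary_report(benchmark_results))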
| """ | |
| import json | |
| import os | |
| from datetime import datetime | |
| from typing import Dict, List, Any, Optional | |
| from dataclasses import asdict | |


class ReportGenerator:
    """
    Generate evaluation reports in multiple formats
    Creates markdown and JSON reports from benchmark results
    """

    def __init__(self, output_dir: str = "evaluation/reports"):
        """
        Initialize report generator

        Args:
            output_dir: Directory to save reports
        """
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    def _get_timestamp(self) -> str:
        """Get formatted timestamp"""
        return datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    def generate_markdown_report(
        self,
        benchmark_results: Any,
        transition_results: Optional[List[Any]] = None,
        filename: Optional[str] = None
    ) -> str:
        """
        Generate markdown report

        Args:
            benchmark_results: Results from AccuracyBenchmark
            transition_results: Optional results from LiveStreamTest
            filename: Optional filename (auto-generated if None)

        Returns:
            Path to generated report
        """
        timestamp = self._get_timestamp()
        if filename is None:
            filename = f"evaluation_report_{timestamp}.md"
        filepath = os.path.join(self.output_dir, filename)

        lines = [
            "# Emoji AI Avatar - Sentiment Analysis Evaluation Report",
            "",
            f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            "",
            "---",
            "",
            "## Executive Summary",
            "",
            f"- **Total Emotions Tested:** {benchmark_results.total_tests}",
            f"- **Correct Predictions:** {benchmark_results.correct_tests}",
            f"- **Overall Accuracy:** {benchmark_results.accuracy:.1%}",
            f"- **Average Inference Time:** {benchmark_results.avg_inference_time_ms:.2f} ms",
            "",
        ]

        # Accuracy by emotion
        lines.extend([
            "## Accuracy by Emotion",
            "",
            "| Emotion | Accuracy | Samples | Status |",
            "|---------|----------|---------|--------|",
        ])
        emotion_accuracy = benchmark_results.emotion_accuracy
        for emotion, acc in sorted(emotion_accuracy.items(), key=lambda x: -x[1]):
            status = "✅ PASS" if acc >= 0.5 else "❌ FAIL"
            # Count samples for this emotion
            samples = len(benchmark_results.emotion_results.get(emotion, []))
            lines.append(f"| {emotion} | {acc:.1%} | {samples} | {status} |")
        lines.append("")

        # Timing statistics
        lines.extend([
            "## Performance Metrics",
            "",
            "| Metric | Value |",
            "|--------|-------|",
            f"| Min Inference Time | {benchmark_results.min_inference_time_ms:.2f} ms |",
            f"| Max Inference Time | {benchmark_results.max_inference_time_ms:.2f} ms |",
            f"| Avg Inference Time | {benchmark_results.avg_inference_time_ms:.2f} ms |",
            f"| Median Inference Time | {benchmark_results.median_inference_time_ms:.2f} ms |",
            "",
        ])

        # Confusion matrix summary
        lines.extend([
            "## Confusion Analysis",
            "",
            "### Most Common Misclassifications",
            "",
            "| Expected | Predicted | Count |",
            "|----------|-----------|-------|",
        ])
        confusion = benchmark_results.confusion_matrix
        # Flatten the nested dict for easier processing
        misclassifications = []
        for expected, predicted_counts in confusion.items():
            for predicted, count in predicted_counts.items():
                if expected != predicted:
                    misclassifications.append((expected, predicted, count))
        misclassifications.sort(key=lambda x: -x[2])
        for exp, pred, count in misclassifications[:10]:
            lines.append(f"| {exp} | {pred} | {count} |")
        lines.append("")

        # Transition test results if available
        if transition_results:
            correct_transitions = sum(1 for r in transition_results if r.transition_correct)
            total_transitions = len(transition_results)
            trans_accuracy = correct_transitions / total_transitions if total_transitions > 0 else 0
            lines.extend([
                "## Live Emotion Transition Tests",
                "",
                f"- **Total Transitions:** {total_transitions}",
                f"- **Correct Transitions:** {correct_transitions}",
                f"- **Transition Accuracy:** {trans_accuracy:.1%}",
                "",
                "### Transition Details",
                "",
                "| From Text | To Text | Expected | Detected | Status |",
                "|-----------|---------|----------|----------|--------|",
            ])
            for r in transition_results:
                status = "✅" if r.transition_correct else "❌"
                from_short = r.from_text[:25] + "..." if len(r.from_text) > 25 else r.from_text
                to_short = r.to_text[:25] + "..." if len(r.to_text) > 25 else r.to_text
                lines.append(
                    f"| {from_short} | {to_short} | {r.to_emotion} | "
                    f"{r.final_detected_emotion} | {status} |"
                )
            lines.append("")

        # Detailed test results
        lines.extend([
            "## Detailed Test Results",
            "",
            "### Failed Tests",
            "",
        ])
        # Collect all failed results
        failures = []
        for emotion, results_list in benchmark_results.emotion_results.items():
            for r in results_list:
                if not r.is_correct:
                    failures.append(r)
        if failures:
            lines.extend([
                "| Text | Expected | Detected | Time (ms) |",
                "|------|----------|----------|-----------|",
            ])
            for r in failures[:50]:  # Show first 50 failures
                text_short = r.text[:40] + "..." if len(r.text) > 40 else r.text
                lines.append(
                    f"| {text_short} | {r.expected_polarity} | "
                    f"{r.detected_polarity} | {r.inference_time_ms:.2f} |"
                )
        else:
            lines.append("*All tests passed!*")

        lines.extend([
            "",
            "---",
            "",
            "*Report generated by Emoji AI Avatar Evaluation Framework*",
        ])

        with open(filepath, "w", encoding="utf-8") as f:
            f.write("\n".join(lines))
        return filepath

    def generate_json_report(
        self,
        benchmark_results: Any,
        transition_results: Optional[List[Any]] = None,
        filename: Optional[str] = None
    ) -> str:
        """
        Generate JSON report for CI/CD integration

        Args:
            benchmark_results: Results from AccuracyBenchmark
            transition_results: Optional results from LiveStreamTest
            filename: Optional filename (auto-generated if None)

        Returns:
            Path to generated report
        """
        timestamp = self._get_timestamp()
        if filename is None:
            filename = f"evaluation_report_{timestamp}.json"
        filepath = os.path.join(self.output_dir, filename)

        # Build report data
        # Flatten the nested confusion matrix for JSON
        flat_confusion = {}
        for expected, predicted_counts in benchmark_results.confusion_matrix.items():
            for predicted, count in predicted_counts.items():
                flat_confusion[f"{expected}|{predicted}"] = count

        # Flatten emotion results for JSON
        all_results = []
        for emotion, results_list in benchmark_results.emotion_results.items():
            for r in results_list:
                all_results.append(asdict(r))

        report = {
            "meta": {
                "timestamp": datetime.now().isoformat(),
                "version": "1.0.0",
            },
            "summary": {
                "total_tests": benchmark_results.total_tests,
                "correct": benchmark_results.correct_tests,
                "accuracy": benchmark_results.accuracy,
                "avg_time_ms": benchmark_results.avg_inference_time_ms,
                "min_time_ms": benchmark_results.min_inference_time_ms,
                "max_time_ms": benchmark_results.max_inference_time_ms,
                "median_time_ms": benchmark_results.median_inference_time_ms,
            },
            "emotion_accuracy": dict(benchmark_results.emotion_accuracy),
            "confusion_matrix": flat_confusion,
            "results": all_results,
        }

        if transition_results:
            correct_transitions = sum(1 for r in transition_results if r.transition_correct)
            report["transitions"] = {
                "total": len(transition_results),
                "correct": correct_transitions,
                "accuracy": correct_transitions / len(transition_results) if transition_results else 0,
                "details": [asdict(r) for r in transition_results],
            }

        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(report, f, indent=2)
        return filepath

    def generate_summary_report(self, benchmark_results: Any) -> str:
        """
        Generate a brief console summary

        Args:
            benchmark_results: Results from AccuracyBenchmark

        Returns:
            Summary string
        """
        lines = [
            "=" * 60,
            "SENTIMENT ANALYSIS EVALUATION SUMMARY",
            "=" * 60,
            "",
            f"Total Tests: {benchmark_results.total_tests}",
            f"Correct: {benchmark_results.correct_tests}",
            f"Accuracy: {benchmark_results.accuracy:.1%}",
            f"Avg Time: {benchmark_results.avg_inference_time_ms:.2f} ms",
            "",
            "-" * 60,
            "EMOTION BREAKDOWN (Top 10)",
            "-" * 60,
        ]

        # Top 10 by accuracy
        emotion_accuracy = benchmark_results.emotion_accuracy
        sorted_emotions = sorted(emotion_accuracy.items(), key=lambda x: -x[1])[:10]
        for emotion, acc in sorted_emotions:
            bar = "█" * int(acc * 20) + "░" * (20 - int(acc * 20))
            lines.append(f"{emotion:20} {bar} {acc:.1%}")

        lines.extend([
            "",
            "-" * 60,
            "LOWEST PERFORMERS (Bottom 5)",
            "-" * 60,
        ])

        # Bottom 5 by accuracy
        bottom_emotions = sorted(emotion_accuracy.items(), key=lambda x: x[1])[:5]
        for emotion, acc in bottom_emotions:
            bar = "█" * int(acc * 20) + "░" * (20 - int(acc * 20))
            lines.append(f"{emotion:20} {bar} {acc:.1%}")

        lines.append("=" * 60)
        return "\n".join(lines)


if __name__ == "__main__":
    # Demo usage
    print("Report Generator - Use with AccuracyBenchmark results")