#!/usr/bin/env python3 """ YAZE AI Model Comparison Report Generator Generates comparison reports from evaluation results. Usage: python compare-models.py results/eval-*.json python compare-models.py --format markdown results/eval-20241125.json python compare-models.py --best results/eval-*.json """ import argparse import json import os import sys from datetime import datetime from pathlib import Path from typing import Any def load_results(file_paths: list[str]) -> list[dict]: """Load evaluation results from JSON files.""" results = [] for path in file_paths: try: with open(path, 'r') as f: data = json.load(f) data['_source_file'] = path results.append(data) except Exception as e: print(f"Warning: Could not load {path}: {e}", file=sys.stderr) return results def merge_results(results: list[dict]) -> dict: """Merge multiple result files into a single comparison.""" merged = { "sources": [], "models": {}, "timestamp": datetime.now().isoformat() } for result in results: merged["sources"].append(result.get('_source_file', 'unknown')) for model, model_data in result.get('models', {}).items(): if model not in merged["models"]: merged["models"][model] = { "runs": [], "summary": {} } merged["models"][model]["runs"].append({ "source": result.get('_source_file'), "timestamp": result.get('timestamp'), "summary": model_data.get('summary', {}), "task_count": len(model_data.get('tasks', [])) }) # Calculate averages across runs for model, data in merged["models"].items(): runs = data["runs"] if runs: data["summary"] = { "avg_accuracy": sum(r["summary"].get("avg_accuracy", 0) for r in runs) / len(runs), "avg_completeness": sum(r["summary"].get("avg_completeness", 0) for r in runs) / len(runs), "avg_tool_usage": sum(r["summary"].get("avg_tool_usage", 0) for r in runs) / len(runs), "avg_response_time": sum(r["summary"].get("avg_response_time", 0) for r in runs) / len(runs), "overall_score": sum(r["summary"].get("overall_score", 0) for r in runs) / len(runs), "run_count": len(runs) } return merged def format_table(merged: dict) -> str: """Format results as ASCII table.""" lines = [] lines.append("┌" + "─"*78 + "┐") lines.append("│" + " "*18 + "YAZE AI Model Comparison Report" + " "*27 + "│") lines.append("│" + " "*18 + f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}" + " "*27 + "│") lines.append("├" + "─"*78 + "┤") lines.append("│ {:24} │ {:10} │ {:10} │ {:10} │ {:10} │ {:5} │".format( "Model", "Accuracy", "Complete", "Tool Use", "Speed", "Runs" )) lines.append("├" + "─"*78 + "┤") # Sort by overall score sorted_models = sorted( merged["models"].items(), key=lambda x: x[1]["summary"].get("overall_score", 0), reverse=True ) for model, data in sorted_models: summary = data["summary"] model_name = model[:24] if len(model) <= 24 else model[:21] + "..." lines.append("│ {:24} │ {:8.1f}/10 │ {:8.1f}/10 │ {:8.1f}/10 │ {:7.1f}s │ {:5} │".format( model_name, summary.get("avg_accuracy", 0), summary.get("avg_completeness", 0), summary.get("avg_tool_usage", 0), summary.get("avg_response_time", 0), summary.get("run_count", 0) )) lines.append("├" + "─"*78 + "┤") # Add recommendation if sorted_models: best_model = sorted_models[0][0] best_score = sorted_models[0][1]["summary"].get("overall_score", 0) lines.append("│ {:76} │".format(f"Recommended: {best_model} (score: {best_score:.1f}/10)")) lines.append("└" + "─"*78 + "┘") return "\n".join(lines) def format_markdown(merged: dict) -> str: """Format results as Markdown.""" lines = [] lines.append("# YAZE AI Model Comparison Report") lines.append("") lines.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}") lines.append("") lines.append("## Summary") lines.append("") lines.append("| Model | Accuracy | Completeness | Tool Use | Speed | Overall | Runs |") lines.append("|-------|----------|--------------|----------|-------|---------|------|") sorted_models = sorted( merged["models"].items(), key=lambda x: x[1]["summary"].get("overall_score", 0), reverse=True ) for model, data in sorted_models: summary = data["summary"] lines.append("| {} | {:.1f}/10 | {:.1f}/10 | {:.1f}/10 | {:.1f}s | **{:.1f}/10** | {} |".format( model, summary.get("avg_accuracy", 0), summary.get("avg_completeness", 0), summary.get("avg_tool_usage", 0), summary.get("avg_response_time", 0), summary.get("overall_score", 0), summary.get("run_count", 0) )) lines.append("") # Recommendation section if sorted_models: best = sorted_models[0] lines.append("## Recommendation") lines.append("") lines.append(f"**Best Model:** `{best[0]}`") lines.append("") lines.append("### Strengths") lines.append("") summary = best[1]["summary"] if summary.get("avg_accuracy", 0) >= 8: lines.append("- ✅ High accuracy in responses") if summary.get("avg_tool_usage", 0) >= 8: lines.append("- ✅ Effective tool usage") if summary.get("avg_response_time", 0) <= 3: lines.append("- ✅ Fast response times") if summary.get("avg_completeness", 0) >= 8: lines.append("- ✅ Complete and detailed responses") lines.append("") lines.append("### Considerations") lines.append("") if summary.get("avg_accuracy", 0) < 7: lines.append("- ⚠️ Accuracy could be improved") if summary.get("avg_tool_usage", 0) < 7: lines.append("- ⚠️ Tool usage needs improvement") if summary.get("avg_response_time", 0) > 5: lines.append("- ⚠️ Response times are slow") # Source files section lines.append("") lines.append("## Sources") lines.append("") for source in merged.get("sources", []): lines.append(f"- `{source}`") return "\n".join(lines) def format_json(merged: dict) -> str: """Format results as JSON.""" # Remove internal fields output = {k: v for k, v in merged.items() if not k.startswith('_')} return json.dumps(output, indent=2) def get_best_model(merged: dict) -> str: """Get the name of the best performing model.""" sorted_models = sorted( merged["models"].items(), key=lambda x: x[1]["summary"].get("overall_score", 0), reverse=True ) if sorted_models: return sorted_models[0][0] return "unknown" def analyze_task_performance(results: list[dict]) -> dict: """Analyze performance broken down by task category.""" task_performance = {} for result in results: for model, model_data in result.get('models', {}).items(): for task in model_data.get('tasks', []): category = task.get('category', 'unknown') task_id = task.get('task_id', 'unknown') key = f"{category}/{task_id}" if key not in task_performance: task_performance[key] = { "category": category, "task_id": task_id, "task_name": task.get('task_name', 'Unknown'), "models": {} } if model not in task_performance[key]["models"]: task_performance[key]["models"][model] = { "scores": [], "times": [] } task_performance[key]["models"][model]["scores"].append( task.get('accuracy_score', 0) * 0.5 + task.get('completeness_score', 0) * 0.3 + task.get('tool_usage_score', 0) * 0.2 ) task_performance[key]["models"][model]["times"].append( task.get('response_time', 0) ) # Calculate averages for task_key, task_data in task_performance.items(): for model, model_scores in task_data["models"].items(): scores = model_scores["scores"] times = model_scores["times"] model_scores["avg_score"] = sum(scores) / len(scores) if scores else 0 model_scores["avg_time"] = sum(times) / len(times) if times else 0 return task_performance def format_task_analysis(task_performance: dict) -> str: """Format task-level analysis.""" lines = [] lines.append("\n## Task-Level Performance\n") # Group by category by_category = {} for key, data in task_performance.items(): cat = data["category"] if cat not in by_category: by_category[cat] = [] by_category[cat].append(data) for category, tasks in sorted(by_category.items()): lines.append(f"### {category.replace('_', ' ').title()}\n") lines.append("| Task | Best Model | Score | Time |") lines.append("|------|------------|-------|------|") for task in tasks: # Find best model for this task best_model = None best_score = 0 for model, scores in task["models"].items(): if scores["avg_score"] > best_score: best_score = scores["avg_score"] best_model = model if best_model: best_time = task["models"][best_model]["avg_time"] lines.append("| {} | {} | {:.1f}/10 | {:.1f}s |".format( task["task_name"], best_model, best_score, best_time )) lines.append("") return "\n".join(lines) def main(): parser = argparse.ArgumentParser( description="Generate comparison reports from AI evaluation results" ) parser.add_argument( "files", nargs="+", help="Evaluation result JSON files to compare" ) parser.add_argument( "--format", "-f", choices=["table", "markdown", "json"], default="table", help="Output format (default: table)" ) parser.add_argument( "--output", "-o", help="Output file (default: stdout)" ) parser.add_argument( "--best", action="store_true", help="Only output the best model name (for scripting)" ) parser.add_argument( "--task-analysis", action="store_true", help="Include task-level performance analysis" ) args = parser.parse_args() # Load and merge results results = load_results(args.files) if not results: print("No valid result files found", file=sys.stderr) sys.exit(1) merged = merge_results(results) # Handle --best flag if args.best: print(get_best_model(merged)) sys.exit(0) # Format output if args.format == "table": output = format_table(merged) elif args.format == "markdown": output = format_markdown(merged) if args.task_analysis: task_perf = analyze_task_performance(results) output += format_task_analysis(task_perf) else: output = format_json(merged) # Write output if args.output: with open(args.output, 'w') as f: f.write(output) print(f"Report written to: {args.output}") else: print(output) if __name__ == "__main__": main()