backend-infra-engineer: Post v0.3.9-hotfix7 snapshot (build cleanup)
scripts/ai/compare-models.py (new executable file, 370 lines)
@@ -0,0 +1,370 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
YAZE AI Model Comparison Report Generator
|
||||
|
||||
Generates comparison reports from evaluation results.
|
||||
|
||||
Usage:
|
||||
python compare-models.py results/eval-*.json
|
||||
python compare-models.py --format markdown results/eval-20241125.json
|
||||
python compare-models.py --best results/eval-*.json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
def load_results(file_paths: list[str]) -> list[dict]:
|
||||
"""Load evaluation results from JSON files."""
|
||||
results = []
|
||||
for path in file_paths:
|
||||
try:
|
||||
with open(path, 'r') as f:
|
||||
data = json.load(f)
|
||||
data['_source_file'] = path
|
||||
results.append(data)
|
||||
except Exception as e:
|
||||
print(f"Warning: Could not load {path}: {e}", file=sys.stderr)
|
||||
return results
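
# Each result file is expected to follow the shape written by save_results()
# in eval-runner.py; load_results() additionally tags it with '_source_file'.
# Illustrative, abridged example:
#   {
#     "timestamp": "2024-11-25T12:00:00",
#     "version": "1.0",
#     "models": {
#       "llama3.2:latest": {
#         "summary": {"avg_accuracy": 7.5, "avg_completeness": 6.8,
#                     "avg_tool_usage": 7.0, "avg_response_time": 4.2,
#                     "overall_score": 7.1},
#         "tasks": [ ...per-task result dicts... ]
#       }
#     }
#   }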
|
||||
|
||||
|
||||
def merge_results(results: list[dict]) -> dict:
|
||||
"""Merge multiple result files into a single comparison."""
|
||||
merged = {
|
||||
"sources": [],
|
||||
"models": {},
|
||||
"timestamp": datetime.now().isoformat()
|
||||
}
|
||||
|
||||
for result in results:
|
||||
merged["sources"].append(result.get('_source_file', 'unknown'))
|
||||
|
||||
for model, model_data in result.get('models', {}).items():
|
||||
if model not in merged["models"]:
|
||||
merged["models"][model] = {
|
||||
"runs": [],
|
||||
"summary": {}
|
||||
}
|
||||
|
||||
merged["models"][model]["runs"].append({
|
||||
"source": result.get('_source_file'),
|
||||
"timestamp": result.get('timestamp'),
|
||||
"summary": model_data.get('summary', {}),
|
||||
"task_count": len(model_data.get('tasks', []))
|
||||
})
|
||||
|
||||
# Calculate averages across runs
|
||||
for model, data in merged["models"].items():
|
||||
runs = data["runs"]
|
||||
if runs:
|
||||
data["summary"] = {
|
||||
"avg_accuracy": sum(r["summary"].get("avg_accuracy", 0) for r in runs) / len(runs),
|
||||
"avg_completeness": sum(r["summary"].get("avg_completeness", 0) for r in runs) / len(runs),
|
||||
"avg_tool_usage": sum(r["summary"].get("avg_tool_usage", 0) for r in runs) / len(runs),
|
||||
"avg_response_time": sum(r["summary"].get("avg_response_time", 0) for r in runs) / len(runs),
|
||||
"overall_score": sum(r["summary"].get("overall_score", 0) for r in runs) / len(runs),
|
||||
"run_count": len(runs)
|
||||
}
|
||||
|
||||
return merged
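
# Illustrative shape of the merged dict consumed by the format_* helpers below
# (values abridged):
#   {
#     "sources": ["results/eval-20241125.json", "results/eval-20241126.json"],
#     "timestamp": "...",
#     "models": {
#       "qwen2.5-coder:7b": {
#         "runs": [{"source": "...", "timestamp": "...", "summary": {...}, "task_count": 4}, ...],
#         "summary": {"avg_accuracy": ..., "overall_score": ..., "run_count": 2}
#       }
#     }
#   }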
|
||||
|
||||
|
||||
def format_table(merged: dict) -> str:
|
||||
"""Format results as ASCII table."""
|
||||
lines = []
|
||||
|
||||
lines.append("┌" + "─"*78 + "┐")
|
||||
lines.append("│" + " "*18 + "YAZE AI Model Comparison Report" + " "*27 + "│")
|
||||
lines.append("│" + " "*18 + f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}" + " "*27 + "│")
|
||||
lines.append("├" + "─"*78 + "┤")
|
||||
lines.append("│ {:24} │ {:10} │ {:10} │ {:10} │ {:10} │ {:5} │".format(
|
||||
"Model", "Accuracy", "Complete", "Tool Use", "Speed", "Runs"
|
||||
))
|
||||
lines.append("├" + "─"*78 + "┤")
|
||||
|
||||
# Sort by overall score
|
||||
sorted_models = sorted(
|
||||
merged["models"].items(),
|
||||
key=lambda x: x[1]["summary"].get("overall_score", 0),
|
||||
reverse=True
|
||||
)
|
||||
|
||||
for model, data in sorted_models:
|
||||
summary = data["summary"]
|
||||
model_name = model[:24] if len(model) <= 24 else model[:21] + "..."
|
||||
|
||||
lines.append("│ {:24} │ {:8.1f}/10 │ {:8.1f}/10 │ {:8.1f}/10 │ {:7.1f}s │ {:5} │".format(
|
||||
model_name,
|
||||
summary.get("avg_accuracy", 0),
|
||||
summary.get("avg_completeness", 0),
|
||||
summary.get("avg_tool_usage", 0),
|
||||
summary.get("avg_response_time", 0),
|
||||
summary.get("run_count", 0)
|
||||
))
|
||||
|
||||
lines.append("├" + "─"*78 + "┤")
|
||||
|
||||
# Add recommendation
|
||||
if sorted_models:
|
||||
best_model = sorted_models[0][0]
|
||||
best_score = sorted_models[0][1]["summary"].get("overall_score", 0)
|
||||
lines.append("│ {:76} │".format(f"Recommended: {best_model} (score: {best_score:.1f}/10)"))
|
||||
|
||||
lines.append("└" + "─"*78 + "┘")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def format_markdown(merged: dict) -> str:
|
||||
"""Format results as Markdown."""
|
||||
lines = []
|
||||
|
||||
lines.append("# YAZE AI Model Comparison Report")
|
||||
lines.append("")
|
||||
lines.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}")
|
||||
lines.append("")
|
||||
lines.append("## Summary")
|
||||
lines.append("")
|
||||
lines.append("| Model | Accuracy | Completeness | Tool Use | Speed | Overall | Runs |")
|
||||
lines.append("|-------|----------|--------------|----------|-------|---------|------|")
|
||||
|
||||
sorted_models = sorted(
|
||||
merged["models"].items(),
|
||||
key=lambda x: x[1]["summary"].get("overall_score", 0),
|
||||
reverse=True
|
||||
)
|
||||
|
||||
for model, data in sorted_models:
|
||||
summary = data["summary"]
|
||||
lines.append("| {} | {:.1f}/10 | {:.1f}/10 | {:.1f}/10 | {:.1f}s | **{:.1f}/10** | {} |".format(
|
||||
model,
|
||||
summary.get("avg_accuracy", 0),
|
||||
summary.get("avg_completeness", 0),
|
||||
summary.get("avg_tool_usage", 0),
|
||||
summary.get("avg_response_time", 0),
|
||||
summary.get("overall_score", 0),
|
||||
summary.get("run_count", 0)
|
||||
))
|
||||
|
||||
lines.append("")
|
||||
|
||||
# Recommendation section
|
||||
if sorted_models:
|
||||
best = sorted_models[0]
|
||||
lines.append("## Recommendation")
|
||||
lines.append("")
|
||||
lines.append(f"**Best Model:** `{best[0]}`")
|
||||
lines.append("")
|
||||
lines.append("### Strengths")
|
||||
lines.append("")
|
||||
|
||||
summary = best[1]["summary"]
|
||||
if summary.get("avg_accuracy", 0) >= 8:
|
||||
lines.append("- ✅ High accuracy in responses")
|
||||
if summary.get("avg_tool_usage", 0) >= 8:
|
||||
lines.append("- ✅ Effective tool usage")
|
||||
if summary.get("avg_response_time", 0) <= 3:
|
||||
lines.append("- ✅ Fast response times")
|
||||
if summary.get("avg_completeness", 0) >= 8:
|
||||
lines.append("- ✅ Complete and detailed responses")
|
||||
|
||||
lines.append("")
|
||||
lines.append("### Considerations")
|
||||
lines.append("")
|
||||
|
||||
if summary.get("avg_accuracy", 0) < 7:
|
||||
lines.append("- ⚠️ Accuracy could be improved")
|
||||
if summary.get("avg_tool_usage", 0) < 7:
|
||||
lines.append("- ⚠️ Tool usage needs improvement")
|
||||
if summary.get("avg_response_time", 0) > 5:
|
||||
lines.append("- ⚠️ Response times are slow")
|
||||
|
||||
# Source files section
|
||||
lines.append("")
|
||||
lines.append("## Sources")
|
||||
lines.append("")
|
||||
for source in merged.get("sources", []):
|
||||
lines.append(f"- `{source}`")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def format_json(merged: dict) -> str:
|
||||
"""Format results as JSON."""
|
||||
# Remove internal fields
|
||||
output = {k: v for k, v in merged.items() if not k.startswith('_')}
|
||||
return json.dumps(output, indent=2)
|
||||
|
||||
|
||||
def get_best_model(merged: dict) -> str:
|
||||
"""Get the name of the best performing model."""
|
||||
sorted_models = sorted(
|
||||
merged["models"].items(),
|
||||
key=lambda x: x[1]["summary"].get("overall_score", 0),
|
||||
reverse=True
|
||||
)
|
||||
|
||||
if sorted_models:
|
||||
return sorted_models[0][0]
|
||||
return "unknown"
|
||||
|
||||
|
||||
def analyze_task_performance(results: list[dict]) -> dict:
|
||||
"""Analyze performance broken down by task category."""
|
||||
task_performance = {}
|
||||
|
||||
for result in results:
|
||||
for model, model_data in result.get('models', {}).items():
|
||||
for task in model_data.get('tasks', []):
|
||||
category = task.get('category', 'unknown')
|
||||
task_id = task.get('task_id', 'unknown')
|
||||
|
||||
key = f"{category}/{task_id}"
|
||||
if key not in task_performance:
|
||||
task_performance[key] = {
|
||||
"category": category,
|
||||
"task_id": task_id,
|
||||
"task_name": task.get('task_name', 'Unknown'),
|
||||
"models": {}
|
||||
}
|
||||
|
||||
if model not in task_performance[key]["models"]:
|
||||
task_performance[key]["models"][model] = {
|
||||
"scores": [],
|
||||
"times": []
|
||||
}
|
||||
|
||||
task_performance[key]["models"][model]["scores"].append(
|
||||
task.get('accuracy_score', 0) * 0.5 +
|
||||
task.get('completeness_score', 0) * 0.3 +
|
||||
task.get('tool_usage_score', 0) * 0.2
|
||||
)
|
||||
task_performance[key]["models"][model]["times"].append(
|
||||
task.get('response_time', 0)
|
||||
)
|
||||
|
||||
# Calculate averages
|
||||
for task_key, task_data in task_performance.items():
|
||||
for model, model_scores in task_data["models"].items():
|
||||
scores = model_scores["scores"]
|
||||
times = model_scores["times"]
|
||||
model_scores["avg_score"] = sum(scores) / len(scores) if scores else 0
|
||||
model_scores["avg_time"] = sum(times) / len(times) if times else 0
|
||||
|
||||
return task_performance
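
# Note: the per-task score above uses its own 0.5/0.3/0.2 weighting of
# accuracy/completeness/tool-usage (response time is tracked separately here),
# which differs from the 0.4/0.3/0.2/0.1 overall weights used by eval-runner.py.
# Illustrative entry (values made up):
#   task_performance["tool_calling/list_files"] == {
#       "category": "tool_calling", "task_id": "list_files",
#       "task_name": "List Source Files",
#       "models": {"llama3.2:latest": {"scores": [...], "times": [...],
#                                      "avg_score": 7.4, "avg_time": 3.1}}
#   }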
|
||||
|
||||
|
||||
def format_task_analysis(task_performance: dict) -> str:
|
||||
"""Format task-level analysis."""
|
||||
lines = []
|
||||
lines.append("\n## Task-Level Performance\n")
|
||||
|
||||
# Group by category
|
||||
by_category = {}
|
||||
for key, data in task_performance.items():
|
||||
cat = data["category"]
|
||||
if cat not in by_category:
|
||||
by_category[cat] = []
|
||||
by_category[cat].append(data)
|
||||
|
||||
for category, tasks in sorted(by_category.items()):
|
||||
lines.append(f"### {category.replace('_', ' ').title()}\n")
|
||||
lines.append("| Task | Best Model | Score | Time |")
|
||||
lines.append("|------|------------|-------|------|")
|
||||
|
||||
for task in tasks:
|
||||
# Find best model for this task
|
||||
best_model = None
|
||||
best_score = 0
|
||||
for model, scores in task["models"].items():
|
||||
if scores["avg_score"] > best_score:
|
||||
best_score = scores["avg_score"]
|
||||
best_model = model
|
||||
|
||||
if best_model:
|
||||
best_time = task["models"][best_model]["avg_time"]
|
||||
lines.append("| {} | {} | {:.1f}/10 | {:.1f}s |".format(
|
||||
task["task_name"],
|
||||
best_model,
|
||||
best_score,
|
||||
best_time
|
||||
))
|
||||
|
||||
lines.append("")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Generate comparison reports from AI evaluation results"
|
||||
)
|
||||
parser.add_argument(
|
||||
"files",
|
||||
nargs="+",
|
||||
help="Evaluation result JSON files to compare"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--format", "-f",
|
||||
choices=["table", "markdown", "json"],
|
||||
default="table",
|
||||
help="Output format (default: table)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", "-o",
|
||||
help="Output file (default: stdout)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--best",
|
||||
action="store_true",
|
||||
help="Only output the best model name (for scripting)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--task-analysis",
|
||||
action="store_true",
|
||||
help="Include task-level performance analysis"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Load and merge results
|
||||
results = load_results(args.files)
|
||||
if not results:
|
||||
print("No valid result files found", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
merged = merge_results(results)
|
||||
|
||||
# Handle --best flag
|
||||
if args.best:
|
||||
print(get_best_model(merged))
|
||||
sys.exit(0)
|
||||
|
||||
# Format output
|
||||
if args.format == "table":
|
||||
output = format_table(merged)
|
||||
elif args.format == "markdown":
|
||||
output = format_markdown(merged)
|
||||
if args.task_analysis:
|
||||
task_perf = analyze_task_performance(results)
|
||||
output += format_task_analysis(task_perf)
|
||||
else:
|
||||
output = format_json(merged)
|
||||
|
||||
# Write output
|
||||
if args.output:
|
||||
with open(args.output, 'w') as f:
|
||||
f.write(output)
|
||||
print(f"Report written to: {args.output}")
|
||||
else:
|
||||
print(output)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
scripts/ai/eval-runner.py (new executable file, 596 lines)
@@ -0,0 +1,596 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
YAZE AI Model Evaluation Runner
|
||||
|
||||
Runs evaluation tasks against multiple AI models and produces scored results.
|
||||
|
||||
Usage:
|
||||
python eval-runner.py --models llama3,qwen2.5-coder --tasks rom_inspection
|
||||
python eval-runner.py --all-models --tasks all --output results/eval-$(date +%Y%m%d).json
|
||||
|
||||
Requirements:
|
||||
pip install requests pyyaml
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
|
||||
import requests
|
||||
import yaml
|
||||
|
||||
|
||||
@dataclass
|
||||
class TaskResult:
|
||||
"""Result of a single task evaluation."""
|
||||
task_id: str
|
||||
task_name: str
|
||||
category: str
|
||||
model: str
|
||||
prompt: str
|
||||
response: str
|
||||
response_time: float
|
||||
accuracy_score: float = 0.0
|
||||
completeness_score: float = 0.0
|
||||
tool_usage_score: float = 0.0
|
||||
pattern_matches: list = field(default_factory=list)
|
||||
tools_used: list = field(default_factory=list)
|
||||
error: Optional[str] = None
|
||||
|
||||
@property
|
||||
def overall_score(self) -> float:
|
||||
"""Calculate weighted overall score."""
|
||||
# Default weights from eval-tasks.yaml
|
||||
weights = {
|
||||
'accuracy': 0.4,
|
||||
'completeness': 0.3,
|
||||
'tool_usage': 0.2,
|
||||
'response_time': 0.1
|
||||
}
|
||||
|
||||
# Normalize response time to 0-10 scale (lower is better)
|
||||
# 0s = 10, 60s+ = 0
|
||||
time_score = max(0, 10 - (self.response_time / 6))
|
||||
|
||||
return (
|
||||
weights['accuracy'] * self.accuracy_score +
|
||||
weights['completeness'] * self.completeness_score +
|
||||
weights['tool_usage'] * self.tool_usage_score +
|
||||
weights['response_time'] * time_score
|
||||
)
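
    # Worked example (illustrative values): accuracy=8.0, completeness=7.0,
    # tool_usage=9.0, response_time=12s
    #   time_score = 10 - 12/6 = 8.0
    #   overall    = 0.4*8.0 + 0.3*7.0 + 0.2*9.0 + 0.1*8.0 = 7.9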
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelResults:
|
||||
"""Aggregated results for a single model."""
|
||||
model: str
|
||||
tasks: list[TaskResult] = field(default_factory=list)
|
||||
|
||||
@property
|
||||
def avg_accuracy(self) -> float:
|
||||
if not self.tasks:
|
||||
return 0.0
|
||||
return sum(t.accuracy_score for t in self.tasks) / len(self.tasks)
|
||||
|
||||
@property
|
||||
def avg_completeness(self) -> float:
|
||||
if not self.tasks:
|
||||
return 0.0
|
||||
return sum(t.completeness_score for t in self.tasks) / len(self.tasks)
|
||||
|
||||
@property
|
||||
def avg_tool_usage(self) -> float:
|
||||
if not self.tasks:
|
||||
return 0.0
|
||||
return sum(t.tool_usage_score for t in self.tasks) / len(self.tasks)
|
||||
|
||||
@property
|
||||
def avg_response_time(self) -> float:
|
||||
if not self.tasks:
|
||||
return 0.0
|
||||
return sum(t.response_time for t in self.tasks) / len(self.tasks)
|
||||
|
||||
@property
|
||||
def overall_score(self) -> float:
|
||||
if not self.tasks:
|
||||
return 0.0
|
||||
return sum(t.overall_score for t in self.tasks) / len(self.tasks)
|
||||
|
||||
|
||||
class OllamaClient:
|
||||
"""Client for Ollama API."""
|
||||
|
||||
def __init__(self, base_url: str = "http://localhost:11434"):
|
||||
self.base_url = base_url
|
||||
|
||||
def is_available(self) -> bool:
|
||||
"""Check if Ollama is running."""
|
||||
try:
|
||||
resp = requests.get(f"{self.base_url}/api/tags", timeout=5)
|
||||
return resp.status_code == 200
|
||||
except requests.exceptions.RequestException:
|
||||
return False
|
||||
|
||||
def list_models(self) -> list[str]:
|
||||
"""List available models."""
|
||||
try:
|
||||
resp = requests.get(f"{self.base_url}/api/tags", timeout=10)
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
return [m['name'] for m in data.get('models', [])]
|
||||
except requests.exceptions.RequestException:
|
||||
pass
|
||||
return []
|
||||
|
||||
def pull_model(self, model: str) -> bool:
|
||||
"""Pull a model if not available."""
|
||||
print(f" Pulling model {model}...", end=" ", flush=True)
|
||||
try:
|
||||
resp = requests.post(
|
||||
f"{self.base_url}/api/pull",
|
||||
json={"name": model},
|
||||
timeout=600 # 10 minutes for large models
|
||||
)
|
||||
if resp.status_code == 200:
|
||||
print("Done")
|
||||
return True
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f"Failed: {e}")
|
||||
return False
|
||||
|
||||
def chat(self, model: str, prompt: str, timeout: int = 120) -> tuple[str, float]:
|
||||
"""
|
||||
Send a chat message and return response + response time.
|
||||
|
||||
Returns:
|
||||
Tuple of (response_text, response_time_seconds)
|
||||
"""
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
resp = requests.post(
|
||||
f"{self.base_url}/api/chat",
|
||||
json={
|
||||
"model": model,
|
||||
"messages": [{"role": "user", "content": prompt}],
|
||||
"stream": False
|
||||
},
|
||||
timeout=timeout
|
||||
)
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
content = data.get("message", {}).get("content", "")
|
||||
return content, elapsed
|
||||
else:
|
||||
return f"Error: HTTP {resp.status_code}", elapsed
|
||||
|
||||
except requests.exceptions.Timeout:
|
||||
return "Error: Request timed out", timeout
|
||||
except requests.exceptions.RequestException as e:
|
||||
return f"Error: {str(e)}", time.time() - start_time
|
||||
|
||||
|
||||
class TaskEvaluator:
|
||||
"""Evaluates task responses and assigns scores."""
|
||||
|
||||
def __init__(self, config: dict):
|
||||
self.config = config
|
||||
|
||||
def evaluate(self, task: dict, response: str, response_time: float) -> TaskResult:
|
||||
"""Evaluate a response for a task."""
|
||||
result = TaskResult(
|
||||
task_id=task['id'],
|
||||
task_name=task['name'],
|
||||
category=task.get('category', 'unknown'),
|
||||
model=task.get('model', 'unknown'),
|
||||
prompt=task.get('prompt', ''),
|
||||
response=response,
|
||||
response_time=response_time
|
||||
)
|
||||
|
||||
if response.startswith("Error:"):
|
||||
result.error = response
|
||||
return result
|
||||
|
||||
# Check pattern matches
|
||||
expected_patterns = task.get('expected_patterns', [])
|
||||
for pattern in expected_patterns:
|
||||
if re.search(pattern, response, re.IGNORECASE):
|
||||
result.pattern_matches.append(pattern)
|
||||
|
||||
# Score accuracy based on pattern matches
|
||||
if expected_patterns:
|
||||
match_ratio = len(result.pattern_matches) / len(expected_patterns)
|
||||
result.accuracy_score = match_ratio * 10
|
||||
else:
|
||||
# No patterns defined, give neutral score
|
||||
result.accuracy_score = 5.0
|
||||
|
||||
# Score completeness based on response length and structure
|
||||
result.completeness_score = self._score_completeness(response, task)
|
||||
|
||||
# Score tool usage
|
||||
result.tool_usage_score = self._score_tool_usage(response, task)
|
||||
|
||||
return result
|
||||
|
||||
def _score_completeness(self, response: str, task: dict) -> float:
|
||||
"""Score completeness based on response characteristics."""
|
||||
score = 0.0
|
||||
|
||||
# Base score for having a response
|
||||
if len(response.strip()) > 0:
|
||||
score += 2.0
|
||||
|
||||
# Length bonus (up to 4 points)
|
||||
word_count = len(response.split())
|
||||
if word_count >= 20:
|
||||
score += min(4.0, word_count / 50)
|
||||
|
||||
# Structure bonus (up to 2 points)
|
||||
if '\n' in response:
|
||||
score += 1.0 # Multi-line response
|
||||
if '- ' in response or '* ' in response:
|
||||
score += 0.5 # List items
|
||||
if any(c.isdigit() for c in response):
|
||||
score += 0.5 # Contains numbers/data
|
||||
|
||||
        # Code block bonus (fenced block or an indented code line)
        if '```' in response or '\n    ' in response:
|
||||
score += 1.0
|
||||
|
||||
return min(10.0, score)
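
    # Worked example (illustrative): a 120-word, multi-line answer containing a
    # bullet list and some digits, but no code block, scores
    #   2.0 (non-empty) + min(4.0, 120/50) + 1.0 + 0.5 + 0.5 = 6.4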
|
||||
|
||||
def _score_tool_usage(self, response: str, task: dict) -> float:
|
||||
"""Score tool usage based on task requirements."""
|
||||
required_tool = task.get('required_tool')
|
||||
|
||||
if not required_tool:
|
||||
# No tool required, check if response is sensible
|
||||
return 7.0 # Neutral-good score
|
||||
|
||||
# Check if the response mentions using tools
|
||||
tool_patterns = [
|
||||
r'filesystem-list',
|
||||
r'filesystem-read',
|
||||
r'filesystem-exists',
|
||||
r'filesystem-info',
|
||||
r'build-configure',
|
||||
r'build-compile',
|
||||
r'build-test',
|
||||
r'memory-analyze',
|
||||
r'memory-search',
|
||||
]
|
||||
|
||||
tools_mentioned = []
|
||||
for pattern in tool_patterns:
|
||||
if re.search(pattern, response, re.IGNORECASE):
|
||||
tools_mentioned.append(pattern)
|
||||
|
||||
if required_tool.lower() in ' '.join(tools_mentioned).lower():
|
||||
return 10.0 # Used the required tool
|
||||
elif tools_mentioned:
|
||||
return 6.0 # Used some tools but not the required one
|
||||
else:
|
||||
return 3.0 # Didn't use any tools when one was required
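
    # Illustrative outcomes: with required_tool="filesystem-read", a response
    # that mentions "filesystem-read" scores 10.0; one that only mentions
    # "filesystem-list" scores 6.0; one that names no tool at all scores 3.0.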
|
||||
|
||||
|
||||
def load_config(config_path: str) -> dict:
|
||||
"""Load the evaluation tasks configuration."""
|
||||
with open(config_path, 'r') as f:
|
||||
return yaml.safe_load(f)
|
||||
|
||||
|
||||
def get_tasks_for_categories(config: dict, categories: list[str]) -> list[dict]:
|
||||
"""Get all tasks for specified categories."""
|
||||
tasks = []
|
||||
|
||||
for cat_name, cat_data in config.get('categories', {}).items():
|
||||
if 'all' in categories or cat_name in categories:
|
||||
for task in cat_data.get('tasks', []):
|
||||
task['category'] = cat_name
|
||||
tasks.append(task)
|
||||
|
||||
return tasks
|
||||
|
||||
|
||||
def run_evaluation(
|
||||
models: list[str],
|
||||
tasks: list[dict],
|
||||
client: OllamaClient,
|
||||
evaluator: TaskEvaluator,
|
||||
timeout: int = 120
|
||||
) -> dict[str, ModelResults]:
|
||||
"""Run evaluation for all models and tasks."""
|
||||
results = {}
|
||||
|
||||
total = len(models) * len(tasks)
|
||||
current = 0
|
||||
|
||||
for model in models:
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Evaluating: {model}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
model_results = ModelResults(model=model)
|
||||
|
||||
for task in tasks:
|
||||
current += 1
|
||||
print(f"\n [{current}/{total}] {task['id']}: {task['name']}")
|
||||
|
||||
# Handle multi-turn tasks differently
|
||||
if task.get('multi_turn'):
|
||||
response, resp_time = run_multi_turn_task(
|
||||
client, model, task, timeout
|
||||
)
|
||||
else:
|
||||
prompt = task.get('prompt', '')
|
||||
print(f" Prompt: {prompt[:60]}...")
|
||||
response, resp_time = client.chat(model, prompt, timeout)
|
||||
|
||||
print(f" Response time: {resp_time:.2f}s")
|
||||
|
||||
# Create a copy of task with model info
|
||||
task_with_model = {**task, 'model': model}
|
||||
|
||||
# Evaluate the response
|
||||
result = evaluator.evaluate(task_with_model, response, resp_time)
|
||||
model_results.tasks.append(result)
|
||||
|
||||
print(f" Accuracy: {result.accuracy_score:.1f}/10")
|
||||
print(f" Completeness: {result.completeness_score:.1f}/10")
|
||||
print(f" Tool Usage: {result.tool_usage_score:.1f}/10")
|
||||
print(f" Overall: {result.overall_score:.1f}/10")
|
||||
|
||||
results[model] = model_results
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def run_multi_turn_task(
|
||||
client: OllamaClient,
|
||||
model: str,
|
||||
task: dict,
|
||||
timeout: int
|
||||
) -> tuple[str, float]:
|
||||
"""Run a multi-turn conversation task."""
|
||||
prompts = task.get('prompts', [])
|
||||
if not prompts:
|
||||
return "Error: No prompts defined for multi-turn task", 0.0
|
||||
|
||||
total_time = 0.0
|
||||
all_responses = []
|
||||
|
||||
for i, prompt in enumerate(prompts):
|
||||
# For simplicity, we send each prompt independently
|
||||
# A more sophisticated version would maintain conversation context
|
||||
print(f" Turn {i+1}: {prompt[:50]}...")
|
||||
response, resp_time = client.chat(model, prompt, timeout)
|
||||
total_time += resp_time
|
||||
all_responses.append(f"Turn {i+1}: {response}")
|
||||
|
||||
return "\n\n".join(all_responses), total_time
|
||||
|
||||
|
||||
def print_summary(results: dict[str, ModelResults]):
|
||||
"""Print a summary table of results."""
|
||||
print("\n")
|
||||
print("┌" + "─"*70 + "┐")
|
||||
print("│" + " "*20 + "YAZE AI Model Evaluation Report" + " "*18 + "│")
|
||||
print("├" + "─"*70 + "┤")
|
||||
print("│ {:20} │ {:10} │ {:10} │ {:10} │ {:10} │".format(
|
||||
"Model", "Accuracy", "Tool Use", "Speed", "Overall"
|
||||
))
|
||||
print("├" + "─"*70 + "┤")
|
||||
|
||||
for model, model_results in sorted(
|
||||
results.items(),
|
||||
key=lambda x: x[1].overall_score,
|
||||
reverse=True
|
||||
):
|
||||
# Format model name (truncate if needed)
|
||||
model_name = model[:20] if len(model) <= 20 else model[:17] + "..."
|
||||
|
||||
print("│ {:20} │ {:8.1f}/10 │ {:8.1f}/10 │ {:7.1f}s │ {:8.1f}/10 │".format(
|
||||
model_name,
|
||||
model_results.avg_accuracy,
|
||||
model_results.avg_tool_usage,
|
||||
model_results.avg_response_time,
|
||||
model_results.overall_score
|
||||
))
|
||||
|
||||
print("└" + "─"*70 + "┘")
|
||||
|
||||
|
||||
def save_results(results: dict[str, ModelResults], output_path: str):
|
||||
"""Save detailed results to JSON file."""
|
||||
output_data = {
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
"version": "1.0",
|
||||
"models": {}
|
||||
}
|
||||
|
||||
for model, model_results in results.items():
|
||||
output_data["models"][model] = {
|
||||
"summary": {
|
||||
"avg_accuracy": model_results.avg_accuracy,
|
||||
"avg_completeness": model_results.avg_completeness,
|
||||
"avg_tool_usage": model_results.avg_tool_usage,
|
||||
"avg_response_time": model_results.avg_response_time,
|
||||
"overall_score": model_results.overall_score,
|
||||
},
|
||||
"tasks": [asdict(t) for t in model_results.tasks]
|
||||
}
|
||||
|
||||
os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
|
||||
with open(output_path, 'w') as f:
|
||||
json.dump(output_data, f, indent=2)
|
||||
|
||||
print(f"\nResults saved to: {output_path}")
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="YAZE AI Model Evaluation Runner"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--models", "-m",
|
||||
type=str,
|
||||
help="Comma-separated list of models to evaluate"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--all-models",
|
||||
action="store_true",
|
||||
help="Evaluate all available models"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--default-models",
|
||||
action="store_true",
|
||||
help="Evaluate default models from config"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tasks", "-t",
|
||||
type=str,
|
||||
default="all",
|
||||
help="Task categories to run (comma-separated, or 'all')"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config", "-c",
|
||||
type=str,
|
||||
default=os.path.join(os.path.dirname(__file__), "eval-tasks.yaml"),
|
||||
help="Path to evaluation config file"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", "-o",
|
||||
type=str,
|
||||
help="Output file for results (default: results/eval-TIMESTAMP.json)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--timeout",
|
||||
type=int,
|
||||
default=120,
|
||||
help="Timeout in seconds for each task (default: 120)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ollama-url",
|
||||
type=str,
|
||||
default="http://localhost:11434",
|
||||
help="Ollama API URL"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dry-run",
|
||||
action="store_true",
|
||||
help="Show what would be evaluated without running"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Load configuration
|
||||
print("Loading configuration...")
|
||||
try:
|
||||
config = load_config(args.config)
|
||||
except Exception as e:
|
||||
print(f"Error loading config: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
# Initialize Ollama client
|
||||
client = OllamaClient(args.ollama_url)
|
||||
|
||||
if not client.is_available():
|
||||
print("Error: Ollama is not running. Start it with 'ollama serve'")
|
||||
sys.exit(1)
|
||||
|
||||
# Determine which models to evaluate
|
||||
available_models = client.list_models()
|
||||
print(f"Available models: {', '.join(available_models) or 'none'}")
|
||||
|
||||
if args.all_models:
|
||||
models = available_models
|
||||
elif args.default_models:
|
||||
default_model_names = [
|
||||
m['name'] for m in config.get('default_models', [])
|
||||
]
|
||||
models = [m for m in default_model_names if m in available_models]
|
||||
# Offer to pull missing models
|
||||
missing = [m for m in default_model_names if m not in available_models]
|
||||
if missing:
|
||||
print(f"Missing default models: {', '.join(missing)}")
|
||||
for m in missing:
|
||||
if client.pull_model(m):
|
||||
models.append(m)
|
||||
elif args.models:
|
||||
        models = [m.strip() for m in args.models.split(',')]
        # Validate models exist; iterate over a copy so that removing a model
        # that fails to pull does not skip the entry after it
        for m in list(models):
            if m not in available_models:
                print(f"Warning: Model '{m}' not found. Attempting to pull...")
                if not client.pull_model(m):
                    print(f"  Failed to pull {m}, skipping")
                    models.remove(m)
|
||||
else:
|
||||
# Default to first available model
|
||||
models = available_models[:1] if available_models else []
|
||||
|
||||
if not models:
|
||||
print("No models available for evaluation")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"Models to evaluate: {', '.join(models)}")
|
||||
|
||||
# Get tasks
|
||||
categories = [c.strip() for c in args.tasks.split(',')]
|
||||
tasks = get_tasks_for_categories(config, categories)
|
||||
|
||||
if not tasks:
|
||||
print(f"No tasks found for categories: {args.tasks}")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"Tasks to run: {len(tasks)}")
|
||||
for task in tasks:
|
||||
print(f" - [{task['category']}] {task['id']}: {task['name']}")
|
||||
|
||||
if args.dry_run:
|
||||
print("\nDry run complete. Use --help for options.")
|
||||
sys.exit(0)
|
||||
|
||||
# Run evaluation
|
||||
evaluator = TaskEvaluator(config)
|
||||
results = run_evaluation(
|
||||
models, tasks, client, evaluator, args.timeout
|
||||
)
|
||||
|
||||
# Print summary
|
||||
print_summary(results)
|
||||
|
||||
# Save results
|
||||
output_path = args.output or os.path.join(
|
||||
os.path.dirname(__file__),
|
||||
"results",
|
||||
f"eval-{datetime.now().strftime('%Y%m%d-%H%M%S')}.json"
|
||||
)
|
||||
save_results(results, output_path)
|
||||
|
||||
# Return exit code based on best model score
|
||||
best_score = max(r.overall_score for r in results.values())
|
||||
if best_score >= 7.0:
|
||||
sys.exit(0) # Good
|
||||
elif best_score >= 5.0:
|
||||
sys.exit(1) # Okay
|
||||
else:
|
||||
sys.exit(2) # Poor
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
scripts/ai/eval-tasks.yaml (new file, 383 lines)
@@ -0,0 +1,383 @@
|
||||
# YAZE AI Model Evaluation Tasks
|
||||
#
|
||||
# This file defines evaluation tasks for comparing different AI models
|
||||
# used with the z3ed CLI agent system.
|
||||
#
|
||||
# Usage:
|
||||
# ./scripts/ai/run-model-eval.sh --models "llama3,qwen2.5,codellama" --tasks all
|
||||
# ./scripts/ai/run-model-eval.sh --tasks rom_inspection --models "llama3"
|
||||
#
|
||||
# Scoring:
|
||||
# Each task is scored on a 0-10 scale across multiple dimensions:
|
||||
# - accuracy: Did the model answer correctly?
|
||||
# - completeness: Did it include all relevant information?
|
||||
# - tool_usage: Did it use tools appropriately?
|
||||
# - response_time: Measured in seconds (lower is better)
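#
# Worked example of the weighted overall score (illustrative values):
#   accuracy=8, completeness=7, tool_usage=9, response_time=12s
#   response time is normalized by the runner to 10 - 12/6 = 8
#   overall = 0.4*8 + 0.3*7 + 0.2*9 + 0.1*8 = 7.9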
|
||||
|
||||
version: "1.0"
|
||||
|
||||
# Models to evaluate by default
|
||||
default_models:
|
||||
- name: "llama3.2:latest"
|
||||
description: "Meta's Llama 3.2 - default baseline"
|
||||
type: "baseline"
|
||||
- name: "qwen2.5-coder:7b"
|
||||
description: "Qwen 2.5 Coder - optimized for code"
|
||||
type: "code"
|
||||
- name: "codellama:7b"
|
||||
description: "Meta's CodeLlama - code generation"
|
||||
type: "code"
|
||||
- name: "mistral:7b"
|
||||
description: "Mistral 7B - general purpose"
|
||||
type: "general"
|
||||
- name: "phi3:medium"
|
||||
description: "Microsoft Phi-3 - efficient"
|
||||
type: "efficient"
|
||||
|
||||
# Scoring weights for overall score calculation
|
||||
scoring_weights:
|
||||
accuracy: 0.4
|
||||
completeness: 0.3
|
||||
tool_usage: 0.2
|
||||
response_time: 0.1
|
||||
|
||||
# Maximum response time before timeout (seconds)
|
||||
timeout: 120
|
||||
|
||||
# Evaluation task categories
|
||||
categories:
|
||||
rom_inspection:
|
||||
description: "Tasks that inspect ROM data structures"
|
||||
tasks:
|
||||
- id: "list_dungeons"
|
||||
name: "List Dungeons"
|
||||
prompt: "What dungeons are in this ROM? List their names and IDs."
|
||||
expected_patterns:
|
||||
- "eastern palace|palace of darkness|desert palace"
|
||||
- "tower of hera|swamp palace|skull woods"
|
||||
- "thieves|ice palace|misery mire"
|
||||
required_tool: null
|
||||
scoring:
|
||||
accuracy_criteria: "Lists at least 8 dungeons with correct names"
|
||||
completeness_criteria: "Includes dungeon IDs or entrance info"
|
||||
|
||||
- id: "describe_overworld"
|
||||
name: "Describe Overworld Map"
|
||||
prompt: "Describe overworld map 0 (Light World). What areas and features are visible?"
|
||||
expected_patterns:
|
||||
- "light world|hyrule"
|
||||
- "castle|sanctuary|kakariko"
|
||||
required_tool: null
|
||||
scoring:
|
||||
accuracy_criteria: "Correctly identifies the Light World"
|
||||
completeness_criteria: "Mentions multiple notable locations"
|
||||
|
||||
- id: "find_sprites"
|
||||
name: "Find Sprites in Room"
|
||||
prompt: "What sprites are present in dungeon room 0? List their types and positions."
|
||||
expected_patterns:
|
||||
- "sprite|enemy|npc"
|
||||
- "position|coordinate|x|y"
|
||||
required_tool: null
|
||||
scoring:
|
||||
accuracy_criteria: "Lists sprites with correct types"
|
||||
completeness_criteria: "Includes position data"
|
||||
|
||||
- id: "entrance_info"
|
||||
name: "Get Entrance Information"
|
||||
prompt: "Where is the entrance to the Eastern Palace?"
|
||||
expected_patterns:
|
||||
- "eastern|palace|entrance"
|
||||
- "east|light world"
|
||||
required_tool: null
|
||||
scoring:
|
||||
accuracy_criteria: "Correctly identifies entrance location"
|
||||
completeness_criteria: "Provides coordinates or map reference"
|
||||
|
||||
code_analysis:
|
||||
description: "Tasks that analyze or generate code"
|
||||
tasks:
|
||||
- id: "explain_function"
|
||||
name: "Explain Function"
|
||||
prompt: "Explain what the function LoadDungeonRoom does in the codebase."
|
||||
expected_patterns:
|
||||
- "dungeon|room|load"
|
||||
- "tilemap|object|sprite"
|
||||
required_tool: "filesystem-read"
|
||||
scoring:
|
||||
accuracy_criteria: "Correctly describes the function purpose"
|
||||
completeness_criteria: "Explains key steps or data flows"
|
||||
|
||||
- id: "find_bugs"
|
||||
name: "Find Potential Issues"
|
||||
prompt: "Are there any potential issues with how sprite coordinates are handled in room loading?"
|
||||
expected_patterns:
|
||||
- "bounds|overflow|check"
|
||||
- "coordinate|position"
|
||||
required_tool: "filesystem-read"
|
||||
scoring:
|
||||
accuracy_criteria: "Identifies real or plausible issues"
|
||||
completeness_criteria: "Explains why the issue matters"
|
||||
|
||||
- id: "suggest_refactor"
|
||||
name: "Suggest Refactoring"
|
||||
prompt: "How could the dungeon editor's room rendering be improved for performance?"
|
||||
expected_patterns:
|
||||
- "cache|batch|optimize"
|
||||
- "render|draw|update"
|
||||
required_tool: "filesystem-read"
|
||||
scoring:
|
||||
accuracy_criteria: "Suggests valid optimization strategies"
|
||||
completeness_criteria: "Explains implementation approach"
|
||||
|
||||
tool_calling:
|
||||
description: "Tasks that require proper tool usage"
|
||||
tasks:
|
||||
- id: "list_files"
|
||||
name: "List Source Files"
|
||||
prompt: "List all .cc files in src/app/editor/"
|
||||
expected_patterns:
|
||||
- "\\.cc"
|
||||
- "editor"
|
||||
required_tool: "filesystem-list"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses filesystem-list tool correctly"
|
||||
completeness_criteria: "Lists files in correct directory"
|
||||
|
||||
- id: "read_file"
|
||||
name: "Read File Contents"
|
||||
prompt: "What are the first 20 lines of src/app/rom.h?"
|
||||
expected_patterns:
|
||||
- "#ifndef|#define|#include"
|
||||
- "rom|Rom"
|
||||
required_tool: "filesystem-read"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses filesystem-read with correct path"
|
||||
completeness_criteria: "Shows actual file content"
|
||||
|
||||
- id: "check_existence"
|
||||
name: "Check File Existence"
|
||||
prompt: "Does the file src/app/editor/dungeon/dungeon_editor.cc exist?"
|
||||
expected_patterns:
|
||||
- "exists|found|yes"
|
||||
required_tool: "filesystem-exists"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses filesystem-exists tool"
|
||||
completeness_criteria: "Provides clear yes/no answer"
|
||||
|
||||
- id: "build_status"
|
||||
name: "Get Build Status"
|
||||
prompt: "What build presets are available for macOS?"
|
||||
expected_patterns:
|
||||
- "mac-dbg|mac-rel|mac-ai|mac-test"
|
||||
- "preset|configure"
|
||||
required_tool: "build-configure"
|
||||
scoring:
|
||||
accuracy_criteria: "Lists valid macOS presets"
|
||||
completeness_criteria: "Describes preset purposes"
|
||||
|
||||
visual_analysis:
|
||||
description: "Tasks for visual analysis and pattern recognition"
|
||||
tasks:
|
||||
- id: "find_similar_tiles"
|
||||
name: "Find Similar Tiles"
|
||||
prompt: "Find tiles similar to tile 42 in the ROM. Use a similarity threshold of 85%."
|
||||
expected_patterns:
|
||||
- "similar|match|tile"
|
||||
- "similarity|score|percent"
|
||||
required_tool: "visual-find-similar-tiles"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses visual-find-similar-tiles with correct parameters"
|
||||
completeness_criteria: "Returns list of matching tiles with scores"
|
||||
|
||||
- id: "analyze_spritesheet"
|
||||
name: "Analyze Spritesheet"
|
||||
prompt: "Analyze graphics sheet 10 to find unused regions that could be used for custom graphics."
|
||||
expected_patterns:
|
||||
- "unused|empty|free"
|
||||
- "region|space|tile"
|
||||
required_tool: "visual-analyze-spritesheet"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses visual-analyze-spritesheet tool"
|
||||
completeness_criteria: "Reports locations and sizes of free regions"
|
||||
|
||||
- id: "palette_usage"
|
||||
name: "Palette Usage Analysis"
|
||||
prompt: "Analyze which palettes are used most frequently in the overworld maps."
|
||||
expected_patterns:
|
||||
- "palette|color"
|
||||
- "usage|count|percent"
|
||||
required_tool: "visual-palette-usage"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses visual-palette-usage with overworld type"
|
||||
completeness_criteria: "Shows palette usage statistics"
|
||||
|
||||
- id: "tile_histogram"
|
||||
name: "Tile Usage Histogram"
|
||||
prompt: "Generate a histogram of the top 20 most used tiles in dungeon rooms."
|
||||
expected_patterns:
|
||||
- "tile|usage|histogram"
|
||||
- "count|frequency|top"
|
||||
required_tool: "visual-tile-histogram"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses visual-tile-histogram with dungeon type"
|
||||
completeness_criteria: "Lists top tiles with usage counts"
|
||||
|
||||
project_management:
|
||||
description: "Tasks for project state and snapshot management"
|
||||
tasks:
|
||||
- id: "project_status"
|
||||
name: "Get Project Status"
|
||||
prompt: "What is the current project status? Show me any pending edits and available snapshots."
|
||||
expected_patterns:
|
||||
- "project|status|snapshot"
|
||||
- "edit|pending|initialized"
|
||||
required_tool: "project-status"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses project-status tool correctly"
|
||||
completeness_criteria: "Reports project state, snapshots, and ROM checksum"
|
||||
|
||||
- id: "create_snapshot"
|
||||
name: "Create Project Snapshot"
|
||||
prompt: "Create a snapshot named 'v1.0' with description 'Initial sprite modifications'."
|
||||
expected_patterns:
|
||||
- "snapshot|created|v1.0"
|
||||
- "edit|delta|saved"
|
||||
required_tool: "project-snapshot"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses project-snapshot with correct name parameter"
|
||||
completeness_criteria: "Confirms snapshot creation with details"
|
||||
|
||||
- id: "compare_snapshots"
|
||||
name: "Compare Snapshots"
|
||||
prompt: "Compare snapshots 'before-fix' and 'after-fix' to see what changed."
|
||||
expected_patterns:
|
||||
- "diff|compare|changed"
|
||||
- "added|removed|modified"
|
||||
required_tool: "project-diff"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses project-diff with both snapshot names"
|
||||
completeness_criteria: "Shows detailed comparison of edits"
|
||||
|
||||
- id: "restore_checkpoint"
|
||||
name: "Restore to Checkpoint"
|
||||
prompt: "Restore the ROM to the 'stable' snapshot."
|
||||
expected_patterns:
|
||||
- "restore|snapshot|stable"
|
||||
- "applied|reverted|edit"
|
||||
required_tool: "project-restore"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses project-restore with correct snapshot name"
|
||||
completeness_criteria: "Confirms restoration and lists applied edits"
|
||||
|
||||
code_generation:
|
||||
description: "Tasks for ASM code generation and patching"
|
||||
tasks:
|
||||
- id: "generate_hook"
|
||||
name: "Generate ASM Hook"
|
||||
prompt: "Generate an ASM hook at address $008040 with label MyCustomHook and 2 NOPs for alignment."
|
||||
expected_patterns:
|
||||
- "hook|JSL|008040"
|
||||
- "MyCustomHook|NOP"
|
||||
required_tool: "codegen-asm-hook"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses codegen-asm-hook with correct address and label"
|
||||
completeness_criteria: "Generates valid ASM with proper hook structure"
|
||||
|
||||
- id: "find_freespace"
|
||||
name: "Find Freespace for Patch"
|
||||
prompt: "Generate a freespace patch for 256 bytes of code labeled 'NewSpriteCode', preferring bank $3F."
|
||||
expected_patterns:
|
||||
- "freespace|org|NewSpriteCode"
|
||||
- "1F8000|bank|free"
|
||||
required_tool: "codegen-freespace-patch"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses codegen-freespace-patch with size and label"
|
||||
completeness_criteria: "Reports available regions and generates allocation code"
|
||||
|
||||
- id: "sprite_template"
|
||||
name: "Generate Sprite Template"
|
||||
prompt: "Generate a sprite template named 'FollowerSprite' with init code that sets sprite state and main code that follows the player."
|
||||
expected_patterns:
|
||||
- "sprite|FollowerSprite|template"
|
||||
- "init|main|0DD0"
|
||||
required_tool: "codegen-sprite-template"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses codegen-sprite-template with name and custom code"
|
||||
completeness_criteria: "Generates complete sprite with init and main sections"
|
||||
|
||||
- id: "event_handler"
|
||||
name: "Generate Event Handler"
|
||||
prompt: "Generate an NMI event handler labeled 'FrameCounter' that increments a counter each frame."
|
||||
expected_patterns:
|
||||
- "NMI|event|handler"
|
||||
- "FrameCounter|INC|counter"
|
||||
required_tool: "codegen-event-handler"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses codegen-event-handler with type=nmi and label"
|
||||
completeness_criteria: "Generates handler with state preservation and custom code"
|
||||
|
||||
conversation:
|
||||
description: "Tasks testing multi-turn dialog and context"
|
||||
tasks:
|
||||
- id: "follow_up"
|
||||
name: "Follow-up Questions"
|
||||
multi_turn: true
|
||||
prompts:
|
||||
- "What is the main purpose of the Rom class?"
|
||||
- "What methods does it have for loading data?"
|
||||
- "Can you show me an example of using LoadFromFile?"
|
||||
expected_patterns:
|
||||
- "rom|ROM|file"
|
||||
- "load|read|parse"
|
||||
- "example|code|usage"
|
||||
scoring:
|
||||
accuracy_criteria: "Maintains context across turns"
|
||||
completeness_criteria: "Each response builds on previous"
|
||||
|
||||
- id: "clarification"
|
||||
name: "Handle Clarification"
|
||||
multi_turn: true
|
||||
prompts:
|
||||
- "How do I add a new sprite?"
|
||||
- "I mean in the dungeon editor, not the overworld"
|
||||
expected_patterns:
|
||||
- "sprite|dungeon|editor"
|
||||
- "add|create|place"
|
||||
scoring:
|
||||
accuracy_criteria: "Adjusts response based on clarification"
|
||||
completeness_criteria: "Provides dungeon-specific instructions"
|
||||
|
||||
# Scoring rubric definitions
|
||||
scoring_rubric:
|
||||
accuracy:
|
||||
10: "Perfect - completely correct with no errors"
|
||||
8: "Excellent - minor inaccuracies that don't affect understanding"
|
||||
6: "Good - mostly correct with some notable errors"
|
||||
4: "Fair - partially correct but missing key points"
|
||||
2: "Poor - significant errors or misunderstandings"
|
||||
0: "Incorrect - completely wrong or off-topic"
|
||||
|
||||
completeness:
|
||||
10: "Comprehensive - covers all aspects thoroughly"
|
||||
8: "Very complete - covers most aspects well"
|
||||
6: "Adequate - covers main points but missing some details"
|
||||
4: "Partial - covers some points but lacks depth"
|
||||
2: "Minimal - barely addresses the question"
|
||||
0: "Incomplete - doesn't meaningfully address the question"
|
||||
|
||||
tool_usage:
|
||||
10: "Perfect - uses correct tools with proper parameters"
|
||||
8: "Good - uses appropriate tools with minor parameter issues"
|
||||
6: "Adequate - uses tools but not optimally"
|
||||
4: "Fair - attempts tool use but with errors"
|
||||
2: "Poor - wrong tool or significant usage errors"
|
||||
0: "Failed - doesn't use required tools or fails completely"
|
||||
|
||||
# Report configuration
|
||||
reporting:
|
||||
output_format: "table" # table, json, markdown
|
||||
show_individual_scores: true
|
||||
show_response_samples: true
|
||||
max_sample_length: 500
|
||||
|
||||
scripts/ai/results/.gitkeep (new file, 3 lines)
@@ -0,0 +1,3 @@
|
||||
# This directory stores AI evaluation results
|
||||
# Results are gitignored but this file keeps the directory in the repo
|
||||
|
||||
scripts/ai/run-model-eval.sh (new executable file, 340 lines)
@@ -0,0 +1,340 @@
|
||||
#!/bin/bash
|
||||
# =============================================================================
|
||||
# YAZE AI Model Evaluation Script
|
||||
#
|
||||
# Runs AI model evaluations using the eval-runner.py engine.
|
||||
#
|
||||
# Usage:
|
||||
# ./run-model-eval.sh # Run with defaults
|
||||
# ./run-model-eval.sh --models llama3,qwen2.5 # Specific models
|
||||
# ./run-model-eval.sh --all # All available models
|
||||
# ./run-model-eval.sh --quick # Quick smoke test
|
||||
# ./run-model-eval.sh --compare # Compare and report
|
||||
#
|
||||
# Prerequisites:
|
||||
# - Ollama running (ollama serve)
|
||||
# - Python 3.10+ with requests and pyyaml
|
||||
# - At least one model pulled (ollama pull llama3.2)
|
||||
# =============================================================================
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
|
||||
RESULTS_DIR="$SCRIPT_DIR/results"
|
||||
|
||||
# Colors
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
CYAN='\033[0;36m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
# Default settings
|
||||
MODELS=""
|
||||
TASKS="all"
|
||||
TIMEOUT=120
|
||||
DRY_RUN=false
|
||||
COMPARE=false
|
||||
QUICK_MODE=false
|
||||
ALL_MODELS=false
|
||||
DEFAULT_MODELS=false
|
||||
VERBOSE=false
|
||||
|
||||
# =============================================================================
|
||||
# Helper Functions
|
||||
# =============================================================================
|
||||
|
||||
print_header() {
|
||||
echo -e "${CYAN}"
|
||||
echo "╔════════════════════════════════════════════════════════════════════╗"
|
||||
echo "║ YAZE AI Model Evaluation ║"
|
||||
echo "╚════════════════════════════════════════════════════════════════════╝"
|
||||
echo -e "${NC}"
|
||||
}
|
||||
|
||||
print_step() {
|
||||
echo -e "${BLUE}[*]${NC} $1"
|
||||
}
|
||||
|
||||
print_success() {
|
||||
echo -e "${GREEN}[✓]${NC} $1"
|
||||
}
|
||||
|
||||
print_warning() {
|
||||
echo -e "${YELLOW}[!]${NC} $1"
|
||||
}
|
||||
|
||||
print_error() {
|
||||
echo -e "${RED}[✗]${NC} $1"
|
||||
}
|
||||
|
||||
usage() {
|
||||
echo "Usage: $0 [OPTIONS]"
|
||||
echo ""
|
||||
echo "Options:"
|
||||
echo " --models, -m LIST Comma-separated list of models to evaluate"
|
||||
echo " --all Evaluate all available models"
|
||||
echo " --default Evaluate default models from config"
|
||||
echo " --tasks, -t LIST Task categories (default: all)"
|
||||
echo " Options: rom_inspection, code_analysis, tool_calling, conversation"
|
||||
echo " --timeout SEC Timeout per task in seconds (default: 120)"
|
||||
echo " --quick Quick smoke test (fewer tasks)"
|
||||
echo " --dry-run Show what would run without executing"
|
||||
echo " --compare Generate comparison report after evaluation"
|
||||
echo " --verbose, -v Verbose output"
|
||||
echo " --help, -h Show this help message"
|
||||
echo ""
|
||||
echo "Examples:"
|
||||
echo " $0 --models llama3.2,qwen2.5-coder --tasks tool_calling"
|
||||
echo " $0 --all --compare"
|
||||
echo " $0 --quick --default"
|
||||
}
|
||||
|
||||
check_prerequisites() {
|
||||
print_step "Checking prerequisites..."
|
||||
|
||||
local missing=false
|
||||
|
||||
# Check Python
|
||||
if ! command -v python3 &> /dev/null; then
|
||||
print_error "Python 3 not found"
|
||||
missing=true
|
||||
else
|
||||
print_success "Python 3 found: $(python3 --version)"
|
||||
fi
|
||||
|
||||
# Check Python packages
|
||||
if python3 -c "import requests" 2>/dev/null; then
|
||||
print_success "Python 'requests' package installed"
|
||||
else
|
||||
print_warning "Python 'requests' package missing - installing..."
|
||||
pip3 install requests --quiet || missing=true
|
||||
fi
|
||||
|
||||
if python3 -c "import yaml" 2>/dev/null; then
|
||||
print_success "Python 'pyyaml' package installed"
|
||||
else
|
||||
print_warning "Python 'pyyaml' package missing - installing..."
|
||||
pip3 install pyyaml --quiet || missing=true
|
||||
fi
|
||||
|
||||
# Check Ollama
|
||||
if ! command -v ollama &> /dev/null; then
|
||||
print_error "Ollama not found. Install from https://ollama.ai"
|
||||
missing=true
|
||||
else
|
||||
print_success "Ollama found: $(ollama --version 2>/dev/null || echo 'version unknown')"
|
||||
fi
|
||||
|
||||
# Check if Ollama is running
|
||||
if curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
|
||||
print_success "Ollama server is running"
|
||||
else
|
||||
print_warning "Ollama server not running - attempting to start..."
|
||||
ollama serve &> /dev/null &
|
||||
sleep 3
|
||||
if curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
|
||||
print_success "Ollama server started"
|
||||
else
|
||||
print_error "Could not start Ollama server. Run 'ollama serve' manually."
|
||||
missing=true
|
||||
fi
|
||||
fi
|
||||
|
||||
if $missing; then
|
||||
print_error "Prerequisites check failed"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo ""
|
||||
}
|
||||
|
||||
list_available_models() {
|
||||
curl -s http://localhost:11434/api/tags | python3 -c "
|
||||
import json, sys
|
||||
data = json.load(sys.stdin)
|
||||
for model in data.get('models', []):
|
||||
print(model['name'])
|
||||
" 2>/dev/null || echo ""
|
||||
}
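
# The /api/tags response parsed above looks roughly like (abridged):
#   {"models": [{"name": "llama3.2:latest", ...}, ...]}
# so this prints one installed model name per line.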
|
||||
|
||||
ensure_model() {
|
||||
local model=$1
|
||||
local available=$(list_available_models)
|
||||
|
||||
if echo "$available" | grep -q "^$model$"; then
|
||||
return 0
|
||||
else
|
||||
print_warning "Model '$model' not found, pulling..."
|
||||
ollama pull "$model"
|
||||
return $?
|
||||
fi
|
||||
}
|
||||
|
||||
run_evaluation() {
|
||||
local args=()
|
||||
|
||||
if [ -n "$MODELS" ]; then
|
||||
args+=(--models "$MODELS")
|
||||
elif $ALL_MODELS; then
|
||||
args+=(--all-models)
|
||||
elif $DEFAULT_MODELS; then
|
||||
args+=(--default-models)
|
||||
fi
|
||||
|
||||
args+=(--tasks "$TASKS")
|
||||
args+=(--timeout "$TIMEOUT")
|
||||
args+=(--config "$SCRIPT_DIR/eval-tasks.yaml")
|
||||
|
||||
if $DRY_RUN; then
|
||||
args+=(--dry-run)
|
||||
fi
|
||||
|
||||
local output_file="$RESULTS_DIR/eval-$(date +%Y%m%d-%H%M%S).json"
|
||||
args+=(--output "$output_file")
|
||||
|
||||
print_step "Running evaluation..."
|
||||
if $VERBOSE; then
|
||||
echo " Command: python3 $SCRIPT_DIR/eval-runner.py ${args[*]}"
|
||||
fi
|
||||
echo ""
|
||||
|
||||
python3 "$SCRIPT_DIR/eval-runner.py" "${args[@]}"
|
||||
local exit_code=$?
|
||||
|
||||
if [ $exit_code -eq 0 ]; then
|
||||
print_success "Evaluation completed successfully"
|
||||
elif [ $exit_code -eq 1 ]; then
|
||||
print_warning "Evaluation completed with moderate scores"
|
||||
else
|
||||
print_error "Evaluation completed with poor scores"
|
||||
fi
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
run_comparison() {
|
||||
print_step "Generating comparison report..."
|
||||
|
||||
local result_files=$(ls -t "$RESULTS_DIR"/eval-*.json 2>/dev/null | head -5)
|
||||
|
||||
if [ -z "$result_files" ]; then
|
||||
print_error "No result files found"
|
||||
return 1
|
||||
fi
|
||||
|
||||
local report_file="$RESULTS_DIR/comparison-$(date +%Y%m%d-%H%M%S).md"
|
||||
|
||||
python3 "$SCRIPT_DIR/compare-models.py" \
|
||||
--format markdown \
|
||||
--task-analysis \
|
||||
--output "$report_file" \
|
||||
$result_files
|
||||
|
||||
print_success "Comparison report: $report_file"
|
||||
|
||||
# Also print table to console
|
||||
echo ""
|
||||
python3 "$SCRIPT_DIR/compare-models.py" --format table $result_files
|
||||
}
|
||||
|
||||
quick_test() {
|
||||
print_step "Running quick smoke test..."
|
||||
|
||||
# Get first available model
|
||||
local available=$(list_available_models | head -1)
|
||||
|
||||
if [ -z "$available" ]; then
|
||||
print_error "No models available. Pull a model with: ollama pull llama3.2"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
print_step "Using model: $available"
|
||||
|
||||
# Run just one task category
|
||||
python3 "$SCRIPT_DIR/eval-runner.py" \
|
||||
--models "$available" \
|
||||
--tasks tool_calling \
|
||||
--timeout 60 \
|
||||
--config "$SCRIPT_DIR/eval-tasks.yaml"
|
||||
}
|
||||
|
||||
# =============================================================================
|
||||
# Main
|
||||
# =============================================================================
|
||||
|
||||
# Parse arguments
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
--models|-m)
|
||||
MODELS="$2"
|
||||
shift 2
|
||||
;;
|
||||
--all)
|
||||
ALL_MODELS=true
|
||||
shift
|
||||
;;
|
||||
--default)
|
||||
DEFAULT_MODELS=true
|
||||
shift
|
||||
;;
|
||||
--tasks|-t)
|
||||
TASKS="$2"
|
||||
shift 2
|
||||
;;
|
||||
--timeout)
|
||||
TIMEOUT="$2"
|
||||
shift 2
|
||||
;;
|
||||
--quick)
|
||||
QUICK_MODE=true
|
||||
shift
|
||||
;;
|
||||
--dry-run)
|
||||
DRY_RUN=true
|
||||
shift
|
||||
;;
|
||||
--compare)
|
||||
COMPARE=true
|
||||
shift
|
||||
;;
|
||||
--verbose|-v)
|
||||
VERBOSE=true
|
||||
shift
|
||||
;;
|
||||
--help|-h)
|
||||
usage
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
print_error "Unknown option: $1"
|
||||
usage
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Ensure results directory exists
|
||||
mkdir -p "$RESULTS_DIR"
|
||||
|
||||
print_header
|
||||
check_prerequisites
|
||||
|
||||
if $QUICK_MODE; then
|
||||
quick_test
|
||||
elif $DRY_RUN; then
|
||||
run_evaluation
|
||||
else
|
||||
run_evaluation
|
||||
|
||||
if $COMPARE; then
|
||||
echo ""
|
||||
run_comparison
|
||||
fi
|
||||
fi
|
||||
|
||||
echo ""
|
||||
print_success "Done!"
|
||||
|
||||