backend-infra-engineer: Post v0.3.9-hotfix7 snapshot (build cleanup)

This commit is contained in:
scawful
2025-12-22 00:20:49 +00:00
parent 2934c82b75
commit 5c4cd57ff8
1259 changed files with 239160 additions and 43801 deletions

scripts/ai/compare-models.py Executable file

@@ -0,0 +1,370 @@
#!/usr/bin/env python3
"""
YAZE AI Model Comparison Report Generator
Generates comparison reports from evaluation results.
Usage:
python compare-models.py results/eval-*.json
python compare-models.py --format markdown results/eval-20241125.json
python compare-models.py --best results/eval-*.json
"""
import argparse
import json
import os
import sys
from datetime import datetime
from pathlib import Path
from typing import Any
def load_results(file_paths: list[str]) -> list[dict]:
"""Load evaluation results from JSON files."""
results = []
for path in file_paths:
try:
with open(path, 'r') as f:
data = json.load(f)
data['_source_file'] = path
results.append(data)
except Exception as e:
print(f"Warning: Could not load {path}: {e}", file=sys.stderr)
return results
def merge_results(results: list[dict]) -> dict:
"""Merge multiple result files into a single comparison."""
merged = {
"sources": [],
"models": {},
"timestamp": datetime.now().isoformat()
}
for result in results:
merged["sources"].append(result.get('_source_file', 'unknown'))
for model, model_data in result.get('models', {}).items():
if model not in merged["models"]:
merged["models"][model] = {
"runs": [],
"summary": {}
}
merged["models"][model]["runs"].append({
"source": result.get('_source_file'),
"timestamp": result.get('timestamp'),
"summary": model_data.get('summary', {}),
"task_count": len(model_data.get('tasks', []))
})
# Calculate averages across runs
for model, data in merged["models"].items():
runs = data["runs"]
if runs:
data["summary"] = {
"avg_accuracy": sum(r["summary"].get("avg_accuracy", 0) for r in runs) / len(runs),
"avg_completeness": sum(r["summary"].get("avg_completeness", 0) for r in runs) / len(runs),
"avg_tool_usage": sum(r["summary"].get("avg_tool_usage", 0) for r in runs) / len(runs),
"avg_response_time": sum(r["summary"].get("avg_response_time", 0) for r in runs) / len(runs),
"overall_score": sum(r["summary"].get("overall_score", 0) for r in runs) / len(runs),
"run_count": len(runs)
}
return merged
def format_table(merged: dict) -> str:
"""Format results as ASCII table."""
lines = []
lines.append("" + ""*78 + "")
lines.append("" + " "*18 + "YAZE AI Model Comparison Report" + " "*27 + "")
lines.append("" + " "*18 + f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}" + " "*27 + "")
lines.append("" + ""*78 + "")
lines.append("{:24}{:10}{:10}{:10}{:10}{:5}".format(
"Model", "Accuracy", "Complete", "Tool Use", "Speed", "Runs"
))
lines.append("" + ""*78 + "")
# Sort by overall score
sorted_models = sorted(
merged["models"].items(),
key=lambda x: x[1]["summary"].get("overall_score", 0),
reverse=True
)
for model, data in sorted_models:
summary = data["summary"]
model_name = model[:24] if len(model) <= 24 else model[:21] + "..."
lines.append("{:24}{:8.1f}/10 │ {:8.1f}/10 │ {:8.1f}/10 │ {:7.1f}s │ {:5}".format(
model_name,
summary.get("avg_accuracy", 0),
summary.get("avg_completeness", 0),
summary.get("avg_tool_usage", 0),
summary.get("avg_response_time", 0),
summary.get("run_count", 0)
))
lines.append("" + ""*78 + "")
# Add recommendation
if sorted_models:
best_model = sorted_models[0][0]
best_score = sorted_models[0][1]["summary"].get("overall_score", 0)
lines.append("{:76}".format(f"Recommended: {best_model} (score: {best_score:.1f}/10)"))
lines.append("" + ""*78 + "")
return "\n".join(lines)
def format_markdown(merged: dict) -> str:
"""Format results as Markdown."""
lines = []
lines.append("# YAZE AI Model Comparison Report")
lines.append("")
lines.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}")
lines.append("")
lines.append("## Summary")
lines.append("")
lines.append("| Model | Accuracy | Completeness | Tool Use | Speed | Overall | Runs |")
lines.append("|-------|----------|--------------|----------|-------|---------|------|")
sorted_models = sorted(
merged["models"].items(),
key=lambda x: x[1]["summary"].get("overall_score", 0),
reverse=True
)
for model, data in sorted_models:
summary = data["summary"]
lines.append("| {} | {:.1f}/10 | {:.1f}/10 | {:.1f}/10 | {:.1f}s | **{:.1f}/10** | {} |".format(
model,
summary.get("avg_accuracy", 0),
summary.get("avg_completeness", 0),
summary.get("avg_tool_usage", 0),
summary.get("avg_response_time", 0),
summary.get("overall_score", 0),
summary.get("run_count", 0)
))
lines.append("")
# Recommendation section
if sorted_models:
best = sorted_models[0]
lines.append("## Recommendation")
lines.append("")
lines.append(f"**Best Model:** `{best[0]}`")
lines.append("")
lines.append("### Strengths")
lines.append("")
summary = best[1]["summary"]
if summary.get("avg_accuracy", 0) >= 8:
lines.append("- ✅ High accuracy in responses")
if summary.get("avg_tool_usage", 0) >= 8:
lines.append("- ✅ Effective tool usage")
if summary.get("avg_response_time", 0) <= 3:
lines.append("- ✅ Fast response times")
if summary.get("avg_completeness", 0) >= 8:
lines.append("- ✅ Complete and detailed responses")
lines.append("")
lines.append("### Considerations")
lines.append("")
if summary.get("avg_accuracy", 0) < 7:
lines.append("- ⚠️ Accuracy could be improved")
if summary.get("avg_tool_usage", 0) < 7:
lines.append("- ⚠️ Tool usage needs improvement")
if summary.get("avg_response_time", 0) > 5:
lines.append("- ⚠️ Response times are slow")
# Source files section
lines.append("")
lines.append("## Sources")
lines.append("")
for source in merged.get("sources", []):
lines.append(f"- `{source}`")
return "\n".join(lines)
def format_json(merged: dict) -> str:
"""Format results as JSON."""
# Remove internal fields
output = {k: v for k, v in merged.items() if not k.startswith('_')}
return json.dumps(output, indent=2)
def get_best_model(merged: dict) -> str:
"""Get the name of the best performing model."""
sorted_models = sorted(
merged["models"].items(),
key=lambda x: x[1]["summary"].get("overall_score", 0),
reverse=True
)
if sorted_models:
return sorted_models[0][0]
return "unknown"
def analyze_task_performance(results: list[dict]) -> dict:
"""Analyze performance broken down by task category."""
task_performance = {}
for result in results:
for model, model_data in result.get('models', {}).items():
for task in model_data.get('tasks', []):
category = task.get('category', 'unknown')
task_id = task.get('task_id', 'unknown')
key = f"{category}/{task_id}"
if key not in task_performance:
task_performance[key] = {
"category": category,
"task_id": task_id,
"task_name": task.get('task_name', 'Unknown'),
"models": {}
}
if model not in task_performance[key]["models"]:
task_performance[key]["models"][model] = {
"scores": [],
"times": []
}
task_performance[key]["models"][model]["scores"].append(
task.get('accuracy_score', 0) * 0.5 +
task.get('completeness_score', 0) * 0.3 +
task.get('tool_usage_score', 0) * 0.2
)
task_performance[key]["models"][model]["times"].append(
task.get('response_time', 0)
)
# Calculate averages
for task_key, task_data in task_performance.items():
for model, model_scores in task_data["models"].items():
scores = model_scores["scores"]
times = model_scores["times"]
model_scores["avg_score"] = sum(scores) / len(scores) if scores else 0
model_scores["avg_time"] = sum(times) / len(times) if times else 0
return task_performance
def format_task_analysis(task_performance: dict) -> str:
"""Format task-level analysis."""
lines = []
lines.append("\n## Task-Level Performance\n")
# Group by category
by_category = {}
for key, data in task_performance.items():
cat = data["category"]
if cat not in by_category:
by_category[cat] = []
by_category[cat].append(data)
for category, tasks in sorted(by_category.items()):
lines.append(f"### {category.replace('_', ' ').title()}\n")
lines.append("| Task | Best Model | Score | Time |")
lines.append("|------|------------|-------|------|")
for task in tasks:
# Find best model for this task
best_model = None
best_score = 0
for model, scores in task["models"].items():
if scores["avg_score"] > best_score:
best_score = scores["avg_score"]
best_model = model
if best_model:
best_time = task["models"][best_model]["avg_time"]
lines.append("| {} | {} | {:.1f}/10 | {:.1f}s |".format(
task["task_name"],
best_model,
best_score,
best_time
))
lines.append("")
return "\n".join(lines)
def main():
parser = argparse.ArgumentParser(
description="Generate comparison reports from AI evaluation results"
)
parser.add_argument(
"files",
nargs="+",
help="Evaluation result JSON files to compare"
)
parser.add_argument(
"--format", "-f",
choices=["table", "markdown", "json"],
default="table",
help="Output format (default: table)"
)
parser.add_argument(
"--output", "-o",
help="Output file (default: stdout)"
)
parser.add_argument(
"--best",
action="store_true",
help="Only output the best model name (for scripting)"
)
parser.add_argument(
"--task-analysis",
action="store_true",
help="Include task-level performance analysis"
)
args = parser.parse_args()
# Load and merge results
results = load_results(args.files)
if not results:
print("No valid result files found", file=sys.stderr)
sys.exit(1)
merged = merge_results(results)
# Handle --best flag
if args.best:
print(get_best_model(merged))
sys.exit(0)
# Format output
if args.format == "table":
output = format_table(merged)
elif args.format == "markdown":
output = format_markdown(merged)
if args.task_analysis:
task_perf = analyze_task_performance(results)
output += format_task_analysis(task_perf)
else:
output = format_json(merged)
# Write output
if args.output:
with open(args.output, 'w') as f:
f.write(output)
print(f"Report written to: {args.output}")
else:
print(output)
if __name__ == "__main__":
main()

scripts/ai/eval-runner.py Executable file

@@ -0,0 +1,596 @@
#!/usr/bin/env python3
"""
YAZE AI Model Evaluation Runner
Runs evaluation tasks against multiple AI models and produces scored results.
Usage:
python eval-runner.py --models llama3,qwen2.5-coder --tasks rom_inspection
python eval-runner.py --all-models --tasks all --output results/eval-$(date +%Y%m%d).json
Requirements:
pip install requests pyyaml
"""
import argparse
import json
import os
import re
import subprocess
import sys
import time
from dataclasses import asdict, dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, Optional
import requests
import yaml
@dataclass
class TaskResult:
"""Result of a single task evaluation."""
task_id: str
task_name: str
category: str
model: str
prompt: str
response: str
response_time: float
accuracy_score: float = 0.0
completeness_score: float = 0.0
tool_usage_score: float = 0.0
pattern_matches: list = field(default_factory=list)
tools_used: list = field(default_factory=list)
error: Optional[str] = None
@property
def overall_score(self) -> float:
"""Calculate weighted overall score."""
# Default weights from eval-tasks.yaml
weights = {
'accuracy': 0.4,
'completeness': 0.3,
'tool_usage': 0.2,
'response_time': 0.1
}
# Normalize response time to 0-10 scale (lower is better)
# 0s = 10, 60s+ = 0
time_score = max(0, 10 - (self.response_time / 6))
return (
weights['accuracy'] * self.accuracy_score +
weights['completeness'] * self.completeness_score +
weights['tool_usage'] * self.tool_usage_score +
weights['response_time'] * time_score
)
@dataclass
class ModelResults:
"""Aggregated results for a single model."""
model: str
tasks: list[TaskResult] = field(default_factory=list)
@property
def avg_accuracy(self) -> float:
if not self.tasks:
return 0.0
return sum(t.accuracy_score for t in self.tasks) / len(self.tasks)
@property
def avg_completeness(self) -> float:
if not self.tasks:
return 0.0
return sum(t.completeness_score for t in self.tasks) / len(self.tasks)
@property
def avg_tool_usage(self) -> float:
if not self.tasks:
return 0.0
return sum(t.tool_usage_score for t in self.tasks) / len(self.tasks)
@property
def avg_response_time(self) -> float:
if not self.tasks:
return 0.0
return sum(t.response_time for t in self.tasks) / len(self.tasks)
@property
def overall_score(self) -> float:
if not self.tasks:
return 0.0
return sum(t.overall_score for t in self.tasks) / len(self.tasks)
class OllamaClient:
"""Client for Ollama API."""
def __init__(self, base_url: str = "http://localhost:11434"):
self.base_url = base_url
def is_available(self) -> bool:
"""Check if Ollama is running."""
try:
resp = requests.get(f"{self.base_url}/api/tags", timeout=5)
return resp.status_code == 200
except requests.exceptions.RequestException:
return False
def list_models(self) -> list[str]:
"""List available models."""
try:
resp = requests.get(f"{self.base_url}/api/tags", timeout=10)
if resp.status_code == 200:
data = resp.json()
return [m['name'] for m in data.get('models', [])]
except requests.exceptions.RequestException:
pass
return []
def pull_model(self, model: str) -> bool:
"""Pull a model if not available."""
print(f" Pulling model {model}...", end=" ", flush=True)
try:
resp = requests.post(
f"{self.base_url}/api/pull",
json={"name": model},
timeout=600 # 10 minutes for large models
)
if resp.status_code == 200:
print("Done")
return True
except requests.exceptions.RequestException as e:
print(f"Failed: {e}")
return False
def chat(self, model: str, prompt: str, timeout: int = 120) -> tuple[str, float]:
"""
Send a chat message and return response + response time.
Returns:
Tuple of (response_text, response_time_seconds)
"""
start_time = time.time()
try:
resp = requests.post(
f"{self.base_url}/api/chat",
json={
"model": model,
"messages": [{"role": "user", "content": prompt}],
"stream": False
},
timeout=timeout
)
elapsed = time.time() - start_time
if resp.status_code == 200:
data = resp.json()
content = data.get("message", {}).get("content", "")
return content, elapsed
else:
return f"Error: HTTP {resp.status_code}", elapsed
except requests.exceptions.Timeout:
return "Error: Request timed out", timeout
except requests.exceptions.RequestException as e:
return f"Error: {str(e)}", time.time() - start_time
class TaskEvaluator:
"""Evaluates task responses and assigns scores."""
def __init__(self, config: dict):
self.config = config
def evaluate(self, task: dict, response: str, response_time: float) -> TaskResult:
"""Evaluate a response for a task."""
result = TaskResult(
task_id=task['id'],
task_name=task['name'],
category=task.get('category', 'unknown'),
model=task.get('model', 'unknown'),
prompt=task.get('prompt', ''),
response=response,
response_time=response_time
)
if response.startswith("Error:"):
result.error = response
return result
# Check pattern matches
expected_patterns = task.get('expected_patterns', [])
for pattern in expected_patterns:
if re.search(pattern, response, re.IGNORECASE):
result.pattern_matches.append(pattern)
# Score accuracy based on pattern matches
if expected_patterns:
match_ratio = len(result.pattern_matches) / len(expected_patterns)
result.accuracy_score = match_ratio * 10
else:
# No patterns defined, give neutral score
result.accuracy_score = 5.0
# Score completeness based on response length and structure
result.completeness_score = self._score_completeness(response, task)
# Score tool usage
result.tool_usage_score = self._score_tool_usage(response, task)
return result
def _score_completeness(self, response: str, task: dict) -> float:
"""Score completeness based on response characteristics."""
score = 0.0
# Base score for having a response
if len(response.strip()) > 0:
score += 2.0
# Length bonus (up to 4 points)
word_count = len(response.split())
if word_count >= 20:
score += min(4.0, word_count / 50)
# Structure bonus (up to 2 points)
if '\n' in response:
score += 1.0 # Multi-line response
if '- ' in response or '* ' in response:
score += 0.5 # List items
if any(c.isdigit() for c in response):
score += 0.5 # Contains numbers/data
# Code block bonus
if '```' in response or '    ' in response:  # fenced or 4-space-indented code
score += 1.0
return min(10.0, score)
def _score_tool_usage(self, response: str, task: dict) -> float:
"""Score tool usage based on task requirements."""
required_tool = task.get('required_tool')
if not required_tool:
# No tool required, check if response is sensible
return 7.0 # Neutral-good score
# Check if the response mentions using tools
tool_patterns = [
r'filesystem-list',
r'filesystem-read',
r'filesystem-exists',
r'filesystem-info',
r'build-configure',
r'build-compile',
r'build-test',
r'memory-analyze',
r'memory-search',
]
tools_mentioned = []
for pattern in tool_patterns:
if re.search(pattern, response, re.IGNORECASE):
tools_mentioned.append(pattern)
if required_tool.lower() in ' '.join(tools_mentioned).lower():
return 10.0 # Used the required tool
elif tools_mentioned:
return 6.0 # Used some tools but not the required one
else:
return 3.0 # Didn't use any tools when one was required
def load_config(config_path: str) -> dict:
"""Load the evaluation tasks configuration."""
with open(config_path, 'r') as f:
return yaml.safe_load(f)
def get_tasks_for_categories(config: dict, categories: list[str]) -> list[dict]:
"""Get all tasks for specified categories."""
tasks = []
for cat_name, cat_data in config.get('categories', {}).items():
if 'all' in categories or cat_name in categories:
for task in cat_data.get('tasks', []):
task['category'] = cat_name
tasks.append(task)
return tasks
def run_evaluation(
models: list[str],
tasks: list[dict],
client: OllamaClient,
evaluator: TaskEvaluator,
timeout: int = 120
) -> dict[str, ModelResults]:
"""Run evaluation for all models and tasks."""
results = {}
total = len(models) * len(tasks)
current = 0
for model in models:
print(f"\n{'='*60}")
print(f"Evaluating: {model}")
print(f"{'='*60}")
model_results = ModelResults(model=model)
for task in tasks:
current += 1
print(f"\n [{current}/{total}] {task['id']}: {task['name']}")
# Handle multi-turn tasks differently
if task.get('multi_turn'):
response, resp_time = run_multi_turn_task(
client, model, task, timeout
)
else:
prompt = task.get('prompt', '')
print(f" Prompt: {prompt[:60]}...")
response, resp_time = client.chat(model, prompt, timeout)
print(f" Response time: {resp_time:.2f}s")
# Create a copy of task with model info
task_with_model = {**task, 'model': model}
# Evaluate the response
result = evaluator.evaluate(task_with_model, response, resp_time)
model_results.tasks.append(result)
print(f" Accuracy: {result.accuracy_score:.1f}/10")
print(f" Completeness: {result.completeness_score:.1f}/10")
print(f" Tool Usage: {result.tool_usage_score:.1f}/10")
print(f" Overall: {result.overall_score:.1f}/10")
results[model] = model_results
return results
def run_multi_turn_task(
client: OllamaClient,
model: str,
task: dict,
timeout: int
) -> tuple[str, float]:
"""Run a multi-turn conversation task."""
prompts = task.get('prompts', [])
if not prompts:
return "Error: No prompts defined for multi-turn task", 0.0
total_time = 0.0
all_responses = []
for i, prompt in enumerate(prompts):
# For simplicity, we send each prompt independently
# A more sophisticated version would maintain conversation context
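# (e.g. by accumulating a messages=[{"role": "user"|"assistant", ...}] history
# and sending the full list to /api/chat on every turn)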
print(f" Turn {i+1}: {prompt[:50]}...")
response, resp_time = client.chat(model, prompt, timeout)
total_time += resp_time
all_responses.append(f"Turn {i+1}: {response}")
return "\n\n".join(all_responses), total_time
def print_summary(results: dict[str, ModelResults]):
"""Print a summary table of results."""
print("\n")
print("" + ""*70 + "")
print("" + " "*20 + "YAZE AI Model Evaluation Report" + " "*18 + "")
print("" + ""*70 + "")
print("{:20}{:10}{:10}{:10}{:10}".format(
"Model", "Accuracy", "Tool Use", "Speed", "Overall"
))
print("" + ""*70 + "")
for model, model_results in sorted(
results.items(),
key=lambda x: x[1].overall_score,
reverse=True
):
# Format model name (truncate if needed)
model_name = model[:20] if len(model) <= 20 else model[:17] + "..."
print("{:20}{:8.1f}/10 │ {:8.1f}/10 │ {:7.1f}s │ {:8.1f}/10 │".format(
model_name,
model_results.avg_accuracy,
model_results.avg_tool_usage,
model_results.avg_response_time,
model_results.overall_score
))
print("" + ""*70 + "")
def save_results(results: dict[str, ModelResults], output_path: str):
"""Save detailed results to JSON file."""
output_data = {
"timestamp": datetime.now().isoformat(),
"version": "1.0",
"models": {}
}
for model, model_results in results.items():
output_data["models"][model] = {
"summary": {
"avg_accuracy": model_results.avg_accuracy,
"avg_completeness": model_results.avg_completeness,
"avg_tool_usage": model_results.avg_tool_usage,
"avg_response_time": model_results.avg_response_time,
"overall_score": model_results.overall_score,
},
"tasks": [asdict(t) for t in model_results.tasks]
}
os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
with open(output_path, 'w') as f:
json.dump(output_data, f, indent=2)
print(f"\nResults saved to: {output_path}")
def main():
parser = argparse.ArgumentParser(
description="YAZE AI Model Evaluation Runner"
)
parser.add_argument(
"--models", "-m",
type=str,
help="Comma-separated list of models to evaluate"
)
parser.add_argument(
"--all-models",
action="store_true",
help="Evaluate all available models"
)
parser.add_argument(
"--default-models",
action="store_true",
help="Evaluate default models from config"
)
parser.add_argument(
"--tasks", "-t",
type=str,
default="all",
help="Task categories to run (comma-separated, or 'all')"
)
parser.add_argument(
"--config", "-c",
type=str,
default=os.path.join(os.path.dirname(__file__), "eval-tasks.yaml"),
help="Path to evaluation config file"
)
parser.add_argument(
"--output", "-o",
type=str,
help="Output file for results (default: results/eval-TIMESTAMP.json)"
)
parser.add_argument(
"--timeout",
type=int,
default=120,
help="Timeout in seconds for each task (default: 120)"
)
parser.add_argument(
"--ollama-url",
type=str,
default="http://localhost:11434",
help="Ollama API URL"
)
parser.add_argument(
"--dry-run",
action="store_true",
help="Show what would be evaluated without running"
)
args = parser.parse_args()
# Load configuration
print("Loading configuration...")
try:
config = load_config(args.config)
except Exception as e:
print(f"Error loading config: {e}")
sys.exit(1)
# Initialize Ollama client
client = OllamaClient(args.ollama_url)
if not client.is_available():
print("Error: Ollama is not running. Start it with 'ollama serve'")
sys.exit(1)
# Determine which models to evaluate
available_models = client.list_models()
print(f"Available models: {', '.join(available_models) or 'none'}")
if args.all_models:
models = available_models
elif args.default_models:
default_model_names = [
m['name'] for m in config.get('default_models', [])
]
models = [m for m in default_model_names if m in available_models]
# Offer to pull missing models
missing = [m for m in default_model_names if m not in available_models]
if missing:
print(f"Missing default models: {', '.join(missing)}")
for m in missing:
if client.pull_model(m):
models.append(m)
elif args.models:
models = [m.strip() for m in args.models.split(',')]
# Validate models exist (iterate over a copy since entries may be removed)
for m in list(models):
if m not in available_models:
print(f"Warning: Model '{m}' not found. Attempting to pull...")
if not client.pull_model(m):
print(f" Failed to pull {m}, skipping")
models.remove(m)
else:
# Default to first available model
models = available_models[:1] if available_models else []
if not models:
print("No models available for evaluation")
sys.exit(1)
print(f"Models to evaluate: {', '.join(models)}")
# Get tasks
categories = [c.strip() for c in args.tasks.split(',')]
tasks = get_tasks_for_categories(config, categories)
if not tasks:
print(f"No tasks found for categories: {args.tasks}")
sys.exit(1)
print(f"Tasks to run: {len(tasks)}")
for task in tasks:
print(f" - [{task['category']}] {task['id']}: {task['name']}")
if args.dry_run:
print("\nDry run complete. Use --help for options.")
sys.exit(0)
# Run evaluation
evaluator = TaskEvaluator(config)
results = run_evaluation(
models, tasks, client, evaluator, args.timeout
)
# Print summary
print_summary(results)
# Save results
output_path = args.output or os.path.join(
os.path.dirname(__file__),
"results",
f"eval-{datetime.now().strftime('%Y%m%d-%H%M%S')}.json"
)
save_results(results, output_path)
# Return exit code based on best model score
best_score = max(r.overall_score for r in results.values())
if best_score >= 7.0:
sys.exit(0) # Good
elif best_score >= 5.0:
sys.exit(1) # Okay
else:
sys.exit(2) # Poor
if __name__ == "__main__":
main()

scripts/ai/eval-tasks.yaml Normal file

@@ -0,0 +1,383 @@
# YAZE AI Model Evaluation Tasks
#
# This file defines evaluation tasks for comparing different AI models
# used with the z3ed CLI agent system.
#
# Usage:
# ./scripts/ai/run-model-eval.sh --models "llama3,qwen2.5,codellama" --tasks all
# ./scripts/ai/run-model-eval.sh --tasks rom_inspection --models "llama3"
#
# Scoring:
# Each task is scored on a 0-10 scale across multiple dimensions:
# - accuracy: Did the model answer correctly?
# - completeness: Did it include all relevant information?
# - tool_usage: Did it use tools appropriately?
# - response_time: Measured in seconds (lower is better)
version: "1.0"
# Models to evaluate by default
default_models:
- name: "llama3.2:latest"
description: "Meta's Llama 3.2 - default baseline"
type: "baseline"
- name: "qwen2.5-coder:7b"
description: "Qwen 2.5 Coder - optimized for code"
type: "code"
- name: "codellama:7b"
description: "Meta's CodeLlama - code generation"
type: "code"
- name: "mistral:7b"
description: "Mistral 7B - general purpose"
type: "general"
- name: "phi3:medium"
description: "Microsoft Phi-3 - efficient"
type: "efficient"
# Scoring weights for overall score calculation
scoring_weights:
accuracy: 0.4
completeness: 0.3
tool_usage: 0.2
response_time: 0.1
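# Worked example (illustrative numbers): accuracy 8.0, completeness 7.0,
# tool_usage 9.0 and a 12s response give
#   time_score = max(0, 10 - 12/6) = 8.0
#   overall    = 0.4*8.0 + 0.3*7.0 + 0.2*9.0 + 0.1*8.0 = 7.9
# (same formula as TaskResult.overall_score in eval-runner.py)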
# Maximum response time before timeout (seconds)
timeout: 120
# Evaluation task categories
categories:
rom_inspection:
description: "Tasks that inspect ROM data structures"
tasks:
- id: "list_dungeons"
name: "List Dungeons"
prompt: "What dungeons are in this ROM? List their names and IDs."
expected_patterns:
- "eastern palace|palace of darkness|desert palace"
- "tower of hera|swamp palace|skull woods"
- "thieves|ice palace|misery mire"
required_tool: null
scoring:
accuracy_criteria: "Lists at least 8 dungeons with correct names"
completeness_criteria: "Includes dungeon IDs or entrance info"
- id: "describe_overworld"
name: "Describe Overworld Map"
prompt: "Describe overworld map 0 (Light World). What areas and features are visible?"
expected_patterns:
- "light world|hyrule"
- "castle|sanctuary|kakariko"
required_tool: null
scoring:
accuracy_criteria: "Correctly identifies the Light World"
completeness_criteria: "Mentions multiple notable locations"
- id: "find_sprites"
name: "Find Sprites in Room"
prompt: "What sprites are present in dungeon room 0? List their types and positions."
expected_patterns:
- "sprite|enemy|npc"
- "position|coordinate|x|y"
required_tool: null
scoring:
accuracy_criteria: "Lists sprites with correct types"
completeness_criteria: "Includes position data"
- id: "entrance_info"
name: "Get Entrance Information"
prompt: "Where is the entrance to the Eastern Palace?"
expected_patterns:
- "eastern|palace|entrance"
- "east|light world"
required_tool: null
scoring:
accuracy_criteria: "Correctly identifies entrance location"
completeness_criteria: "Provides coordinates or map reference"
code_analysis:
description: "Tasks that analyze or generate code"
tasks:
- id: "explain_function"
name: "Explain Function"
prompt: "Explain what the function LoadDungeonRoom does in the codebase."
expected_patterns:
- "dungeon|room|load"
- "tilemap|object|sprite"
required_tool: "filesystem-read"
scoring:
accuracy_criteria: "Correctly describes the function purpose"
completeness_criteria: "Explains key steps or data flows"
- id: "find_bugs"
name: "Find Potential Issues"
prompt: "Are there any potential issues with how sprite coordinates are handled in room loading?"
expected_patterns:
- "bounds|overflow|check"
- "coordinate|position"
required_tool: "filesystem-read"
scoring:
accuracy_criteria: "Identifies real or plausible issues"
completeness_criteria: "Explains why the issue matters"
- id: "suggest_refactor"
name: "Suggest Refactoring"
prompt: "How could the dungeon editor's room rendering be improved for performance?"
expected_patterns:
- "cache|batch|optimize"
- "render|draw|update"
required_tool: "filesystem-read"
scoring:
accuracy_criteria: "Suggests valid optimization strategies"
completeness_criteria: "Explains implementation approach"
tool_calling:
description: "Tasks that require proper tool usage"
tasks:
- id: "list_files"
name: "List Source Files"
prompt: "List all .cc files in src/app/editor/"
expected_patterns:
- "\\.cc"
- "editor"
required_tool: "filesystem-list"
scoring:
accuracy_criteria: "Uses filesystem-list tool correctly"
completeness_criteria: "Lists files in correct directory"
- id: "read_file"
name: "Read File Contents"
prompt: "What are the first 20 lines of src/app/rom.h?"
expected_patterns:
- "#ifndef|#define|#include"
- "rom|Rom"
required_tool: "filesystem-read"
scoring:
accuracy_criteria: "Uses filesystem-read with correct path"
completeness_criteria: "Shows actual file content"
- id: "check_existence"
name: "Check File Existence"
prompt: "Does the file src/app/editor/dungeon/dungeon_editor.cc exist?"
expected_patterns:
- "exists|found|yes"
required_tool: "filesystem-exists"
scoring:
accuracy_criteria: "Uses filesystem-exists tool"
completeness_criteria: "Provides clear yes/no answer"
- id: "build_status"
name: "Get Build Status"
prompt: "What build presets are available for macOS?"
expected_patterns:
- "mac-dbg|mac-rel|mac-ai|mac-test"
- "preset|configure"
required_tool: "build-configure"
scoring:
accuracy_criteria: "Lists valid macOS presets"
completeness_criteria: "Describes preset purposes"
visual_analysis:
description: "Tasks for visual analysis and pattern recognition"
tasks:
- id: "find_similar_tiles"
name: "Find Similar Tiles"
prompt: "Find tiles similar to tile 42 in the ROM. Use a similarity threshold of 85%."
expected_patterns:
- "similar|match|tile"
- "similarity|score|percent"
required_tool: "visual-find-similar-tiles"
scoring:
accuracy_criteria: "Uses visual-find-similar-tiles with correct parameters"
completeness_criteria: "Returns list of matching tiles with scores"
- id: "analyze_spritesheet"
name: "Analyze Spritesheet"
prompt: "Analyze graphics sheet 10 to find unused regions that could be used for custom graphics."
expected_patterns:
- "unused|empty|free"
- "region|space|tile"
required_tool: "visual-analyze-spritesheet"
scoring:
accuracy_criteria: "Uses visual-analyze-spritesheet tool"
completeness_criteria: "Reports locations and sizes of free regions"
- id: "palette_usage"
name: "Palette Usage Analysis"
prompt: "Analyze which palettes are used most frequently in the overworld maps."
expected_patterns:
- "palette|color"
- "usage|count|percent"
required_tool: "visual-palette-usage"
scoring:
accuracy_criteria: "Uses visual-palette-usage with overworld type"
completeness_criteria: "Shows palette usage statistics"
- id: "tile_histogram"
name: "Tile Usage Histogram"
prompt: "Generate a histogram of the top 20 most used tiles in dungeon rooms."
expected_patterns:
- "tile|usage|histogram"
- "count|frequency|top"
required_tool: "visual-tile-histogram"
scoring:
accuracy_criteria: "Uses visual-tile-histogram with dungeon type"
completeness_criteria: "Lists top tiles with usage counts"
project_management:
description: "Tasks for project state and snapshot management"
tasks:
- id: "project_status"
name: "Get Project Status"
prompt: "What is the current project status? Show me any pending edits and available snapshots."
expected_patterns:
- "project|status|snapshot"
- "edit|pending|initialized"
required_tool: "project-status"
scoring:
accuracy_criteria: "Uses project-status tool correctly"
completeness_criteria: "Reports project state, snapshots, and ROM checksum"
- id: "create_snapshot"
name: "Create Project Snapshot"
prompt: "Create a snapshot named 'v1.0' with description 'Initial sprite modifications'."
expected_patterns:
- "snapshot|created|v1.0"
- "edit|delta|saved"
required_tool: "project-snapshot"
scoring:
accuracy_criteria: "Uses project-snapshot with correct name parameter"
completeness_criteria: "Confirms snapshot creation with details"
- id: "compare_snapshots"
name: "Compare Snapshots"
prompt: "Compare snapshots 'before-fix' and 'after-fix' to see what changed."
expected_patterns:
- "diff|compare|changed"
- "added|removed|modified"
required_tool: "project-diff"
scoring:
accuracy_criteria: "Uses project-diff with both snapshot names"
completeness_criteria: "Shows detailed comparison of edits"
- id: "restore_checkpoint"
name: "Restore to Checkpoint"
prompt: "Restore the ROM to the 'stable' snapshot."
expected_patterns:
- "restore|snapshot|stable"
- "applied|reverted|edit"
required_tool: "project-restore"
scoring:
accuracy_criteria: "Uses project-restore with correct snapshot name"
completeness_criteria: "Confirms restoration and lists applied edits"
code_generation:
description: "Tasks for ASM code generation and patching"
tasks:
- id: "generate_hook"
name: "Generate ASM Hook"
prompt: "Generate an ASM hook at address $008040 with label MyCustomHook and 2 NOPs for alignment."
expected_patterns:
- "hook|JSL|008040"
- "MyCustomHook|NOP"
required_tool: "codegen-asm-hook"
scoring:
accuracy_criteria: "Uses codegen-asm-hook with correct address and label"
completeness_criteria: "Generates valid ASM with proper hook structure"
- id: "find_freespace"
name: "Find Freespace for Patch"
prompt: "Generate a freespace patch for 256 bytes of code labeled 'NewSpriteCode', preferring bank $3F."
expected_patterns:
- "freespace|org|NewSpriteCode"
- "1F8000|bank|free"
required_tool: "codegen-freespace-patch"
scoring:
accuracy_criteria: "Uses codegen-freespace-patch with size and label"
completeness_criteria: "Reports available regions and generates allocation code"
- id: "sprite_template"
name: "Generate Sprite Template"
prompt: "Generate a sprite template named 'FollowerSprite' with init code that sets sprite state and main code that follows the player."
expected_patterns:
- "sprite|FollowerSprite|template"
- "init|main|0DD0"
required_tool: "codegen-sprite-template"
scoring:
accuracy_criteria: "Uses codegen-sprite-template with name and custom code"
completeness_criteria: "Generates complete sprite with init and main sections"
- id: "event_handler"
name: "Generate Event Handler"
prompt: "Generate an NMI event handler labeled 'FrameCounter' that increments a counter each frame."
expected_patterns:
- "NMI|event|handler"
- "FrameCounter|INC|counter"
required_tool: "codegen-event-handler"
scoring:
accuracy_criteria: "Uses codegen-event-handler with type=nmi and label"
completeness_criteria: "Generates handler with state preservation and custom code"
conversation:
description: "Tasks testing multi-turn dialog and context"
tasks:
- id: "follow_up"
name: "Follow-up Questions"
multi_turn: true
prompts:
- "What is the main purpose of the Rom class?"
- "What methods does it have for loading data?"
- "Can you show me an example of using LoadFromFile?"
expected_patterns:
- "rom|ROM|file"
- "load|read|parse"
- "example|code|usage"
scoring:
accuracy_criteria: "Maintains context across turns"
completeness_criteria: "Each response builds on previous"
- id: "clarification"
name: "Handle Clarification"
multi_turn: true
prompts:
- "How do I add a new sprite?"
- "I mean in the dungeon editor, not the overworld"
expected_patterns:
- "sprite|dungeon|editor"
- "add|create|place"
scoring:
accuracy_criteria: "Adjusts response based on clarification"
completeness_criteria: "Provides dungeon-specific instructions"
# Scoring rubric definitions
scoring_rubric:
accuracy:
10: "Perfect - completely correct with no errors"
8: "Excellent - minor inaccuracies that don't affect understanding"
6: "Good - mostly correct with some notable errors"
4: "Fair - partially correct but missing key points"
2: "Poor - significant errors or misunderstandings"
0: "Incorrect - completely wrong or off-topic"
completeness:
10: "Comprehensive - covers all aspects thoroughly"
8: "Very complete - covers most aspects well"
6: "Adequate - covers main points but missing some details"
4: "Partial - covers some points but lacks depth"
2: "Minimal - barely addresses the question"
0: "Incomplete - doesn't meaningfully address the question"
tool_usage:
10: "Perfect - uses correct tools with proper parameters"
8: "Good - uses appropriate tools with minor parameter issues"
6: "Adequate - uses tools but not optimally"
4: "Fair - attempts tool use but with errors"
2: "Poor - wrong tool or significant usage errors"
0: "Failed - doesn't use required tools or fails completely"
# Report configuration
reporting:
output_format: "table" # table, json, markdown
show_individual_scores: true
show_response_samples: true
max_sample_length: 500


@@ -0,0 +1,3 @@
# This directory stores AI evaluation results
# Results are gitignored but this file keeps the directory in the repo

scripts/ai/run-model-eval.sh Executable file

@@ -0,0 +1,340 @@
#!/bin/bash
# =============================================================================
# YAZE AI Model Evaluation Script
#
# Runs AI model evaluations using the eval-runner.py engine.
#
# Usage:
# ./run-model-eval.sh # Run with defaults
# ./run-model-eval.sh --models llama3,qwen2.5 # Specific models
# ./run-model-eval.sh --all # All available models
# ./run-model-eval.sh --quick # Quick smoke test
# ./run-model-eval.sh --compare # Compare and report
#
# Prerequisites:
# - Ollama running (ollama serve)
# - Python 3.10+ with requests and pyyaml
# - At least one model pulled (ollama pull llama3.2)
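#
# Output:
#   results/eval-<timestamp>.json (raw scores, consumed by compare-models.py)
#   results/comparison-<timestamp>.md (only with --compare)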
# =============================================================================
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
RESULTS_DIR="$SCRIPT_DIR/results"
# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color
# Default settings
MODELS=""
TASKS="all"
TIMEOUT=120
DRY_RUN=false
COMPARE=false
QUICK_MODE=false
ALL_MODELS=false
DEFAULT_MODELS=false
VERBOSE=false
# =============================================================================
# Helper Functions
# =============================================================================
print_header() {
echo -e "${CYAN}"
echo "╔════════════════════════════════════════════════════════════════════╗"
echo "║ YAZE AI Model Evaluation ║"
echo "╚════════════════════════════════════════════════════════════════════╝"
echo -e "${NC}"
}
print_step() {
echo -e "${BLUE}[*]${NC} $1"
}
print_success() {
echo -e "${GREEN}[✓]${NC} $1"
}
print_warning() {
echo -e "${YELLOW}[!]${NC} $1"
}
print_error() {
echo -e "${RED}[✗]${NC} $1"
}
usage() {
echo "Usage: $0 [OPTIONS]"
echo ""
echo "Options:"
echo " --models, -m LIST Comma-separated list of models to evaluate"
echo " --all Evaluate all available models"
echo " --default Evaluate default models from config"
echo " --tasks, -t LIST Task categories (default: all)"
echo " Options: rom_inspection, code_analysis, tool_calling, conversation"
echo " --timeout SEC Timeout per task in seconds (default: 120)"
echo " --quick Quick smoke test (fewer tasks)"
echo " --dry-run Show what would run without executing"
echo " --compare Generate comparison report after evaluation"
echo " --verbose, -v Verbose output"
echo " --help, -h Show this help message"
echo ""
echo "Examples:"
echo " $0 --models llama3.2,qwen2.5-coder --tasks tool_calling"
echo " $0 --all --compare"
echo " $0 --quick --default"
}
check_prerequisites() {
print_step "Checking prerequisites..."
local missing=false
# Check Python
if ! command -v python3 &> /dev/null; then
print_error "Python 3 not found"
missing=true
else
print_success "Python 3 found: $(python3 --version)"
fi
# Check Python packages
if python3 -c "import requests" 2>/dev/null; then
print_success "Python 'requests' package installed"
else
print_warning "Python 'requests' package missing - installing..."
pip3 install requests --quiet || missing=true
fi
if python3 -c "import yaml" 2>/dev/null; then
print_success "Python 'pyyaml' package installed"
else
print_warning "Python 'pyyaml' package missing - installing..."
pip3 install pyyaml --quiet || missing=true
fi
# Check Ollama
if ! command -v ollama &> /dev/null; then
print_error "Ollama not found. Install from https://ollama.ai"
missing=true
else
print_success "Ollama found: $(ollama --version 2>/dev/null || echo 'version unknown')"
fi
# Check if Ollama is running
if curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
print_success "Ollama server is running"
else
print_warning "Ollama server not running - attempting to start..."
ollama serve &> /dev/null &
sleep 3
if curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
print_success "Ollama server started"
else
print_error "Could not start Ollama server. Run 'ollama serve' manually."
missing=true
fi
fi
if $missing; then
print_error "Prerequisites check failed"
exit 1
fi
echo ""
}
list_available_models() {
curl -s http://localhost:11434/api/tags | python3 -c "
import json, sys
data = json.load(sys.stdin)
for model in data.get('models', []):
print(model['name'])
" 2>/dev/null || echo ""
}
ensure_model() {
local model=$1
local available=$(list_available_models)
if echo "$available" | grep -q "^$model$"; then
return 0
else
print_warning "Model '$model' not found, pulling..."
ollama pull "$model"
return $?
fi
}
run_evaluation() {
local args=()
if [ -n "$MODELS" ]; then
args+=(--models "$MODELS")
elif $ALL_MODELS; then
args+=(--all-models)
elif $DEFAULT_MODELS; then
args+=(--default-models)
fi
args+=(--tasks "$TASKS")
args+=(--timeout "$TIMEOUT")
args+=(--config "$SCRIPT_DIR/eval-tasks.yaml")
if $DRY_RUN; then
args+=(--dry-run)
fi
local output_file="$RESULTS_DIR/eval-$(date +%Y%m%d-%H%M%S).json"
args+=(--output "$output_file")
print_step "Running evaluation..."
if $VERBOSE; then
echo " Command: python3 $SCRIPT_DIR/eval-runner.py ${args[*]}"
fi
echo ""
python3 "$SCRIPT_DIR/eval-runner.py" "${args[@]}"
local exit_code=$?
if [ $exit_code -eq 0 ]; then
print_success "Evaluation completed successfully"
elif [ $exit_code -eq 1 ]; then
print_warning "Evaluation completed with moderate scores"
else
print_error "Evaluation completed with poor scores"
fi
return 0
}
run_comparison() {
print_step "Generating comparison report..."
local result_files=$(ls -t "$RESULTS_DIR"/eval-*.json 2>/dev/null | head -5)
if [ -z "$result_files" ]; then
print_error "No result files found"
return 1
fi
local report_file="$RESULTS_DIR/comparison-$(date +%Y%m%d-%H%M%S).md"
python3 "$SCRIPT_DIR/compare-models.py" \
--format markdown \
--task-analysis \
--output "$report_file" \
$result_files
print_success "Comparison report: $report_file"
# Also print table to console
echo ""
python3 "$SCRIPT_DIR/compare-models.py" --format table $result_files
}
quick_test() {
print_step "Running quick smoke test..."
# Get first available model
local available=$(list_available_models | head -1)
if [ -z "$available" ]; then
print_error "No models available. Pull a model with: ollama pull llama3.2"
exit 1
fi
print_step "Using model: $available"
# Run just one task category
python3 "$SCRIPT_DIR/eval-runner.py" \
--models "$available" \
--tasks tool_calling \
--timeout 60 \
--config "$SCRIPT_DIR/eval-tasks.yaml"
}
# =============================================================================
# Main
# =============================================================================
# Parse arguments
while [[ $# -gt 0 ]]; do
case $1 in
--models|-m)
MODELS="$2"
shift 2
;;
--all)
ALL_MODELS=true
shift
;;
--default)
DEFAULT_MODELS=true
shift
;;
--tasks|-t)
TASKS="$2"
shift 2
;;
--timeout)
TIMEOUT="$2"
shift 2
;;
--quick)
QUICK_MODE=true
shift
;;
--dry-run)
DRY_RUN=true
shift
;;
--compare)
COMPARE=true
shift
;;
--verbose|-v)
VERBOSE=true
shift
;;
--help|-h)
usage
exit 0
;;
*)
print_error "Unknown option: $1"
usage
exit 1
;;
esac
done
# Ensure results directory exists
mkdir -p "$RESULTS_DIR"
print_header
check_prerequisites
if $QUICK_MODE; then
quick_test
elif $DRY_RUN; then
run_evaluation
else
run_evaluation
if $COMPARE; then
echo ""
run_comparison
fi
fi
echo ""
print_success "Done!"