backend-infra-engineer: Post v0.3.9-hotfix7 snapshot (build cleanup)
596
scripts/ai/eval-runner.py
Executable file
@@ -0,0 +1,596 @@
#!/usr/bin/env python3
"""
YAZE AI Model Evaluation Runner

Runs evaluation tasks against multiple AI models and produces scored results.

Usage:
    python eval-runner.py --models llama3,qwen2.5-coder --tasks rom_inspection
    python eval-runner.py --all-models --tasks all --output results/eval-$(date +%Y%m%d).json

Requirements:
    pip install requests pyyaml
"""

import argparse
import json
import os
import re
import subprocess
import sys
import time
from dataclasses import asdict, dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, Optional

import requests
import yaml


@dataclass
class TaskResult:
    """Result of a single task evaluation."""
    task_id: str
    task_name: str
    category: str
    model: str
    prompt: str
    response: str
    response_time: float
    accuracy_score: float = 0.0
    completeness_score: float = 0.0
    tool_usage_score: float = 0.0
    pattern_matches: list = field(default_factory=list)
    tools_used: list = field(default_factory=list)
    error: Optional[str] = None

    @property
    def overall_score(self) -> float:
        """Calculate weighted overall score."""
        # Default weights from eval-tasks.yaml
        weights = {
            'accuracy': 0.4,
            'completeness': 0.3,
            'tool_usage': 0.2,
            'response_time': 0.1
        }

        # Normalize response time to 0-10 scale (lower is better)
        # 0s = 10, 60s+ = 0
        time_score = max(0, 10 - (self.response_time / 6))

        return (
            weights['accuracy'] * self.accuracy_score +
            weights['completeness'] * self.completeness_score +
            weights['tool_usage'] * self.tool_usage_score +
            weights['response_time'] * time_score
        )
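
    # Worked example of the weighting above (illustrative numbers, not from a
    # real run): accuracy=8.0, completeness=6.0, tool_usage=7.0 and a 12s
    # response give time_score = 10 - 12/6 = 8.0, so
    # overall_score = 0.4*8.0 + 0.3*6.0 + 0.2*7.0 + 0.1*8.0 = 7.2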


@dataclass
class ModelResults:
    """Aggregated results for a single model."""
    model: str
    tasks: list[TaskResult] = field(default_factory=list)

    @property
    def avg_accuracy(self) -> float:
        if not self.tasks:
            return 0.0
        return sum(t.accuracy_score for t in self.tasks) / len(self.tasks)

    @property
    def avg_completeness(self) -> float:
        if not self.tasks:
            return 0.0
        return sum(t.completeness_score for t in self.tasks) / len(self.tasks)

    @property
    def avg_tool_usage(self) -> float:
        if not self.tasks:
            return 0.0
        return sum(t.tool_usage_score for t in self.tasks) / len(self.tasks)

    @property
    def avg_response_time(self) -> float:
        if not self.tasks:
            return 0.0
        return sum(t.response_time for t in self.tasks) / len(self.tasks)

    @property
    def overall_score(self) -> float:
        if not self.tasks:
            return 0.0
        return sum(t.overall_score for t in self.tasks) / len(self.tasks)


class OllamaClient:
    """Client for Ollama API."""

    def __init__(self, base_url: str = "http://localhost:11434"):
        self.base_url = base_url

    def is_available(self) -> bool:
        """Check if Ollama is running."""
        try:
            resp = requests.get(f"{self.base_url}/api/tags", timeout=5)
            return resp.status_code == 200
        except requests.exceptions.RequestException:
            return False

    def list_models(self) -> list[str]:
        """List available models."""
        try:
            resp = requests.get(f"{self.base_url}/api/tags", timeout=10)
            if resp.status_code == 200:
                data = resp.json()
                return [m['name'] for m in data.get('models', [])]
        except requests.exceptions.RequestException:
            pass
        return []

    def pull_model(self, model: str) -> bool:
        """Pull a model if not available."""
        print(f" Pulling model {model}...", end=" ", flush=True)
        try:
            resp = requests.post(
                f"{self.base_url}/api/pull",
                json={"name": model},
                timeout=600  # 10 minutes for large models
            )
            if resp.status_code == 200:
                print("Done")
                return True
            # Terminate the progress line on non-200 responses as well
            print(f"Failed: HTTP {resp.status_code}")
        except requests.exceptions.RequestException as e:
            print(f"Failed: {e}")
        return False

    def chat(self, model: str, prompt: str, timeout: int = 120) -> tuple[str, float]:
        """
        Send a chat message and return response + response time.

        Returns:
            Tuple of (response_text, response_time_seconds)
        """
        start_time = time.time()

        try:
            resp = requests.post(
                f"{self.base_url}/api/chat",
                json={
                    "model": model,
                    "messages": [{"role": "user", "content": prompt}],
                    "stream": False
                },
                timeout=timeout
            )

            elapsed = time.time() - start_time

            if resp.status_code == 200:
                data = resp.json()
                content = data.get("message", {}).get("content", "")
                return content, elapsed
            else:
                return f"Error: HTTP {resp.status_code}", elapsed

        except requests.exceptions.Timeout:
            return "Error: Request timed out", timeout
        except requests.exceptions.RequestException as e:
            return f"Error: {str(e)}", time.time() - start_time
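
    # Note: with "stream": False the /api/chat endpoint returns a single JSON
    # object; the only fields this client relies on are (abridged):
    #   {"message": {"role": "assistant", "content": "..."}}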


class TaskEvaluator:
    """Evaluates task responses and assigns scores."""

    def __init__(self, config: dict):
        self.config = config

    def evaluate(self, task: dict, response: str, response_time: float) -> TaskResult:
        """Evaluate a response for a task."""
        result = TaskResult(
            task_id=task['id'],
            task_name=task['name'],
            category=task.get('category', 'unknown'),
            model=task.get('model', 'unknown'),
            prompt=task.get('prompt', ''),
            response=response,
            response_time=response_time
        )

        if response.startswith("Error:"):
            result.error = response
            return result

        # Check pattern matches
        expected_patterns = task.get('expected_patterns', [])
        for pattern in expected_patterns:
            if re.search(pattern, response, re.IGNORECASE):
                result.pattern_matches.append(pattern)

        # Score accuracy based on pattern matches
        if expected_patterns:
            match_ratio = len(result.pattern_matches) / len(expected_patterns)
            result.accuracy_score = match_ratio * 10
        else:
            # No patterns defined, give neutral score
            result.accuracy_score = 5.0

        # Score completeness based on response length and structure
        result.completeness_score = self._score_completeness(response, task)

        # Score tool usage
        result.tool_usage_score = self._score_tool_usage(response, task)

        return result
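
    # For example (hypothetical task): a response matching 3 of 4
    # expected_patterns scores accuracy = 3/4 * 10 = 7.5.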

    def _score_completeness(self, response: str, task: dict) -> float:
        """Score completeness based on response characteristics."""
        score = 0.0

        # Base score for having a response
        if len(response.strip()) > 0:
            score += 2.0

        # Length bonus (up to 4 points)
        word_count = len(response.split())
        if word_count >= 20:
            score += min(4.0, word_count / 50)

        # Structure bonus (up to 2 points)
        if '\n' in response:
            score += 1.0  # Multi-line response
        if '- ' in response or '* ' in response:
            score += 0.5  # List items
        if any(c.isdigit() for c in response):
            score += 0.5  # Contains numbers/data

        # Code block bonus (fenced ``` or 4-space indented code)
        if '```' in response or '    ' in response:
            score += 1.0

        return min(10.0, score)
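
    # Worked example (hypothetical response): 100 words across several lines,
    # with a bullet list, some digits, and a fenced code block scores
    #   2.0 + min(4.0, 100/50) + 1.0 + 0.5 + 0.5 + 1.0 = 7.0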

    def _score_tool_usage(self, response: str, task: dict) -> float:
        """Score tool usage based on task requirements."""
        required_tool = task.get('required_tool')

        if not required_tool:
            # No tool required, check if response is sensible
            return 7.0  # Neutral-good score

        # Check if the response mentions using tools
        tool_patterns = [
            r'filesystem-list',
            r'filesystem-read',
            r'filesystem-exists',
            r'filesystem-info',
            r'build-configure',
            r'build-compile',
            r'build-test',
            r'memory-analyze',
            r'memory-search',
        ]

        tools_mentioned = []
        for pattern in tool_patterns:
            if re.search(pattern, response, re.IGNORECASE):
                tools_mentioned.append(pattern)

        if required_tool.lower() in ' '.join(tools_mentioned).lower():
            return 10.0  # Used the required tool
        elif tools_mentioned:
            return 6.0  # Used some tools but not the required one
        else:
            return 3.0  # Didn't use any tools when one was required


def load_config(config_path: str) -> dict:
    """Load the evaluation tasks configuration."""
    with open(config_path, 'r') as f:
        return yaml.safe_load(f)


def get_tasks_for_categories(config: dict, categories: list[str]) -> list[dict]:
    """Get all tasks for specified categories."""
    tasks = []

    for cat_name, cat_data in config.get('categories', {}).items():
        if 'all' in categories or cat_name in categories:
            for task in cat_data.get('tasks', []):
                task['category'] = cat_name
                tasks.append(task)

    return tasks

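
# For reference, the loaders above expect eval-tasks.yaml to look roughly like
# the sketch below (illustrative only; the ids, names and values are made up,
# and only keys read by this script are shown):
#
#   default_models:
#     - name: llama3
#     - name: qwen2.5-coder
#   categories:
#     rom_inspection:
#       tasks:
#         - id: rom_001
#           name: Inspect the ROM header
#           prompt: "..."
#           expected_patterns: ["header", "checksum"]
#           required_tool: filesystem-read
#           multi_turn: false

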
def run_evaluation(
    models: list[str],
    tasks: list[dict],
    client: OllamaClient,
    evaluator: TaskEvaluator,
    timeout: int = 120
) -> dict[str, ModelResults]:
    """Run evaluation for all models and tasks."""
    results = {}

    total = len(models) * len(tasks)
    current = 0

    for model in models:
        print(f"\n{'='*60}")
        print(f"Evaluating: {model}")
        print(f"{'='*60}")

        model_results = ModelResults(model=model)

        for task in tasks:
            current += 1
            print(f"\n [{current}/{total}] {task['id']}: {task['name']}")

            # Handle multi-turn tasks differently
            if task.get('multi_turn'):
                response, resp_time = run_multi_turn_task(
                    client, model, task, timeout
                )
            else:
                prompt = task.get('prompt', '')
                print(f" Prompt: {prompt[:60]}...")
                response, resp_time = client.chat(model, prompt, timeout)

            print(f" Response time: {resp_time:.2f}s")

            # Create a copy of task with model info
            task_with_model = {**task, 'model': model}

            # Evaluate the response
            result = evaluator.evaluate(task_with_model, response, resp_time)
            model_results.tasks.append(result)

            print(f" Accuracy: {result.accuracy_score:.1f}/10")
            print(f" Completeness: {result.completeness_score:.1f}/10")
            print(f" Tool Usage: {result.tool_usage_score:.1f}/10")
            print(f" Overall: {result.overall_score:.1f}/10")

        results[model] = model_results

    return results


def run_multi_turn_task(
    client: OllamaClient,
    model: str,
    task: dict,
    timeout: int
) -> tuple[str, float]:
    """Run a multi-turn conversation task."""
    prompts = task.get('prompts', [])
    if not prompts:
        return "Error: No prompts defined for multi-turn task", 0.0

    total_time = 0.0
    all_responses = []

    for i, prompt in enumerate(prompts):
        # For simplicity, we send each prompt independently
        # A more sophisticated version would maintain conversation context
        print(f" Turn {i+1}: {prompt[:50]}...")
        response, resp_time = client.chat(model, prompt, timeout)
        total_time += resp_time
        all_responses.append(f"Turn {i+1}: {response}")

    return "\n\n".join(all_responses), total_time
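
# A context-preserving variant (sketch only, not used by the runner) could
# accumulate the transcript and resend it on every turn via the same
# non-streaming /api/chat call that OllamaClient.chat uses:
#
#   messages = []
#   for prompt in prompts:
#       messages.append({"role": "user", "content": prompt})
#       reply = requests.post(f"{client.base_url}/api/chat",
#                             json={"model": model, "messages": messages,
#                                   "stream": False},
#                             timeout=timeout).json()
#       messages.append(reply["message"])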


def print_summary(results: dict[str, ModelResults]):
    """Print a summary table of results."""
    print("\n")
    print("┌" + "─"*74 + "┐")
    print("│" + " "*21 + "YAZE AI Model Evaluation Report" + " "*22 + "│")
    print("├" + "─"*74 + "┤")
    print("│ {:20} │ {:>10} │ {:>10} │ {:>10} │ {:>10} │".format(
        "Model", "Accuracy", "Tool Use", "Speed", "Overall"
    ))
    print("├" + "─"*74 + "┤")

    for model, model_results in sorted(
        results.items(),
        key=lambda x: x[1].overall_score,
        reverse=True
    ):
        # Format model name (truncate if needed)
        model_name = model[:20] if len(model) <= 20 else model[:17] + "..."

        # Column widths are chosen so every row spans the same 74-character
        # interior as the box-drawing borders above.
        print("│ {:20} │ {:7.1f}/10 │ {:7.1f}/10 │ {:9.1f}s │ {:7.1f}/10 │".format(
            model_name,
            model_results.avg_accuracy,
            model_results.avg_tool_usage,
            model_results.avg_response_time,
            model_results.overall_score
        ))

    print("└" + "─"*74 + "┘")


def save_results(results: dict[str, ModelResults], output_path: str):
    """Save detailed results to JSON file."""
    output_data = {
        "timestamp": datetime.now().isoformat(),
        "version": "1.0",
        "models": {}
    }

    for model, model_results in results.items():
        output_data["models"][model] = {
            "summary": {
                "avg_accuracy": model_results.avg_accuracy,
                "avg_completeness": model_results.avg_completeness,
                "avg_tool_usage": model_results.avg_tool_usage,
                "avg_response_time": model_results.avg_response_time,
                "overall_score": model_results.overall_score,
            },
            "tasks": [asdict(t) for t in model_results.tasks]
        }

    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(output_data, f, indent=2)

    print(f"\nResults saved to: {output_path}")
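
# The saved file can be reloaded later for comparison, e.g. (illustrative path
# and model name):
#   with open("results/eval-20240101-120000.json") as f:
#       data = json.load(f)
#   print(data["models"]["llama3"]["summary"]["overall_score"])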


def main():
    parser = argparse.ArgumentParser(
        description="YAZE AI Model Evaluation Runner"
    )
    parser.add_argument(
        "--models", "-m",
        type=str,
        help="Comma-separated list of models to evaluate"
    )
    parser.add_argument(
        "--all-models",
        action="store_true",
        help="Evaluate all available models"
    )
    parser.add_argument(
        "--default-models",
        action="store_true",
        help="Evaluate default models from config"
    )
    parser.add_argument(
        "--tasks", "-t",
        type=str,
        default="all",
        help="Task categories to run (comma-separated, or 'all')"
    )
    parser.add_argument(
        "--config", "-c",
        type=str,
        default=os.path.join(os.path.dirname(__file__), "eval-tasks.yaml"),
        help="Path to evaluation config file"
    )
    parser.add_argument(
        "--output", "-o",
        type=str,
        help="Output file for results (default: results/eval-TIMESTAMP.json)"
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=120,
        help="Timeout in seconds for each task (default: 120)"
    )
    parser.add_argument(
        "--ollama-url",
        type=str,
        default="http://localhost:11434",
        help="Ollama API URL"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be evaluated without running"
    )

    args = parser.parse_args()

    # Load configuration
    print("Loading configuration...")
    try:
        config = load_config(args.config)
    except Exception as e:
        print(f"Error loading config: {e}")
        sys.exit(1)

    # Initialize Ollama client
    client = OllamaClient(args.ollama_url)

    if not client.is_available():
        print("Error: Ollama is not running. Start it with 'ollama serve'")
        sys.exit(1)

    # Determine which models to evaluate
    available_models = client.list_models()
    print(f"Available models: {', '.join(available_models) or 'none'}")

    if args.all_models:
        models = available_models
    elif args.default_models:
        default_model_names = [
            m['name'] for m in config.get('default_models', [])
        ]
        models = [m for m in default_model_names if m in available_models]
        # Offer to pull missing models
        missing = [m for m in default_model_names if m not in available_models]
        if missing:
            print(f"Missing default models: {', '.join(missing)}")
            for m in missing:
                if client.pull_model(m):
                    models.append(m)
    elif args.models:
        models = [m.strip() for m in args.models.split(',')]
        # Validate models exist; iterate over a copy so that removing a model
        # that fails to pull does not skip the next entry.
        for m in list(models):
            if m not in available_models:
                print(f"Warning: Model '{m}' not found. Attempting to pull...")
                if not client.pull_model(m):
                    print(f" Failed to pull {m}, skipping")
                    models.remove(m)
    else:
        # Default to first available model
        models = available_models[:1] if available_models else []

    if not models:
        print("No models available for evaluation")
        sys.exit(1)

    print(f"Models to evaluate: {', '.join(models)}")

    # Get tasks
    categories = [c.strip() for c in args.tasks.split(',')]
    tasks = get_tasks_for_categories(config, categories)

    if not tasks:
        print(f"No tasks found for categories: {args.tasks}")
        sys.exit(1)

    print(f"Tasks to run: {len(tasks)}")
    for task in tasks:
        print(f" - [{task['category']}] {task['id']}: {task['name']}")

    if args.dry_run:
        print("\nDry run complete. Use --help for options.")
        sys.exit(0)

    # Run evaluation
    evaluator = TaskEvaluator(config)
    results = run_evaluation(
        models, tasks, client, evaluator, args.timeout
    )

    # Print summary
    print_summary(results)

    # Save results
    output_path = args.output or os.path.join(
        os.path.dirname(__file__),
        "results",
        f"eval-{datetime.now().strftime('%Y%m%d-%H%M%S')}.json"
    )
    save_results(results, output_path)

    # Return exit code based on best model score
    best_score = max(r.overall_score for r in results.values())
    if best_score >= 7.0:
        sys.exit(0)  # Good
    elif best_score >= 5.0:
        sys.exit(1)  # Okay
    else:
        sys.exit(2)  # Poor


if __name__ == "__main__":
    main()