#!/usr/bin/env python3
"""
YAZE AI Model Evaluation Runner

Runs evaluation tasks against multiple AI models and produces scored results.

Usage:
    python eval-runner.py --models llama3,qwen2.5-coder --tasks rom_inspection
    python eval-runner.py --all-models --tasks all --output results/eval-$(date +%Y%m%d).json

Requirements:
    pip install requests pyyaml
"""

import argparse
import json
import os
import re
import sys
import time
from dataclasses import asdict, dataclass, field
from datetime import datetime
from typing import Optional

import requests
import yaml


@dataclass
class TaskResult:
    """Result of a single task evaluation."""
    task_id: str
    task_name: str
    category: str
    model: str
    prompt: str
    response: str
    response_time: float
    accuracy_score: float = 0.0
    completeness_score: float = 0.0
    tool_usage_score: float = 0.0
    pattern_matches: list = field(default_factory=list)
    tools_used: list = field(default_factory=list)
    error: Optional[str] = None

    @property
    def overall_score(self) -> float:
        """Calculate weighted overall score."""
        # Default weights from eval-tasks.yaml
        weights = {
            'accuracy': 0.4,
            'completeness': 0.3,
            'tool_usage': 0.2,
            'response_time': 0.1
        }

        # Normalize response time to 0-10 scale (lower is better)
        # 0s = 10, 60s+ = 0
        time_score = max(0, 10 - (self.response_time / 6))

        return (
            weights['accuracy'] * self.accuracy_score +
            weights['completeness'] * self.completeness_score +
            weights['tool_usage'] * self.tool_usage_score +
            weights['response_time'] * time_score
        )


@dataclass
class ModelResults:
    """Aggregated results for a single model."""
    model: str
    tasks: list[TaskResult] = field(default_factory=list)

    @property
    def avg_accuracy(self) -> float:
        if not self.tasks:
            return 0.0
        return sum(t.accuracy_score for t in self.tasks) / len(self.tasks)

    @property
    def avg_completeness(self) -> float:
        if not self.tasks:
            return 0.0
        return sum(t.completeness_score for t in self.tasks) / len(self.tasks)

    @property
    def avg_tool_usage(self) -> float:
        if not self.tasks:
            return 0.0
        return sum(t.tool_usage_score for t in self.tasks) / len(self.tasks)

    @property
    def avg_response_time(self) -> float:
        if not self.tasks:
            return 0.0
        return sum(t.response_time for t in self.tasks) / len(self.tasks)

    @property
    def overall_score(self) -> float:
        if not self.tasks:
            return 0.0
        return sum(t.overall_score for t in self.tasks) / len(self.tasks)


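# OllamaClient talks to a locally running Ollama server over its REST API
# (GET /api/tags, POST /api/pull, POST /api/chat). Minimal usage sketch,
# assuming the default server address and an already-pulled model:
#
#   client = OllamaClient()            # http://localhost:11434
#   if client.is_available():
#       text, seconds = client.chat("llama3", "Say hello")
#
# Start the server with `ollama serve` if is_available() returns False.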
class OllamaClient:
    """Client for Ollama API."""

    def __init__(self, base_url: str = "http://localhost:11434"):
        self.base_url = base_url

    def is_available(self) -> bool:
        """Check if Ollama is running."""
        try:
            resp = requests.get(f"{self.base_url}/api/tags", timeout=5)
            return resp.status_code == 200
        except requests.exceptions.RequestException:
            return False

    def list_models(self) -> list[str]:
        """List available models."""
        try:
            resp = requests.get(f"{self.base_url}/api/tags", timeout=10)
            if resp.status_code == 200:
                data = resp.json()
                return [m['name'] for m in data.get('models', [])]
        except requests.exceptions.RequestException:
            pass
        return []

    def pull_model(self, model: str) -> bool:
        """Pull a model if not available."""
        print(f" Pulling model {model}...", end=" ", flush=True)
        try:
            resp = requests.post(
                f"{self.base_url}/api/pull",
                json={"name": model},
                timeout=600  # 10 minutes for large models
            )
            if resp.status_code == 200:
                print("Done")
                return True
            # Report the failure instead of silently falling through
            print(f"Failed: HTTP {resp.status_code}")
        except requests.exceptions.RequestException as e:
            print(f"Failed: {e}")
        return False

    def chat(self, model: str, prompt: str, timeout: int = 120) -> tuple[str, float]:
        """
        Send a chat message and return response + response time.

        Returns:
            Tuple of (response_text, response_time_seconds)
        """
        start_time = time.time()

        try:
            resp = requests.post(
                f"{self.base_url}/api/chat",
                json={
                    "model": model,
                    "messages": [{"role": "user", "content": prompt}],
                    "stream": False
                },
                timeout=timeout
            )

            elapsed = time.time() - start_time

            if resp.status_code == 200:
                data = resp.json()
                content = data.get("message", {}).get("content", "")
                return content, elapsed
            else:
                return f"Error: HTTP {resp.status_code}", elapsed

        except requests.exceptions.Timeout:
            return "Error: Request timed out", timeout
        except requests.exceptions.RequestException as e:
            return f"Error: {str(e)}", time.time() - start_time


class TaskEvaluator:
    """Evaluates task responses and assigns scores."""

    def __init__(self, config: dict):
        self.config = config

    def evaluate(self, task: dict, response: str, response_time: float) -> TaskResult:
        """Evaluate a response for a task."""
        result = TaskResult(
            task_id=task['id'],
            task_name=task['name'],
            category=task.get('category', 'unknown'),
            model=task.get('model', 'unknown'),
            prompt=task.get('prompt', ''),
            response=response,
            response_time=response_time
        )

        if response.startswith("Error:"):
            result.error = response
            return result

        # Check pattern matches
        expected_patterns = task.get('expected_patterns', [])
        for pattern in expected_patterns:
            if re.search(pattern, response, re.IGNORECASE):
                result.pattern_matches.append(pattern)

        # Score accuracy based on pattern matches
        if expected_patterns:
            match_ratio = len(result.pattern_matches) / len(expected_patterns)
            result.accuracy_score = match_ratio * 10
        else:
            # No patterns defined, give neutral score
            result.accuracy_score = 5.0

        # Score completeness based on response length and structure
        result.completeness_score = self._score_completeness(response, task)

        # Score tool usage
        result.tool_usage_score = self._score_tool_usage(response, task)

        return result

    def _score_completeness(self, response: str, task: dict) -> float:
        """Score completeness based on response characteristics."""
        score = 0.0

        # Base score for having a response
        if len(response.strip()) > 0:
            score += 2.0

        # Length bonus (up to 4 points)
        word_count = len(response.split())
        if word_count >= 20:
            score += min(4.0, word_count / 50)

        # Structure bonus (up to 2 points)
        if '\n' in response:
            score += 1.0  # Multi-line response
        if '- ' in response or '* ' in response:
            score += 0.5  # List items
        if any(c.isdigit() for c in response):
            score += 0.5  # Contains numbers/data

        # Code block bonus (fenced or indented by four spaces)
        if '```' in response or '    ' in response:
            score += 1.0

        return min(10.0, score)

    def _score_tool_usage(self, response: str, task: dict) -> float:
        """Score tool usage based on task requirements."""
        required_tool = task.get('required_tool')

        if not required_tool:
            # No tool required, check if response is sensible
            return 7.0  # Neutral-good score

        # Check if the response mentions using tools
        tool_patterns = [
            r'filesystem-list',
            r'filesystem-read',
            r'filesystem-exists',
            r'filesystem-info',
            r'build-configure',
            r'build-compile',
            r'build-test',
            r'memory-analyze',
            r'memory-search',
        ]

        tools_mentioned = []
        for pattern in tool_patterns:
            if re.search(pattern, response, re.IGNORECASE):
                tools_mentioned.append(pattern)

        if required_tool.lower() in ' '.join(tools_mentioned).lower():
            return 10.0  # Used the required tool
        elif tools_mentioned:
            return 6.0  # Used some tools but not the required one
        else:
            return 3.0  # Didn't use any tools when one was required


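# Expected shape of eval-tasks.yaml, inferred from how this script reads the
# config (illustrative sketch only; the real eval-tasks.yaml is authoritative):
#
#   default_models:
#     - name: llama3
#   categories:
#     rom_inspection:
#       tasks:
#         - id: rom_001                     # example id
#           name: Inspect ROM header        # example name
#           prompt: "..."
#           expected_patterns: ["..."]      # regexes scored for accuracy
#           required_tool: filesystem-read  # optional
#           multi_turn: false               # optional; if true, define a `prompts:` list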
def load_config(config_path: str) -> dict:
    """Load the evaluation tasks configuration."""
    with open(config_path, 'r') as f:
        return yaml.safe_load(f)


def get_tasks_for_categories(config: dict, categories: list[str]) -> list[dict]:
    """Get all tasks for specified categories."""
    tasks = []

    for cat_name, cat_data in config.get('categories', {}).items():
        if 'all' in categories or cat_name in categories:
            for task in cat_data.get('tasks', []):
                task['category'] = cat_name
                tasks.append(task)

    return tasks


def run_evaluation(
    models: list[str],
    tasks: list[dict],
    client: OllamaClient,
    evaluator: TaskEvaluator,
    timeout: int = 120
) -> dict[str, ModelResults]:
    """Run evaluation for all models and tasks."""
    results = {}

    total = len(models) * len(tasks)
    current = 0

    for model in models:
        print(f"\n{'='*60}")
        print(f"Evaluating: {model}")
        print(f"{'='*60}")

        model_results = ModelResults(model=model)

        for task in tasks:
            current += 1
            print(f"\n [{current}/{total}] {task['id']}: {task['name']}")

            # Handle multi-turn tasks differently
            if task.get('multi_turn'):
                response, resp_time = run_multi_turn_task(
                    client, model, task, timeout
                )
            else:
                prompt = task.get('prompt', '')
                print(f" Prompt: {prompt[:60]}...")
                response, resp_time = client.chat(model, prompt, timeout)

            print(f" Response time: {resp_time:.2f}s")

            # Create a copy of task with model info
            task_with_model = {**task, 'model': model}

            # Evaluate the response
            result = evaluator.evaluate(task_with_model, response, resp_time)
            model_results.tasks.append(result)

            print(f" Accuracy: {result.accuracy_score:.1f}/10")
            print(f" Completeness: {result.completeness_score:.1f}/10")
            print(f" Tool Usage: {result.tool_usage_score:.1f}/10")
            print(f" Overall: {result.overall_score:.1f}/10")

        results[model] = model_results

    return results


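# Note: run_multi_turn_task sends each turn as an independent /api/chat request,
# so the model never sees earlier turns. A context-preserving variant (a sketch,
# not implemented here) would accumulate the conversation and resend it each turn:
#
#   messages = []
#   for prompt in prompts:
#       messages.append({"role": "user", "content": prompt})
#       reply = ...  # POST /api/chat with the full `messages` list
#       messages.append({"role": "assistant", "content": reply})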
def run_multi_turn_task(
    client: OllamaClient,
    model: str,
    task: dict,
    timeout: int
) -> tuple[str, float]:
    """Run a multi-turn conversation task."""
    prompts = task.get('prompts', [])
    if not prompts:
        return "Error: No prompts defined for multi-turn task", 0.0

    total_time = 0.0
    all_responses = []

    for i, prompt in enumerate(prompts):
        # For simplicity, we send each prompt independently
        # A more sophisticated version would maintain conversation context
        print(f" Turn {i+1}: {prompt[:50]}...")
        response, resp_time = client.chat(model, prompt, timeout)
        total_time += resp_time
        all_responses.append(f"Turn {i+1}: {response}")

    return "\n\n".join(all_responses), total_time


def print_summary(results: dict[str, ModelResults]):
    """Print a summary table of results."""
    print("\n")
    # Border, title, header, and data rows are all 77 characters wide
    print("┌" + "─" * 75 + "┐")
    print("│" + "YAZE AI Model Evaluation Report".center(75) + "│")
    print("├" + "─" * 75 + "┤")
    print("│ {:20} │ {:>11} │ {:>11} │ {:>8} │ {:>11} │".format(
        "Model", "Accuracy", "Tool Use", "Speed", "Overall"
    ))
    print("├" + "─" * 75 + "┤")

    for model, model_results in sorted(
        results.items(),
        key=lambda x: x[1].overall_score,
        reverse=True
    ):
        # Format model name (truncate if needed)
        model_name = model[:20] if len(model) <= 20 else model[:17] + "..."

        print("│ {:20} │ {:8.1f}/10 │ {:8.1f}/10 │ {:7.1f}s │ {:8.1f}/10 │".format(
            model_name,
            model_results.avg_accuracy,
            model_results.avg_tool_usage,
            model_results.avg_response_time,
            model_results.overall_score
        ))

    print("└" + "─" * 75 + "┘")


def save_results(results: dict[str, ModelResults], output_path: str):
    """Save detailed results to JSON file."""
    output_data = {
        "timestamp": datetime.now().isoformat(),
        "version": "1.0",
        "models": {}
    }

    for model, model_results in results.items():
        output_data["models"][model] = {
            "summary": {
                "avg_accuracy": model_results.avg_accuracy,
                "avg_completeness": model_results.avg_completeness,
                "avg_tool_usage": model_results.avg_tool_usage,
                "avg_response_time": model_results.avg_response_time,
                "overall_score": model_results.overall_score,
            },
            "tasks": [asdict(t) for t in model_results.tasks]
        }

    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(output_data, f, indent=2)

    print(f"\nResults saved to: {output_path}")


def main():
    parser = argparse.ArgumentParser(
        description="YAZE AI Model Evaluation Runner"
    )
    parser.add_argument(
        "--models", "-m",
        type=str,
        help="Comma-separated list of models to evaluate"
    )
    parser.add_argument(
        "--all-models",
        action="store_true",
        help="Evaluate all available models"
    )
    parser.add_argument(
        "--default-models",
        action="store_true",
        help="Evaluate default models from config"
    )
    parser.add_argument(
        "--tasks", "-t",
        type=str,
        default="all",
        help="Task categories to run (comma-separated, or 'all')"
    )
    parser.add_argument(
        "--config", "-c",
        type=str,
        default=os.path.join(os.path.dirname(__file__), "eval-tasks.yaml"),
        help="Path to evaluation config file"
    )
    parser.add_argument(
        "--output", "-o",
        type=str,
        help="Output file for results (default: results/eval-TIMESTAMP.json)"
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=120,
        help="Timeout in seconds for each task (default: 120)"
    )
    parser.add_argument(
        "--ollama-url",
        type=str,
        default="http://localhost:11434",
        help="Ollama API URL"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be evaluated without running"
    )

    args = parser.parse_args()

    # Load configuration
    print("Loading configuration...")
    try:
        config = load_config(args.config)
    except Exception as e:
        print(f"Error loading config: {e}")
        sys.exit(1)

    # Initialize Ollama client
    client = OllamaClient(args.ollama_url)

    if not client.is_available():
        print("Error: Ollama is not running. Start it with 'ollama serve'")
        sys.exit(1)

    # Determine which models to evaluate
    available_models = client.list_models()
    print(f"Available models: {', '.join(available_models) or 'none'}")

    if args.all_models:
        models = available_models
    elif args.default_models:
        default_model_names = [
            m['name'] for m in config.get('default_models', [])
        ]
        models = [m for m in default_model_names if m in available_models]
        # Offer to pull missing models
        missing = [m for m in default_model_names if m not in available_models]
        if missing:
            print(f"Missing default models: {', '.join(missing)}")
            for m in missing:
                if client.pull_model(m):
                    models.append(m)
    elif args.models:
        models = [m.strip() for m in args.models.split(',')]
        # Validate models exist; iterate over a copy so removing a model
        # that fails to pull does not skip entries in the loop
        for m in list(models):
            if m not in available_models:
                print(f"Warning: Model '{m}' not found. Attempting to pull...")
                if not client.pull_model(m):
                    print(f" Failed to pull {m}, skipping")
                    models.remove(m)
    else:
        # Default to first available model
        models = available_models[:1] if available_models else []

    if not models:
        print("No models available for evaluation")
        sys.exit(1)

    print(f"Models to evaluate: {', '.join(models)}")

    # Get tasks
    categories = [c.strip() for c in args.tasks.split(',')]
    tasks = get_tasks_for_categories(config, categories)

    if not tasks:
        print(f"No tasks found for categories: {args.tasks}")
        sys.exit(1)

    print(f"Tasks to run: {len(tasks)}")
    for task in tasks:
        print(f" - [{task['category']}] {task['id']}: {task['name']}")

    if args.dry_run:
        print("\nDry run complete. Use --help for options.")
        sys.exit(0)

    # Run evaluation
    evaluator = TaskEvaluator(config)
    results = run_evaluation(
        models, tasks, client, evaluator, args.timeout
    )

    # Print summary
    print_summary(results)

    # Save results
    output_path = args.output or os.path.join(
        os.path.dirname(__file__),
        "results",
        f"eval-{datetime.now().strftime('%Y%m%d-%H%M%S')}.json"
    )
    save_results(results, output_path)

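    # The process exit code summarizes the run for calling scripts:
    # 0 = good (best score >= 7.0), 1 = okay (>= 5.0), 2 = poor. Note that
    # configuration and setup failures earlier in main() also exit with 1.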
    # Return exit code based on best model score
    best_score = max(r.overall_score for r in results.values())
    if best_score >= 7.0:
        sys.exit(0)  # Good
    elif best_score >= 5.0:
        sys.exit(1)  # Okay
    else:
        sys.exit(2)  # Poor


if __name__ == "__main__":
    main()