#!/usr/bin/env python3 """ YAZE AI Model Evaluation Runner Runs evaluation tasks against multiple AI models and produces scored results. Usage: python eval-runner.py --models llama3,qwen2.5-coder --tasks rom_inspection python eval-runner.py --all-models --tasks all --output results/eval-$(date +%Y%m%d).json Requirements: pip install requests pyyaml """ import argparse import json import os import re import subprocess import sys import time from dataclasses import asdict, dataclass, field from datetime import datetime from pathlib import Path from typing import Any, Optional import requests import yaml @dataclass class TaskResult: """Result of a single task evaluation.""" task_id: str task_name: str category: str model: str prompt: str response: str response_time: float accuracy_score: float = 0.0 completeness_score: float = 0.0 tool_usage_score: float = 0.0 pattern_matches: list = field(default_factory=list) tools_used: list = field(default_factory=list) error: Optional[str] = None @property def overall_score(self) -> float: """Calculate weighted overall score.""" # Default weights from eval-tasks.yaml weights = { 'accuracy': 0.4, 'completeness': 0.3, 'tool_usage': 0.2, 'response_time': 0.1 } # Normalize response time to 0-10 scale (lower is better) # 0s = 10, 60s+ = 0 time_score = max(0, 10 - (self.response_time / 6)) return ( weights['accuracy'] * self.accuracy_score + weights['completeness'] * self.completeness_score + weights['tool_usage'] * self.tool_usage_score + weights['response_time'] * time_score ) @dataclass class ModelResults: """Aggregated results for a single model.""" model: str tasks: list[TaskResult] = field(default_factory=list) @property def avg_accuracy(self) -> float: if not self.tasks: return 0.0 return sum(t.accuracy_score for t in self.tasks) / len(self.tasks) @property def avg_completeness(self) -> float: if not self.tasks: return 0.0 return sum(t.completeness_score for t in self.tasks) / len(self.tasks) @property def avg_tool_usage(self) -> float: if not self.tasks: return 0.0 return sum(t.tool_usage_score for t in self.tasks) / len(self.tasks) @property def avg_response_time(self) -> float: if not self.tasks: return 0.0 return sum(t.response_time for t in self.tasks) / len(self.tasks) @property def overall_score(self) -> float: if not self.tasks: return 0.0 return sum(t.overall_score for t in self.tasks) / len(self.tasks) class OllamaClient: """Client for Ollama API.""" def __init__(self, base_url: str = "http://localhost:11434"): self.base_url = base_url def is_available(self) -> bool: """Check if Ollama is running.""" try: resp = requests.get(f"{self.base_url}/api/tags", timeout=5) return resp.status_code == 200 except requests.exceptions.RequestException: return False def list_models(self) -> list[str]: """List available models.""" try: resp = requests.get(f"{self.base_url}/api/tags", timeout=10) if resp.status_code == 200: data = resp.json() return [m['name'] for m in data.get('models', [])] except requests.exceptions.RequestException: pass return [] def pull_model(self, model: str) -> bool: """Pull a model if not available.""" print(f" Pulling model {model}...", end=" ", flush=True) try: resp = requests.post( f"{self.base_url}/api/pull", json={"name": model}, timeout=600 # 10 minutes for large models ) if resp.status_code == 200: print("Done") return True except requests.exceptions.RequestException as e: print(f"Failed: {e}") return False def chat(self, model: str, prompt: str, timeout: int = 120) -> tuple[str, float]: """ Send a chat message and return response + response time. Returns: Tuple of (response_text, response_time_seconds) """ start_time = time.time() try: resp = requests.post( f"{self.base_url}/api/chat", json={ "model": model, "messages": [{"role": "user", "content": prompt}], "stream": False }, timeout=timeout ) elapsed = time.time() - start_time if resp.status_code == 200: data = resp.json() content = data.get("message", {}).get("content", "") return content, elapsed else: return f"Error: HTTP {resp.status_code}", elapsed except requests.exceptions.Timeout: return "Error: Request timed out", timeout except requests.exceptions.RequestException as e: return f"Error: {str(e)}", time.time() - start_time class TaskEvaluator: """Evaluates task responses and assigns scores.""" def __init__(self, config: dict): self.config = config def evaluate(self, task: dict, response: str, response_time: float) -> TaskResult: """Evaluate a response for a task.""" result = TaskResult( task_id=task['id'], task_name=task['name'], category=task.get('category', 'unknown'), model=task.get('model', 'unknown'), prompt=task.get('prompt', ''), response=response, response_time=response_time ) if response.startswith("Error:"): result.error = response return result # Check pattern matches expected_patterns = task.get('expected_patterns', []) for pattern in expected_patterns: if re.search(pattern, response, re.IGNORECASE): result.pattern_matches.append(pattern) # Score accuracy based on pattern matches if expected_patterns: match_ratio = len(result.pattern_matches) / len(expected_patterns) result.accuracy_score = match_ratio * 10 else: # No patterns defined, give neutral score result.accuracy_score = 5.0 # Score completeness based on response length and structure result.completeness_score = self._score_completeness(response, task) # Score tool usage result.tool_usage_score = self._score_tool_usage(response, task) return result def _score_completeness(self, response: str, task: dict) -> float: """Score completeness based on response characteristics.""" score = 0.0 # Base score for having a response if len(response.strip()) > 0: score += 2.0 # Length bonus (up to 4 points) word_count = len(response.split()) if word_count >= 20: score += min(4.0, word_count / 50) # Structure bonus (up to 2 points) if '\n' in response: score += 1.0 # Multi-line response if '- ' in response or '* ' in response: score += 0.5 # List items if any(c.isdigit() for c in response): score += 0.5 # Contains numbers/data # Code block bonus if '```' in response or ' ' in response: score += 1.0 return min(10.0, score) def _score_tool_usage(self, response: str, task: dict) -> float: """Score tool usage based on task requirements.""" required_tool = task.get('required_tool') if not required_tool: # No tool required, check if response is sensible return 7.0 # Neutral-good score # Check if the response mentions using tools tool_patterns = [ r'filesystem-list', r'filesystem-read', r'filesystem-exists', r'filesystem-info', r'build-configure', r'build-compile', r'build-test', r'memory-analyze', r'memory-search', ] tools_mentioned = [] for pattern in tool_patterns: if re.search(pattern, response, re.IGNORECASE): tools_mentioned.append(pattern) if required_tool.lower() in ' '.join(tools_mentioned).lower(): return 10.0 # Used the required tool elif tools_mentioned: return 6.0 # Used some tools but not the required one else: return 3.0 # Didn't use any tools when one was required def load_config(config_path: str) -> dict: """Load the evaluation tasks configuration.""" with open(config_path, 'r') as f: return yaml.safe_load(f) def get_tasks_for_categories(config: dict, categories: list[str]) -> list[dict]: """Get all tasks for specified categories.""" tasks = [] for cat_name, cat_data in config.get('categories', {}).items(): if 'all' in categories or cat_name in categories: for task in cat_data.get('tasks', []): task['category'] = cat_name tasks.append(task) return tasks def run_evaluation( models: list[str], tasks: list[dict], client: OllamaClient, evaluator: TaskEvaluator, timeout: int = 120 ) -> dict[str, ModelResults]: """Run evaluation for all models and tasks.""" results = {} total = len(models) * len(tasks) current = 0 for model in models: print(f"\n{'='*60}") print(f"Evaluating: {model}") print(f"{'='*60}") model_results = ModelResults(model=model) for task in tasks: current += 1 print(f"\n [{current}/{total}] {task['id']}: {task['name']}") # Handle multi-turn tasks differently if task.get('multi_turn'): response, resp_time = run_multi_turn_task( client, model, task, timeout ) else: prompt = task.get('prompt', '') print(f" Prompt: {prompt[:60]}...") response, resp_time = client.chat(model, prompt, timeout) print(f" Response time: {resp_time:.2f}s") # Create a copy of task with model info task_with_model = {**task, 'model': model} # Evaluate the response result = evaluator.evaluate(task_with_model, response, resp_time) model_results.tasks.append(result) print(f" Accuracy: {result.accuracy_score:.1f}/10") print(f" Completeness: {result.completeness_score:.1f}/10") print(f" Tool Usage: {result.tool_usage_score:.1f}/10") print(f" Overall: {result.overall_score:.1f}/10") results[model] = model_results return results def run_multi_turn_task( client: OllamaClient, model: str, task: dict, timeout: int ) -> tuple[str, float]: """Run a multi-turn conversation task.""" prompts = task.get('prompts', []) if not prompts: return "Error: No prompts defined for multi-turn task", 0.0 total_time = 0.0 all_responses = [] for i, prompt in enumerate(prompts): # For simplicity, we send each prompt independently # A more sophisticated version would maintain conversation context print(f" Turn {i+1}: {prompt[:50]}...") response, resp_time = client.chat(model, prompt, timeout) total_time += resp_time all_responses.append(f"Turn {i+1}: {response}") return "\n\n".join(all_responses), total_time def print_summary(results: dict[str, ModelResults]): """Print a summary table of results.""" print("\n") print("┌" + "─"*70 + "┐") print("│" + " "*20 + "YAZE AI Model Evaluation Report" + " "*18 + "│") print("├" + "─"*70 + "┤") print("│ {:20} │ {:10} │ {:10} │ {:10} │ {:10} │".format( "Model", "Accuracy", "Tool Use", "Speed", "Overall" )) print("├" + "─"*70 + "┤") for model, model_results in sorted( results.items(), key=lambda x: x[1].overall_score, reverse=True ): # Format model name (truncate if needed) model_name = model[:20] if len(model) <= 20 else model[:17] + "..." print("│ {:20} │ {:8.1f}/10 │ {:8.1f}/10 │ {:7.1f}s │ {:8.1f}/10 │".format( model_name, model_results.avg_accuracy, model_results.avg_tool_usage, model_results.avg_response_time, model_results.overall_score )) print("└" + "─"*70 + "┘") def save_results(results: dict[str, ModelResults], output_path: str): """Save detailed results to JSON file.""" output_data = { "timestamp": datetime.now().isoformat(), "version": "1.0", "models": {} } for model, model_results in results.items(): output_data["models"][model] = { "summary": { "avg_accuracy": model_results.avg_accuracy, "avg_completeness": model_results.avg_completeness, "avg_tool_usage": model_results.avg_tool_usage, "avg_response_time": model_results.avg_response_time, "overall_score": model_results.overall_score, }, "tasks": [asdict(t) for t in model_results.tasks] } os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True) with open(output_path, 'w') as f: json.dump(output_data, f, indent=2) print(f"\nResults saved to: {output_path}") def main(): parser = argparse.ArgumentParser( description="YAZE AI Model Evaluation Runner" ) parser.add_argument( "--models", "-m", type=str, help="Comma-separated list of models to evaluate" ) parser.add_argument( "--all-models", action="store_true", help="Evaluate all available models" ) parser.add_argument( "--default-models", action="store_true", help="Evaluate default models from config" ) parser.add_argument( "--tasks", "-t", type=str, default="all", help="Task categories to run (comma-separated, or 'all')" ) parser.add_argument( "--config", "-c", type=str, default=os.path.join(os.path.dirname(__file__), "eval-tasks.yaml"), help="Path to evaluation config file" ) parser.add_argument( "--output", "-o", type=str, help="Output file for results (default: results/eval-TIMESTAMP.json)" ) parser.add_argument( "--timeout", type=int, default=120, help="Timeout in seconds for each task (default: 120)" ) parser.add_argument( "--ollama-url", type=str, default="http://localhost:11434", help="Ollama API URL" ) parser.add_argument( "--dry-run", action="store_true", help="Show what would be evaluated without running" ) args = parser.parse_args() # Load configuration print("Loading configuration...") try: config = load_config(args.config) except Exception as e: print(f"Error loading config: {e}") sys.exit(1) # Initialize Ollama client client = OllamaClient(args.ollama_url) if not client.is_available(): print("Error: Ollama is not running. Start it with 'ollama serve'") sys.exit(1) # Determine which models to evaluate available_models = client.list_models() print(f"Available models: {', '.join(available_models) or 'none'}") if args.all_models: models = available_models elif args.default_models: default_model_names = [ m['name'] for m in config.get('default_models', []) ] models = [m for m in default_model_names if m in available_models] # Offer to pull missing models missing = [m for m in default_model_names if m not in available_models] if missing: print(f"Missing default models: {', '.join(missing)}") for m in missing: if client.pull_model(m): models.append(m) elif args.models: models = [m.strip() for m in args.models.split(',')] # Validate models exist for m in models: if m not in available_models: print(f"Warning: Model '{m}' not found. Attempting to pull...") if not client.pull_model(m): print(f" Failed to pull {m}, skipping") models.remove(m) else: # Default to first available model models = available_models[:1] if available_models else [] if not models: print("No models available for evaluation") sys.exit(1) print(f"Models to evaluate: {', '.join(models)}") # Get tasks categories = [c.strip() for c in args.tasks.split(',')] tasks = get_tasks_for_categories(config, categories) if not tasks: print(f"No tasks found for categories: {args.tasks}") sys.exit(1) print(f"Tasks to run: {len(tasks)}") for task in tasks: print(f" - [{task['category']}] {task['id']}: {task['name']}") if args.dry_run: print("\nDry run complete. Use --help for options.") sys.exit(0) # Run evaluation evaluator = TaskEvaluator(config) results = run_evaluation( models, tasks, client, evaluator, args.timeout ) # Print summary print_summary(results) # Save results output_path = args.output or os.path.join( os.path.dirname(__file__), "results", f"eval-{datetime.now().strftime('%Y%m%d-%H%M%S')}.json" ) save_results(results, output_path) # Return exit code based on best model score best_score = max(r.overall_score for r in results.values()) if best_score >= 7.0: sys.exit(0) # Good elif best_score >= 5.0: sys.exit(1) # Okay else: sys.exit(2) # Poor if __name__ == "__main__": main()