backend-infra-engineer: Post v0.3.9-hotfix7 snapshot (build cleanup)
scripts/ai/compare-models.py (new executable file, 370 lines)
@@ -0,0 +1,370 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
YAZE AI Model Comparison Report Generator
|
||||
|
||||
Generates comparison reports from evaluation results.
|
||||
|
||||
Usage:
|
||||
python compare-models.py results/eval-*.json
|
||||
python compare-models.py --format markdown results/eval-20241125.json
|
||||
python compare-models.py --best results/eval-*.json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
def load_results(file_paths: list[str]) -> list[dict]:
|
||||
"""Load evaluation results from JSON files."""
|
||||
results = []
|
||||
for path in file_paths:
|
||||
try:
|
||||
with open(path, 'r') as f:
|
||||
data = json.load(f)
|
||||
data['_source_file'] = path
|
||||
results.append(data)
|
||||
except Exception as e:
|
||||
print(f"Warning: Could not load {path}: {e}", file=sys.stderr)
|
||||
return results
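
# Each result file is expected to follow the shape written by save_results()
# in eval-runner.py; load_results() additionally tags it with '_source_file'.
# Illustrative, abridged example:
#   {
#     "timestamp": "2024-11-25T12:00:00",
#     "version": "1.0",
#     "models": {
#       "llama3.2:latest": {
#         "summary": {"avg_accuracy": 7.5, "avg_completeness": 6.8,
#                     "avg_tool_usage": 7.0, "avg_response_time": 4.2,
#                     "overall_score": 7.1},
#         "tasks": [ ...per-task result dicts... ]
#       }
#     }
#   }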
|
||||
|
||||
|
||||
def merge_results(results: list[dict]) -> dict:
|
||||
"""Merge multiple result files into a single comparison."""
|
||||
merged = {
|
||||
"sources": [],
|
||||
"models": {},
|
||||
"timestamp": datetime.now().isoformat()
|
||||
}
|
||||
|
||||
for result in results:
|
||||
merged["sources"].append(result.get('_source_file', 'unknown'))
|
||||
|
||||
for model, model_data in result.get('models', {}).items():
|
||||
if model not in merged["models"]:
|
||||
merged["models"][model] = {
|
||||
"runs": [],
|
||||
"summary": {}
|
||||
}
|
||||
|
||||
merged["models"][model]["runs"].append({
|
||||
"source": result.get('_source_file'),
|
||||
"timestamp": result.get('timestamp'),
|
||||
"summary": model_data.get('summary', {}),
|
||||
"task_count": len(model_data.get('tasks', []))
|
||||
})
|
||||
|
||||
# Calculate averages across runs
|
||||
for model, data in merged["models"].items():
|
||||
runs = data["runs"]
|
||||
if runs:
|
||||
data["summary"] = {
|
||||
"avg_accuracy": sum(r["summary"].get("avg_accuracy", 0) for r in runs) / len(runs),
|
||||
"avg_completeness": sum(r["summary"].get("avg_completeness", 0) for r in runs) / len(runs),
|
||||
"avg_tool_usage": sum(r["summary"].get("avg_tool_usage", 0) for r in runs) / len(runs),
|
||||
"avg_response_time": sum(r["summary"].get("avg_response_time", 0) for r in runs) / len(runs),
|
||||
"overall_score": sum(r["summary"].get("overall_score", 0) for r in runs) / len(runs),
|
||||
"run_count": len(runs)
|
||||
}
|
||||
|
||||
return merged
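
# Illustrative shape of the merged dict consumed by the format_* helpers below
# (values abridged):
#   {
#     "sources": ["results/eval-20241125.json", "results/eval-20241126.json"],
#     "timestamp": "...",
#     "models": {
#       "qwen2.5-coder:7b": {
#         "runs": [{"source": "...", "timestamp": "...", "summary": {...}, "task_count": 4}, ...],
#         "summary": {"avg_accuracy": ..., "overall_score": ..., "run_count": 2}
#       }
#     }
#   }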
|
||||
|
||||
|
||||
def format_table(merged: dict) -> str:
|
||||
"""Format results as ASCII table."""
|
||||
lines = []
|
||||
|
||||
lines.append("┌" + "─"*78 + "┐")
|
||||
lines.append("│" + " "*18 + "YAZE AI Model Comparison Report" + " "*27 + "│")
|
||||
lines.append("│" + " "*18 + f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}" + " "*27 + "│")
|
||||
lines.append("├" + "─"*78 + "┤")
|
||||
lines.append("│ {:24} │ {:10} │ {:10} │ {:10} │ {:10} │ {:5} │".format(
|
||||
"Model", "Accuracy", "Complete", "Tool Use", "Speed", "Runs"
|
||||
))
|
||||
lines.append("├" + "─"*78 + "┤")
|
||||
|
||||
# Sort by overall score
|
||||
sorted_models = sorted(
|
||||
merged["models"].items(),
|
||||
key=lambda x: x[1]["summary"].get("overall_score", 0),
|
||||
reverse=True
|
||||
)
|
||||
|
||||
for model, data in sorted_models:
|
||||
summary = data["summary"]
|
||||
model_name = model[:24] if len(model) <= 24 else model[:21] + "..."
|
||||
|
||||
lines.append("│ {:24} │ {:8.1f}/10 │ {:8.1f}/10 │ {:8.1f}/10 │ {:7.1f}s │ {:5} │".format(
|
||||
model_name,
|
||||
summary.get("avg_accuracy", 0),
|
||||
summary.get("avg_completeness", 0),
|
||||
summary.get("avg_tool_usage", 0),
|
||||
summary.get("avg_response_time", 0),
|
||||
summary.get("run_count", 0)
|
||||
))
|
||||
|
||||
lines.append("├" + "─"*78 + "┤")
|
||||
|
||||
# Add recommendation
|
||||
if sorted_models:
|
||||
best_model = sorted_models[0][0]
|
||||
best_score = sorted_models[0][1]["summary"].get("overall_score", 0)
|
||||
lines.append("│ {:76} │".format(f"Recommended: {best_model} (score: {best_score:.1f}/10)"))
|
||||
|
||||
lines.append("└" + "─"*78 + "┘")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def format_markdown(merged: dict) -> str:
|
||||
"""Format results as Markdown."""
|
||||
lines = []
|
||||
|
||||
lines.append("# YAZE AI Model Comparison Report")
|
||||
lines.append("")
|
||||
lines.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}")
|
||||
lines.append("")
|
||||
lines.append("## Summary")
|
||||
lines.append("")
|
||||
lines.append("| Model | Accuracy | Completeness | Tool Use | Speed | Overall | Runs |")
|
||||
lines.append("|-------|----------|--------------|----------|-------|---------|------|")
|
||||
|
||||
sorted_models = sorted(
|
||||
merged["models"].items(),
|
||||
key=lambda x: x[1]["summary"].get("overall_score", 0),
|
||||
reverse=True
|
||||
)
|
||||
|
||||
for model, data in sorted_models:
|
||||
summary = data["summary"]
|
||||
lines.append("| {} | {:.1f}/10 | {:.1f}/10 | {:.1f}/10 | {:.1f}s | **{:.1f}/10** | {} |".format(
|
||||
model,
|
||||
summary.get("avg_accuracy", 0),
|
||||
summary.get("avg_completeness", 0),
|
||||
summary.get("avg_tool_usage", 0),
|
||||
summary.get("avg_response_time", 0),
|
||||
summary.get("overall_score", 0),
|
||||
summary.get("run_count", 0)
|
||||
))
|
||||
|
||||
lines.append("")
|
||||
|
||||
# Recommendation section
|
||||
if sorted_models:
|
||||
best = sorted_models[0]
|
||||
lines.append("## Recommendation")
|
||||
lines.append("")
|
||||
lines.append(f"**Best Model:** `{best[0]}`")
|
||||
lines.append("")
|
||||
lines.append("### Strengths")
|
||||
lines.append("")
|
||||
|
||||
summary = best[1]["summary"]
|
||||
if summary.get("avg_accuracy", 0) >= 8:
|
||||
lines.append("- ✅ High accuracy in responses")
|
||||
if summary.get("avg_tool_usage", 0) >= 8:
|
||||
lines.append("- ✅ Effective tool usage")
|
||||
if summary.get("avg_response_time", 0) <= 3:
|
||||
lines.append("- ✅ Fast response times")
|
||||
if summary.get("avg_completeness", 0) >= 8:
|
||||
lines.append("- ✅ Complete and detailed responses")
|
||||
|
||||
lines.append("")
|
||||
lines.append("### Considerations")
|
||||
lines.append("")
|
||||
|
||||
if summary.get("avg_accuracy", 0) < 7:
|
||||
lines.append("- ⚠️ Accuracy could be improved")
|
||||
if summary.get("avg_tool_usage", 0) < 7:
|
||||
lines.append("- ⚠️ Tool usage needs improvement")
|
||||
if summary.get("avg_response_time", 0) > 5:
|
||||
lines.append("- ⚠️ Response times are slow")
|
||||
|
||||
# Source files section
|
||||
lines.append("")
|
||||
lines.append("## Sources")
|
||||
lines.append("")
|
||||
for source in merged.get("sources", []):
|
||||
lines.append(f"- `{source}`")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def format_json(merged: dict) -> str:
|
||||
"""Format results as JSON."""
|
||||
# Remove internal fields
|
||||
output = {k: v for k, v in merged.items() if not k.startswith('_')}
|
||||
return json.dumps(output, indent=2)
|
||||
|
||||
|
||||
def get_best_model(merged: dict) -> str:
|
||||
"""Get the name of the best performing model."""
|
||||
sorted_models = sorted(
|
||||
merged["models"].items(),
|
||||
key=lambda x: x[1]["summary"].get("overall_score", 0),
|
||||
reverse=True
|
||||
)
|
||||
|
||||
if sorted_models:
|
||||
return sorted_models[0][0]
|
||||
return "unknown"
|
||||
|
||||
|
||||
def analyze_task_performance(results: list[dict]) -> dict:
|
||||
"""Analyze performance broken down by task category."""
|
||||
task_performance = {}
|
||||
|
||||
for result in results:
|
||||
for model, model_data in result.get('models', {}).items():
|
||||
for task in model_data.get('tasks', []):
|
||||
category = task.get('category', 'unknown')
|
||||
task_id = task.get('task_id', 'unknown')
|
||||
|
||||
key = f"{category}/{task_id}"
|
||||
if key not in task_performance:
|
||||
task_performance[key] = {
|
||||
"category": category,
|
||||
"task_id": task_id,
|
||||
"task_name": task.get('task_name', 'Unknown'),
|
||||
"models": {}
|
||||
}
|
||||
|
||||
if model not in task_performance[key]["models"]:
|
||||
task_performance[key]["models"][model] = {
|
||||
"scores": [],
|
||||
"times": []
|
||||
}
|
||||
|
||||
task_performance[key]["models"][model]["scores"].append(
|
||||
task.get('accuracy_score', 0) * 0.5 +
|
||||
task.get('completeness_score', 0) * 0.3 +
|
||||
task.get('tool_usage_score', 0) * 0.2
|
||||
)
|
||||
task_performance[key]["models"][model]["times"].append(
|
||||
task.get('response_time', 0)
|
||||
)
|
||||
|
||||
# Calculate averages
|
||||
for task_key, task_data in task_performance.items():
|
||||
for model, model_scores in task_data["models"].items():
|
||||
scores = model_scores["scores"]
|
||||
times = model_scores["times"]
|
||||
model_scores["avg_score"] = sum(scores) / len(scores) if scores else 0
|
||||
model_scores["avg_time"] = sum(times) / len(times) if times else 0
|
||||
|
||||
return task_performance
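
# Note: the per-task score above uses its own 0.5/0.3/0.2 weighting of
# accuracy/completeness/tool-usage (response time is tracked separately here),
# which differs from the 0.4/0.3/0.2/0.1 overall weights used by eval-runner.py.
# Illustrative entry (values made up):
#   task_performance["tool_calling/list_files"] == {
#       "category": "tool_calling", "task_id": "list_files",
#       "task_name": "List Source Files",
#       "models": {"llama3.2:latest": {"scores": [...], "times": [...],
#                                      "avg_score": 7.4, "avg_time": 3.1}}
#   }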
|
||||
|
||||
|
||||
def format_task_analysis(task_performance: dict) -> str:
|
||||
"""Format task-level analysis."""
|
||||
lines = []
|
||||
lines.append("\n## Task-Level Performance\n")
|
||||
|
||||
# Group by category
|
||||
by_category = {}
|
||||
for key, data in task_performance.items():
|
||||
cat = data["category"]
|
||||
if cat not in by_category:
|
||||
by_category[cat] = []
|
||||
by_category[cat].append(data)
|
||||
|
||||
for category, tasks in sorted(by_category.items()):
|
||||
lines.append(f"### {category.replace('_', ' ').title()}\n")
|
||||
lines.append("| Task | Best Model | Score | Time |")
|
||||
lines.append("|------|------------|-------|------|")
|
||||
|
||||
for task in tasks:
|
||||
# Find best model for this task
|
||||
best_model = None
|
||||
best_score = 0
|
||||
for model, scores in task["models"].items():
|
||||
if scores["avg_score"] > best_score:
|
||||
best_score = scores["avg_score"]
|
||||
best_model = model
|
||||
|
||||
if best_model:
|
||||
best_time = task["models"][best_model]["avg_time"]
|
||||
lines.append("| {} | {} | {:.1f}/10 | {:.1f}s |".format(
|
||||
task["task_name"],
|
||||
best_model,
|
||||
best_score,
|
||||
best_time
|
||||
))
|
||||
|
||||
lines.append("")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Generate comparison reports from AI evaluation results"
|
||||
)
|
||||
parser.add_argument(
|
||||
"files",
|
||||
nargs="+",
|
||||
help="Evaluation result JSON files to compare"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--format", "-f",
|
||||
choices=["table", "markdown", "json"],
|
||||
default="table",
|
||||
help="Output format (default: table)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", "-o",
|
||||
help="Output file (default: stdout)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--best",
|
||||
action="store_true",
|
||||
help="Only output the best model name (for scripting)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--task-analysis",
|
||||
action="store_true",
|
||||
help="Include task-level performance analysis"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Load and merge results
|
||||
results = load_results(args.files)
|
||||
if not results:
|
||||
print("No valid result files found", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
merged = merge_results(results)
|
||||
|
||||
# Handle --best flag
|
||||
if args.best:
|
||||
print(get_best_model(merged))
|
||||
sys.exit(0)
|
||||
|
||||
# Format output
|
||||
if args.format == "table":
|
||||
output = format_table(merged)
|
||||
elif args.format == "markdown":
|
||||
output = format_markdown(merged)
|
||||
if args.task_analysis:
|
||||
task_perf = analyze_task_performance(results)
|
||||
output += format_task_analysis(task_perf)
|
||||
else:
|
||||
output = format_json(merged)
|
||||
|
||||
# Write output
|
||||
if args.output:
|
||||
with open(args.output, 'w') as f:
|
||||
f.write(output)
|
||||
print(f"Report written to: {args.output}")
|
||||
else:
|
||||
print(output)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
scripts/ai/eval-runner.py (new executable file, 596 lines)
@@ -0,0 +1,596 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
YAZE AI Model Evaluation Runner
|
||||
|
||||
Runs evaluation tasks against multiple AI models and produces scored results.
|
||||
|
||||
Usage:
|
||||
python eval-runner.py --models llama3,qwen2.5-coder --tasks rom_inspection
|
||||
python eval-runner.py --all-models --tasks all --output results/eval-$(date +%Y%m%d).json
|
||||
|
||||
Requirements:
|
||||
pip install requests pyyaml
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
|
||||
import requests
|
||||
import yaml
|
||||
|
||||
|
||||
@dataclass
|
||||
class TaskResult:
|
||||
"""Result of a single task evaluation."""
|
||||
task_id: str
|
||||
task_name: str
|
||||
category: str
|
||||
model: str
|
||||
prompt: str
|
||||
response: str
|
||||
response_time: float
|
||||
accuracy_score: float = 0.0
|
||||
completeness_score: float = 0.0
|
||||
tool_usage_score: float = 0.0
|
||||
pattern_matches: list = field(default_factory=list)
|
||||
tools_used: list = field(default_factory=list)
|
||||
error: Optional[str] = None
|
||||
|
||||
@property
|
||||
def overall_score(self) -> float:
|
||||
"""Calculate weighted overall score."""
|
||||
# Default weights from eval-tasks.yaml
|
||||
weights = {
|
||||
'accuracy': 0.4,
|
||||
'completeness': 0.3,
|
||||
'tool_usage': 0.2,
|
||||
'response_time': 0.1
|
||||
}
|
||||
|
||||
# Normalize response time to 0-10 scale (lower is better)
|
||||
# 0s = 10, 60s+ = 0
|
||||
time_score = max(0, 10 - (self.response_time / 6))
|
||||
|
||||
return (
|
||||
weights['accuracy'] * self.accuracy_score +
|
||||
weights['completeness'] * self.completeness_score +
|
||||
weights['tool_usage'] * self.tool_usage_score +
|
||||
weights['response_time'] * time_score
|
||||
)
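
    # Worked example (illustrative values): accuracy=8.0, completeness=7.0,
    # tool_usage=9.0, response_time=12s
    #   time_score = 10 - 12/6 = 8.0
    #   overall    = 0.4*8.0 + 0.3*7.0 + 0.2*9.0 + 0.1*8.0 = 7.9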
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelResults:
|
||||
"""Aggregated results for a single model."""
|
||||
model: str
|
||||
tasks: list[TaskResult] = field(default_factory=list)
|
||||
|
||||
@property
|
||||
def avg_accuracy(self) -> float:
|
||||
if not self.tasks:
|
||||
return 0.0
|
||||
return sum(t.accuracy_score for t in self.tasks) / len(self.tasks)
|
||||
|
||||
@property
|
||||
def avg_completeness(self) -> float:
|
||||
if not self.tasks:
|
||||
return 0.0
|
||||
return sum(t.completeness_score for t in self.tasks) / len(self.tasks)
|
||||
|
||||
@property
|
||||
def avg_tool_usage(self) -> float:
|
||||
if not self.tasks:
|
||||
return 0.0
|
||||
return sum(t.tool_usage_score for t in self.tasks) / len(self.tasks)
|
||||
|
||||
@property
|
||||
def avg_response_time(self) -> float:
|
||||
if not self.tasks:
|
||||
return 0.0
|
||||
return sum(t.response_time for t in self.tasks) / len(self.tasks)
|
||||
|
||||
@property
|
||||
def overall_score(self) -> float:
|
||||
if not self.tasks:
|
||||
return 0.0
|
||||
return sum(t.overall_score for t in self.tasks) / len(self.tasks)
|
||||
|
||||
|
||||
class OllamaClient:
|
||||
"""Client for Ollama API."""
|
||||
|
||||
def __init__(self, base_url: str = "http://localhost:11434"):
|
||||
self.base_url = base_url
|
||||
|
||||
def is_available(self) -> bool:
|
||||
"""Check if Ollama is running."""
|
||||
try:
|
||||
resp = requests.get(f"{self.base_url}/api/tags", timeout=5)
|
||||
return resp.status_code == 200
|
||||
except requests.exceptions.RequestException:
|
||||
return False
|
||||
|
||||
def list_models(self) -> list[str]:
|
||||
"""List available models."""
|
||||
try:
|
||||
resp = requests.get(f"{self.base_url}/api/tags", timeout=10)
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
return [m['name'] for m in data.get('models', [])]
|
||||
except requests.exceptions.RequestException:
|
||||
pass
|
||||
return []
|
||||
|
||||
def pull_model(self, model: str) -> bool:
|
||||
"""Pull a model if not available."""
|
||||
print(f" Pulling model {model}...", end=" ", flush=True)
|
||||
try:
|
||||
resp = requests.post(
|
||||
f"{self.base_url}/api/pull",
|
||||
json={"name": model},
|
||||
timeout=600 # 10 minutes for large models
|
||||
)
|
||||
if resp.status_code == 200:
|
||||
print("Done")
|
||||
return True
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f"Failed: {e}")
|
||||
return False
|
||||
|
||||
def chat(self, model: str, prompt: str, timeout: int = 120) -> tuple[str, float]:
|
||||
"""
|
||||
Send a chat message and return response + response time.
|
||||
|
||||
Returns:
|
||||
Tuple of (response_text, response_time_seconds)
|
||||
"""
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
resp = requests.post(
|
||||
f"{self.base_url}/api/chat",
|
||||
json={
|
||||
"model": model,
|
||||
"messages": [{"role": "user", "content": prompt}],
|
||||
"stream": False
|
||||
},
|
||||
timeout=timeout
|
||||
)
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
content = data.get("message", {}).get("content", "")
|
||||
return content, elapsed
|
||||
else:
|
||||
return f"Error: HTTP {resp.status_code}", elapsed
|
||||
|
||||
except requests.exceptions.Timeout:
|
||||
return "Error: Request timed out", timeout
|
||||
except requests.exceptions.RequestException as e:
|
||||
return f"Error: {str(e)}", time.time() - start_time
|
||||
|
||||
|
||||
class TaskEvaluator:
|
||||
"""Evaluates task responses and assigns scores."""
|
||||
|
||||
def __init__(self, config: dict):
|
||||
self.config = config
|
||||
|
||||
def evaluate(self, task: dict, response: str, response_time: float) -> TaskResult:
|
||||
"""Evaluate a response for a task."""
|
||||
result = TaskResult(
|
||||
task_id=task['id'],
|
||||
task_name=task['name'],
|
||||
category=task.get('category', 'unknown'),
|
||||
model=task.get('model', 'unknown'),
|
||||
prompt=task.get('prompt', ''),
|
||||
response=response,
|
||||
response_time=response_time
|
||||
)
|
||||
|
||||
if response.startswith("Error:"):
|
||||
result.error = response
|
||||
return result
|
||||
|
||||
# Check pattern matches
|
||||
expected_patterns = task.get('expected_patterns', [])
|
||||
for pattern in expected_patterns:
|
||||
if re.search(pattern, response, re.IGNORECASE):
|
||||
result.pattern_matches.append(pattern)
|
||||
|
||||
# Score accuracy based on pattern matches
|
||||
if expected_patterns:
|
||||
match_ratio = len(result.pattern_matches) / len(expected_patterns)
|
||||
result.accuracy_score = match_ratio * 10
|
||||
else:
|
||||
# No patterns defined, give neutral score
|
||||
result.accuracy_score = 5.0
|
||||
|
||||
# Score completeness based on response length and structure
|
||||
result.completeness_score = self._score_completeness(response, task)
|
||||
|
||||
# Score tool usage
|
||||
result.tool_usage_score = self._score_tool_usage(response, task)
|
||||
|
||||
return result
|
||||
|
||||
def _score_completeness(self, response: str, task: dict) -> float:
|
||||
"""Score completeness based on response characteristics."""
|
||||
score = 0.0
|
||||
|
||||
# Base score for having a response
|
||||
if len(response.strip()) > 0:
|
||||
score += 2.0
|
||||
|
||||
# Length bonus (up to 4 points)
|
||||
word_count = len(response.split())
|
||||
if word_count >= 20:
|
||||
score += min(4.0, word_count / 50)
|
||||
|
||||
# Structure bonus (up to 2 points)
|
||||
if '\n' in response:
|
||||
score += 1.0 # Multi-line response
|
||||
if '- ' in response or '* ' in response:
|
||||
score += 0.5 # List items
|
||||
if any(c.isdigit() for c in response):
|
||||
score += 0.5 # Contains numbers/data
|
||||
|
||||
        # Code block bonus (fenced block or an indented code line)
        if '```' in response or '\n    ' in response:
|
||||
score += 1.0
|
||||
|
||||
return min(10.0, score)
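
    # Worked example (illustrative): a 120-word, multi-line answer containing a
    # bullet list and some digits, but no code block, scores
    #   2.0 (non-empty) + min(4.0, 120/50) + 1.0 + 0.5 + 0.5 = 6.4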
|
||||
|
||||
def _score_tool_usage(self, response: str, task: dict) -> float:
|
||||
"""Score tool usage based on task requirements."""
|
||||
required_tool = task.get('required_tool')
|
||||
|
||||
if not required_tool:
|
||||
# No tool required, check if response is sensible
|
||||
return 7.0 # Neutral-good score
|
||||
|
||||
# Check if the response mentions using tools
|
||||
tool_patterns = [
|
||||
r'filesystem-list',
|
||||
r'filesystem-read',
|
||||
r'filesystem-exists',
|
||||
r'filesystem-info',
|
||||
r'build-configure',
|
||||
r'build-compile',
|
||||
r'build-test',
|
||||
r'memory-analyze',
|
||||
r'memory-search',
|
||||
]
|
||||
|
||||
tools_mentioned = []
|
||||
for pattern in tool_patterns:
|
||||
if re.search(pattern, response, re.IGNORECASE):
|
||||
tools_mentioned.append(pattern)
|
||||
|
||||
if required_tool.lower() in ' '.join(tools_mentioned).lower():
|
||||
return 10.0 # Used the required tool
|
||||
elif tools_mentioned:
|
||||
return 6.0 # Used some tools but not the required one
|
||||
else:
|
||||
return 3.0 # Didn't use any tools when one was required
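
    # Illustrative outcomes: with required_tool="filesystem-read", a response
    # that mentions "filesystem-read" scores 10.0; one that only mentions
    # "filesystem-list" scores 6.0; one that names no tool at all scores 3.0.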
|
||||
|
||||
|
||||
def load_config(config_path: str) -> dict:
|
||||
"""Load the evaluation tasks configuration."""
|
||||
with open(config_path, 'r') as f:
|
||||
return yaml.safe_load(f)
|
||||
|
||||
|
||||
def get_tasks_for_categories(config: dict, categories: list[str]) -> list[dict]:
|
||||
"""Get all tasks for specified categories."""
|
||||
tasks = []
|
||||
|
||||
for cat_name, cat_data in config.get('categories', {}).items():
|
||||
if 'all' in categories or cat_name in categories:
|
||||
for task in cat_data.get('tasks', []):
|
||||
task['category'] = cat_name
|
||||
tasks.append(task)
|
||||
|
||||
return tasks
|
||||
|
||||
|
||||
def run_evaluation(
|
||||
models: list[str],
|
||||
tasks: list[dict],
|
||||
client: OllamaClient,
|
||||
evaluator: TaskEvaluator,
|
||||
timeout: int = 120
|
||||
) -> dict[str, ModelResults]:
|
||||
"""Run evaluation for all models and tasks."""
|
||||
results = {}
|
||||
|
||||
total = len(models) * len(tasks)
|
||||
current = 0
|
||||
|
||||
for model in models:
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Evaluating: {model}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
model_results = ModelResults(model=model)
|
||||
|
||||
for task in tasks:
|
||||
current += 1
|
||||
print(f"\n [{current}/{total}] {task['id']}: {task['name']}")
|
||||
|
||||
# Handle multi-turn tasks differently
|
||||
if task.get('multi_turn'):
|
||||
response, resp_time = run_multi_turn_task(
|
||||
client, model, task, timeout
|
||||
)
|
||||
else:
|
||||
prompt = task.get('prompt', '')
|
||||
print(f" Prompt: {prompt[:60]}...")
|
||||
response, resp_time = client.chat(model, prompt, timeout)
|
||||
|
||||
print(f" Response time: {resp_time:.2f}s")
|
||||
|
||||
# Create a copy of task with model info
|
||||
task_with_model = {**task, 'model': model}
|
||||
|
||||
# Evaluate the response
|
||||
result = evaluator.evaluate(task_with_model, response, resp_time)
|
||||
model_results.tasks.append(result)
|
||||
|
||||
print(f" Accuracy: {result.accuracy_score:.1f}/10")
|
||||
print(f" Completeness: {result.completeness_score:.1f}/10")
|
||||
print(f" Tool Usage: {result.tool_usage_score:.1f}/10")
|
||||
print(f" Overall: {result.overall_score:.1f}/10")
|
||||
|
||||
results[model] = model_results
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def run_multi_turn_task(
|
||||
client: OllamaClient,
|
||||
model: str,
|
||||
task: dict,
|
||||
timeout: int
|
||||
) -> tuple[str, float]:
|
||||
"""Run a multi-turn conversation task."""
|
||||
prompts = task.get('prompts', [])
|
||||
if not prompts:
|
||||
return "Error: No prompts defined for multi-turn task", 0.0
|
||||
|
||||
total_time = 0.0
|
||||
all_responses = []
|
||||
|
||||
for i, prompt in enumerate(prompts):
|
||||
# For simplicity, we send each prompt independently
|
||||
# A more sophisticated version would maintain conversation context
|
||||
print(f" Turn {i+1}: {prompt[:50]}...")
|
||||
response, resp_time = client.chat(model, prompt, timeout)
|
||||
total_time += resp_time
|
||||
all_responses.append(f"Turn {i+1}: {response}")
|
||||
|
||||
return "\n\n".join(all_responses), total_time
|
||||
|
||||
|
||||
def print_summary(results: dict[str, ModelResults]):
|
||||
"""Print a summary table of results."""
|
||||
print("\n")
|
||||
print("┌" + "─"*70 + "┐")
|
||||
print("│" + " "*20 + "YAZE AI Model Evaluation Report" + " "*18 + "│")
|
||||
print("├" + "─"*70 + "┤")
|
||||
print("│ {:20} │ {:10} │ {:10} │ {:10} │ {:10} │".format(
|
||||
"Model", "Accuracy", "Tool Use", "Speed", "Overall"
|
||||
))
|
||||
print("├" + "─"*70 + "┤")
|
||||
|
||||
for model, model_results in sorted(
|
||||
results.items(),
|
||||
key=lambda x: x[1].overall_score,
|
||||
reverse=True
|
||||
):
|
||||
# Format model name (truncate if needed)
|
||||
model_name = model[:20] if len(model) <= 20 else model[:17] + "..."
|
||||
|
||||
print("│ {:20} │ {:8.1f}/10 │ {:8.1f}/10 │ {:7.1f}s │ {:8.1f}/10 │".format(
|
||||
model_name,
|
||||
model_results.avg_accuracy,
|
||||
model_results.avg_tool_usage,
|
||||
model_results.avg_response_time,
|
||||
model_results.overall_score
|
||||
))
|
||||
|
||||
print("└" + "─"*70 + "┘")
|
||||
|
||||
|
||||
def save_results(results: dict[str, ModelResults], output_path: str):
|
||||
"""Save detailed results to JSON file."""
|
||||
output_data = {
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
"version": "1.0",
|
||||
"models": {}
|
||||
}
|
||||
|
||||
for model, model_results in results.items():
|
||||
output_data["models"][model] = {
|
||||
"summary": {
|
||||
"avg_accuracy": model_results.avg_accuracy,
|
||||
"avg_completeness": model_results.avg_completeness,
|
||||
"avg_tool_usage": model_results.avg_tool_usage,
|
||||
"avg_response_time": model_results.avg_response_time,
|
||||
"overall_score": model_results.overall_score,
|
||||
},
|
||||
"tasks": [asdict(t) for t in model_results.tasks]
|
||||
}
|
||||
|
||||
os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
|
||||
with open(output_path, 'w') as f:
|
||||
json.dump(output_data, f, indent=2)
|
||||
|
||||
print(f"\nResults saved to: {output_path}")
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="YAZE AI Model Evaluation Runner"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--models", "-m",
|
||||
type=str,
|
||||
help="Comma-separated list of models to evaluate"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--all-models",
|
||||
action="store_true",
|
||||
help="Evaluate all available models"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--default-models",
|
||||
action="store_true",
|
||||
help="Evaluate default models from config"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tasks", "-t",
|
||||
type=str,
|
||||
default="all",
|
||||
help="Task categories to run (comma-separated, or 'all')"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config", "-c",
|
||||
type=str,
|
||||
default=os.path.join(os.path.dirname(__file__), "eval-tasks.yaml"),
|
||||
help="Path to evaluation config file"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", "-o",
|
||||
type=str,
|
||||
help="Output file for results (default: results/eval-TIMESTAMP.json)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--timeout",
|
||||
type=int,
|
||||
default=120,
|
||||
help="Timeout in seconds for each task (default: 120)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ollama-url",
|
||||
type=str,
|
||||
default="http://localhost:11434",
|
||||
help="Ollama API URL"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dry-run",
|
||||
action="store_true",
|
||||
help="Show what would be evaluated without running"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Load configuration
|
||||
print("Loading configuration...")
|
||||
try:
|
||||
config = load_config(args.config)
|
||||
except Exception as e:
|
||||
print(f"Error loading config: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
# Initialize Ollama client
|
||||
client = OllamaClient(args.ollama_url)
|
||||
|
||||
if not client.is_available():
|
||||
print("Error: Ollama is not running. Start it with 'ollama serve'")
|
||||
sys.exit(1)
|
||||
|
||||
# Determine which models to evaluate
|
||||
available_models = client.list_models()
|
||||
print(f"Available models: {', '.join(available_models) or 'none'}")
|
||||
|
||||
if args.all_models:
|
||||
models = available_models
|
||||
elif args.default_models:
|
||||
default_model_names = [
|
||||
m['name'] for m in config.get('default_models', [])
|
||||
]
|
||||
models = [m for m in default_model_names if m in available_models]
|
||||
# Offer to pull missing models
|
||||
missing = [m for m in default_model_names if m not in available_models]
|
||||
if missing:
|
||||
print(f"Missing default models: {', '.join(missing)}")
|
||||
for m in missing:
|
||||
if client.pull_model(m):
|
||||
models.append(m)
|
||||
elif args.models:
|
||||
        models = [m.strip() for m in args.models.split(',')]
        # Validate models exist; iterate over a copy so that removing a model
        # that fails to pull does not skip the entry after it
        for m in list(models):
            if m not in available_models:
                print(f"Warning: Model '{m}' not found. Attempting to pull...")
                if not client.pull_model(m):
                    print(f"  Failed to pull {m}, skipping")
                    models.remove(m)
|
||||
else:
|
||||
# Default to first available model
|
||||
models = available_models[:1] if available_models else []
|
||||
|
||||
if not models:
|
||||
print("No models available for evaluation")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"Models to evaluate: {', '.join(models)}")
|
||||
|
||||
# Get tasks
|
||||
categories = [c.strip() for c in args.tasks.split(',')]
|
||||
tasks = get_tasks_for_categories(config, categories)
|
||||
|
||||
if not tasks:
|
||||
print(f"No tasks found for categories: {args.tasks}")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"Tasks to run: {len(tasks)}")
|
||||
for task in tasks:
|
||||
print(f" - [{task['category']}] {task['id']}: {task['name']}")
|
||||
|
||||
if args.dry_run:
|
||||
print("\nDry run complete. Use --help for options.")
|
||||
sys.exit(0)
|
||||
|
||||
# Run evaluation
|
||||
evaluator = TaskEvaluator(config)
|
||||
results = run_evaluation(
|
||||
models, tasks, client, evaluator, args.timeout
|
||||
)
|
||||
|
||||
# Print summary
|
||||
print_summary(results)
|
||||
|
||||
# Save results
|
||||
output_path = args.output or os.path.join(
|
||||
os.path.dirname(__file__),
|
||||
"results",
|
||||
f"eval-{datetime.now().strftime('%Y%m%d-%H%M%S')}.json"
|
||||
)
|
||||
save_results(results, output_path)
|
||||
|
||||
# Return exit code based on best model score
|
||||
best_score = max(r.overall_score for r in results.values())
|
||||
if best_score >= 7.0:
|
||||
sys.exit(0) # Good
|
||||
elif best_score >= 5.0:
|
||||
sys.exit(1) # Okay
|
||||
else:
|
||||
sys.exit(2) # Poor
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
scripts/ai/eval-tasks.yaml (new file, 383 lines)
@@ -0,0 +1,383 @@
|
||||
# YAZE AI Model Evaluation Tasks
|
||||
#
|
||||
# This file defines evaluation tasks for comparing different AI models
|
||||
# used with the z3ed CLI agent system.
|
||||
#
|
||||
# Usage:
|
||||
# ./scripts/ai/run-model-eval.sh --models "llama3,qwen2.5,codellama" --tasks all
|
||||
# ./scripts/ai/run-model-eval.sh --tasks rom_inspection --models "llama3"
|
||||
#
|
||||
# Scoring:
|
||||
# Each task is scored on a 0-10 scale across multiple dimensions:
|
||||
# - accuracy: Did the model answer correctly?
|
||||
# - completeness: Did it include all relevant information?
|
||||
# - tool_usage: Did it use tools appropriately?
|
||||
# - response_time: Measured in seconds (lower is better)
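#
# Worked example of the weighted overall score (illustrative values):
#   accuracy=8, completeness=7, tool_usage=9, response_time=12s
#   response time is normalized by the runner to 10 - 12/6 = 8
#   overall = 0.4*8 + 0.3*7 + 0.2*9 + 0.1*8 = 7.9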
|
||||
|
||||
version: "1.0"
|
||||
|
||||
# Models to evaluate by default
|
||||
default_models:
|
||||
- name: "llama3.2:latest"
|
||||
description: "Meta's Llama 3.2 - default baseline"
|
||||
type: "baseline"
|
||||
- name: "qwen2.5-coder:7b"
|
||||
description: "Qwen 2.5 Coder - optimized for code"
|
||||
type: "code"
|
||||
- name: "codellama:7b"
|
||||
description: "Meta's CodeLlama - code generation"
|
||||
type: "code"
|
||||
- name: "mistral:7b"
|
||||
description: "Mistral 7B - general purpose"
|
||||
type: "general"
|
||||
- name: "phi3:medium"
|
||||
description: "Microsoft Phi-3 - efficient"
|
||||
type: "efficient"
|
||||
|
||||
# Scoring weights for overall score calculation
|
||||
scoring_weights:
|
||||
accuracy: 0.4
|
||||
completeness: 0.3
|
||||
tool_usage: 0.2
|
||||
response_time: 0.1
|
||||
|
||||
# Maximum response time before timeout (seconds)
|
||||
timeout: 120
|
||||
|
||||
# Evaluation task categories
|
||||
categories:
|
||||
rom_inspection:
|
||||
description: "Tasks that inspect ROM data structures"
|
||||
tasks:
|
||||
- id: "list_dungeons"
|
||||
name: "List Dungeons"
|
||||
prompt: "What dungeons are in this ROM? List their names and IDs."
|
||||
expected_patterns:
|
||||
- "eastern palace|palace of darkness|desert palace"
|
||||
- "tower of hera|swamp palace|skull woods"
|
||||
- "thieves|ice palace|misery mire"
|
||||
required_tool: null
|
||||
scoring:
|
||||
accuracy_criteria: "Lists at least 8 dungeons with correct names"
|
||||
completeness_criteria: "Includes dungeon IDs or entrance info"
|
||||
|
||||
- id: "describe_overworld"
|
||||
name: "Describe Overworld Map"
|
||||
prompt: "Describe overworld map 0 (Light World). What areas and features are visible?"
|
||||
expected_patterns:
|
||||
- "light world|hyrule"
|
||||
- "castle|sanctuary|kakariko"
|
||||
required_tool: null
|
||||
scoring:
|
||||
accuracy_criteria: "Correctly identifies the Light World"
|
||||
completeness_criteria: "Mentions multiple notable locations"
|
||||
|
||||
- id: "find_sprites"
|
||||
name: "Find Sprites in Room"
|
||||
prompt: "What sprites are present in dungeon room 0? List their types and positions."
|
||||
expected_patterns:
|
||||
- "sprite|enemy|npc"
|
||||
- "position|coordinate|x|y"
|
||||
required_tool: null
|
||||
scoring:
|
||||
accuracy_criteria: "Lists sprites with correct types"
|
||||
completeness_criteria: "Includes position data"
|
||||
|
||||
- id: "entrance_info"
|
||||
name: "Get Entrance Information"
|
||||
prompt: "Where is the entrance to the Eastern Palace?"
|
||||
expected_patterns:
|
||||
- "eastern|palace|entrance"
|
||||
- "east|light world"
|
||||
required_tool: null
|
||||
scoring:
|
||||
accuracy_criteria: "Correctly identifies entrance location"
|
||||
completeness_criteria: "Provides coordinates or map reference"
|
||||
|
||||
code_analysis:
|
||||
description: "Tasks that analyze or generate code"
|
||||
tasks:
|
||||
- id: "explain_function"
|
||||
name: "Explain Function"
|
||||
prompt: "Explain what the function LoadDungeonRoom does in the codebase."
|
||||
expected_patterns:
|
||||
- "dungeon|room|load"
|
||||
- "tilemap|object|sprite"
|
||||
required_tool: "filesystem-read"
|
||||
scoring:
|
||||
accuracy_criteria: "Correctly describes the function purpose"
|
||||
completeness_criteria: "Explains key steps or data flows"
|
||||
|
||||
- id: "find_bugs"
|
||||
name: "Find Potential Issues"
|
||||
prompt: "Are there any potential issues with how sprite coordinates are handled in room loading?"
|
||||
expected_patterns:
|
||||
- "bounds|overflow|check"
|
||||
- "coordinate|position"
|
||||
required_tool: "filesystem-read"
|
||||
scoring:
|
||||
accuracy_criteria: "Identifies real or plausible issues"
|
||||
completeness_criteria: "Explains why the issue matters"
|
||||
|
||||
- id: "suggest_refactor"
|
||||
name: "Suggest Refactoring"
|
||||
prompt: "How could the dungeon editor's room rendering be improved for performance?"
|
||||
expected_patterns:
|
||||
- "cache|batch|optimize"
|
||||
- "render|draw|update"
|
||||
required_tool: "filesystem-read"
|
||||
scoring:
|
||||
accuracy_criteria: "Suggests valid optimization strategies"
|
||||
completeness_criteria: "Explains implementation approach"
|
||||
|
||||
tool_calling:
|
||||
description: "Tasks that require proper tool usage"
|
||||
tasks:
|
||||
- id: "list_files"
|
||||
name: "List Source Files"
|
||||
prompt: "List all .cc files in src/app/editor/"
|
||||
expected_patterns:
|
||||
- "\\.cc"
|
||||
- "editor"
|
||||
required_tool: "filesystem-list"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses filesystem-list tool correctly"
|
||||
completeness_criteria: "Lists files in correct directory"
|
||||
|
||||
- id: "read_file"
|
||||
name: "Read File Contents"
|
||||
prompt: "What are the first 20 lines of src/app/rom.h?"
|
||||
expected_patterns:
|
||||
- "#ifndef|#define|#include"
|
||||
- "rom|Rom"
|
||||
required_tool: "filesystem-read"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses filesystem-read with correct path"
|
||||
completeness_criteria: "Shows actual file content"
|
||||
|
||||
- id: "check_existence"
|
||||
name: "Check File Existence"
|
||||
prompt: "Does the file src/app/editor/dungeon/dungeon_editor.cc exist?"
|
||||
expected_patterns:
|
||||
- "exists|found|yes"
|
||||
required_tool: "filesystem-exists"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses filesystem-exists tool"
|
||||
completeness_criteria: "Provides clear yes/no answer"
|
||||
|
||||
- id: "build_status"
|
||||
name: "Get Build Status"
|
||||
prompt: "What build presets are available for macOS?"
|
||||
expected_patterns:
|
||||
- "mac-dbg|mac-rel|mac-ai|mac-test"
|
||||
- "preset|configure"
|
||||
required_tool: "build-configure"
|
||||
scoring:
|
||||
accuracy_criteria: "Lists valid macOS presets"
|
||||
completeness_criteria: "Describes preset purposes"
|
||||
|
||||
visual_analysis:
|
||||
description: "Tasks for visual analysis and pattern recognition"
|
||||
tasks:
|
||||
- id: "find_similar_tiles"
|
||||
name: "Find Similar Tiles"
|
||||
prompt: "Find tiles similar to tile 42 in the ROM. Use a similarity threshold of 85%."
|
||||
expected_patterns:
|
||||
- "similar|match|tile"
|
||||
- "similarity|score|percent"
|
||||
required_tool: "visual-find-similar-tiles"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses visual-find-similar-tiles with correct parameters"
|
||||
completeness_criteria: "Returns list of matching tiles with scores"
|
||||
|
||||
- id: "analyze_spritesheet"
|
||||
name: "Analyze Spritesheet"
|
||||
prompt: "Analyze graphics sheet 10 to find unused regions that could be used for custom graphics."
|
||||
expected_patterns:
|
||||
- "unused|empty|free"
|
||||
- "region|space|tile"
|
||||
required_tool: "visual-analyze-spritesheet"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses visual-analyze-spritesheet tool"
|
||||
completeness_criteria: "Reports locations and sizes of free regions"
|
||||
|
||||
- id: "palette_usage"
|
||||
name: "Palette Usage Analysis"
|
||||
prompt: "Analyze which palettes are used most frequently in the overworld maps."
|
||||
expected_patterns:
|
||||
- "palette|color"
|
||||
- "usage|count|percent"
|
||||
required_tool: "visual-palette-usage"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses visual-palette-usage with overworld type"
|
||||
completeness_criteria: "Shows palette usage statistics"
|
||||
|
||||
- id: "tile_histogram"
|
||||
name: "Tile Usage Histogram"
|
||||
prompt: "Generate a histogram of the top 20 most used tiles in dungeon rooms."
|
||||
expected_patterns:
|
||||
- "tile|usage|histogram"
|
||||
- "count|frequency|top"
|
||||
required_tool: "visual-tile-histogram"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses visual-tile-histogram with dungeon type"
|
||||
completeness_criteria: "Lists top tiles with usage counts"
|
||||
|
||||
project_management:
|
||||
description: "Tasks for project state and snapshot management"
|
||||
tasks:
|
||||
- id: "project_status"
|
||||
name: "Get Project Status"
|
||||
prompt: "What is the current project status? Show me any pending edits and available snapshots."
|
||||
expected_patterns:
|
||||
- "project|status|snapshot"
|
||||
- "edit|pending|initialized"
|
||||
required_tool: "project-status"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses project-status tool correctly"
|
||||
completeness_criteria: "Reports project state, snapshots, and ROM checksum"
|
||||
|
||||
- id: "create_snapshot"
|
||||
name: "Create Project Snapshot"
|
||||
prompt: "Create a snapshot named 'v1.0' with description 'Initial sprite modifications'."
|
||||
expected_patterns:
|
||||
- "snapshot|created|v1.0"
|
||||
- "edit|delta|saved"
|
||||
required_tool: "project-snapshot"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses project-snapshot with correct name parameter"
|
||||
completeness_criteria: "Confirms snapshot creation with details"
|
||||
|
||||
- id: "compare_snapshots"
|
||||
name: "Compare Snapshots"
|
||||
prompt: "Compare snapshots 'before-fix' and 'after-fix' to see what changed."
|
||||
expected_patterns:
|
||||
- "diff|compare|changed"
|
||||
- "added|removed|modified"
|
||||
required_tool: "project-diff"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses project-diff with both snapshot names"
|
||||
completeness_criteria: "Shows detailed comparison of edits"
|
||||
|
||||
- id: "restore_checkpoint"
|
||||
name: "Restore to Checkpoint"
|
||||
prompt: "Restore the ROM to the 'stable' snapshot."
|
||||
expected_patterns:
|
||||
- "restore|snapshot|stable"
|
||||
- "applied|reverted|edit"
|
||||
required_tool: "project-restore"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses project-restore with correct snapshot name"
|
||||
completeness_criteria: "Confirms restoration and lists applied edits"
|
||||
|
||||
code_generation:
|
||||
description: "Tasks for ASM code generation and patching"
|
||||
tasks:
|
||||
- id: "generate_hook"
|
||||
name: "Generate ASM Hook"
|
||||
prompt: "Generate an ASM hook at address $008040 with label MyCustomHook and 2 NOPs for alignment."
|
||||
expected_patterns:
|
||||
- "hook|JSL|008040"
|
||||
- "MyCustomHook|NOP"
|
||||
required_tool: "codegen-asm-hook"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses codegen-asm-hook with correct address and label"
|
||||
completeness_criteria: "Generates valid ASM with proper hook structure"
|
||||
|
||||
- id: "find_freespace"
|
||||
name: "Find Freespace for Patch"
|
||||
prompt: "Generate a freespace patch for 256 bytes of code labeled 'NewSpriteCode', preferring bank $3F."
|
||||
expected_patterns:
|
||||
- "freespace|org|NewSpriteCode"
|
||||
- "1F8000|bank|free"
|
||||
required_tool: "codegen-freespace-patch"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses codegen-freespace-patch with size and label"
|
||||
completeness_criteria: "Reports available regions and generates allocation code"
|
||||
|
||||
- id: "sprite_template"
|
||||
name: "Generate Sprite Template"
|
||||
prompt: "Generate a sprite template named 'FollowerSprite' with init code that sets sprite state and main code that follows the player."
|
||||
expected_patterns:
|
||||
- "sprite|FollowerSprite|template"
|
||||
- "init|main|0DD0"
|
||||
required_tool: "codegen-sprite-template"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses codegen-sprite-template with name and custom code"
|
||||
completeness_criteria: "Generates complete sprite with init and main sections"
|
||||
|
||||
- id: "event_handler"
|
||||
name: "Generate Event Handler"
|
||||
prompt: "Generate an NMI event handler labeled 'FrameCounter' that increments a counter each frame."
|
||||
expected_patterns:
|
||||
- "NMI|event|handler"
|
||||
- "FrameCounter|INC|counter"
|
||||
required_tool: "codegen-event-handler"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses codegen-event-handler with type=nmi and label"
|
||||
completeness_criteria: "Generates handler with state preservation and custom code"
|
||||
|
||||
conversation:
|
||||
description: "Tasks testing multi-turn dialog and context"
|
||||
tasks:
|
||||
- id: "follow_up"
|
||||
name: "Follow-up Questions"
|
||||
multi_turn: true
|
||||
prompts:
|
||||
- "What is the main purpose of the Rom class?"
|
||||
- "What methods does it have for loading data?"
|
||||
- "Can you show me an example of using LoadFromFile?"
|
||||
expected_patterns:
|
||||
- "rom|ROM|file"
|
||||
- "load|read|parse"
|
||||
- "example|code|usage"
|
||||
scoring:
|
||||
accuracy_criteria: "Maintains context across turns"
|
||||
completeness_criteria: "Each response builds on previous"
|
||||
|
||||
- id: "clarification"
|
||||
name: "Handle Clarification"
|
||||
multi_turn: true
|
||||
prompts:
|
||||
- "How do I add a new sprite?"
|
||||
- "I mean in the dungeon editor, not the overworld"
|
||||
expected_patterns:
|
||||
- "sprite|dungeon|editor"
|
||||
- "add|create|place"
|
||||
scoring:
|
||||
accuracy_criteria: "Adjusts response based on clarification"
|
||||
completeness_criteria: "Provides dungeon-specific instructions"
|
||||
|
||||
# Scoring rubric definitions
|
||||
scoring_rubric:
|
||||
accuracy:
|
||||
10: "Perfect - completely correct with no errors"
|
||||
8: "Excellent - minor inaccuracies that don't affect understanding"
|
||||
6: "Good - mostly correct with some notable errors"
|
||||
4: "Fair - partially correct but missing key points"
|
||||
2: "Poor - significant errors or misunderstandings"
|
||||
0: "Incorrect - completely wrong or off-topic"
|
||||
|
||||
completeness:
|
||||
10: "Comprehensive - covers all aspects thoroughly"
|
||||
8: "Very complete - covers most aspects well"
|
||||
6: "Adequate - covers main points but missing some details"
|
||||
4: "Partial - covers some points but lacks depth"
|
||||
2: "Minimal - barely addresses the question"
|
||||
0: "Incomplete - doesn't meaningfully address the question"
|
||||
|
||||
tool_usage:
|
||||
10: "Perfect - uses correct tools with proper parameters"
|
||||
8: "Good - uses appropriate tools with minor parameter issues"
|
||||
6: "Adequate - uses tools but not optimally"
|
||||
4: "Fair - attempts tool use but with errors"
|
||||
2: "Poor - wrong tool or significant usage errors"
|
||||
0: "Failed - doesn't use required tools or fails completely"
|
||||
|
||||
# Report configuration
|
||||
reporting:
|
||||
output_format: "table" # table, json, markdown
|
||||
show_individual_scores: true
|
||||
show_response_samples: true
|
||||
max_sample_length: 500
|
||||
|
||||
scripts/ai/results/.gitkeep (new file, 3 lines)
@@ -0,0 +1,3 @@
|
||||
# This directory stores AI evaluation results
|
||||
# Results are gitignored but this file keeps the directory in the repo
|
||||
|
||||
scripts/ai/run-model-eval.sh (new executable file, 340 lines)
@@ -0,0 +1,340 @@
|
||||
#!/bin/bash
|
||||
# =============================================================================
|
||||
# YAZE AI Model Evaluation Script
|
||||
#
|
||||
# Runs AI model evaluations using the eval-runner.py engine.
|
||||
#
|
||||
# Usage:
|
||||
# ./run-model-eval.sh # Run with defaults
|
||||
# ./run-model-eval.sh --models llama3,qwen2.5 # Specific models
|
||||
# ./run-model-eval.sh --all # All available models
|
||||
# ./run-model-eval.sh --quick # Quick smoke test
|
||||
# ./run-model-eval.sh --compare # Compare and report
|
||||
#
|
||||
# Prerequisites:
|
||||
# - Ollama running (ollama serve)
|
||||
# - Python 3.10+ with requests and pyyaml
|
||||
# - At least one model pulled (ollama pull llama3.2)
|
||||
# =============================================================================
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
|
||||
RESULTS_DIR="$SCRIPT_DIR/results"
|
||||
|
||||
# Colors
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
CYAN='\033[0;36m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
# Default settings
|
||||
MODELS=""
|
||||
TASKS="all"
|
||||
TIMEOUT=120
|
||||
DRY_RUN=false
|
||||
COMPARE=false
|
||||
QUICK_MODE=false
|
||||
ALL_MODELS=false
|
||||
DEFAULT_MODELS=false
|
||||
VERBOSE=false
|
||||
|
||||
# =============================================================================
|
||||
# Helper Functions
|
||||
# =============================================================================
|
||||
|
||||
print_header() {
|
||||
echo -e "${CYAN}"
|
||||
echo "╔════════════════════════════════════════════════════════════════════╗"
|
||||
echo "║ YAZE AI Model Evaluation ║"
|
||||
echo "╚════════════════════════════════════════════════════════════════════╝"
|
||||
echo -e "${NC}"
|
||||
}
|
||||
|
||||
print_step() {
|
||||
echo -e "${BLUE}[*]${NC} $1"
|
||||
}
|
||||
|
||||
print_success() {
|
||||
echo -e "${GREEN}[✓]${NC} $1"
|
||||
}
|
||||
|
||||
print_warning() {
|
||||
echo -e "${YELLOW}[!]${NC} $1"
|
||||
}
|
||||
|
||||
print_error() {
|
||||
echo -e "${RED}[✗]${NC} $1"
|
||||
}
|
||||
|
||||
usage() {
|
||||
echo "Usage: $0 [OPTIONS]"
|
||||
echo ""
|
||||
echo "Options:"
|
||||
echo " --models, -m LIST Comma-separated list of models to evaluate"
|
||||
echo " --all Evaluate all available models"
|
||||
echo " --default Evaluate default models from config"
|
||||
echo " --tasks, -t LIST Task categories (default: all)"
|
||||
echo " Options: rom_inspection, code_analysis, tool_calling, conversation"
|
||||
echo " --timeout SEC Timeout per task in seconds (default: 120)"
|
||||
echo " --quick Quick smoke test (fewer tasks)"
|
||||
echo " --dry-run Show what would run without executing"
|
||||
echo " --compare Generate comparison report after evaluation"
|
||||
echo " --verbose, -v Verbose output"
|
||||
echo " --help, -h Show this help message"
|
||||
echo ""
|
||||
echo "Examples:"
|
||||
echo " $0 --models llama3.2,qwen2.5-coder --tasks tool_calling"
|
||||
echo " $0 --all --compare"
|
||||
echo " $0 --quick --default"
|
||||
}
|
||||
|
||||
check_prerequisites() {
|
||||
print_step "Checking prerequisites..."
|
||||
|
||||
local missing=false
|
||||
|
||||
# Check Python
|
||||
if ! command -v python3 &> /dev/null; then
|
||||
print_error "Python 3 not found"
|
||||
missing=true
|
||||
else
|
||||
print_success "Python 3 found: $(python3 --version)"
|
||||
fi
|
||||
|
||||
# Check Python packages
|
||||
if python3 -c "import requests" 2>/dev/null; then
|
||||
print_success "Python 'requests' package installed"
|
||||
else
|
||||
print_warning "Python 'requests' package missing - installing..."
|
||||
pip3 install requests --quiet || missing=true
|
||||
fi
|
||||
|
||||
if python3 -c "import yaml" 2>/dev/null; then
|
||||
print_success "Python 'pyyaml' package installed"
|
||||
else
|
||||
print_warning "Python 'pyyaml' package missing - installing..."
|
||||
pip3 install pyyaml --quiet || missing=true
|
||||
fi
|
||||
|
||||
# Check Ollama
|
||||
if ! command -v ollama &> /dev/null; then
|
||||
print_error "Ollama not found. Install from https://ollama.ai"
|
||||
missing=true
|
||||
else
|
||||
print_success "Ollama found: $(ollama --version 2>/dev/null || echo 'version unknown')"
|
||||
fi
|
||||
|
||||
# Check if Ollama is running
|
||||
if curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
|
||||
print_success "Ollama server is running"
|
||||
else
|
||||
print_warning "Ollama server not running - attempting to start..."
|
||||
ollama serve &> /dev/null &
|
||||
sleep 3
|
||||
if curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
|
||||
print_success "Ollama server started"
|
||||
else
|
||||
print_error "Could not start Ollama server. Run 'ollama serve' manually."
|
||||
missing=true
|
||||
fi
|
||||
fi
|
||||
|
||||
if $missing; then
|
||||
print_error "Prerequisites check failed"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo ""
|
||||
}
|
||||
|
||||
list_available_models() {
|
||||
curl -s http://localhost:11434/api/tags | python3 -c "
|
||||
import json, sys
|
||||
data = json.load(sys.stdin)
|
||||
for model in data.get('models', []):
|
||||
print(model['name'])
|
||||
" 2>/dev/null || echo ""
|
||||
}
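
# The /api/tags response parsed above looks roughly like (abridged):
#   {"models": [{"name": "llama3.2:latest", ...}, ...]}
# so this prints one installed model name per line.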
|
||||
|
||||
ensure_model() {
|
||||
local model=$1
|
||||
local available=$(list_available_models)
|
||||
|
||||
if echo "$available" | grep -q "^$model$"; then
|
||||
return 0
|
||||
else
|
||||
print_warning "Model '$model' not found, pulling..."
|
||||
ollama pull "$model"
|
||||
return $?
|
||||
fi
|
||||
}
|
||||
|
||||
run_evaluation() {
|
||||
local args=()
|
||||
|
||||
if [ -n "$MODELS" ]; then
|
||||
args+=(--models "$MODELS")
|
||||
elif $ALL_MODELS; then
|
||||
args+=(--all-models)
|
||||
elif $DEFAULT_MODELS; then
|
||||
args+=(--default-models)
|
||||
fi
|
||||
|
||||
args+=(--tasks "$TASKS")
|
||||
args+=(--timeout "$TIMEOUT")
|
||||
args+=(--config "$SCRIPT_DIR/eval-tasks.yaml")
|
||||
|
||||
if $DRY_RUN; then
|
||||
args+=(--dry-run)
|
||||
fi
|
||||
|
||||
local output_file="$RESULTS_DIR/eval-$(date +%Y%m%d-%H%M%S).json"
|
||||
args+=(--output "$output_file")
|
||||
|
||||
print_step "Running evaluation..."
|
||||
if $VERBOSE; then
|
||||
echo " Command: python3 $SCRIPT_DIR/eval-runner.py ${args[*]}"
|
||||
fi
|
||||
echo ""
|
||||
|
||||
python3 "$SCRIPT_DIR/eval-runner.py" "${args[@]}"
|
||||
local exit_code=$?
|
||||
|
||||
if [ $exit_code -eq 0 ]; then
|
||||
print_success "Evaluation completed successfully"
|
||||
elif [ $exit_code -eq 1 ]; then
|
||||
print_warning "Evaluation completed with moderate scores"
|
||||
else
|
||||
print_error "Evaluation completed with poor scores"
|
||||
fi
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
run_comparison() {
|
||||
print_step "Generating comparison report..."
|
||||
|
||||
local result_files=$(ls -t "$RESULTS_DIR"/eval-*.json 2>/dev/null | head -5)
|
||||
|
||||
if [ -z "$result_files" ]; then
|
||||
print_error "No result files found"
|
||||
return 1
|
||||
fi
|
||||
|
||||
local report_file="$RESULTS_DIR/comparison-$(date +%Y%m%d-%H%M%S).md"
|
||||
|
||||
python3 "$SCRIPT_DIR/compare-models.py" \
|
||||
--format markdown \
|
||||
--task-analysis \
|
||||
--output "$report_file" \
|
||||
$result_files
|
||||
|
||||
print_success "Comparison report: $report_file"
|
||||
|
||||
# Also print table to console
|
||||
echo ""
|
||||
python3 "$SCRIPT_DIR/compare-models.py" --format table $result_files
|
||||
}
|
||||
|
||||
quick_test() {
|
||||
print_step "Running quick smoke test..."
|
||||
|
||||
# Get first available model
|
||||
local available=$(list_available_models | head -1)
|
||||
|
||||
if [ -z "$available" ]; then
|
||||
print_error "No models available. Pull a model with: ollama pull llama3.2"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
print_step "Using model: $available"
|
||||
|
||||
# Run just one task category
|
||||
python3 "$SCRIPT_DIR/eval-runner.py" \
|
||||
--models "$available" \
|
||||
--tasks tool_calling \
|
||||
--timeout 60 \
|
||||
--config "$SCRIPT_DIR/eval-tasks.yaml"
|
||||
}
|
||||
|
||||
# =============================================================================
|
||||
# Main
|
||||
# =============================================================================
|
||||
|
||||
# Parse arguments
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
--models|-m)
|
||||
MODELS="$2"
|
||||
shift 2
|
||||
;;
|
||||
--all)
|
||||
ALL_MODELS=true
|
||||
shift
|
||||
;;
|
||||
--default)
|
||||
DEFAULT_MODELS=true
|
||||
shift
|
||||
;;
|
||||
--tasks|-t)
|
||||
TASKS="$2"
|
||||
shift 2
|
||||
;;
|
||||
--timeout)
|
||||
TIMEOUT="$2"
|
||||
shift 2
|
||||
;;
|
||||
--quick)
|
||||
QUICK_MODE=true
|
||||
shift
|
||||
;;
|
||||
--dry-run)
|
||||
DRY_RUN=true
|
||||
shift
|
||||
;;
|
||||
--compare)
|
||||
COMPARE=true
|
||||
shift
|
||||
;;
|
||||
--verbose|-v)
|
||||
VERBOSE=true
|
||||
shift
|
||||
;;
|
||||
--help|-h)
|
||||
usage
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
print_error "Unknown option: $1"
|
||||
usage
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Ensure results directory exists
|
||||
mkdir -p "$RESULTS_DIR"
|
||||
|
||||
print_header
|
||||
check_prerequisites
|
||||
|
||||
if $QUICK_MODE; then
|
||||
quick_test
|
||||
elif $DRY_RUN; then
|
||||
run_evaluation
|
||||
else
|
||||
run_evaluation
|
||||
|
||||
if $COMPARE; then
|
||||
echo ""
|
||||
run_comparison
|
||||
fi
|
||||
fi
|
||||
|
||||
echo ""
|
||||
print_success "Done!"
|
||||
|
||||