backend-infra-engineer: Post v0.3.9-hotfix7 snapshot (build cleanup)
596
scripts/ai/eval-runner.py
Executable file
@@ -0,0 +1,596 @@
#!/usr/bin/env python3
"""
YAZE AI Model Evaluation Runner

Runs evaluation tasks against multiple AI models and produces scored results.

Usage:
    python eval-runner.py --models llama3,qwen2.5-coder --tasks rom_inspection
    python eval-runner.py --all-models --tasks all --output results/eval-$(date +%Y%m%d).json

Requirements:
    pip install requests pyyaml
"""

import argparse
import json
import os
import re
import subprocess
import sys
import time
from dataclasses import asdict, dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, Optional

import requests
import yaml


@dataclass
class TaskResult:
    """Result of a single task evaluation."""
    task_id: str
    task_name: str
    category: str
    model: str
    prompt: str
    response: str
    response_time: float
    accuracy_score: float = 0.0
    completeness_score: float = 0.0
    tool_usage_score: float = 0.0
    pattern_matches: list = field(default_factory=list)
    tools_used: list = field(default_factory=list)
    error: Optional[str] = None

    @property
    def overall_score(self) -> float:
        """Calculate weighted overall score."""
        # Default weights from eval-tasks.yaml
        weights = {
            'accuracy': 0.4,
            'completeness': 0.3,
            'tool_usage': 0.2,
            'response_time': 0.1
        }

        # Normalize response time to 0-10 scale (lower is better)
        # 0s = 10, 60s+ = 0
        time_score = max(0, 10 - (self.response_time / 6))

        return (
            weights['accuracy'] * self.accuracy_score +
            weights['completeness'] * self.completeness_score +
            weights['tool_usage'] * self.tool_usage_score +
            weights['response_time'] * time_score
        )
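
    # Worked example of the weighting above (illustrative numbers, not from a
    # real run): accuracy=8.0, completeness=6.0, tool_usage=7.0 and a 12s
    # response give time_score = 10 - 12/6 = 8.0, so
    # overall_score = 0.4*8.0 + 0.3*6.0 + 0.2*7.0 + 0.1*8.0 = 7.2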


@dataclass
class ModelResults:
    """Aggregated results for a single model."""
    model: str
    tasks: list[TaskResult] = field(default_factory=list)

    @property
    def avg_accuracy(self) -> float:
        if not self.tasks:
            return 0.0
        return sum(t.accuracy_score for t in self.tasks) / len(self.tasks)

    @property
    def avg_completeness(self) -> float:
        if not self.tasks:
            return 0.0
        return sum(t.completeness_score for t in self.tasks) / len(self.tasks)

    @property
    def avg_tool_usage(self) -> float:
        if not self.tasks:
            return 0.0
        return sum(t.tool_usage_score for t in self.tasks) / len(self.tasks)

    @property
    def avg_response_time(self) -> float:
        if not self.tasks:
            return 0.0
        return sum(t.response_time for t in self.tasks) / len(self.tasks)

    @property
    def overall_score(self) -> float:
        if not self.tasks:
            return 0.0
        return sum(t.overall_score for t in self.tasks) / len(self.tasks)


class OllamaClient:
    """Client for Ollama API."""

    def __init__(self, base_url: str = "http://localhost:11434"):
        self.base_url = base_url

    def is_available(self) -> bool:
        """Check if Ollama is running."""
        try:
            resp = requests.get(f"{self.base_url}/api/tags", timeout=5)
            return resp.status_code == 200
        except requests.exceptions.RequestException:
            return False

    def list_models(self) -> list[str]:
        """List available models."""
        try:
            resp = requests.get(f"{self.base_url}/api/tags", timeout=10)
            if resp.status_code == 200:
                data = resp.json()
                return [m['name'] for m in data.get('models', [])]
        except requests.exceptions.RequestException:
            pass
        return []

    def pull_model(self, model: str) -> bool:
        """Pull a model if not available."""
        print(f" Pulling model {model}...", end=" ", flush=True)
        try:
            resp = requests.post(
                f"{self.base_url}/api/pull",
                json={"name": model},
                timeout=600  # 10 minutes for large models
            )
            if resp.status_code == 200:
                print("Done")
                return True
            # Terminate the progress line on non-200 responses as well
            print(f"Failed: HTTP {resp.status_code}")
        except requests.exceptions.RequestException as e:
            print(f"Failed: {e}")
        return False

    def chat(self, model: str, prompt: str, timeout: int = 120) -> tuple[str, float]:
        """
        Send a chat message and return response + response time.

        Returns:
            Tuple of (response_text, response_time_seconds)
        """
        start_time = time.time()

        try:
            resp = requests.post(
                f"{self.base_url}/api/chat",
                json={
                    "model": model,
                    "messages": [{"role": "user", "content": prompt}],
                    "stream": False
                },
                timeout=timeout
            )

            elapsed = time.time() - start_time

            if resp.status_code == 200:
                data = resp.json()
                content = data.get("message", {}).get("content", "")
                return content, elapsed
            else:
                return f"Error: HTTP {resp.status_code}", elapsed

        except requests.exceptions.Timeout:
            return "Error: Request timed out", timeout
        except requests.exceptions.RequestException as e:
            return f"Error: {str(e)}", time.time() - start_time
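
    # Note: with "stream": False the /api/chat endpoint returns a single JSON
    # object; the only fields this client relies on are (abridged):
    #   {"message": {"role": "assistant", "content": "..."}}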


class TaskEvaluator:
    """Evaluates task responses and assigns scores."""

    def __init__(self, config: dict):
        self.config = config

    def evaluate(self, task: dict, response: str, response_time: float) -> TaskResult:
        """Evaluate a response for a task."""
        result = TaskResult(
            task_id=task['id'],
            task_name=task['name'],
            category=task.get('category', 'unknown'),
            model=task.get('model', 'unknown'),
            prompt=task.get('prompt', ''),
            response=response,
            response_time=response_time
        )

        if response.startswith("Error:"):
            result.error = response
            return result

        # Check pattern matches
        expected_patterns = task.get('expected_patterns', [])
        for pattern in expected_patterns:
            if re.search(pattern, response, re.IGNORECASE):
                result.pattern_matches.append(pattern)

        # Score accuracy based on pattern matches
        if expected_patterns:
            match_ratio = len(result.pattern_matches) / len(expected_patterns)
            result.accuracy_score = match_ratio * 10
        else:
            # No patterns defined, give neutral score
            result.accuracy_score = 5.0

        # Score completeness based on response length and structure
        result.completeness_score = self._score_completeness(response, task)

        # Score tool usage
        result.tool_usage_score = self._score_tool_usage(response, task)

        return result
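
    # For example (hypothetical task): a response matching 3 of 4
    # expected_patterns scores accuracy = 3/4 * 10 = 7.5.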

    def _score_completeness(self, response: str, task: dict) -> float:
        """Score completeness based on response characteristics."""
        score = 0.0

        # Base score for having a response
        if len(response.strip()) > 0:
            score += 2.0

        # Length bonus (up to 4 points)
        word_count = len(response.split())
        if word_count >= 20:
            score += min(4.0, word_count / 50)

        # Structure bonus (up to 2 points)
        if '\n' in response:
            score += 1.0  # Multi-line response
        if '- ' in response or '* ' in response:
            score += 0.5  # List items
        if any(c.isdigit() for c in response):
            score += 0.5  # Contains numbers/data

        # Code block bonus (fenced ``` or 4-space indented code)
        if '```' in response or '    ' in response:
            score += 1.0

        return min(10.0, score)
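
    # Worked example (hypothetical response): 100 words across several lines,
    # with a bullet list, some digits, and a fenced code block scores
    #   2.0 + min(4.0, 100/50) + 1.0 + 0.5 + 0.5 + 1.0 = 7.0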

    def _score_tool_usage(self, response: str, task: dict) -> float:
        """Score tool usage based on task requirements."""
        required_tool = task.get('required_tool')

        if not required_tool:
            # No tool required, check if response is sensible
            return 7.0  # Neutral-good score

        # Check if the response mentions using tools
        tool_patterns = [
            r'filesystem-list',
            r'filesystem-read',
            r'filesystem-exists',
            r'filesystem-info',
            r'build-configure',
            r'build-compile',
            r'build-test',
            r'memory-analyze',
            r'memory-search',
        ]

        tools_mentioned = []
        for pattern in tool_patterns:
            if re.search(pattern, response, re.IGNORECASE):
                tools_mentioned.append(pattern)

        if required_tool.lower() in ' '.join(tools_mentioned).lower():
            return 10.0  # Used the required tool
        elif tools_mentioned:
            return 6.0  # Used some tools but not the required one
        else:
            return 3.0  # Didn't use any tools when one was required


def load_config(config_path: str) -> dict:
    """Load the evaluation tasks configuration."""
    with open(config_path, 'r') as f:
        return yaml.safe_load(f)


def get_tasks_for_categories(config: dict, categories: list[str]) -> list[dict]:
    """Get all tasks for specified categories."""
    tasks = []

    for cat_name, cat_data in config.get('categories', {}).items():
        if 'all' in categories or cat_name in categories:
            for task in cat_data.get('tasks', []):
                task['category'] = cat_name
                tasks.append(task)

    return tasks

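
# For reference, the loaders above expect eval-tasks.yaml to look roughly like
# the sketch below (illustrative only; the ids, names and values are made up,
# and only keys read by this script are shown):
#
#   default_models:
#     - name: llama3
#     - name: qwen2.5-coder
#   categories:
#     rom_inspection:
#       tasks:
#         - id: rom_001
#           name: Inspect the ROM header
#           prompt: "..."
#           expected_patterns: ["header", "checksum"]
#           required_tool: filesystem-read
#           multi_turn: false

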
def run_evaluation(
    models: list[str],
    tasks: list[dict],
    client: OllamaClient,
    evaluator: TaskEvaluator,
    timeout: int = 120
) -> dict[str, ModelResults]:
    """Run evaluation for all models and tasks."""
    results = {}

    total = len(models) * len(tasks)
    current = 0

    for model in models:
        print(f"\n{'='*60}")
        print(f"Evaluating: {model}")
        print(f"{'='*60}")

        model_results = ModelResults(model=model)

        for task in tasks:
            current += 1
            print(f"\n [{current}/{total}] {task['id']}: {task['name']}")

            # Handle multi-turn tasks differently
            if task.get('multi_turn'):
                response, resp_time = run_multi_turn_task(
                    client, model, task, timeout
                )
            else:
                prompt = task.get('prompt', '')
                print(f" Prompt: {prompt[:60]}...")
                response, resp_time = client.chat(model, prompt, timeout)

            print(f" Response time: {resp_time:.2f}s")

            # Create a copy of task with model info
            task_with_model = {**task, 'model': model}

            # Evaluate the response
            result = evaluator.evaluate(task_with_model, response, resp_time)
            model_results.tasks.append(result)

            print(f" Accuracy: {result.accuracy_score:.1f}/10")
            print(f" Completeness: {result.completeness_score:.1f}/10")
            print(f" Tool Usage: {result.tool_usage_score:.1f}/10")
            print(f" Overall: {result.overall_score:.1f}/10")

        results[model] = model_results

    return results


def run_multi_turn_task(
    client: OllamaClient,
    model: str,
    task: dict,
    timeout: int
) -> tuple[str, float]:
    """Run a multi-turn conversation task."""
    prompts = task.get('prompts', [])
    if not prompts:
        return "Error: No prompts defined for multi-turn task", 0.0

    total_time = 0.0
    all_responses = []

    for i, prompt in enumerate(prompts):
        # For simplicity, we send each prompt independently
        # A more sophisticated version would maintain conversation context
        print(f" Turn {i+1}: {prompt[:50]}...")
        response, resp_time = client.chat(model, prompt, timeout)
        total_time += resp_time
        all_responses.append(f"Turn {i+1}: {response}")

    return "\n\n".join(all_responses), total_time
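
# A context-preserving variant (sketch only, not used by the runner) could
# accumulate the transcript and resend it on every turn via the same
# non-streaming /api/chat call that OllamaClient.chat uses:
#
#   messages = []
#   for prompt in prompts:
#       messages.append({"role": "user", "content": prompt})
#       reply = requests.post(f"{client.base_url}/api/chat",
#                             json={"model": model, "messages": messages,
#                                   "stream": False},
#                             timeout=timeout).json()
#       messages.append(reply["message"])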


def print_summary(results: dict[str, ModelResults]):
    """Print a summary table of results."""
    print("\n")
    print("┌" + "─"*74 + "┐")
    print("│" + " "*21 + "YAZE AI Model Evaluation Report" + " "*22 + "│")
    print("├" + "─"*74 + "┤")
    print("│ {:20} │ {:>10} │ {:>10} │ {:>10} │ {:>10} │".format(
        "Model", "Accuracy", "Tool Use", "Speed", "Overall"
    ))
    print("├" + "─"*74 + "┤")

    for model, model_results in sorted(
        results.items(),
        key=lambda x: x[1].overall_score,
        reverse=True
    ):
        # Format model name (truncate if needed)
        model_name = model[:20] if len(model) <= 20 else model[:17] + "..."

        # Column widths are chosen so every row spans the same 74-character
        # interior as the box-drawing borders above.
        print("│ {:20} │ {:7.1f}/10 │ {:7.1f}/10 │ {:9.1f}s │ {:7.1f}/10 │".format(
            model_name,
            model_results.avg_accuracy,
            model_results.avg_tool_usage,
            model_results.avg_response_time,
            model_results.overall_score
        ))

    print("└" + "─"*74 + "┘")


def save_results(results: dict[str, ModelResults], output_path: str):
    """Save detailed results to JSON file."""
    output_data = {
        "timestamp": datetime.now().isoformat(),
        "version": "1.0",
        "models": {}
    }

    for model, model_results in results.items():
        output_data["models"][model] = {
            "summary": {
                "avg_accuracy": model_results.avg_accuracy,
                "avg_completeness": model_results.avg_completeness,
                "avg_tool_usage": model_results.avg_tool_usage,
                "avg_response_time": model_results.avg_response_time,
                "overall_score": model_results.overall_score,
            },
            "tasks": [asdict(t) for t in model_results.tasks]
        }

    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(output_data, f, indent=2)

    print(f"\nResults saved to: {output_path}")
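
# The saved file can be reloaded later for comparison, e.g. (illustrative path
# and model name):
#   with open("results/eval-20240101-120000.json") as f:
#       data = json.load(f)
#   print(data["models"]["llama3"]["summary"]["overall_score"])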


def main():
    parser = argparse.ArgumentParser(
        description="YAZE AI Model Evaluation Runner"
    )
    parser.add_argument(
        "--models", "-m",
        type=str,
        help="Comma-separated list of models to evaluate"
    )
    parser.add_argument(
        "--all-models",
        action="store_true",
        help="Evaluate all available models"
    )
    parser.add_argument(
        "--default-models",
        action="store_true",
        help="Evaluate default models from config"
    )
    parser.add_argument(
        "--tasks", "-t",
        type=str,
        default="all",
        help="Task categories to run (comma-separated, or 'all')"
    )
    parser.add_argument(
        "--config", "-c",
        type=str,
        default=os.path.join(os.path.dirname(__file__), "eval-tasks.yaml"),
        help="Path to evaluation config file"
    )
    parser.add_argument(
        "--output", "-o",
        type=str,
        help="Output file for results (default: results/eval-TIMESTAMP.json)"
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=120,
        help="Timeout in seconds for each task (default: 120)"
    )
    parser.add_argument(
        "--ollama-url",
        type=str,
        default="http://localhost:11434",
        help="Ollama API URL"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be evaluated without running"
    )

    args = parser.parse_args()

    # Load configuration
    print("Loading configuration...")
    try:
        config = load_config(args.config)
    except Exception as e:
        print(f"Error loading config: {e}")
        sys.exit(1)

    # Initialize Ollama client
    client = OllamaClient(args.ollama_url)

    if not client.is_available():
        print("Error: Ollama is not running. Start it with 'ollama serve'")
        sys.exit(1)

    # Determine which models to evaluate
    available_models = client.list_models()
    print(f"Available models: {', '.join(available_models) or 'none'}")

    if args.all_models:
        models = available_models
    elif args.default_models:
        default_model_names = [
            m['name'] for m in config.get('default_models', [])
        ]
        models = [m for m in default_model_names if m in available_models]
        # Offer to pull missing models
        missing = [m for m in default_model_names if m not in available_models]
        if missing:
            print(f"Missing default models: {', '.join(missing)}")
            for m in missing:
                if client.pull_model(m):
                    models.append(m)
    elif args.models:
        models = [m.strip() for m in args.models.split(',')]
        # Validate models exist; iterate over a copy so that removing a model
        # that fails to pull does not skip the next entry.
        for m in list(models):
            if m not in available_models:
                print(f"Warning: Model '{m}' not found. Attempting to pull...")
                if not client.pull_model(m):
                    print(f" Failed to pull {m}, skipping")
                    models.remove(m)
    else:
        # Default to first available model
        models = available_models[:1] if available_models else []

    if not models:
        print("No models available for evaluation")
        sys.exit(1)

    print(f"Models to evaluate: {', '.join(models)}")

    # Get tasks
    categories = [c.strip() for c in args.tasks.split(',')]
    tasks = get_tasks_for_categories(config, categories)

    if not tasks:
        print(f"No tasks found for categories: {args.tasks}")
        sys.exit(1)

    print(f"Tasks to run: {len(tasks)}")
    for task in tasks:
        print(f" - [{task['category']}] {task['id']}: {task['name']}")

    if args.dry_run:
        print("\nDry run complete. Use --help for options.")
        sys.exit(0)

    # Run evaluation
    evaluator = TaskEvaluator(config)
    results = run_evaluation(
        models, tasks, client, evaluator, args.timeout
    )

    # Print summary
    print_summary(results)

    # Save results
    output_path = args.output or os.path.join(
        os.path.dirname(__file__),
        "results",
        f"eval-{datetime.now().strftime('%Y%m%d-%H%M%S')}.json"
    )
    save_results(results, output_path)

    # Return exit code based on best model score
    best_score = max(r.overall_score for r in results.values())
    if best_score >= 7.0:
        sys.exit(0)  # Good
    elif best_score >= 5.0:
        sys.exit(1)  # Okay
    else:
        sys.exit(2)  # Poor


if __name__ == "__main__":
    main()