backend-infra-engineer: Post v0.3.9-hotfix7 snapshot (build cleanup)
scripts/ai/run-model-eval.sh (new executable file, 340 lines)

@@ -0,0 +1,340 @@
#!/bin/bash
# =============================================================================
# YAZE AI Model Evaluation Script
#
# Runs AI model evaluations using the eval-runner.py engine.
#
# Usage:
#   ./run-model-eval.sh                          # Run with defaults
#   ./run-model-eval.sh --models llama3,qwen2.5  # Specific models
#   ./run-model-eval.sh --all                    # All available models
#   ./run-model-eval.sh --quick                  # Quick smoke test
#   ./run-model-eval.sh --compare                # Compare and report
#
# Prerequisites:
#   - Ollama running (ollama serve)
#   - Python 3.10+ with requests and pyyaml
#   - At least one model pulled (ollama pull llama3.2)
# =============================================================================

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
RESULTS_DIR="$SCRIPT_DIR/results"

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color

# Default settings
MODELS=""
TASKS="all"
TIMEOUT=120
DRY_RUN=false
COMPARE=false
QUICK_MODE=false
ALL_MODELS=false
DEFAULT_MODELS=false
VERBOSE=false

# =============================================================================
# Helper Functions
# =============================================================================

print_header() {
    echo -e "${CYAN}"
    echo "╔════════════════════════════════════════════════════════════════════╗"
    echo "║                      YAZE AI Model Evaluation                      ║"
    echo "╚════════════════════════════════════════════════════════════════════╝"
    echo -e "${NC}"
}

print_step() {
    echo -e "${BLUE}[*]${NC} $1"
}

print_success() {
    echo -e "${GREEN}[✓]${NC} $1"
}

print_warning() {
    echo -e "${YELLOW}[!]${NC} $1"
}

print_error() {
    echo -e "${RED}[✗]${NC} $1"
}

usage() {
    echo "Usage: $0 [OPTIONS]"
    echo ""
    echo "Options:"
    echo "  --models, -m LIST   Comma-separated list of models to evaluate"
    echo "  --all               Evaluate all available models"
    echo "  --default           Evaluate default models from config"
    echo "  --tasks, -t LIST    Task categories (default: all)"
    echo "                      Options: rom_inspection, code_analysis, tool_calling, conversation"
    echo "  --timeout SEC       Timeout per task in seconds (default: 120)"
    echo "  --quick             Quick smoke test (fewer tasks)"
    echo "  --dry-run           Show what would run without executing"
    echo "  --compare           Generate comparison report after evaluation"
    echo "  --verbose, -v       Verbose output"
    echo "  --help, -h          Show this help message"
    echo ""
    echo "Examples:"
    echo "  $0 --models llama3.2,qwen2.5-coder --tasks tool_calling"
    echo "  $0 --all --compare"
    echo "  $0 --quick --default"
}

check_prerequisites() {
    print_step "Checking prerequisites..."

    local missing=false

    # Check Python
    if ! command -v python3 &> /dev/null; then
        print_error "Python 3 not found"
        missing=true
    else
        print_success "Python 3 found: $(python3 --version)"
    fi

    # Check Python packages
    if python3 -c "import requests" 2>/dev/null; then
        print_success "Python 'requests' package installed"
    else
        print_warning "Python 'requests' package missing - installing..."
        pip3 install requests --quiet || missing=true
    fi

    if python3 -c "import yaml" 2>/dev/null; then
        print_success "Python 'pyyaml' package installed"
    else
        print_warning "Python 'pyyaml' package missing - installing..."
        pip3 install pyyaml --quiet || missing=true
    fi

    # Check Ollama
    if ! command -v ollama &> /dev/null; then
        print_error "Ollama not found. Install from https://ollama.ai"
        missing=true
    else
        print_success "Ollama found: $(ollama --version 2>/dev/null || echo 'version unknown')"
    fi

    # Check if Ollama is running
    if curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
        print_success "Ollama server is running"
    else
        print_warning "Ollama server not running - attempting to start..."
        ollama serve &> /dev/null &
        sleep 3
        if curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
            print_success "Ollama server started"
        else
            print_error "Could not start Ollama server. Run 'ollama serve' manually."
            missing=true
        fi
    fi

    if $missing; then
        print_error "Prerequisites check failed"
        exit 1
    fi

    echo ""
}

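# The same checks can be reproduced by hand when debugging a failed run,
# e.g. (these mirror the probes above, nothing more):
#   python3 -c "import requests, yaml"        # both packages importable?
#   curl -s http://localhost:11434/api/tags   # is the Ollama server answering?
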
list_available_models() {
    curl -s http://localhost:11434/api/tags | python3 -c "
import json, sys
data = json.load(sys.stdin)
for model in data.get('models', []):
    print(model['name'])
" 2>/dev/null || echo ""
}

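# For reference, /api/tags responds with JSON shaped roughly like
#   {"models": [{"name": "llama3.2:latest", ...}, ...]}
# so the helper above prints one model name (including its tag) per line.
# If jq were installed, an equivalent one-liner would be (illustrative
# alternative only; the script sticks to python3, which
# check_prerequisites already guarantees):
#   curl -s http://localhost:11434/api/tags | jq -r '.models[].name'
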
# NOTE: not called by the main flow below; kept for manual use.
ensure_model() {
    local model=$1
    local available
    available=$(list_available_models)

    # Names from /api/tags carry a tag suffix (e.g. "llama3.2:latest"),
    # so accept an exact match or the bare name with any tag; a strict
    # "^$model$" match would re-pull models that are already present.
    if echo "$available" | grep -qE "^${model}(:|$)"; then
        return 0
    else
        print_warning "Model '$model' not found, pulling..."
        ollama pull "$model"
        return $?
    fi
}

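# Example (hypothetical manual use, e.g. after sourcing this file):
#   ensure_model "qwen2.5-coder" && ./run-model-eval.sh --models qwen2.5-coder
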
run_evaluation() {
    local args=()

    if [ -n "$MODELS" ]; then
        args+=(--models "$MODELS")
    elif $ALL_MODELS; then
        args+=(--all-models)
    elif $DEFAULT_MODELS; then
        args+=(--default-models)
    fi

    args+=(--tasks "$TASKS")
    args+=(--timeout "$TIMEOUT")
    args+=(--config "$SCRIPT_DIR/eval-tasks.yaml")

    if $DRY_RUN; then
        args+=(--dry-run)
    fi

    local output_file="$RESULTS_DIR/eval-$(date +%Y%m%d-%H%M%S).json"
    args+=(--output "$output_file")

    print_step "Running evaluation..."
    if $VERBOSE; then
        echo "  Command: python3 $SCRIPT_DIR/eval-runner.py ${args[*]}"
    fi
    echo ""

    # Capture the exit code without tripping 'set -e' on a nonzero status;
    # a bare 'local exit_code=$?' after the command would never run, since
    # 'set -e' aborts the script on failure. The runner grades with 0/1/2.
    local exit_code=0
    python3 "$SCRIPT_DIR/eval-runner.py" "${args[@]}" || exit_code=$?

    if [ $exit_code -eq 0 ]; then
        print_success "Evaluation completed successfully"
    elif [ $exit_code -eq 1 ]; then
        print_warning "Evaluation completed with moderate scores"
    else
        print_error "Evaluation completed with poor scores"
    fi

    return 0
}

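# With defaults, the assembled command looks roughly like this
# (illustrative values; the timestamp comes from the 'date' call above):
#   python3 "$SCRIPT_DIR/eval-runner.py" \
#     --tasks all --timeout 120 \
#     --config "$SCRIPT_DIR/eval-tasks.yaml" \
#     --output "$RESULTS_DIR/eval-20240101-093000.json"
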
run_comparison() {
    print_step "Generating comparison report..."

    # Collect the five most recent result files (newest first).
    local result_files
    result_files=$(ls -t "$RESULTS_DIR"/eval-*.json 2>/dev/null | head -5)

    if [ -z "$result_files" ]; then
        print_error "No result files found"
        return 1
    fi

    local report_file="$RESULTS_DIR/comparison-$(date +%Y%m%d-%H%M%S).md"

    # $result_files is deliberately unquoted so each filename becomes its
    # own argument (the timestamped names contain no whitespace).
    python3 "$SCRIPT_DIR/compare-models.py" \
        --format markdown \
        --task-analysis \
        --output "$report_file" \
        $result_files

    print_success "Comparison report: $report_file"

    # Also print table to console
    echo ""
    python3 "$SCRIPT_DIR/compare-models.py" --format table $result_files
}

quick_test() {
    print_step "Running quick smoke test..."

    # Get first available model
    local available
    available=$(list_available_models | head -1)

    if [ -z "$available" ]; then
        print_error "No models available. Pull a model with: ollama pull llama3.2"
        exit 1
    fi

    print_step "Using model: $available"

    # Run just one task category
    python3 "$SCRIPT_DIR/eval-runner.py" \
        --models "$available" \
        --tasks tool_calling \
        --timeout 60 \
        --config "$SCRIPT_DIR/eval-tasks.yaml"
}

# =============================================================================
# Main
# =============================================================================

# Parse arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --models|-m)
            MODELS="$2"
            shift 2
            ;;
        --all)
            ALL_MODELS=true
            shift
            ;;
        --default)
            DEFAULT_MODELS=true
            shift
            ;;
        --tasks|-t)
            TASKS="$2"
            shift 2
            ;;
        --timeout)
            TIMEOUT="$2"
            shift 2
            ;;
        --quick)
            QUICK_MODE=true
            shift
            ;;
        --dry-run)
            DRY_RUN=true
            shift
            ;;
        --compare)
            COMPARE=true
            shift
            ;;
        --verbose|-v)
            VERBOSE=true
            shift
            ;;
        --help|-h)
            usage
            exit 0
            ;;
        *)
            print_error "Unknown option: $1"
            usage
            exit 1
            ;;
    esac
done

# Ensure results directory exists
mkdir -p "$RESULTS_DIR"

print_header
check_prerequisites

if $QUICK_MODE; then
    quick_test
elif $DRY_RUN; then
    # Dry runs skip the comparison step even when --compare is given.
    run_evaluation
else
    run_evaluation

    if $COMPARE; then
        echo ""
        run_comparison
    fi
fi

echo ""
print_success "Done!"