#!/bin/bash
# =============================================================================
# YAZE AI Model Evaluation Script
#
# Runs AI model evaluations using the eval-runner.py engine.
#
# Usage:
#   ./run-model-eval.sh                              # Run with defaults
#   ./run-model-eval.sh --models llama3,qwen2.5      # Specific models
#   ./run-model-eval.sh --all                        # All available models
#   ./run-model-eval.sh --quick                      # Quick smoke test
#   ./run-model-eval.sh --compare                    # Compare and report
#
# Prerequisites:
#   - Ollama running (ollama serve)
#   - Python 3.10+ with requests and pyyaml
#   - At least one model pulled (ollama pull llama3.2)
# =============================================================================
set -euo pipefail
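# Resolve paths relative to this script's location so it can be invoked
# from any working directory.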
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
RESULTS_DIR="$SCRIPT_DIR/results"
# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color
# Default settings
MODELS=""
TASKS="all"
TIMEOUT=120
DRY_RUN=false
COMPARE=false
QUICK_MODE=false
ALL_MODELS=false
DEFAULT_MODELS=false
VERBOSE=false
# =============================================================================
# Helper Functions
# =============================================================================
print_header() {
    echo -e "${CYAN}"
    echo "╔════════════════════════════════════════════════════════════════════╗"
    echo "║                      YAZE AI Model Evaluation                      ║"
    echo "╚════════════════════════════════════════════════════════════════════╝"
    echo -e "${NC}"
}

print_step() {
    echo -e "${BLUE}[*]${NC} $1"
}

print_success() {
    echo -e "${GREEN}[✓]${NC} $1"
}

print_warning() {
    echo -e "${YELLOW}[!]${NC} $1"
}

print_error() {
    echo -e "${RED}[✗]${NC} $1"
}
usage() {
    echo "Usage: $0 [OPTIONS]"
    echo ""
    echo "Options:"
    echo "  --models, -m LIST   Comma-separated list of models to evaluate"
    echo "  --all               Evaluate all available models"
    echo "  --default           Evaluate default models from config"
    echo "  --tasks, -t LIST    Task categories (default: all)"
    echo "                      Options: rom_inspection, code_analysis, tool_calling, conversation"
    echo "  --timeout SEC       Timeout per task in seconds (default: 120)"
    echo "  --quick             Quick smoke test (fewer tasks)"
    echo "  --dry-run           Show what would run without executing"
    echo "  --compare           Generate comparison report after evaluation"
    echo "  --verbose, -v       Verbose output"
    echo "  --help, -h          Show this help message"
    echo ""
    echo "Examples:"
    echo "  $0 --models llama3.2,qwen2.5-coder --tasks tool_calling"
    echo "  $0 --all --compare"
    echo "  $0 --quick --default"
}
check_prerequisites() {
    print_step "Checking prerequisites..."
    local missing=false

    # Check Python
    if ! command -v python3 &> /dev/null; then
        print_error "Python 3 not found"
        missing=true
    else
        print_success "Python 3 found: $(python3 --version)"
    fi

    # Check Python packages
    if python3 -c "import requests" 2>/dev/null; then
        print_success "Python 'requests' package installed"
    else
        print_warning "Python 'requests' package missing - installing..."
        pip3 install requests --quiet || missing=true
    fi

    if python3 -c "import yaml" 2>/dev/null; then
        print_success "Python 'pyyaml' package installed"
    else
        print_warning "Python 'pyyaml' package missing - installing..."
        pip3 install pyyaml --quiet || missing=true
    fi

    # Check Ollama
    if ! command -v ollama &> /dev/null; then
        print_error "Ollama not found. Install from https://ollama.ai"
        missing=true
    else
        print_success "Ollama found: $(ollama --version 2>/dev/null || echo 'version unknown')"
    fi

    # Check if Ollama is running
    if curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
        print_success "Ollama server is running"
    else
        print_warning "Ollama server not running - attempting to start..."
        ollama serve &> /dev/null &
        sleep 3
        if curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
            print_success "Ollama server started"
        else
            print_error "Could not start Ollama server. Run 'ollama serve' manually."
            missing=true
        fi
    fi

    if $missing; then
        print_error "Prerequisites check failed"
        exit 1
    fi
    echo ""
}
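# Print the name of every model available on the local Ollama server,
# one per line (empty output if the API call fails).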
list_available_models() {
    curl -s http://localhost:11434/api/tags | python3 -c "
import json, sys
data = json.load(sys.stdin)
for model in data.get('models', []):
    print(model['name'])
" 2>/dev/null || echo ""
}
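# Pull a model from the Ollama registry if it is not already available
# locally. (Not called by the main flow below; kept as a helper.)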
ensure_model() {
    local model=$1
    local available
    available=$(list_available_models)
    # Exact, literal line match: model names contain regex metacharacters ('.')
    if echo "$available" | grep -Fxq -- "$model"; then
        return 0
    else
        print_warning "Model '$model' not found, pulling..."
        ollama pull "$model" || return $?
    fi
}
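# Assemble the eval-runner.py argument list from the parsed flags and run it,
# writing timestamped JSON results under results/. The eval-runner.py exit
# code is treated as a score band: 0 = good, 1 = moderate, anything else = poor.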
run_evaluation() {
    local args=()
    if [ -n "$MODELS" ]; then
        args+=(--models "$MODELS")
    elif $ALL_MODELS; then
        args+=(--all-models)
    elif $DEFAULT_MODELS; then
        args+=(--default-models)
    fi
    args+=(--tasks "$TASKS")
    args+=(--timeout "$TIMEOUT")
    args+=(--config "$SCRIPT_DIR/eval-tasks.yaml")
    if $DRY_RUN; then
        args+=(--dry-run)
    fi

    local output_file="$RESULTS_DIR/eval-$(date +%Y%m%d-%H%M%S).json"
    args+=(--output "$output_file")

    print_step "Running evaluation..."
    if $VERBOSE; then
        echo "  Command: python3 $SCRIPT_DIR/eval-runner.py ${args[*]}"
    fi
    echo ""

    # Capture the exit code without letting 'set -e' abort on a non-zero status
    local exit_code=0
    python3 "$SCRIPT_DIR/eval-runner.py" "${args[@]}" || exit_code=$?

    if [ $exit_code -eq 0 ]; then
        print_success "Evaluation completed successfully"
    elif [ $exit_code -eq 1 ]; then
        print_warning "Evaluation completed with moderate scores"
    else
        print_error "Evaluation completed with poor scores"
    fi
    return 0
}
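# Compare the most recent result files with compare-models.py, writing a
# Markdown report and echoing a summary table to the console.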
run_comparison() {
    print_step "Generating comparison report..."

    # Collect up to five of the most recent result files (newest first);
    # an array keeps paths with spaces intact
    local result_files=()
    local file
    while IFS= read -r file; do
        result_files+=("$file")
    done < <(ls -t "$RESULTS_DIR"/eval-*.json 2>/dev/null | head -5)
    if [ ${#result_files[@]} -eq 0 ]; then
        print_error "No result files found"
        return 1
    fi

    local report_file="$RESULTS_DIR/comparison-$(date +%Y%m%d-%H%M%S).md"
    python3 "$SCRIPT_DIR/compare-models.py" \
        --format markdown \
        --task-analysis \
        --output "$report_file" \
        "${result_files[@]}"
    print_success "Comparison report: $report_file"

    # Also print table to console
    echo ""
    python3 "$SCRIPT_DIR/compare-models.py" --format table "${result_files[@]}"
}
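# Smoke test: run only the tool_calling tasks against the first available
# model, with a shorter per-task timeout.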
quick_test() {
    print_step "Running quick smoke test..."

    # Get first available model
    local available
    available=$(list_available_models | head -1)
    if [ -z "$available" ]; then
        print_error "No models available. Pull a model with: ollama pull llama3.2"
        exit 1
    fi
    print_step "Using model: $available"

    # Run just one task category; don't let a non-zero score exit kill the script
    python3 "$SCRIPT_DIR/eval-runner.py" \
        --models "$available" \
        --tasks tool_calling \
        --timeout 60 \
        --config "$SCRIPT_DIR/eval-tasks.yaml" || \
        print_warning "Quick test exited with status $?"
}
# =============================================================================
# Main
# =============================================================================
# Parse arguments
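# Flags that take a value consume two positional arguments (shift 2);
# boolean flags consume one.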
while [[ $# -gt 0 ]]; do
    case $1 in
        --models|-m)
            MODELS="$2"
            shift 2
            ;;
        --all)
            ALL_MODELS=true
            shift
            ;;
        --default)
            DEFAULT_MODELS=true
            shift
            ;;
        --tasks|-t)
            TASKS="$2"
            shift 2
            ;;
        --timeout)
            TIMEOUT="$2"
            shift 2
            ;;
        --quick)
            QUICK_MODE=true
            shift
            ;;
        --dry-run)
            DRY_RUN=true
            shift
            ;;
        --compare)
            COMPARE=true
            shift
            ;;
        --verbose|-v)
            VERBOSE=true
            shift
            ;;
        --help|-h)
            usage
            exit 0
            ;;
        *)
            print_error "Unknown option: $1"
            usage
            exit 1
            ;;
    esac
done
# Ensure results directory exists
mkdir -p "$RESULTS_DIR"
print_header
check_prerequisites
if $QUICK_MODE; then
    quick_test
elif $DRY_RUN; then
    run_evaluation   # --dry-run: show the plan only; skip any comparison
else
    run_evaluation
    if $COMPARE; then
        echo ""
        run_comparison
    fi
fi
echo ""
print_success "Done!"