341 lines
9.0 KiB
Bash
Executable File
341 lines
9.0 KiB
Bash
Executable File
#!/bin/bash
|
|
# =============================================================================
|
|
# YAZE AI Model Evaluation Script
|
|
#
|
|
# Runs AI model evaluations using the eval-runner.py engine.
|
|
#
|
|
# Usage:
|
|
# ./run-model-eval.sh # Run with defaults
|
|
# ./run-model-eval.sh --models llama3,qwen2.5 # Specific models
|
|
# ./run-model-eval.sh --all # All available models
|
|
# ./run-model-eval.sh --quick # Quick smoke test
|
|
# ./run-model-eval.sh --compare # Compare and report
|
|
#
|
|
# Prerequisites:
|
|
# - Ollama running (ollama serve)
|
|
# - Python 3.10+ with requests and pyyaml
|
|
# - At least one model pulled (ollama pull llama3.2)
|
|
# =============================================================================
|
|
|
|
set -euo pipefail
|
|
|
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
|
|
RESULTS_DIR="$SCRIPT_DIR/results"
|
|
|
|
# Colors
|
|
RED='\033[0;31m'
|
|
GREEN='\033[0;32m'
|
|
YELLOW='\033[1;33m'
|
|
BLUE='\033[0;34m'
|
|
CYAN='\033[0;36m'
|
|
NC='\033[0m' # No Color
|
|
|
|
# Default settings
|
|
MODELS=""
|
|
TASKS="all"
|
|
TIMEOUT=120
|
|
DRY_RUN=false
|
|
COMPARE=false
|
|
QUICK_MODE=false
|
|
ALL_MODELS=false
|
|
DEFAULT_MODELS=false
|
|
VERBOSE=false
|
|
|
|
# =============================================================================
|
|
# Helper Functions
|
|
# =============================================================================
|
|
|
|
print_header() {
|
|
echo -e "${CYAN}"
|
|
echo "╔════════════════════════════════════════════════════════════════════╗"
|
|
echo "║ YAZE AI Model Evaluation ║"
|
|
echo "╚════════════════════════════════════════════════════════════════════╝"
|
|
echo -e "${NC}"
|
|
}
|
|
|
|
print_step() {
|
|
echo -e "${BLUE}[*]${NC} $1"
|
|
}
|
|
|
|
print_success() {
|
|
echo -e "${GREEN}[✓]${NC} $1"
|
|
}
|
|
|
|
print_warning() {
|
|
echo -e "${YELLOW}[!]${NC} $1"
|
|
}
|
|
|
|
print_error() {
|
|
echo -e "${RED}[✗]${NC} $1"
|
|
}
|
|
|
|
usage() {
|
|
echo "Usage: $0 [OPTIONS]"
|
|
echo ""
|
|
echo "Options:"
|
|
echo " --models, -m LIST Comma-separated list of models to evaluate"
|
|
echo " --all Evaluate all available models"
|
|
echo " --default Evaluate default models from config"
|
|
echo " --tasks, -t LIST Task categories (default: all)"
|
|
echo " Options: rom_inspection, code_analysis, tool_calling, conversation"
|
|
echo " --timeout SEC Timeout per task in seconds (default: 120)"
|
|
echo " --quick Quick smoke test (fewer tasks)"
|
|
echo " --dry-run Show what would run without executing"
|
|
echo " --compare Generate comparison report after evaluation"
|
|
echo " --verbose, -v Verbose output"
|
|
echo " --help, -h Show this help message"
|
|
echo ""
|
|
echo "Examples:"
|
|
echo " $0 --models llama3.2,qwen2.5-coder --tasks tool_calling"
|
|
echo " $0 --all --compare"
|
|
echo " $0 --quick --default"
|
|
}
|
|
|
|
check_prerequisites() {
|
|
print_step "Checking prerequisites..."
|
|
|
|
local missing=false
|
|
|
|
# Check Python
|
|
if ! command -v python3 &> /dev/null; then
|
|
print_error "Python 3 not found"
|
|
missing=true
|
|
else
|
|
print_success "Python 3 found: $(python3 --version)"
|
|
fi
|
|
|
|
# Check Python packages
|
|
if python3 -c "import requests" 2>/dev/null; then
|
|
print_success "Python 'requests' package installed"
|
|
else
|
|
print_warning "Python 'requests' package missing - installing..."
|
|
pip3 install requests --quiet || missing=true
|
|
fi
|
|
|
|
if python3 -c "import yaml" 2>/dev/null; then
|
|
print_success "Python 'pyyaml' package installed"
|
|
else
|
|
print_warning "Python 'pyyaml' package missing - installing..."
|
|
pip3 install pyyaml --quiet || missing=true
|
|
fi
|
|
|
|
# Check Ollama
|
|
if ! command -v ollama &> /dev/null; then
|
|
print_error "Ollama not found. Install from https://ollama.ai"
|
|
missing=true
|
|
else
|
|
print_success "Ollama found: $(ollama --version 2>/dev/null || echo 'version unknown')"
|
|
fi
|
|
|
|
# Check if Ollama is running
|
|
if curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
|
|
print_success "Ollama server is running"
|
|
else
|
|
print_warning "Ollama server not running - attempting to start..."
|
|
ollama serve &> /dev/null &
|
|
sleep 3
|
|
if curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
|
|
print_success "Ollama server started"
|
|
else
|
|
print_error "Could not start Ollama server. Run 'ollama serve' manually."
|
|
missing=true
|
|
fi
|
|
fi
|
|
|
|
if $missing; then
|
|
print_error "Prerequisites check failed"
|
|
exit 1
|
|
fi
|
|
|
|
echo ""
|
|
}
|
|
|
|
list_available_models() {
|
|
curl -s http://localhost:11434/api/tags | python3 -c "
|
|
import json, sys
|
|
data = json.load(sys.stdin)
|
|
for model in data.get('models', []):
|
|
print(model['name'])
|
|
" 2>/dev/null || echo ""
|
|
}
|
|
|
|
ensure_model() {
|
|
local model=$1
|
|
local available=$(list_available_models)
|
|
|
|
if echo "$available" | grep -q "^$model$"; then
|
|
return 0
|
|
else
|
|
print_warning "Model '$model' not found, pulling..."
|
|
ollama pull "$model"
|
|
return $?
|
|
fi
|
|
}
|
|
|
|
run_evaluation() {
|
|
local args=()
|
|
|
|
if [ -n "$MODELS" ]; then
|
|
args+=(--models "$MODELS")
|
|
elif $ALL_MODELS; then
|
|
args+=(--all-models)
|
|
elif $DEFAULT_MODELS; then
|
|
args+=(--default-models)
|
|
fi
|
|
|
|
args+=(--tasks "$TASKS")
|
|
args+=(--timeout "$TIMEOUT")
|
|
args+=(--config "$SCRIPT_DIR/eval-tasks.yaml")
|
|
|
|
if $DRY_RUN; then
|
|
args+=(--dry-run)
|
|
fi
|
|
|
|
local output_file="$RESULTS_DIR/eval-$(date +%Y%m%d-%H%M%S).json"
|
|
args+=(--output "$output_file")
|
|
|
|
print_step "Running evaluation..."
|
|
if $VERBOSE; then
|
|
echo " Command: python3 $SCRIPT_DIR/eval-runner.py ${args[*]}"
|
|
fi
|
|
echo ""
|
|
|
|
python3 "$SCRIPT_DIR/eval-runner.py" "${args[@]}"
|
|
local exit_code=$?
|
|
|
|
if [ $exit_code -eq 0 ]; then
|
|
print_success "Evaluation completed successfully"
|
|
elif [ $exit_code -eq 1 ]; then
|
|
print_warning "Evaluation completed with moderate scores"
|
|
else
|
|
print_error "Evaluation completed with poor scores"
|
|
fi
|
|
|
|
return 0
|
|
}
|
|
|
|
run_comparison() {
|
|
print_step "Generating comparison report..."
|
|
|
|
local result_files=$(ls -t "$RESULTS_DIR"/eval-*.json 2>/dev/null | head -5)
|
|
|
|
if [ -z "$result_files" ]; then
|
|
print_error "No result files found"
|
|
return 1
|
|
fi
|
|
|
|
local report_file="$RESULTS_DIR/comparison-$(date +%Y%m%d-%H%M%S).md"
|
|
|
|
python3 "$SCRIPT_DIR/compare-models.py" \
|
|
--format markdown \
|
|
--task-analysis \
|
|
--output "$report_file" \
|
|
$result_files
|
|
|
|
print_success "Comparison report: $report_file"
|
|
|
|
# Also print table to console
|
|
echo ""
|
|
python3 "$SCRIPT_DIR/compare-models.py" --format table $result_files
|
|
}
|
|
|
|
quick_test() {
|
|
print_step "Running quick smoke test..."
|
|
|
|
# Get first available model
|
|
local available=$(list_available_models | head -1)
|
|
|
|
if [ -z "$available" ]; then
|
|
print_error "No models available. Pull a model with: ollama pull llama3.2"
|
|
exit 1
|
|
fi
|
|
|
|
print_step "Using model: $available"
|
|
|
|
# Run just one task category
|
|
python3 "$SCRIPT_DIR/eval-runner.py" \
|
|
--models "$available" \
|
|
--tasks tool_calling \
|
|
--timeout 60 \
|
|
--config "$SCRIPT_DIR/eval-tasks.yaml"
|
|
}
|
|
|
|
# =============================================================================
|
|
# Main
|
|
# =============================================================================
|
|
|
|
# Parse arguments
|
|
while [[ $# -gt 0 ]]; do
|
|
case $1 in
|
|
--models|-m)
|
|
MODELS="$2"
|
|
shift 2
|
|
;;
|
|
--all)
|
|
ALL_MODELS=true
|
|
shift
|
|
;;
|
|
--default)
|
|
DEFAULT_MODELS=true
|
|
shift
|
|
;;
|
|
--tasks|-t)
|
|
TASKS="$2"
|
|
shift 2
|
|
;;
|
|
--timeout)
|
|
TIMEOUT="$2"
|
|
shift 2
|
|
;;
|
|
--quick)
|
|
QUICK_MODE=true
|
|
shift
|
|
;;
|
|
--dry-run)
|
|
DRY_RUN=true
|
|
shift
|
|
;;
|
|
--compare)
|
|
COMPARE=true
|
|
shift
|
|
;;
|
|
--verbose|-v)
|
|
VERBOSE=true
|
|
shift
|
|
;;
|
|
--help|-h)
|
|
usage
|
|
exit 0
|
|
;;
|
|
*)
|
|
print_error "Unknown option: $1"
|
|
usage
|
|
exit 1
|
|
;;
|
|
esac
|
|
done
|
|
|
|
# Ensure results directory exists
|
|
mkdir -p "$RESULTS_DIR"
|
|
|
|
print_header
|
|
check_prerequisites
|
|
|
|
if $QUICK_MODE; then
|
|
quick_test
|
|
elif $DRY_RUN; then
|
|
run_evaluation
|
|
else
|
|
run_evaluation
|
|
|
|
if $COMPARE; then
|
|
echo ""
|
|
run_comparison
|
|
fi
|
|
fi
|
|
|
|
echo ""
|
|
print_success "Done!"
|
|
|