backend-infra-engineer: Post v0.3.9-hotfix7 snapshot (build cleanup)
scripts/ai/run-model-eval.sh (new executable file, 340 lines)

@@ -0,0 +1,340 @@
#!/bin/bash
# =============================================================================
# YAZE AI Model Evaluation Script
#
# Runs AI model evaluations using the eval-runner.py engine.
#
# Usage:
#   ./run-model-eval.sh                          # Run with defaults
#   ./run-model-eval.sh --models llama3,qwen2.5  # Specific models
#   ./run-model-eval.sh --all                    # All available models
#   ./run-model-eval.sh --quick                  # Quick smoke test
#   ./run-model-eval.sh --compare                # Compare and report
#
# Prerequisites:
#   - Ollama running (ollama serve)
#   - Python 3.10+ with requests and pyyaml
#   - At least one model pulled (ollama pull llama3.2)
# =============================================================================

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
RESULTS_DIR="$SCRIPT_DIR/results"

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color

# Default settings
MODELS=""
TASKS="all"
TIMEOUT=120
DRY_RUN=false
COMPARE=false
QUICK_MODE=false
ALL_MODELS=false
DEFAULT_MODELS=false
VERBOSE=false

# =============================================================================
# Helper Functions
# =============================================================================

print_header() {
    echo -e "${CYAN}"
    echo "╔════════════════════════════════════════════════════════════════════╗"
    echo "║                      YAZE AI Model Evaluation                      ║"
    echo "╚════════════════════════════════════════════════════════════════════╝"
    echo -e "${NC}"
}

print_step() {
    echo -e "${BLUE}[*]${NC} $1"
}

print_success() {
    echo -e "${GREEN}[✓]${NC} $1"
}

print_warning() {
    echo -e "${YELLOW}[!]${NC} $1"
}

print_error() {
    echo -e "${RED}[✗]${NC} $1"
}

usage() {
    echo "Usage: $0 [OPTIONS]"
    echo ""
    echo "Options:"
    echo "  --models, -m LIST   Comma-separated list of models to evaluate"
    echo "  --all               Evaluate all available models"
    echo "  --default           Evaluate default models from config"
    echo "  --tasks, -t LIST    Task categories (default: all)"
    echo "                      Options: rom_inspection, code_analysis, tool_calling, conversation"
    echo "  --timeout SEC       Timeout per task in seconds (default: 120)"
    echo "  --quick             Quick smoke test (fewer tasks)"
    echo "  --dry-run           Show what would run without executing"
    echo "  --compare           Generate comparison report after evaluation"
    echo "  --verbose, -v       Verbose output"
    echo "  --help, -h          Show this help message"
    echo ""
    echo "Examples:"
    echo "  $0 --models llama3.2,qwen2.5-coder --tasks tool_calling"
    echo "  $0 --all --compare"
    echo "  $0 --quick --default"
}

check_prerequisites() {
    print_step "Checking prerequisites..."

    local missing=false

    # Check Python
    if ! command -v python3 &> /dev/null; then
        print_error "Python 3 not found"
        missing=true
    else
        print_success "Python 3 found: $(python3 --version)"
    fi

    # Check Python packages
    if python3 -c "import requests" 2>/dev/null; then
        print_success "Python 'requests' package installed"
    else
        print_warning "Python 'requests' package missing - installing..."
        pip3 install requests --quiet || missing=true
    fi

    if python3 -c "import yaml" 2>/dev/null; then
        print_success "Python 'pyyaml' package installed"
    else
        print_warning "Python 'pyyaml' package missing - installing..."
        pip3 install pyyaml --quiet || missing=true
    fi

    # Check Ollama
    if ! command -v ollama &> /dev/null; then
        print_error "Ollama not found. Install from https://ollama.ai"
        missing=true
    else
        print_success "Ollama found: $(ollama --version 2>/dev/null || echo 'version unknown')"
    fi

    # Check if Ollama is running
    if curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
        print_success "Ollama server is running"
    else
        print_warning "Ollama server not running - attempting to start..."
        ollama serve &> /dev/null &
        sleep 3
        if curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
            print_success "Ollama server started"
        else
            print_error "Could not start Ollama server. Run 'ollama serve' manually."
            missing=true
        fi
    fi

    if $missing; then
        print_error "Prerequisites check failed"
        exit 1
    fi

    echo ""
}

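# The same checks can be reproduced by hand when debugging a failed run,
# e.g. (these mirror the probes above, nothing more):
#   python3 -c "import requests, yaml"        # both packages importable?
#   curl -s http://localhost:11434/api/tags   # is the Ollama server answering?
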
list_available_models() {
    curl -s http://localhost:11434/api/tags | python3 -c "
import json, sys
data = json.load(sys.stdin)
for model in data.get('models', []):
    print(model['name'])
" 2>/dev/null || echo ""
}

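# For reference, /api/tags responds with JSON shaped roughly like
#   {"models": [{"name": "llama3.2:latest", ...}, ...]}
# so the helper above prints one model name (including its tag) per line.
# If jq were installed, an equivalent one-liner would be (illustrative
# alternative only; the script sticks to python3, which
# check_prerequisites already guarantees):
#   curl -s http://localhost:11434/api/tags | jq -r '.models[].name'
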
# NOTE: not called by the main flow below; kept for manual use.
ensure_model() {
    local model=$1
    local available
    available=$(list_available_models)

    # Names from /api/tags carry a tag suffix (e.g. "llama3.2:latest"),
    # so accept an exact match or the bare name with any tag; a strict
    # "^$model$" match would re-pull models that are already present.
    if echo "$available" | grep -qE "^${model}(:|$)"; then
        return 0
    else
        print_warning "Model '$model' not found, pulling..."
        ollama pull "$model"
        return $?
    fi
}

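# Example (hypothetical manual use, e.g. after sourcing this file):
#   ensure_model "qwen2.5-coder" && ./run-model-eval.sh --models qwen2.5-coder
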
run_evaluation() {
    local args=()

    if [ -n "$MODELS" ]; then
        args+=(--models "$MODELS")
    elif $ALL_MODELS; then
        args+=(--all-models)
    elif $DEFAULT_MODELS; then
        args+=(--default-models)
    fi

    args+=(--tasks "$TASKS")
    args+=(--timeout "$TIMEOUT")
    args+=(--config "$SCRIPT_DIR/eval-tasks.yaml")

    if $DRY_RUN; then
        args+=(--dry-run)
    fi

    local output_file="$RESULTS_DIR/eval-$(date +%Y%m%d-%H%M%S).json"
    args+=(--output "$output_file")

    print_step "Running evaluation..."
    if $VERBOSE; then
        echo "  Command: python3 $SCRIPT_DIR/eval-runner.py ${args[*]}"
    fi
    echo ""

    # Capture the exit code without tripping 'set -e' on a nonzero status;
    # a bare 'local exit_code=$?' after the command would never run, since
    # 'set -e' aborts the script on failure. The runner grades with 0/1/2.
    local exit_code=0
    python3 "$SCRIPT_DIR/eval-runner.py" "${args[@]}" || exit_code=$?

    if [ $exit_code -eq 0 ]; then
        print_success "Evaluation completed successfully"
    elif [ $exit_code -eq 1 ]; then
        print_warning "Evaluation completed with moderate scores"
    else
        print_error "Evaluation completed with poor scores"
    fi

    return 0
}

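# With defaults, the assembled command looks roughly like this
# (illustrative values; the timestamp comes from the 'date' call above):
#   python3 "$SCRIPT_DIR/eval-runner.py" \
#     --tasks all --timeout 120 \
#     --config "$SCRIPT_DIR/eval-tasks.yaml" \
#     --output "$RESULTS_DIR/eval-20240101-093000.json"
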
run_comparison() {
    print_step "Generating comparison report..."

    # Collect the five most recent result files (newest first).
    local result_files
    result_files=$(ls -t "$RESULTS_DIR"/eval-*.json 2>/dev/null | head -5)

    if [ -z "$result_files" ]; then
        print_error "No result files found"
        return 1
    fi

    local report_file="$RESULTS_DIR/comparison-$(date +%Y%m%d-%H%M%S).md"

    # $result_files is deliberately unquoted so each filename becomes its
    # own argument (the timestamped names contain no whitespace).
    python3 "$SCRIPT_DIR/compare-models.py" \
        --format markdown \
        --task-analysis \
        --output "$report_file" \
        $result_files

    print_success "Comparison report: $report_file"

    # Also print table to console
    echo ""
    python3 "$SCRIPT_DIR/compare-models.py" --format table $result_files
}

quick_test() {
    print_step "Running quick smoke test..."

    # Get first available model
    local available
    available=$(list_available_models | head -1)

    if [ -z "$available" ]; then
        print_error "No models available. Pull a model with: ollama pull llama3.2"
        exit 1
    fi

    print_step "Using model: $available"

    # Run just one task category
    python3 "$SCRIPT_DIR/eval-runner.py" \
        --models "$available" \
        --tasks tool_calling \
        --timeout 60 \
        --config "$SCRIPT_DIR/eval-tasks.yaml"
}

# =============================================================================
# Main
# =============================================================================

# Parse arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --models|-m)
            MODELS="$2"
            shift 2
            ;;
        --all)
            ALL_MODELS=true
            shift
            ;;
        --default)
            DEFAULT_MODELS=true
            shift
            ;;
        --tasks|-t)
            TASKS="$2"
            shift 2
            ;;
        --timeout)
            TIMEOUT="$2"
            shift 2
            ;;
        --quick)
            QUICK_MODE=true
            shift
            ;;
        --dry-run)
            DRY_RUN=true
            shift
            ;;
        --compare)
            COMPARE=true
            shift
            ;;
        --verbose|-v)
            VERBOSE=true
            shift
            ;;
        --help|-h)
            usage
            exit 0
            ;;
        *)
            print_error "Unknown option: $1"
            usage
            exit 1
            ;;
    esac
done

# Ensure results directory exists
mkdir -p "$RESULTS_DIR"

print_header
check_prerequisites

if $QUICK_MODE; then
    quick_test
elif $DRY_RUN; then
    # Dry runs skip the comparison step even when --compare is given.
    run_evaluation
else
    run_evaluation

    if $COMPARE; then
        echo ""
        run_comparison
    fi
fi

echo ""
print_success "Done!"