- Added functionality to start and manage the Ollama server within the agent test suite. - Implemented checks for the availability of the Ollama model and provided user feedback for setup. - Updated usage instructions to include environment variables for configuring the Ollama model. Benefits: - Improves the testing framework by allowing dynamic management of the Ollama server and model, enhancing test coverage and flexibility.
359 lines
10 KiB
Bash
Executable File
359 lines
10 KiB
Bash
Executable File
#!/bin/bash
|
|
# Comprehensive test script for Ollama and Gemini AI providers with tool calling
|
|
|
|
set -e
|
|
|
|
GREEN='\033[0;32m'
|
|
YELLOW='\033[1;33m'
|
|
RED='\033[0;31m'
|
|
BLUE='\033[0;34m'
|
|
NC='\033[0m' # No Color
|
|
|
|
Z3ED="./build_test/bin/z3ed"
|
|
RESULTS_FILE="/tmp/z3ed_ai_test_results.txt"
|
|
USE_MOCK_ROM=true # Set to false if you want to test with a real ROM
|
|
OLLAMA_MODEL="${OLLAMA_MODEL:-qwen2.5-coder:latest}"
|
|
OLLAMA_PID=""
|
|
|
|
echo "=========================================="
|
|
echo " Z3ED AI Provider Test Suite"
|
|
echo "=========================================="
|
|
echo ""
|
|
|
|
# Clear results file
|
|
> "$RESULTS_FILE"
|
|
|
|
# Cleanup function
|
|
cleanup() {
|
|
if [ -n "$OLLAMA_PID" ]; then
|
|
echo ""
|
|
echo "Stopping Ollama server (PID: $OLLAMA_PID)..."
|
|
kill "$OLLAMA_PID" 2>/dev/null || true
|
|
wait "$OLLAMA_PID" 2>/dev/null || true
|
|
fi
|
|
}
|
|
|
|
# Register cleanup on exit
|
|
trap cleanup EXIT INT TERM
|
|
|
|
# --- Helper Functions ---
|
|
|
|
# Start Ollama server if not already running
|
|
start_ollama_server() {
|
|
echo "Checking Ollama server status..."
|
|
|
|
# Check if Ollama is already running
|
|
if curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
|
|
echo -e "${GREEN}✓ Ollama server already running${NC}"
|
|
return 0
|
|
fi
|
|
|
|
# Check if ollama command exists
|
|
if ! command -v ollama &> /dev/null; then
|
|
echo -e "${YELLOW}⚠ Ollama command not found. Skipping Ollama tests.${NC}"
|
|
return 1
|
|
fi
|
|
|
|
echo "Starting Ollama server..."
|
|
ollama serve > /tmp/ollama_server.log 2>&1 &
|
|
OLLAMA_PID=$!
|
|
|
|
# Wait for server to be ready (max 30 seconds)
|
|
local max_wait=30
|
|
local waited=0
|
|
while [ $waited -lt $max_wait ]; do
|
|
if curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
|
|
echo -e "${GREEN}✓ Ollama server started (PID: $OLLAMA_PID)${NC}"
|
|
return 0
|
|
fi
|
|
sleep 1
|
|
waited=$((waited + 1))
|
|
done
|
|
|
|
echo -e "${RED}✗ Ollama server failed to start within ${max_wait}s${NC}"
|
|
echo "Check logs at: /tmp/ollama_server.log"
|
|
return 1
|
|
}
|
|
|
|
# Ensure Ollama model is available
|
|
setup_ollama_model() {
|
|
local model="$1"
|
|
echo "Checking for Ollama model: $model"
|
|
|
|
if ollama list | grep -q "${model%:*}"; then
|
|
echo -e "${GREEN}✓ Model $model already available${NC}"
|
|
return 0
|
|
fi
|
|
|
|
echo "Pulling Ollama model: $model (this may take a while)..."
|
|
if ollama pull "$model"; then
|
|
echo -e "${GREEN}✓ Model $model pulled successfully${NC}"
|
|
return 0
|
|
else
|
|
echo -e "${RED}✗ Failed to pull model $model${NC}"
|
|
return 1
|
|
fi
|
|
}
|
|
|
|
# --- Pre-flight Checks ---
|
|
|
|
if [ -z "$1" ]; then
|
|
echo "❌ Error: No AI provider specified."
|
|
echo "Usage: $0 <ollama|gemini|mock>"
|
|
echo ""
|
|
echo "Environment Variables:"
|
|
echo " OLLAMA_MODEL - Ollama model to use (default: qwen2.5-coder:latest)"
|
|
echo " GEMINI_API_KEY - Required for Gemini provider"
|
|
echo ""
|
|
echo "Examples:"
|
|
echo " $0 ollama # Use Ollama with default model"
|
|
echo " OLLAMA_MODEL=llama3:8b $0 ollama # Use Ollama with llama3"
|
|
echo " GEMINI_API_KEY=xyz $0 gemini # Use Gemini"
|
|
exit 1
|
|
fi
|
|
PROVIDER=$1
|
|
echo "✅ Provider: $PROVIDER"
|
|
|
|
# Check binary exists
|
|
if [ ! -f "$Z3ED" ]; then
|
|
echo -e "${RED}✗ z3ed binary not found at: $Z3ED${NC}"
|
|
echo "Run: cmake --build build_test"
|
|
exit 1
|
|
fi
|
|
echo "✅ z3ed binary found"
|
|
|
|
# Set ROM flags based on mode
|
|
if [ "$USE_MOCK_ROM" = true ]; then
|
|
ROM_FLAGS="--mock-rom"
|
|
echo "✅ Using mock ROM mode (no ROM file required)"
|
|
else
|
|
ROM="assets/zelda3.sfc"
|
|
if [ ! -f "$ROM" ]; then
|
|
echo -e "${RED}✗ ROM file not found: $ROM${NC}"
|
|
echo "Tip: Use mock ROM mode by setting USE_MOCK_ROM=true"
|
|
exit 1
|
|
fi
|
|
ROM_FLAGS="--rom=\"$ROM\""
|
|
echo "✅ Real ROM found: $ROM"
|
|
fi
|
|
|
|
# Verify z3ed can execute
|
|
if "$Z3ED" --help > /dev/null 2>&1; then
|
|
echo "✅ z3ed executable works"
|
|
else
|
|
echo "${RED}✗ z3ed failed to execute${NC}"
|
|
exit 1
|
|
fi
|
|
|
|
# Setup Ollama if needed
|
|
OLLAMA_AVAILABLE=false
|
|
if [ "$PROVIDER" == "ollama" ] || [ -z "$PROVIDER" ]; then
|
|
if start_ollama_server; then
|
|
if setup_ollama_model "$OLLAMA_MODEL"; then
|
|
OLLAMA_AVAILABLE=true
|
|
echo -e "${GREEN}✓ Ollama ready with model: $OLLAMA_MODEL${NC}"
|
|
else
|
|
echo -e "${YELLOW}⚠ Ollama server running but model setup failed${NC}"
|
|
fi
|
|
else
|
|
echo -e "${YELLOW}⚠ Ollama server not available${NC}"
|
|
fi
|
|
fi
|
|
|
|
# Test Gemini availability
|
|
GEMINI_AVAILABLE=false
|
|
if [ -n "$GEMINI_API_KEY" ]; then
|
|
GEMINI_AVAILABLE=true
|
|
echo -e "${GREEN}✓ Gemini API key configured${NC}"
|
|
else
|
|
echo -e "${YELLOW}⚠ Gemini API key not set${NC}"
|
|
fi
|
|
|
|
if [ "$PROVIDER" == "ollama" ] && [ "$OLLAMA_AVAILABLE" = false ]; then
|
|
echo -e "${RED}✗ Exiting: Ollama provider requested but not available.${NC}"
|
|
exit 1
|
|
fi
|
|
|
|
if [ "$PROVIDER" == "gemini" ] && [ "$GEMINI_AVAILABLE" = false ]; then
|
|
echo -e "${RED}✗ Exiting: Gemini provider requested but GEMINI_API_KEY is not set.${NC}"
|
|
exit 1
|
|
fi
|
|
|
|
# --- Run Test Suite ---
|
|
|
|
# Test function
|
|
run_test() {
|
|
local test_name="$1"
|
|
local provider="$2"
|
|
local query="$3"
|
|
local expected_pattern="$4"
|
|
local extra_args="$5"
|
|
|
|
echo "=========================================="
|
|
echo " Test: $test_name"
|
|
echo " Provider: $provider"
|
|
echo "=========================================="
|
|
echo ""
|
|
echo "Query: $query"
|
|
echo ""
|
|
|
|
local cmd="$Z3ED agent simple-chat \"$query\" $ROM_FLAGS --ai_provider=$provider $extra_args"
|
|
echo "Running: $cmd"
|
|
echo ""
|
|
|
|
local output
|
|
local exit_code=0
|
|
output=$($cmd 2>&1) || exit_code=$?
|
|
|
|
echo "$output"
|
|
echo ""
|
|
|
|
# Check for expected patterns
|
|
local result="UNKNOWN"
|
|
if [ $exit_code -ne 0 ]; then
|
|
result="FAILED (exit code: $exit_code)"
|
|
elif echo "$output" | grep -qi "$expected_pattern"; then
|
|
result="PASSED"
|
|
echo -e "${GREEN}✓ Response contains expected pattern: '$expected_pattern'${NC}"
|
|
else
|
|
result="FAILED (pattern not found)"
|
|
echo -e "${YELLOW}⚠ Response missing expected pattern: '$expected_pattern'${NC}"
|
|
fi
|
|
|
|
# Check for error indicators
|
|
if echo "$output" | grep -qi "error\|failed\|infinite loop"; then
|
|
result="FAILED (error detected)"
|
|
echo -e "${RED}✗ Error detected in output${NC}"
|
|
fi
|
|
|
|
# Record result
|
|
echo "$test_name | $provider | $result" >> "$RESULTS_FILE"
|
|
echo ""
|
|
echo -e "${BLUE}Result: $result${NC}"
|
|
echo ""
|
|
|
|
sleep 2 # Avoid rate limiting
|
|
}
|
|
|
|
# Test Suite
|
|
|
|
if [ "$OLLAMA_AVAILABLE" = true ]; then
|
|
echo ""
|
|
echo "=========================================="
|
|
echo " OLLAMA TESTS"
|
|
echo "=========================================="
|
|
echo ""
|
|
|
|
run_test "Ollama: Simple Question" "ollama" \
|
|
"What dungeons are in this ROM?" \
|
|
"dungeon\|palace\|castle"
|
|
|
|
run_test "Ollama: Sprite Query" "ollama" \
|
|
"What sprites are in room 0?" \
|
|
"sprite\|room"
|
|
|
|
run_test "Ollama: Tile Search" "ollama" \
|
|
"Where can I find trees in the overworld?" \
|
|
"tree\|0x02E\|map\|coordinate"
|
|
|
|
run_test "Ollama: Map Description" "ollama" \
|
|
"Describe overworld map 0" \
|
|
"light world\|map\|overworld"
|
|
|
|
run_test "Ollama: Warp List" "ollama" \
|
|
"List the warps in the Light World" \
|
|
"warp\|entrance\|exit"
|
|
fi
|
|
|
|
if [ "$GEMINI_AVAILABLE" = true ]; then
|
|
echo ""
|
|
echo "=========================================="
|
|
echo " GEMINI TESTS"
|
|
echo "=========================================="
|
|
echo ""
|
|
|
|
run_test "Gemini: Simple Question" "gemini" \
|
|
"What dungeons are in this ROM?" \
|
|
"dungeon\|palace\|castle" \
|
|
"--gemini_api_key=\"$GEMINI_API_KEY\""
|
|
|
|
run_test "Gemini: Sprite Query" "gemini" \
|
|
"What sprites are in room 0?" \
|
|
"sprite\|room" \
|
|
"--gemini_api_key=\"$GEMINI_API_KEY\""
|
|
|
|
run_test "Gemini: Tile Search" "gemini" \
|
|
"Where can I find trees in the overworld?" \
|
|
"tree\|0x02E\|map\|coordinate" \
|
|
"--gemini_api_key=\"$GEMINI_API_KEY\""
|
|
|
|
run_test "Gemini: Map Description" "gemini" \
|
|
"Describe overworld map 0" \
|
|
"light world\|map\|overworld" \
|
|
"--gemini_api_key=\"$GEMINI_API_KEY\""
|
|
|
|
run_test "Gemini: Warp List" "gemini" \
|
|
"List the warps in the Light World" \
|
|
"warp\|entrance\|exit" \
|
|
"--gemini_api_key=\"$GEMINI_API_KEY\""
|
|
fi
|
|
|
|
echo ""
|
|
echo "=========================================="
|
|
echo " TEST SUMMARY"
|
|
echo "=========================================="
|
|
echo ""
|
|
|
|
if [ -f "$RESULTS_FILE" ]; then
|
|
cat "$RESULTS_FILE"
|
|
echo ""
|
|
|
|
local total=$(wc -l < "$RESULTS_FILE" | tr -d ' ')
|
|
local passed=$(grep -c "PASSED" "$RESULTS_FILE" || echo "0")
|
|
local failed=$(grep -c "FAILED" "$RESULTS_FILE" || echo "0")
|
|
|
|
echo "Total Tests: $total"
|
|
echo -e "${GREEN}Passed: $passed${NC}"
|
|
echo -e "${RED}Failed: $failed${NC}"
|
|
echo ""
|
|
|
|
if [ "$passed" -eq "$total" ]; then
|
|
echo -e "${GREEN}🎉 All tests passed!${NC}"
|
|
elif [ "$passed" -gt 0 ]; then
|
|
echo -e "${YELLOW}⚠ Some tests failed. Review output above.${NC}"
|
|
else
|
|
echo -e "${RED}✗ All tests failed. Check configuration.${NC}"
|
|
fi
|
|
else
|
|
echo -e "${RED}✗ No results file generated${NC}"
|
|
fi
|
|
|
|
echo ""
|
|
echo "=========================================="
|
|
echo " Recommendations"
|
|
echo "=========================================="
|
|
echo ""
|
|
echo "If tests are failing:"
|
|
echo " 1. Check that the ROM is valid and loaded properly"
|
|
echo " 2. Verify tool definitions in prompt_catalogue.yaml"
|
|
echo " 3. Review system prompts in prompt_builder.cc"
|
|
echo " 4. Check AI provider connectivity and quotas"
|
|
echo " 5. Examine tool execution logs for errors"
|
|
echo ""
|
|
echo "For Ollama:"
|
|
echo " - Try different models: OLLAMA_MODEL=llama3:8b $0 ollama"
|
|
echo " - Default model: $OLLAMA_MODEL"
|
|
echo " - Adjust temperature in ollama_ai_service.cc"
|
|
echo " - Server logs: /tmp/ollama_server.log"
|
|
echo ""
|
|
echo "For Gemini:"
|
|
echo " - Verify API key is valid"
|
|
echo " - Check quota at: https://aistudio.google.com"
|
|
echo ""
|
|
echo "Environment Variables:"
|
|
echo " - OLLAMA_MODEL: Set the Ollama model (default: qwen2.5-coder:latest)"
|
|
echo " - GEMINI_API_KEY: Required for Gemini tests"
|
|
echo ""
|
|
echo "Results saved to: $RESULTS_FILE"
|
|
echo ""
|