feat: Revamp agent test suite script for improved functionality and usability

- Converted the agent test suite script to a more comprehensive format, consolidating multiple tests into a single script.
- Enhanced pre-flight checks for AI provider availability, including Ollama and Gemini.
- Implemented detailed test execution and result logging, providing clearer output and recommendations for troubleshooting.
- Removed outdated test scripts to streamline the testing process and improve maintainability.
- Updated README to reflect changes in the test suite and added build environment verification instructions.
This commit is contained in:
scawful
2025-10-04 14:10:04 -04:00
parent 3ef157b991
commit 99d37a8747
8 changed files with 250 additions and 1260 deletions

303
scripts/agent_test_suite.sh Normal file → Executable file
View File

@@ -1,93 +1,238 @@
#!/bin/bash
# Comprehensive test script for Ollama and Gemini AI providers with tool calling
# Comprehensive test suite for the z3ed AI Agent.
# This script consolidates multiple older test scripts into one.
#
# Usage: ./scripts/agent_test_suite.sh <provider>
# provider: ollama, gemini, or mock
set -e
set -e # Exit immediately if a command exits with a non-zero status.
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# --- Configuration ---
Z3ED_BIN="/Users/scawful/Code/yaze/build_test/bin/z3ed"
ROM_PATH="/Users/scawful/Code/yaze/assets/zelda3.sfc"
TEST_DIR="/Users/scawful/Code/yaze/assets/agent"
TEST_FILES=(
"context_and_followup.txt"
"complex_command_generation.txt"
"error_handling_and_edge_cases.txt"
)
Z3ED="./build_test/bin/z3ed"
ROM="assets/zelda3.sfc"
RESULTS_FILE="/tmp/z3ed_ai_test_results.txt"
# --- Helper Functions ---
print_header() {
echo ""
echo "================================================="
echo "$1"
echo "================================================="
echo "=========================================="
echo " Z3ED AI Provider Test Suite"
echo "=========================================="
echo ""
# Clear results file
> "$RESULTS_FILE"
# Pre-flight: abort early when the z3ed binary named by $Z3ED is missing.
if [ ! -f "$Z3ED" ]; then
  echo -e "${RED}✗ z3ed not found at $Z3ED${NC}"
  # The hint names build_test, the build tree that the configured $Z3ED
  # path (./build_test/bin/z3ed) lives in.
  echo " Try building with: cmake --build build_test"
  exit 1
fi
echo -e "${GREEN}✓ z3ed found${NC}"
# Pre-flight: the ROM at $ROM must be a regular file; otherwise stop with
# exit status 1 before any test is attempted.
if [ -f "$ROM" ]; then
  echo -e "${GREEN}✓ ROM found${NC}"
else
  echo -e "${RED}✗ ROM not found at $ROM${NC}"
  exit 1
fi
# Probe for a usable Ollama setup. OLLAMA_AVAILABLE becomes true only when
# the `ollama` CLI is on PATH, the local API answers on port 11434, and
# `ollama list` mentions qwen2.5-coder.
OLLAMA_AVAILABLE=false
if ! command -v ollama > /dev/null 2>&1 \
  || ! curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
  echo -e "${YELLOW}⚠ Ollama not available${NC}"
elif ollama list | grep -q "qwen2.5-coder"; then
  OLLAMA_AVAILABLE=true
  echo -e "${GREEN}✓ Ollama available (qwen2.5-coder)${NC}"
else
  echo -e "${YELLOW}⚠ Ollama available but qwen2.5-coder not found${NC}"
  echo " Install with: ollama pull qwen2.5-coder:7b"
fi
# Gemini counts as available whenever GEMINI_API_KEY is non-empty; the key
# itself is not validated here.
if [ -z "$GEMINI_API_KEY" ]; then
  GEMINI_AVAILABLE=false
  echo -e "${YELLOW}⚠ Gemini API key not set${NC}"
  echo " Set with: export GEMINI_API_KEY='your-key'"
else
  GEMINI_AVAILABLE=true
  echo -e "${GREEN}✓ Gemini API key configured${NC}"
fi
# Nothing to test against: stop with exit status 1 when both provider
# flags are exactly "false".
case "${OLLAMA_AVAILABLE}:${GEMINI_AVAILABLE}" in
  false:false)
    echo -e "${RED}✗ No AI providers available${NC}"
    exit 1
    ;;
esac
echo ""
#######################################
# Run one agent query against a provider and record the outcome.
# Globals:
#   Z3ED, ROM, RESULTS_FILE (read)
#   GREEN, YELLOW, RED, BLUE, NC (read, colour escapes)
# Arguments:
#   $1 - test name (used as a label in output and in the results file)
#   $2 - AI provider, passed as --ai_provider
#   $3 - query text, passed to z3ed as a single argument
#   $4 - grep pattern (basic regex, case-insensitive) expected in the output
#   $5 - optional extra z3ed flags as one whitespace-separated string;
#        literal double quotes inside it are stripped
# Outputs:
#   Progress and the agent output on stdout; appends one
#   "name | provider | result" line to $RESULTS_FILE.
#######################################
run_test() {
  local test_name="$1"
  local provider="$2"
  local query="$3"
  local expected_pattern="$4"
  local extra_args="${5:-}"

  echo "=========================================="
  echo " Test: $test_name"
  echo " Provider: $provider"
  echo "=========================================="
  echo ""
  echo "Query: $query"
  echo ""

  # Build the command as an argv array. Expanding a flat string unquoted
  # would split the query on spaces and hand the embedded quote characters
  # to z3ed as literal text.
  local -a cmd=("$Z3ED" agent simple-chat "$query" "--rom=$ROM" "--ai_provider=$provider")

  # Callers pass extra flags as a single string, possibly with embedded
  # double quotes (e.g. --gemini_api_key="KEY"); split on whitespace and
  # drop those quotes so the value reaches z3ed clean.
  local -a extra_words=()
  local word
  if [ -n "$extra_args" ]; then
    read -r -a extra_words <<< "$extra_args"
  fi
  for word in "${extra_words[@]}"; do
    cmd+=("${word//\"/}")
  done

  # Log the command shell-quoted, with the API key masked so it does not
  # end up in terminal scrollback or CI logs.
  local -a shown=()
  for word in "${cmd[@]}"; do
    case "$word" in
      --gemini_api_key=*) shown+=("--gemini_api_key=REDACTED") ;;
      *) shown+=("$word") ;;
    esac
  done
  printf 'Running:'
  printf ' %q' "${shown[@]}"
  printf '\n'
  echo ""

  local output
  local exit_code=0
  # `|| exit_code=$?` captures the status without tripping `set -e`.
  output=$("${cmd[@]}" 2>&1) || exit_code=$?

  echo "$output"
  echo ""

  # Check for expected patterns
  local result="UNKNOWN"
  if [ "$exit_code" -ne 0 ]; then
    result="FAILED (exit code: $exit_code)"
  elif printf '%s\n' "$output" | grep -qi -- "$expected_pattern"; then
    result="PASSED"
    echo -e "${GREEN}✓ Response contains expected pattern: '$expected_pattern'${NC}"
  else
    result="FAILED (pattern not found)"
    echo -e "${YELLOW}⚠ Response missing expected pattern: '$expected_pattern'${NC}"
  fi

  # Error indicators in the output override any earlier verdict.
  if printf '%s\n' "$output" | grep -qi "error\|failed\|infinite loop"; then
    result="FAILED (error detected)"
    echo -e "${RED}✗ Error detected in output${NC}"
  fi

  # Record result
  echo "$test_name | $provider | $result" >> "$RESULTS_FILE"
  echo ""
  echo -e "${BLUE}Result: $result${NC}"
  echo ""

  sleep 2 # Avoid rate limiting
}
# --- Pre-flight Checks ---
print_header "Performing Pre-flight Checks"
# Test Suite
if [ -z "$1" ]; then
echo "❌ Error: No AI provider specified."
echo "Usage: $0 <ollama|gemini|mock>"
exit 1
fi
PROVIDER=$1
echo "✅ Provider: $PROVIDER"
if [ ! -f "$Z3ED_BIN" ]; then
echo "❌ Error: z3ed binary not found at $Z3ED_BIN"
echo "Please build the project first (e.g., in build_test)."
exit 1
fi
echo "✅ z3ed binary found."
if [ ! -f "$ROM_PATH" ]; then
echo "❌ Error: ROM not found at $ROM_PATH"
exit 1
fi
echo "✅ ROM file found."
if [ "$PROVIDER" == "gemini" ] && [ -z "$GEMINI_API_KEY" ]; then
echo "❌ Error: GEMINI_API_KEY environment variable is not set."
echo "Please set it to your Gemini API key to run this test."
exit 1
fi
if [ "$PROVIDER" == "gemini" ]; then
echo "✅ GEMINI_API_KEY is set."
if [ "$OLLAMA_AVAILABLE" = true ]; then
echo ""
echo "=========================================="
echo " OLLAMA TESTS"
echo "=========================================="
echo ""
run_test "Ollama: Simple Question" "ollama" \
"What dungeons are in this ROM?" \
"dungeon\|palace\|castle"
run_test "Ollama: Sprite Query" "ollama" \
"What sprites are in room 0?" \
"sprite\|room"
run_test "Ollama: Tile Search" "ollama" \
"Where can I find trees in the overworld?" \
"tree\|0x02E\|map\|coordinate"
run_test "Ollama: Map Description" "ollama" \
"Describe overworld map 0" \
"light world\|map\|overworld"
run_test "Ollama: Warp List" "ollama" \
"List the warps in the Light World" \
"warp\|entrance\|exit"
fi
if [ "$PROVIDER" == "ollama" ]; then
if ! pgrep -x "Ollama" > /dev/null && ! pgrep -x "ollama" > /dev/null; then
echo "⚠️ Warning: Ollama server process not found. The script might fail if it's not running."
if [ "$GEMINI_AVAILABLE" = true ]; then
echo ""
echo "=========================================="
echo " GEMINI TESTS"
echo "=========================================="
echo ""
run_test "Gemini: Simple Question" "gemini" \
"What dungeons are in this ROM?" \
"dungeon\|palace\|castle" \
"--gemini_api_key=\"$GEMINI_API_KEY\""
run_test "Gemini: Sprite Query" "gemini" \
"What sprites are in room 0?" \
"sprite\|room" \
"--gemini_api_key=\"$GEMINI_API_KEY\""
run_test "Gemini: Tile Search" "gemini" \
"Where can I find trees in the overworld?" \
"tree\|0x02E\|map\|coordinate" \
"--gemini_api_key=\"$GEMINI_API_KEY\""
run_test "Gemini: Map Description" "gemini" \
"Describe overworld map 0" \
"light world\|map\|overworld" \
"--gemini_api_key=\"$GEMINI_API_KEY\""
run_test "Gemini: Warp List" "gemini" \
"List the warps in the Light World" \
"warp\|entrance\|exit" \
"--gemini_api_key=\"$GEMINI_API_KEY\""
fi
echo ""
echo "=========================================="
echo " TEST SUMMARY"
echo "=========================================="
echo ""
if [ -f "$RESULTS_FILE" ]; then
cat "$RESULTS_FILE"
echo ""
# Plain assignments on purpose: `local` is only valid inside a function.
# NOTE(review): this summary appears to run at script top level, where
# `local` fails and `set -e` would abort the script — confirm.
total=$(wc -l < "$RESULTS_FILE" | tr -d ' ')
# grep -c prints 0 itself when nothing matches but exits 1; `|| true` keeps
# `set -e` from aborting without appending a second "0" to the count.
passed=$(grep -c "PASSED" "$RESULTS_FILE" || true)
failed=$(grep -c "FAILED" "$RESULTS_FILE" || true)
echo "Total Tests: $total"
echo -e "${GREEN}Passed: $passed${NC}"
echo -e "${RED}Failed: $failed${NC}"
echo ""
if [ "$passed" -eq "$total" ]; then
echo -e "${GREEN}🎉 All tests passed!${NC}"
elif [ "$passed" -gt 0 ]; then
echo -e "${YELLOW}⚠ Some tests failed. Review output above.${NC}"
else
echo "✅ Ollama server process found."
echo -e "${RED}✗ All tests failed. Check configuration.${NC}"
fi
else
echo -e "${RED}✗ No results file generated${NC}"
fi
# --- Run Test Suite ---
for test_file in "${TEST_FILES[@]}"; do
print_header "Running Test File: $test_file (Provider: $PROVIDER)"
FULL_TEST_PATH="$TEST_DIR/$test_file"
if [ ! -f "$FULL_TEST_PATH" ]; then
echo "❌ Error: Test file not found: $FULL_TEST_PATH"
continue
fi
# Construct the command. Use --quiet for cleaner test logs.
COMMAND="$Z3ED_BIN agent simple-chat --file=$FULL_TEST_PATH --rom=$ROM_PATH --ai_provider=$PROVIDER --quiet"
echo "Executing command..."
echo "--- Agent Output for $test_file ---"
# Execute the command and print its output
eval $COMMAND
echo "--- Test Complete ---"
echo ""
done
print_header "✅ All tests completed successfully!"
echo ""
echo "=========================================="
echo " Recommendations"
echo "=========================================="
echo ""
echo "If tests are failing:"
echo " 1. Check that the ROM is valid and loaded properly"
echo " 2. Verify tool definitions in prompt_catalogue.yaml"
echo " 3. Review system prompts in prompt_builder.cc"
echo " 4. Check AI provider connectivity and quotas"
echo " 5. Examine tool execution logs for errors"
echo ""
echo "For Ollama:"
echo " - Try different models: ollama pull llama3:8b"
echo " - Adjust temperature in ollama_ai_service.cc"
echo ""
echo "For Gemini:"
echo " - Verify API key is valid"
echo " - Check quota at: https://aistudio.google.com"
echo ""
echo "Results saved to: $RESULTS_FILE"
echo ""