feat: Update function schemas and system prompt for enhanced agent capabilities
- Revised function schemas in `function_schemas.json` to streamline resource listing and searching, including new parameters for dungeon and overworld queries. - Introduced a new system prompt in `system_prompt_v3.txt` to improve the agent's proactive exploration and multi-tool chaining strategies. - Updated `GeminiAIService` to support the new prompt version and enhanced function calling logic for better tool integration. - Added tests for multimodal image analysis and error handling in `test_gemini_vision.cc` to ensure robust functionality.
This commit is contained in:
@@ -1,302 +1,166 @@
|
||||
[
|
||||
{
|
||||
"name": "resource-list",
|
||||
"description": "List all labeled resources of a specific type (dungeons, sprites, palettes)",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"type": {
|
||||
"type": "string",
|
||||
"description": "Resource type to list",
|
||||
"enum": [
|
||||
"dungeon",
|
||||
"room",
|
||||
"entrance",
|
||||
"overworld",
|
||||
"sprite",
|
||||
"palette",
|
||||
"item",
|
||||
"tile16",
|
||||
"all"
|
||||
]
|
||||
{
|
||||
"function_declarations": [
|
||||
{
|
||||
"name": "resource-list",
|
||||
"description": "List all resources of a specific type from the ROM (rooms, sprites, dungeons, entrances, items, overlords)",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"type": {
|
||||
"type": "string",
|
||||
"enum": ["room", "sprite", "dungeon", "entrance", "item", "overlord"],
|
||||
"description": "Type of resource to list"
|
||||
}
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"description": "Output format",
|
||||
"enum": ["json", "table", "text"],
|
||||
"default": "table"
|
||||
"required": ["type"]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "resource-search",
|
||||
"description": "Search for resources by name or pattern",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"type": {
|
||||
"type": "string",
|
||||
"enum": ["room", "sprite", "dungeon", "entrance", "item"],
|
||||
"description": "Type of resource to search"
|
||||
},
|
||||
"query": {
|
||||
"type": "string",
|
||||
"description": "Search query or pattern to match"
|
||||
}
|
||||
},
|
||||
"required": ["type", "query"]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "dungeon-list-sprites",
|
||||
"description": "List all sprites in a specific dungeon or room",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"dungeon": {
|
||||
"type": "string",
|
||||
"description": "Dungeon name (e.g., 'hyrule_castle', 'eastern_palace') or leave empty for all"
|
||||
},
|
||||
"room_id": {
|
||||
"type": "string",
|
||||
"description": "Specific room ID to query (optional)"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["type"]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "resource-search",
|
||||
"description": "Search labeled resources by name, ID, or partial match",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"query": {
|
||||
"type": "string",
|
||||
"description": "Search text (case-insensitive substring match)"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "dungeon-describe-room",
|
||||
"description": "Get detailed information about a specific dungeon room including sprites, chests, layout, and connections",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"room_id": {
|
||||
"type": "string",
|
||||
"description": "Room ID to describe (0-296)"
|
||||
},
|
||||
"include_sprites": {
|
||||
"type": "string",
|
||||
"enum": ["true", "false"],
|
||||
"description": "Include sprite information (default: true)"
|
||||
}
|
||||
},
|
||||
"type": {
|
||||
"type": "string",
|
||||
"description": "Optional resource category to filter",
|
||||
"enum": [
|
||||
"dungeon",
|
||||
"room",
|
||||
"entrance",
|
||||
"overworld",
|
||||
"sprite",
|
||||
"palette",
|
||||
"item",
|
||||
"tile16",
|
||||
"all"
|
||||
],
|
||||
"default": "all"
|
||||
"required": ["room_id"]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "overworld-find-tile",
|
||||
"description": "Find all locations where a specific tile16 ID appears in the overworld",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"tile_id": {
|
||||
"type": "string",
|
||||
"description": "Tile16 ID to search for (hex format like '0x42' or decimal)"
|
||||
},
|
||||
"map_id": {
|
||||
"type": "string",
|
||||
"description": "Specific overworld map to search (0-63, optional)"
|
||||
}
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"description": "Output format",
|
||||
"enum": ["json", "text"],
|
||||
"default": "json"
|
||||
}
|
||||
},
|
||||
"required": ["query"]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "dungeon-list-sprites",
|
||||
"description": "List all sprites in a specific dungeon room",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"room": {
|
||||
"type": "string",
|
||||
"description": "Room ID in hex format (e.g., 0x012)"
|
||||
"required": ["tile_id"]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "overworld-describe-map",
|
||||
"description": "Get detailed information about a specific overworld map including tile composition, warps, and sprites",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"map_id": {
|
||||
"type": "string",
|
||||
"description": "Overworld map ID (0-63 for light/dark world)"
|
||||
},
|
||||
"include_tiles": {
|
||||
"type": "string",
|
||||
"enum": ["true", "false"],
|
||||
"description": "Include tile distribution statistics"
|
||||
}
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["json", "text"],
|
||||
"default": "json"
|
||||
}
|
||||
},
|
||||
"required": ["room"]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "dungeon-describe-room",
|
||||
"description": "Summarize dungeon room metadata, hazards, and counts",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"room": {
|
||||
"type": "string",
|
||||
"description": "Room ID in hex format (e.g., 0x012)"
|
||||
"required": ["map_id"]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "overworld-list-warps",
|
||||
"description": "List all warp/entrance points for a specific overworld map",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"map_id": {
|
||||
"type": "string",
|
||||
"description": "Overworld map ID (0-63)"
|
||||
}
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["json", "text"],
|
||||
"default": "json"
|
||||
}
|
||||
},
|
||||
"required": ["room"]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "overworld-find-tile",
|
||||
"description": "Find all occurrences of a specific tile16 ID on overworld maps",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"tile": {
|
||||
"type": "string",
|
||||
"description": "Tile16 ID in hex format (e.g., 0x02E)"
|
||||
"required": ["map_id"]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "overworld-list-sprites",
|
||||
"description": "List all sprites placed on a specific overworld map",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"map_id": {
|
||||
"type": "string",
|
||||
"description": "Overworld map ID (0-63)"
|
||||
}
|
||||
},
|
||||
"map": {
|
||||
"type": "string",
|
||||
"description": "Optional: specific map ID to search (e.g., 0x05)"
|
||||
"required": ["map_id"]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "overworld-get-entrance",
|
||||
"description": "Get detailed information about a specific entrance/exit including destination and properties",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"entrance_id": {
|
||||
"type": "string",
|
||||
"description": "Entrance ID to query"
|
||||
}
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["json", "text"],
|
||||
"default": "json"
|
||||
}
|
||||
},
|
||||
"required": ["tile"]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "overworld-describe-map",
|
||||
"description": "Get summary information about an overworld map",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"map": {
|
||||
"type": "string",
|
||||
"description": "Map ID in hex format (e.g., 0x00)"
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["json", "text"],
|
||||
"default": "json"
|
||||
}
|
||||
},
|
||||
"required": ["map"]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "overworld-list-warps",
|
||||
"description": "List warp/entrance/exit points on the overworld",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"map": {
|
||||
"type": "string",
|
||||
"description": "Optional: filter by map ID"
|
||||
},
|
||||
"type": {
|
||||
"type": "string",
|
||||
"description": "Optional: filter by warp type",
|
||||
"enum": ["entrance", "exit", "hole", "all"],
|
||||
"default": "all"
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["json", "text"],
|
||||
"default": "json"
|
||||
"required": ["entrance_id"]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "overworld-tile-stats",
|
||||
"description": "Get statistical analysis of tile usage across overworld maps",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"map_id": {
|
||||
"type": "string",
|
||||
"description": "Specific map ID or 'all' for global statistics"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "message-list",
|
||||
"description": "List all in-game dialogue and text messages from the ROM",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"range": {
|
||||
"type": "string",
|
||||
"description": "Optional: limit to message ID range in format 'start-end' (e.g., '0-100')"
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["json", "text"],
|
||||
"default": "json"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "message-read",
|
||||
"description": "Read a specific message by its ID",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"id": {
|
||||
"type": "string",
|
||||
"description": "Message ID number (0-300+)"
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["json", "text"],
|
||||
"default": "json"
|
||||
}
|
||||
},
|
||||
"required": ["id"]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "message-search",
|
||||
"description": "Search for messages containing specific text",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"query": {
|
||||
"type": "string",
|
||||
"description": "Text to search for within message content (case-insensitive)"
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["json", "text"],
|
||||
"default": "json"
|
||||
}
|
||||
},
|
||||
"required": ["query"]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "overworld-list-sprites",
|
||||
"description": "List sprites on the overworld with optional filters for map, world, or sprite ID",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"map": {
|
||||
"type": "string",
|
||||
"description": "Optional: filter by map ID (0x00-0x9F)"
|
||||
},
|
||||
"world": {
|
||||
"type": "string",
|
||||
"description": "Optional: filter by world (0=light, 1=dark, 2=special)"
|
||||
},
|
||||
"sprite_id": {
|
||||
"type": "string",
|
||||
"description": "Optional: filter by sprite ID (0x00-0xFF)"
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["json", "text"],
|
||||
"default": "json"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "overworld-get-entrance",
|
||||
"description": "Get detailed information about a specific overworld entrance by its ID",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"entrance_id": {
|
||||
"type": "string",
|
||||
"description": "Entrance ID number (0-128)"
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["json", "text"],
|
||||
"default": "json"
|
||||
}
|
||||
},
|
||||
"required": ["entrance_id"]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "overworld-tile-stats",
|
||||
"description": "Analyze usage statistics for a specific tile16 ID across the overworld",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"tile_id": {
|
||||
"type": "string",
|
||||
"description": "Tile16 ID to analyze (0x0000-0xFFFF, hex or decimal)"
|
||||
},
|
||||
"map": {
|
||||
"type": "string",
|
||||
"description": "Optional: limit analysis to specific map ID"
|
||||
},
|
||||
"world": {
|
||||
"type": "string",
|
||||
"description": "Optional: limit analysis to specific world (0=light, 1=dark, 2=special)"
|
||||
},
|
||||
"format": {
|
||||
"type": "string",
|
||||
"enum": ["json", "text"],
|
||||
"default": "json"
|
||||
}
|
||||
},
|
||||
"required": ["tile_id"]
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
]
|
||||
}
|
||||
207
assets/agent/system_prompt_v3.txt
Normal file
207
assets/agent/system_prompt_v3.txt
Normal file
@@ -0,0 +1,207 @@
|
||||
You are an expert ROM analysis assistant for **yaze** (Yet Another Zelda3 Editor), a modern cross-platform editor for The Legend of Zelda: A Link to the Past ROM hacking.
|
||||
|
||||
# Core Mission: PROACTIVE EXPLORATION
|
||||
|
||||
You are not a passive question-answerer. You are an intelligent ROM exploration partner who:
|
||||
1. **Anticipates needs**: When users ask questions, infer what they actually want to know
|
||||
2. **Chains tools intelligently**: Use multiple tools in one turn to provide complete answers
|
||||
3. **Iterates implicitly**: Don't wait for follow-up questions - provide comprehensive information upfront
|
||||
|
||||
# Tool Calling Strategy
|
||||
|
||||
## CRITICAL PRINCIPLE: Minimize Back-and-Forth
|
||||
|
||||
When a user asks a question:
|
||||
|
||||
### ❌ BAD (Reactive Approach):
|
||||
User: "What's in room 5?"
|
||||
You: Call `resource-list` → Get room list → Tell user "Room 5 exists"
|
||||
User: "What sprites are in it?" ← WASTED TURN!
|
||||
You: Call `dungeon-describe-room` → Give sprite list
|
||||
|
||||
### ✅ GOOD (Proactive Approach):
|
||||
User: "What's in room 5?"
|
||||
You: Call BOTH:
|
||||
- `dungeon-describe-room` with room_id=5
|
||||
- `resource-list` with type=sprite (to get sprite labels)
|
||||
You: "Room 5 contains 3 Stalfos (sprite 8), 2 Eyegores (sprite 12), has blue floor tiles, 2 chests with small key and compass, and connects to rooms 3 and 7."
|
||||
|
||||
## Multi-Tool Chaining Patterns
|
||||
|
||||
### Pattern 1: List + Detail
|
||||
When user asks about "what" exists:
|
||||
1. Get list of IDs with `resource-list`
|
||||
2. Get details for relevant items with describe/search commands
|
||||
3. Provide comprehensive summary
|
||||
|
||||
Example:
|
||||
```json
|
||||
{
|
||||
"tool_calls": [
|
||||
{"tool_name": "resource-list", "args": {"type": "dungeon"}},
|
||||
{"tool_name": "dungeon-list-sprites", "args": {"dungeon": "hyrule_castle"}}
|
||||
],
|
||||
"reasoning": "Getting dungeon list AND sprites for first dungeon to provide complete answer"
|
||||
}
|
||||
```
|
||||
|
||||
### Pattern 2: Search + Context
|
||||
When user asks "where" something is:
|
||||
1. Search for the item with `resource-search` or find commands
|
||||
2. Get surrounding context (neighboring rooms, map info, etc.)
|
||||
3. Explain significance
|
||||
|
||||
Example:
|
||||
```json
|
||||
{
|
||||
"tool_calls": [
|
||||
{"tool_name": "overworld-find-tile", "args": {"tile_id": "0x42"}},
|
||||
{"tool_name": "overworld-describe-map", "args": {"map_id": "0"}}
|
||||
],
|
||||
"reasoning": "Finding tile locations AND getting map context to explain where it appears"
|
||||
}
|
||||
```
|
||||
|
||||
### Pattern 3: Describe + Related
|
||||
When user asks about a specific thing:
|
||||
1. Get direct information
|
||||
2. Get related items (sprites in room, warps from location, etc.)
|
||||
3. Provide holistic view
|
||||
|
||||
Example:
|
||||
```json
|
||||
{
|
||||
"tool_calls": [
|
||||
{"tool_name": "dungeon-describe-room", "args": {"room_id": "5"}},
|
||||
{"tool_name": "overworld-list-warps", "args": {"map_id": "0"}},
|
||||
{"tool_name": "resource-list", "args": {"type": "sprite"}}
|
||||
],
|
||||
"reasoning": "Getting room details, checking warps that lead there, and sprite labels for complete context"
|
||||
}
|
||||
```
|
||||
|
||||
## CRITICAL RULES
|
||||
|
||||
1. **NEVER call the same tool twice with identical arguments**
|
||||
- Use tool call deduplication
|
||||
- If you need the same data, reference previous results
|
||||
|
||||
2. **NEVER send empty text_response after receiving [TOOL RESULT]**
|
||||
- You MUST synthesize tool results into a natural language answer
|
||||
- Include context, implications, and additional insights
|
||||
|
||||
3. **ALWAYS provide text_response after tools return**
|
||||
- Don't say "Let me check..." and call more tools
|
||||
- Synthesize ALL available information into one comprehensive answer
|
||||
|
||||
4. **Anticipate follow-up questions**
|
||||
- If user asks "What enemies are in Eastern Palace?", they likely also want to know:
|
||||
- How many total enemies
|
||||
- Which rooms have the most
|
||||
- Any rare or boss sprites
|
||||
- Provide this information proactively
|
||||
|
||||
5. **Use domain knowledge**
|
||||
- You know Zelda 3 ROM structure
|
||||
- Reference game terms: "This is a soldier sprite", not just "sprite ID 8"
|
||||
- Explain significance: "This room is the entrance hall" vs "Room 0"
|
||||
|
||||
# Output Format
|
||||
|
||||
You MUST respond with ONLY a JSON object:
|
||||
|
||||
```json
|
||||
{
|
||||
"text_response": "string (your natural language reply, REQUIRED after tool results)",
|
||||
"tool_calls": [{"tool_name": "string", "args": {"key": "value"}}],
|
||||
"commands": ["string array of z3ed commands for ROM modifications"],
|
||||
"reasoning": "string (your thought process, explain tool choices)"
|
||||
}
|
||||
```
|
||||
|
||||
**CRITICAL**: The field name is `"text_response"` (NOT "response", NOT "answer")
|
||||
|
||||
# Complete Workflow Example
|
||||
|
||||
User: "What enemies are in the Eastern Palace?"
|
||||
|
||||
## Your First Response (Tool Gathering):
|
||||
```json
|
||||
{
|
||||
"text_response": "Let me analyze the Eastern Palace enemy layout...",
|
||||
"tool_calls": [
|
||||
{"tool_name": "resource-list", "args": {"type": "dungeon"}},
|
||||
{"tool_name": "dungeon-list-sprites", "args": {"dungeon": "eastern_palace"}},
|
||||
{"tool_name": "resource-list", "args": {"type": "sprite"}}
|
||||
],
|
||||
"reasoning": "Need dungeon mapping, sprite placements, and sprite labels to provide comprehensive enemy analysis"
|
||||
}
|
||||
```
|
||||
|
||||
## After Receiving [TOOL RESULT]:
|
||||
```json
|
||||
{
|
||||
"text_response": "The Eastern Palace contains a diverse enemy roster:\n\n**Primary Enemies:**\n- Stalfos (Sprite 8): 12 total across rooms 1, 3, 5, 7, 9\n- Eyegore (Sprite 12): 6 total in rooms 5, 9, 11\n- Ball & Chain Trooper (Sprite 14): 3 in room 10\n\n**Mini-Boss Area:**\n- Giant Stalfos (Sprite 144) in room 8\n\n**Total Enemy Count:** 22 sprites across 11 rooms\n\n**Design Notes:** The palace uses a difficulty curve, starting with basic Stalfos in early rooms and introducing tougher Eyegores in the middle section. Room 10's Ball & Chain Troopers guard the boss key.",
|
||||
"tool_calls": [],
|
||||
"reasoning": "Synthesized sprite placement data, identified patterns, provided context about game design and difficulty progression"
|
||||
}
|
||||
```
|
||||
|
||||
# Learn Command Integration
|
||||
|
||||
When users mention preferences or patterns, you can suggest using the learn command:
|
||||
- "I always use palette 2 for outdoor tiles" → Suggest: `z3ed agent learn --preference "outdoor_palette=2"`
|
||||
- "I keep asking about room 5" → Suggest: `z3ed agent learn --topic "room_5_analysis"`
|
||||
|
||||
# When to Use Commands vs Tools
|
||||
|
||||
- **Tools** are READ-ONLY: They inspect ROM data
|
||||
- **Commands** are WRITE operations: They modify the ROM
|
||||
|
||||
Only suggest commands when user explicitly requests changes like:
|
||||
- "Change the palette to..."
|
||||
- "Place a sprite at..."
|
||||
- "Modify room layout..."
|
||||
|
||||
For inspection questions, ONLY use tools.
|
||||
|
||||
# Error Prevention
|
||||
|
||||
1. **Always validate tool results before answering**
|
||||
- Check if data is empty or malformed
|
||||
- Explain if information is unavailable
|
||||
|
||||
2. **Provide actionable next steps**
|
||||
- "Room 5 has no sprites. Would you like to add some?"
|
||||
- "Tile 0x42 doesn't exist in this map. Did you mean 0x24?"
|
||||
|
||||
3. **Explain ROM limitations**
|
||||
- "Zelda 3 vanilla has 296 rooms. Custom ROMs may have more."
|
||||
- "Sprite slots per room are limited to 16 in vanilla."
|
||||
|
||||
# Domain Knowledge
|
||||
|
||||
You understand:
|
||||
- **Dungeon structure**: Rooms, sprites, chests, bosses, keys
|
||||
- **Overworld layout**: 64 maps in light/dark world, tile16 system
|
||||
- **Sprite system**: IDs, behaviors, graphics, palettes
|
||||
- **Entrance/warp system**: How rooms connect
|
||||
- **Tile system**: Four Tile8 (8x8) tiles compose one Tile16 (16x16) tile
|
||||
|
||||
Use this knowledge to provide insightful, contextual answers that go beyond raw data.
|
||||
|
||||
# Response Quality Standards
|
||||
|
||||
GOOD response characteristics:
|
||||
- ✅ Comprehensive: Answers the question AND related context
|
||||
- ✅ Structured: Uses headers, lists, formatting for readability
|
||||
- ✅ Actionable: Provides next steps or suggestions
|
||||
- ✅ Insightful: Explains WHY, not just WHAT
|
||||
|
||||
BAD response characteristics:
|
||||
- ❌ Terse: "Room 5 has 3 sprites."
|
||||
- ❌ Incomplete: Missing context or related information
|
||||
- ❌ Vague: "Some enemies are in that room."
|
||||
- ❌ Passive: Waiting for user to ask follow-up questions
|
||||
|
||||
Remember: Your goal is to be the BEST ROM exploration assistant possible. Think ahead, chain tools intelligently, and provide comprehensive insights that save users time and mental effort.
|
||||
@@ -70,6 +70,7 @@ set(YAZE_AGENT_SOURCES
|
||||
cli/service/agent/conversational_agent_service.cc
|
||||
cli/service/agent/simple_chat_session.cc
|
||||
cli/service/agent/tool_dispatcher.cc
|
||||
cli/service/agent/learned_knowledge_service.cc
|
||||
cli/service/ai/ai_service.cc
|
||||
cli/service/ai/ollama_ai_service.cc
|
||||
cli/service/ai/prompt_builder.cc
|
||||
|
||||
@@ -77,9 +77,14 @@ GeminiAIService::GeminiAIService(const GeminiConfig& config)
|
||||
}
|
||||
|
||||
// Try to load version-specific system prompt file
|
||||
std::string prompt_file = config_.prompt_version == "v2"
|
||||
? "assets/agent/system_prompt_v2.txt"
|
||||
: "assets/agent/system_prompt.txt";
|
||||
std::string prompt_file;
|
||||
if (config_.prompt_version == "v3") {
|
||||
prompt_file = "assets/agent/system_prompt_v3.txt";
|
||||
} else if (config_.prompt_version == "v2") {
|
||||
prompt_file = "assets/agent/system_prompt_v2.txt";
|
||||
} else {
|
||||
prompt_file = "assets/agent/system_prompt.txt";
|
||||
}
|
||||
|
||||
std::vector<std::string> search_paths = {
|
||||
prompt_file,
|
||||
@@ -135,9 +140,15 @@ std::vector<std::string> GeminiAIService::GetAvailableTools() const {
|
||||
|
||||
std::string GeminiAIService::BuildFunctionCallSchemas() {
|
||||
#ifndef YAZE_WITH_JSON
|
||||
return "[]"; // Empty array if JSON not available
|
||||
return "{}"; // Empty object if JSON not available
|
||||
#else
|
||||
// Search for function_schemas.json in multiple locations
|
||||
// Use the prompt builder's schema generation which reads from prompt_catalogue.yaml
|
||||
std::string schemas = prompt_builder_.BuildFunctionCallSchemas();
|
||||
if (!schemas.empty() && schemas != "[]") {
|
||||
return schemas;
|
||||
}
|
||||
|
||||
// Fallback: Search for function_schemas.json
|
||||
const std::vector<std::string> search_paths = {
|
||||
"assets/agent/function_schemas.json",
|
||||
"../assets/agent/function_schemas.json",
|
||||
@@ -337,10 +348,30 @@ absl::StatusOr<AgentResponse> GeminiAIService::GenerateResponse(
|
||||
// Add function calling tools if enabled
|
||||
if (function_calling_enabled_) {
|
||||
try {
|
||||
nlohmann::json tools = nlohmann::json::parse(BuildFunctionCallSchemas());
|
||||
request_body["tools"] = {{
|
||||
{"function_declarations", tools}
|
||||
}};
|
||||
std::string schemas_str = BuildFunctionCallSchemas();
|
||||
if (config_.verbose) {
|
||||
std::cerr << "[DEBUG] Function calling schemas: " << schemas_str.substr(0, 200) << "..." << std::endl;
|
||||
}
|
||||
|
||||
nlohmann::json schemas = nlohmann::json::parse(schemas_str);
|
||||
|
||||
// Build tools array - schemas might be an array of tools or a function_declarations object
|
||||
if (schemas.is_array()) {
|
||||
// If it's already an array of tools, use it directly
|
||||
request_body["tools"] = {{
|
||||
{"function_declarations", schemas}
|
||||
}};
|
||||
} else if (schemas.is_object() && schemas.contains("function_declarations")) {
|
||||
// If it's a wrapper object with function_declarations
|
||||
request_body["tools"] = {{
|
||||
{"function_declarations", schemas["function_declarations"]}
|
||||
}};
|
||||
} else {
|
||||
// Treat as single tool object
|
||||
request_body["tools"] = {{
|
||||
{"function_declarations", nlohmann::json::array({schemas})}
|
||||
}};
|
||||
}
|
||||
} catch (const nlohmann::json::exception& e) {
|
||||
std::cerr << "⚠️ Failed to parse function schemas: " << e.what() << std::endl;
|
||||
}
|
||||
|
||||
@@ -19,8 +19,8 @@ struct GeminiConfig {
|
||||
int max_output_tokens = 2048;
|
||||
mutable std::string system_instruction; // Mutable to allow lazy initialization
|
||||
bool use_enhanced_prompting = true; // Enable few-shot examples
|
||||
bool use_function_calling = false; // Use native Gemini function calling
|
||||
std::string prompt_version = "default"; // Which prompt file to use (default, v2, etc.)
|
||||
bool use_function_calling = true; // Use native Gemini function calling (enabled by default for 2.0+)
|
||||
std::string prompt_version = "v3"; // Which prompt file to use (default, v2, v3, etc.)
|
||||
bool verbose = false; // Enable debug logging
|
||||
|
||||
GeminiConfig() = default;
|
||||
|
||||
251
test/multimodal/test_gemini_vision.cc
Normal file
251
test/multimodal/test_gemini_vision.cc
Normal file
@@ -0,0 +1,251 @@
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "absl/strings/str_cat.h"
|
||||
#include "cli/service/ai/gemini_ai_service.h"
|
||||
|
||||
#ifdef YAZE_WITH_GRPC
|
||||
#include "app/core/service/screenshot_utils.h"
|
||||
#endif
|
||||
|
||||
namespace yaze {
|
||||
namespace test {
|
||||
|
||||
class GeminiVisionTest : public ::testing::Test {
|
||||
protected:
|
||||
void SetUp() override {
|
||||
// Check if GEMINI_API_KEY is set
|
||||
const char* api_key = std::getenv("GEMINI_API_KEY");
|
||||
if (!api_key || std::string(api_key).empty()) {
|
||||
GTEST_SKIP() << "GEMINI_API_KEY not set. Skipping multimodal tests.";
|
||||
}
|
||||
|
||||
api_key_ = api_key;
|
||||
|
||||
// Create test data directory
|
||||
test_dir_ = std::filesystem::temp_directory_path() / "yaze_multimodal_test";
|
||||
std::filesystem::create_directories(test_dir_);
|
||||
}
|
||||
|
||||
void TearDown() override {
|
||||
// Clean up test directory
|
||||
if (std::filesystem::exists(test_dir_)) {
|
||||
std::filesystem::remove_all(test_dir_);
|
||||
}
|
||||
}
|
||||
|
||||
// Helper: Create a simple test image (16x16 PNG)
|
||||
std::filesystem::path CreateTestImage() {
|
||||
auto image_path = test_dir_ / "test_image.png";
|
||||
|
||||
// Create a minimal PNG file (16x16 red square)
|
||||
// PNG signature + IHDR + IDAT + IEND
|
||||
const unsigned char png_data[] = {
|
||||
// PNG signature
|
||||
0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A,
|
||||
// IHDR chunk
|
||||
0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52,
|
||||
0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x10,
|
||||
0x08, 0x02, 0x00, 0x00, 0x00, 0x90, 0x91, 0x68,
|
||||
0x36,
|
||||
// IDAT chunk (minimal data)
|
||||
0x00, 0x00, 0x00, 0x0C, 0x49, 0x44, 0x41, 0x54,
|
||||
0x08, 0x99, 0x63, 0xF8, 0xCF, 0xC0, 0x00, 0x00,
|
||||
0x03, 0x01, 0x01, 0x00, 0x18, 0xDD, 0x8D, 0xB4,
|
||||
// IEND chunk
|
||||
0x00, 0x00, 0x00, 0x00, 0x49, 0x45, 0x4E, 0x44,
|
||||
0xAE, 0x42, 0x60, 0x82
|
||||
};
|
||||
|
||||
std::ofstream file(image_path, std::ios::binary);
|
||||
file.write(reinterpret_cast<const char*>(png_data), sizeof(png_data));
|
||||
file.close();
|
||||
|
||||
return image_path;
|
||||
}
|
||||
|
||||
std::string api_key_;
|
||||
std::filesystem::path test_dir_;
|
||||
};
|
||||
|
||||
// Smoke test: a single image + prompt round trip must succeed and yield a
// non-empty textual reply.
TEST_F(GeminiVisionTest, BasicImageAnalysis) {
  cli::GeminiConfig gemini_config;
  gemini_config.api_key = api_key_;
  gemini_config.model = "gemini-2.0-flash-exp";  // Vision-capable model
  gemini_config.verbose = false;

  cli::GeminiAIService vision_service(gemini_config);

  // The fixture writes the image to disk; make sure it is really there before
  // spending an API call on it.
  const auto test_image = CreateTestImage();
  ASSERT_TRUE(std::filesystem::exists(test_image));

  // Issue the multimodal request.
  const std::string image_file = test_image.string();
  auto result = vision_service.GenerateMultimodalResponse(
      image_file, "Describe this image in one sentence.");

  ASSERT_TRUE(result.ok()) << result.status().message();

  const auto& reply_text = result->text_response;
  EXPECT_FALSE(reply_text.empty());

  std::cout << "Vision API response: " << reply_text << std::endl;
}
|
||||
|
||||
// Asks a targeted question about the test image and checks that the reply
// names the expected color (case-insensitively).
TEST_F(GeminiVisionTest, ImageWithSpecificPrompt) {
  cli::GeminiConfig config;
  config.api_key = api_key_;
  config.model = "gemini-2.0-flash-exp";
  config.verbose = false;

  cli::GeminiAIService service(config);

  auto image_path = CreateTestImage();

  // Ask specific question about the image
  auto response = service.GenerateMultimodalResponse(
      image_path.string(),
      "What color is the dominant color in this image? Answer with just the color name.");

  ASSERT_TRUE(response.ok()) << response.status().message();
  EXPECT_FALSE(response->text_response.empty());

  // Lower-case a copy of the reply for a case-insensitive match. The character
  // is routed through unsigned char because passing a negative char (e.g. a
  // UTF-8 byte from the model's reply) to tolower is undefined behavior.
  std::string response_lower = response->text_response;
  std::transform(response_lower.begin(), response_lower.end(),
                 response_lower.begin(), [](unsigned char c) {
                   return static_cast<char>(::tolower(c));
                 });

  // The test image is solid red; "pink" is accepted as a near miss.
  EXPECT_TRUE(response_lower.find("red") != std::string::npos ||
              response_lower.find("pink") != std::string::npos)
      << "Expected color 'red' or 'pink' in response: " << response->text_response;
}
|
||||
|
||||
// Error path: a request that points at a missing image file must fail with
// either a NotFound or an Internal status.
TEST_F(GeminiVisionTest, InvalidImagePath) {
  cli::GeminiConfig gemini_config;
  gemini_config.api_key = api_key_;
  gemini_config.model = "gemini-2.0-flash-exp";

  cli::GeminiAIService vision_service(gemini_config);

  // This path is not expected to exist on any machine running the test.
  auto result = vision_service.GenerateMultimodalResponse(
      "/nonexistent/image.png", "Describe this image.");

  EXPECT_FALSE(result.ok());

  const auto& failure = result.status();
  const bool has_expected_code =
      absl::IsNotFound(failure) || absl::IsInternal(failure);
  EXPECT_TRUE(has_expected_code);
}
|
||||
|
||||
#ifdef YAZE_WITH_GRPC
|
||||
// Integration test with screenshot capture
|
||||
// End-to-end check: capture a screenshot through the test harness and have the
// model describe it.
TEST_F(GeminiVisionTest, ScreenshotCaptureIntegration) {
  // Note: This test requires a running YAZE instance with gRPC test harness.
  // When the capture cannot be made the test is skipped, not failed.

  cli::GeminiConfig gemini_config;
  gemini_config.api_key = api_key_;
  gemini_config.model = "gemini-2.0-flash-exp";
  gemini_config.verbose = false;

  cli::GeminiAIService vision_service(gemini_config);

  // Try to grab a screenshot into the per-test scratch directory.
  const std::string requested_path = (test_dir_ / "screenshot.png").string();
  auto capture = yaze::test::CaptureHarnessScreenshot(requested_path);

  if (!capture.ok()) {
    GTEST_SKIP() << "Screenshot capture failed (YAZE may not be running): "
                 << capture.status().message();
  }

  // Hand the captured file to the vision model.
  const std::string captured_file = capture->file_path.string();
  auto result = vision_service.GenerateMultimodalResponse(
      captured_file,
      "What UI elements are visible in this screenshot? List them.");

  ASSERT_TRUE(result.ok()) << result.status().message();

  const auto& reply_text = result->text_response;
  EXPECT_FALSE(reply_text.empty());

  std::cout << "Screenshot analysis: " << reply_text << std::endl;
}
|
||||
#endif
|
||||
|
||||
// Sequential requests test (checks that repeated calls on one service instance succeed; no timing is measured)
|
||||
// Sends several back-to-back requests through one service instance; each one
// must succeed and return non-empty text. Stops at the first failed request.
TEST_F(GeminiVisionTest, MultipleRequestsSequential) {
  cli::GeminiConfig gemini_config;
  gemini_config.api_key = api_key_;
  gemini_config.model = "gemini-2.0-flash-exp";
  gemini_config.verbose = false;

  cli::GeminiAIService vision_service(gemini_config);

  const auto test_image = CreateTestImage();

  // Three requests, numbered from 1 in both the prompt and failure messages.
  const int last_request = 3;
  for (int request_number = 1; request_number <= last_request;
       ++request_number) {
    const std::string prompt = absl::StrCat("Request ", request_number,
                                            ": Describe this image briefly.");
    auto result =
        vision_service.GenerateMultimodalResponse(test_image.string(), prompt);

    ASSERT_TRUE(result.ok()) << "Request " << request_number
                             << " failed: " << result.status().message();
    EXPECT_FALSE(result->text_response.empty());
  }
}
|
||||
|
||||
// Rate limiting test (should handle gracefully)
|
||||
// Fires ten requests in quick succession. Being rate limited on some of them
// is tolerated and merely reported; the test only requires that at least one
// request succeeds. Failures that are neither successes nor rate limits are
// not counted.
TEST_F(GeminiVisionTest, RateLimitHandling) {
  cli::GeminiConfig gemini_config;
  gemini_config.api_key = api_key_;
  gemini_config.model = "gemini-2.0-flash-exp";
  gemini_config.verbose = false;

  cli::GeminiAIService vision_service(gemini_config);

  const auto test_image = CreateTestImage();

  int ok_count = 0;
  int throttled_count = 0;

  // Make many rapid requests (may hit rate limit)
  int attempt = 0;
  while (attempt < 10) {
    ++attempt;

    auto result = vision_service.GenerateMultimodalResponse(
        test_image.string(), "Describe this image.");

    if (result.ok()) {
      ++ok_count;
      continue;
    }

    // A rate limit shows up either as a ResourceExhausted status or as an
    // error message that mentions HTTP 429.
    const bool throttled =
        absl::IsResourceExhausted(result.status()) ||
        result.status().message().find("429") != std::string::npos;
    if (throttled) {
      ++throttled_count;
    }
  }

  // At least some requests should succeed
  EXPECT_GT(ok_count, 0) << "No successful requests out of 10";

  // Hitting the rate limit is expected behavior, so it is only logged.
  if (throttled_count > 0) {
    std::cout << "Note: Hit rate limit on " << throttled_count << " out of 10 requests (expected)" << std::endl;
  }
}
|
||||
|
||||
} // namespace test
|
||||
} // namespace yaze
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
|
||||
std::cout << "\n=== Gemini Multimodal Vision Tests ===" << std::endl;
|
||||
std::cout << "These tests require GEMINI_API_KEY environment variable." << std::endl;
|
||||
std::cout << "Tests will be skipped if API key is not available.\n" << std::endl;
|
||||
|
||||
return RUN_ALL_TESTS();
|
||||
}
|
||||
Reference in New Issue
Block a user