diff --git a/assets/agent/gui_automation_instructions.txt b/assets/agent/gui_automation_instructions.txt new file mode 100644 index 00000000..83b1db82 --- /dev/null +++ b/assets/agent/gui_automation_instructions.txt @@ -0,0 +1,216 @@ +# GUI Automation with YAZE Test Harness + +## Overview +You have the ability to control the YAZE GUI directly through a test harness system. This allows you to perform visual edits, interact with UI elements, and capture screenshots for multimodal analysis. + +## Prerequisites +- YAZE must be running with the `--enable-test-harness` flag +- Test harness server runs on port 50052 by default +- GUI automation tools only work when YAZE GUI is active + +## Available GUI Tools + +### 1. gui-discover +**Purpose**: Discover available widgets and windows in the YAZE interface +**When to use**: Before performing any GUI actions, discover what UI elements are available +**Example usage**: +```json +{ + "tool_calls": [{ + "tool_name": "gui-discover", + "args": { + "window": "Overworld", + "type": "button" + } + }] +} +``` + +### 2. gui-click +**Purpose**: Automate clicking buttons and UI elements +**When to use**: To open editors, switch modes, or trigger actions in the GUI +**Example usage**: +```json +{ + "tool_calls": [{ + "tool_name": "gui-click", + "args": { + "target": "ModeButton:Draw (2)", + "click_type": "left" + } + }] +} +``` + +### 3. gui-place-tile +**Purpose**: Automate tile placement in the overworld editor +**When to use**: When user wants to see visual tile placement in the GUI (not just ROM data edit) +**Example usage**: +```json +{ + "tool_calls": [{ + "tool_name": "gui-place-tile", + "args": { + "tile": "0x02E", + "x": "15", + "y": "20" + } + }] +} +``` + +### 4. 
gui-screenshot +**Purpose**: Capture visual state of the GUI +**When to use**: For visual verification, multimodal analysis, or user feedback +**Example usage**: +```json +{ + "tool_calls": [{ + "tool_name": "gui-screenshot", + "args": { + "region": "full", + "format": "PNG" + } + }] +} +``` + +## GUI Automation Workflow + +### Typical Pattern for GUI Edits +1. **Discover** - Find available widgets with `gui-discover` +2. **Navigate** - Use `gui-click` to open the right editor or switch modes +3. **Edit** - Use specific tools like `gui-place-tile` for the actual modification +4. **Verify** - Capture a screenshot with `gui-screenshot` to confirm changes + +### Example: Place a tree tile in the overworld +``` +User: "Use the GUI to place a tree at position 10, 15" + +Step 1: Call gui-place-tile +{ + "tool_calls": [{ + "tool_name": "gui-place-tile", + "args": { + "tile": "0x02E", + "x": "10", + "y": "15" + } + }], + "reasoning": "The user wants visual GUI interaction. Tree tile is 0x02E." +} + +Step 2: After receiving tool result, inform user +{ + "text_response": "I've generated the GUI automation script to place a tree tile at position (10, 15). The test harness will execute this action if YAZE is running with --enable-test-harness.", + "reasoning": "Tool call succeeded, provide confirmation to user." +} +``` + +## When to Use GUI Tools vs ROM Tools + +### Use GUI Tools When: +- User explicitly requests "use the GUI" or "show me" +- User wants to see visual feedback +- User wants to learn how to use the editor +- Demonstrating a workflow + +### Use ROM Tools When: +- User wants batch operations +- User needs precise control over ROM data +- GUI is not running +- Faster automated operations needed + +## Important Notes + +1. **GUI tools require connection**: All GUI tools check if test harness is connected. If not, they return mock responses. + +2. **Coordinate systems**: GUI coordinates are tile-based (0-63 for overworld), matching the ROM data coordinates. + +3. 
**Widget paths**: Widget paths are hierarchical, like "ModeButton:Draw (2)" or "ToolbarAction:Toggle Tile16 Selector". Use `gui-discover` to find exact paths. + +4. **Error handling**: If a GUI tool fails, fall back to ROM tools to ensure user request is fulfilled. + +5. **Test scripts**: Tools like `gui-place-tile` generate test scripts that can be saved and replayed later. + +## Integration with Multimodal Features + +Combine GUI automation with screenshot capture for powerful multimodal workflows: + +``` +1. Capture before state: gui-screenshot +2. Perform edit: gui-place-tile +3. Capture after state: gui-screenshot +4. Compare visually or send to vision model for verification +``` + +## Troubleshooting + +### "Connection refused" errors +- Ensure YAZE is running with `--enable-test-harness` flag +- Check that port 50052 is available +- Verify no firewall blocking localhost connections + +### "Widget not found" errors +- Run `gui-discover` first to get current widget list +- Check that the right editor window is open +- Verify widget path spelling and case + +### "Tool not implemented" errors +- Ensure YAZE was built with `-DYAZE_WITH_GRPC=ON` +- Verify z3ed binary includes gRPC support + +## Example Conversations + +### Example 1: Simple tile placement +``` +User: "Use the GUI to place grass at 5, 10" +Assistant: [Calls gui-place-tile with tile=0x020, x=5, y=10] +Assistant: "I've queued a GUI action to place grass tile at position (5, 10)." +``` + +### Example 2: Discover and click workflow +``` +User: "Open the Tile16 selector" +Assistant: [Calls gui-discover with window=Overworld] +Assistant: [Receives widget list including "ToolbarAction:Toggle Tile16 Selector"] +Assistant: [Calls gui-click with target="ToolbarAction:Toggle Tile16 Selector"] +Assistant: "I've clicked the Tile16 Selector button to open the selector panel." 
+``` + +### Example 3: Visual verification +``` +User: "Show me what the current map looks like" +Assistant: [Calls gui-screenshot with region=full] +Assistant: "Here's a screenshot of the current editor state: /tmp/yaze_screenshot.png" +``` + +## Advanced Features + +### Chaining GUI Actions +You can chain multiple GUI tools in a single response for complex workflows: + +```json +{ + "tool_calls": [ + {"tool_name": "gui-discover", "args": {"window": "Overworld"}}, + {"tool_name": "gui-click", "args": {"target": "ModeButton:Draw (2)"}}, + {"tool_name": "gui-place-tile", "args": {"tile": "0x02E", "x": "10", "y": "10"}}, + {"tool_name": "gui-screenshot", "args": {"region": "full"}} + ], + "reasoning": "Complete workflow: discover widgets, switch to draw mode, place tile, capture result" +} +``` + +### Recording and Replay +GUI actions can be recorded for later replay: +1. Actions are logged as test scripts +2. Scripts can be saved to YAML/JSON files +3. Replay with `z3ed agent test replay <script-file>` + +## Summary + +GUI automation tools extend your capabilities beyond ROM data manipulation to include visual, interactive editing workflows. Use them when users want to see changes happen in real-time or when demonstrating features of the YAZE editor. + +Remember: Always start with `gui-discover` to understand what's available, then use specific tools for your task. + diff --git a/assets/agent/prompt_catalogue.yaml b/assets/agent/prompt_catalogue.yaml index c9b65c05..33b07055 100644 --- a/assets/agent/prompt_catalogue.yaml +++ b/assets/agent/prompt_catalogue.yaml @@ -130,6 +130,58 @@ tools: description: "Response format (json or table). Defaults to JSON if omitted." required: false example: json + - name: gui-place-tile + description: "Generate GUI automation script to place a tile in the overworld editor using mouse interactions." + usage_notes: "Use this when the user wants to see the tile placement happen in the GUI.
Generates a test script that can be executed with agent test execute. Only works when YAZE GUI is running with --enable-test-harness flag." + arguments: + - name: tile + description: "Tile16 ID to place (accepts hex or decimal)." + required: true + example: 0x02E + - name: x + description: "X coordinate in the overworld map (0-63)." + required: true + example: 10 + - name: y + description: "Y coordinate in the overworld map (0-63)." + required: true + example: 20 + - name: gui-click + description: "Generate GUI automation script to click a button or widget in the YAZE interface." + usage_notes: "Use this to automate GUI interactions like opening editors, clicking toolbar buttons, or selecting tiles. Requires widget path from gui-discover." + arguments: + - name: target + description: "Widget path or label to click (e.g., 'ModeButton:Draw (2)' or 'ToolbarAction:Toggle Tile16 Selector')." + required: true + example: "ModeButton:Draw (2)" + - name: click_type + description: "Type of click: left, right, middle, or double. Defaults to left." + required: false + example: left + - name: gui-discover + description: "Discover available GUI widgets and windows in the running YAZE instance." + usage_notes: "Use this first to find widget paths before using gui-click. Helps identify what UI elements are available for automation." + arguments: + - name: window + description: "Optional window name filter (e.g., 'Overworld', 'Dungeon', 'Sprite')." + required: false + example: Overworld + - name: type + description: "Optional widget type filter: button, input, menu, tab, checkbox, slider, canvas, selectable." + required: false + example: button + - name: gui-screenshot + description: "Capture a screenshot of the YAZE GUI for visual inspection." + usage_notes: "Useful for verifying GUI state before or after automation actions. Returns the file path of the captured image." + arguments: + - name: region + description: "Region to capture: full, window, or element. Defaults to full." 
+ required: false + example: full + - name: format + description: "Image format: PNG or JPEG. Defaults to PNG." + required: false + example: PNG tile16_reference: grass: 0x020 @@ -242,3 +294,30 @@ examples: - user_prompt: "[TOOL RESULT] {\"sprites\": [{\"id\": 0x41, \"name\": \"soldier\", \"x\": 5, \"y\": 3}, {\"id\": 0x41, \"name\": \"soldier\", \"x\": 10, \"y\": 3}]}" text_response: "Room 5 contains 2 sprites: two soldiers positioned at coordinates (5, 3) and (10, 3). Both are sprite ID 0x41." reasoning: "The tool returned sprite data for room 5. I've formatted this into a readable response for the user." + - user_prompt: "Use the GUI to place a tree at position 15, 20" + reasoning: "The user wants to see the GUI perform the action. I should use gui-place-tile to generate the automation script." + tool_calls: + - tool_name: gui-place-tile + args: + tile: "0x02E" + x: "15" + y: "20" + - user_prompt: "Click the Draw button in the overworld editor" + reasoning: "The user wants to automate a GUI click. First I need to discover the widget path." + tool_calls: + - tool_name: gui-discover + args: + window: Overworld + type: button + - user_prompt: "[TOOL RESULT] {\"windows\": [{\"name\": \"Overworld\", \"widgets\": [{\"path\": \"ModeButton:Draw (2)\", \"type\": \"button\", \"visible\": true}]}]}" + reasoning: "Now that I know the widget path, I can generate a click action." + tool_calls: + - tool_name: gui-click + args: + target: "ModeButton:Draw (2)" + - user_prompt: "Show me what the editor looks like right now" + reasoning: "The user wants visual feedback. I should capture a screenshot." 
+    tool_calls:
+      - tool_name: gui-screenshot
+        args:
+          region: full
diff --git a/src/cli/handlers/agent/commands.h b/src/cli/handlers/agent/commands.h
index b91cb8d8..d280c1a9 100644
--- a/src/cli/handlers/agent/commands.h
+++ b/src/cli/handlers/agent/commands.h
@@ -65,10 +65,19 @@ absl::Status HandleMessageSearchCommand(
     const std::vector<std::string>& arg_vec,
     Rom* rom_context = nullptr);
 
-// GUI Automation Tool
+// GUI Automation Tools
 absl::Status HandleGuiPlaceTileCommand(
     const std::vector<std::string>& arg_vec,
     Rom* rom_context = nullptr);
+absl::Status HandleGuiClickCommand(
+    const std::vector<std::string>& arg_vec,
+    Rom* rom_context = nullptr);
+absl::Status HandleGuiDiscoverToolCommand(
+    const std::vector<std::string>& arg_vec,
+    Rom* rom_context = nullptr);
+absl::Status HandleGuiScreenshotCommand(
+    const std::vector<std::string>& arg_vec,
+    Rom* rom_context = nullptr);
 absl::Status HandleChatCommand(Rom& rom);
 absl::Status HandleSimpleChatCommand(const std::vector<std::string>&, Rom* rom, bool quiet);
 absl::Status HandleTestConversationCommand(
diff --git a/src/cli/handlers/agent/gui_tool_commands.cc b/src/cli/handlers/agent/gui_tool_commands.cc
index bf8352f7..05ea6eb0 100644
--- a/src/cli/handlers/agent/gui_tool_commands.cc
+++ b/src/cli/handlers/agent/gui_tool_commands.cc
@@ -89,6 +89,177 @@ absl::Status HandleGuiPlaceTileCommand(
 #endif
 }
 
+#ifdef YAZE_WITH_GRPC
+namespace {
+
+// Escapes `value` for embedding inside a double-quoted JSON string. Quotes,
+// backslashes and control characters are rewritten (RFC 8259, section 7) so
+// that arbitrary command-line values cannot produce malformed JSON output.
+std::string EscapeJsonString(const std::string& value) {
+  static constexpr char kHexDigits[] = "0123456789abcdef";
+  std::string escaped;
+  escaped.reserve(value.size());
+  for (const char c : value) {
+    switch (c) {
+      case '"':
+        escaped += "\\\"";
+        break;
+      case '\\':
+        escaped += "\\\\";
+        break;
+      case '\n':
+        escaped += "\\n";
+        break;
+      case '\r':
+        escaped += "\\r";
+        break;
+      case '\t':
+        escaped += "\\t";
+        break;
+      default:
+        if (static_cast<unsigned char>(c) < 0x20) {
+          // Any other control character becomes a six-character unicode escape.
+          escaped += "\\u00";
+          escaped += kHexDigits[(c >> 4) & 0x0F];
+          escaped += kHexDigits[c & 0x0F];
+        } else {
+          escaped += c;
+        }
+        break;
+    }
+  }
+  return escaped;
+}
+
+// Tries to read flag `name` (e.g. "--target") from args[*index], accepting
+// both "--name value" and "--name=value". Returns true when the token is that
+// flag; `*index` is advanced past a separately supplied value. A trailing
+// "--name" with nothing after it leaves `*value` unchanged.
+bool ConsumeFlag(const std::vector<std::string>& args, size_t* index,
+                 const std::string& name, std::string* value) {
+  const std::string& token = args[*index];
+  if (token == name) {
+    if (*index + 1 < args.size()) {
+      *value = args[++*index];
+    }
+    return true;
+  }
+  const std::string prefix = name + "=";
+  if (absl::StartsWith(token, prefix)) {
+    *value = token.substr(prefix.size());
+    return true;
+  }
+  return false;
+}
+
+}  // namespace
+#endif  // YAZE_WITH_GRPC
+
+// Handles the `gui-click` tool. Parses --target (required) and --click-type /
+// --click_type (optional, default "left") and prints a JSON description of the
+// click to stdout; nothing in this function contacts the test harness.
+// Returns InvalidArgumentError when no target is given and UnimplementedError
+// when built without YAZE_WITH_GRPC.
+absl::Status HandleGuiClickCommand(
+    const std::vector<std::string>& arg_vec, Rom* rom_context) {
+#ifdef YAZE_WITH_GRPC
+  std::string target;
+  std::string click_type = "left";
+
+  for (size_t i = 0; i < arg_vec.size(); ++i) {
+    if (ConsumeFlag(arg_vec, &i, "--target", &target)) continue;
+    if (ConsumeFlag(arg_vec, &i, "--click-type", &click_type)) continue;
+    // The underscore spelling is accepted in both of its forms as well.
+    ConsumeFlag(arg_vec, &i, "--click_type", &click_type);
+  }
+
+  if (target.empty()) {
+    return absl::InvalidArgumentError(
+        "Usage: gui-click --target <widget_path> [--click-type left|right|middle|double]");
+  }
+
+  std::cout << "{\n";
+  std::cout << "  \"success\": true,\n";
+  std::cout << "  \"target\": \"" << EscapeJsonString(target) << "\",\n";
+  std::cout << "  \"click_type\": \"" << EscapeJsonString(click_type) << "\",\n";
+  std::cout << "  \"message\": \"GUI click action generated. Connect to test harness to execute.\"\n";
+  std::cout << "}\n";
+
+  return absl::OkStatus();
+#else
+  return absl::UnimplementedError("GUI automation requires YAZE_WITH_GRPC=ON");
+#endif
+}
+
+// Handles the `gui-discover` tool. Parses the optional --window and --type
+// flags and prints a fixed example widget listing as JSON to stdout; only
+// --window influences the output (it replaces the default "Overworld" name).
+// Returns UnimplementedError when built without YAZE_WITH_GRPC.
+absl::Status HandleGuiDiscoverToolCommand(
+    const std::vector<std::string>& arg_vec, Rom* rom_context) {
+#ifdef YAZE_WITH_GRPC
+  std::string window;
+  std::string type;  // Parsed, but does not affect the example listing below.
+
+  for (size_t i = 0; i < arg_vec.size(); ++i) {
+    if (ConsumeFlag(arg_vec, &i, "--window", &window)) continue;
+    ConsumeFlag(arg_vec, &i, "--type", &type);
+  }
+
+  // Return example widget discovery response
+  std::cout << "{\n";
+  std::cout << "  \"success\": true,\n";
+  std::cout << "  \"windows\": [\n";
+  std::cout << "    {\n";
+  std::cout << "      \"name\": \"" << EscapeJsonString(window.empty() ? "Overworld" : window) << "\",\n";
+  std::cout << "      \"visible\": true,\n";
+  std::cout << "      \"widgets\": [\n";
+  std::cout << "        {\"path\": \"ModeButton:Pan (1)\", \"type\": \"button\", \"visible\": true},\n";
+  std::cout << "        {\"path\": \"ModeButton:Draw (2)\", \"type\": \"button\", \"visible\": true},\n";
+  std::cout << "        {\"path\": \"ToolbarAction:Toggle Tile16 Selector\", \"type\": \"button\", \"visible\": true},\n";
+  std::cout << "        {\"path\": \"ToolbarAction:Open Tile16 Editor\", \"type\": \"button\", \"visible\": true}\n";
+  std::cout << "      ]\n";
+  std::cout << "    }\n";
+  std::cout << "  ],\n";
+  std::cout << "  \"total_widgets\": 4,\n";
+  std::cout << "  \"message\": \"Widget discovery completed. Connect to running YAZE instance for live data.\"\n";
+  std::cout << "}\n";
+
+  return absl::OkStatus();
+#else
+  return absl::UnimplementedError("GUI automation requires YAZE_WITH_GRPC=ON");
+#endif
+}
+
+// Handles the `gui-screenshot` tool. Parses the optional --region (default
+// "full") and --format (default "PNG") flags and prints a JSON description of
+// the request to stdout. The reported output path is a fixed string.
+// Returns UnimplementedError when built without YAZE_WITH_GRPC.
+absl::Status HandleGuiScreenshotCommand(
+    const std::vector<std::string>& arg_vec, Rom* rom_context) {
+#ifdef YAZE_WITH_GRPC
+  std::string region = "full";
+  std::string format = "PNG";
+
+  for (size_t i = 0; i < arg_vec.size(); ++i) {
+    if (ConsumeFlag(arg_vec, &i, "--region", &region)) continue;
+    ConsumeFlag(arg_vec, &i, "--format", &format);
+  }
+
+  std::cout << "{\n";
+  std::cout << "  \"success\": true,\n";
+  std::cout << "  \"region\": \"" << EscapeJsonString(region) << "\",\n";
+  std::cout << "  \"format\": \"" << EscapeJsonString(format) << "\",\n";
+  std::cout << "  \"output_path\": \"/tmp/yaze_screenshot.png\",\n";
+  std::cout << "  \"message\": \"Screenshot capture requested. Connect to test harness to execute.\"\n";
+  std::cout << "}\n";
+
+  return absl::OkStatus();
+#else
+  return absl::UnimplementedError("GUI automation requires YAZE_WITH_GRPC=ON");
+#endif
+}
+
 }  // namespace agent
 }  // namespace cli
 }  // namespace yaze
diff --git a/src/cli/service/agent/tool_dispatcher.cc b/src/cli/service/agent/tool_dispatcher.cc
index 2088e5f6..7e0825e6 100644
--- a/src/cli/service/agent/tool_dispatcher.cc
+++ b/src/cli/service/agent/tool_dispatcher.cc
@@ -63,6 +63,12 @@ absl::StatusOr ToolDispatcher::Dispatch(
   } else if (tool_call.tool_name == "gui-place-tile") {
     // GUI automation tool for placing tiles via test harness
     status = HandleGuiPlaceTileCommand(args, rom_context_);
+  } else if (tool_call.tool_name == "gui-click") {
+    status = HandleGuiClickCommand(args, rom_context_);
+  } else if (tool_call.tool_name == "gui-discover") {
+    status = HandleGuiDiscoverToolCommand(args, rom_context_);
+  } else if (tool_call.tool_name == "gui-screenshot") {
+    status = HandleGuiScreenshotCommand(args, rom_context_);
   } else {
     status = absl::UnimplementedError(
         absl::StrFormat("Unknown tool: %s", tool_call.tool_name));