diff --git a/assets/agent/function_schemas.json b/assets/agent/function_schemas.json index 696728c1..14fe08ea 100644 --- a/assets/agent/function_schemas.json +++ b/assets/agent/function_schemas.json @@ -1,302 +1,166 @@ -[ - { - "name": "resource-list", - "description": "List all labeled resources of a specific type (dungeons, sprites, palettes)", - "parameters": { - "type": "object", - "properties": { - "type": { - "type": "string", - "description": "Resource type to list", - "enum": [ - "dungeon", - "room", - "entrance", - "overworld", - "sprite", - "palette", - "item", - "tile16", - "all" - ] +{ + "function_declarations": [ + { + "name": "resource-list", + "description": "List all resources of a specific type from the ROM (rooms, sprites, dungeons, entrances, items, overlords)", + "parameters": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["room", "sprite", "dungeon", "entrance", "item", "overlord"], + "description": "Type of resource to list" + } }, - "format": { - "type": "string", - "description": "Output format", - "enum": ["json", "table", "text"], - "default": "table" + "required": ["type"] + } + }, + { + "name": "resource-search", + "description": "Search for resources by name or pattern", + "parameters": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["room", "sprite", "dungeon", "entrance", "item"], + "description": "Type of resource to search" + }, + "query": { + "type": "string", + "description": "Search query or pattern to match" + } + }, + "required": ["type", "query"] + } + }, + { + "name": "dungeon-list-sprites", + "description": "List all sprites in a specific dungeon or room", + "parameters": { + "type": "object", + "properties": { + "dungeon": { + "type": "string", + "description": "Dungeon name (e.g., 'hyrule_castle', 'eastern_palace') or leave empty for all" + }, + "room_id": { + "type": "string", + "description": "Specific room ID to query (optional)" + } } - }, - 
"required": ["type"] - } - }, - { - "name": "resource-search", - "description": "Search labeled resources by name, ID, or partial match", - "parameters": { - "type": "object", - "properties": { - "query": { - "type": "string", - "description": "Search text (case-insensitive substring match)" + } + }, + { + "name": "dungeon-describe-room", + "description": "Get detailed information about a specific dungeon room including sprites, chests, layout, and connections", + "parameters": { + "type": "object", + "properties": { + "room_id": { + "type": "string", + "description": "Room ID to describe (0-296)" + }, + "include_sprites": { + "type": "string", + "enum": ["true", "false"], + "description": "Include sprite information (default: true)" + } }, - "type": { - "type": "string", - "description": "Optional resource category to filter", - "enum": [ - "dungeon", - "room", - "entrance", - "overworld", - "sprite", - "palette", - "item", - "tile16", - "all" - ], - "default": "all" + "required": ["room_id"] + } + }, + { + "name": "overworld-find-tile", + "description": "Find all locations where a specific tile16 ID appears in the overworld", + "parameters": { + "type": "object", + "properties": { + "tile_id": { + "type": "string", + "description": "Tile16 ID to search for (hex format like '0x42' or decimal)" + }, + "map_id": { + "type": "string", + "description": "Specific overworld map to search (0-63, optional)" + } }, - "format": { - "type": "string", - "description": "Output format", - "enum": ["json", "text"], - "default": "json" - } - }, - "required": ["query"] - } - }, - { - "name": "dungeon-list-sprites", - "description": "List all sprites in a specific dungeon room", - "parameters": { - "type": "object", - "properties": { - "room": { - "type": "string", - "description": "Room ID in hex format (e.g., 0x012)" + "required": ["tile_id"] + } + }, + { + "name": "overworld-describe-map", + "description": "Get detailed information about a specific overworld map including tile 
composition, warps, and sprites", + "parameters": { + "type": "object", + "properties": { + "map_id": { + "type": "string", + "description": "Overworld map ID (0-63 for light/dark world)" + }, + "include_tiles": { + "type": "string", + "enum": ["true", "false"], + "description": "Include tile distribution statistics" + } }, - "format": { - "type": "string", - "enum": ["json", "text"], - "default": "json" - } - }, - "required": ["room"] - } - }, - { - "name": "dungeon-describe-room", - "description": "Summarize dungeon room metadata, hazards, and counts", - "parameters": { - "type": "object", - "properties": { - "room": { - "type": "string", - "description": "Room ID in hex format (e.g., 0x012)" + "required": ["map_id"] + } + }, + { + "name": "overworld-list-warps", + "description": "List all warp/entrance points for a specific overworld map", + "parameters": { + "type": "object", + "properties": { + "map_id": { + "type": "string", + "description": "Overworld map ID (0-63)" + } }, - "format": { - "type": "string", - "enum": ["json", "text"], - "default": "json" - } - }, - "required": ["room"] - } - }, - { - "name": "overworld-find-tile", - "description": "Find all occurrences of a specific tile16 ID on overworld maps", - "parameters": { - "type": "object", - "properties": { - "tile": { - "type": "string", - "description": "Tile16 ID in hex format (e.g., 0x02E)" + "required": ["map_id"] + } + }, + { + "name": "overworld-list-sprites", + "description": "List all sprites placed on a specific overworld map", + "parameters": { + "type": "object", + "properties": { + "map_id": { + "type": "string", + "description": "Overworld map ID (0-63)" + } }, - "map": { - "type": "string", - "description": "Optional: specific map ID to search (e.g., 0x05)" + "required": ["map_id"] + } + }, + { + "name": "overworld-get-entrance", + "description": "Get detailed information about a specific entrance/exit including destination and properties", + "parameters": { + "type": "object", + 
"properties": { + "entrance_id": { + "type": "string", + "description": "Entrance ID to query" + } }, - "format": { - "type": "string", - "enum": ["json", "text"], - "default": "json" - } - }, - "required": ["tile"] - } - }, - { - "name": "overworld-describe-map", - "description": "Get summary information about an overworld map", - "parameters": { - "type": "object", - "properties": { - "map": { - "type": "string", - "description": "Map ID in hex format (e.g., 0x00)" - }, - "format": { - "type": "string", - "enum": ["json", "text"], - "default": "json" - } - }, - "required": ["map"] - } - }, - { - "name": "overworld-list-warps", - "description": "List warp/entrance/exit points on the overworld", - "parameters": { - "type": "object", - "properties": { - "map": { - "type": "string", - "description": "Optional: filter by map ID" - }, - "type": { - "type": "string", - "description": "Optional: filter by warp type", - "enum": ["entrance", "exit", "hole", "all"], - "default": "all" - }, - "format": { - "type": "string", - "enum": ["json", "text"], - "default": "json" + "required": ["entrance_id"] + } + }, + { + "name": "overworld-tile-stats", + "description": "Get statistical analysis of tile usage across overworld maps", + "parameters": { + "type": "object", + "properties": { + "map_id": { + "type": "string", + "description": "Specific map ID or 'all' for global statistics" + } } } } - }, - { - "name": "message-list", - "description": "List all in-game dialogue and text messages from the ROM", - "parameters": { - "type": "object", - "properties": { - "range": { - "type": "string", - "description": "Optional: limit to message ID range in format 'start-end' (e.g., '0-100')" - }, - "format": { - "type": "string", - "enum": ["json", "text"], - "default": "json" - } - } - } - }, - { - "name": "message-read", - "description": "Read a specific message by its ID", - "parameters": { - "type": "object", - "properties": { - "id": { - "type": "string", - "description": "Message ID 
number (0-300+)" - }, - "format": { - "type": "string", - "enum": ["json", "text"], - "default": "json" - } - }, - "required": ["id"] - } - }, - { - "name": "message-search", - "description": "Search for messages containing specific text", - "parameters": { - "type": "object", - "properties": { - "query": { - "type": "string", - "description": "Text to search for within message content (case-insensitive)" - }, - "format": { - "type": "string", - "enum": ["json", "text"], - "default": "json" - } - }, - "required": ["query"] - } - }, - { - "name": "overworld-list-sprites", - "description": "List sprites on the overworld with optional filters for map, world, or sprite ID", - "parameters": { - "type": "object", - "properties": { - "map": { - "type": "string", - "description": "Optional: filter by map ID (0x00-0x9F)" - }, - "world": { - "type": "string", - "description": "Optional: filter by world (0=light, 1=dark, 2=special)" - }, - "sprite_id": { - "type": "string", - "description": "Optional: filter by sprite ID (0x00-0xFF)" - }, - "format": { - "type": "string", - "enum": ["json", "text"], - "default": "json" - } - } - } - }, - { - "name": "overworld-get-entrance", - "description": "Get detailed information about a specific overworld entrance by its ID", - "parameters": { - "type": "object", - "properties": { - "entrance_id": { - "type": "string", - "description": "Entrance ID number (0-128)" - }, - "format": { - "type": "string", - "enum": ["json", "text"], - "default": "json" - } - }, - "required": ["entrance_id"] - } - }, - { - "name": "overworld-tile-stats", - "description": "Analyze usage statistics for a specific tile16 ID across the overworld", - "parameters": { - "type": "object", - "properties": { - "tile_id": { - "type": "string", - "description": "Tile16 ID to analyze (0x0000-0xFFFF, hex or decimal)" - }, - "map": { - "type": "string", - "description": "Optional: limit analysis to specific map ID" - }, - "world": { - "type": "string", - "description": 
"Optional: limit analysis to specific world (0=light, 1=dark, 2=special)" - }, - "format": { - "type": "string", - "enum": ["json", "text"], - "default": "json" - } - }, - "required": ["tile_id"] - } - } -] - + ] +} \ No newline at end of file diff --git a/assets/agent/system_prompt_v3.txt b/assets/agent/system_prompt_v3.txt new file mode 100644 index 00000000..18130a66 --- /dev/null +++ b/assets/agent/system_prompt_v3.txt @@ -0,0 +1,207 @@ +You are an expert ROM analysis assistant for **yaze** (Yet Another Zelda3 Editor), a modern cross-platform editor for The Legend of Zelda: A Link to the Past ROM hacking. + +# Core Mission: PROACTIVE EXPLORATION + +You are not a passive question-answerer. You are an intelligent ROM exploration partner who: +1. **Anticipates needs**: When users ask questions, infer what they actually want to know +2. **Chains tools intelligently**: Use multiple tools in one turn to provide complete answers +3. **Iterates implicitly**: Don't wait for follow-up questions - provide comprehensive information upfront + +# Tool Calling Strategy + +## CRITICAL PRINCIPLE: Minimize Back-and-Forth + +When a user asks a question: + +### ❌ BAD (Reactive Approach): +User: "What's in room 5?" +You: Call `resource-list` → Get room list → Tell user "Room 5 exists" +User: "What sprites are in it?" ← WASTED TURN! +You: Call `dungeon-describe-room` → Give sprite list + +### ✅ GOOD (Proactive Approach): +User: "What's in room 5?" +You: Call BOTH: + - `dungeon-describe-room` with room=5 + - `resource-list` with type=sprite (to get sprite labels) +You: "Room 5 contains 3 Stalfos (sprite 8), 2 Eyegores (sprite 12), has blue floor tiles, 2 chests with small key and compass, and connects to rooms 3 and 7." + +## Multi-Tool Chaining Patterns + +### Pattern 1: List + Detail +When user asks about "what" exists: +1. Get list of IDs with `resource-list` +2. Get details for relevant items with describe/search commands +3. 
Provide comprehensive summary + +Example: +```json +{ + "tool_calls": [ + {"tool_name": "resource-list", "args": {"type": "dungeon"}}, + {"tool_name": "dungeon-list-sprites", "args": {"dungeon": "hyrule_castle"}} + ], + "reasoning": "Getting dungeon list AND sprites for first dungeon to provide complete answer" +} +``` + +### Pattern 2: Search + Context +When user asks "where" something is: +1. Search for the item with `resource-search` or find commands +2. Get surrounding context (neighboring rooms, map info, etc.) +3. Explain significance + +Example: +```json +{ + "tool_calls": [ + {"tool_name": "overworld-find-tile", "args": {"tile_id": "0x42"}}, + {"tool_name": "overworld-describe-map", "args": {"map_id": "0"}} + ], + "reasoning": "Finding tile locations AND getting map context to explain where it appears" +} +``` + +### Pattern 3: Describe + Related +When user asks about a specific thing: +1. Get direct information +2. Get related items (sprites in room, warps from location, etc.) +3. Provide holistic view + +Example: +```json +{ + "tool_calls": [ + {"tool_name": "dungeon-describe-room", "args": {"room_id": "5"}}, + {"tool_name": "overworld-list-warps", "args": {"map_id": "0"}}, + {"tool_name": "resource-list", "args": {"type": "sprite"}} + ], + "reasoning": "Getting room details, checking warps that lead there, and sprite labels for complete context" +} +``` + +## CRITICAL RULES + +1. **NEVER call the same tool twice with identical arguments** + - Use tool call deduplication + - If you need the same data, reference previous results + +2. **NEVER send empty text_response after receiving [TOOL RESULT]** + - You MUST synthesize tool results into a natural language answer + - Include context, implications, and additional insights + +3. **ALWAYS provide text_response after tools return** + - Don't say "Let me check..." and call more tools + - Synthesize ALL available information into one comprehensive answer + +4. 
**Anticipate follow-up questions** + - If user asks "What enemies are in Eastern Palace?", they likely also want to know: + - How many total enemies + - Which rooms have the most + - Any rare or boss sprites + - Provide this information proactively + +5. **Use domain knowledge** + - You know Zelda 3 ROM structure + - Reference game terms: "This is a soldier sprite", not just "sprite ID 8" + - Explain significance: "This room is the entrance hall" vs "Room 0" + +# Output Format + +You MUST respond with ONLY a JSON object: + +```json +{ + "text_response": "string (your natural language reply, REQUIRED after tool results)", + "tool_calls": [{"tool_name": "string", "args": {"key": "value"}}], + "commands": ["string array of z3ed commands for ROM modifications"], + "reasoning": "string (your thought process, explain tool choices)" +} +``` + +**CRITICAL**: The field name is `"text_response"` (NOT "response", NOT "answer") + +# Complete Workflow Example + +User: "What enemies are in the Eastern Palace?" 
+ +## Your First Response (Tool Gathering): +```json +{ + "text_response": "Let me analyze the Eastern Palace enemy layout...", + "tool_calls": [ + {"tool_name": "resource-list", "args": {"type": "dungeon"}}, + {"tool_name": "dungeon-list-sprites", "args": {"dungeon": "eastern_palace"}}, + {"tool_name": "resource-list", "args": {"type": "sprite"}} + ], + "reasoning": "Need dungeon mapping, sprite placements, and sprite labels to provide comprehensive enemy analysis" +} +``` + +## After Receiving [TOOL RESULT]: +```json +{ + "text_response": "The Eastern Palace contains a diverse enemy roster:\n\n**Primary Enemies:**\n- Stalfos (Sprite 8): 12 total across rooms 1, 3, 5, 7, 9\n- Eyegore (Sprite 12): 6 total in rooms 5, 9, 11\n- Ball & Chain Trooper (Sprite 14): 3 in room 10\n\n**Mini-Boss Area:**\n- Giant Stalfos (Sprite 144) in room 8\n\n**Total Enemy Count:** 22 sprites across 11 rooms\n\n**Design Notes:** The palace uses a difficulty curve, starting with basic Stalfos in early rooms and introducing tougher Eyegores in the middle section. Room 10's Ball & Chain Troopers guard the boss key.", + "tool_calls": [], + "reasoning": "Synthesized sprite placement data, identified patterns, provided context about game design and difficulty progression" +} +``` + +# Learn Command Integration + +When users mention preferences or patterns, you can suggest using the learn command: +- "I always use palette 2 for outdoor tiles" → Suggest: `z3ed agent learn --preference "outdoor_palette=2"` +- "I keep asking about room 5" → Suggest: `z3ed agent learn --topic "room_5_analysis"` + +# When to Use Commands vs Tools + +- **Tools** are READ-ONLY: They inspect ROM data +- **Commands** are WRITE operations: They modify the ROM + +Only suggest commands when user explicitly requests changes like: +- "Change the palette to..." +- "Place a sprite at..." +- "Modify room layout..." + +For inspection questions, ONLY use tools. + +# Error Prevention + +1. 
**Always validate tool results before answering** + - Check if data is empty or malformed + - Explain if information is unavailable + +2. **Provide actionable next steps** + - "Room 5 has no sprites. Would you like to add some?" + - "Tile 0x42 doesn't exist in this map. Did you mean 0x24?" + +3. **Explain ROM limitations** + - "Zelda 3 vanilla has 296 rooms. Custom ROMs may have more." + - "Sprite slots per room are limited to 16 in vanilla." + +# Domain Knowledge + +You understand: +- **Dungeon structure**: Rooms, sprites, chests, bosses, keys +- **Overworld layout**: 64 maps in light/dark world, tile16 system +- **Sprite system**: IDs, behaviors, graphics, palettes +- **Entrance/warp system**: How rooms connect +- **Tile system**: Four Tile8 (8x8) tiles compose each Tile16 (16x16) tile + +Use this knowledge to provide insightful, contextual answers that go beyond raw data. + +# Response Quality Standards + +GOOD response characteristics: +- ✅ Comprehensive: Answers the question AND provides related context +- ✅ Structured: Uses headers, lists, formatting for readability +- ✅ Actionable: Provides next steps or suggestions +- ✅ Insightful: Explains WHY, not just WHAT + +BAD response characteristics: +- ❌ Terse: "Room 5 has 3 sprites." +- ❌ Incomplete: Missing context or related information +- ❌ Vague: "Some enemies are in that room." +- ❌ Passive: Waiting for user to ask follow-up questions + +Remember: Your goal is to be the BEST ROM exploration assistant possible. Think ahead, chain tools intelligently, and provide comprehensive insights that save users time and mental effort. 
diff --git a/src/cli/agent.cmake b/src/cli/agent.cmake index 74bdd220..e85980f0 100644 --- a/src/cli/agent.cmake +++ b/src/cli/agent.cmake @@ -70,6 +70,7 @@ set(YAZE_AGENT_SOURCES cli/service/agent/conversational_agent_service.cc cli/service/agent/simple_chat_session.cc cli/service/agent/tool_dispatcher.cc + cli/service/agent/learned_knowledge_service.cc cli/service/ai/ai_service.cc cli/service/ai/ollama_ai_service.cc cli/service/ai/prompt_builder.cc diff --git a/src/cli/service/ai/gemini_ai_service.cc b/src/cli/service/ai/gemini_ai_service.cc index 703a83b5..edb36ec3 100644 --- a/src/cli/service/ai/gemini_ai_service.cc +++ b/src/cli/service/ai/gemini_ai_service.cc @@ -77,9 +77,14 @@ GeminiAIService::GeminiAIService(const GeminiConfig& config) } // Try to load version-specific system prompt file - std::string prompt_file = config_.prompt_version == "v2" - ? "assets/agent/system_prompt_v2.txt" - : "assets/agent/system_prompt.txt"; + std::string prompt_file; + if (config_.prompt_version == "v3") { + prompt_file = "assets/agent/system_prompt_v3.txt"; + } else if (config_.prompt_version == "v2") { + prompt_file = "assets/agent/system_prompt_v2.txt"; + } else { + prompt_file = "assets/agent/system_prompt.txt"; + } std::vector search_paths = { prompt_file, @@ -135,9 +140,15 @@ std::vector GeminiAIService::GetAvailableTools() const { std::string GeminiAIService::BuildFunctionCallSchemas() { #ifndef YAZE_WITH_JSON - return "[]"; // Empty array if JSON not available + return "{}"; // Empty object if JSON not available #else - // Search for function_schemas.json in multiple locations + // Use the prompt builder's schema generation which reads from prompt_catalogue.yaml + std::string schemas = prompt_builder_.BuildFunctionCallSchemas(); + if (!schemas.empty() && schemas != "[]") { + return schemas; + } + + // Fallback: Search for function_schemas.json const std::vector search_paths = { "assets/agent/function_schemas.json", "../assets/agent/function_schemas.json", @@ -337,10 
+348,30 @@ absl::StatusOr GeminiAIService::GenerateResponse( // Add function calling tools if enabled if (function_calling_enabled_) { try { - nlohmann::json tools = nlohmann::json::parse(BuildFunctionCallSchemas()); - request_body["tools"] = {{ - {"function_declarations", tools} - }}; + std::string schemas_str = BuildFunctionCallSchemas(); + if (config_.verbose) { + std::cerr << "[DEBUG] Function calling schemas: " << schemas_str.substr(0, 200) << "..." << std::endl; + } + + nlohmann::json schemas = nlohmann::json::parse(schemas_str); + + // Build tools array - schemas might be an array of tools or a function_declarations object + if (schemas.is_array()) { + // If it's already an array of tools, use it directly + request_body["tools"] = {{ + {"function_declarations", schemas} + }}; + } else if (schemas.is_object() && schemas.contains("function_declarations")) { + // If it's a wrapper object with function_declarations + request_body["tools"] = {{ + {"function_declarations", schemas["function_declarations"]} + }}; + } else { + // Treat as single tool object + request_body["tools"] = {{ + {"function_declarations", nlohmann::json::array({schemas})} + }}; + } } catch (const nlohmann::json::exception& e) { std::cerr << "⚠️ Failed to parse function schemas: " << e.what() << std::endl; } diff --git a/src/cli/service/ai/gemini_ai_service.h b/src/cli/service/ai/gemini_ai_service.h index 805199c7..37483c76 100644 --- a/src/cli/service/ai/gemini_ai_service.h +++ b/src/cli/service/ai/gemini_ai_service.h @@ -19,8 +19,8 @@ struct GeminiConfig { int max_output_tokens = 2048; mutable std::string system_instruction; // Mutable to allow lazy initialization bool use_enhanced_prompting = true; // Enable few-shot examples - bool use_function_calling = false; // Use native Gemini function calling - std::string prompt_version = "default"; // Which prompt file to use (default, v2, etc.) 
+ bool use_function_calling = true; // Use native Gemini function calling (enabled by default for 2.0+) + std::string prompt_version = "v3"; // Which prompt file to use (default, v2, v3, etc.) bool verbose = false; // Enable debug logging GeminiConfig() = default; diff --git a/test/multimodal/test_gemini_vision.cc b/test/multimodal/test_gemini_vision.cc new file mode 100644 index 00000000..a40081fc --- /dev/null +++ b/test/multimodal/test_gemini_vision.cc @@ -0,0 +1,251 @@ +#include +#include + +#include "gtest/gtest.h" +#include "absl/strings/str_cat.h" +#include "cli/service/ai/gemini_ai_service.h" + +#ifdef YAZE_WITH_GRPC +#include "app/core/service/screenshot_utils.h" +#endif + +namespace yaze { +namespace test { + +class GeminiVisionTest : public ::testing::Test { + protected: + void SetUp() override { + // Check if GEMINI_API_KEY is set + const char* api_key = std::getenv("GEMINI_API_KEY"); + if (!api_key || std::string(api_key).empty()) { + GTEST_SKIP() << "GEMINI_API_KEY not set. 
Skipping multimodal tests."; + } + + api_key_ = api_key; + + // Create test data directory + test_dir_ = std::filesystem::temp_directory_path() / "yaze_multimodal_test"; + std::filesystem::create_directories(test_dir_); + } + + void TearDown() override { + // Clean up test directory + if (std::filesystem::exists(test_dir_)) { + std::filesystem::remove_all(test_dir_); + } + } + + // Helper: Create a simple test image (16x16 PNG) + std::filesystem::path CreateTestImage() { + auto image_path = test_dir_ / "test_image.png"; + + // Create a minimal PNG file (16x16 red square) + // PNG signature + IHDR + IDAT + IEND + const unsigned char png_data[] = { + // PNG signature + 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, + // IHDR chunk + 0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52, + 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x10, + 0x08, 0x02, 0x00, 0x00, 0x00, 0x90, 0x91, 0x68, + 0x36, + // IDAT chunk (minimal data) + 0x00, 0x00, 0x00, 0x0C, 0x49, 0x44, 0x41, 0x54, + 0x08, 0x99, 0x63, 0xF8, 0xCF, 0xC0, 0x00, 0x00, + 0x03, 0x01, 0x01, 0x00, 0x18, 0xDD, 0x8D, 0xB4, + // IEND chunk + 0x00, 0x00, 0x00, 0x00, 0x49, 0x45, 0x4E, 0x44, + 0xAE, 0x42, 0x60, 0x82 + }; + + std::ofstream file(image_path, std::ios::binary); + file.write(reinterpret_cast(png_data), sizeof(png_data)); + file.close(); + + return image_path; + } + + std::string api_key_; + std::filesystem::path test_dir_; +}; + +TEST_F(GeminiVisionTest, BasicImageAnalysis) { + cli::GeminiConfig config; + config.api_key = api_key_; + config.model = "gemini-2.0-flash-exp"; // Vision-capable model + config.verbose = false; + + cli::GeminiAIService service(config); + + // Create test image + auto image_path = CreateTestImage(); + ASSERT_TRUE(std::filesystem::exists(image_path)); + + // Send multimodal request + auto response = service.GenerateMultimodalResponse( + image_path.string(), + "Describe this image in one sentence." 
+ ); + + ASSERT_TRUE(response.ok()) << response.status().message(); + EXPECT_FALSE(response->text_response.empty()); + + std::cout << "Vision API response: " << response->text_response << std::endl; +} + +TEST_F(GeminiVisionTest, ImageWithSpecificPrompt) { + cli::GeminiConfig config; + config.api_key = api_key_; + config.model = "gemini-2.0-flash-exp"; + config.verbose = false; + + cli::GeminiAIService service(config); + + auto image_path = CreateTestImage(); + + // Ask specific question about the image + auto response = service.GenerateMultimodalResponse( + image_path.string(), + "What color is the dominant color in this image? Answer with just the color name." + ); + + ASSERT_TRUE(response.ok()) << response.status().message(); + EXPECT_FALSE(response->text_response.empty()); + + // Response should mention "red" since we created a red square + std::string response_lower = response->text_response; + std::transform(response_lower.begin(), response_lower.end(), + response_lower.begin(), ::tolower); + EXPECT_TRUE(response_lower.find("red") != std::string::npos || + response_lower.find("pink") != std::string::npos) + << "Expected color 'red' or 'pink' in response: " << response->text_response; +} + +TEST_F(GeminiVisionTest, InvalidImagePath) { + cli::GeminiConfig config; + config.api_key = api_key_; + config.model = "gemini-2.0-flash-exp"; + + cli::GeminiAIService service(config); + + // Try with non-existent image + auto response = service.GenerateMultimodalResponse( + "/nonexistent/image.png", + "Describe this image." 
+ ); + + EXPECT_FALSE(response.ok()); + EXPECT_TRUE(absl::IsNotFound(response.status()) || + absl::IsInternal(response.status())); +} + +#ifdef YAZE_WITH_GRPC +// Integration test with screenshot capture +TEST_F(GeminiVisionTest, ScreenshotCaptureIntegration) { + // Note: This test requires a running YAZE instance with gRPC test harness + // Skip if we can't connect + + cli::GeminiConfig config; + config.api_key = api_key_; + config.model = "gemini-2.0-flash-exp"; + config.verbose = false; + + cli::GeminiAIService service(config); + + // Attempt to capture a screenshot + auto screenshot_result = yaze::test::CaptureHarnessScreenshot( + (test_dir_ / "screenshot.png").string()); + + if (!screenshot_result.ok()) { + GTEST_SKIP() << "Screenshot capture failed (YAZE may not be running): " + << screenshot_result.status().message(); + } + + // Analyze the captured screenshot + auto response = service.GenerateMultimodalResponse( + screenshot_result->file_path.string(), + "What UI elements are visible in this screenshot? List them." 
+ ); + + ASSERT_TRUE(response.ok()) << response.status().message(); + EXPECT_FALSE(response->text_response.empty()); + + std::cout << "Screenshot analysis: " << response->text_response << std::endl; +} +#endif + +// Performance test +TEST_F(GeminiVisionTest, MultipleRequestsSequential) { + cli::GeminiConfig config; + config.api_key = api_key_; + config.model = "gemini-2.0-flash-exp"; + config.verbose = false; + + cli::GeminiAIService service(config); + + auto image_path = CreateTestImage(); + + // Make 3 sequential requests + const int num_requests = 3; + for (int i = 0; i < num_requests; ++i) { + auto response = service.GenerateMultimodalResponse( + image_path.string(), + absl::StrCat("Request ", i + 1, ": Describe this image briefly.") + ); + + ASSERT_TRUE(response.ok()) << "Request " << i + 1 << " failed: " + << response.status().message(); + EXPECT_FALSE(response->text_response.empty()); + } +} + +// Rate limiting test (should handle gracefully) +TEST_F(GeminiVisionTest, RateLimitHandling) { + cli::GeminiConfig config; + config.api_key = api_key_; + config.model = "gemini-2.0-flash-exp"; + config.verbose = false; + + cli::GeminiAIService service(config); + + auto image_path = CreateTestImage(); + + // Make many rapid requests (may hit rate limit) + int successful = 0; + int rate_limited = 0; + + for (int i = 0; i < 10; ++i) { + auto response = service.GenerateMultimodalResponse( + image_path.string(), + "Describe this image." 
+ ); + + if (response.ok()) { + successful++; + } else if (absl::IsResourceExhausted(response.status()) || + response.status().message().find("429") != std::string::npos) { + rate_limited++; + } + } + + // At least some requests should succeed + EXPECT_GT(successful, 0) << "No successful requests out of 10"; + + // If we hit rate limits, that's expected behavior (not a failure) + if (rate_limited > 0) { + std::cout << "Note: Hit rate limit on " << rate_limited << " out of 10 requests (expected)" << std::endl; + } +} + +} // namespace test +} // namespace yaze + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + + std::cout << "\n=== Gemini Multimodal Vision Tests ===" << std::endl; + std::cout << "These tests require GEMINI_API_KEY environment variable." << std::endl; + std::cout << "Tests will be skipped if API key is not available.\n" << std::endl; + + return RUN_ALL_TESTS(); +}