feat: Add AI GUI controller and vision action refiner for enhanced automation
- Introduced the `AIGUIController` class to manage AI-driven GUI automation with vision feedback, enabling natural language command execution and iterative action refinement.
- Implemented the `VisionActionRefiner` class to analyze screenshots and refine actions based on visual feedback, improving action success rates.
- Added header and implementation files for both classes, including methods for screenshot analysis, action verification, and UI element location.
- Updated the CMake configuration to build the new AI GUI controller and vision action refiner sources.
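For orientation, a minimal usage sketch of the new control loop, adapted from the class documentation added below; the `gemini_service` and `gui_client` objects are assumed to be constructed elsewhere, and error handling is trimmed:

```cpp
// Sketch only: mirrors the example in ai_gui_controller.h.
yaze::cli::ai::ControlLoopConfig config;
config.enable_vision_verification = true;   // verify each action from screenshots
config.enable_iterative_refinement = true;  // retry failed actions with adjusted parameters

yaze::cli::ai::AIGUIController controller(gemini_service, gui_client);
if (!controller.Initialize(config).ok()) {
  // handle initialization failure
}

auto result = controller.ExecuteCommand(
    "Place tile 0x42 at overworld position (5, 7)");
if (result.ok() && result->success) {
  std::cout << "Completed in " << result->iterations_performed << " iterations\n";
}
```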
@@ -73,6 +73,8 @@ set(YAZE_AGENT_SOURCES
     cli/service/agent/learned_knowledge_service.cc
     cli/service/ai/ai_service.cc
     cli/service/ai/ai_action_parser.cc
+    cli/service/ai/vision_action_refiner.cc
+    cli/service/ai/ai_gui_controller.cc
     cli/service/ai/ollama_ai_service.cc
     cli/service/ai/prompt_builder.cc
     cli/service/ai/service_factory.cc
351 src/cli/service/ai/ai_gui_controller.cc Normal file
@@ -0,0 +1,351 @@
#include "cli/service/ai/ai_gui_controller.h"

#include <chrono>
#include <iostream>
#include <stdexcept>
#include <thread>

#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "cli/service/ai/gemini_ai_service.h"

#ifdef YAZE_WITH_GRPC
#include "cli/service/gui/gui_automation_client.h"
#include "app/core/service/screenshot_utils.h"
#endif

namespace yaze {
namespace cli {
namespace ai {

AIGUIController::AIGUIController(GeminiAIService* gemini_service,
                                 gui::GuiAutomationClient* gui_client)
    : gemini_service_(gemini_service),
      gui_client_(gui_client),
      vision_refiner_(std::make_unique<VisionActionRefiner>(gemini_service)) {

  if (!gemini_service_) {
    throw std::invalid_argument("Gemini service cannot be null");
  }

  if (!gui_client_) {
    throw std::invalid_argument("GUI client cannot be null");
  }
}

absl::Status AIGUIController::Initialize(const ControlLoopConfig& config) {
  config_ = config;
  screenshots_dir_ = config.screenshots_dir;

  EnsureScreenshotsDirectory();

  return absl::OkStatus();
}

absl::StatusOr<ControlResult> AIGUIController::ExecuteCommand(
    const std::string& command) {

  // Parse natural language command into actions
  auto actions_result = AIActionParser::ParseCommand(command);
  if (!actions_result.ok()) {
    return actions_result.status();
  }

  return ExecuteActions(*actions_result);
}

absl::StatusOr<ControlResult> AIGUIController::ExecuteActions(
    const std::vector<AIAction>& actions) {

  ControlResult result;
  result.success = false;

  for (const auto& action : actions) {
    int retry_count = 0;
    bool action_succeeded = false;
    AIAction current_action = action;

    while (retry_count < config_.max_retries_per_action && !action_succeeded) {
      result.iterations_performed++;

      if (result.iterations_performed > config_.max_iterations) {
        result.error_message = "Max iterations reached";
        return result;
      }

      // Execute the action with vision verification
      auto execute_result = ExecuteSingleAction(
          current_action,
          config_.enable_vision_verification
      );

      if (!execute_result.ok()) {
        result.error_message = std::string(execute_result.status().message());
        return result;
      }

      result.vision_analyses.push_back(*execute_result);
      result.actions_executed.push_back(current_action);

      if (execute_result->action_successful) {
        action_succeeded = true;
      }
      else if (config_.enable_iterative_refinement) {
        // Refine action and retry
        auto refinement = vision_refiner_->RefineAction(
            current_action,
            *execute_result
        );

        if (!refinement.ok()) {
          result.error_message =
              absl::StrCat("Failed to refine action: ",
                           refinement.status().message());
          return result;
        }

        if (refinement->needs_different_approach) {
          result.error_message =
              absl::StrCat("Action requires different approach: ",
                           refinement->reasoning);
          return result;
        }

        if (refinement->needs_retry) {
          // Update action parameters
          for (const auto& [key, value] : refinement->adjusted_parameters) {
            current_action.parameters[key] = value;
          }
        }

        retry_count++;
      }
      else {
        // No refinement, just fail
        result.error_message = execute_result->error_message;
        return result;
      }
    }

    if (!action_succeeded) {
      result.error_message =
          absl::StrFormat("Action failed after %d retries", retry_count);
      return result;
    }
  }

  result.success = true;

  // Capture final state
  auto final_screenshot = CaptureCurrentState("final_state");
  if (final_screenshot.ok()) {
    result.screenshots_taken.push_back(*final_screenshot);

    // Analyze final state
    auto final_analysis = vision_refiner_->AnalyzeScreenshot(
        *final_screenshot,
        "Verify all actions completed successfully"
    );

    if (final_analysis.ok()) {
      result.final_state_description = final_analysis->description;
    }
  }

  return result;
}

absl::StatusOr<VisionAnalysisResult> AIGUIController::ExecuteSingleAction(
    const AIAction& action,
    bool verify_with_vision) {

  VisionAnalysisResult result;

  // Capture before screenshot
  std::filesystem::path before_screenshot;
  if (verify_with_vision) {
    auto before_result = CaptureCurrentState("before_action");
    if (!before_result.ok()) {
      return before_result.status();
    }
    before_screenshot = *before_result;
  }

  // Wait for UI to settle
  if (config_.screenshot_delay_ms > 0) {
    std::this_thread::sleep_for(
        std::chrono::milliseconds(config_.screenshot_delay_ms));
  }

  // Execute the action via gRPC
  auto execute_status = ExecuteGRPCAction(action);
  if (!execute_status.ok()) {
    result.action_successful = false;
    result.error_message = std::string(execute_status.message());
    return result;
  }

  // Wait for action to complete
  std::this_thread::sleep_for(
      std::chrono::milliseconds(config_.screenshot_delay_ms));

  if (verify_with_vision) {
    // Capture after screenshot
    auto after_result = CaptureCurrentState("after_action");
    if (!after_result.ok()) {
      return after_result.status();
    }

    // Verify with vision
    return VerifyActionSuccess(action, before_screenshot, *after_result);
  }
  else {
    // Assume success without verification
    result.action_successful = true;
    result.description = "Action executed (no vision verification)";
    return result;
  }
}

absl::StatusOr<VisionAnalysisResult> AIGUIController::AnalyzeCurrentGUIState(
    const std::string& context) {

  auto screenshot = CaptureCurrentState("analysis");
  if (!screenshot.ok()) {
    return screenshot.status();
  }

  return vision_refiner_->AnalyzeScreenshot(*screenshot, context);
}

// Private helper methods

absl::StatusOr<std::filesystem::path> AIGUIController::CaptureCurrentState(
    const std::string& description) {

#ifdef YAZE_WITH_GRPC
  std::filesystem::path path = GenerateScreenshotPath(description);

  auto result = yaze::test::CaptureHarnessScreenshot(path.string());
  if (!result.ok()) {
    return result.status();
  }

  return std::filesystem::path(result->file_path);
#else
  return absl::UnimplementedError("Screenshot capture requires gRPC support");
#endif
}

absl::Status AIGUIController::ExecuteGRPCAction(const AIAction& action) {
  // Convert AI action to gRPC test commands
  auto grpc_commands = action_generator_.GenerateGRPCCommands({action});

  if (grpc_commands.empty()) {
    return absl::InternalError("No gRPC commands generated for action");
  }

  // Execute each command
  for (const auto& command_json : grpc_commands) {
    // Parse JSON and execute via GUI client
    // This is a placeholder - actual implementation would parse JSON
    // and call appropriate GUI client methods

    if (action.type == AIActionType::kClickButton) {
      auto button_it = action.parameters.find("button");
      if (button_it != action.parameters.end()) {
        auto status = gui_client_->ClickButton(button_it->second);
        if (!status.ok()) {
          return status;
        }
      }
    }
    else if (action.type == AIActionType::kPlaceTile) {
      // Extract parameters
      auto x_it = action.parameters.find("x");
      auto y_it = action.parameters.find("y");
      auto tile_it = action.parameters.find("tile_id");

      if (x_it != action.parameters.end() &&
          y_it != action.parameters.end() &&
          tile_it != action.parameters.end()) {

        int x = std::stoi(x_it->second);
        int y = std::stoi(y_it->second);
        int tile_id = std::stoi(tile_it->second);

        // Use GUI client to place tile
        // (This would need actual implementation in GuiAutomationClient)
        auto status = gui_client_->ExecuteTestScript(
            absl::StrFormat("PlaceTile(%d, %d, %d)", x, y, tile_id));
        if (!status.ok()) {
          return status;
        }
      }
    }
    else if (action.type == AIActionType::kWait) {
      int wait_ms = config_.screenshot_delay_ms;
      auto wait_it = action.parameters.find("duration_ms");
      if (wait_it != action.parameters.end()) {
        wait_ms = std::stoi(wait_it->second);
      }
      std::this_thread::sleep_for(std::chrono::milliseconds(wait_ms));
    }
  }

  return absl::OkStatus();
}

absl::StatusOr<VisionAnalysisResult> AIGUIController::VerifyActionSuccess(
    const AIAction& action,
    const std::filesystem::path& before_screenshot,
    const std::filesystem::path& after_screenshot) {

  return vision_refiner_->VerifyAction(action, before_screenshot, after_screenshot);
}

absl::StatusOr<AIAction> AIGUIController::RefineActionWithVision(
    const AIAction& original_action,
    const VisionAnalysisResult& analysis) {

  auto refinement = vision_refiner_->RefineAction(original_action, analysis);
  if (!refinement.ok()) {
    return refinement.status();
  }

  AIAction refined_action = original_action;

  // Apply adjusted parameters
  for (const auto& [key, value] : refinement->adjusted_parameters) {
    refined_action.parameters[key] = value;
  }

  return refined_action;
}

void AIGUIController::EnsureScreenshotsDirectory() {
  std::error_code ec;
  std::filesystem::create_directories(screenshots_dir_, ec);

  if (ec) {
    std::cerr << "Warning: Failed to create screenshots directory: "
              << ec.message() << std::endl;
  }
}

std::filesystem::path AIGUIController::GenerateScreenshotPath(
    const std::string& suffix) {

  int64_t timestamp = absl::ToUnixMillis(absl::Now());

  std::string filename = absl::StrFormat(
      "ai_gui_%s_%lld.png",
      suffix,
      static_cast<long long>(timestamp)
  );

  return screenshots_dir_ / filename;
}

}  // namespace ai
}  // namespace cli
}  // namespace yaze

173 src/cli/service/ai/ai_gui_controller.h Normal file
@@ -0,0 +1,173 @@
#ifndef YAZE_CLI_SERVICE_AI_AI_GUI_CONTROLLER_H_
#define YAZE_CLI_SERVICE_AI_AI_GUI_CONTROLLER_H_

#include <filesystem>
#include <memory>
#include <string>
#include <vector>

#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "cli/service/ai/ai_action_parser.h"
#include "cli/service/ai/vision_action_refiner.h"
#include "cli/service/gui/gui_action_generator.h"

namespace yaze {
namespace cli {

// Forward declares
class GeminiAIService;
namespace gui {
class GuiAutomationClient;
}

namespace ai {

/**
 * @struct ControlLoopConfig
 * @brief Configuration for the AI GUI control loop
 */
struct ControlLoopConfig {
  int max_iterations = 10;                  // Max attempts before giving up
  int screenshot_delay_ms = 500;            // Delay before taking screenshots
  bool enable_vision_verification = true;   // Use vision to verify actions
  bool enable_iterative_refinement = true;  // Retry with refined actions
  int max_retries_per_action = 3;           // Max retries for a single action
  std::string screenshots_dir = "/tmp/yaze/ai_gui_control";
};

/**
 * @struct ControlResult
 * @brief Result of AI-controlled GUI automation
 */
struct ControlResult {
  bool success = false;
  int iterations_performed = 0;
  std::vector<ai::AIAction> actions_executed;
  std::vector<VisionAnalysisResult> vision_analyses;
  std::vector<std::filesystem::path> screenshots_taken;
  std::string error_message;
  std::string final_state_description;
};

/**
 * @class AIGUIController
 * @brief High-level controller for AI-driven GUI automation with vision feedback
 *
 * This class implements the complete vision-guided control loop:
 *
 * 1. **Parse Command** → Natural language → AIActions
 * 2. **Take Screenshot** → Capture current GUI state
 * 3. **Analyze Vision** → Gemini analyzes screenshot
 * 4. **Execute Action** → Send gRPC command to GUI
 * 5. **Verify Success** → Compare before/after screenshots
 * 6. **Refine & Retry** → Adjust parameters if action failed
 * 7. **Repeat** → Until goal achieved or max iterations reached
 *
 * Example usage:
 * ```cpp
 * AIGUIController controller(gemini_service, gui_client);
 * controller.Initialize(config);
 *
 * auto result = controller.ExecuteCommand(
 *     "Place tile 0x42 at overworld position (5, 7)"
 * );
 *
 * if (result->success) {
 *   std::cout << "Success! Took " << result->iterations_performed
 *             << " iterations\n";
 * }
 * ```
 */
class AIGUIController {
 public:
  /**
   * @brief Construct controller with required services
   * @param gemini_service Gemini AI service for vision analysis
   * @param gui_client gRPC client for GUI automation
   */
  AIGUIController(GeminiAIService* gemini_service,
                  gui::GuiAutomationClient* gui_client);

  ~AIGUIController() = default;

  /**
   * @brief Initialize the controller with configuration
   */
  absl::Status Initialize(const ControlLoopConfig& config);

  /**
   * @brief Execute a natural language command with AI vision guidance
   * @param command Natural language command (e.g., "Place tile 0x42 at (5, 7)")
   * @return Result including success status and execution details
   */
  absl::StatusOr<ControlResult> ExecuteCommand(const std::string& command);

  /**
   * @brief Execute a sequence of pre-parsed actions
   * @param actions Vector of AI actions to execute
   * @return Result including success status
   */
  absl::StatusOr<ControlResult> ExecuteActions(
      const std::vector<ai::AIAction>& actions);

  /**
   * @brief Execute a single action with optional vision verification
   * @param action The action to execute
   * @param verify_with_vision Whether to use vision to verify success
   * @return Success status and vision analysis
   */
  absl::StatusOr<VisionAnalysisResult> ExecuteSingleAction(
      const AIAction& action,
      bool verify_with_vision = true);

  /**
   * @brief Analyze the current GUI state without executing actions
   * @param context What to look for in the GUI
   * @return Vision analysis of current state
   */
  absl::StatusOr<VisionAnalysisResult> AnalyzeCurrentGUIState(
      const std::string& context = "");

  /**
   * @brief Get the current configuration
   */
  const ControlLoopConfig& config() const { return config_; }

  /**
   * @brief Update configuration
   */
  void SetConfig(const ControlLoopConfig& config) { config_ = config; }

 private:
  GeminiAIService* gemini_service_;        // Not owned
  gui::GuiAutomationClient* gui_client_;   // Not owned
  std::unique_ptr<VisionActionRefiner> vision_refiner_;
  gui::GuiActionGenerator action_generator_;
  ControlLoopConfig config_;
  std::filesystem::path screenshots_dir_;

  // Helper methods
  absl::StatusOr<std::filesystem::path> CaptureCurrentState(
      const std::string& description);

  absl::Status ExecuteGRPCAction(const AIAction& action);

  absl::StatusOr<VisionAnalysisResult> VerifyActionSuccess(
      const AIAction& action,
      const std::filesystem::path& before_screenshot,
      const std::filesystem::path& after_screenshot);

  absl::StatusOr<AIAction> RefineActionWithVision(
      const AIAction& original_action,
      const VisionAnalysisResult& analysis);

  void EnsureScreenshotsDirectory();
  std::filesystem::path GenerateScreenshotPath(const std::string& suffix);
};

}  // namespace ai
}  // namespace cli
}  // namespace yaze

#endif  // YAZE_CLI_SERVICE_AI_AI_GUI_CONTROLLER_H_

353 src/cli/service/ai/vision_action_refiner.cc Normal file
@@ -0,0 +1,353 @@
#include "cli/service/ai/vision_action_refiner.h"

#include <algorithm>
#include <cctype>
#include <map>
#include <sstream>
#include <stdexcept>

#include "absl/strings/ascii.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "cli/service/ai/gemini_ai_service.h"

namespace yaze {
namespace cli {
namespace ai {

VisionActionRefiner::VisionActionRefiner(GeminiAIService* gemini_service)
    : gemini_service_(gemini_service) {
  if (!gemini_service_) {
    throw std::invalid_argument("Gemini service cannot be null");
  }
}

absl::StatusOr<VisionAnalysisResult> VisionActionRefiner::AnalyzeScreenshot(
    const std::filesystem::path& screenshot_path,
    const std::string& context) {

  if (!std::filesystem::exists(screenshot_path)) {
    return absl::NotFoundError(
        absl::StrCat("Screenshot not found: ", screenshot_path.string()));
  }

  std::string prompt = BuildAnalysisPrompt(context);

  auto response = gemini_service_->GenerateMultimodalResponse(
      screenshot_path.string(),
      prompt
  );

  if (!response.ok()) {
    return response.status();
  }

  return ParseAnalysisResponse(response->text_response);
}

absl::StatusOr<VisionAnalysisResult> VisionActionRefiner::VerifyAction(
    const AIAction& action,
    const std::filesystem::path& before_screenshot,
    const std::filesystem::path& after_screenshot) {

  if (!std::filesystem::exists(before_screenshot)) {
    return absl::NotFoundError("Before screenshot not found");
  }

  if (!std::filesystem::exists(after_screenshot)) {
    return absl::NotFoundError("After screenshot not found");
  }

  // First, analyze the after screenshot
  std::string verification_prompt = BuildVerificationPrompt(action);

  auto after_response = gemini_service_->GenerateMultimodalResponse(
      after_screenshot.string(),
      verification_prompt
  );

  if (!after_response.ok()) {
    return after_response.status();
  }

  return ParseVerificationResponse(after_response->text_response, action);
}

absl::StatusOr<ActionRefinement> VisionActionRefiner::RefineAction(
    const AIAction& original_action,
    const VisionAnalysisResult& analysis) {

  ActionRefinement refinement;

  // If action was successful, no refinement needed
  if (analysis.action_successful) {
    return refinement;
  }

  // Determine refinement strategy based on error
  std::string error_lower = analysis.error_message;
  std::transform(error_lower.begin(), error_lower.end(),
                 error_lower.begin(), ::tolower);

  if (error_lower.find("not found") != std::string::npos ||
      error_lower.find("missing") != std::string::npos) {
    refinement.needs_different_approach = true;
    refinement.reasoning = "UI element not found, may need to open different editor";
  }
  else if (error_lower.find("wrong") != std::string::npos ||
           error_lower.find("incorrect") != std::string::npos) {
    refinement.needs_retry = true;
    refinement.reasoning = "Action executed on wrong element, adjusting parameters";

    // Try to extract corrected parameters from suggestions
    for (const auto& suggestion : analysis.suggestions) {
      // Parse suggestions for parameter corrections
      // e.g., "Try position (6, 8) instead"
      if (suggestion.find("position") != std::string::npos) {
        // Extract coordinates
        size_t pos = suggestion.find('(');
        if (pos != std::string::npos) {
          size_t end = suggestion.find(')', pos);
          if (end != std::string::npos) {
            std::string coords = suggestion.substr(pos + 1, end - pos - 1);
            std::vector<std::string> parts = absl::StrSplit(coords, ',');
            if (parts.size() == 2) {
              refinement.adjusted_parameters["x"] =
                  absl::StripAsciiWhitespace(parts[0]);
              refinement.adjusted_parameters["y"] =
                  absl::StripAsciiWhitespace(parts[1]);
            }
          }
        }
      }
    }
  }
  else {
    refinement.needs_retry = true;
    refinement.reasoning = "Generic failure, will retry with same parameters";
  }

  return refinement;
}

absl::StatusOr<std::map<std::string, std::string>>
VisionActionRefiner::LocateUIElement(
    const std::filesystem::path& screenshot_path,
    const std::string& element_name) {

  std::string prompt = BuildElementLocationPrompt(element_name);

  auto response = gemini_service_->GenerateMultimodalResponse(
      screenshot_path.string(),
      prompt
  );

  if (!response.ok()) {
    return response.status();
  }

  std::map<std::string, std::string> location;

  // Parse location from response
  // Expected format: "The element is located at position (X, Y)"
  // or "The element is in the top-right corner"
  std::string text = response->text_response;
  std::transform(text.begin(), text.end(), text.begin(), ::tolower);

  if (text.find("not found") != std::string::npos ||
      text.find("not visible") != std::string::npos) {
    location["found"] = "false";
    location["description"] = response->text_response;
  } else {
    location["found"] = "true";
    location["description"] = response->text_response;

    // Try to extract coordinates
    size_t pos = text.find('(');
    if (pos != std::string::npos) {
      size_t end = text.find(')', pos);
      if (end != std::string::npos) {
        std::string coords = text.substr(pos + 1, end - pos - 1);
        std::vector<std::string> parts = absl::StrSplit(coords, ',');
        if (parts.size() == 2) {
          location["x"] = absl::StripAsciiWhitespace(parts[0]);
          location["y"] = absl::StripAsciiWhitespace(parts[1]);
        }
      }
    }
  }

  return location;
}

absl::StatusOr<std::vector<std::string>>
VisionActionRefiner::ExtractVisibleWidgets(
    const std::filesystem::path& screenshot_path) {

  std::string prompt = BuildWidgetExtractionPrompt();

  auto response = gemini_service_->GenerateMultimodalResponse(
      screenshot_path.string(),
      prompt
  );

  if (!response.ok()) {
    return response.status();
  }

  // Parse widget list from response
  std::vector<std::string> widgets;
  std::stringstream ss(response->text_response);
  std::string line;

  while (std::getline(ss, line)) {
    // Skip empty lines
    if (line.empty() || line.find_first_not_of(" \t\n\r") == std::string::npos) {
      continue;
    }

    // Remove list markers (-, *, 1., etc.)
    size_t start = 0;
    if (line[0] == '-' || line[0] == '*') {
      start = 1;
    } else if (std::isdigit(line[0])) {
      start = line.find('.');
      if (start != std::string::npos) {
        start++;
      } else {
        start = 0;
      }
    }

    absl::string_view widget_view = absl::StripAsciiWhitespace(
        absl::string_view(line).substr(start));

    if (!widget_view.empty()) {
      widgets.push_back(std::string(widget_view));
    }
  }

  return widgets;
}

// Private helper methods

std::string VisionActionRefiner::BuildAnalysisPrompt(const std::string& context) {
  std::string base_prompt =
      "Analyze this screenshot of the YAZE ROM editor GUI. "
      "Identify all visible UI elements, windows, and widgets. "
      "List them in order of importance.";

  if (!context.empty()) {
    return absl::StrCat(base_prompt, "\n\nContext: ", context);
  }

  return base_prompt;
}

std::string VisionActionRefiner::BuildVerificationPrompt(const AIAction& action) {
  std::string action_desc = AIActionParser::ActionToString(action);

  return absl::StrCat(
      "This screenshot was taken after attempting to perform the following action: ",
      action_desc,
      "\n\nDid the action succeed? Look for visual evidence that the action completed. "
      "Respond with:\n"
      "SUCCESS: <description of what changed>\n"
      "or\n"
      "FAILURE: <description of what went wrong>"
  );
}

std::string VisionActionRefiner::BuildElementLocationPrompt(
    const std::string& element_name) {
  return absl::StrCat(
      "Locate the '", element_name, "' UI element in this screenshot. "
      "If found, describe its position (coordinates if possible, or relative position). "
      "If not found, state 'NOT FOUND'."
  );
}

std::string VisionActionRefiner::BuildWidgetExtractionPrompt() {
  return
      "List all visible UI widgets, buttons, windows, and interactive elements "
      "in this screenshot. Format as a bulleted list, one element per line.";
}

VisionAnalysisResult VisionActionRefiner::ParseAnalysisResponse(
    const std::string& response) {

  VisionAnalysisResult result;
  result.description = response;

  // Extract widgets from description
  // Look for common patterns like "- Button", "1. Window", etc.
  std::stringstream ss(response);
  std::string line;

  while (std::getline(ss, line)) {
    // Check if line contains a widget mention
    std::string lower = line;
    std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower);

    if (lower.find("button") != std::string::npos ||
        lower.find("window") != std::string::npos ||
        lower.find("panel") != std::string::npos ||
        lower.find("selector") != std::string::npos ||
        lower.find("editor") != std::string::npos) {
      result.widgets.push_back(std::string(absl::StripAsciiWhitespace(line)));
    }

    // Extract suggestions
    if (lower.find("suggest") != std::string::npos ||
        lower.find("try") != std::string::npos ||
        lower.find("could") != std::string::npos) {
      result.suggestions.push_back(std::string(absl::StripAsciiWhitespace(line)));
    }
  }

  return result;
}

VisionAnalysisResult VisionActionRefiner::ParseVerificationResponse(
    const std::string& response,
    const AIAction& action) {

  VisionAnalysisResult result;
  result.description = response;

  std::string response_upper = response;
  std::transform(response_upper.begin(), response_upper.end(),
                 response_upper.begin(), ::toupper);

  if (response_upper.find("SUCCESS") != std::string::npos) {
    result.action_successful = true;

    // Extract success description
    size_t pos = response_upper.find("SUCCESS:");
    if (pos != std::string::npos) {
      std::string desc = response.substr(pos + 8);
      result.description = absl::StripAsciiWhitespace(desc);
    }
  }
  else if (response_upper.find("FAILURE") != std::string::npos) {
    result.action_successful = false;

    // Extract failure description
    size_t pos = response_upper.find("FAILURE:");
    if (pos != std::string::npos) {
      std::string desc = response.substr(pos + 8);
      result.error_message = absl::StripAsciiWhitespace(desc);
    } else {
      result.error_message = "Action failed (details in description)";
    }
  }
  else {
    // Ambiguous response, assume failure
    result.action_successful = false;
    result.error_message = "Could not determine action success from vision analysis";
  }

  return result;
}

}  // namespace ai
}  // namespace cli
}  // namespace yaze

155 src/cli/service/ai/vision_action_refiner.h Normal file
@@ -0,0 +1,155 @@
#ifndef YAZE_CLI_SERVICE_AI_VISION_ACTION_REFINER_H_
#define YAZE_CLI_SERVICE_AI_VISION_ACTION_REFINER_H_

#include <filesystem>
#include <map>
#include <string>
#include <vector>

#include "absl/status/statusor.h"
#include "cli/service/ai/ai_action_parser.h"

namespace yaze {
namespace cli {

// Forward declare
class GeminiAIService;

namespace ai {

/**
 * @struct VisionAnalysisResult
 * @brief Result of analyzing a screenshot with Gemini Vision
 */
struct VisionAnalysisResult {
  std::string description;               // What Gemini sees in the image
  std::vector<std::string> widgets;       // Detected UI widgets
  std::vector<std::string> suggestions;   // Action suggestions
  bool action_successful = false;         // Whether the last action succeeded
  std::string error_message;              // Error description if action failed
};

/**
 * @struct ActionRefinement
 * @brief Refined action parameters based on vision analysis
 */
struct ActionRefinement {
  bool needs_retry = false;
  bool needs_different_approach = false;
  std::map<std::string, std::string> adjusted_parameters;
  std::string reasoning;
};

/**
 * @class VisionActionRefiner
 * @brief Uses Gemini Vision to analyze GUI screenshots and refine AI actions
 *
 * This class implements the vision-guided action loop:
 * 1. Take screenshot of current GUI state
 * 2. Send to Gemini Vision with contextual prompt
 * 3. Analyze response to determine next action
 * 4. Verify action success by comparing screenshots
 *
 * Example usage:
 * ```cpp
 * VisionActionRefiner refiner(gemini_service);
 *
 * // Analyze current state
 * auto analysis = refiner.AnalyzeScreenshot(
 *     screenshot_path,
 *     "Looking for tile selector"
 * );
 *
 * // Verify action was successful
 * auto verification = refiner.VerifyAction(
 *     AIAction(AIActionType::kPlaceTile, {{"x", "5"}, {"y", "7"}}),
 *     before_screenshot,
 *     after_screenshot
 * );
 *
 * // Refine failed action
 * if (!verification->action_successful) {
 *   auto refinement = refiner.RefineAction(
 *       original_action,
 *       *verification
 *   );
 * }
 * ```
 */
class VisionActionRefiner {
 public:
  /**
   * @brief Construct refiner with Gemini service
   * @param gemini_service Pointer to Gemini AI service (not owned)
   */
  explicit VisionActionRefiner(GeminiAIService* gemini_service);

  /**
   * @brief Analyze the current GUI state from a screenshot
   * @param screenshot_path Path to screenshot file
   * @param context Additional context about what we're looking for
   * @return Vision analysis result
   */
  absl::StatusOr<VisionAnalysisResult> AnalyzeScreenshot(
      const std::filesystem::path& screenshot_path,
      const std::string& context = "");

  /**
   * @brief Verify an action was successful by comparing before/after screenshots
   * @param action The action that was performed
   * @param before_screenshot Screenshot before action
   * @param after_screenshot Screenshot after action
   * @return Analysis indicating whether action succeeded
   */
  absl::StatusOr<VisionAnalysisResult> VerifyAction(
      const AIAction& action,
      const std::filesystem::path& before_screenshot,
      const std::filesystem::path& after_screenshot);

  /**
   * @brief Refine an action based on vision analysis feedback
   * @param original_action The action that failed or needs adjustment
   * @param analysis Vision analysis showing why action failed
   * @return Refined action with adjusted parameters
   */
  absl::StatusOr<ActionRefinement> RefineAction(
      const AIAction& original_action,
      const VisionAnalysisResult& analysis);

  /**
   * @brief Find a specific UI element in a screenshot
   * @param screenshot_path Path to screenshot
   * @param element_name Name/description of element to find
   * @return Coordinates or description of where element is located
   */
  absl::StatusOr<std::map<std::string, std::string>> LocateUIElement(
      const std::filesystem::path& screenshot_path,
      const std::string& element_name);

  /**
   * @brief Extract all visible widgets from a screenshot
   * @param screenshot_path Path to screenshot
   * @return List of detected widgets with their properties
   */
  absl::StatusOr<std::vector<std::string>> ExtractVisibleWidgets(
      const std::filesystem::path& screenshot_path);

 private:
  GeminiAIService* gemini_service_;  // Not owned

  // Build prompts for different vision analysis tasks
  std::string BuildAnalysisPrompt(const std::string& context);
  std::string BuildVerificationPrompt(const AIAction& action);
  std::string BuildElementLocationPrompt(const std::string& element_name);
  std::string BuildWidgetExtractionPrompt();

  // Parse Gemini vision responses
  VisionAnalysisResult ParseAnalysisResponse(const std::string& response);
  VisionAnalysisResult ParseVerificationResponse(
      const std::string& response, const AIAction& action);
};

}  // namespace ai
}  // namespace cli
}  // namespace yaze

#endif  // YAZE_CLI_SERVICE_AI_VISION_ACTION_REFINER_H_
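The two classes are intended to compose as sketched below: `AIGUIController` drives the loop, while `VisionActionRefiner` supplies the verify-then-refine step. This is a standalone illustration of that step only, assuming `gemini_service`, a mutable `action`, and the before/after screenshot paths already exist; all object names here are placeholders:

```cpp
// Sketch of the verify-then-refine step that AIGUIController performs
// internally (see ExecuteActions above).
yaze::cli::ai::VisionActionRefiner refiner(gemini_service);

auto verification = refiner.VerifyAction(action, before_png, after_png);
if (verification.ok() && !verification->action_successful) {
  auto refinement = refiner.RefineAction(action, *verification);
  if (refinement.ok() && refinement->needs_retry) {
    // Apply the adjusted parameters, then re-execute the action.
    for (const auto& [key, value] : refinement->adjusted_parameters) {
      action.parameters[key] = value;
    }
  }
}
```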