Files
yaze/src/cli/service/ai/ai_gui_controller.h
scawful ec88f087a2 feat: Add AI GUI controller and vision action refiner for enhanced automation
- Introduced `AIGUIController` class to manage AI-driven GUI automation with vision feedback, enabling natural language command execution and iterative action refinement.
- Implemented `VisionActionRefiner` class to analyze screenshots and refine actions based on visual feedback, improving action success rates.
- Added header and implementation files for both classes, along with necessary methods for screenshot analysis, action verification, and UI element location.
- Updated CMake configuration to include new source files for the AI GUI controller and vision action refiner functionalities.
2025-10-04 23:09:59 -04:00

174 lines
5.3 KiB
C++

#ifndef YAZE_CLI_SERVICE_AI_AI_GUI_CONTROLLER_H_
#define YAZE_CLI_SERVICE_AI_AI_GUI_CONTROLLER_H_
#include <filesystem>
#include <memory>
#include <string>
#include <vector>
#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "cli/service/ai/ai_action_parser.h"
#include "cli/service/ai/vision_action_refiner.h"
#include "cli/service/gui/gui_action_generator.h"
namespace yaze {
namespace cli {
// Forward declares
class GeminiAIService;
namespace gui {
class GuiAutomationClient;
}
namespace ai {
/**
* @struct ControlLoopConfig
* @brief Configuration for the AI GUI control loop
*/
struct ControlLoopConfig {
int max_iterations = 10; // Max attempts before giving up
int screenshot_delay_ms = 500; // Delay before taking screenshots
bool enable_vision_verification = true; // Use vision to verify actions
bool enable_iterative_refinement = true; // Retry with refined actions
int max_retries_per_action = 3; // Max retries for a single action
std::string screenshots_dir = "/tmp/yaze/ai_gui_control";
};
/**
* @struct ControlResult
* @brief Result of AI-controlled GUI automation
*/
struct ControlResult {
bool success = false;
int iterations_performed = 0;
std::vector<ai::AIAction> actions_executed;
std::vector<VisionAnalysisResult> vision_analyses;
std::vector<std::filesystem::path> screenshots_taken;
std::string error_message;
std::string final_state_description;
};
/**
* @class AIGUIController
* @brief High-level controller for AI-driven GUI automation with vision feedback
*
* This class implements the complete vision-guided control loop:
*
* 1. **Parse Command** → Natural language → AIActions
* 2. **Take Screenshot** → Capture current GUI state
* 3. **Analyze Vision** → Gemini analyzes screenshot
* 4. **Execute Action** → Send gRPC command to GUI
* 5. **Verify Success** → Compare before/after screenshots
* 6. **Refine & Retry** → Adjust parameters if action failed
* 7. **Repeat** → Until goal achieved or max iterations reached
*
* Example usage:
* ```cpp
* AIGUIController controller(gemini_service, gui_client);
* controller.Initialize(config);
*
* auto result = controller.ExecuteCommand(
* "Place tile 0x42 at overworld position (5, 7)"
* );
*
* if (result->success) {
* std::cout << "Success! Took " << result->iterations_performed
* << " iterations\n";
* }
* ```
*/
class AIGUIController {
public:
/**
* @brief Construct controller with required services
* @param gemini_service Gemini AI service for vision analysis
* @param gui_client gRPC client for GUI automation
*/
AIGUIController(GeminiAIService* gemini_service,
gui::GuiAutomationClient* gui_client);
~AIGUIController() = default;
/**
* @brief Initialize the controller with configuration
*/
absl::Status Initialize(const ControlLoopConfig& config);
/**
* @brief Execute a natural language command with AI vision guidance
* @param command Natural language command (e.g., "Place tile 0x42 at (5, 7)")
* @return Result including success status and execution details
*/
absl::StatusOr<ControlResult> ExecuteCommand(const std::string& command);
/**
* @brief Execute a sequence of pre-parsed actions
* @param actions Vector of AI actions to execute
* @return Result including success status
*/
absl::StatusOr<ControlResult> ExecuteActions(
const std::vector<ai::AIAction>& actions);
/**
* @brief Execute a single action with optional vision verification
* @param action The action to execute
* @param verify_with_vision Whether to use vision to verify success
* @return Success status and vision analysis
*/
absl::StatusOr<VisionAnalysisResult> ExecuteSingleAction(
const AIAction& action,
bool verify_with_vision = true);
/**
* @brief Analyze the current GUI state without executing actions
* @param context What to look for in the GUI
* @return Vision analysis of current state
*/
absl::StatusOr<VisionAnalysisResult> AnalyzeCurrentGUIState(
const std::string& context = "");
/**
* @brief Get the current configuration
*/
const ControlLoopConfig& config() const { return config_; }
/**
* @brief Update configuration
*/
void SetConfig(const ControlLoopConfig& config) { config_ = config; }
private:
GeminiAIService* gemini_service_; // Not owned
gui::GuiAutomationClient* gui_client_; // Not owned
std::unique_ptr<VisionActionRefiner> vision_refiner_;
gui::GuiActionGenerator action_generator_;
ControlLoopConfig config_;
std::filesystem::path screenshots_dir_;
// Helper methods
absl::StatusOr<std::filesystem::path> CaptureCurrentState(
const std::string& description);
absl::Status ExecuteGRPCAction(const AIAction& action);
absl::StatusOr<VisionAnalysisResult> VerifyActionSuccess(
const AIAction& action,
const std::filesystem::path& before_screenshot,
const std::filesystem::path& after_screenshot);
absl::StatusOr<AIAction> RefineActionWithVision(
const AIAction& original_action,
const VisionAnalysisResult& analysis);
void EnsureScreenshotsDirectory();
std::filesystem::path GenerateScreenshotPath(const std::string& suffix);
};
} // namespace ai
} // namespace cli
} // namespace yaze
#endif // YAZE_CLI_SERVICE_AI_AI_GUI_CONTROLLER_H_