yaze/src/cli/service/ai/ai_gui_controller.cc
scawful ec88f087a2 feat: Add AI GUI controller and vision action refiner for enhanced automation
- Introduced `AIGUIController` class to manage AI-driven GUI automation with vision feedback, enabling natural language command execution and iterative action refinement.
- Implemented `VisionActionRefiner` class to analyze screenshots and refine actions based on visual feedback, improving action success rates.
- Added header and implementation files for both classes, along with necessary methods for screenshot analysis, action verification, and UI element location.
- Updated CMake configuration to include new source files for the AI GUI controller and vision action refiner functionalities.
2025-10-04 23:09:59 -04:00
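
The controller's public surface in this file is small: construct it with a GeminiAIService and a GuiAutomationClient, call Initialize() with a ControlLoopConfig, then feed natural-language commands to ExecuteCommand(). A minimal usage sketch follows; the helper function RunNaturalLanguageCommand, the concrete config values, and the assumption that ControlLoopConfig exposes plain public fields (as their use below suggests) are illustrative, not part of this commit.

#include "absl/status/status.h"
#include "cli/service/ai/ai_gui_controller.h"

namespace yaze {
namespace cli {
namespace ai {

// Hypothetical helper: the GeminiAIService and GuiAutomationClient instances
// are created elsewhere; their constructors are not shown in this file.
absl::Status RunNaturalLanguageCommand(GeminiAIService* gemini,
                                       gui::GuiAutomationClient* gui_client) {
  // Constructor throws std::invalid_argument if either pointer is null.
  AIGUIController controller(gemini, gui_client);

  ControlLoopConfig config;
  config.screenshots_dir = "/tmp/yaze_ai_screenshots";  // illustrative values,
  config.max_iterations = 20;                           // not project defaults
  config.max_retries_per_action = 3;
  config.screenshot_delay_ms = 250;
  config.enable_vision_verification = true;
  config.enable_iterative_refinement = true;
  if (auto status = controller.Initialize(config); !status.ok()) {
    return status;
  }

  // The command is parsed into AIActions and executed with screenshot-based
  // verification and, on failure, iterative refinement.
  auto result = controller.ExecuteCommand("Open the Overworld editor");
  if (!result.ok()) {
    return result.status();
  }
  if (!result->success) {
    return absl::InternalError(result->error_message);
  }
  // On success, result->final_state_description summarizes the final GUI state.
  return absl::OkStatus();
}

}  // namespace ai
}  // namespace cli
}  // namespace yaze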


#include "cli/service/ai/ai_gui_controller.h"
#include <chrono>
#include <filesystem>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>
#include <system_error>
#include <thread>
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "cli/service/ai/gemini_ai_service.h"
#ifdef YAZE_WITH_GRPC
#include "cli/service/gui/gui_automation_client.h"
#include "app/core/service/screenshot_utils.h"
#endif
namespace yaze {
namespace cli {
namespace ai {
AIGUIController::AIGUIController(GeminiAIService* gemini_service,
                                 gui::GuiAutomationClient* gui_client)
    : gemini_service_(gemini_service),
      gui_client_(gui_client),
      vision_refiner_(std::make_unique<VisionActionRefiner>(gemini_service)) {
  if (!gemini_service_) {
    throw std::invalid_argument("Gemini service cannot be null");
  }
  if (!gui_client_) {
    throw std::invalid_argument("GUI client cannot be null");
  }
}

absl::Status AIGUIController::Initialize(const ControlLoopConfig& config) {
  config_ = config;
  screenshots_dir_ = config.screenshots_dir;
  EnsureScreenshotsDirectory();
  return absl::OkStatus();
}

absl::StatusOr<ControlResult> AIGUIController::ExecuteCommand(
    const std::string& command) {
  // Parse natural language command into actions
  auto actions_result = AIActionParser::ParseCommand(command);
  if (!actions_result.ok()) {
    return actions_result.status();
  }
  return ExecuteActions(*actions_result);
}
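
// Runs each action through a retry loop: every attempt executes the action
// (optionally with before/after screenshot verification), records the vision
// analysis, and on failure asks the VisionActionRefiner for adjusted
// parameters when iterative refinement is enabled. The loop aborts when
// max_iterations or max_retries_per_action is exceeded, or when the refiner
// reports that a different approach is needed.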
absl::StatusOr<ControlResult> AIGUIController::ExecuteActions(
    const std::vector<AIAction>& actions) {
  ControlResult result;
  result.success = false;

  for (const auto& action : actions) {
    int retry_count = 0;
    bool action_succeeded = false;
    AIAction current_action = action;

    while (retry_count < config_.max_retries_per_action && !action_succeeded) {
      result.iterations_performed++;
      if (result.iterations_performed > config_.max_iterations) {
        result.error_message = "Max iterations reached";
        return result;
      }

      // Execute the action with vision verification
      auto execute_result = ExecuteSingleAction(
          current_action, config_.enable_vision_verification);
      if (!execute_result.ok()) {
        result.error_message = std::string(execute_result.status().message());
        return result;
      }

      result.vision_analyses.push_back(*execute_result);
      result.actions_executed.push_back(current_action);

      if (execute_result->action_successful) {
        action_succeeded = true;
      } else if (config_.enable_iterative_refinement) {
        // Refine action and retry
        auto refinement =
            vision_refiner_->RefineAction(current_action, *execute_result);
        if (!refinement.ok()) {
          result.error_message =
              absl::StrCat("Failed to refine action: ",
                           refinement.status().message());
          return result;
        }
        if (refinement->needs_different_approach) {
          result.error_message =
              absl::StrCat("Action requires different approach: ",
                           refinement->reasoning);
          return result;
        }
        if (refinement->needs_retry) {
          // Update action parameters
          for (const auto& [key, value] : refinement->adjusted_parameters) {
            current_action.parameters[key] = value;
          }
        }
        retry_count++;
      } else {
        // No refinement, just fail
        result.error_message = execute_result->error_message;
        return result;
      }
    }

    if (!action_succeeded) {
      result.error_message =
          absl::StrFormat("Action failed after %d retries", retry_count);
      return result;
    }
  }

  result.success = true;

  // Capture final state
  auto final_screenshot = CaptureCurrentState("final_state");
  if (final_screenshot.ok()) {
    result.screenshots_taken.push_back(*final_screenshot);

    // Analyze final state
    auto final_analysis = vision_refiner_->AnalyzeScreenshot(
        *final_screenshot, "Verify all actions completed successfully");
    if (final_analysis.ok()) {
      result.final_state_description = final_analysis->description;
    }
  }

  return result;
}
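
// Executes one action via gRPC. When vision verification is enabled, a
// screenshot is captured before and after the action and the pair is handed
// to the refiner for verification; otherwise the action is assumed to have
// succeeded.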
absl::StatusOr<VisionAnalysisResult> AIGUIController::ExecuteSingleAction(
    const AIAction& action, bool verify_with_vision) {
  VisionAnalysisResult result;

  // Capture before screenshot
  std::filesystem::path before_screenshot;
  if (verify_with_vision) {
    auto before_result = CaptureCurrentState("before_action");
    if (!before_result.ok()) {
      return before_result.status();
    }
    before_screenshot = *before_result;
  }

  // Wait for UI to settle
  if (config_.screenshot_delay_ms > 0) {
    std::this_thread::sleep_for(
        std::chrono::milliseconds(config_.screenshot_delay_ms));
  }

  // Execute the action via gRPC
  auto execute_status = ExecuteGRPCAction(action);
  if (!execute_status.ok()) {
    result.action_successful = false;
    result.error_message = std::string(execute_status.message());
    return result;
  }

  // Wait for action to complete
  std::this_thread::sleep_for(
      std::chrono::milliseconds(config_.screenshot_delay_ms));

  if (verify_with_vision) {
    // Capture after screenshot
    auto after_result = CaptureCurrentState("after_action");
    if (!after_result.ok()) {
      return after_result.status();
    }
    // Verify with vision
    return VerifyActionSuccess(action, before_screenshot, *after_result);
  } else {
    // Assume success without verification
    result.action_successful = true;
    result.description = "Action executed (no vision verification)";
    return result;
  }
}

absl::StatusOr<VisionAnalysisResult> AIGUIController::AnalyzeCurrentGUIState(
    const std::string& context) {
  auto screenshot = CaptureCurrentState("analysis");
  if (!screenshot.ok()) {
    return screenshot.status();
  }
  return vision_refiner_->AnalyzeScreenshot(*screenshot, context);
}

// Private helper methods

absl::StatusOr<std::filesystem::path> AIGUIController::CaptureCurrentState(
    const std::string& description) {
#ifdef YAZE_WITH_GRPC
  std::filesystem::path path = GenerateScreenshotPath(description);
  auto result = yaze::test::CaptureHarnessScreenshot(path.string());
  if (!result.ok()) {
    return result.status();
  }
  return std::filesystem::path(result->file_path);
#else
  return absl::UnimplementedError("Screenshot capture requires gRPC support");
#endif
}
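
// Maps an AIAction onto GuiAutomationClient calls. The generated JSON command
// strings are not parsed yet (placeholder); dispatch is done directly on
// action.type for kClickButton, kPlaceTile, and kWait, and unrecognized action
// types fall through without error.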
absl::Status AIGUIController::ExecuteGRPCAction(const AIAction& action) {
  // Convert AI action to gRPC test commands
  auto grpc_commands = action_generator_.GenerateGRPCCommands({action});
  if (grpc_commands.empty()) {
    return absl::InternalError("No gRPC commands generated for action");
  }

  // Execute each command
  for (const auto& command_json : grpc_commands) {
    // Parse JSON and execute via GUI client
    // This is a placeholder - actual implementation would parse JSON
    // and call appropriate GUI client methods
    if (action.type == AIActionType::kClickButton) {
      auto button_it = action.parameters.find("button");
      if (button_it != action.parameters.end()) {
        auto status = gui_client_->ClickButton(button_it->second);
        if (!status.ok()) {
          return status;
        }
      }
    } else if (action.type == AIActionType::kPlaceTile) {
      // Extract parameters
      auto x_it = action.parameters.find("x");
      auto y_it = action.parameters.find("y");
      auto tile_it = action.parameters.find("tile_id");
      if (x_it != action.parameters.end() &&
          y_it != action.parameters.end() &&
          tile_it != action.parameters.end()) {
        int x = std::stoi(x_it->second);
        int y = std::stoi(y_it->second);
        int tile_id = std::stoi(tile_it->second);
        // Use GUI client to place tile
        // (This would need actual implementation in GuiAutomationClient)
        auto status = gui_client_->ExecuteTestScript(
            absl::StrFormat("PlaceTile(%d, %d, %d)", x, y, tile_id));
        if (!status.ok()) {
          return status;
        }
      }
    } else if (action.type == AIActionType::kWait) {
      int wait_ms = config_.screenshot_delay_ms;
      auto wait_it = action.parameters.find("duration_ms");
      if (wait_it != action.parameters.end()) {
        wait_ms = std::stoi(wait_it->second);
      }
      std::this_thread::sleep_for(std::chrono::milliseconds(wait_ms));
    }
  }
  return absl::OkStatus();
}
absl::StatusOr<VisionAnalysisResult> AIGUIController::VerifyActionSuccess(
    const AIAction& action,
    const std::filesystem::path& before_screenshot,
    const std::filesystem::path& after_screenshot) {
  return vision_refiner_->VerifyAction(action, before_screenshot,
                                       after_screenshot);
}

absl::StatusOr<AIAction> AIGUIController::RefineActionWithVision(
    const AIAction& original_action,
    const VisionAnalysisResult& analysis) {
  auto refinement = vision_refiner_->RefineAction(original_action, analysis);
  if (!refinement.ok()) {
    return refinement.status();
  }

  AIAction refined_action = original_action;
  // Apply adjusted parameters
  for (const auto& [key, value] : refinement->adjusted_parameters) {
    refined_action.parameters[key] = value;
  }
  return refined_action;
}

void AIGUIController::EnsureScreenshotsDirectory() {
  std::error_code ec;
  std::filesystem::create_directories(screenshots_dir_, ec);
  if (ec) {
    std::cerr << "Warning: Failed to create screenshots directory: "
              << ec.message() << std::endl;
  }
}

std::filesystem::path AIGUIController::GenerateScreenshotPath(
    const std::string& suffix) {
  int64_t timestamp = absl::ToUnixMillis(absl::Now());
  std::string filename = absl::StrFormat(
      "ai_gui_%s_%lld.png", suffix, static_cast<long long>(timestamp));
  return screenshots_dir_ / filename;
}
} // namespace ai
} // namespace cli
} // namespace yaze