feat: Add collaborative chat sessions and multimodal vision support in Z3ED

This commit is contained in:
scawful
2025-10-04 16:56:43 -04:00
parent 0cc420e53e
commit 59ef5fb8bf
6 changed files with 365 additions and 13 deletions

View File

@@ -46,6 +46,10 @@
#include "app/emu/emulator.h"
#include "app/gfx/performance_dashboard.h"
#include "editor/editor.h"
#ifdef YAZE_WITH_GRPC
#include "app/core/service/screenshot_utils.h"
#include "cli/service/ai/gemini_ai_service.h"
#endif
#include "imgui/imgui.h"
#include "imgui/misc/cpp/imgui_stdlib.h"
#include "util/log.h"
@@ -239,6 +243,10 @@ void EditorManager::Initialize(const std::string& filename) {
context.session_id = session.session_id;
context.session_name = session.session_name;
context.participants = session.participants;
// Switch to shared chat history for this session
agent_chat_widget_.SwitchToSharedHistory(session.session_id);
return context;
};
collab_callbacks.join_session =
@@ -250,10 +258,21 @@ void EditorManager::Initialize(const std::string& filename) {
context.session_id = session.session_id;
context.session_name = session.session_name;
context.participants = session.participants;
// Switch to shared chat history for this session
agent_chat_widget_.SwitchToSharedHistory(session.session_id);
return context;
};
collab_callbacks.leave_session =
[this]() { return collaboration_coordinator_.LeaveSession(); };
[this]() {
absl::Status status = collaboration_coordinator_.LeaveSession();
if (status.ok()) {
// Switch back to local chat history
agent_chat_widget_.SwitchToLocalHistory();
}
return status;
};
collab_callbacks.refresh_session =
[this]() -> absl::StatusOr<AgentChatWidget::CollaborationCallbacks::SessionContext> {
ASSIGN_OR_RETURN(auto session, collaboration_coordinator_.RefreshSession());
@@ -264,6 +283,53 @@ void EditorManager::Initialize(const std::string& filename) {
return context;
};
agent_chat_widget_.SetCollaborationCallbacks(collab_callbacks);
// Set up multimodal (vision) callbacks for Gemini
AgentChatWidget::MultimodalCallbacks multimodal_callbacks;
multimodal_callbacks.capture_snapshot =
[](std::filesystem::path* output_path) -> absl::Status {
auto result = yaze::test::CaptureHarnessScreenshot("");
if (!result.ok()) {
return result.status();
}
*output_path = result->file_path;
return absl::OkStatus();
};
multimodal_callbacks.send_to_gemini =
[this](const std::filesystem::path& image_path,
const std::string& prompt) -> absl::Status {
// Get Gemini API key from environment
const char* api_key = std::getenv("GEMINI_API_KEY");
if (!api_key || std::strlen(api_key) == 0) {
return absl::FailedPreconditionError(
"GEMINI_API_KEY environment variable not set");
}
// Create Gemini service
cli::GeminiConfig config;
config.api_key = api_key;
config.model = "gemini-2.0-flash-exp"; // Use vision-capable model
config.verbose = false;
cli::GeminiAIService gemini_service(config);
// Generate multimodal response
auto response =
gemini_service.GenerateMultimodalResponse(image_path.string(), prompt);
if (!response.ok()) {
return response.status();
}
// Add the response to chat history
cli::agent::ChatMessage agent_msg;
agent_msg.sender = cli::agent::ChatMessage::Sender::kAgent;
agent_msg.message = response->text_response;
agent_msg.timestamp = absl::Now();
agent_chat_widget_.SetRomContext(current_rom_);
return absl::OkStatus();
};
agent_chat_widget_.SetMultimodalCallbacks(multimodal_callbacks);
#endif
// Load critical user settings first

View File

@@ -40,12 +40,19 @@ std::filesystem::path ExpandUserPath(std::string path) {
return std::filesystem::path(path);
}
std::filesystem::path ResolveHistoryPath() {
// Resolves the on-disk location of the agent chat history file.
// A non-empty `session_id` selects the shared per-session file under
// "agent/sessions/"; otherwise the user's local "agent/chat_history.json".
std::filesystem::path ResolveHistoryPath(const std::string& session_id = "") {
  std::filesystem::path root = ExpandUserPath(yaze::core::GetConfigDirectory());
  if (root.empty()) {
    // No config directory available — fall back to a dot-directory.
    root = ExpandUserPath(".yaze");
  }
  const std::filesystem::path agent_dir = root / "agent";
  return session_id.empty()
             ? agent_dir / "chat_history.json"
             : agent_dir / "sessions" / (session_id + "_history.json");
}
@@ -802,5 +809,45 @@ void AgentChatWidget::MarkHistoryDirty() {
}
}
// Redirects chat persistence to the session's shared history file and
// reloads messages from it. Unsaved local history is flushed first so the
// switch loses nothing.
void AgentChatWidget::SwitchToSharedHistory(const std::string& session_id) {
  // Flush pending local messages before re-targeting the history path.
  const bool has_unsaved_changes = history_loaded_ && history_dirty_;
  if (has_unsaved_changes) {
    PersistHistory();
  }

  // Point persistence at the shared file and force a reload from disk.
  history_path_ = ResolveHistoryPath(session_id);
  history_loaded_ = false;
  EnsureHistoryLoaded();

  if (toast_manager_ != nullptr) {
    toast_manager_->Show(
        absl::StrFormat("Switched to shared chat history for session %s",
                        session_id),
        ToastType::kInfo, 3.0f);
  }
}
// Restores chat persistence to the user's local history file and reloads it.
// Unsaved shared-session messages are flushed before switching.
void AgentChatWidget::SwitchToLocalHistory() {
  // Flush any pending shared-session messages first.
  const bool has_unsaved_changes = history_loaded_ && history_dirty_;
  if (has_unsaved_changes) {
    PersistHistory();
  }

  // An empty session id resolves to the local history path; force a reload.
  history_path_ = ResolveHistoryPath("");
  history_loaded_ = false;
  EnsureHistoryLoaded();

  if (toast_manager_ != nullptr) {
    toast_manager_->Show("Switched to local chat history",
                         ToastType::kInfo, 3.0f);
  }
}
} // namespace editor
} // namespace yaze

View File

@@ -80,6 +80,8 @@ class AgentChatWidget {
void EnsureHistoryLoaded();
void PersistHistory();
void SwitchToSharedHistory(const std::string& session_id);
void SwitchToLocalHistory();
void RenderHistory();
void RenderMessage(const cli::agent::ChatMessage& msg, int index);
void RenderProposalQuickActions(const cli::agent::ChatMessage& msg,

View File

@@ -43,7 +43,7 @@ namespace yaze {
namespace cli {
GeminiAIService::GeminiAIService(const GeminiConfig& config)
: config_(config), function_calling_enabled_(config.use_function_calling) {
: function_calling_enabled_(config.use_function_calling), config_(config) {
if (config_.verbose) {
std::cerr << "[DEBUG] Initializing Gemini service..." << std::endl;
std::cerr << "[DEBUG] Function calling: " << (function_calling_enabled_ ? "enabled" : "disabled") << std::endl;
@@ -533,5 +533,182 @@ absl::StatusOr<AgentResponse> GeminiAIService::ParseGeminiResponse(
#endif
}
// Reads the file at `image_path` and returns its contents encoded as
// standard (RFC 4648) base64 with '=' padding, suitable for a Gemini
// `inline_data` payload.
//
// Returns:
//   - NotFoundError if the file cannot be opened.
//   - InternalError if the file size cannot be determined or the read fails.
absl::StatusOr<std::string> GeminiAIService::EncodeImageToBase64(
    const std::string& image_path) const {
#ifndef YAZE_WITH_JSON
  (void)image_path;  // Suppress unused parameter warning
  return absl::UnimplementedError(
      "Gemini AI service requires JSON support. Build with -DYAZE_WITH_JSON=ON");
#else
  std::ifstream file(image_path, std::ios::binary);
  if (!file.is_open()) {
    return absl::NotFoundError(
        absl::StrCat("Failed to open image file: ", image_path));
  }

  // Determine the size up front so the whole image can be read in one call.
  file.seekg(0, std::ios::end);
  const std::streampos end_pos = file.tellg();
  if (end_pos < 0) {
    // tellg() reports -1 on failure; converting that to size_t would have
    // requested an enormous allocation below.
    return absl::InternalError(
        absl::StrCat("Failed to determine size of image file: ", image_path));
  }
  const size_t size = static_cast<size_t>(end_pos);
  file.seekg(0, std::ios::beg);

  std::vector<unsigned char> buffer(size);
  if (size > 0 &&
      !file.read(reinterpret_cast<char*>(buffer.data()), size)) {
    return absl::InternalError("Failed to read image file");
  }

  // Standard base64 alphabet (RFC 4648).
  static const char kBase64Chars[] =
      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

  std::string encoded;
  encoded.reserve(((size + 2) / 3) * 4);  // 4 output chars per 3 input bytes

  // Encode every complete 3-byte group into 4 output characters.
  size_t idx = 0;
  for (; idx + 3 <= size; idx += 3) {
    const unsigned int triple = (buffer[idx] << 16) |
                                (buffer[idx + 1] << 8) |
                                buffer[idx + 2];
    encoded += kBase64Chars[(triple >> 18) & 0x3f];
    encoded += kBase64Chars[(triple >> 12) & 0x3f];
    encoded += kBase64Chars[(triple >> 6) & 0x3f];
    encoded += kBase64Chars[triple & 0x3f];
  }

  // Encode the trailing 1- or 2-byte remainder with '=' padding.
  const size_t remainder = size - idx;
  if (remainder == 1) {
    const unsigned int triple = buffer[idx] << 16;
    encoded += kBase64Chars[(triple >> 18) & 0x3f];
    encoded += kBase64Chars[(triple >> 12) & 0x3f];
    encoded += "==";
  } else if (remainder == 2) {
    const unsigned int triple = (buffer[idx] << 16) | (buffer[idx + 1] << 8);
    encoded += kBase64Chars[(triple >> 18) & 0x3f];
    encoded += kBase64Chars[(triple >> 12) & 0x3f];
    encoded += kBase64Chars[(triple >> 6) & 0x3f];
    encoded += '=';
  }

  return encoded;
#endif
}
// Sends `prompt` together with the image at `image_path` to the Gemini
// generateContent REST endpoint and returns the parsed model response.
//
// The image is base64-encoded into an `inline_data` part; its MIME type is
// guessed from the file extension (image/png unless stated otherwise). The
// HTTP request is issued by shelling out to `curl`.
//
// Returns:
//   - FailedPreconditionError if no API key is configured.
//   - InvalidArgumentError if the key/model would break shell quoting.
//   - NotFoundError / InternalError from image encoding or the HTTP call.
absl::StatusOr<AgentResponse> GeminiAIService::GenerateMultimodalResponse(
    const std::string& image_path, const std::string& prompt) {
#ifndef YAZE_WITH_JSON
  (void)image_path;  // Suppress unused parameter warnings
  (void)prompt;
  return absl::UnimplementedError(
      "Gemini AI service requires JSON support. Build with -DYAZE_WITH_JSON=ON");
#else
  if (config_.api_key.empty()) {
    return absl::FailedPreconditionError("Gemini API key not configured");
  }

  // The key and model name are interpolated into a single-quoted shell
  // command below; a stray quote would let the value escape its argument.
  if (config_.api_key.find('\'') != std::string::npos ||
      config_.model.find('\'') != std::string::npos) {
    return absl::InvalidArgumentError(
        "Gemini API key and model must not contain single-quote characters");
  }

  // Determine MIME type from file extension
  std::string mime_type = "image/png";
  if (image_path.ends_with(".jpg") || image_path.ends_with(".jpeg")) {
    mime_type = "image/jpeg";
  } else if (image_path.ends_with(".bmp")) {
    mime_type = "image/bmp";
  } else if (image_path.ends_with(".webp")) {
    mime_type = "image/webp";
  }

  // Encode image to base64 for the inline_data part
  auto encoded_or = EncodeImageToBase64(image_path);
  if (!encoded_or.ok()) {
    return encoded_or.status();
  }
  std::string encoded_image = std::move(encoded_or.value());

  try {
    if (config_.verbose) {
      std::cerr << "[DEBUG] Preparing multimodal request with image" << std::endl;
    }

    // Build a multimodal request: one image part followed by the text prompt.
    nlohmann::json request_body = {
      {"contents", {{
        {"parts", {
          {
            {"inline_data", {
              {"mime_type", mime_type},
              {"data", encoded_image}
            }}
          },
          {{"text", prompt}}
        }}
      }}},
      {"generationConfig", {
        {"temperature", config_.temperature},
        {"maxOutputTokens", config_.max_output_tokens}
      }}
    };

    // Write the request body to a temp file so curl can stream it via -d @.
    // NOTE(review): fixed, predictable path — concurrent requests (or another
    // local user) could collide with or tamper with it; consider mkstemp().
    std::string temp_file = "/tmp/gemini_multimodal_request.json";
    {
      std::ofstream out(temp_file);
      if (!out.is_open()) {
        return absl::InternalError(
            absl::StrCat("Failed to write request file: ", temp_file));
      }
      out << request_body.dump();
    }

    // Issue the request via curl. The API key travels in a header rather
    // than the URL, but it is still visible in the process list (`ps`).
    std::string endpoint = "https://generativelanguage.googleapis.com/v1beta/models/" +
                           config_.model + ":generateContent";
    std::string curl_cmd = "curl -s -X POST '" + endpoint + "' "
                           "-H 'Content-Type: application/json' "
                           "-H 'x-goog-api-key: " + config_.api_key + "' "
                           "-d @" + temp_file + " 2>&1";

    if (config_.verbose) {
      std::cerr << "[DEBUG] Executing multimodal API request..." << std::endl;
    }

    FILE* pipe = popen(curl_cmd.c_str(), "r");
    if (!pipe) {
      // Clean up the request file — it previously leaked on this path.
      std::remove(temp_file.c_str());
      return absl::InternalError("Failed to execute curl command");
    }

    std::string response_str;
    char buffer[4096];
    while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
      response_str += buffer;
    }

    // pclose() yields the shell's wait status; non-zero means curl failed.
    int status = pclose(pipe);
    std::remove(temp_file.c_str());

    if (status != 0) {
      return absl::InternalError(absl::StrCat("Curl failed with status ", status));
    }
    if (response_str.empty()) {
      return absl::InternalError("Empty response from Gemini API");
    }

    if (config_.verbose) {
      std::cout << "\n" << "\033[35m" << "🔍 Raw Gemini Multimodal Response:" << "\033[0m" << "\n"
                << "\033[2m" << response_str.substr(0, 500) << "\033[0m" << "\n\n";
    }

    return ParseGeminiResponse(response_str);
  } catch (const std::exception& e) {
    if (config_.verbose) {
      std::cerr << "[ERROR] Exception: " << e.what() << std::endl;
    }
    return absl::InternalError(absl::StrCat("Exception during multimodal generation: ", e.what()));
  }
#endif
}
} // namespace cli
} // namespace yaze

View File

@@ -45,12 +45,20 @@ class GeminiAIService : public AIService {
void EnableFunctionCalling(bool enable = true);
std::vector<std::string> GetAvailableTools() const;
// Multimodal support (vision + text)
absl::StatusOr<AgentResponse> GenerateMultimodalResponse(
const std::string& image_path, const std::string& prompt);
private:
std::string BuildSystemInstruction();
std::string BuildFunctionCallSchemas();
absl::StatusOr<AgentResponse> ParseGeminiResponse(
const std::string& response_body);
// Helper for encoding images as base64
absl::StatusOr<std::string> EncodeImageToBase64(
const std::string& image_path) const;
bool function_calling_enabled_ = true;
GeminiConfig config_;