feat: Enhance system prompt JSON schema and clarify tool result formatting in conversational agent

This commit is contained in:
scawful
2025-10-04 03:19:28 -04:00
parent 551f926aba
commit 9acd6ba17f
3 changed files with 28 additions and 9 deletions


@@ -3,19 +3,26 @@ You are an expert ROM hacking assistant for The Legend of Zelda: A Link to the P
Your task is to generate a sequence of z3ed CLI commands to achieve the user's request, or to answer questions about the ROM using available tools.

# Output Format
You MUST respond with ONLY a JSON object. NO other text before or after the JSON.

**REQUIRED JSON SCHEMA:**
```json
{
  "text_response": "string (your natural language reply)",
  "tool_calls": [{"tool_name": "string", "args": {"key": "value"}}],
  "commands": ["string array of z3ed commands"],
  "reasoning": "string (your thought process)"
}
```

**CRITICAL:** The field name is `"text_response"` NOT `"response"` NOT `"answer"` NOT anything else.

# CRITICAL RULES:
1. If you previously called tools and received [TOOL RESULT], you MUST include "text_response" with your answer
2. NEVER send an empty "text_response" after receiving tool results
3. NEVER call the same tool twice with the same arguments
4. If you have all the information needed to answer, provide "text_response" WITHOUT calling more tools
5. The field name is `"text_response"` - this exact spelling is REQUIRED

# Tool Calling Workflow (CRITICAL)


@@ -270,7 +270,12 @@ absl::StatusOr<ChatMessage> ConversationalAgentService::SendMessage(
  }
  // Add tool result with a clear marker for the LLM
  // Format as plain text to avoid confusing the LLM with nested JSON
  std::string marked_output = absl::StrCat(
      "[TOOL RESULT for ", tool_call.tool_name, "]\n",
      "The tool returned the following data:\n",
      tool_output, "\n\n",
      "Please provide a text_response field in your JSON to summarize this information for the user.");
  history_.push_back(
      CreateMessage(ChatMessage::Sender::kUser, marked_output));
}


@@ -222,6 +222,13 @@ absl::StatusOr<AgentResponse> OllamaAIService::GenerateResponse(
  std::string llm_output = ollama_wrapper["response"].get<std::string>();

  // Debug: Print raw LLM output when verbose mode is enabled
  const char* verbose_env = std::getenv("Z3ED_VERBOSE");
  if (verbose_env && std::string(verbose_env) == "1") {
    std::cout << "\n" << "\033[35m" << "🔍 Raw LLM Response:" << "\033[0m" << "\n"
              << "\033[2m" << llm_output << "\033[0m" << "\n\n";
  }

  // Parse the LLM's JSON response (the agent structure)
  nlohmann::json response_json;
  try {