diff --git a/src/app/editor/message/message_data.cc b/src/app/editor/message/message_data.cc index 5e34d4d0..ade53419 100644 --- a/src/app/editor/message/message_data.cc +++ b/src/app/editor/message/message_data.cc @@ -95,9 +95,10 @@ std::string ParseTextDataByte(uint8_t value) { } // Check for dictionary. - int dictionary = FindDictionaryEntry(value); + int8_t dictionary = FindDictionaryEntry(value); if (dictionary >= 0) { - return absl::StrFormat("[%s:%02X]", DICTIONARYTOKEN, dictionary); + return absl::StrFormat("[%s:%02X]", DICTIONARYTOKEN, + static_cast(dictionary)); } return ""; @@ -105,7 +106,7 @@ std::string ParseTextDataByte(uint8_t value) { std::vector ParseMessageToData(std::string str) { std::vector bytes; - std::string temp_string = str; + std::string temp_string = std::move(str); int pos = 0; while (pos < temp_string.size()) { // Get next text fragment. @@ -181,8 +182,8 @@ std::vector BuildDictionaryEntries(Rom* rom) { } std::string ReplaceAllDictionaryWords(std::string str, - std::vector dictionary) { - std::string temp = str; + const std::vector& dictionary) { + std::string temp = std::move(str); for (const auto& entry : dictionary) { if (entry.ContainedInString(temp)) { temp = entry.ReplaceInstancesOfIn(temp); @@ -192,7 +193,7 @@ std::string ReplaceAllDictionaryWords(std::string str, } DictionaryEntry FindRealDictionaryEntry( - uint8_t value, std::vector dictionary) { + uint8_t value, const std::vector& dictionary) { for (const auto& entry : dictionary) { if (entry.ID + DICTOFF == value) { return entry; @@ -245,12 +246,12 @@ absl::StatusOr ParseSingleMessage( } // Check for dictionary. 
- int dictionary = FindDictionaryEntry(current_byte); + int8_t dictionary = FindDictionaryEntry(current_byte); if (dictionary >= 0) { current_message_raw.append("["); current_message_raw.append(DICTIONARYTOKEN); current_message_raw.append(":"); - current_message_raw.append(util::HexWord(dictionary)); + current_message_raw.append(util::HexWord(static_cast(dictionary))); current_message_raw.append("]"); auto mutable_rom_data = const_cast(rom_data.data()); @@ -288,44 +289,53 @@ std::vector ParseMessageData( for (auto& message : message_data) { std::string parsed_message = ""; - int pos = 0; - for (const uint8_t& byte : message.Data) { - if (CharEncoder.contains(byte)) { - parsed_message.push_back(CharEncoder.at(byte)); - } else { - if (byte >= DICTOFF && byte < (DICTOFF + 97)) { - DictionaryEntry dic_entry; - for (const auto& entry : dictionary_entries) { - if (entry.ID == byte - DICTOFF) { - dic_entry = entry; - break; - } - } - parsed_message.append(dic_entry.Contents); + // Use index-based loop to properly skip argument bytes + for (size_t pos = 0; pos < message.Data.size(); ++pos) { + uint8_t byte = message.Data[pos]; + + // Check for text commands first (they may have arguments to skip) + auto text_element = FindMatchingCommand(byte); + if (text_element != std::nullopt) { + // Add newline for certain commands + if (text_element->ID == kScrollVertical || + text_element->ID == kLine2 || text_element->ID == kLine3) { + parsed_message.append("\n"); + } + // If command has an argument, get it from next byte and skip it + if (text_element->HasArgument && pos + 1 < message.Data.size()) { + uint8_t arg_byte = message.Data[pos + 1]; + parsed_message.append(text_element->GetParamToken(arg_byte)); + pos++; // Skip the argument byte } else { - auto text_element = FindMatchingCommand(byte); - if (text_element != std::nullopt) { - if (text_element->ID == kScrollVertical || - text_element->ID == kLine2 || text_element->ID == kLine3) { - parsed_message.append("\n"); - } - // If 
there is a param, add it to the message using GetParamToken. - if (text_element->HasArgument) { - // The next byte is the param. - parsed_message.append( - text_element->GetParamToken(message.Data[pos + 1])); - pos++; - } else { - parsed_message.append(text_element->GetParamToken()); - } - } - auto special_element = FindMatchingSpecial(byte); - if (special_element != std::nullopt) { - parsed_message.append(special_element->GetParamToken()); + parsed_message.append(text_element->GetParamToken()); + } + continue; // Move to next byte + } + + // Check for special characters + auto special_element = FindMatchingSpecial(byte); + if (special_element != std::nullopt) { + parsed_message.append(special_element->GetParamToken()); + continue; + } + + // Check for dictionary entries + if (byte >= DICTOFF && byte < (DICTOFF + 97)) { + DictionaryEntry dic_entry; + for (const auto& entry : dictionary_entries) { + if (entry.ID == byte - DICTOFF) { + dic_entry = entry; + break; } } + parsed_message.append(dic_entry.Contents); + continue; + } + + // Finally check for regular characters + if (CharEncoder.contains(byte)) { + parsed_message.push_back(CharEncoder.at(byte)); } - pos++; } parsed_messages.push_back(parsed_message); } @@ -389,10 +399,10 @@ std::vector ReadAllTextData(uint8_t* rom, int pos) { } // Check for dictionary. 
- int dictionary = FindDictionaryEntry(current_byte); + int8_t dictionary = FindDictionaryEntry(current_byte); if (dictionary >= 0) { current_raw_message.append(absl::StrFormat("[%s:%s]", DICTIONARYTOKEN, - util::HexByte(dictionary))); + util::HexByte(static_cast(dictionary)))); uint32_t address = Get24LocalFromPC(rom, kPointersDictionaries + (dictionary * 2)); diff --git a/src/app/editor/message/message_data.h b/src/app/editor/message/message_data.h index 5a8f1591..09c2bb58 100644 --- a/src/app/editor/message/message_data.h +++ b/src/app/editor/message/message_data.h @@ -1,6 +1,83 @@ #ifndef YAZE_APP_EDITOR_MESSAGE_MESSAGE_DATA_H #define YAZE_APP_EDITOR_MESSAGE_MESSAGE_DATA_H +// =========================================================================== +// Message Data System for Zelda 3 (A Link to the Past) +// =========================================================================== +// +// This system handles the parsing, editing, and serialization of in-game text +// messages from The Legend of Zelda: A Link to the Past (SNES). +// +// ## Architecture Overview +// +// The message system consists of several key components: +// +// 1. **Character Encoding** (`CharEncoder`): +// Maps byte values (0x00-0x66) to displayable characters (A-Z, a-z, 0-9, +// punctuation). This is the basic text representation in the ROM. +// +// 2. **Text Commands** (`TextCommands`): +// Special control codes (0x67-0x80) that control message display behavior: +// - Window appearance (border, position) +// - Text flow (line breaks, scrolling, delays) +// - Interactive elements (choices, player name insertion) +// - Some commands have arguments (e.g., [W:02] = window border type 2) +// +// 3. **Special Characters** (`SpecialChars`): +// Extended character set (0x43-0x5E) for game-specific symbols: +// - Directional arrows +// - Button prompts (A, B, X, Y) +// - HP indicators +// - Hieroglyphs +// +// 4. 
**Dictionary System** (`DictionaryEntry`): +// Compression system using byte values 0x88+ to reference common words/phrases +// stored separately in ROM. This saves space by replacing frequently-used +// text with single-byte references. +// +// 5. **Message Data** (`MessageData`): +// Represents a single in-game message with both raw binary data and parsed +// human-readable text. Each message is terminated by 0x7F in ROM. +// +// ## Data Flow +// +// ### Reading from ROM: +// ROM bytes → ReadAllTextData() → MessageData (raw) → ParseMessageData() → +// Human-readable string with [command] tokens +// +// ### Writing to ROM: +// User edits text → ParseMessageToData() → Binary bytes → ROM +// +// ### Dictionary Optimization: +// Text string → OptimizeMessageForDictionary() → Replace common phrases with +// [D:XX] tokens → Smaller binary representation +// +// ## ROM Memory Layout (SNES) +// +// - Text Data Block 1: 0xE0000 - 0xE7FFF (32KB) +// - Text Data Block 2: 0x75F40 - 0x773FF (5.3KB) +// - Dictionary Pointers: 0x74703 +// - Character Widths: Table storing pixel widths for proportional font +// - Font Graphics: 0x70000+ (2bpp tile data) +// +// ## Message Format +// +// Messages are stored as byte sequences terminated by 0x7F: +// Example: [0x00, 0x01, 0x02, 0x7F] = "ABC" +// Example: [0x6A, 0x65, 0x00, 0x01, 0x02, 0x7F] +// = "[L] ABC" (0x6A = player name command, 0x65 = space) +// +// ## Token Syntax (Human-Readable Format) +// +// Commands: [TOKEN:HEX] or [TOKEN] +// Examples: [W:02] (window border), [K] (wait for key) +// Dictionary: [D:HEX] +// Examples: [D:00] (first dictionary entry) +// Special Chars:[TOKEN] +// Examples: [A] (A button), [UP] (up arrow) +// +// =========================================================================== + #include #include #include @@ -18,10 +95,12 @@ namespace editor { const std::string kBankToken = "BANK"; const std::string DICTIONARYTOKEN = "D"; -constexpr uint8_t kMessageTerminator = 0x7F; -constexpr 
uint8_t DICTOFF = 0x88; +constexpr uint8_t kMessageTerminator = 0x7F; // Marks end of message in ROM +constexpr uint8_t DICTOFF = 0x88; // Dictionary entries start at byte 0x88 constexpr uint8_t kWidthArraySize = 100; +// Character encoding table: Maps ROM byte values to displayable characters +// Used for both parsing ROM data into text and converting text back to bytes static const std::unordered_map CharEncoder = { {0x00, 'A'}, {0x01, 'B'}, {0x02, 'C'}, {0x03, 'D'}, {0x04, 'E'}, {0x05, 'F'}, {0x06, 'G'}, {0x07, 'H'}, {0x08, 'I'}, {0x09, 'J'}, @@ -42,16 +121,27 @@ static const std::unordered_map CharEncoder = { {0x65, ' '}, {0x66, '_'}, }; +// Finds the ROM byte value for a given character (reverse lookup in CharEncoder) +// Returns 0xFF if character is not found uint8_t FindMatchingCharacter(char value); + +// Checks if a byte value represents a dictionary entry +// Returns dictionary index (0-96) or -1 if not a dictionary entry int8_t FindDictionaryEntry(uint8_t value); + +// Converts a human-readable message string (with [command] tokens) into ROM bytes +// This is the inverse operation of ParseMessageData std::vector ParseMessageToData(std::string str); +// Represents a single dictionary entry (common word/phrase) used for text compression +// Dictionary entries are stored separately in ROM and referenced by bytes 0x88-0xE8 +// Example: Dictionary entry 0x00 might contain "the" and be referenced as [D:00] struct DictionaryEntry { - uint8_t ID = 0; - std::string Contents = ""; - std::vector Data; - int Length = 0; - std::string Token = ""; + uint8_t ID = 0; // Dictionary index (0-96) + std::string Contents = ""; // The actual text this entry represents + std::vector Data; // Binary representation of Contents + int Length = 0; // Character count + std::string Token = ""; // Human-readable token like "[D:00]" DictionaryEntry() = default; DictionaryEntry(uint8_t i, std::string_view s) @@ -60,10 +150,14 @@ struct DictionaryEntry { Data = 
ParseMessageToData(Contents); } + // Checks if this dictionary entry's text appears in the given string bool ContainedInString(std::string_view s) const { - return absl::StrContains(s, Contents); + // Convert to std::string to avoid Debian string_view bug with absl::StrContains + return absl::StrContains(std::string(s), Contents); } + // Replaces all occurrences of this dictionary entry's text with its token + // Example: "the cat" with dictionary[0]="the" becomes "[D:00] cat" std::string ReplaceInstancesOfIn(std::string_view s) const { auto replaced_string = std::string(s); size_t pos = replaced_string.find(Contents); @@ -84,22 +178,33 @@ constexpr uint8_t kLine1 = 0x74; constexpr uint8_t kLine2 = 0x75; constexpr uint8_t kLine3 = 0x76; +// Reads all dictionary entries from ROM and builds the dictionary table std::vector BuildDictionaryEntries(Rom* rom); -std::string ReplaceAllDictionaryWords(std::string str, - std::vector dictionary); -DictionaryEntry FindRealDictionaryEntry( - uint8_t value, std::vector dictionary); -// Inserted into commands to protect them from dictionary replacements. +// Replaces all dictionary words in a string with their [D:XX] tokens +// Used for text compression when saving messages back to ROM +std::string ReplaceAllDictionaryWords(std::string str, + const std::vector& dictionary); + +// Looks up a dictionary entry by its ROM byte value +DictionaryEntry FindRealDictionaryEntry( + uint8_t value, const std::vector& dictionary); + +// Special marker inserted into commands to protect them from dictionary replacements +// during optimization. Removed after dictionary replacement is complete. const std::string CHEESE = "\uBEBE"; +// Represents a complete in-game message with both raw and parsed representations +// Messages can exist in two forms: +// 1. Raw: Direct ROM bytes with dictionary references as [D:XX] tokens +// 2. 
Parsed: Fully expanded with dictionary words replaced by actual text struct MessageData { - int ID = 0; - int Address = 0; - std::string RawString; - std::string ContentsParsed; - std::vector Data; - std::vector DataParsed; + int ID = 0; // Message index in the ROM + int Address = 0; // ROM address where this message is stored + std::string RawString; // Human-readable with [D:XX] dictionary tokens + std::string ContentsParsed; // Fully expanded human-readable text + std::vector Data; // Raw ROM bytes (may contain dict references) + std::vector DataParsed; // Expanded bytes (dict entries expanded) MessageData() = default; MessageData(int id, int address, const std::string& rawString, @@ -123,11 +228,16 @@ struct MessageData { ContentsParsed = other.ContentsParsed; } + // Optimizes a message by replacing common phrases with dictionary tokens + // Inserts CHEESE markers inside commands to prevent dictionary replacement + // from corrupting command syntax like [W:02] + // Example: "Link saved the day" → "[D:00] saved [D:01] day" std::string OptimizeMessageForDictionary( std::string_view message_string, const std::vector& dictionary) { std::stringstream protons; bool command = false; + // Insert CHEESE markers inside commands to protect them for (const auto& c : message_string) { if (c == '[') { command = true; @@ -137,7 +247,7 @@ struct MessageData { protons << c; if (command) { - protons << CHEESE; + protons << CHEESE; // Protect command contents from replacement } } @@ -150,6 +260,8 @@ struct MessageData { return final_string; } + // Updates this message with new text content + // Automatically optimizes the message using dictionary compression void SetMessage(const std::string& message, const std::vector& dictionary) { RawString = message; @@ -157,14 +269,17 @@ struct MessageData { } }; +// Represents a text command or special character definition +// Text commands control message display (line breaks, colors, choices, etc.) 
+// Special characters are game-specific symbols (arrows, buttons, HP hearts) struct TextElement { - uint8_t ID; - std::string Token; - std::string GenericToken; - std::string Pattern; - std::string StrictPattern; - std::string Description; - bool HasArgument; + uint8_t ID; // ROM byte value for this element + std::string Token; // Short token like "W" or "UP" + std::string GenericToken; // Display format like "[W:##]" or "[UP]" + std::string Pattern; // Regex pattern for parsing + std::string StrictPattern; // Strict regex pattern for exact matching + std::string Description; // Human-readable description + bool HasArgument; // True if command takes a parameter byte TextElement() = default; TextElement(uint8_t id, const std::string& token, bool arg, @@ -197,7 +312,7 @@ struct TextElement { } } - std::smatch MatchMe(std::string dfrag) const { + std::smatch MatchMe(const std::string& dfrag) const { std::regex pattern(StrictPattern); std::smatch match; std::regex_match(dfrag, match, pattern); @@ -258,8 +373,12 @@ static const std::vector TextCommands = { TextElement(0x70, "NONO", false, kCrash), }; +// Finds the TextElement definition for a command byte value +// Returns nullopt if the byte is not a recognized command std::optional FindMatchingCommand(uint8_t b); +// Special characters available in Zelda 3 messages +// These are symbols and game-specific icons that appear in text static const std::vector SpecialChars = { TextElement(0x43, "...", false, "Ellipsis …"), TextElement(0x4D, "UP", false, "Arrow ↑"), @@ -284,25 +403,39 @@ static const std::vector SpecialChars = { TextElement(0x4B, "LFR", false, "Link face right"), }; +// Finds the TextElement definition for a special character byte +// Returns nullopt if the byte is not a recognized special character std::optional FindMatchingSpecial(uint8_t b); +// Result of parsing a text token like "[W:02]" +// Contains both the command definition and its argument value struct ParsedElement { - TextElement Parent; - 
uint8_t Value; - bool Active = false; + TextElement Parent; // The command or special character definition + uint8_t Value; // Argument value (if command has argument) + bool Active = false; // True if parsing was successful ParsedElement() = default; ParsedElement(const TextElement& textElement, uint8_t value) : Parent(textElement), Value(value), Active(true) {} }; +// Parses a token string like "[W:02]" and returns its ParsedElement +// Returns inactive ParsedElement if token is invalid ParsedElement FindMatchingElement(const std::string& str); +// Converts a single ROM byte into its human-readable text representation +// Handles characters, commands, special chars, and dictionary references std::string ParseTextDataByte(uint8_t value); +// Parses a single message from ROM data starting at current_pos +// Updates current_pos to point after the message terminator +// Returns error if message is malformed (e.g., missing terminator) absl::StatusOr ParseSingleMessage( const std::vector& rom_data, int* current_pos); +// Converts MessageData objects into human-readable strings with [command] tokens +// This is the main function for displaying messages in the editor +// Properly handles commands with arguments to avoid parsing errors std::vector ParseMessageData( std::vector& message_data, const std::vector& dictionary_entries); diff --git a/test/integration/zelda3/message_test.cc b/test/integration/zelda3/message_test.cc index 37b655e1..a7d21a98 100644 --- a/test/integration/zelda3/message_test.cc +++ b/test/integration/zelda3/message_test.cc @@ -208,5 +208,93 @@ TEST_F(MessageRomTest, BuildDictionaryEntries_CorrectSize) { EXPECT_FALSE(result.empty()); } +TEST_F(MessageRomTest, ParseMessageData_CommandWithArgument_NoExtraCharacters) { + // This test specifically checks for the bug where command arguments + // were being incorrectly parsed as characters (e.g., capital 'A' after [W]) + // The bug was caused by using a range-based for loop while also tracking position + + 
// Message: [W:01]ABC + // Bytes: 0x6B (W command), 0x01 (argument), 0x00 (A), 0x01 (B), 0x02 (C) + std::vector data = {0x6B, 0x01, 0x00, 0x01, 0x02}; + + editor::MessageData message; + message.ID = 0; + message.Address = 0; + message.Data = data; + + std::vector message_data_vector = {message}; + auto parsed = editor::ParseMessageData(message_data_vector, dictionary_); + + // Should be "[W:01]ABC" NOT "[W:01]BABC" or "[W:01]AABC" + EXPECT_EQ(parsed[0], "[W:01]ABC"); + + // The 'B' should not appear twice or be skipped + EXPECT_EQ(parsed[0].find("BABC"), std::string::npos); + EXPECT_EQ(parsed[0].find("AABC"), std::string::npos); +} + +TEST_F(MessageRomTest, ParseMessageData_MultipleCommandsWithArguments) { + // Test multiple commands with arguments in sequence + // [W:01][C:02]AB + std::vector data = { + 0x6B, 0x01, // [W:01] - Window border command with arg + 0x77, 0x02, // [C:02] - Color command with arg + 0x00, 0x01 // AB - Regular characters + }; + + editor::MessageData message; + message.ID = 0; + message.Data = data; + + std::vector message_data_vector = {message}; + auto parsed = editor::ParseMessageData(message_data_vector, dictionary_); + + EXPECT_EQ(parsed[0], "[W:01][C:02]AB"); + + // Make sure argument bytes (0x01, 0x02) weren't parsed as characters + EXPECT_EQ(parsed[0].find("BAB"), std::string::npos); + EXPECT_EQ(parsed[0].find("CAB"), std::string::npos); +} + +TEST_F(MessageRomTest, ParseMessageData_CommandWithoutArgument) { + // Test command without argument followed by text + // [K]ABC - Wait for key command (no arg) followed by ABC + std::vector data = { + 0x7E, // [K] - Wait for key (no argument) + 0x00, 0x01, 0x02 // ABC + }; + + editor::MessageData message; + message.ID = 0; + message.Data = data; + + std::vector message_data_vector = {message}; + auto parsed = editor::ParseMessageData(message_data_vector, dictionary_); + + EXPECT_EQ(parsed[0], "[K]ABC"); +} + +TEST_F(MessageRomTest, ParseMessageData_MixedCommands) { + // Test mix of commands 
with and without arguments + // [W:01]A[K]B[C:02]C + std::vector data = { + 0x6B, 0x01, // [W:01] - with arg + 0x00, // A + 0x7E, // [K] - no arg + 0x01, // B + 0x77, 0x02, // [C:02] - with arg + 0x02 // C + }; + + editor::MessageData message; + message.ID = 0; + message.Data = data; + + std::vector message_data_vector = {message}; + auto parsed = editor::ParseMessageData(message_data_vector, dictionary_); + + EXPECT_EQ(parsed[0], "[W:01]A[K]B[C:02]C"); +} + } // namespace test } // namespace yaze