Files
yaze/src/app/editor/message/message_data.h
scawful ba70176ee2 refactor: Improve message parsing and dictionary handling
- Changed the type of dictionary variable from int to int8_t for better type safety.
- Updated the handling of dictionary entries in message parsing to ensure correct formatting and prevent parsing errors with command arguments.
- Refactored message data parsing logic to use index-based loops, improving clarity and correctness in handling command arguments.
- Enhanced the documentation in message_data.h to provide a comprehensive overview of the message data system and its components.
- Added new tests to validate the correct parsing of messages with commands and arguments, ensuring robustness against previous bugs.
2025-10-08 21:17:09 -04:00

459 lines
19 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#ifndef YAZE_APP_EDITOR_MESSAGE_MESSAGE_DATA_H
#define YAZE_APP_EDITOR_MESSAGE_MESSAGE_DATA_H
// ===========================================================================
// Message Data System for Zelda 3 (A Link to the Past)
// ===========================================================================
//
// This system handles the parsing, editing, and serialization of in-game text
// messages from The Legend of Zelda: A Link to the Past (SNES).
//
// ## Architecture Overview
//
// The message system consists of several key components:
//
// 1. **Character Encoding** (`CharEncoder`):
// Maps byte values (0x00-0x66) to displayable characters (A-Z, a-z, 0-9,
// punctuation). This is the basic text representation in the ROM.
//
// 2. **Text Commands** (`TextCommands`):
// Special control codes (0x67-0x80) that control message display behavior:
// - Window appearance (border, position)
// - Text flow (line breaks, scrolling, delays)
// - Interactive elements (choices, player name insertion)
// - Some commands have arguments (e.g., [W:02] = window border type 2)
//
// 3. **Special Characters** (`SpecialChars`):
// Extended character set (0x43-0x5E) for game-specific symbols:
// - Directional arrows
// - Button prompts (A, B, X, Y)
// - HP indicators
// - Hieroglyphs
//
// 4. **Dictionary System** (`DictionaryEntry`):
// Compression system using byte values 0x88+ to reference common words/phrases
// stored separately in ROM. This saves space by replacing frequently-used
// text with single-byte references.
//
// 5. **Message Data** (`MessageData`):
// Represents a single in-game message with both raw binary data and parsed
// human-readable text. Each message is terminated by 0x7F in ROM.
//
// ## Data Flow
//
// ### Reading from ROM:
// ROM bytes → ReadAllTextData() → MessageData (raw) → ParseMessageData() →
// Human-readable string with [command] tokens
//
// ### Writing to ROM:
// User edits text → ParseMessageToData() → Binary bytes → ROM
//
// ### Dictionary Optimization:
// Text string → OptimizeMessageForDictionary() → Replace common phrases with
// [D:XX] tokens → Smaller binary representation
//
// ## ROM Memory Layout (SNES)
//
// - Text Data Block 1: 0xE0000 - 0xE7FFF (32KB)
// - Text Data Block 2: 0x75F40 - 0x773FF (5.3KB)
// - Dictionary Pointers: 0x74703
// - Character Widths: Table storing pixel widths for proportional font
// - Font Graphics: 0x70000+ (2bpp tile data)
//
// ## Message Format
//
// Messages are stored as byte sequences terminated by 0x7F:
// Example: [0x00, 0x01, 0x02, 0x7F] = "ABC"
// Example: [0x6A, 0x59, 0x2C, 0x1A, 0x2F, 0x1E, 0x1D, 0x7F]
// = "[L] saved" (0x6A = player name command, 0x59 = space)
//
// ## Token Syntax (Human-Readable Format)
//
// Commands: [TOKEN:HEX] or [TOKEN]
// Examples: [W:02] (window border), [K] (wait for key)
// Dictionary: [D:HEX]
// Examples: [D:00] (first dictionary entry)
// Special Chars:[TOKEN]
// Examples: [A] (A button), [UP] (up arrow)
//
// ===========================================================================
#include <optional>
#include <regex>
#include <string>
#include <unordered_map>
#include <vector>
#include <string_view>
#include "absl/strings/str_format.h"
#include "absl/strings/str_replace.h"
#include "absl/strings/match.h"
#include "app/rom.h"
namespace yaze {
namespace editor {
// Token names used inside the bracketed, human-readable message syntax.
const std::string kBankToken{"BANK"};
const std::string DICTIONARYTOKEN{"D"};

// Byte that marks the end of a message in ROM.
constexpr uint8_t kMessageTerminator{0x7F};
// First byte value that refers to a dictionary entry.
constexpr uint8_t DICTOFF{0x88};
constexpr uint8_t kWidthArraySize{100};
// Character encoding table: maps ROM byte values to displayable characters.
// Used for both parsing ROM data into text and converting text back to bytes.
//
// Keys are ROM byte values; mapped values are wide characters. Byte values
// without an entry here (for example 0x43 or 0x47-0x4B) have no plain
// character mapping in this table.
//
// Declared `static` at namespace scope in a header, so every translation unit
// that includes this file gets its own copy of the table.
//
// NOTE(review): 0x5F, 0x60 and 0x61 all map to the same character, so a
// reverse lookup by character cannot tell them apart — confirm which byte
// callers expect.
static const std::unordered_map<uint8_t, wchar_t> CharEncoder = {
{0x00, 'A'}, {0x01, 'B'}, {0x02, 'C'}, {0x03, 'D'}, {0x04, 'E'},
{0x05, 'F'}, {0x06, 'G'}, {0x07, 'H'}, {0x08, 'I'}, {0x09, 'J'},
{0x0A, 'K'}, {0x0B, 'L'}, {0x0C, 'M'}, {0x0D, 'N'}, {0x0E, 'O'},
{0x0F, 'P'}, {0x10, 'Q'}, {0x11, 'R'}, {0x12, 'S'}, {0x13, 'T'},
{0x14, 'U'}, {0x15, 'V'}, {0x16, 'W'}, {0x17, 'X'}, {0x18, 'Y'},
{0x19, 'Z'}, {0x1A, 'a'}, {0x1B, 'b'}, {0x1C, 'c'}, {0x1D, 'd'},
{0x1E, 'e'}, {0x1F, 'f'}, {0x20, 'g'}, {0x21, 'h'}, {0x22, 'i'},
{0x23, 'j'}, {0x24, 'k'}, {0x25, 'l'}, {0x26, 'm'}, {0x27, 'n'},
{0x28, 'o'}, {0x29, 'p'}, {0x2A, 'q'}, {0x2B, 'r'}, {0x2C, 's'},
{0x2D, 't'}, {0x2E, 'u'}, {0x2F, 'v'}, {0x30, 'w'}, {0x31, 'x'},
{0x32, 'y'}, {0x33, 'z'}, {0x34, '0'}, {0x35, '1'}, {0x36, '2'},
{0x37, '3'}, {0x38, '4'}, {0x39, '5'}, {0x3A, '6'}, {0x3B, '7'},
{0x3C, '8'}, {0x3D, '9'}, {0x3E, '!'}, {0x3F, '?'}, {0x40, '-'},
{0x41, '.'}, {0x42, ','}, {0x44, '>'}, {0x45, '('}, {0x46, ')'},
{0x4C, '"'}, {0x51, '\''}, {0x59, ' '}, {0x5A, '<'}, {0x5F, L'¡'},
{0x60, L'¡'}, {0x61, L'¡'}, {0x62, L' '}, {0x63, L' '}, {0x64, L' '},
{0x65, ' '}, {0x66, '_'},
};
// Finds the ROM byte value for a given character (reverse lookup in
// CharEncoder).
// `value` is a narrow `char`, whereas CharEncoder stores `wchar_t` values.
// Returns 0xFF if the character is not found.
uint8_t FindMatchingCharacter(char value);
// Checks if a byte value represents a dictionary entry.
// Returns the dictionary index (0-96), or -1 if `value` is not a dictionary
// entry.
int8_t FindDictionaryEntry(uint8_t value);
// Converts a human-readable message string (with [command] tokens) into ROM
// bytes. This is the inverse operation of ParseMessageData.
// `str` is taken by value, so the caller's string is never modified.
std::vector<uint8_t> ParseMessageToData(std::string str);
// Represents a single dictionary entry (common word/phrase) used for text
// compression. Dictionary entries are stored separately in ROM and referenced
// by bytes 0x88-0xE8.
// Example: dictionary entry 0x00 might contain "the" and be referenced as
// [D:00].
struct DictionaryEntry {
  uint8_t ID = 0;             // Dictionary index (0-96)
  std::string Contents = "";  // The actual text this entry represents
  std::vector<uint8_t> Data;  // Binary representation of Contents
  int Length = 0;             // Character count of Contents
  std::string Token = "";     // Human-readable token like "[D:00]"

  DictionaryEntry() = default;

  // Builds an entry for dictionary index `i` holding the text `s`.
  // Token is derived from the index; Data is produced by running the text
  // through ParseMessageToData.
  DictionaryEntry(uint8_t i, std::string_view s)
      : ID(i), Contents(s), Length(static_cast<int>(s.length())) {
    Token = absl::StrFormat("[%s:%02X]", DICTIONARYTOKEN, ID);
    Data = ParseMessageToData(Contents);
  }

  // Checks if this dictionary entry's text appears in the given string.
  bool ContainedInString(std::string_view s) const {
    // Convert to std::string to avoid Debian string_view bug with
    // absl::StrContains.
    return absl::StrContains(std::string(s), Contents);
  }

  // Replaces all occurrences of this dictionary entry's text with its token.
  // Example: "the cat" with dictionary[0]="the" becomes "[D:00] cat".
  // Returns a new string; `s` itself is left untouched. An entry with empty
  // Contents replaces nothing and returns a copy of `s`.
  std::string ReplaceInstancesOfIn(std::string_view s) const {
    std::string replaced_string(s);
    // An empty needle is "found" at every position, and every replacement
    // would make the string longer, so the search below would never finish.
    if (Contents.empty()) {
      return replaced_string;
    }
    for (size_t pos = replaced_string.find(Contents);
         pos != std::string::npos;
         // Resume after the inserted token so it is never rescanned.
         pos = replaced_string.find(Contents, pos + Token.length())) {
      replaced_string.replace(pos, Contents.length(), Token);
    }
    return replaced_string;
  }
};
// First text data block: start and end offsets.
constexpr int kTextData{0xE0000};
constexpr int kTextDataEnd{0xE7FFF};

// Dictionary table: number of entries and location of the pointer table.
constexpr int kNumDictionaryEntries{0x61};
constexpr int kPointersDictionaries{0x74703};

// Byte values of the scroll and line-selection text commands.
constexpr uint8_t kScrollVertical{0x73};
constexpr uint8_t kLine1{0x74};
constexpr uint8_t kLine2{0x75};
constexpr uint8_t kLine3{0x76};
// Reads all dictionary entries from ROM and builds the dictionary table.
// NOTE(review): presumably yields kNumDictionaryEntries entries located via
// kPointersDictionaries — confirm against the implementation.
std::vector<DictionaryEntry> BuildDictionaryEntries(Rom* rom);
// Replaces all dictionary words in a string with their [D:XX] tokens.
// Used for text compression when saving messages back to ROM.
// `str` is taken by value; the result is returned as a new string.
std::string ReplaceAllDictionaryWords(std::string str,
const std::vector<DictionaryEntry>& dictionary);
// Looks up a dictionary entry by its ROM byte value.
// Returns the entry by value (a copy).
DictionaryEntry FindRealDictionaryEntry(
uint8_t value, const std::vector<DictionaryEntry>& dictionary);
// Special marker inserted into commands to protect them from dictionary
// replacements during optimization. Removed after dictionary replacement is
// complete (see MessageData::OptimizeMessageForDictionary).
const std::string CHEESE = "\uBEBE";
// Represents a complete in-game message with both raw and parsed
// representations. Messages can exist in two forms:
// 1. Raw: Direct ROM bytes with dictionary references as [D:XX] tokens
// 2. Parsed: Fully expanded with dictionary words replaced by actual text
struct MessageData {
  int ID = 0;                       // Message index in the ROM
  int Address = 0;                  // ROM address where this message is stored
  std::string RawString;            // Human-readable with [D:XX] dictionary tokens
  std::string ContentsParsed;       // Fully expanded human-readable text
  std::vector<uint8_t> Data;        // Raw ROM bytes (may contain dict references)
  std::vector<uint8_t> DataParsed;  // Expanded bytes (dict entries expanded)

  MessageData() = default;

  // Builds a message from its already-computed raw and parsed forms.
  MessageData(int id, int address, const std::string& rawString,
              const std::vector<uint8_t>& rawData,
              const std::string& parsedString,
              const std::vector<uint8_t>& parsedData)
      : ID(id),
        Address(address),
        RawString(rawString),
        ContentsParsed(parsedString),
        Data(rawData),
        DataParsed(parsedData) {}

  // No copy constructor is declared on purpose: every member manages its own
  // storage, so the compiler-generated copy AND move operations are correct.
  // A hand-written copy constructor would suppress the implicit moves.

  // Optimizes a message by replacing common phrases with dictionary tokens.
  // CHEESE markers are inserted inside commands first so that dictionary
  // replacement cannot corrupt command syntax like [W:02]; they are stripped
  // again before returning.
  // Example: "Link saved the day" → "[D:00] saved [D:01] day"
  // Reads no member state; the result depends only on the arguments.
  std::string OptimizeMessageForDictionary(
      std::string_view message_string,
      const std::vector<DictionaryEntry>& dictionary) const {
    std::string guarded;
    guarded.reserve(message_string.size());
    bool in_command = false;
    for (const char c : message_string) {
      if (c == '[') {
        in_command = true;
      } else if (c == ']') {
        in_command = false;
      }
      guarded.push_back(c);
      if (in_command) {
        // A marker after every character inside [...] breaks up any text a
        // dictionary entry could otherwise match.
        guarded.append(CHEESE);
      }
    }
    const std::string replaced = ReplaceAllDictionaryWords(guarded, dictionary);
    return absl::StrReplaceAll(replaced, {{CHEESE, ""}});
  }

  // Updates this message with new text content.
  // `message` is the fully expanded text and is stored in ContentsParsed; its
  // dictionary-compressed form (with [D:XX] tokens) is stored in RawString,
  // matching the meaning of both fields everywhere else in this struct.
  // Data and DataParsed are not touched here.
  void SetMessage(const std::string& message,
                  const std::vector<DictionaryEntry>& dictionary) {
    ContentsParsed = message;
    RawString = OptimizeMessageForDictionary(message, dictionary);
  }
};
// Represents a text command or special character definition.
// Text commands control message display (line breaks, colors, choices, etc.).
// Special characters are game-specific symbols (arrows, buttons, HP hearts).
struct TextElement {
  uint8_t ID = 0;             // ROM byte value for this element
  std::string Token;          // Short token like "W" or "UP"
  std::string GenericToken;   // Display format like "[W:##]" or "[UP]"
  std::string Pattern;        // Regex pattern for parsing
  std::string StrictPattern;  // Pattern anchored with ^...$ for exact matching
  std::string Description;    // Human-readable description
  bool HasArgument = false;   // True if command takes a parameter byte

  TextElement() = default;

  // Builds the element and derives GenericToken, Pattern and StrictPattern
  // from `token`. When `arg` is true the pattern accepts an optional
  // ":" + one or two uppercase hex digits after the token.
  TextElement(uint8_t id, const std::string& token, bool arg,
              const std::string& description)
      : ID(id), Token(token), Description(description), HasArgument(arg) {
    // The token is matched literally, so every regex metacharacter in it must
    // be escaped. Otherwise "..." would match any three-character token.
    const std::string literal_token = EscapeForRegex(Token);
    if (arg) {
      GenericToken = "[" + Token + ":##]";
      Pattern = "\\[" + literal_token + "(:[0-9A-F]{1,2})?\\]";
    } else {
      GenericToken = "[" + Token + "]";
      Pattern = "\\[" + literal_token + "\\]";
    }
    StrictPattern = "^" + Pattern + "$";
  }

  // Renders this element as a bracketed token. Elements with an argument get
  // `value` appended as two uppercase hex digits ("[W:02]"); for all others
  // `value` is ignored ("[K]").
  std::string GetParamToken(uint8_t value = 0) const {
    if (!HasArgument) {
      return "[" + Token + "]";
    }
    static constexpr char kHexDigits[] = "0123456789ABCDEF";
    std::string text = "[" + Token + ":";
    text.push_back(kHexDigits[value >> 4]);
    text.push_back(kHexDigits[value & 0x0F]);
    text.push_back(']');
    return text;
  }

  // Matches `dfrag` against StrictPattern (the whole string must match).
  // Returns an empty match result when it does not match. The result refers
  // into `dfrag`, so `dfrag` must outlive any use of the sub-matches.
  std::smatch MatchMe(const std::string& dfrag) const {
    const std::regex pattern(StrictPattern);
    std::smatch match;
    std::regex_match(dfrag, match, pattern);
    return match;
  }

  // An element whose ID is 0 is treated as empty.
  bool Empty() const { return ID == 0; }

  // Comparison operator: elements are equal when their IDs are equal.
  bool operator==(const TextElement& other) const { return ID == other.ID; }

 private:
  // Returns `text` with a backslash in front of every character that has a
  // special meaning in the default (ECMAScript) regex grammar.
  static std::string EscapeForRegex(const std::string& text) {
    constexpr std::string_view kMetaCharacters = "\\^$.|?*+()[]{}";
    std::string escaped;
    escaped.reserve(text.size() * 2);
    for (const char c : text) {
      if (kMetaCharacters.find(c) != std::string_view::npos) {
        escaped.push_back('\\');
      }
      escaped.push_back(c);
    }
    return escaped;
  }
};
// Human-readable descriptions of the text commands, one per command.
static const std::string kWindowBorder{"Window border"};
static const std::string kWindowPosition{"Window position"};
static const std::string kScrollSpeed{"Scroll speed"};
static const std::string kTextDrawSpeed{"Text draw speed"};
static const std::string kTextColor{"Text color"};
static const std::string kPlayerName{"Player name"};
static const std::string kLine1Str{"Line 1"};
static const std::string kLine2Str{"Line 2"};
static const std::string kLine3Str{"Line 3"};
static const std::string kWaitForKey{"Wait for key"};
static const std::string kScrollText{"Scroll text"};
static const std::string kDelayX{"Delay X"};
static const std::string kBCDNumber{"BCD number"};
static const std::string kSoundEffect{"Sound effect"};
static const std::string kChoose3{"Choose 3"};
static const std::string kChoose2High{"Choose 2 high"};
static const std::string kChoose2Low{"Choose 2 low"};
static const std::string kChoose2Indented{"Choose 2 indented"};
static const std::string kChooseItem{"Choose item"};
static const std::string kNextAttractImage{"Next attract image"};
static const std::string kBankMarker{"Bank marker (automatic)"};
static const std::string kCrash{"Crash"};
// Table of all text commands.
// Each row is (ROM byte value, token, takes-argument flag, description);
// commands flagged `true` are written as [TOKEN:HEX], the others as [TOKEN].
// Rows are not sorted by byte value.
static const std::vector<TextElement> TextCommands = {
TextElement(0x6B, "W", true, kWindowBorder),
TextElement(0x6D, "P", true, kWindowPosition),
TextElement(0x6E, "SPD", true, kScrollSpeed),
TextElement(0x7A, "S", true, kTextDrawSpeed),
TextElement(0x77, "C", true, kTextColor),
TextElement(0x6A, "L", false, kPlayerName),
TextElement(0x74, "1", false, kLine1Str),
TextElement(0x75, "2", false, kLine2Str),
TextElement(0x76, "3", false, kLine3Str),
TextElement(0x7E, "K", false, kWaitForKey),
TextElement(0x73, "V", false, kScrollText),
TextElement(0x78, "WT", true, kDelayX),
TextElement(0x6C, "N", true, kBCDNumber),
TextElement(0x79, "SFX", true, kSoundEffect),
TextElement(0x71, "CH3", false, kChoose3),
TextElement(0x72, "CH2", false, kChoose2High),
TextElement(0x6F, "CH2L", false, kChoose2Low),
TextElement(0x68, "CH2I", false, kChoose2Indented),
TextElement(0x69, "CHI", false, kChooseItem),
TextElement(0x67, "IMG", false, kNextAttractImage),
TextElement(0x80, kBankToken, false, kBankMarker),
TextElement(0x70, "NONO", false, kCrash),
};
// Finds the TextElement definition for a command byte value.
// Returns a copy of the matching definition, or nullopt if the byte is not a
// recognized command.
std::optional<TextElement> FindMatchingCommand(uint8_t b);
// Special characters available in Zelda 3 messages.
// These are symbols and game-specific icons that appear in text.
// Each row is (ROM byte value, token, takes-argument flag, description); none
// of them takes an argument, so all are written as [TOKEN].
static const std::vector<TextElement> SpecialChars = {
TextElement(0x43, "...", false, "Ellipsis …"),
TextElement(0x4D, "UP", false, "Arrow ↑"),
TextElement(0x4E, "DOWN", false, "Arrow ↓"),
TextElement(0x4F, "LEFT", false, "Arrow ←"),
TextElement(0x50, "RIGHT", false, "Arrow →"),
TextElement(0x5B, "A", false, "Button Ⓐ"),
TextElement(0x5C, "B", false, "Button Ⓑ"),
TextElement(0x5D, "X", false, "Button ⓧ"),
TextElement(0x5E, "Y", false, "Button ⓨ"),
TextElement(0x52, "HP1L", false, "1 HP left"),
TextElement(0x53, "HP1R", false, "1 HP right"),
TextElement(0x54, "HP2L", false, "2 HP left"),
TextElement(0x55, "HP3L", false, "3 HP left"),
TextElement(0x56, "HP3R", false, "3 HP right"),
TextElement(0x57, "HP4L", false, "4 HP left"),
TextElement(0x58, "HP4R", false, "4 HP right"),
TextElement(0x47, "HY0", false, "Hieroglyph ☥"),
TextElement(0x48, "HY1", false, "Hieroglyph 𓈗"),
TextElement(0x49, "HY2", false, "Hieroglyph Ƨ"),
TextElement(0x4A, "LFL", false, "Link face left"),
TextElement(0x4B, "LFR", false, "Link face right"),
};
// Finds the TextElement definition for a special character byte.
// Returns a copy of the matching definition, or nullopt if the byte is not a
// recognized special character.
std::optional<TextElement> FindMatchingSpecial(uint8_t b);
// Result of parsing a text token like "[W:02]".
// Contains both the command definition and its argument value.
struct ParsedElement {
  TextElement Parent;   // The command or special character definition
  uint8_t Value = 0;    // Argument value (if command has argument)
  bool Active = false;  // True if parsing was successful

  // Creates an inactive result. Value is zeroed so that reading it from a
  // failed parse is well-defined.
  ParsedElement() = default;

  // Creates an active (successful) result for `textElement` with argument
  // `value`.
  ParsedElement(const TextElement& textElement, uint8_t value)
      : Parent(textElement), Value(value), Active(true) {}
};
// Parses a token string like "[W:02]" and returns its ParsedElement.
// Returns an inactive ParsedElement (Active == false) if the token is invalid.
ParsedElement FindMatchingElement(const std::string& str);
// Converts a single ROM byte into its human-readable text representation.
// Handles characters, commands, special chars, and dictionary references.
std::string ParseTextDataByte(uint8_t value);
// Parses a single message from ROM data starting at *current_pos.
// Updates *current_pos to point after the message terminator.
// Returns an error status if the message is malformed (e.g., missing
// terminator); otherwise returns the parsed MessageData.
absl::StatusOr<MessageData> ParseSingleMessage(
const std::vector<uint8_t>& rom_data, int* current_pos);
// Converts MessageData objects into human-readable strings with [command]
// tokens. This is the main function for displaying messages in the editor.
// Properly handles commands with arguments to avoid parsing errors.
// NOTE(review): `message_data` is a non-const reference, so the messages may
// be modified — confirm against the implementation.
std::vector<std::string> ParseMessageData(
std::vector<MessageData>& message_data,
const std::vector<DictionaryEntry>& dictionary_entries);
// Second text data block: start and end offsets.
constexpr int kTextData2 = 0x75F40;
constexpr int kTextData2End = 0x773FF;
// Reads all text data from the ROM and returns a vector of MessageData objects.
// `pos` is the offset to start reading from; it defaults to kTextData.
std::vector<MessageData> ReadAllTextData(uint8_t* rom, int pos = kTextData);
// Calls the file dialog and loads expanded messages from a BIN file.
// NOTE(review): all four parameters are non-const references and are
// presumably outputs filled in by this call — confirm against the
// implementation.
absl::Status LoadExpandedMessages(std::string& expanded_message_path,
std::vector<std::string>& parsed_messages,
std::vector<MessageData>& expanded_messages,
std::vector<DictionaryEntry>& dictionary);
} // namespace editor
} // namespace yaze
#endif // YAZE_APP_EDITOR_MESSAGE_MESSAGE_DATA_H