backend-infra-engineer: Post v0.3.9-hotfix7 snapshot (build cleanup)
@@ -151,7 +151,7 @@ Validates CMake configuration by checking targets, flags, and platform-specific
cmake -P scripts/validate-cmake-config.cmake

# Validate specific build directory
cmake -P scripts/validate-cmake-config.cmake build_ai
cmake -P scripts/validate-cmake-config.cmake build
```

**What it checks:**
@@ -171,7 +171,7 @@ Validates include paths in compile_commands.json to catch missing includes befor
./scripts/check-include-paths.sh

# Check specific build
./scripts/check-include-paths.sh build_ai
./scripts/check-include-paths.sh build

# Verbose mode (show all include dirs)
VERBOSE=1 ./scripts/check-include-paths.sh build
@@ -403,3 +403,164 @@ inline void ProcessData() { /* ... */ }
Full documentation available in:
- [docs/internal/testing/symbol-conflict-detection.md](../docs/internal/testing/symbol-conflict-detection.md)
- [docs/internal/testing/sample-symbol-database.json](../docs/internal/testing/sample-symbol-database.json)

## AI Model Evaluation Suite

Tools for evaluating and comparing AI models used with the z3ed CLI agent system. Located in `scripts/ai/`.

### Quick Start

```bash
# Run a quick smoke test
./scripts/ai/run-model-eval.sh --quick

# Evaluate specific models
./scripts/ai/run-model-eval.sh --models llama3.2,qwen2.5-coder

# Evaluate all available models
./scripts/ai/run-model-eval.sh --all

# Evaluate with comparison report
./scripts/ai/run-model-eval.sh --default --compare
```

### Components

#### run-model-eval.sh

Main entry point script. It checks prerequisites, pulls any missing models, and orchestrates the evaluation.

**Options:**
- `--models, -m LIST` - Comma-separated list of models to evaluate
- `--all` - Evaluate all available Ollama models
- `--default` - Evaluate default models from config (llama3.2, qwen2.5-coder, etc.)
- `--tasks, -t LIST` - Task categories: rom_inspection, code_analysis, tool_calling, conversation
- `--timeout SEC` - Timeout per task (default: 120)
- `--quick` - Quick smoke test (single model, fewer tasks)
- `--compare` - Generate comparison report after evaluation
- `--dry-run` - Show what would run without executing

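For example, several of the options above can be combined in a single run. This is a sketch that assumes `--tasks` accepts a comma-separated list, mirroring `--models`:

```bash
# Evaluate two models on the ROM-inspection and tool-calling task sets,
# with a longer per-task timeout, and emit a comparison report at the end
./scripts/ai/run-model-eval.sh \
  --models llama3.2,qwen2.5-coder \
  --tasks rom_inspection,tool_calling \
  --timeout 180 \
  --compare
```
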
#### eval-runner.py

Python evaluation engine that runs tasks against models and scores responses.

**Features:**
- Multi-model evaluation
- Pattern-based accuracy scoring
- Response completeness analysis
- Tool usage detection
- Response time measurement
- JSON output for analysis

**Direct usage:**
```bash
python scripts/ai/eval-runner.py \
  --models llama3.2,qwen2.5-coder \
  --tasks all \
  --output results/eval-$(date +%Y%m%d).json
```

#### compare-models.py

Generates comparison reports from evaluation results.

**Formats:**
- `--format table` - ASCII table (default)
- `--format markdown` - Markdown with analysis
- `--format json` - Machine-readable JSON

**Usage:**
```bash
# Compare all recent evaluations
python scripts/ai/compare-models.py results/eval-*.json

# Generate markdown report
python scripts/ai/compare-models.py --format markdown --output report.md results/*.json

# Get best model name (for scripting)
BEST_MODEL=$(python scripts/ai/compare-models.py --best results/eval-*.json)
```

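Because `--best` prints just the model name, it can feed directly into another command. A hypothetical follow-up, reusing flags documented above:

```bash
# Re-run only the tool-calling tasks against whichever model scored best
BEST_MODEL=$(python scripts/ai/compare-models.py --best results/eval-*.json)
./scripts/ai/run-model-eval.sh --models "$BEST_MODEL" --tasks tool_calling
```
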
#### eval-tasks.yaml

Task definitions and scoring configuration. Categories:

| Category | Description | Example Tasks |
|----------|-------------|---------------|
| rom_inspection | ROM data structure queries | List dungeons, describe maps |
| code_analysis | Code understanding tasks | Explain functions, find bugs |
| tool_calling | Tool usage evaluation | File operations, build commands |
| conversation | Multi-turn dialog | Follow-ups, clarifications |

**Scoring dimensions:**
- **Accuracy** (40%): Pattern matching against expected responses
- **Completeness** (30%): Response depth and structure
- **Tool Usage** (20%): Appropriate tool selection
- **Response Time** (10%): Speed (normalized to 0-10)

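The four dimensions combine into a single 0-10 score. A minimal sketch of the formula, matching the weights above and the time normalization used in `eval-runner.py` (0 s scores 10, 60 s or slower scores 0):

```python
def overall_score(accuracy: float, completeness: float,
                  tool_usage: float, response_time_s: float) -> float:
    """Weighted overall score on a 0-10 scale (sketch)."""
    time_score = max(0.0, 10.0 - response_time_s / 6.0)
    return (0.4 * accuracy +
            0.3 * completeness +
            0.2 * tool_usage +
            0.1 * time_score)

# Example: accuracy 8.8, completeness 8.5, tool use 9.2, 2.1 s response
# -> 0.4*8.8 + 0.3*8.5 + 0.2*9.2 + 0.1*9.65 ≈ 8.9
```
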
### Output

Results are saved to `scripts/ai/results/`:
- `eval-YYYYMMDD-HHMMSS.json` - Individual evaluation results
- `comparison-YYYYMMDD-HHMMSS.md` - Comparison reports

**Sample output:**
```
┌──────────────────────────────────────────────────────────────────────┐
│ YAZE AI Model Evaluation Report │
├──────────────────────────────────────────────────────────────────────┤
│ Model │ Accuracy │ Tool Use │ Speed │ Runs │
├──────────────────────────────────────────────────────────────────────┤
│ qwen2.5-coder:7b │ 8.8/10 │ 9.2/10 │ 2.1s │ 3 │
│ llama3.2:latest │ 7.9/10 │ 7.5/10 │ 2.3s │ 3 │
│ codellama:7b │ 7.2/10 │ 8.1/10 │ 2.8s │ 3 │
├──────────────────────────────────────────────────────────────────────┤
│ Recommended: qwen2.5-coder:7b (score: 8.7/10) │
└──────────────────────────────────────────────────────────────────────┘
```

### Prerequisites

- **Ollama**: Install from https://ollama.ai
- **Python 3.10+** with `requests` and `pyyaml`:
  ```bash
  pip install requests pyyaml
  ```
- **At least one model pulled**:
  ```bash
  ollama pull llama3.2
  ```

### Adding Custom Tasks

Edit `scripts/ai/eval-tasks.yaml` to add new evaluation tasks:

```yaml
categories:
  custom_category:
    description: "My custom tasks"
    tasks:
      - id: "my_task"
        name: "My Task Name"
        prompt: "What is the purpose of..."
        expected_patterns:
          - "expected|keyword|pattern"
        required_tool: null
        scoring:
          accuracy_criteria: "Must mention X, Y, Z"
          completeness_criteria: "Should include examples"
```

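Before running a full evaluation against a new category, it can help to sanity-check that the file parses and each task carries the fields used above. A small sketch (the required field list is assumed from the example, not a definitive schema):

```python
#!/usr/bin/env python3
"""Quick structural check for scripts/ai/eval-tasks.yaml (sketch)."""
import sys
import yaml

REQUIRED = ("id", "name", "prompt", "expected_patterns")  # assumed minimum

with open("scripts/ai/eval-tasks.yaml") as f:
    config = yaml.safe_load(f)

problems = []
for category, cat_data in config.get("categories", {}).items():
    for task in cat_data.get("tasks", []):
        missing = [key for key in REQUIRED if key not in task]
        if missing:
            problems.append(f"{category}/{task.get('id', '?')}: missing {missing}")

if problems:
    print("\n".join(problems))
    sys.exit(1)
print("eval-tasks.yaml looks structurally OK")
```
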
### Integration with CI

The evaluation suite can be integrated into CI pipelines:

```yaml
# .github/workflows/ai-eval.yml
- name: Run AI Evaluation
  run: |
    ollama serve &
    sleep 5
    ollama pull llama3.2
    ./scripts/ai/run-model-eval.sh --models llama3.2 --tasks tool_calling
```

158 scripts/README_analyze_room.md Normal file
@@ -0,0 +1,158 @@
# Room Object Analyzer (`analyze_room.py`)

A Python script for analyzing dungeon room object data from A Link to the Past ROMs. Useful for debugging layer compositing, understanding room structure, and validating draw routine implementations.

## Requirements

- Python 3.6+
- A Link to the Past ROM file (vanilla .sfc)

## Basic Usage

```bash
# Analyze a single room
python3 scripts/analyze_room.py --rom roms/alttp_vanilla.sfc 1

# Analyze multiple rooms
python3 scripts/analyze_room.py --rom roms/alttp_vanilla.sfc 1 2 3

# Analyze a range of rooms
python3 scripts/analyze_room.py --rom roms/alttp_vanilla.sfc --range 0 10

# Analyze all 296 rooms (summary only)
python3 scripts/analyze_room.py --rom roms/alttp_vanilla.sfc --all
```

## Common Options

| Option | Description |
|--------|-------------|
| `--rom PATH` | Path to the ROM file |
| `--compositing` | Include layer compositing analysis |
| `--list-bg2` | List all rooms with BG2 overlay objects |
| `--json` | Output as JSON for programmatic use |
| `--summary` | Show summary only (object counts) |
| `--quiet` | Minimal output |

## Layer Analysis

The script identifies objects by their layer assignment:

| Layer | Buffer | Description |
|-------|--------|-------------|
| Layer 0 | BG1 Main | Primary floor/walls |
| Layer 1 | BG2 Overlay | Background details (platforms, statues) |
| Layer 2 | BG1 Priority | Priority objects on BG1 (torches) |

### Finding Rooms with BG2 Overlay Issues

```bash
# List all 94 rooms with BG2 overlay objects
python3 scripts/analyze_room.py --rom roms/alttp_vanilla.sfc --list-bg2

# Analyze specific room's layer compositing
python3 scripts/analyze_room.py --rom roms/alttp_vanilla.sfc 1 --compositing
```

## Output Format

### Default Output
```
======================================================================
ROOM 001 (0x001) OBJECT ANALYSIS
======================================================================
Room data at PC: 0x5230F (SNES: 0x8AA30F)
Floor: BG1=6, BG2=6, Layout=4

OBJECTS (Layer 0=BG1 main, Layer 1=BG2 overlay, Layer 2=BG1 priority)
======================================================================
L0 (BG1_Main): [FC 21 C0] -> T2 ID=0x100 @ ( 2, 7) sz= 0 - Corner NW (concave)
...
L1 (BG2_Overlay): [59 34 33] -> T1 ID=0x033 @ (22,13) sz= 4 - Floor 4x4
...
```

### JSON Output
```bash
python3 scripts/analyze_room.py --rom roms/alttp_vanilla.sfc 1 --json > room_001.json
```

```json
{
  "room_id": 1,
  "floor1": 6,
  "floor2": 6,
  "layout": 4,
  "objects_by_layer": {
    "0": [...],
    "1": [...],
    "2": [...]
  }
}
```

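The JSON form is convenient for scripting against. A short consumer that relies only on the fields shown above:

```python
# Summarize room_001.json produced by the command above
import json

with open("room_001.json") as f:
    room = json.load(f)

print(f"Room {room['room_id']:03d}: "
      f"floor BG1={room['floor1']}, BG2={room['floor2']}, layout={room['layout']}")
for layer, objects in sorted(room["objects_by_layer"].items()):
    print(f"  layer {layer}: {len(objects)} objects")
```
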
## Object Decoding

Objects are decoded based on their type:

| Type | Byte Pattern | ID Range | Description |
|------|--------------|----------|-------------|
| Type 1 | `xxxxxxss yyyyyyss iiiiiiii` | 0x00-0xFF | Standard objects |
| Type 2 | `111111xx xxxxyyyy yyiiiiii` | 0x100-0x1FF | Layout corners |
| Type 3 | `xxxxxxii yyyyyyii 11111iii` | 0xF00-0xFFF | Interactive objects |

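The byte patterns above are enough to decode an entry by hand. A sketch for Type 1 objects, with the size-bit ordering assumed from the sample line `[59 34 33] -> T1 ID=0x033 @ (22,13) sz= 4` shown earlier:

```python
def decode_type1(b0: int, b1: int, b2: int) -> dict:
    """Decode a 3-byte Type 1 entry (xxxxxxss yyyyyyss iiiiiiii)."""
    return {
        "x": b0 >> 2,                              # upper 6 bits of byte 0
        "y": b1 >> 2,                              # upper 6 bits of byte 1
        "size": ((b0 & 0x03) << 2) | (b1 & 0x03),  # assumed bit order; matches the sample
        "id": b2,
    }

print(decode_type1(0x59, 0x34, 0x33))
# {'x': 22, 'y': 13, 'size': 4, 'id': 51}   # 0x33, the 4x4 floor object
```
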
## Integration with yaze Development

### Validating Draw Routine Fixes

1. Find rooms using a specific object:
   ```bash
   python3 scripts/analyze_room.py --rom roms/alttp_vanilla.sfc --all --json | \
     python3 -c "import json,sys; d=json.load(sys.stdin); print([r['room_id'] for r in d if any(o['id']==0x033 for l in r['objects_by_layer'].values() for o in l)])"
   ```

2. Test BG2 masking on affected rooms:
   ```bash
   for room in $(python3 scripts/analyze_room.py --rom roms/alttp_vanilla.sfc --list-bg2 | grep "Room" | awk '{print $2}'); do
     echo "Testing room $room"
   done
   ```

### Debugging Object Dimensions

Compare script output with `CalculateObjectDimensions` in `object_drawer.cc`:

```bash
# Get Room 001 Layer 1 objects with sizes
python3 scripts/analyze_room.py --rom roms/alttp_vanilla.sfc 1 | grep "L1"
```

Expected dimension calculations:
- `0x033 @ (22,13) size=4`: routine 16, count=5, width=160px, height=32px
- `0x034 @ (23,16) size=14`: routine 25, count=18, width=144px, height=8px

## ROM Address Reference

| Data | Address | Notes |
|------|---------|-------|
| Object Pointers | 0x874C | 3 bytes per room |
| Header Pointers | 0xB5DD | Room header data |
| Total Rooms | 296 | 0x128 rooms |

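As a worked example of the table above, a room's object-data location can be resolved from the pointer table. This is a sketch only: it assumes 0x874C is a PC file offset, that each entry is a 3-byte little-endian SNES address, and that the ROM is headerless LoROM (the conversion below reproduces the `PC 0x5230F` / `SNES 0x8AA30F` pair shown for room 001).

```python
OBJECT_PTRS = 0x874C  # object pointer table, 3 bytes per room (see table above)

def snes_to_pc(addr: int) -> int:
    """LoROM mapping: each SNES bank exposes a 32 KiB half of the file."""
    return ((addr >> 16) & 0x7F) * 0x8000 + (addr & 0x7FFF)

def room_object_data_offset(rom: bytes, room_id: int) -> int:
    off = OBJECT_PTRS + room_id * 3
    snes = rom[off] | (rom[off + 1] << 8) | (rom[off + 2] << 16)
    return snes_to_pc(snes)

with open("roms/alttp_vanilla.sfc", "rb") as f:
    rom = f.read()
print(hex(room_object_data_offset(rom, 1)))  # expected: 0x5230f for room 001
```
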
## Example: Room 001 Analysis

Room 001 is a good test case for BG2 overlay debugging:

```bash
python3 scripts/analyze_room.py --rom roms/alttp_vanilla.sfc 1 --compositing
```

Key objects on Layer 1 (BG2):
- Platform floor (0x033) at center
- Statues (0x038) near stairs
- Solid tiles (0x034, 0x071) for platform edges
- Inter-room stairs (0x13B)

These objects should create "holes" in the BG1 floor tiles so the BG2 layer shows through.

46 scripts/agent_build.sh Executable file
@@ -0,0 +1,46 @@
#!/bin/bash
# scripts/agent_build.sh
# Agent build helper (shared build directory by default; override via YAZE_BUILD_DIR).
# Usage: ./scripts/agent_build.sh [target]
# Default target is "yaze" if not specified.

set -e

# Detect OS
OS="$(uname -s)"
case "${OS}" in
    Linux*)  PRESET="lin-ai";;
    Darwin*) PRESET="mac-ai";;
    CYGWIN*) PRESET="win-ai";;
    MINGW*)  PRESET="win-ai";;
    *) echo "Unknown OS: ${OS}"; exit 1;;
esac

BUILD_DIR="${YAZE_BUILD_DIR:-build}"
TARGET="${1:-yaze}"

echo "=================================================="
echo "🤖 Agent Build System"
echo "Platform: ${OS}"
echo "Preset: ${PRESET}"
echo "Build Dir: ${BUILD_DIR}"
echo "Target: ${TARGET}"
echo "=================================================="

# Ensure we are in the project root
if [ ! -f "CMakePresets.json" ]; then
    echo "❌ Error: CMakePresets.json not found. Must run from project root."
    exit 1
fi

# Configure if needed (using the preset which now enforces binaryDir)
if [ ! -d "${BUILD_DIR}" ]; then
    echo "🔧 Configuring ${PRESET}..."
    cmake --preset "${PRESET}"
fi

# Build
echo "🔨 Building target: ${TARGET}..."
cmake --build "${BUILD_DIR}" --target "${TARGET}"

echo "✅ Build complete."
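
Typical invocations of this helper, based on its header comments (target defaults to `yaze`, build directory defaults to `build/` and can be overridden with `YAZE_BUILD_DIR`):

```bash
# Build the default target (yaze) in the shared build/ directory
./scripts/agent_build.sh

# Build the CLI tool instead
./scripts/agent_build.sh z3ed

# Use an isolated build directory without touching build/
YAZE_BUILD_DIR=build_agent ./scripts/agent_build.sh yaze
```
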
@@ -9,7 +9,8 @@ RED='\033[0;31m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

Z3ED="./build_test/bin/z3ed"
BUILD_DIR="${YAZE_BUILD_DIR:-./build}"
Z3ED="${BUILD_DIR}/bin/z3ed"
RESULTS_FILE="/tmp/z3ed_ai_test_results.txt"
USE_MOCK_ROM=true # Set to false if you want to test with a real ROM
OLLAMA_MODEL="${OLLAMA_MODEL:-qwen2.5-coder:0.5b}"
@@ -148,7 +149,7 @@ fi
# Check binary exists
if [ ! -f "$Z3ED" ]; then
    echo -e "${RED}✗ z3ed binary not found at: $Z3ED${NC}"
    echo "Run: cmake --build build_test"
    echo "Run: cmake --build $BUILD_DIR"
    exit 1
fi
echo "✅ z3ed binary found"

@@ -38,48 +38,48 @@ workflows/builds were triggered and where to find artifacts/logs.

Local builds can take 10-15+ minutes from scratch. Follow these practices to minimize rebuild time:

### Use Dedicated Build Directories
Always use a dedicated build directory like `build_ai` or `build_agent` to avoid interfering with the user's `build` directory:
### Use a Consistent Build Directory
Defaults now use `build/` for native builds. If you need isolation, set `YAZE_BUILD_DIR` or add a `CMakeUserPresets.json` locally:
```bash
cmake --preset mac-dbg -B build_ai
cmake --build build_ai -j8 --target yaze
cmake --preset mac-dbg
cmake --build build -j8 --target yaze
```

### Incremental Builds
Once configured, only rebuild—don't reconfigure unless CMakeLists.txt changed:
```bash
# GOOD: Just rebuild (fast, only recompiles changed files)
cmake --build build_ai -j8 --target yaze
cmake --build build -j8 --target yaze

# AVOID: Reconfiguring when unnecessary (triggers full dependency resolution)
cmake --preset mac-dbg -B build_ai && cmake --build build_ai
cmake --preset mac-dbg && cmake --build build
```

### Build Specific Targets
Don't build everything when you only need to verify a specific component:
```bash
# Build only the main editor (skips CLI, tests, etc.)
cmake --build build_ai -j8 --target yaze
cmake --build build -j8 --target yaze

# Build only the CLI tool
cmake --build build_ai -j8 --target z3ed
cmake --build build -j8 --target z3ed

# Build only tests
cmake --build build_ai -j8 --target yaze_test
cmake --build build -j8 --target yaze_test
```

### Parallel Compilation
Always use `-j8` or higher based on CPU cores:
```bash
cmake --build build_ai -j$(sysctl -n hw.ncpu) # macOS
cmake --build build_ai -j$(nproc) # Linux
cmake --build build -j$(sysctl -n hw.ncpu) # macOS
cmake --build build -j$(nproc) # Linux
```

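A portable way to pick the job count without hard-coding the platform (a sketch; both commands appear above):

```bash
# Use nproc where available, fall back to sysctl on macOS
JOBS=$(nproc 2>/dev/null || sysctl -n hw.ncpu)
cmake --build build -j"$JOBS" --target yaze
```
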
### Quick Syntax Check
For rapid iteration on compile errors, build just the affected library:
```bash
# If fixing errors in src/app/editor/dungeon/, build just the editor lib
cmake --build build_ai -j8 --target yaze_editor
cmake --build build -j8 --target yaze_editor
```

### Verifying Changes Before CI

636 scripts/aggregate_test_results.py Normal file
@@ -0,0 +1,636 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Aggregate test results from multiple sources and generate comprehensive reports.
|
||||
Used by CI/CD pipeline to combine results from parallel test execution.
|
||||
"""
|
||||
|
||||
import json
|
||||
import xml.etree.ElementTree as ET
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Any
|
||||
from dataclasses import dataclass, asdict
|
||||
from datetime import datetime
|
||||
import re
|
||||
|
||||
@dataclass
|
||||
class TestCase:
|
||||
"""Individual test case result."""
|
||||
name: str
|
||||
suite: str
|
||||
status: str # passed, failed, skipped, error
|
||||
duration: float
|
||||
message: str = ""
|
||||
output: str = ""
|
||||
|
||||
@dataclass
|
||||
class TestSuite:
|
||||
"""Test suite results."""
|
||||
name: str
|
||||
tests: int = 0
|
||||
passed: int = 0
|
||||
failed: int = 0
|
||||
skipped: int = 0
|
||||
errors: int = 0
|
||||
duration: float = 0.0
|
||||
test_cases: List[TestCase] = None
|
||||
|
||||
def __post_init__(self):
|
||||
if self.test_cases is None:
|
||||
self.test_cases = []
|
||||
|
||||
@dataclass
|
||||
class StageResults:
|
||||
"""Results for a testing stage."""
|
||||
name: str
|
||||
status: str
|
||||
total: int
|
||||
passed: int
|
||||
failed: int
|
||||
skipped: int
|
||||
duration: float
|
||||
pass_rate: float
|
||||
emoji: str = ""
|
||||
|
||||
@dataclass
|
||||
class AggregatedResults:
|
||||
"""Complete aggregated test results."""
|
||||
overall_passed: bool
|
||||
total_tests: int
|
||||
total_passed: int
|
||||
total_failed: int
|
||||
total_skipped: int
|
||||
total_duration: float
|
||||
tests_per_second: float
|
||||
parallel_efficiency: float
|
||||
stage1: StageResults
|
||||
stage2: StageResults
|
||||
stage3: StageResults
|
||||
test_suites: List[TestSuite]
|
||||
failed_tests: List[TestCase]
|
||||
slowest_tests: List[TestCase]
|
||||
timestamp: str
|
||||
|
||||
class TestResultAggregator:
|
||||
"""Aggregates test results from multiple sources."""
|
||||
|
||||
def __init__(self, input_dir: Path):
|
||||
self.input_dir = input_dir
|
||||
self.test_suites = {}
|
||||
self.all_tests = []
|
||||
self.stage_results = {}
|
||||
|
||||
def parse_junit_xml(self, xml_file: Path) -> TestSuite:
|
||||
"""Parse JUnit XML test results."""
|
||||
try:
|
||||
tree = ET.parse(xml_file)
|
||||
root = tree.getroot()
|
||||
|
||||
# Handle both testsuites and testsuite root elements
|
||||
if root.tag == "testsuites":
|
||||
suites = root.findall("testsuite")
|
||||
else:
|
||||
suites = [root]
|
||||
|
||||
suite_results = TestSuite(name=xml_file.stem)
|
||||
|
||||
for suite_elem in suites:
|
||||
suite_name = suite_elem.get("name", "unknown")
|
||||
|
||||
for testcase_elem in suite_elem.findall("testcase"):
|
||||
test_name = testcase_elem.get("name")
|
||||
classname = testcase_elem.get("classname", suite_name)
|
||||
time = float(testcase_elem.get("time", 0))
|
||||
|
||||
# Determine status
|
||||
status = "passed"
|
||||
message = ""
|
||||
output = ""
|
||||
|
||||
failure = testcase_elem.find("failure")
|
||||
error = testcase_elem.find("error")
|
||||
skipped = testcase_elem.find("skipped")
|
||||
|
||||
if failure is not None:
|
||||
status = "failed"
|
||||
message = failure.get("message", "")
|
||||
output = failure.text or ""
|
||||
elif error is not None:
|
||||
status = "error"
|
||||
message = error.get("message", "")
|
||||
output = error.text or ""
|
||||
elif skipped is not None:
|
||||
status = "skipped"
|
||||
message = skipped.get("message", "")
|
||||
|
||||
test_case = TestCase(
|
||||
name=test_name,
|
||||
suite=classname,
|
||||
status=status,
|
||||
duration=time,
|
||||
message=message,
|
||||
output=output
|
||||
)
|
||||
|
||||
suite_results.test_cases.append(test_case)
|
||||
suite_results.tests += 1
|
||||
suite_results.duration += time
|
||||
|
||||
if status == "passed":
|
||||
suite_results.passed += 1
|
||||
elif status == "failed":
|
||||
suite_results.failed += 1
|
||||
elif status == "skipped":
|
||||
suite_results.skipped += 1
|
||||
elif status == "error":
|
||||
suite_results.errors += 1
|
||||
|
||||
return suite_results
|
||||
|
||||
except (ET.ParseError, IOError) as e:
|
||||
print(f"Warning: Failed to parse {xml_file}: {e}", file=sys.stderr)
|
||||
return TestSuite(name=xml_file.stem)
|
||||
|
||||
def parse_json_results(self, json_file: Path) -> TestSuite:
|
||||
"""Parse JSON test results (gtest format)."""
|
||||
try:
|
||||
with open(json_file) as f:
|
||||
data = json.load(f)
|
||||
|
||||
suite_results = TestSuite(name=json_file.stem)
|
||||
|
||||
# Handle both single suite and multiple suites
|
||||
if "testsuites" in data:
|
||||
suites = data["testsuites"]
|
||||
elif "testsuite" in data:
|
||||
suites = [data]
|
||||
else:
|
||||
suites = []
|
||||
|
||||
for suite in suites:
|
||||
suite_name = suite.get("name", "unknown")
|
||||
|
||||
for test in suite.get("testsuite", []):
|
||||
test_name = test.get("name")
|
||||
status = "passed" if test.get("result") == "COMPLETED" else "failed"
|
||||
duration = float(test.get("time", "0").replace("s", ""))
|
||||
|
||||
test_case = TestCase(
|
||||
name=test_name,
|
||||
suite=suite_name,
|
||||
status=status,
|
||||
duration=duration,
|
||||
output=test.get("output", "")
|
||||
)
|
||||
|
||||
suite_results.test_cases.append(test_case)
|
||||
suite_results.tests += 1
|
||||
suite_results.duration += duration
|
||||
|
||||
if status == "passed":
|
||||
suite_results.passed += 1
|
||||
else:
|
||||
suite_results.failed += 1
|
||||
|
||||
return suite_results
|
||||
|
||||
except (json.JSONDecodeError, IOError, KeyError) as e:
|
||||
print(f"Warning: Failed to parse {json_file}: {e}", file=sys.stderr)
|
||||
return TestSuite(name=json_file.stem)
|
||||
|
||||
def collect_results(self):
|
||||
"""Collect all test results from input directory."""
|
||||
# Find all result files
|
||||
xml_files = list(self.input_dir.rglob("*.xml"))
|
||||
json_files = list(self.input_dir.rglob("*.json"))
|
||||
|
||||
print(f"Found {len(xml_files)} XML and {len(json_files)} JSON result files")
|
||||
|
||||
# Parse XML results
|
||||
for xml_file in xml_files:
|
||||
# Skip non-test XML files
|
||||
if "coverage" in xml_file.name.lower():
|
||||
continue
|
||||
|
||||
suite = self.parse_junit_xml(xml_file)
|
||||
if suite.tests > 0:
|
||||
self.test_suites[suite.name] = suite
|
||||
self.all_tests.extend(suite.test_cases)
|
||||
|
||||
# Parse JSON results
|
||||
for json_file in json_files:
|
||||
# Skip non-test JSON files
|
||||
if any(skip in json_file.name.lower()
|
||||
for skip in ["summary", "metrics", "times", "coverage"]):
|
||||
continue
|
||||
|
||||
suite = self.parse_json_results(json_file)
|
||||
if suite.tests > 0:
|
||||
# Merge with existing suite if name matches
|
||||
if suite.name in self.test_suites:
|
||||
existing = self.test_suites[suite.name]
|
||||
existing.test_cases.extend(suite.test_cases)
|
||||
existing.tests += suite.tests
|
||||
existing.passed += suite.passed
|
||||
existing.failed += suite.failed
|
||||
existing.skipped += suite.skipped
|
||||
existing.errors += suite.errors
|
||||
existing.duration += suite.duration
|
||||
else:
|
||||
self.test_suites[suite.name] = suite
|
||||
self.all_tests.extend(suite.test_cases)
|
||||
|
||||
def categorize_by_stage(self):
|
||||
"""Categorize results by CI stage."""
|
||||
# Initialize stage results
|
||||
stages = {
|
||||
"stage1": StageResults("Smoke Tests", "unknown", 0, 0, 0, 0, 0.0, 0.0),
|
||||
"stage2": StageResults("Unit Tests", "unknown", 0, 0, 0, 0, 0.0, 0.0),
|
||||
"stage3": StageResults("Integration Tests", "unknown", 0, 0, 0, 0, 0.0, 0.0),
|
||||
}
|
||||
|
||||
# Categorize tests
|
||||
for test in self.all_tests:
|
||||
# Determine stage based on test name or suite
|
||||
stage = None
|
||||
if "smoke" in test.name.lower() or "critical" in test.name.lower():
|
||||
stage = "stage1"
|
||||
elif "unit" in test.suite.lower() or "unit" in test.name.lower():
|
||||
stage = "stage2"
|
||||
elif ("integration" in test.suite.lower() or
|
||||
"integration" in test.name.lower() or
|
||||
"e2e" in test.name.lower() or
|
||||
"gui" in test.name.lower()):
|
||||
stage = "stage3"
|
||||
else:
|
||||
# Default to unit tests
|
||||
stage = "stage2"
|
||||
|
||||
if stage:
|
||||
stage_result = stages[stage]
|
||||
stage_result.total += 1
|
||||
stage_result.duration += test.duration
|
||||
|
||||
if test.status == "passed":
|
||||
stage_result.passed += 1
|
||||
elif test.status in ["failed", "error"]:
|
||||
stage_result.failed += 1
|
||||
elif test.status == "skipped":
|
||||
stage_result.skipped += 1
|
||||
|
||||
# Calculate pass rates and status
|
||||
for stage_key, stage in stages.items():
|
||||
if stage.total > 0:
|
||||
stage.pass_rate = (stage.passed / stage.total) * 100
|
||||
stage.status = "✅" if stage.failed == 0 else "❌"
|
||||
stage.emoji = "✅" if stage.failed == 0 else "❌"
|
||||
else:
|
||||
stage.status = "⏭️"
|
||||
stage.emoji = "⏭️"
|
||||
|
||||
self.stage_results = stages
|
||||
|
||||
def generate_summary(self) -> AggregatedResults:
|
||||
"""Generate aggregated summary of all results."""
|
||||
total_tests = len(self.all_tests)
|
||||
total_passed = sum(1 for t in self.all_tests if t.status == "passed")
|
||||
total_failed = sum(1 for t in self.all_tests
|
||||
if t.status in ["failed", "error"])
|
||||
total_skipped = sum(1 for t in self.all_tests if t.status == "skipped")
|
||||
total_duration = sum(t.duration for t in self.all_tests)
|
||||
|
||||
# Find failed tests
|
||||
failed_tests = [t for t in self.all_tests
|
||||
if t.status in ["failed", "error"]]
|
||||
|
||||
# Find slowest tests
|
||||
slowest_tests = sorted(self.all_tests,
|
||||
key=lambda t: t.duration,
|
||||
reverse=True)[:10]
|
||||
|
||||
# Calculate metrics
|
||||
tests_per_second = total_tests / total_duration if total_duration > 0 else 0
|
||||
|
||||
# Estimate parallel efficiency (simplified)
|
||||
num_shards = len(self.test_suites)
|
||||
if num_shards > 1:
|
||||
ideal_time = total_duration / num_shards
|
||||
actual_time = max(suite.duration for suite in self.test_suites.values())
|
||||
parallel_efficiency = (ideal_time / actual_time * 100) if actual_time > 0 else 0
|
||||
else:
|
||||
parallel_efficiency = 100
|
||||
|
||||
return AggregatedResults(
|
||||
overall_passed=(total_failed == 0),
|
||||
total_tests=total_tests,
|
||||
total_passed=total_passed,
|
||||
total_failed=total_failed,
|
||||
total_skipped=total_skipped,
|
||||
total_duration=round(total_duration, 2),
|
||||
tests_per_second=round(tests_per_second, 2),
|
||||
parallel_efficiency=round(parallel_efficiency, 1),
|
||||
stage1=self.stage_results.get("stage1"),
|
||||
stage2=self.stage_results.get("stage2"),
|
||||
stage3=self.stage_results.get("stage3"),
|
||||
test_suites=list(self.test_suites.values()),
|
||||
failed_tests=failed_tests,
|
||||
slowest_tests=slowest_tests,
|
||||
timestamp=datetime.now().isoformat()
|
||||
)
|
||||
|
||||
def generate_html_report(self, results: AggregatedResults, output_path: Path):
|
||||
"""Generate HTML report from aggregated results."""
|
||||
html = f"""<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Yaze Test Results - {datetime.now().strftime('%Y-%m-%d %H:%M')}</title>
|
||||
<style>
|
||||
body {{
|
||||
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
|
||||
margin: 0;
|
||||
padding: 20px;
|
||||
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
||||
min-height: 100vh;
|
||||
}}
|
||||
.container {{
|
||||
max-width: 1200px;
|
||||
margin: 0 auto;
|
||||
background: white;
|
||||
border-radius: 10px;
|
||||
padding: 30px;
|
||||
box-shadow: 0 20px 60px rgba(0,0,0,0.3);
|
||||
}}
|
||||
h1 {{
|
||||
color: #333;
|
||||
border-bottom: 3px solid #667eea;
|
||||
padding-bottom: 10px;
|
||||
}}
|
||||
.summary {{
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
|
||||
gap: 20px;
|
||||
margin: 30px 0;
|
||||
}}
|
||||
.metric {{
|
||||
background: #f8f9fa;
|
||||
padding: 20px;
|
||||
border-radius: 8px;
|
||||
text-align: center;
|
||||
border-left: 4px solid #667eea;
|
||||
}}
|
||||
.metric-value {{
|
||||
font-size: 32px;
|
||||
font-weight: bold;
|
||||
color: #667eea;
|
||||
}}
|
||||
.metric-label {{
|
||||
color: #666;
|
||||
font-size: 14px;
|
||||
margin-top: 5px;
|
||||
}}
|
||||
.status-pass {{
|
||||
color: #28a745;
|
||||
}}
|
||||
.status-fail {{
|
||||
color: #dc3545;
|
||||
}}
|
||||
table {{
|
||||
width: 100%;
|
||||
border-collapse: collapse;
|
||||
margin: 20px 0;
|
||||
}}
|
||||
th {{
|
||||
background: #667eea;
|
||||
color: white;
|
||||
padding: 12px;
|
||||
text-align: left;
|
||||
}}
|
||||
td {{
|
||||
padding: 10px;
|
||||
border-bottom: 1px solid #ddd;
|
||||
}}
|
||||
tr:hover {{
|
||||
background: #f8f9fa;
|
||||
}}
|
||||
.stage-badge {{
|
||||
display: inline-block;
|
||||
padding: 4px 8px;
|
||||
border-radius: 4px;
|
||||
font-size: 12px;
|
||||
font-weight: bold;
|
||||
}}
|
||||
.stage-pass {{
|
||||
background: #d4edda;
|
||||
color: #155724;
|
||||
}}
|
||||
.stage-fail {{
|
||||
background: #f8d7da;
|
||||
color: #721c24;
|
||||
}}
|
||||
.progress-bar {{
|
||||
width: 100%;
|
||||
height: 30px;
|
||||
background: #f0f0f0;
|
||||
border-radius: 15px;
|
||||
overflow: hidden;
|
||||
margin: 10px 0;
|
||||
}}
|
||||
.progress-fill {{
|
||||
height: 100%;
|
||||
background: linear-gradient(90deg, #28a745 0%, #20c997 100%);
|
||||
display: flex;
|
||||
align-items: center;
|
||||
padding-left: 10px;
|
||||
color: white;
|
||||
font-weight: bold;
|
||||
}}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<h1>🎯 Yaze Test Results Report</h1>
|
||||
|
||||
<div class="summary">
|
||||
<div class="metric">
|
||||
<div class="metric-value {'status-pass' if results.overall_passed else 'status-fail'}">
|
||||
{'PASSED' if results.overall_passed else 'FAILED'}
|
||||
</div>
|
||||
<div class="metric-label">Overall Status</div>
|
||||
</div>
|
||||
<div class="metric">
|
||||
<div class="metric-value">{results.total_tests}</div>
|
||||
<div class="metric-label">Total Tests</div>
|
||||
</div>
|
||||
<div class="metric">
|
||||
<div class="metric-value">{results.total_passed}</div>
|
||||
<div class="metric-label">Passed</div>
|
||||
</div>
|
||||
<div class="metric">
|
||||
<div class="metric-value">{results.total_failed}</div>
|
||||
<div class="metric-label">Failed</div>
|
||||
</div>
|
||||
<div class="metric">
|
||||
<div class="metric-value">{results.total_duration}s</div>
|
||||
<div class="metric-label">Duration</div>
|
||||
</div>
|
||||
<div class="metric">
|
||||
<div class="metric-value">{results.parallel_efficiency}%</div>
|
||||
<div class="metric-label">Efficiency</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<h2>📊 Pass Rate</h2>
|
||||
<div class="progress-bar">
|
||||
<div class="progress-fill" style="width: {results.total_passed / results.total_tests * 100:.1f}%">
|
||||
{results.total_passed / results.total_tests * 100:.1f}%
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<h2>🚀 Stage Results</h2>
|
||||
<table>
|
||||
<tr>
|
||||
<th>Stage</th>
|
||||
<th>Status</th>
|
||||
<th>Tests</th>
|
||||
<th>Passed</th>
|
||||
<th>Failed</th>
|
||||
<th>Pass Rate</th>
|
||||
<th>Duration</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Stage 1: Smoke</td>
|
||||
<td><span class="stage-badge {'stage-pass' if results.stage1.failed == 0 else 'stage-fail'}">
|
||||
{results.stage1.emoji}
|
||||
</span></td>
|
||||
<td>{results.stage1.total}</td>
|
||||
<td>{results.stage1.passed}</td>
|
||||
<td>{results.stage1.failed}</td>
|
||||
<td>{results.stage1.pass_rate:.1f}%</td>
|
||||
<td>{results.stage1.duration:.2f}s</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Stage 2: Unit</td>
|
||||
<td><span class="stage-badge {'stage-pass' if results.stage2.failed == 0 else 'stage-fail'}">
|
||||
{results.stage2.emoji}
|
||||
</span></td>
|
||||
<td>{results.stage2.total}</td>
|
||||
<td>{results.stage2.passed}</td>
|
||||
<td>{results.stage2.failed}</td>
|
||||
<td>{results.stage2.pass_rate:.1f}%</td>
|
||||
<td>{results.stage2.duration:.2f}s</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Stage 3: Integration</td>
|
||||
<td><span class="stage-badge {'stage-pass' if results.stage3.failed == 0 else 'stage-fail'}">
|
||||
{results.stage3.emoji}
|
||||
</span></td>
|
||||
<td>{results.stage3.total}</td>
|
||||
<td>{results.stage3.passed}</td>
|
||||
<td>{results.stage3.failed}</td>
|
||||
<td>{results.stage3.pass_rate:.1f}%</td>
|
||||
<td>{results.stage3.duration:.2f}s</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
{'<h2>❌ Failed Tests</h2><table><tr><th>Test</th><th>Suite</th><th>Message</th></tr>' if results.failed_tests else ''}
|
||||
{''.join(f'<tr><td>{t.name}</td><td>{t.suite}</td><td>{t.message[:100]}</td></tr>' for t in results.failed_tests[:20])}
|
||||
{'</table>' if results.failed_tests else ''}
|
||||
|
||||
<h2>🐌 Slowest Tests</h2>
|
||||
<table>
|
||||
<tr>
|
||||
<th>Test</th>
|
||||
<th>Suite</th>
|
||||
<th>Duration</th>
|
||||
</tr>
|
||||
{''.join(f'<tr><td>{t.name}</td><td>{t.suite}</td><td>{t.duration:.3f}s</td></tr>' for t in results.slowest_tests)}
|
||||
</table>
|
||||
|
||||
<p style="text-align: center; color: #666; margin-top: 40px;">
|
||||
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} |
|
||||
<a href="https://github.com/yaze/yaze">Yaze Project</a>
|
||||
</p>
|
||||
</div>
|
||||
</body>
|
||||
</html>"""
|
||||
|
||||
output_path.write_text(html)
|
||||
|
||||
|
||||
def main():
|
||||
"""Main entry point."""
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Aggregate test results from multiple sources"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input-dir",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="Directory containing test result files"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
type=Path,
|
||||
default=Path("results_summary.json"),
|
||||
help="Output JSON file for aggregated results"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--generate-html",
|
||||
type=Path,
|
||||
help="Generate HTML report at specified path"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if not args.input_dir.exists():
|
||||
print(f"Error: Input directory not found: {args.input_dir}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# Create aggregator
|
||||
aggregator = TestResultAggregator(args.input_dir)
|
||||
|
||||
# Collect and process results
|
||||
print("Collecting test results...")
|
||||
aggregator.collect_results()
|
||||
|
||||
print(f"Found {len(aggregator.all_tests)} total tests across "
|
||||
f"{len(aggregator.test_suites)} suites")
|
||||
|
||||
# Categorize by stage
|
||||
aggregator.categorize_by_stage()
|
||||
|
||||
# Generate summary
|
||||
summary = aggregator.generate_summary()
|
||||
|
||||
# Save JSON summary
|
||||
with open(args.output, 'w') as f:
|
||||
# Convert dataclasses to dict
|
||||
summary_dict = asdict(summary)
|
||||
json.dump(summary_dict, f, indent=2, default=str)
|
||||
|
||||
print(f"Summary saved to {args.output}")
|
||||
|
||||
# Generate HTML report if requested
|
||||
if args.generate_html:
|
||||
aggregator.generate_html_report(summary, args.generate_html)
|
||||
print(f"HTML report saved to {args.generate_html}")
|
||||
|
||||
# Print summary
|
||||
print(f"\n{'=' * 60}")
|
||||
print(f"Test Results Summary")
|
||||
print(f"{'=' * 60}")
|
||||
print(f"Overall Status: {'✅ PASSED' if summary.overall_passed else '❌ FAILED'}")
|
||||
print(f"Total Tests: {summary.total_tests}")
|
||||
print(f"Passed: {summary.total_passed} ({summary.total_passed/summary.total_tests*100:.1f}%)")
|
||||
print(f"Failed: {summary.total_failed}")
|
||||
print(f"Duration: {summary.total_duration}s")
|
||||
|
||||
# Exit with appropriate code
|
||||
sys.exit(0 if summary.overall_passed else 1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
370 scripts/ai/compare-models.py Executable file
@@ -0,0 +1,370 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
YAZE AI Model Comparison Report Generator
|
||||
|
||||
Generates comparison reports from evaluation results.
|
||||
|
||||
Usage:
|
||||
python compare-models.py results/eval-*.json
|
||||
python compare-models.py --format markdown results/eval-20241125.json
|
||||
python compare-models.py --best results/eval-*.json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
def load_results(file_paths: list[str]) -> list[dict]:
|
||||
"""Load evaluation results from JSON files."""
|
||||
results = []
|
||||
for path in file_paths:
|
||||
try:
|
||||
with open(path, 'r') as f:
|
||||
data = json.load(f)
|
||||
data['_source_file'] = path
|
||||
results.append(data)
|
||||
except Exception as e:
|
||||
print(f"Warning: Could not load {path}: {e}", file=sys.stderr)
|
||||
return results
|
||||
|
||||
|
||||
def merge_results(results: list[dict]) -> dict:
|
||||
"""Merge multiple result files into a single comparison."""
|
||||
merged = {
|
||||
"sources": [],
|
||||
"models": {},
|
||||
"timestamp": datetime.now().isoformat()
|
||||
}
|
||||
|
||||
for result in results:
|
||||
merged["sources"].append(result.get('_source_file', 'unknown'))
|
||||
|
||||
for model, model_data in result.get('models', {}).items():
|
||||
if model not in merged["models"]:
|
||||
merged["models"][model] = {
|
||||
"runs": [],
|
||||
"summary": {}
|
||||
}
|
||||
|
||||
merged["models"][model]["runs"].append({
|
||||
"source": result.get('_source_file'),
|
||||
"timestamp": result.get('timestamp'),
|
||||
"summary": model_data.get('summary', {}),
|
||||
"task_count": len(model_data.get('tasks', []))
|
||||
})
|
||||
|
||||
# Calculate averages across runs
|
||||
for model, data in merged["models"].items():
|
||||
runs = data["runs"]
|
||||
if runs:
|
||||
data["summary"] = {
|
||||
"avg_accuracy": sum(r["summary"].get("avg_accuracy", 0) for r in runs) / len(runs),
|
||||
"avg_completeness": sum(r["summary"].get("avg_completeness", 0) for r in runs) / len(runs),
|
||||
"avg_tool_usage": sum(r["summary"].get("avg_tool_usage", 0) for r in runs) / len(runs),
|
||||
"avg_response_time": sum(r["summary"].get("avg_response_time", 0) for r in runs) / len(runs),
|
||||
"overall_score": sum(r["summary"].get("overall_score", 0) for r in runs) / len(runs),
|
||||
"run_count": len(runs)
|
||||
}
|
||||
|
||||
return merged
|
||||
|
||||
|
||||
def format_table(merged: dict) -> str:
|
||||
"""Format results as ASCII table."""
|
||||
lines = []
|
||||
|
||||
lines.append("┌" + "─"*78 + "┐")
|
||||
lines.append("│" + " "*18 + "YAZE AI Model Comparison Report" + " "*27 + "│")
|
||||
lines.append("│" + " "*18 + f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}" + " "*27 + "│")
|
||||
lines.append("├" + "─"*78 + "┤")
|
||||
lines.append("│ {:24} │ {:10} │ {:10} │ {:10} │ {:10} │ {:5} │".format(
|
||||
"Model", "Accuracy", "Complete", "Tool Use", "Speed", "Runs"
|
||||
))
|
||||
lines.append("├" + "─"*78 + "┤")
|
||||
|
||||
# Sort by overall score
|
||||
sorted_models = sorted(
|
||||
merged["models"].items(),
|
||||
key=lambda x: x[1]["summary"].get("overall_score", 0),
|
||||
reverse=True
|
||||
)
|
||||
|
||||
for model, data in sorted_models:
|
||||
summary = data["summary"]
|
||||
model_name = model[:24] if len(model) <= 24 else model[:21] + "..."
|
||||
|
||||
lines.append("│ {:24} │ {:8.1f}/10 │ {:8.1f}/10 │ {:8.1f}/10 │ {:7.1f}s │ {:5} │".format(
|
||||
model_name,
|
||||
summary.get("avg_accuracy", 0),
|
||||
summary.get("avg_completeness", 0),
|
||||
summary.get("avg_tool_usage", 0),
|
||||
summary.get("avg_response_time", 0),
|
||||
summary.get("run_count", 0)
|
||||
))
|
||||
|
||||
lines.append("├" + "─"*78 + "┤")
|
||||
|
||||
# Add recommendation
|
||||
if sorted_models:
|
||||
best_model = sorted_models[0][0]
|
||||
best_score = sorted_models[0][1]["summary"].get("overall_score", 0)
|
||||
lines.append("│ {:76} │".format(f"Recommended: {best_model} (score: {best_score:.1f}/10)"))
|
||||
|
||||
lines.append("└" + "─"*78 + "┘")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def format_markdown(merged: dict) -> str:
|
||||
"""Format results as Markdown."""
|
||||
lines = []
|
||||
|
||||
lines.append("# YAZE AI Model Comparison Report")
|
||||
lines.append("")
|
||||
lines.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}")
|
||||
lines.append("")
|
||||
lines.append("## Summary")
|
||||
lines.append("")
|
||||
lines.append("| Model | Accuracy | Completeness | Tool Use | Speed | Overall | Runs |")
|
||||
lines.append("|-------|----------|--------------|----------|-------|---------|------|")
|
||||
|
||||
sorted_models = sorted(
|
||||
merged["models"].items(),
|
||||
key=lambda x: x[1]["summary"].get("overall_score", 0),
|
||||
reverse=True
|
||||
)
|
||||
|
||||
for model, data in sorted_models:
|
||||
summary = data["summary"]
|
||||
lines.append("| {} | {:.1f}/10 | {:.1f}/10 | {:.1f}/10 | {:.1f}s | **{:.1f}/10** | {} |".format(
|
||||
model,
|
||||
summary.get("avg_accuracy", 0),
|
||||
summary.get("avg_completeness", 0),
|
||||
summary.get("avg_tool_usage", 0),
|
||||
summary.get("avg_response_time", 0),
|
||||
summary.get("overall_score", 0),
|
||||
summary.get("run_count", 0)
|
||||
))
|
||||
|
||||
lines.append("")
|
||||
|
||||
# Recommendation section
|
||||
if sorted_models:
|
||||
best = sorted_models[0]
|
||||
lines.append("## Recommendation")
|
||||
lines.append("")
|
||||
lines.append(f"**Best Model:** `{best[0]}`")
|
||||
lines.append("")
|
||||
lines.append("### Strengths")
|
||||
lines.append("")
|
||||
|
||||
summary = best[1]["summary"]
|
||||
if summary.get("avg_accuracy", 0) >= 8:
|
||||
lines.append("- ✅ High accuracy in responses")
|
||||
if summary.get("avg_tool_usage", 0) >= 8:
|
||||
lines.append("- ✅ Effective tool usage")
|
||||
if summary.get("avg_response_time", 0) <= 3:
|
||||
lines.append("- ✅ Fast response times")
|
||||
if summary.get("avg_completeness", 0) >= 8:
|
||||
lines.append("- ✅ Complete and detailed responses")
|
||||
|
||||
lines.append("")
|
||||
lines.append("### Considerations")
|
||||
lines.append("")
|
||||
|
||||
if summary.get("avg_accuracy", 0) < 7:
|
||||
lines.append("- ⚠️ Accuracy could be improved")
|
||||
if summary.get("avg_tool_usage", 0) < 7:
|
||||
lines.append("- ⚠️ Tool usage needs improvement")
|
||||
if summary.get("avg_response_time", 0) > 5:
|
||||
lines.append("- ⚠️ Response times are slow")
|
||||
|
||||
# Source files section
|
||||
lines.append("")
|
||||
lines.append("## Sources")
|
||||
lines.append("")
|
||||
for source in merged.get("sources", []):
|
||||
lines.append(f"- `{source}`")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def format_json(merged: dict) -> str:
|
||||
"""Format results as JSON."""
|
||||
# Remove internal fields
|
||||
output = {k: v for k, v in merged.items() if not k.startswith('_')}
|
||||
return json.dumps(output, indent=2)
|
||||
|
||||
|
||||
def get_best_model(merged: dict) -> str:
|
||||
"""Get the name of the best performing model."""
|
||||
sorted_models = sorted(
|
||||
merged["models"].items(),
|
||||
key=lambda x: x[1]["summary"].get("overall_score", 0),
|
||||
reverse=True
|
||||
)
|
||||
|
||||
if sorted_models:
|
||||
return sorted_models[0][0]
|
||||
return "unknown"
|
||||
|
||||
|
||||
def analyze_task_performance(results: list[dict]) -> dict:
|
||||
"""Analyze performance broken down by task category."""
|
||||
task_performance = {}
|
||||
|
||||
for result in results:
|
||||
for model, model_data in result.get('models', {}).items():
|
||||
for task in model_data.get('tasks', []):
|
||||
category = task.get('category', 'unknown')
|
||||
task_id = task.get('task_id', 'unknown')
|
||||
|
||||
key = f"{category}/{task_id}"
|
||||
if key not in task_performance:
|
||||
task_performance[key] = {
|
||||
"category": category,
|
||||
"task_id": task_id,
|
||||
"task_name": task.get('task_name', 'Unknown'),
|
||||
"models": {}
|
||||
}
|
||||
|
||||
if model not in task_performance[key]["models"]:
|
||||
task_performance[key]["models"][model] = {
|
||||
"scores": [],
|
||||
"times": []
|
||||
}
|
||||
|
||||
task_performance[key]["models"][model]["scores"].append(
|
||||
task.get('accuracy_score', 0) * 0.5 +
|
||||
task.get('completeness_score', 0) * 0.3 +
|
||||
task.get('tool_usage_score', 0) * 0.2
|
||||
)
|
||||
task_performance[key]["models"][model]["times"].append(
|
||||
task.get('response_time', 0)
|
||||
)
|
||||
|
||||
# Calculate averages
|
||||
for task_key, task_data in task_performance.items():
|
||||
for model, model_scores in task_data["models"].items():
|
||||
scores = model_scores["scores"]
|
||||
times = model_scores["times"]
|
||||
model_scores["avg_score"] = sum(scores) / len(scores) if scores else 0
|
||||
model_scores["avg_time"] = sum(times) / len(times) if times else 0
|
||||
|
||||
return task_performance
|
||||
|
||||
|
||||
def format_task_analysis(task_performance: dict) -> str:
|
||||
"""Format task-level analysis."""
|
||||
lines = []
|
||||
lines.append("\n## Task-Level Performance\n")
|
||||
|
||||
# Group by category
|
||||
by_category = {}
|
||||
for key, data in task_performance.items():
|
||||
cat = data["category"]
|
||||
if cat not in by_category:
|
||||
by_category[cat] = []
|
||||
by_category[cat].append(data)
|
||||
|
||||
for category, tasks in sorted(by_category.items()):
|
||||
lines.append(f"### {category.replace('_', ' ').title()}\n")
|
||||
lines.append("| Task | Best Model | Score | Time |")
|
||||
lines.append("|------|------------|-------|------|")
|
||||
|
||||
for task in tasks:
|
||||
# Find best model for this task
|
||||
best_model = None
|
||||
best_score = 0
|
||||
for model, scores in task["models"].items():
|
||||
if scores["avg_score"] > best_score:
|
||||
best_score = scores["avg_score"]
|
||||
best_model = model
|
||||
|
||||
if best_model:
|
||||
best_time = task["models"][best_model]["avg_time"]
|
||||
lines.append("| {} | {} | {:.1f}/10 | {:.1f}s |".format(
|
||||
task["task_name"],
|
||||
best_model,
|
||||
best_score,
|
||||
best_time
|
||||
))
|
||||
|
||||
lines.append("")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Generate comparison reports from AI evaluation results"
|
||||
)
|
||||
parser.add_argument(
|
||||
"files",
|
||||
nargs="+",
|
||||
help="Evaluation result JSON files to compare"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--format", "-f",
|
||||
choices=["table", "markdown", "json"],
|
||||
default="table",
|
||||
help="Output format (default: table)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", "-o",
|
||||
help="Output file (default: stdout)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--best",
|
||||
action="store_true",
|
||||
help="Only output the best model name (for scripting)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--task-analysis",
|
||||
action="store_true",
|
||||
help="Include task-level performance analysis"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Load and merge results
|
||||
results = load_results(args.files)
|
||||
if not results:
|
||||
print("No valid result files found", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
merged = merge_results(results)
|
||||
|
||||
# Handle --best flag
|
||||
if args.best:
|
||||
print(get_best_model(merged))
|
||||
sys.exit(0)
|
||||
|
||||
# Format output
|
||||
if args.format == "table":
|
||||
output = format_table(merged)
|
||||
elif args.format == "markdown":
|
||||
output = format_markdown(merged)
|
||||
if args.task_analysis:
|
||||
task_perf = analyze_task_performance(results)
|
||||
output += format_task_analysis(task_perf)
|
||||
else:
|
||||
output = format_json(merged)
|
||||
|
||||
# Write output
|
||||
if args.output:
|
||||
with open(args.output, 'w') as f:
|
||||
f.write(output)
|
||||
print(f"Report written to: {args.output}")
|
||||
else:
|
||||
print(output)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
596 scripts/ai/eval-runner.py Executable file
@@ -0,0 +1,596 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
YAZE AI Model Evaluation Runner
|
||||
|
||||
Runs evaluation tasks against multiple AI models and produces scored results.
|
||||
|
||||
Usage:
|
||||
python eval-runner.py --models llama3,qwen2.5-coder --tasks rom_inspection
|
||||
python eval-runner.py --all-models --tasks all --output results/eval-$(date +%Y%m%d).json
|
||||
|
||||
Requirements:
|
||||
pip install requests pyyaml
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
|
||||
import requests
|
||||
import yaml
|
||||
|
||||
|
||||
@dataclass
|
||||
class TaskResult:
|
||||
"""Result of a single task evaluation."""
|
||||
task_id: str
|
||||
task_name: str
|
||||
category: str
|
||||
model: str
|
||||
prompt: str
|
||||
response: str
|
||||
response_time: float
|
||||
accuracy_score: float = 0.0
|
||||
completeness_score: float = 0.0
|
||||
tool_usage_score: float = 0.0
|
||||
pattern_matches: list = field(default_factory=list)
|
||||
tools_used: list = field(default_factory=list)
|
||||
error: Optional[str] = None
|
||||
|
||||
@property
|
||||
def overall_score(self) -> float:
|
||||
"""Calculate weighted overall score."""
|
||||
# Default weights from eval-tasks.yaml
|
||||
weights = {
|
||||
'accuracy': 0.4,
|
||||
'completeness': 0.3,
|
||||
'tool_usage': 0.2,
|
||||
'response_time': 0.1
|
||||
}
|
||||
|
||||
# Normalize response time to 0-10 scale (lower is better)
|
||||
# 0s = 10, 60s+ = 0
|
||||
time_score = max(0, 10 - (self.response_time / 6))
|
||||
|
||||
return (
|
||||
weights['accuracy'] * self.accuracy_score +
|
||||
weights['completeness'] * self.completeness_score +
|
||||
weights['tool_usage'] * self.tool_usage_score +
|
||||
weights['response_time'] * time_score
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelResults:
|
||||
"""Aggregated results for a single model."""
|
||||
model: str
|
||||
tasks: list[TaskResult] = field(default_factory=list)
|
||||
|
||||
@property
|
||||
def avg_accuracy(self) -> float:
|
||||
if not self.tasks:
|
||||
return 0.0
|
||||
return sum(t.accuracy_score for t in self.tasks) / len(self.tasks)
|
||||
|
||||
@property
|
||||
def avg_completeness(self) -> float:
|
||||
if not self.tasks:
|
||||
return 0.0
|
||||
return sum(t.completeness_score for t in self.tasks) / len(self.tasks)
|
||||
|
||||
@property
|
||||
def avg_tool_usage(self) -> float:
|
||||
if not self.tasks:
|
||||
return 0.0
|
||||
return sum(t.tool_usage_score for t in self.tasks) / len(self.tasks)
|
||||
|
||||
@property
|
||||
def avg_response_time(self) -> float:
|
||||
if not self.tasks:
|
||||
return 0.0
|
||||
return sum(t.response_time for t in self.tasks) / len(self.tasks)
|
||||
|
||||
@property
|
||||
def overall_score(self) -> float:
|
||||
if not self.tasks:
|
||||
return 0.0
|
||||
return sum(t.overall_score for t in self.tasks) / len(self.tasks)
|
||||
|
||||
|
||||
class OllamaClient:
|
||||
"""Client for Ollama API."""
|
||||
|
||||
def __init__(self, base_url: str = "http://localhost:11434"):
|
||||
self.base_url = base_url
|
||||
|
||||
def is_available(self) -> bool:
|
||||
"""Check if Ollama is running."""
|
||||
try:
|
||||
resp = requests.get(f"{self.base_url}/api/tags", timeout=5)
|
||||
return resp.status_code == 200
|
||||
except requests.exceptions.RequestException:
|
||||
return False
|
||||
|
||||
def list_models(self) -> list[str]:
|
||||
"""List available models."""
|
||||
try:
|
||||
resp = requests.get(f"{self.base_url}/api/tags", timeout=10)
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
return [m['name'] for m in data.get('models', [])]
|
||||
except requests.exceptions.RequestException:
|
||||
pass
|
||||
return []
|
||||
|
||||
def pull_model(self, model: str) -> bool:
|
||||
"""Pull a model if not available."""
|
||||
print(f" Pulling model {model}...", end=" ", flush=True)
|
||||
try:
|
||||
resp = requests.post(
|
||||
f"{self.base_url}/api/pull",
|
||||
json={"name": model},
|
||||
timeout=600 # 10 minutes for large models
|
||||
)
|
||||
if resp.status_code == 200:
|
||||
print("Done")
|
||||
return True
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f"Failed: {e}")
|
||||
return False
|
||||
|
||||
def chat(self, model: str, prompt: str, timeout: int = 120) -> tuple[str, float]:
|
||||
"""
|
||||
Send a chat message and return response + response time.
|
||||
|
||||
Returns:
|
||||
Tuple of (response_text, response_time_seconds)
|
||||
"""
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
resp = requests.post(
|
||||
f"{self.base_url}/api/chat",
|
||||
json={
|
||||
"model": model,
|
||||
"messages": [{"role": "user", "content": prompt}],
|
||||
"stream": False
|
||||
},
|
||||
timeout=timeout
|
||||
)
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
content = data.get("message", {}).get("content", "")
|
||||
return content, elapsed
|
||||
else:
|
||||
return f"Error: HTTP {resp.status_code}", elapsed
|
||||
|
||||
except requests.exceptions.Timeout:
|
||||
return "Error: Request timed out", timeout
|
||||
except requests.exceptions.RequestException as e:
|
||||
return f"Error: {str(e)}", time.time() - start_time
|
||||
|
||||
|
||||
class TaskEvaluator:
|
||||
"""Evaluates task responses and assigns scores."""
|
||||
|
||||
def __init__(self, config: dict):
|
||||
self.config = config
|
||||
|
||||
def evaluate(self, task: dict, response: str, response_time: float) -> TaskResult:
|
||||
"""Evaluate a response for a task."""
|
||||
result = TaskResult(
|
||||
task_id=task['id'],
|
||||
task_name=task['name'],
|
||||
category=task.get('category', 'unknown'),
|
||||
model=task.get('model', 'unknown'),
|
||||
prompt=task.get('prompt', ''),
|
||||
response=response,
|
||||
response_time=response_time
|
||||
)
|
||||
|
||||
if response.startswith("Error:"):
|
||||
result.error = response
|
||||
return result
|
||||
|
||||
# Check pattern matches
|
||||
expected_patterns = task.get('expected_patterns', [])
|
||||
for pattern in expected_patterns:
|
||||
if re.search(pattern, response, re.IGNORECASE):
|
||||
result.pattern_matches.append(pattern)
|
||||
|
||||
# Score accuracy based on pattern matches
|
||||
if expected_patterns:
|
||||
match_ratio = len(result.pattern_matches) / len(expected_patterns)
|
||||
result.accuracy_score = match_ratio * 10
|
||||
else:
|
||||
# No patterns defined, give neutral score
|
||||
result.accuracy_score = 5.0
|
||||
|
||||
# Score completeness based on response length and structure
|
||||
result.completeness_score = self._score_completeness(response, task)
|
||||
|
||||
# Score tool usage
|
||||
result.tool_usage_score = self._score_tool_usage(response, task)
|
||||
|
||||
return result
|
||||
|
||||
def _score_completeness(self, response: str, task: dict) -> float:
|
||||
"""Score completeness based on response characteristics."""
|
||||
score = 0.0
|
||||
|
||||
# Base score for having a response
|
||||
if len(response.strip()) > 0:
|
||||
score += 2.0
|
||||
|
||||
# Length bonus (up to 4 points)
|
||||
word_count = len(response.split())
|
||||
if word_count >= 20:
|
||||
score += min(4.0, word_count / 50)
|
||||
|
||||
# Structure bonus (up to 2 points)
|
||||
if '\n' in response:
|
||||
score += 1.0 # Multi-line response
|
||||
if '- ' in response or '* ' in response:
|
||||
score += 0.5 # List items
|
||||
if any(c.isdigit() for c in response):
|
||||
score += 0.5 # Contains numbers/data
|
||||
|
||||
# Code block bonus
|
||||
        if '```' in response or '    ' in response:  # fenced or 4-space-indented code
|
||||
score += 1.0
|
||||
|
||||
return min(10.0, score)
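
    # Worked example of the heuristic above: a 100-word, multi-line answer with
    # a "- " bullet, at least one digit, and a ``` code fence scores
    # 2.0 + min(4.0, 100/50) + 1.0 + 0.5 + 0.5 + 1.0 = 7.0 out of 10.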
|
||||
|
||||
def _score_tool_usage(self, response: str, task: dict) -> float:
|
||||
"""Score tool usage based on task requirements."""
|
||||
required_tool = task.get('required_tool')
|
||||
|
||||
if not required_tool:
|
||||
# No tool required, check if response is sensible
|
||||
return 7.0 # Neutral-good score
|
||||
|
||||
# Check if the response mentions using tools
|
||||
tool_patterns = [
|
||||
r'filesystem-list',
|
||||
r'filesystem-read',
|
||||
r'filesystem-exists',
|
||||
r'filesystem-info',
|
||||
r'build-configure',
|
||||
r'build-compile',
|
||||
r'build-test',
|
||||
r'memory-analyze',
|
||||
r'memory-search',
|
||||
]
|
||||
|
||||
tools_mentioned = []
|
||||
for pattern in tool_patterns:
|
||||
if re.search(pattern, response, re.IGNORECASE):
|
||||
tools_mentioned.append(pattern)
|
||||
|
||||
if required_tool.lower() in ' '.join(tools_mentioned).lower():
|
||||
return 10.0 # Used the required tool
|
||||
elif tools_mentioned:
|
||||
return 6.0 # Used some tools but not the required one
|
||||
else:
|
||||
return 3.0 # Didn't use any tools when one was required
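
    # Example: with required_tool="filesystem-read", a response that names only
    # "filesystem-list" scores 6.0, one that names "filesystem-read" scores
    # 10.0, and one that names no known tool scores 3.0.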
|
||||
|
||||
|
||||
def load_config(config_path: str) -> dict:
    """Load the evaluation tasks configuration."""
    with open(config_path, 'r') as f:
        return yaml.safe_load(f)


def get_tasks_for_categories(config: dict, categories: list[str]) -> list[dict]:
    """Get all tasks for specified categories."""
    tasks = []

    for cat_name, cat_data in config.get('categories', {}).items():
        if 'all' in categories or cat_name in categories:
            for task in cat_data.get('tasks', []):
                task['category'] = cat_name
                tasks.append(task)

    return tasks
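
# Illustrative example (hypothetical config): given
#   categories: {rom_inspection: {tasks: [{id: "list_dungeons", ...}]}}
# get_tasks_for_categories(config, ["rom_inspection"]) returns those task dicts
# with a 'category' key added, e.g. {'id': 'list_dungeons', ..., 'category': 'rom_inspection'}.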
|
||||
|
||||
|
||||
def run_evaluation(
|
||||
models: list[str],
|
||||
tasks: list[dict],
|
||||
client: OllamaClient,
|
||||
evaluator: TaskEvaluator,
|
||||
timeout: int = 120
|
||||
) -> dict[str, ModelResults]:
|
||||
"""Run evaluation for all models and tasks."""
|
||||
results = {}
|
||||
|
||||
total = len(models) * len(tasks)
|
||||
current = 0
|
||||
|
||||
for model in models:
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Evaluating: {model}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
model_results = ModelResults(model=model)
|
||||
|
||||
for task in tasks:
|
||||
current += 1
|
||||
print(f"\n [{current}/{total}] {task['id']}: {task['name']}")
|
||||
|
||||
# Handle multi-turn tasks differently
|
||||
if task.get('multi_turn'):
|
||||
response, resp_time = run_multi_turn_task(
|
||||
client, model, task, timeout
|
||||
)
|
||||
else:
|
||||
prompt = task.get('prompt', '')
|
||||
print(f" Prompt: {prompt[:60]}...")
|
||||
response, resp_time = client.chat(model, prompt, timeout)
|
||||
|
||||
print(f" Response time: {resp_time:.2f}s")
|
||||
|
||||
# Create a copy of task with model info
|
||||
task_with_model = {**task, 'model': model}
|
||||
|
||||
# Evaluate the response
|
||||
result = evaluator.evaluate(task_with_model, response, resp_time)
|
||||
model_results.tasks.append(result)
|
||||
|
||||
print(f" Accuracy: {result.accuracy_score:.1f}/10")
|
||||
print(f" Completeness: {result.completeness_score:.1f}/10")
|
||||
print(f" Tool Usage: {result.tool_usage_score:.1f}/10")
|
||||
print(f" Overall: {result.overall_score:.1f}/10")
|
||||
|
||||
results[model] = model_results
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def run_multi_turn_task(
|
||||
client: OllamaClient,
|
||||
model: str,
|
||||
task: dict,
|
||||
timeout: int
|
||||
) -> tuple[str, float]:
|
||||
"""Run a multi-turn conversation task."""
|
||||
prompts = task.get('prompts', [])
|
||||
if not prompts:
|
||||
return "Error: No prompts defined for multi-turn task", 0.0
|
||||
|
||||
total_time = 0.0
|
||||
all_responses = []
|
||||
|
||||
for i, prompt in enumerate(prompts):
|
||||
# For simplicity, we send each prompt independently
|
||||
# A more sophisticated version would maintain conversation context
|
||||
print(f" Turn {i+1}: {prompt[:50]}...")
|
||||
response, resp_time = client.chat(model, prompt, timeout)
|
||||
total_time += resp_time
|
||||
all_responses.append(f"Turn {i+1}: {response}")
|
||||
|
||||
return "\n\n".join(all_responses), total_time
|
||||
|
||||
|
||||
def print_summary(results: dict[str, ModelResults]):
    """Print a summary table of results."""
    print("\n")
    print("┌" + "─"*75 + "┐")
    print("│" + " "*22 + "YAZE AI Model Evaluation Report" + " "*22 + "│")
    print("├" + "─"*75 + "┤")
    print("│ {:20} │ {:>11} │ {:>11} │ {:>8} │ {:>11} │".format(
        "Model", "Accuracy", "Tool Use", "Speed", "Overall"
    ))
    print("├" + "─"*75 + "┤")

    for model, model_results in sorted(
        results.items(),
        key=lambda x: x[1].overall_score,
        reverse=True
    ):
        # Format model name (truncate if needed)
        model_name = model[:20] if len(model) <= 20 else model[:17] + "..."

        print("│ {:20} │ {:8.1f}/10 │ {:8.1f}/10 │ {:7.1f}s │ {:8.1f}/10 │".format(
            model_name,
            model_results.avg_accuracy,
            model_results.avg_tool_usage,
            model_results.avg_response_time,
            model_results.overall_score
        ))

    print("└" + "─"*75 + "┘")
|
||||
|
||||
|
||||
def save_results(results: dict[str, ModelResults], output_path: str):
|
||||
"""Save detailed results to JSON file."""
|
||||
output_data = {
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
"version": "1.0",
|
||||
"models": {}
|
||||
}
|
||||
|
||||
for model, model_results in results.items():
|
||||
output_data["models"][model] = {
|
||||
"summary": {
|
||||
"avg_accuracy": model_results.avg_accuracy,
|
||||
"avg_completeness": model_results.avg_completeness,
|
||||
"avg_tool_usage": model_results.avg_tool_usage,
|
||||
"avg_response_time": model_results.avg_response_time,
|
||||
"overall_score": model_results.overall_score,
|
||||
},
|
||||
"tasks": [asdict(t) for t in model_results.tasks]
|
||||
}
|
||||
|
||||
os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
|
||||
with open(output_path, 'w') as f:
|
||||
json.dump(output_data, f, indent=2)
|
||||
|
||||
print(f"\nResults saved to: {output_path}")
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="YAZE AI Model Evaluation Runner"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--models", "-m",
|
||||
type=str,
|
||||
help="Comma-separated list of models to evaluate"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--all-models",
|
||||
action="store_true",
|
||||
help="Evaluate all available models"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--default-models",
|
||||
action="store_true",
|
||||
help="Evaluate default models from config"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tasks", "-t",
|
||||
type=str,
|
||||
default="all",
|
||||
help="Task categories to run (comma-separated, or 'all')"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config", "-c",
|
||||
type=str,
|
||||
default=os.path.join(os.path.dirname(__file__), "eval-tasks.yaml"),
|
||||
help="Path to evaluation config file"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", "-o",
|
||||
type=str,
|
||||
help="Output file for results (default: results/eval-TIMESTAMP.json)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--timeout",
|
||||
type=int,
|
||||
default=120,
|
||||
help="Timeout in seconds for each task (default: 120)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ollama-url",
|
||||
type=str,
|
||||
default="http://localhost:11434",
|
||||
help="Ollama API URL"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dry-run",
|
||||
action="store_true",
|
||||
help="Show what would be evaluated without running"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Load configuration
|
||||
print("Loading configuration...")
|
||||
try:
|
||||
config = load_config(args.config)
|
||||
except Exception as e:
|
||||
print(f"Error loading config: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
# Initialize Ollama client
|
||||
client = OllamaClient(args.ollama_url)
|
||||
|
||||
if not client.is_available():
|
||||
print("Error: Ollama is not running. Start it with 'ollama serve'")
|
||||
sys.exit(1)
|
||||
|
||||
# Determine which models to evaluate
|
||||
available_models = client.list_models()
|
||||
print(f"Available models: {', '.join(available_models) or 'none'}")
|
||||
|
||||
if args.all_models:
|
||||
models = available_models
|
||||
elif args.default_models:
|
||||
default_model_names = [
|
||||
m['name'] for m in config.get('default_models', [])
|
||||
]
|
||||
models = [m for m in default_model_names if m in available_models]
|
||||
# Offer to pull missing models
|
||||
missing = [m for m in default_model_names if m not in available_models]
|
||||
if missing:
|
||||
print(f"Missing default models: {', '.join(missing)}")
|
||||
for m in missing:
|
||||
if client.pull_model(m):
|
||||
models.append(m)
|
||||
    elif args.models:
        requested = [m.strip() for m in args.models.split(',')]
        models = []
        # Validate models exist (don't mutate the list while iterating over it)
        for m in requested:
            if m not in available_models:
                print(f"Warning: Model '{m}' not found. Attempting to pull...")
                if not client.pull_model(m):
                    print(f"  Failed to pull {m}, skipping")
                    continue
            models.append(m)
|
||||
else:
|
||||
# Default to first available model
|
||||
models = available_models[:1] if available_models else []
|
||||
|
||||
if not models:
|
||||
print("No models available for evaluation")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"Models to evaluate: {', '.join(models)}")
|
||||
|
||||
# Get tasks
|
||||
categories = [c.strip() for c in args.tasks.split(',')]
|
||||
tasks = get_tasks_for_categories(config, categories)
|
||||
|
||||
if not tasks:
|
||||
print(f"No tasks found for categories: {args.tasks}")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"Tasks to run: {len(tasks)}")
|
||||
for task in tasks:
|
||||
print(f" - [{task['category']}] {task['id']}: {task['name']}")
|
||||
|
||||
if args.dry_run:
|
||||
print("\nDry run complete. Use --help for options.")
|
||||
sys.exit(0)
|
||||
|
||||
# Run evaluation
|
||||
evaluator = TaskEvaluator(config)
|
||||
results = run_evaluation(
|
||||
models, tasks, client, evaluator, args.timeout
|
||||
)
|
||||
|
||||
# Print summary
|
||||
print_summary(results)
|
||||
|
||||
# Save results
|
||||
output_path = args.output or os.path.join(
|
||||
os.path.dirname(__file__),
|
||||
"results",
|
||||
f"eval-{datetime.now().strftime('%Y%m%d-%H%M%S')}.json"
|
||||
)
|
||||
save_results(results, output_path)
|
||||
|
||||
# Return exit code based on best model score
|
||||
best_score = max(r.overall_score for r in results.values())
|
||||
if best_score >= 7.0:
|
||||
sys.exit(0) # Good
|
||||
elif best_score >= 5.0:
|
||||
sys.exit(1) # Okay
|
||||
else:
|
||||
sys.exit(2) # Poor
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
383	scripts/ai/eval-tasks.yaml	Normal file
@@ -0,0 +1,383 @@
|
||||
# YAZE AI Model Evaluation Tasks
|
||||
#
|
||||
# This file defines evaluation tasks for comparing different AI models
|
||||
# used with the z3ed CLI agent system.
|
||||
#
|
||||
# Usage:
|
||||
# ./scripts/ai/run-model-eval.sh --models "llama3,qwen2.5,codellama" --tasks all
|
||||
# ./scripts/ai/run-model-eval.sh --tasks rom_inspection --models "llama3"
|
||||
#
|
||||
# Scoring:
|
||||
# Each task is scored on a 0-10 scale across multiple dimensions:
|
||||
# - accuracy: Did the model answer correctly?
|
||||
# - completeness: Did it include all relevant information?
|
||||
# - tool_usage: Did it use tools appropriately?
|
||||
# - response_time: Measured in seconds (lower is better)
|
||||
|
||||
version: "1.0"
|
||||
|
||||
# Models to evaluate by default
|
||||
default_models:
|
||||
- name: "llama3.2:latest"
|
||||
description: "Meta's Llama 3.2 - default baseline"
|
||||
type: "baseline"
|
||||
- name: "qwen2.5-coder:7b"
|
||||
description: "Qwen 2.5 Coder - optimized for code"
|
||||
type: "code"
|
||||
- name: "codellama:7b"
|
||||
description: "Meta's CodeLlama - code generation"
|
||||
type: "code"
|
||||
- name: "mistral:7b"
|
||||
description: "Mistral 7B - general purpose"
|
||||
type: "general"
|
||||
- name: "phi3:medium"
|
||||
description: "Microsoft Phi-3 - efficient"
|
||||
type: "efficient"
|
||||
|
||||
# Scoring weights for overall score calculation
|
||||
scoring_weights:
|
||||
accuracy: 0.4
|
||||
completeness: 0.3
|
||||
tool_usage: 0.2
|
||||
response_time: 0.1
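
# Worked example (assuming response time is normalized to a 0-10 scale before
# weighting): accuracy 8.0, completeness 7.0, tool_usage 9.0, speed 6.0 gives
#   overall = 0.4*8.0 + 0.3*7.0 + 0.2*9.0 + 0.1*6.0 = 7.7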
|
||||
|
||||
# Maximum response time before timeout (seconds)
|
||||
timeout: 120
|
||||
|
||||
# Evaluation task categories
|
||||
categories:
|
||||
rom_inspection:
|
||||
description: "Tasks that inspect ROM data structures"
|
||||
tasks:
|
||||
- id: "list_dungeons"
|
||||
name: "List Dungeons"
|
||||
prompt: "What dungeons are in this ROM? List their names and IDs."
|
||||
expected_patterns:
|
||||
- "eastern palace|palace of darkness|desert palace"
|
||||
- "tower of hera|swamp palace|skull woods"
|
||||
- "thieves|ice palace|misery mire"
|
||||
required_tool: null
|
||||
scoring:
|
||||
accuracy_criteria: "Lists at least 8 dungeons with correct names"
|
||||
completeness_criteria: "Includes dungeon IDs or entrance info"
|
||||
|
||||
- id: "describe_overworld"
|
||||
name: "Describe Overworld Map"
|
||||
prompt: "Describe overworld map 0 (Light World). What areas and features are visible?"
|
||||
expected_patterns:
|
||||
- "light world|hyrule"
|
||||
- "castle|sanctuary|kakariko"
|
||||
required_tool: null
|
||||
scoring:
|
||||
accuracy_criteria: "Correctly identifies the Light World"
|
||||
completeness_criteria: "Mentions multiple notable locations"
|
||||
|
||||
- id: "find_sprites"
|
||||
name: "Find Sprites in Room"
|
||||
prompt: "What sprites are present in dungeon room 0? List their types and positions."
|
||||
expected_patterns:
|
||||
- "sprite|enemy|npc"
|
||||
- "position|coordinate|x|y"
|
||||
required_tool: null
|
||||
scoring:
|
||||
accuracy_criteria: "Lists sprites with correct types"
|
||||
completeness_criteria: "Includes position data"
|
||||
|
||||
- id: "entrance_info"
|
||||
name: "Get Entrance Information"
|
||||
prompt: "Where is the entrance to the Eastern Palace?"
|
||||
expected_patterns:
|
||||
- "eastern|palace|entrance"
|
||||
- "east|light world"
|
||||
required_tool: null
|
||||
scoring:
|
||||
accuracy_criteria: "Correctly identifies entrance location"
|
||||
completeness_criteria: "Provides coordinates or map reference"
|
||||
|
||||
code_analysis:
|
||||
description: "Tasks that analyze or generate code"
|
||||
tasks:
|
||||
- id: "explain_function"
|
||||
name: "Explain Function"
|
||||
prompt: "Explain what the function LoadDungeonRoom does in the codebase."
|
||||
expected_patterns:
|
||||
- "dungeon|room|load"
|
||||
- "tilemap|object|sprite"
|
||||
required_tool: "filesystem-read"
|
||||
scoring:
|
||||
accuracy_criteria: "Correctly describes the function purpose"
|
||||
completeness_criteria: "Explains key steps or data flows"
|
||||
|
||||
- id: "find_bugs"
|
||||
name: "Find Potential Issues"
|
||||
prompt: "Are there any potential issues with how sprite coordinates are handled in room loading?"
|
||||
expected_patterns:
|
||||
- "bounds|overflow|check"
|
||||
- "coordinate|position"
|
||||
required_tool: "filesystem-read"
|
||||
scoring:
|
||||
accuracy_criteria: "Identifies real or plausible issues"
|
||||
completeness_criteria: "Explains why the issue matters"
|
||||
|
||||
- id: "suggest_refactor"
|
||||
name: "Suggest Refactoring"
|
||||
prompt: "How could the dungeon editor's room rendering be improved for performance?"
|
||||
expected_patterns:
|
||||
- "cache|batch|optimize"
|
||||
- "render|draw|update"
|
||||
required_tool: "filesystem-read"
|
||||
scoring:
|
||||
accuracy_criteria: "Suggests valid optimization strategies"
|
||||
completeness_criteria: "Explains implementation approach"
|
||||
|
||||
tool_calling:
|
||||
description: "Tasks that require proper tool usage"
|
||||
tasks:
|
||||
- id: "list_files"
|
||||
name: "List Source Files"
|
||||
prompt: "List all .cc files in src/app/editor/"
|
||||
expected_patterns:
|
||||
- "\\.cc"
|
||||
- "editor"
|
||||
required_tool: "filesystem-list"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses filesystem-list tool correctly"
|
||||
completeness_criteria: "Lists files in correct directory"
|
||||
|
||||
- id: "read_file"
|
||||
name: "Read File Contents"
|
||||
prompt: "What are the first 20 lines of src/app/rom.h?"
|
||||
expected_patterns:
|
||||
- "#ifndef|#define|#include"
|
||||
- "rom|Rom"
|
||||
required_tool: "filesystem-read"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses filesystem-read with correct path"
|
||||
completeness_criteria: "Shows actual file content"
|
||||
|
||||
- id: "check_existence"
|
||||
name: "Check File Existence"
|
||||
prompt: "Does the file src/app/editor/dungeon/dungeon_editor.cc exist?"
|
||||
expected_patterns:
|
||||
- "exists|found|yes"
|
||||
required_tool: "filesystem-exists"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses filesystem-exists tool"
|
||||
completeness_criteria: "Provides clear yes/no answer"
|
||||
|
||||
- id: "build_status"
|
||||
name: "Get Build Status"
|
||||
prompt: "What build presets are available for macOS?"
|
||||
expected_patterns:
|
||||
- "mac-dbg|mac-rel|mac-ai|mac-test"
|
||||
- "preset|configure"
|
||||
required_tool: "build-configure"
|
||||
scoring:
|
||||
accuracy_criteria: "Lists valid macOS presets"
|
||||
completeness_criteria: "Describes preset purposes"
|
||||
|
||||
visual_analysis:
|
||||
description: "Tasks for visual analysis and pattern recognition"
|
||||
tasks:
|
||||
- id: "find_similar_tiles"
|
||||
name: "Find Similar Tiles"
|
||||
prompt: "Find tiles similar to tile 42 in the ROM. Use a similarity threshold of 85%."
|
||||
expected_patterns:
|
||||
- "similar|match|tile"
|
||||
- "similarity|score|percent"
|
||||
required_tool: "visual-find-similar-tiles"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses visual-find-similar-tiles with correct parameters"
|
||||
completeness_criteria: "Returns list of matching tiles with scores"
|
||||
|
||||
- id: "analyze_spritesheet"
|
||||
name: "Analyze Spritesheet"
|
||||
prompt: "Analyze graphics sheet 10 to find unused regions that could be used for custom graphics."
|
||||
expected_patterns:
|
||||
- "unused|empty|free"
|
||||
- "region|space|tile"
|
||||
required_tool: "visual-analyze-spritesheet"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses visual-analyze-spritesheet tool"
|
||||
completeness_criteria: "Reports locations and sizes of free regions"
|
||||
|
||||
- id: "palette_usage"
|
||||
name: "Palette Usage Analysis"
|
||||
prompt: "Analyze which palettes are used most frequently in the overworld maps."
|
||||
expected_patterns:
|
||||
- "palette|color"
|
||||
- "usage|count|percent"
|
||||
required_tool: "visual-palette-usage"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses visual-palette-usage with overworld type"
|
||||
completeness_criteria: "Shows palette usage statistics"
|
||||
|
||||
- id: "tile_histogram"
|
||||
name: "Tile Usage Histogram"
|
||||
prompt: "Generate a histogram of the top 20 most used tiles in dungeon rooms."
|
||||
expected_patterns:
|
||||
- "tile|usage|histogram"
|
||||
- "count|frequency|top"
|
||||
required_tool: "visual-tile-histogram"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses visual-tile-histogram with dungeon type"
|
||||
completeness_criteria: "Lists top tiles with usage counts"
|
||||
|
||||
project_management:
|
||||
description: "Tasks for project state and snapshot management"
|
||||
tasks:
|
||||
- id: "project_status"
|
||||
name: "Get Project Status"
|
||||
prompt: "What is the current project status? Show me any pending edits and available snapshots."
|
||||
expected_patterns:
|
||||
- "project|status|snapshot"
|
||||
- "edit|pending|initialized"
|
||||
required_tool: "project-status"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses project-status tool correctly"
|
||||
completeness_criteria: "Reports project state, snapshots, and ROM checksum"
|
||||
|
||||
- id: "create_snapshot"
|
||||
name: "Create Project Snapshot"
|
||||
prompt: "Create a snapshot named 'v1.0' with description 'Initial sprite modifications'."
|
||||
expected_patterns:
|
||||
- "snapshot|created|v1.0"
|
||||
- "edit|delta|saved"
|
||||
required_tool: "project-snapshot"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses project-snapshot with correct name parameter"
|
||||
completeness_criteria: "Confirms snapshot creation with details"
|
||||
|
||||
- id: "compare_snapshots"
|
||||
name: "Compare Snapshots"
|
||||
prompt: "Compare snapshots 'before-fix' and 'after-fix' to see what changed."
|
||||
expected_patterns:
|
||||
- "diff|compare|changed"
|
||||
- "added|removed|modified"
|
||||
required_tool: "project-diff"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses project-diff with both snapshot names"
|
||||
completeness_criteria: "Shows detailed comparison of edits"
|
||||
|
||||
- id: "restore_checkpoint"
|
||||
name: "Restore to Checkpoint"
|
||||
prompt: "Restore the ROM to the 'stable' snapshot."
|
||||
expected_patterns:
|
||||
- "restore|snapshot|stable"
|
||||
- "applied|reverted|edit"
|
||||
required_tool: "project-restore"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses project-restore with correct snapshot name"
|
||||
completeness_criteria: "Confirms restoration and lists applied edits"
|
||||
|
||||
code_generation:
|
||||
description: "Tasks for ASM code generation and patching"
|
||||
tasks:
|
||||
- id: "generate_hook"
|
||||
name: "Generate ASM Hook"
|
||||
prompt: "Generate an ASM hook at address $008040 with label MyCustomHook and 2 NOPs for alignment."
|
||||
expected_patterns:
|
||||
- "hook|JSL|008040"
|
||||
- "MyCustomHook|NOP"
|
||||
required_tool: "codegen-asm-hook"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses codegen-asm-hook with correct address and label"
|
||||
completeness_criteria: "Generates valid ASM with proper hook structure"
|
||||
|
||||
- id: "find_freespace"
|
||||
name: "Find Freespace for Patch"
|
||||
prompt: "Generate a freespace patch for 256 bytes of code labeled 'NewSpriteCode', preferring bank $3F."
|
||||
expected_patterns:
|
||||
- "freespace|org|NewSpriteCode"
|
||||
- "1F8000|bank|free"
|
||||
required_tool: "codegen-freespace-patch"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses codegen-freespace-patch with size and label"
|
||||
completeness_criteria: "Reports available regions and generates allocation code"
|
||||
|
||||
- id: "sprite_template"
|
||||
name: "Generate Sprite Template"
|
||||
prompt: "Generate a sprite template named 'FollowerSprite' with init code that sets sprite state and main code that follows the player."
|
||||
expected_patterns:
|
||||
- "sprite|FollowerSprite|template"
|
||||
- "init|main|0DD0"
|
||||
required_tool: "codegen-sprite-template"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses codegen-sprite-template with name and custom code"
|
||||
completeness_criteria: "Generates complete sprite with init and main sections"
|
||||
|
||||
- id: "event_handler"
|
||||
name: "Generate Event Handler"
|
||||
prompt: "Generate an NMI event handler labeled 'FrameCounter' that increments a counter each frame."
|
||||
expected_patterns:
|
||||
- "NMI|event|handler"
|
||||
- "FrameCounter|INC|counter"
|
||||
required_tool: "codegen-event-handler"
|
||||
scoring:
|
||||
accuracy_criteria: "Uses codegen-event-handler with type=nmi and label"
|
||||
completeness_criteria: "Generates handler with state preservation and custom code"
|
||||
|
||||
conversation:
|
||||
description: "Tasks testing multi-turn dialog and context"
|
||||
tasks:
|
||||
- id: "follow_up"
|
||||
name: "Follow-up Questions"
|
||||
multi_turn: true
|
||||
prompts:
|
||||
- "What is the main purpose of the Rom class?"
|
||||
- "What methods does it have for loading data?"
|
||||
- "Can you show me an example of using LoadFromFile?"
|
||||
expected_patterns:
|
||||
- "rom|ROM|file"
|
||||
- "load|read|parse"
|
||||
- "example|code|usage"
|
||||
scoring:
|
||||
accuracy_criteria: "Maintains context across turns"
|
||||
completeness_criteria: "Each response builds on previous"
|
||||
|
||||
- id: "clarification"
|
||||
name: "Handle Clarification"
|
||||
multi_turn: true
|
||||
prompts:
|
||||
- "How do I add a new sprite?"
|
||||
- "I mean in the dungeon editor, not the overworld"
|
||||
expected_patterns:
|
||||
- "sprite|dungeon|editor"
|
||||
- "add|create|place"
|
||||
scoring:
|
||||
accuracy_criteria: "Adjusts response based on clarification"
|
||||
completeness_criteria: "Provides dungeon-specific instructions"
|
||||
|
||||
# Scoring rubric definitions
|
||||
scoring_rubric:
|
||||
accuracy:
|
||||
10: "Perfect - completely correct with no errors"
|
||||
8: "Excellent - minor inaccuracies that don't affect understanding"
|
||||
6: "Good - mostly correct with some notable errors"
|
||||
4: "Fair - partially correct but missing key points"
|
||||
2: "Poor - significant errors or misunderstandings"
|
||||
0: "Incorrect - completely wrong or off-topic"
|
||||
|
||||
completeness:
|
||||
10: "Comprehensive - covers all aspects thoroughly"
|
||||
8: "Very complete - covers most aspects well"
|
||||
6: "Adequate - covers main points but missing some details"
|
||||
4: "Partial - covers some points but lacks depth"
|
||||
2: "Minimal - barely addresses the question"
|
||||
0: "Incomplete - doesn't meaningfully address the question"
|
||||
|
||||
tool_usage:
|
||||
10: "Perfect - uses correct tools with proper parameters"
|
||||
8: "Good - uses appropriate tools with minor parameter issues"
|
||||
6: "Adequate - uses tools but not optimally"
|
||||
4: "Fair - attempts tool use but with errors"
|
||||
2: "Poor - wrong tool or significant usage errors"
|
||||
0: "Failed - doesn't use required tools or fails completely"
|
||||
|
||||
# Report configuration
|
||||
reporting:
|
||||
output_format: "table" # table, json, markdown
|
||||
show_individual_scores: true
|
||||
show_response_samples: true
|
||||
max_sample_length: 500
|
||||
|
||||
3	scripts/ai/results/.gitkeep	Normal file
@@ -0,0 +1,3 @@
|
||||
# This directory stores AI evaluation results
|
||||
# Results are gitignored but this file keeps the directory in the repo
|
||||
|
||||
340	scripts/ai/run-model-eval.sh	Executable file
@@ -0,0 +1,340 @@
|
||||
#!/bin/bash
|
||||
# =============================================================================
|
||||
# YAZE AI Model Evaluation Script
|
||||
#
|
||||
# Runs AI model evaluations using the eval-runner.py engine.
|
||||
#
|
||||
# Usage:
|
||||
# ./run-model-eval.sh # Run with defaults
|
||||
# ./run-model-eval.sh --models llama3,qwen2.5 # Specific models
|
||||
# ./run-model-eval.sh --all # All available models
|
||||
# ./run-model-eval.sh --quick # Quick smoke test
|
||||
# ./run-model-eval.sh --compare # Compare and report
|
||||
#
|
||||
# Prerequisites:
|
||||
# - Ollama running (ollama serve)
|
||||
# - Python 3.10+ with requests and pyyaml
|
||||
# - At least one model pulled (ollama pull llama3.2)
|
||||
# =============================================================================
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
|
||||
RESULTS_DIR="$SCRIPT_DIR/results"
|
||||
|
||||
# Colors
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
CYAN='\033[0;36m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
# Default settings
|
||||
MODELS=""
|
||||
TASKS="all"
|
||||
TIMEOUT=120
|
||||
DRY_RUN=false
|
||||
COMPARE=false
|
||||
QUICK_MODE=false
|
||||
ALL_MODELS=false
|
||||
DEFAULT_MODELS=false
|
||||
VERBOSE=false
|
||||
|
||||
# =============================================================================
|
||||
# Helper Functions
|
||||
# =============================================================================
|
||||
|
||||
print_header() {
|
||||
echo -e "${CYAN}"
|
||||
echo "╔════════════════════════════════════════════════════════════════════╗"
|
||||
echo "║ YAZE AI Model Evaluation ║"
|
||||
echo "╚════════════════════════════════════════════════════════════════════╝"
|
||||
echo -e "${NC}"
|
||||
}
|
||||
|
||||
print_step() {
|
||||
echo -e "${BLUE}[*]${NC} $1"
|
||||
}
|
||||
|
||||
print_success() {
|
||||
echo -e "${GREEN}[✓]${NC} $1"
|
||||
}
|
||||
|
||||
print_warning() {
|
||||
echo -e "${YELLOW}[!]${NC} $1"
|
||||
}
|
||||
|
||||
print_error() {
|
||||
echo -e "${RED}[✗]${NC} $1"
|
||||
}
|
||||
|
||||
usage() {
|
||||
echo "Usage: $0 [OPTIONS]"
|
||||
echo ""
|
||||
echo "Options:"
|
||||
echo " --models, -m LIST Comma-separated list of models to evaluate"
|
||||
echo " --all Evaluate all available models"
|
||||
echo " --default Evaluate default models from config"
|
||||
echo " --tasks, -t LIST Task categories (default: all)"
|
||||
echo " Options: rom_inspection, code_analysis, tool_calling, conversation"
|
||||
echo " --timeout SEC Timeout per task in seconds (default: 120)"
|
||||
echo " --quick Quick smoke test (fewer tasks)"
|
||||
echo " --dry-run Show what would run without executing"
|
||||
echo " --compare Generate comparison report after evaluation"
|
||||
echo " --verbose, -v Verbose output"
|
||||
echo " --help, -h Show this help message"
|
||||
echo ""
|
||||
echo "Examples:"
|
||||
echo " $0 --models llama3.2,qwen2.5-coder --tasks tool_calling"
|
||||
echo " $0 --all --compare"
|
||||
echo " $0 --quick --default"
|
||||
}
|
||||
|
||||
check_prerequisites() {
|
||||
print_step "Checking prerequisites..."
|
||||
|
||||
local missing=false
|
||||
|
||||
# Check Python
|
||||
if ! command -v python3 &> /dev/null; then
|
||||
print_error "Python 3 not found"
|
||||
missing=true
|
||||
else
|
||||
print_success "Python 3 found: $(python3 --version)"
|
||||
fi
|
||||
|
||||
# Check Python packages
|
||||
if python3 -c "import requests" 2>/dev/null; then
|
||||
print_success "Python 'requests' package installed"
|
||||
else
|
||||
print_warning "Python 'requests' package missing - installing..."
|
||||
pip3 install requests --quiet || missing=true
|
||||
fi
|
||||
|
||||
if python3 -c "import yaml" 2>/dev/null; then
|
||||
print_success "Python 'pyyaml' package installed"
|
||||
else
|
||||
print_warning "Python 'pyyaml' package missing - installing..."
|
||||
pip3 install pyyaml --quiet || missing=true
|
||||
fi
|
||||
|
||||
# Check Ollama
|
||||
if ! command -v ollama &> /dev/null; then
|
||||
print_error "Ollama not found. Install from https://ollama.ai"
|
||||
missing=true
|
||||
else
|
||||
print_success "Ollama found: $(ollama --version 2>/dev/null || echo 'version unknown')"
|
||||
fi
|
||||
|
||||
# Check if Ollama is running
|
||||
if curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
|
||||
print_success "Ollama server is running"
|
||||
else
|
||||
print_warning "Ollama server not running - attempting to start..."
|
||||
ollama serve &> /dev/null &
|
||||
sleep 3
|
||||
if curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
|
||||
print_success "Ollama server started"
|
||||
else
|
||||
print_error "Could not start Ollama server. Run 'ollama serve' manually."
|
||||
missing=true
|
||||
fi
|
||||
fi
|
||||
|
||||
if $missing; then
|
||||
print_error "Prerequisites check failed"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo ""
|
||||
}
|
||||
|
||||
list_available_models() {
|
||||
curl -s http://localhost:11434/api/tags | python3 -c "
|
||||
import json, sys
|
||||
data = json.load(sys.stdin)
|
||||
for model in data.get('models', []):
|
||||
print(model['name'])
|
||||
" 2>/dev/null || echo ""
|
||||
}
|
||||
|
||||
ensure_model() {
|
||||
local model=$1
|
||||
local available=$(list_available_models)
|
||||
|
||||
if echo "$available" | grep -q "^$model$"; then
|
||||
return 0
|
||||
else
|
||||
print_warning "Model '$model' not found, pulling..."
|
||||
ollama pull "$model"
|
||||
return $?
|
||||
fi
|
||||
}
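
# Illustrative call: ensure_model returns 0 immediately when the tag is already
# listed by `ollama list`, and otherwise falls back to `ollama pull`, e.g.:
#
#   if ensure_model "llama3.2:latest"; then
#       echo "model ready"
#   fi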
|
||||
|
||||
run_evaluation() {
|
||||
local args=()
|
||||
|
||||
if [ -n "$MODELS" ]; then
|
||||
args+=(--models "$MODELS")
|
||||
elif $ALL_MODELS; then
|
||||
args+=(--all-models)
|
||||
elif $DEFAULT_MODELS; then
|
||||
args+=(--default-models)
|
||||
fi
|
||||
|
||||
args+=(--tasks "$TASKS")
|
||||
args+=(--timeout "$TIMEOUT")
|
||||
args+=(--config "$SCRIPT_DIR/eval-tasks.yaml")
|
||||
|
||||
if $DRY_RUN; then
|
||||
args+=(--dry-run)
|
||||
fi
|
||||
|
||||
local output_file="$RESULTS_DIR/eval-$(date +%Y%m%d-%H%M%S).json"
|
||||
args+=(--output "$output_file")
|
||||
|
||||
print_step "Running evaluation..."
|
||||
if $VERBOSE; then
|
||||
echo " Command: python3 $SCRIPT_DIR/eval-runner.py ${args[*]}"
|
||||
fi
|
||||
echo ""
|
||||
|
||||
python3 "$SCRIPT_DIR/eval-runner.py" "${args[@]}"
|
||||
local exit_code=$?
|
||||
|
||||
if [ $exit_code -eq 0 ]; then
|
||||
print_success "Evaluation completed successfully"
|
||||
elif [ $exit_code -eq 1 ]; then
|
||||
print_warning "Evaluation completed with moderate scores"
|
||||
else
|
||||
print_error "Evaluation completed with poor scores"
|
||||
fi
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
run_comparison() {
|
||||
print_step "Generating comparison report..."
|
||||
|
||||
local result_files=$(ls -t "$RESULTS_DIR"/eval-*.json 2>/dev/null | head -5)
|
||||
|
||||
if [ -z "$result_files" ]; then
|
||||
print_error "No result files found"
|
||||
return 1
|
||||
fi
|
||||
|
||||
local report_file="$RESULTS_DIR/comparison-$(date +%Y%m%d-%H%M%S).md"
|
||||
|
||||
python3 "$SCRIPT_DIR/compare-models.py" \
|
||||
--format markdown \
|
||||
--task-analysis \
|
||||
--output "$report_file" \
|
||||
$result_files
|
||||
|
||||
print_success "Comparison report: $report_file"
|
||||
|
||||
# Also print table to console
|
||||
echo ""
|
||||
python3 "$SCRIPT_DIR/compare-models.py" --format table $result_files
|
||||
}
|
||||
|
||||
quick_test() {
|
||||
print_step "Running quick smoke test..."
|
||||
|
||||
# Get first available model
|
||||
local available=$(list_available_models | head -1)
|
||||
|
||||
if [ -z "$available" ]; then
|
||||
print_error "No models available. Pull a model with: ollama pull llama3.2"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
print_step "Using model: $available"
|
||||
|
||||
# Run just one task category
|
||||
python3 "$SCRIPT_DIR/eval-runner.py" \
|
||||
--models "$available" \
|
||||
--tasks tool_calling \
|
||||
--timeout 60 \
|
||||
--config "$SCRIPT_DIR/eval-tasks.yaml"
|
||||
}
|
||||
|
||||
# =============================================================================
|
||||
# Main
|
||||
# =============================================================================
|
||||
|
||||
# Parse arguments
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
--models|-m)
|
||||
MODELS="$2"
|
||||
shift 2
|
||||
;;
|
||||
--all)
|
||||
ALL_MODELS=true
|
||||
shift
|
||||
;;
|
||||
--default)
|
||||
DEFAULT_MODELS=true
|
||||
shift
|
||||
;;
|
||||
--tasks|-t)
|
||||
TASKS="$2"
|
||||
shift 2
|
||||
;;
|
||||
--timeout)
|
||||
TIMEOUT="$2"
|
||||
shift 2
|
||||
;;
|
||||
--quick)
|
||||
QUICK_MODE=true
|
||||
shift
|
||||
;;
|
||||
--dry-run)
|
||||
DRY_RUN=true
|
||||
shift
|
||||
;;
|
||||
--compare)
|
||||
COMPARE=true
|
||||
shift
|
||||
;;
|
||||
--verbose|-v)
|
||||
VERBOSE=true
|
||||
shift
|
||||
;;
|
||||
--help|-h)
|
||||
usage
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
print_error "Unknown option: $1"
|
||||
usage
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Ensure results directory exists
|
||||
mkdir -p "$RESULTS_DIR"
|
||||
|
||||
print_header
|
||||
check_prerequisites
|
||||
|
||||
if $QUICK_MODE; then
|
||||
quick_test
|
||||
elif $DRY_RUN; then
|
||||
run_evaluation
|
||||
else
|
||||
run_evaluation
|
||||
|
||||
if $COMPARE; then
|
||||
echo ""
|
||||
run_comparison
|
||||
fi
|
||||
fi
|
||||
|
||||
echo ""
|
||||
print_success "Done!"
|
||||
|
||||
825	scripts/analyze_room.py	Normal file
@@ -0,0 +1,825 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Dungeon Room Object Analyzer for ALTTP ROM Hacking.
|
||||
|
||||
This script parses room data from a Link to the Past ROM to understand which
|
||||
objects are on each layer (BG1/BG2). Useful for debugging layer compositing
|
||||
and understanding room structure.
|
||||
|
||||
Usage:
|
||||
python analyze_room.py [OPTIONS] [ROOM_IDS...]
|
||||
|
||||
Examples:
|
||||
python analyze_room.py 1 # Analyze room 001
|
||||
python analyze_room.py 1 2 3 # Analyze rooms 001, 002, 003
|
||||
python analyze_room.py --range 0 10 # Analyze rooms 0-10
|
||||
python analyze_room.py --all # Analyze all 296 rooms (summary only)
|
||||
python analyze_room.py 1 --json # Output as JSON
|
||||
python analyze_room.py 1 --rom path/to.sfc # Use specific ROM file
|
||||
python analyze_room.py --list-bg2 # List all rooms with BG2 overlay objects
|
||||
|
||||
Collision Offset Features:
|
||||
python analyze_room.py 0x27 --collision # Show collision offsets
|
||||
python analyze_room.py 0x27 --collision --asm # Output ASM format
|
||||
python analyze_room.py 0x27 --collision --filter-id 0xD9 # Filter by object ID
|
||||
python analyze_room.py 0x27 --collision --area # Expand objects to full tile area
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
# ROM addresses from dungeon_rom_addresses.h
|
||||
ROOM_OBJECT_POINTER = 0x874C # Object data pointer table
|
||||
ROOM_HEADER_POINTER = 0xB5DD # Room header pointer
|
||||
NUMBER_OF_ROOMS = 296
|
||||
|
||||
# Default ROM path (relative to script location)
|
||||
DEFAULT_ROM_PATHS = [
|
||||
"roms/alttp_vanilla.sfc",
|
||||
"../roms/alttp_vanilla.sfc",
|
||||
"roms/vanilla.sfc",
|
||||
"../roms/vanilla.sfc",
|
||||
]
|
||||
|
||||
# Object descriptions - comprehensive list
|
||||
OBJECT_DESCRIPTIONS = {
|
||||
# Type 1 Objects (0x00-0xFF)
|
||||
0x00: "Ceiling (2x2)",
|
||||
0x01: "Wall horizontal (2x4)",
|
||||
0x02: "Wall horizontal (2x4, variant)",
|
||||
0x03: "Diagonal wall NW->SE",
|
||||
0x04: "Diagonal wall NE->SW",
|
||||
0x05: "Pit horizontal (4x2)",
|
||||
0x06: "Pit vertical (2x4)",
|
||||
0x07: "Floor pattern",
|
||||
0x08: "Water edge",
|
||||
0x09: "Water edge variant",
|
||||
0x0A: "Conveyor belt",
|
||||
0x0B: "Conveyor belt variant",
|
||||
0x0C: "Diagonal acute",
|
||||
0x0D: "Diagonal acute variant",
|
||||
0x0E: "Pushable block",
|
||||
0x0F: "Rail",
|
||||
0x10: "Diagonal grave",
|
||||
0x11: "Diagonal grave variant",
|
||||
0x12: "Wall top edge",
|
||||
0x13: "Wall bottom edge",
|
||||
0x14: "Diagonal acute 2",
|
||||
0x15: "Diagonal acute 2 variant",
|
||||
0x16: "Wall pattern",
|
||||
0x17: "Wall pattern variant",
|
||||
0x18: "Diagonal grave 2",
|
||||
0x19: "Diagonal grave 2 variant",
|
||||
0x1A: "Inner corner NW",
|
||||
0x1B: "Inner corner NE",
|
||||
0x1C: "Diagonal acute 3",
|
||||
0x1D: "Diagonal acute 3 variant",
|
||||
0x1E: "Diagonal grave 3",
|
||||
0x1F: "Diagonal grave 3 variant",
|
||||
0x20: "Diagonal acute 4",
|
||||
|
||||
0x21: "Floor edge 1x2",
|
||||
0x22: "Has edge 1x1",
|
||||
0x23: "Has edge 1x1 variant",
|
||||
0x24: "Has edge 1x1 variant 2",
|
||||
0x25: "Has edge 1x1 variant 3",
|
||||
0x26: "Has edge 1x1 variant 4",
|
||||
|
||||
0x30: "Bottom corners 1x2",
|
||||
0x31: "Nothing A",
|
||||
0x32: "Nothing A",
|
||||
0x33: "Floor 4x4",
|
||||
0x34: "Solid 1x1",
|
||||
0x35: "Door switcher",
|
||||
0x36: "Decor 4x4",
|
||||
0x37: "Decor 4x4 variant",
|
||||
0x38: "Statue 2x3",
|
||||
0x39: "Pillar 2x4",
|
||||
0x3A: "Decor 4x3",
|
||||
0x3B: "Decor 4x3 variant",
|
||||
0x3C: "Doubled 2x2",
|
||||
0x3D: "Pillar 2x4 variant",
|
||||
0x3E: "Decor 2x2",
|
||||
|
||||
0x47: "Waterfall",
|
||||
0x48: "Waterfall variant",
|
||||
0x49: "Floor tile 4x2",
|
||||
0x4A: "Floor tile 4x2 variant",
|
||||
0x4C: "Bar 4x3",
|
||||
0x4D: "Shelf 4x4",
|
||||
0x4E: "Shelf 4x4 variant",
|
||||
0x4F: "Shelf 4x4 variant 2",
|
||||
0x50: "Line 1x1",
|
||||
0x51: "Cannon hole 4x3",
|
||||
0x52: "Cannon hole 4x3 variant",
|
||||
|
||||
0x60: "Wall vertical (2x2)",
|
||||
0x61: "Wall vertical (4x2)",
|
||||
0x62: "Wall vertical (4x2, variant)",
|
||||
0x63: "Diagonal wall NW->SE (vert)",
|
||||
0x64: "Diagonal wall NE->SW (vert)",
|
||||
0x65: "Decor 4x2",
|
||||
0x66: "Decor 4x2 variant",
|
||||
0x67: "Floor 2x2",
|
||||
0x68: "Floor 2x2 variant",
|
||||
0x69: "Has edge 1x1 (vert)",
|
||||
0x6A: "Edge 1x1",
|
||||
0x6B: "Edge 1x1 variant",
|
||||
0x6C: "Left corners 2x1",
|
||||
0x6D: "Right corners 2x1",
|
||||
|
||||
0x70: "Floor 4x4 (vert)",
|
||||
0x71: "Solid 1x1 (vert)",
|
||||
0x72: "Nothing B",
|
||||
0x73: "Decor 4x4 (vert)",
|
||||
|
||||
0x85: "Cannon hole 3x4",
|
||||
0x86: "Cannon hole 3x4 variant",
|
||||
0x87: "Pillar 2x4 (vert)",
|
||||
0x88: "Big rail 3x1",
|
||||
0x89: "Block 2x2",
|
||||
|
||||
0xA0: "Diagonal ceiling TL",
|
||||
0xA1: "Diagonal ceiling BL",
|
||||
0xA2: "Diagonal ceiling TR",
|
||||
0xA3: "Diagonal ceiling BR",
|
||||
0xA4: "Big hole 4x4",
|
||||
0xA5: "Diagonal ceiling TL B",
|
||||
0xA6: "Diagonal ceiling BL B",
|
||||
0xA7: "Diagonal ceiling TR B",
|
||||
0xA8: "Diagonal ceiling BR B",
|
||||
|
||||
0xC0: "Chest",
|
||||
0xC1: "Chest variant",
|
||||
0xC2: "Big chest",
|
||||
0xC3: "Big chest variant",
|
||||
0xC4: "Interroom stairs",
|
||||
0xC5: "Torch",
|
||||
0xC6: "Torch (variant)",
|
||||
|
||||
0xE0: "Pot",
|
||||
0xE1: "Block",
|
||||
0xE2: "Pot variant",
|
||||
0xE3: "Block variant",
|
||||
0xE4: "Pot (skull)",
|
||||
0xE5: "Block (push any)",
|
||||
0xE6: "Skull pot",
|
||||
0xE7: "Big gray block",
|
||||
0xE8: "Spike block",
|
||||
0xE9: "Spike block variant",
|
||||
|
||||
# Type 2 objects (0x100+)
|
||||
0x100: "Corner NW (concave)",
|
||||
0x101: "Corner NE (concave)",
|
||||
0x102: "Corner SW (concave)",
|
||||
0x103: "Corner SE (concave)",
|
||||
0x104: "Corner NW (convex)",
|
||||
0x105: "Corner NE (convex)",
|
||||
0x106: "Corner SW (convex)",
|
||||
0x107: "Corner SE (convex)",
|
||||
0x108: "4x4 Corner NW",
|
||||
0x109: "4x4 Corner NE",
|
||||
0x10A: "4x4 Corner SW",
|
||||
0x10B: "4x4 Corner SE",
|
||||
0x10C: "Corner piece NW",
|
||||
0x10D: "Corner piece NE",
|
||||
0x10E: "Corner piece SW",
|
||||
0x10F: "Corner piece SE",
|
||||
0x110: "Weird corner bottom NW",
|
||||
0x111: "Weird corner bottom NE",
|
||||
0x112: "Weird corner bottom SW",
|
||||
0x113: "Weird corner bottom SE",
|
||||
0x114: "Weird corner top NW",
|
||||
0x115: "Weird corner top NE",
|
||||
0x116: "Platform / Floor overlay",
|
||||
0x117: "Platform variant",
|
||||
0x118: "Statue / Pillar",
|
||||
0x119: "Statue / Pillar variant",
|
||||
0x11A: "Star tile switch",
|
||||
0x11B: "Star tile switch variant",
|
||||
0x11C: "Rail platform",
|
||||
0x11D: "Rail platform variant",
|
||||
0x11E: "Somaria platform",
|
||||
0x11F: "Somaria platform variant",
|
||||
0x120: "Stairs up (north)",
|
||||
0x121: "Stairs down (south)",
|
||||
0x122: "Stairs left",
|
||||
0x123: "Stairs right",
|
||||
0x124: "Spiral stairs up",
|
||||
0x125: "Spiral stairs down",
|
||||
0x126: "Sanctuary entrance",
|
||||
0x127: "Sanctuary entrance variant",
|
||||
0x128: "Hole/pit",
|
||||
0x129: "Hole/pit variant",
|
||||
0x12A: "Warp tile",
|
||||
0x12B: "Warp tile variant",
|
||||
0x12C: "Layer switch NW",
|
||||
0x12D: "Layer switch NE",
|
||||
0x12E: "Layer switch SW",
|
||||
0x12F: "Layer switch SE",
|
||||
0x130: "Light cone",
|
||||
0x131: "Light cone variant",
|
||||
0x132: "Floor switch",
|
||||
0x133: "Floor switch (heavy)",
|
||||
0x134: "Bombable floor",
|
||||
0x135: "Bombable floor variant",
|
||||
0x136: "Cracked floor",
|
||||
0x137: "Cracked floor variant",
|
||||
0x138: "Stairs inter-room",
|
||||
0x139: "Stairs inter-room variant",
|
||||
0x13A: "Stairs straight",
|
||||
0x13B: "Stairs straight variant",
|
||||
0x13C: "Eye switch",
|
||||
0x13D: "Eye switch variant",
|
||||
0x13E: "Crystal switch",
|
||||
0x13F: "Crystal switch variant",
|
||||
}
|
||||
|
||||
# Draw routine names for detailed analysis
|
||||
DRAW_ROUTINES = {
|
||||
0x01: "RoomDraw_Rightwards2x4_1to15or26",
|
||||
0x02: "RoomDraw_Rightwards2x4_1to15or26",
|
||||
0x03: "RoomDraw_Rightwards2x4_1to16_BothBG",
|
||||
0x04: "RoomDraw_Rightwards2x4_1to16_BothBG",
|
||||
0x33: "RoomDraw_Rightwards4x4_1to16",
|
||||
0x34: "RoomDraw_Rightwards1x1Solid_1to16_plus3",
|
||||
0x38: "RoomDraw_RightwardsStatue2x3spaced2_1to16",
|
||||
0x61: "RoomDraw_Downwards4x2_1to15or26",
|
||||
0x62: "RoomDraw_Downwards4x2_1to15or26",
|
||||
0x63: "RoomDraw_Downwards4x2_1to16_BothBG",
|
||||
0x64: "RoomDraw_Downwards4x2_1to16_BothBG",
|
||||
0x71: "RoomDraw_Downwards1x1Solid_1to16_plus3",
|
||||
0xA4: "RoomDraw_BigHole4x4_1to16",
|
||||
0xC6: "RoomDraw_Torch",
|
||||
}
|
||||
|
||||
|
||||
def snes_to_pc(snes_addr: int) -> int:
    """Convert SNES LoROM address to PC file offset."""
    bank = (snes_addr >> 16) & 0xFF
    addr = snes_addr & 0xFFFF

    if bank >= 0x80:
        bank -= 0x80

    if addr >= 0x8000:
        return (bank * 0x8000) + (addr - 0x8000)
    else:
        return snes_addr & 0x3FFFFF


def read_long(rom_data: bytes, offset: int) -> int:
    """Read a 24-bit little-endian long address."""
    return struct.unpack('<I', rom_data[offset:offset+3] + b'\x00')[0]
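
# Worked examples for the two helpers above (values chosen for illustration):
#   read_long(rom, off) over bytes 4C 87 0A  -> 0x0A874C (24-bit little-endian)
#   snes_to_pc(0x0A874C) = (0x0A * 0x8000) + (0x874C - 0x8000) = 0x5074C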
|
||||
|
||||
|
||||
def decode_object(b1: int, b2: int, b3: int, layer: int) -> Dict:
|
||||
"""Decode 3-byte object data into object properties."""
|
||||
obj = {
|
||||
'b1': b1, 'b2': b2, 'b3': b3,
|
||||
'layer': layer,
|
||||
'type': 1,
|
||||
'id': 0,
|
||||
'x': 0,
|
||||
'y': 0,
|
||||
'size': 0
|
||||
}
|
||||
|
||||
# Type 2: 111111xx xxxxyyyy yyiiiiii
|
||||
if b1 >= 0xFC:
|
||||
obj['type'] = 2
|
||||
obj['id'] = (b3 & 0x3F) | 0x100
|
||||
obj['x'] = ((b2 & 0xF0) >> 4) | ((b1 & 0x03) << 4)
|
||||
obj['y'] = ((b2 & 0x0F) << 2) | ((b3 & 0xC0) >> 6)
|
||||
obj['size'] = 0
|
||||
# Type 3: xxxxxxii yyyyyyii 11111iii
|
||||
elif b3 >= 0xF8:
|
||||
obj['type'] = 3
|
||||
obj['id'] = (b3 << 4) | 0x80 | ((b2 & 0x03) << 2) | (b1 & 0x03)
|
||||
obj['x'] = (b1 & 0xFC) >> 2
|
||||
obj['y'] = (b2 & 0xFC) >> 2
|
||||
obj['size'] = ((b1 & 0x03) << 2) | (b2 & 0x03)
|
||||
# Type 1: xxxxxxss yyyyyyss iiiiiiii
|
||||
else:
|
||||
obj['type'] = 1
|
||||
obj['id'] = b3
|
||||
obj['x'] = (b1 & 0xFC) >> 2
|
||||
obj['y'] = (b2 & 0xFC) >> 2
|
||||
obj['size'] = ((b1 & 0x03) << 2) | (b2 & 0x03)
|
||||
|
||||
return obj
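
# Example: decode_object(0x61, 0x22, 0x33, layer=0) yields a type-1 object with
# id 0x33 ("Floor 4x4") at x=24, y=8 and size=6, since x=(0x61 & 0xFC) >> 2,
# y=(0x22 & 0xFC) >> 2 and size=((0x61 & 0x03) << 2) | (0x22 & 0x03).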
|
||||
|
||||
|
||||
def get_object_description(obj_id: int) -> str:
|
||||
"""Return a human-readable description of an object ID."""
|
||||
return OBJECT_DESCRIPTIONS.get(obj_id, f"Object 0x{obj_id:03X}")
|
||||
|
||||
|
||||
def get_draw_routine(obj_id: int) -> str:
|
||||
"""Return the draw routine name for an object ID."""
|
||||
return DRAW_ROUTINES.get(obj_id, "")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Collision Offset Functions
|
||||
# =============================================================================
|
||||
|
||||
def calculate_collision_offset(x_tile: int, y_tile: int) -> int:
    """Calculate offset into $7F2000 collision map.

    Collision map is 64 bytes per row (64 tiles wide).
    Each position is 1 byte, but SNES uses 16-bit addressing.
    Formula: offset = (Y * 64) + X
    """
    return (y_tile * 64) + x_tile
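
# Example: tile (x=5, y=3) -> 3*64 + 5 = 197 = $00C5, i.e. byte $7F20C5 in WRAM.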
|
||||
|
||||
|
||||
def expand_object_area(obj: Dict) -> List[Tuple[int, int]]:
|
||||
"""Expand object to full tile coverage based on size.
|
||||
|
||||
Object 'size' field encodes dimensions differently per object type.
|
||||
Water/flood objects use size as horizontal span.
|
||||
Type 2 objects (0x100+) are typically fixed-size.
|
||||
"""
|
||||
tiles = []
|
||||
x, y, size = obj['x'], obj['y'], obj['size']
|
||||
obj_id = obj['id']
|
||||
|
||||
# Water/flood objects (0x0C9, 0x0D9, etc.) - horizontal span
|
||||
# Size encodes horizontal extent
|
||||
if obj_id in [0xC9, 0xD9, 0x0C9, 0x0D9]:
|
||||
# Size is the horizontal span (number of tiles - 1)
|
||||
for dx in range(size + 1):
|
||||
tiles.append((x + dx, y))
|
||||
|
||||
# Floor 4x4 objects (0x33, 0x70)
|
||||
elif obj_id in [0x33, 0x70]:
|
||||
# 4x4 block, size adds to dimensions
|
||||
width = 4 + (size & 0x03)
|
||||
height = 4 + ((size >> 2) & 0x03)
|
||||
for dy in range(height):
|
||||
for dx in range(width):
|
||||
tiles.append((x + dx, y + dy))
|
||||
|
||||
# Wall objects (size extends in one direction)
|
||||
elif obj_id in [0x01, 0x02, 0x03, 0x04]:
|
||||
# Horizontal walls
|
||||
for dx in range(size + 1):
|
||||
for dy in range(4): # 4 tiles tall
|
||||
tiles.append((x + dx, y + dy))
|
||||
|
||||
elif obj_id in [0x61, 0x62, 0x63, 0x64]:
|
||||
# Vertical walls
|
||||
for dx in range(4): # 4 tiles wide
|
||||
for dy in range(size + 1):
|
||||
tiles.append((x + dx, y + dy))
|
||||
|
||||
# Type 2 objects (0x100+) - fixed sizes, no expansion
|
||||
elif obj_id >= 0x100:
|
||||
tiles.append((x, y))
|
||||
|
||||
# Default: single tile or small area based on size
|
||||
else:
|
||||
# Generic expansion: size encodes width/height
|
||||
width = max(1, (size & 0x03) + 1)
|
||||
height = max(1, ((size >> 2) & 0x03) + 1)
|
||||
for dy in range(height):
|
||||
for dx in range(width):
|
||||
tiles.append((x + dx, y + dy))
|
||||
|
||||
return tiles
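
# Examples of the expansion above: a water object (id 0xD9) at (10, 20) with
# size=3 expands to the horizontal run (10,20)..(13,20); a 4x4 floor (id 0x33)
# at (0, 0) with size=0 covers the 16 tiles spanning (0,0)..(3,3).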
|
||||
|
||||
|
||||
def format_collision_asm(offsets: List[int], room_id: int, label: str = None,
|
||||
objects: List[Dict] = None) -> str:
|
||||
"""Generate ASM-ready collision data block."""
|
||||
lines = []
|
||||
label = label or f"Room{room_id:02X}_CollisionData"
|
||||
|
||||
lines.append(f"; Room 0x{room_id:02X} - Collision Offsets")
|
||||
lines.append(f"; Generated by analyze_room.py")
|
||||
|
||||
if objects:
|
||||
for obj in objects:
|
||||
lines.append(f"; Object 0x{obj['id']:03X} @ ({obj['x']},{obj['y']}) size={obj['size']}")
|
||||
|
||||
lines.append(f"{label}:")
|
||||
lines.append("{")
|
||||
lines.append(f" db {len(offsets)} ; Tile count")
|
||||
|
||||
# Group offsets by rows of 8 for readability
|
||||
for i in range(0, len(offsets), 8):
|
||||
row = offsets[i:i+8]
|
||||
hex_vals = ", ".join(f"${o:04X}" for o in sorted(row))
|
||||
lines.append(f" dw {hex_vals}")
|
||||
|
||||
lines.append("}")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def analyze_collision_offsets(result: Dict, filter_id: Optional[int] = None,
|
||||
expand_area: bool = False, asm_output: bool = False,
|
||||
verbose: bool = True) -> Dict:
|
||||
"""Analyze collision offsets for objects in a room."""
|
||||
analysis = {
|
||||
'room_id': result['room_id'],
|
||||
'objects': [],
|
||||
'offsets': [],
|
||||
'tiles': []
|
||||
}
|
||||
|
||||
# Collect all objects from all layers
|
||||
all_objects = []
|
||||
for layer_num in [0, 1, 2]:
|
||||
all_objects.extend(result['objects_by_layer'][layer_num])
|
||||
|
||||
# Filter by object ID if specified
|
||||
if filter_id is not None:
|
||||
all_objects = [obj for obj in all_objects if obj['id'] == filter_id]
|
||||
|
||||
analysis['objects'] = all_objects
|
||||
|
||||
# Calculate collision offsets
|
||||
all_tiles = []
|
||||
for obj in all_objects:
|
||||
if expand_area:
|
||||
tiles = expand_object_area(obj)
|
||||
else:
|
||||
tiles = [(obj['x'], obj['y'])]
|
||||
|
||||
for (tx, ty) in tiles:
|
||||
# Validate tile coordinates
|
||||
if 0 <= tx < 64 and 0 <= ty < 64:
|
||||
offset = calculate_collision_offset(tx, ty)
|
||||
all_tiles.append((tx, ty, offset, obj))
|
||||
|
||||
# Remove duplicates and sort
|
||||
seen_offsets = set()
|
||||
unique_tiles = []
|
||||
for (tx, ty, offset, obj) in all_tiles:
|
||||
if offset not in seen_offsets:
|
||||
seen_offsets.add(offset)
|
||||
unique_tiles.append((tx, ty, offset, obj))
|
||||
|
||||
analysis['tiles'] = unique_tiles
|
||||
analysis['offsets'] = sorted(list(seen_offsets))
|
||||
|
||||
# Output
|
||||
if asm_output:
|
||||
asm = format_collision_asm(analysis['offsets'], result['room_id'],
|
||||
objects=all_objects)
|
||||
print(asm)
|
||||
elif verbose:
|
||||
print(f"\n{'='*70}")
|
||||
print(f"COLLISION OFFSETS - Room 0x{result['room_id']:02X}")
|
||||
print(f"{'='*70}")
|
||||
|
||||
if filter_id is not None:
|
||||
print(f"Filtered by object ID: 0x{filter_id:03X}")
|
||||
|
||||
print(f"\nObjects analyzed: {len(all_objects)}")
|
||||
for obj in all_objects:
|
||||
desc = get_object_description(obj['id'])
|
||||
print(f" ID=0x{obj['id']:03X} @ ({obj['x']},{obj['y']}) size={obj['size']} - {desc}")
|
||||
|
||||
print(f"\nTile coverage: {len(unique_tiles)} tiles")
|
||||
if expand_area:
|
||||
print("(Area expansion enabled)")
|
||||
|
||||
print(f"\nCollision offsets (for $7F2000):")
|
||||
for i, (tx, ty, offset, obj) in enumerate(sorted(unique_tiles, key=lambda t: t[2])):
|
||||
print(f" ({tx:2d},{ty:2d}) -> ${offset:04X}")
|
||||
if i > 20 and len(unique_tiles) > 25:
|
||||
print(f" ... and {len(unique_tiles) - i - 1} more")
|
||||
break
|
||||
|
||||
return analysis
|
||||
|
||||
|
||||
def parse_room_objects(rom_data: bytes, room_id: int, verbose: bool = True) -> Dict:
|
||||
"""Parse all objects for a given room."""
|
||||
result = {
|
||||
'room_id': room_id,
|
||||
'floor1': 0,
|
||||
'floor2': 0,
|
||||
'layout': 0,
|
||||
'objects_by_layer': {0: [], 1: [], 2: []},
|
||||
'doors': [],
|
||||
'data_address': 0,
|
||||
}
|
||||
|
||||
# Get room object data pointer
|
||||
object_ptr_table = read_long(rom_data, ROOM_OBJECT_POINTER)
|
||||
object_ptr_table_pc = snes_to_pc(object_ptr_table)
|
||||
|
||||
# Read room-specific pointer (3 bytes per room)
|
||||
room_ptr_addr = object_ptr_table_pc + (room_id * 3)
|
||||
room_data_snes = read_long(rom_data, room_ptr_addr)
|
||||
room_data_pc = snes_to_pc(room_data_snes)
|
||||
|
||||
result['data_address'] = room_data_pc
|
||||
|
||||
if verbose:
|
||||
print(f"\n{'='*70}")
|
||||
print(f"ROOM {room_id:03d} (0x{room_id:03X}) OBJECT ANALYSIS")
|
||||
print(f"{'='*70}")
|
||||
print(f"Room data at PC: 0x{room_data_pc:05X} (SNES: 0x{room_data_snes:06X})")
|
||||
|
||||
# First 2 bytes: floor graphics and layout
|
||||
floor_byte = rom_data[room_data_pc]
|
||||
layout_byte = rom_data[room_data_pc + 1]
|
||||
|
||||
result['floor1'] = floor_byte & 0x0F
|
||||
result['floor2'] = (floor_byte >> 4) & 0x0F
|
||||
result['layout'] = (layout_byte >> 2) & 0x07
|
||||
|
||||
if verbose:
|
||||
print(f"Floor: BG1={result['floor1']}, BG2={result['floor2']}, Layout={result['layout']}")
|
||||
|
||||
# Parse objects starting at offset 2
|
||||
pos = room_data_pc + 2
|
||||
layer = 0
|
||||
|
||||
if verbose:
|
||||
print(f"\n{'='*70}")
|
||||
print("OBJECTS (Layer 0=BG1 main, Layer 1=BG2 overlay, Layer 2=BG1 priority)")
|
||||
print(f"{'='*70}")
|
||||
|
||||
while pos + 2 < len(rom_data):
|
||||
b1 = rom_data[pos]
|
||||
b2 = rom_data[pos + 1]
|
||||
|
||||
# Check for layer terminator (0xFFFF)
|
||||
if b1 == 0xFF and b2 == 0xFF:
|
||||
if verbose:
|
||||
print(f"\n--- Layer {layer} END ---")
|
||||
pos += 2
|
||||
layer += 1
|
||||
if layer >= 3:
|
||||
break
|
||||
if verbose:
|
||||
print(f"\n--- Layer {layer} START ---")
|
||||
continue
|
||||
|
||||
# Check for door section marker (0xF0FF)
|
||||
if b1 == 0xF0 and b2 == 0xFF:
|
||||
if verbose:
|
||||
print(f"\n--- Doors ---")
|
||||
pos += 2
|
||||
while pos + 1 < len(rom_data):
|
||||
d1 = rom_data[pos]
|
||||
d2 = rom_data[pos + 1]
|
||||
if d1 == 0xFF and d2 == 0xFF:
|
||||
break
|
||||
door = {
|
||||
'position': (d1 >> 4) & 0x0F,
|
||||
'direction': d1 & 0x03,
|
||||
'type': d2
|
||||
}
|
||||
result['doors'].append(door)
|
||||
if verbose:
|
||||
print(f" Door: pos={door['position']}, dir={door['direction']}, type=0x{door['type']:02X}")
|
||||
pos += 2
|
||||
continue
|
||||
|
||||
# Read 3rd byte for object
|
||||
b3 = rom_data[pos + 2]
|
||||
pos += 3
|
||||
|
||||
obj = decode_object(b1, b2, b3, layer)
|
||||
result['objects_by_layer'][layer].append(obj)
|
||||
|
||||
if verbose:
|
||||
desc = get_object_description(obj['id'])
|
||||
routine = get_draw_routine(obj['id'])
|
||||
layer_names = ["BG1_Main", "BG2_Overlay", "BG1_Priority"]
|
||||
routine_str = f" [{routine}]" if routine else ""
|
||||
print(f" L{layer} ({layer_names[layer]}): [{b1:02X} {b2:02X} {b3:02X}] -> "
|
||||
f"T{obj['type']} ID=0x{obj['id']:03X} @ ({obj['x']:2d},{obj['y']:2d}) "
|
||||
f"sz={obj['size']:2d} - {desc}{routine_str}")
|
||||
|
||||
# Summary
|
||||
if verbose:
|
||||
print(f"\n{'='*70}")
|
||||
print("SUMMARY")
|
||||
print(f"{'='*70}")
|
||||
for layer_num, layer_name in [(0, "BG1 Main"), (1, "BG2 Overlay"), (2, "BG1 Priority")]:
|
||||
objs = result['objects_by_layer'][layer_num]
|
||||
print(f"Layer {layer_num} ({layer_name}): {len(objs)} objects")
|
||||
if objs:
|
||||
id_counts = {}
|
||||
for obj in objs:
|
||||
id_counts[obj['id']] = id_counts.get(obj['id'], 0) + 1
|
||||
for obj_id, count in sorted(id_counts.items()):
|
||||
desc = get_object_description(obj_id)
|
||||
print(f" 0x{obj_id:03X}: {count}x - {desc}")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def analyze_layer_compositing(result: Dict, verbose: bool = True) -> Dict:
|
||||
"""Analyze layer compositing issues for a room."""
|
||||
analysis = {
|
||||
'has_bg2_objects': len(result['objects_by_layer'][1]) > 0,
|
||||
'bg2_object_count': len(result['objects_by_layer'][1]),
|
||||
'bg2_objects': result['objects_by_layer'][1],
|
||||
'same_floor_graphics': result['floor1'] == result['floor2'],
|
||||
'potential_issues': []
|
||||
}
|
||||
|
||||
if analysis['has_bg2_objects'] and analysis['same_floor_graphics']:
|
||||
analysis['potential_issues'].append(
|
||||
"BG2 overlay objects with same floor graphics - may have compositing issues"
|
||||
)
|
||||
|
||||
if verbose and analysis['has_bg2_objects']:
|
||||
print(f"\n{'='*70}")
|
||||
print("LAYER COMPOSITING ANALYSIS")
|
||||
print(f"{'='*70}")
|
||||
print(f"\nBG2 Overlay objects ({analysis['bg2_object_count']}):")
|
||||
for obj in analysis['bg2_objects']:
|
||||
desc = get_object_description(obj['id'])
|
||||
print(f" ID=0x{obj['id']:03X} @ ({obj['x']},{obj['y']}) size={obj['size']} - {desc}")
|
||||
|
||||
if analysis['potential_issues']:
|
||||
print("\nPotential Issues:")
|
||||
for issue in analysis['potential_issues']:
|
||||
print(f" - {issue}")
|
||||
|
||||
return analysis
|
||||
|
||||
|
||||
def find_rom_file(specified_path: Optional[str] = None) -> Optional[str]:
|
||||
"""Find a valid ROM file."""
|
||||
if specified_path:
|
||||
if os.path.isfile(specified_path):
|
||||
return specified_path
|
||||
print(f"Error: ROM file not found: {specified_path}")
|
||||
return None
|
||||
|
||||
# Try default paths relative to script location
|
||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
project_root = os.path.dirname(script_dir)
|
||||
|
||||
for rel_path in DEFAULT_ROM_PATHS:
|
||||
full_path = os.path.join(project_root, rel_path)
|
||||
if os.path.isfile(full_path):
|
||||
return full_path
|
||||
|
||||
print("Error: Could not find ROM file. Please specify with --rom")
|
||||
print("Tried paths:")
|
||||
for rel_path in DEFAULT_ROM_PATHS:
|
||||
print(f" {os.path.join(project_root, rel_path)}")
|
||||
return None
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Analyze dungeon room objects from ALTTP ROM",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
%(prog)s 1 # Analyze room 001
|
||||
%(prog)s 1 2 3 # Analyze rooms 001, 002, 003
|
||||
%(prog)s --range 0 10 # Analyze rooms 0-10
|
||||
%(prog)s --all # Analyze all rooms (summary only)
|
||||
%(prog)s --list-bg2 # List rooms with BG2 overlay objects
|
||||
%(prog)s 1 --json # Output as JSON
|
||||
%(prog)s 1 --compositing # Include layer compositing analysis
|
||||
"""
|
||||
)
|
||||
|
||||
parser.add_argument('rooms', nargs='*', type=int, help='Room ID(s) to analyze')
|
||||
parser.add_argument('--rom', '-r', type=str, help='Path to ROM file')
|
||||
parser.add_argument('--range', nargs=2, type=int, metavar=('START', 'END'),
|
||||
help='Analyze range of rooms (inclusive)')
|
||||
parser.add_argument('--all', action='store_true', help='Analyze all rooms (summary only)')
|
||||
parser.add_argument('--json', '-j', action='store_true', help='Output as JSON')
|
||||
parser.add_argument('--quiet', '-q', action='store_true', help='Minimal output')
|
||||
parser.add_argument('--compositing', '-c', action='store_true',
|
||||
help='Include layer compositing analysis')
|
||||
parser.add_argument('--list-bg2', action='store_true',
|
||||
help='List all rooms with BG2 overlay objects')
|
||||
parser.add_argument('--summary', '-s', action='store_true',
|
||||
help='Show summary only (object counts)')
|
||||
|
||||
# Collision offset features
|
||||
parser.add_argument('--collision', action='store_true',
|
||||
help='Calculate collision map offsets for objects')
|
||||
parser.add_argument('--filter-id', type=lambda x: int(x, 0), metavar='ID',
|
||||
help='Filter objects by ID (e.g., 0xD9 or 217)')
|
||||
parser.add_argument('--asm', action='store_true',
|
||||
help='Output collision offsets in ASM format')
|
||||
parser.add_argument('--area', action='store_true',
|
||||
help='Expand objects to full tile area (not just origin)')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Find ROM file
|
||||
rom_path = find_rom_file(args.rom)
|
||||
if not rom_path:
|
||||
sys.exit(1)
|
||||
|
||||
# Load ROM
|
||||
if not args.quiet:
|
||||
print(f"Loading ROM: {rom_path}")
|
||||
with open(rom_path, 'rb') as f:
|
||||
rom_data = f.read()
|
||||
if not args.quiet:
|
||||
print(f"ROM size: {len(rom_data)} bytes")
|
||||
|
||||
# Determine rooms to analyze
|
||||
room_ids = []
|
||||
if args.all or args.list_bg2:
|
||||
room_ids = list(range(NUMBER_OF_ROOMS))
|
||||
elif args.range:
|
||||
room_ids = list(range(args.range[0], args.range[1] + 1))
|
||||
elif args.rooms:
|
||||
room_ids = args.rooms
|
||||
else:
|
||||
# Default to room 1 if nothing specified
|
||||
room_ids = [1]
|
||||
|
||||
# Validate room IDs
|
||||
room_ids = [r for r in room_ids if 0 <= r < NUMBER_OF_ROOMS]
|
||||
|
||||
if not room_ids:
|
||||
print("Error: No valid room IDs specified")
|
||||
sys.exit(1)
|
||||
|
||||
# Analyze rooms
|
||||
all_results = []
|
||||
verbose = not (args.quiet or args.json or args.list_bg2 or args.all or args.asm)
|
||||
|
||||
for room_id in room_ids:
|
||||
try:
|
||||
result = parse_room_objects(rom_data, room_id, verbose=verbose)
|
||||
|
||||
if args.compositing:
|
||||
result['compositing'] = analyze_layer_compositing(result, verbose=verbose)
|
||||
|
||||
if args.collision:
|
||||
collision_verbose = not (args.asm or args.quiet)
|
||||
result['collision'] = analyze_collision_offsets(
|
||||
result,
|
||||
filter_id=args.filter_id,
|
||||
expand_area=args.area,
|
||||
asm_output=args.asm,
|
||||
verbose=collision_verbose
|
||||
)
|
||||
|
||||
all_results.append(result)
|
||||
|
||||
except Exception as e:
|
||||
if not args.quiet:
|
||||
print(f"Error analyzing room {room_id}: {e}")
|
||||
|
||||
# Output results
|
||||
if args.collision and args.asm:
|
||||
# Already output by analyze_collision_offsets
|
||||
pass
|
||||
|
||||
elif args.json:
|
||||
# Convert to JSON-serializable format
|
||||
for result in all_results:
|
||||
result['objects_by_layer'] = {
|
||||
str(k): v for k, v in result['objects_by_layer'].items()
|
||||
}
|
||||
print(json.dumps(all_results, indent=2))
|
||||
|
||||
elif args.list_bg2:
|
||||
print(f"\n{'='*70}")
|
||||
print("ROOMS WITH BG2 OVERLAY OBJECTS")
|
||||
print(f"{'='*70}")
|
||||
rooms_with_bg2 = []
|
||||
for result in all_results:
|
||||
bg2_count = len(result['objects_by_layer'][1])
|
||||
if bg2_count > 0:
|
||||
rooms_with_bg2.append((result['room_id'], bg2_count))
|
||||
|
||||
print(f"\nFound {len(rooms_with_bg2)} rooms with BG2 overlay objects:")
|
||||
for room_id, count in sorted(rooms_with_bg2):
|
||||
print(f" Room {room_id:03d} (0x{room_id:03X}): {count} BG2 objects")
|
||||
|
||||
elif args.all or args.summary:
|
||||
print(f"\n{'='*70}")
|
||||
print("ROOM SUMMARY")
|
||||
print(f"{'='*70}")
|
||||
print(f"{'Room':>6} {'L0':>4} {'L1':>4} {'L2':>4} {'Doors':>5} {'Floor':>8}")
|
||||
print("-" * 40)
|
||||
for result in all_results:
|
||||
l0 = len(result['objects_by_layer'][0])
|
||||
l1 = len(result['objects_by_layer'][1])
|
||||
l2 = len(result['objects_by_layer'][2])
|
||||
doors = len(result['doors'])
|
||||
floor = f"{result['floor1']}/{result['floor2']}"
|
||||
print(f"{result['room_id']:>6} {l0:>4} {l1:>4} {l2:>4} {doors:>5} {floor:>8}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
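For reference, a couple of representative invocations of the room-object analyzer above; the script path and ROM filename are assumptions (both are defined outside this hunk), while the flags map directly onto the argparse options in main():

```bash
# Hypothetical script path and ROM name -- substitute the real ones.
python3 scripts/analyze_room_objects.py 1 2 3 --rom zelda3.sfc --compositing

# Collision offsets for object 0xD9 in room 1, emitted as ASM for $7F2000.
python3 scripts/analyze_room_objects.py 1 --rom zelda3.sfc --collision --filter-id 0xD9 --asm
```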
254
scripts/build-wasm.sh
Executable file
@@ -0,0 +1,254 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
usage() {
|
||||
cat <<'EOF'
|
||||
Usage: scripts/build-wasm.sh [debug|release|ai] [--incremental] [--clean]
|
||||
Options:
|
||||
debug|release|ai Build mode (default: release). Use 'ai' for agent-enabled web build.
|
||||
--incremental Skip cleaning CMake cache/files to speed up incremental builds
|
||||
--clean Completely remove build directory and start fresh
|
||||
Note: debug/release/ai share the same build-wasm directory.
|
||||
EOF
|
||||
}
|
||||
|
||||
# Defaults
|
||||
BUILD_MODE="release"
|
||||
CLEAN_CACHE=true
|
||||
FULL_CLEAN=false
|
||||
|
||||
for arg in "$@"; do
|
||||
case "$arg" in
|
||||
debug|release|ai)
|
||||
BUILD_MODE="$arg"
|
||||
;;
|
||||
--incremental)
|
||||
CLEAN_CACHE=false
|
||||
;;
|
||||
--clean)
|
||||
FULL_CLEAN=true
|
||||
;;
|
||||
-h|--help)
|
||||
usage
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
echo "Unknown option: $arg"
|
||||
usage
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Directory of this script
|
||||
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
|
||||
PROJECT_ROOT="$DIR/.."
|
||||
|
||||
# Set build directory and preset based on mode
|
||||
if [ "$BUILD_MODE" = "debug" ]; then
|
||||
BUILD_DIR="$PROJECT_ROOT/build-wasm"
|
||||
CMAKE_PRESET="wasm-debug"
|
||||
elif [ "$BUILD_MODE" = "ai" ]; then
|
||||
BUILD_DIR="$PROJECT_ROOT/build-wasm"
|
||||
CMAKE_PRESET="wasm-ai"
|
||||
else
|
||||
BUILD_DIR="$PROJECT_ROOT/build-wasm"
|
||||
CMAKE_PRESET="wasm-release"
|
||||
fi
|
||||
|
||||
# Check for emcmake
|
||||
if ! command -v emcmake &> /dev/null; then
|
||||
echo "Error: emcmake not found. Please activate Emscripten SDK environment."
|
||||
echo " source /path/to/emsdk/emsdk_env.sh"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "=== Building YAZE for Web (WASM) - $BUILD_MODE mode ==="
|
||||
echo "Build directory: $BUILD_DIR (shared for debug/release/ai)"
|
||||
|
||||
# Handle build directory based on flags
|
||||
if [ -d "$BUILD_DIR" ]; then
|
||||
if [ "$FULL_CLEAN" = true ]; then
|
||||
echo "Full clean: removing entire build directory..."
|
||||
rm -rf "$BUILD_DIR"
|
||||
elif [ "$CLEAN_CACHE" = true ]; then
|
||||
echo "Cleaning build directory (CMake cache/files)..."
|
||||
rm -rf "$BUILD_DIR/CMakeCache.txt" "$BUILD_DIR/CMakeFiles" 2>/dev/null || true
|
||||
else
|
||||
echo "Incremental build: skipping CMake cache clean."
|
||||
fi
|
||||
fi
|
||||
mkdir -p "$BUILD_DIR"
|
||||
cd "$BUILD_DIR"
|
||||
|
||||
# Configure with ccache if available
|
||||
echo "Configuring..."
|
||||
CMAKE_EXTRA_ARGS=""
|
||||
if command -v ccache &> /dev/null; then
|
||||
echo "ccache detected - enabling compiler caching"
|
||||
CMAKE_EXTRA_ARGS="-DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache"
|
||||
fi
|
||||
emcmake cmake "$PROJECT_ROOT" --preset $CMAKE_PRESET $CMAKE_EXTRA_ARGS
|
||||
|
||||
# Build (use parallel jobs)
|
||||
echo "Building..."
|
||||
cmake --build . --parallel
|
||||
|
||||
# Package / Organize output
|
||||
echo "Packaging..."
|
||||
mkdir -p dist
|
||||
|
||||
# Copy helper (rsync if available; only --delete for directories)
|
||||
copy_item() {
|
||||
src="$1"; dest="$2"
|
||||
if command -v rsync >/dev/null 2>&1; then
|
||||
if [ -d "$src" ]; then
|
||||
mkdir -p "$dest"
|
||||
rsync -a --delete "$src"/ "$dest"/
|
||||
else
|
||||
rsync -a "$src" "$dest"
|
||||
fi
|
||||
else
|
||||
mkdir -p "$(dirname "$dest")"
|
||||
cp -r "$src" "$dest"
|
||||
fi
|
||||
}
|
||||
|
||||
# Copy main WASM app
|
||||
if [ -f bin/index.html ]; then
|
||||
copy_item bin/index.html dist/index.html
|
||||
else
|
||||
copy_item bin/yaze.html dist/index.html
|
||||
fi
|
||||
copy_item bin/yaze.html dist/yaze.html
|
||||
copy_item bin/yaze.js dist/
|
||||
copy_item bin/yaze.wasm dist/
|
||||
copy_item bin/yaze.worker.js dist/ 2>/dev/null || true # pthread worker script
|
||||
copy_item bin/yaze.data dist/ 2>/dev/null || true # might not exist if no assets packed
|
||||
|
||||
# Copy web assets (organized in subdirectories)
|
||||
echo "Copying web assets..."
|
||||
|
||||
# Helper function to copy all files from a source directory to destination
|
||||
# Usage: copy_directory_contents <src_dir> <dest_dir> [file_pattern]
|
||||
copy_directory_contents() {
|
||||
local src_dir="$1"
|
||||
local dest_dir="$2"
|
||||
local pattern="${3:-*}" # Default to all files
|
||||
|
||||
if [ ! -d "$src_dir" ]; then
|
||||
echo "Warning: Source directory not found: $src_dir"
|
||||
return 1
|
||||
fi
|
||||
|
||||
mkdir -p "$dest_dir"
|
||||
local count=0
|
||||
|
||||
# Use find to get all matching files (handles patterns better)
|
||||
while IFS= read -r -d '' file; do
|
||||
if [ -f "$file" ]; then
|
||||
copy_item "$file" "$dest_dir/"
|
||||
((count++)) || true
|
||||
fi
|
||||
done < <(find "$src_dir" -maxdepth 1 -type f -name "$pattern" -print0 2>/dev/null)
|
||||
|
||||
if [ "$count" -eq 0 ]; then
|
||||
echo "Warning: No files matching '$pattern' found in $src_dir"
|
||||
else
|
||||
echo " Copied $count file(s)"
|
||||
fi
|
||||
}
|
||||
|
||||
# Copy styles directory (all CSS files)
|
||||
if [ -d "$PROJECT_ROOT/src/web/styles" ]; then
|
||||
echo "Copying styles..."
|
||||
copy_directory_contents "$PROJECT_ROOT/src/web/styles" "dist/styles" "*.css"
|
||||
fi
|
||||
|
||||
# Copy components directory (all JS files)
|
||||
if [ -d "$PROJECT_ROOT/src/web/components" ]; then
|
||||
echo "Copying components..."
|
||||
copy_directory_contents "$PROJECT_ROOT/src/web/components" "dist/components" "*.js"
|
||||
fi
|
||||
|
||||
# Copy core directory (all JS files)
|
||||
if [ -d "$PROJECT_ROOT/src/web/core" ]; then
|
||||
echo "Copying core..."
|
||||
copy_directory_contents "$PROJECT_ROOT/src/web/core" "dist/core" "*.js"
|
||||
fi
|
||||
|
||||
# Copy PWA files (all files in the directory)
|
||||
if [ -d "$PROJECT_ROOT/src/web/pwa" ]; then
|
||||
echo "Copying PWA files..."
|
||||
mkdir -p dist/pwa
|
||||
# Copy all JS files
|
||||
copy_directory_contents "$PROJECT_ROOT/src/web/pwa" "dist/pwa" "*.js"
|
||||
# Copy manifest.json
|
||||
copy_directory_contents "$PROJECT_ROOT/src/web/pwa" "dist/pwa" "*.json"
|
||||
# Copy HTML files
|
||||
copy_directory_contents "$PROJECT_ROOT/src/web/pwa" "dist/pwa" "*.html"
|
||||
# Copy markdown docs (optional, for reference)
|
||||
copy_directory_contents "$PROJECT_ROOT/src/web/pwa" "dist/pwa" "*.md"
|
||||
# Verify coi-serviceworker.js was copied (critical for SharedArrayBuffer support)
|
||||
if [ -f "dist/pwa/coi-serviceworker.js" ]; then
|
||||
echo " coi-serviceworker.js present (required for SharedArrayBuffer/pthreads)"
|
||||
# CRITICAL: Also copy to root for GitHub Pages (service worker scope must cover /)
|
||||
cp "dist/pwa/coi-serviceworker.js" "dist/coi-serviceworker.js"
|
||||
echo " coi-serviceworker.js copied to root (for GitHub Pages)"
|
||||
else
|
||||
echo "Warning: coi-serviceworker.js not found - SharedArrayBuffer may not work"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Copy debug tools
|
||||
if [ -d "$PROJECT_ROOT/src/web/debug" ]; then
|
||||
echo "Copying debug tools..."
|
||||
mkdir -p dist/debug
|
||||
# Copy all files (could be .js, .cc, .html, etc.)
|
||||
copy_directory_contents "$PROJECT_ROOT/src/web/debug" "dist/debug" "*"
|
||||
fi
|
||||
|
||||
# Copy main app.js (stays at root)
|
||||
if [ -f "$PROJECT_ROOT/src/web/app.js" ]; then
|
||||
copy_item "$PROJECT_ROOT/src/web/app.js" dist/
|
||||
fi
|
||||
|
||||
# Copy shell UI helpers (dropdown/menu handlers referenced from HTML)
|
||||
if [ -f "$PROJECT_ROOT/src/web/shell_ui.js" ]; then
|
||||
copy_item "$PROJECT_ROOT/src/web/shell_ui.js" dist/
|
||||
fi
|
||||
|
||||
# Copy icons directory
|
||||
if [ -d "$PROJECT_ROOT/src/web/icons" ]; then
|
||||
echo "Copying icons..."
|
||||
copy_item "$PROJECT_ROOT/src/web/icons" dist/icons
|
||||
if [ ! -d "dist/icons" ]; then
|
||||
echo "Warning: icons directory not copied successfully"
|
||||
fi
|
||||
else
|
||||
echo "Warning: icons directory not found at $PROJECT_ROOT/src/web/icons"
|
||||
fi
|
||||
|
||||
# Copy yaze icon
|
||||
if [ -f "$PROJECT_ROOT/assets/yaze.png" ]; then
|
||||
mkdir -p dist/assets
|
||||
copy_item "$PROJECT_ROOT/assets/yaze.png" dist/assets/
|
||||
echo "yaze icon copied"
|
||||
fi
|
||||
|
||||
# Copy z3ed WASM module if built
|
||||
if [ -f bin/z3ed.js ]; then
|
||||
echo "Copying z3ed terminal module..."
|
||||
copy_item bin/z3ed.js dist/
|
||||
copy_item bin/z3ed.wasm dist/
|
||||
copy_item bin/z3ed.worker.js dist/ 2>/dev/null || true
|
||||
fi
|
||||
|
||||
echo "=== Build Complete ==="
|
||||
echo "Output in: $BUILD_DIR/dist/"
|
||||
echo ""
|
||||
echo "To serve the app, run:"
|
||||
echo " scripts/serve-wasm.sh [port]"
|
||||
echo ""
|
||||
echo "Or manually:"
|
||||
echo " cd $BUILD_DIR/dist && python3 -m http.server 8080"
|
||||
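As a usage sketch, the options parsed above combine like this (run from the repository root):

```bash
# Full clean agent-enabled web build, then a faster incremental rebuild.
./scripts/build-wasm.sh ai --clean
./scripts/build-wasm.sh ai --incremental

# Serve the packaged dist/ output.
./scripts/serve-wasm.sh 8080
```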
262
scripts/build_z3ed_wasm.sh
Executable file
@@ -0,0 +1,262 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Build script for z3ed WASM terminal mode
|
||||
# This script builds z3ed CLI for web browsers without TUI dependencies
|
||||
|
||||
set -e
|
||||
|
||||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
echo -e "${GREEN}Building z3ed for WASM Terminal Mode${NC}"
|
||||
echo "======================================="
|
||||
|
||||
# Check if emscripten is available
|
||||
if ! command -v emcc &> /dev/null; then
|
||||
echo -e "${RED}Error: Emscripten (emcc) not found!${NC}"
|
||||
echo "Please install and activate Emscripten SDK:"
|
||||
echo " git clone https://github.com/emscripten-core/emsdk.git"
|
||||
echo " cd emsdk"
|
||||
echo " ./emsdk install latest"
|
||||
echo " ./emsdk activate latest"
|
||||
echo " source ./emsdk_env.sh"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Get the script directory and project root
|
||||
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
|
||||
PROJECT_ROOT="$( cd "$SCRIPT_DIR/.." && pwd )"
|
||||
|
||||
# Build directory
|
||||
BUILD_DIR="${PROJECT_ROOT}/build-wasm"
|
||||
|
||||
# Parse command line arguments
|
||||
CLEAN_BUILD=false
|
||||
BUILD_TYPE="Release"
|
||||
VERBOSE=""
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case $1 in
|
||||
--clean)
|
||||
CLEAN_BUILD=true
|
||||
shift
|
||||
;;
|
||||
--debug)
|
||||
BUILD_TYPE="Debug"
|
||||
shift
|
||||
;;
|
||||
--verbose|-v)
|
||||
VERBOSE="-v"
|
||||
shift
|
||||
;;
|
||||
--help|-h)
|
||||
echo "Usage: $0 [options]"
|
||||
echo "Options:"
|
||||
echo " --clean Clean build directory before building"
|
||||
echo " --debug Build in debug mode (default: release)"
|
||||
echo " --verbose Enable verbose build output"
|
||||
echo " --help Show this help message"
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
echo -e "${YELLOW}Unknown option: $1${NC}"
|
||||
shift
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Clean build directory if requested
|
||||
if [ "$CLEAN_BUILD" = true ]; then
|
||||
echo -e "${YELLOW}Cleaning build directory...${NC}"
|
||||
rm -rf "${BUILD_DIR}"
|
||||
fi
|
||||
|
||||
# Create build directory
|
||||
mkdir -p "${BUILD_DIR}"
|
||||
|
||||
# Configure with CMake
|
||||
echo -e "${GREEN}Configuring CMake...${NC}"
|
||||
cd "${PROJECT_ROOT}"
|
||||
|
||||
if [ "$BUILD_TYPE" = "Debug" ]; then
|
||||
# For debug builds, we could create a wasm-debug preset or modify flags
|
||||
cmake --preset wasm-release \
|
||||
-DCMAKE_BUILD_TYPE=Debug \
|
||||
-DYAZE_BUILD_CLI=ON \
|
||||
-DYAZE_BUILD_Z3ED=ON \
|
||||
-DYAZE_WASM_TERMINAL=ON
|
||||
else
|
||||
cmake --preset wasm-release
|
||||
fi
|
||||
|
||||
# Build z3ed
|
||||
echo -e "${GREEN}Building z3ed...${NC}"
|
||||
cmake --build "${BUILD_DIR}" --target z3ed $VERBOSE
|
||||
|
||||
# Check if build succeeded
|
||||
if [ -f "${BUILD_DIR}/bin/z3ed.js" ]; then
|
||||
echo -e "${GREEN}✓ Build successful!${NC}"
|
||||
echo ""
|
||||
echo "Output files:"
|
||||
echo " - ${BUILD_DIR}/bin/z3ed.js"
|
||||
echo " - ${BUILD_DIR}/bin/z3ed.wasm"
|
||||
echo ""
|
||||
echo "To use z3ed in a web page:"
|
||||
echo "1. Include z3ed.js in your HTML"
|
||||
echo "2. Initialize the module:"
|
||||
echo " const Z3edTerminal = await Z3edTerminal();"
|
||||
echo "3. Call exported functions:"
|
||||
echo " Z3edTerminal.ccall('z3ed_init', 'number', [], []);"
|
||||
echo " const result = Z3edTerminal.ccall('z3ed_execute_command', 'string', ['string'], ['help']);"
|
||||
else
|
||||
echo -e "${RED}✗ Build failed!${NC}"
|
||||
echo "Check the build output above for errors."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Optional: Generate HTML test page
|
||||
if [ ! -f "${BUILD_DIR}/z3ed_test.html" ]; then
|
||||
echo -e "${YELLOW}Generating test HTML page...${NC}"
|
||||
cat > "${BUILD_DIR}/z3ed_test.html" << 'EOF'
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>z3ed WASM Terminal Test</title>
|
||||
<style>
|
||||
body {
|
||||
font-family: 'Courier New', monospace;
|
||||
background-color: #1e1e1e;
|
||||
color: #d4d4d4;
|
||||
margin: 20px;
|
||||
}
|
||||
#terminal {
|
||||
background-color: #000;
|
||||
border: 1px solid #444;
|
||||
padding: 10px;
|
||||
height: 400px;
|
||||
overflow-y: auto;
|
||||
white-space: pre-wrap;
|
||||
}
|
||||
#input-line {
|
||||
display: flex;
|
||||
margin-top: 10px;
|
||||
}
|
||||
#prompt {
|
||||
color: #4ec9b0;
|
||||
margin-right: 5px;
|
||||
}
|
||||
#command-input {
|
||||
flex-grow: 1;
|
||||
background-color: #1e1e1e;
|
||||
color: #d4d4d4;
|
||||
border: 1px solid #444;
|
||||
padding: 5px;
|
||||
font-family: inherit;
|
||||
}
|
||||
.output {
|
||||
color: #d4d4d4;
|
||||
}
|
||||
.error {
|
||||
color: #f44747;
|
||||
}
|
||||
.info {
|
||||
color: #4ec9b0;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>z3ed WASM Terminal Test</h1>
|
||||
<div id="terminal"></div>
|
||||
<div id="input-line">
|
||||
<span id="prompt">z3ed></span>
|
||||
<input type="text" id="command-input" placeholder="Type 'help' for commands...">
|
||||
</div>
|
||||
|
||||
<script src="bin/z3ed.js"></script>
|
||||
<script>
|
||||
let z3edModule;
|
||||
const terminal = document.getElementById('terminal');
|
||||
const commandInput = document.getElementById('command-input');
|
||||
|
||||
function addToTerminal(text, className = 'output') {
|
||||
const line = document.createElement('div');
|
||||
line.className = className;
|
||||
line.textContent = text;
|
||||
terminal.appendChild(line);
|
||||
terminal.scrollTop = terminal.scrollHeight;
|
||||
}
|
||||
|
||||
window.z3edTerminal = {
|
||||
print: (text) => addToTerminal(text, 'output'),
|
||||
printError: (text) => addToTerminal(text, 'error')
|
||||
};
|
||||
|
||||
async function initZ3ed() {
|
||||
try {
|
||||
addToTerminal('Initializing z3ed WASM terminal...', 'info');
|
||||
z3edModule = await Z3edTerminal();
|
||||
|
||||
// Initialize the terminal
|
||||
const ready = z3edModule.ccall('Z3edIsReady', 'number', [], []);
|
||||
if (ready) {
|
||||
addToTerminal('z3ed terminal ready!', 'info');
|
||||
addToTerminal('Type "help" for available commands', 'info');
|
||||
} else {
|
||||
addToTerminal('Failed to initialize z3ed', 'error');
|
||||
}
|
||||
} catch (error) {
|
||||
addToTerminal('Error loading z3ed: ' + error.message, 'error');
|
||||
}
|
||||
}
|
||||
|
||||
function executeCommand(command) {
|
||||
if (!z3edModule) {
|
||||
addToTerminal('z3ed not initialized', 'error');
|
||||
return;
|
||||
}
|
||||
|
||||
// Show the command in terminal
|
||||
addToTerminal('z3ed> ' + command, 'output');
|
||||
|
||||
try {
|
||||
// Execute the command
|
||||
const result = z3edModule.ccall(
|
||||
'Z3edProcessCommand',
|
||||
'string',
|
||||
['string'],
|
||||
[command]
|
||||
);
|
||||
|
||||
if (result) {
|
||||
addToTerminal(result, 'output');
|
||||
}
|
||||
} catch (error) {
|
||||
addToTerminal('Error: ' + error.message, 'error');
|
||||
}
|
||||
}
|
||||
|
||||
commandInput.addEventListener('keydown', (e) => {
|
||||
if (e.key === 'Enter') {
|
||||
const command = commandInput.value.trim();
|
||||
if (command) {
|
||||
executeCommand(command);
|
||||
commandInput.value = '';
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// Initialize on load
|
||||
initZ3ed();
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
EOF
|
||||
echo -e "${GREEN}Test page created: ${BUILD_DIR}/z3ed_test.html${NC}"
|
||||
fi
|
||||
|
||||
echo -e "${GREEN}Build complete!${NC}"
|
||||
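A minimal sketch of driving the z3ed WASM build above; the options come from the script's own --help text, and any static file server can host the generated test page:

```bash
# Clean debug build of the z3ed terminal module with verbose output.
./scripts/build_z3ed_wasm.sh --clean --debug --verbose

# Serve the build directory so z3ed_test.html can load bin/z3ed.js.
cd build-wasm && python3 -m http.server 8000
```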
78
scripts/demo_agent_gui.sh
Executable file
@@ -0,0 +1,78 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
# Colors
|
||||
GREEN='\033[0;32m'
|
||||
RED='\033[0;31m'
|
||||
NC='\033[0m'
|
||||
|
||||
PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
||||
BUILD_ROOT="${YAZE_BUILD_DIR:-$PROJECT_ROOT/build}"
|
||||
# Try Debug dir first (multi-config), then root bin
|
||||
if [ -d "$BUILD_ROOT/bin/Debug" ]; then
|
||||
BUILD_DIR="$BUILD_ROOT/bin/Debug"
|
||||
else
|
||||
BUILD_DIR="$BUILD_ROOT/bin"
|
||||
fi
|
||||
|
||||
# Handle macOS bundle
|
||||
if [ -d "$BUILD_DIR/yaze.app" ]; then
|
||||
YAZE_BIN="$BUILD_DIR/yaze.app/Contents/MacOS/yaze"
|
||||
else
|
||||
YAZE_BIN="$BUILD_DIR/yaze"
|
||||
fi
|
||||
Z3ED_BIN="$BUILD_DIR/z3ed"
|
||||
|
||||
# Check binaries
|
||||
if [ ! -f "$YAZE_BIN" ] || [ ! -f "$Z3ED_BIN" ]; then
|
||||
echo -e "${RED}Error: Binaries not found in $BUILD_DIR${NC}"
|
||||
echo "Please run: cmake --preset mac-ai && cmake --build build"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo -e "${GREEN}Starting YAZE GUI with gRPC test harness...${NC}"
|
||||
# Start yaze in background with test harness enabled
|
||||
# The z3ed commands below pass --mock-rom, so no real ROM file is needed for this demo
|
||||
PORT=50055
|
||||
echo "Launching YAZE binary: $YAZE_BIN"
|
||||
"$YAZE_BIN" --enable_test_harness --test_harness_port=$PORT --log_to_console &
|
||||
YAZE_PID=$!
|
||||
|
||||
# Wait for server to start
|
||||
echo "Waiting for gRPC server on port $PORT (PID: $YAZE_PID)..."
|
||||
# Loop to check if port is actually listening
|
||||
for i in {1..20}; do
|
||||
if lsof -Pi :$PORT -sTCP:LISTEN -t >/dev/null; then
|
||||
echo -e "${GREEN}Server is listening!${NC}"
|
||||
break
|
||||
fi
|
||||
echo "..."
|
||||
sleep 1
|
||||
done
|
||||
|
||||
# Check if process still alive
|
||||
if ! kill -0 $YAZE_PID 2>/dev/null; then
|
||||
echo -e "${RED}Error: YAZE process died prematurely.${NC}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
cleanup() {
|
||||
echo -e "${GREEN}Stopping YAZE GUI (PID: $YAZE_PID)...${NC}"
|
||||
kill "$YAZE_PID" 2>/dev/null || true
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
echo -e "${GREEN}Step 1: Discover Widgets${NC}"
|
||||
"$Z3ED_BIN" gui-discover-tool --format=text --mock-rom --gui_server_address="localhost:$PORT"
|
||||
|
||||
echo -e "${GREEN}Step 2: Take Screenshot (Before Click)${NC}"
|
||||
"$Z3ED_BIN" gui-screenshot --region=full --format=json --mock-rom --gui_server_address="localhost:$PORT"
|
||||
|
||||
echo -e "${GREEN}Step 3: Click 'File' Menu${NC}"
|
||||
"$Z3ED_BIN" gui-click --target="File" --format=text --mock-rom --gui_server_address="localhost:$PORT" || echo -e "${RED}Click failed (expected if ID wrong)${NC}"
|
||||
|
||||
echo -e "${GREEN}Step 4: Take Screenshot (After Click)${NC}"
|
||||
"$Z3ED_BIN" gui-screenshot --region=full --format=json --mock-rom --gui_server_address="localhost:$PORT"
|
||||
|
||||
echo -e "${GREEN}Demo Complete! Keeping YAZE open for 60 seconds...${NC}"
|
||||
sleep 60
|
||||
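With yaze already running under the test harness (as the demo above does on port 50055), a single GUI probe can also be issued by hand; the z3ed binary path here is an assumption and depends on the build layout:

```bash
# Assumes a flat build layout; adjust the path if using bin/Debug or a macOS bundle.
./build/bin/z3ed gui-discover-tool --format=text --mock-rom --gui_server_address="localhost:50055"
```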
71
scripts/dev_start_yaze.sh
Executable file
@@ -0,0 +1,71 @@
|
||||
#!/bin/bash
|
||||
# scripts/dev_start_yaze.sh
|
||||
# Quickly builds and starts YAZE with gRPC enabled for Agent testing.
|
||||
|
||||
# Exit on error
|
||||
set -e
|
||||
|
||||
# Project root
|
||||
PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
||||
BUILD_DIR="${YAZE_BUILD_DIR:-${PROJECT_ROOT}/build}"
|
||||
# Prefer Debug binary (agent preset builds Debug by default)
|
||||
YAZE_BIN="${BUILD_DIR}/bin/Debug/yaze.app/Contents/MacOS/yaze"
|
||||
TEST_HARNESS_PORT="${YAZE_GRPC_PORT:-50052}"
|
||||
|
||||
# Fallbacks if layout differs
|
||||
if [ ! -x "$YAZE_BIN" ]; then
|
||||
if [ -x "${BUILD_DIR}/bin/yaze" ]; then
|
||||
YAZE_BIN="${BUILD_DIR}/bin/yaze"
|
||||
elif [ -x "${BUILD_DIR}/bin/Debug/yaze" ]; then
|
||||
YAZE_BIN="${BUILD_DIR}/bin/Debug/yaze"
|
||||
elif [ -x "${BUILD_DIR}/bin/Release/yaze" ]; then
|
||||
YAZE_BIN="${BUILD_DIR}/bin/Release/yaze"
|
||||
else
|
||||
echo "❌ Could not find yaze binary in ${BUILD_DIR}/bin (checked app and flat)." >&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
# Default to oos168.sfc if available, otherwise check common locations or ask user
|
||||
ROM_PATH="/Users/scawful/Code/Oracle-of-Secrets/Roms/oos168.sfc"
|
||||
|
||||
# If the hardcoded path doesn't exist, try to find one
|
||||
if [ ! -f "$ROM_PATH" ]; then
|
||||
FOUND_ROM=$(find "${PROJECT_ROOT}/../Oracle-of-Secrets/Roms" -name "*.sfc" 2>/dev/null | head -n 1)
|
||||
if [ -n "$FOUND_ROM" ]; then
|
||||
ROM_PATH="$FOUND_ROM"
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "=================================================="
|
||||
echo "🚀 YAZE Agent Environment Launcher"
|
||||
echo "=================================================="
|
||||
|
||||
# Navigate to project root
|
||||
cd "${PROJECT_ROOT}" || exit 1
|
||||
|
||||
# 1. Build (Fast)
|
||||
echo "📦 Building YAZE (Target: yaze)..."
|
||||
"./scripts/agent_build.sh" yaze
|
||||
|
||||
# 2. Check ROM
|
||||
if [ ! -f "$ROM_PATH" ]; then
|
||||
echo "❌ ROM not found at $ROM_PATH"
|
||||
echo " Please edit this script to set a valid ROM_PATH."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# 3. Start YAZE with gRPC and Debug flags
|
||||
echo "🎮 Launching YAZE..."
|
||||
echo " - gRPC: Enabled (Port ${TEST_HARNESS_PORT})"
|
||||
echo " - ROM: $(basename "$ROM_PATH")"
|
||||
echo " - Editor: Dungeon"
|
||||
echo " - Cards: Object Editor"
|
||||
echo "=================================================="
|
||||
|
||||
"${YAZE_BIN}" \
|
||||
--enable_test_harness \
|
||||
--test_harness_port "${TEST_HARNESS_PORT}" \
|
||||
--rom_file "$ROM_PATH" \
|
||||
--debug \
|
||||
--editor "Dungeon" \
|
||||
--cards "Object Editor"
|
||||
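Both the build tree and the gRPC port honor environment overrides (YAZE_BUILD_DIR, YAZE_GRPC_PORT), so a non-default launch might look like:

```bash
# Use an alternate build directory and port; the ROM still comes from the script's fallback logic.
YAZE_BUILD_DIR="$PWD/build_gemini" YAZE_GRPC_PORT=50060 ./scripts/dev_start_yaze.sh
```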
189
scripts/dump_object_handlers.py
Executable file
@@ -0,0 +1,189 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Dump ALTTP Dungeon Object Handler Tables
|
||||
|
||||
This script reads the dungeon object handler tables from ROM and dumps:
|
||||
1. Handler addresses for Type 1, 2, and 3 objects
|
||||
2. First 20 Type 1 handler addresses
|
||||
3. Handler routine analysis
|
||||
|
||||
Based on ALTTP ROM structure:
|
||||
- Type 1 handler table: Bank $01, $8200 (objects 0x00-0xFF)
|
||||
- Type 2 handler table: Bank $01, $8470 (objects 0x100-0x1FF)
|
||||
- Type 3 handler table: Bank $01, $85F0 (objects 0x200-0x2FF)
|
||||
|
||||
Each entry is a 16-bit pointer (little-endian) to a handler routine in Bank $01.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import struct
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def read_rom(rom_path):
|
||||
"""Read ROM file and return data, skipping SMC header if present."""
|
||||
with open(rom_path, 'rb') as f:
|
||||
data = f.read()
|
||||
|
||||
# Check for SMC header (512 bytes)
|
||||
if len(data) % 0x400 == 0x200:
|
||||
print(f"[INFO] SMC header detected, skipping 512 bytes")
|
||||
return data[0x200:]
|
||||
return data
|
||||
|
||||
|
||||
def pc_to_snes(pc_addr):
|
||||
"""Convert PC address to SNES $01:xxxx format."""
|
||||
# For LoROM, PC address maps to SNES as:
|
||||
# PC 0x00000-0x7FFF -> $00:8000-$00:FFFF
|
||||
# PC 0x08000-0x0FFFF -> $01:8000-$01:FFFF
|
||||
bank = (pc_addr >> 15) & 0xFF
|
||||
offset = (pc_addr & 0x7FFF) | 0x8000
|
||||
return f"${bank:02X}:{offset:04X}"
|
||||
|
||||
|
||||
def snes_to_pc(bank, offset):
|
||||
"""Convert SNES address to PC address (LoROM mapping)."""
|
||||
# Bank $01, offset $8000-$FFFF -> PC 0x08000 + (offset - 0x8000)
|
||||
if offset < 0x8000:
|
||||
raise ValueError(f"Invalid offset ${offset:04X}, must be >= $8000")
|
||||
return (bank * 0x8000) + (offset - 0x8000)
|
||||
|
||||
|
||||
def dump_handler_table(rom_data, bank, start_offset, count, name):
|
||||
"""
|
||||
Dump handler table from ROM.
|
||||
|
||||
Args:
|
||||
rom_data: ROM data bytes
|
||||
bank: SNES bank number
|
||||
start_offset: SNES offset in bank
|
||||
count: Number of entries to read
|
||||
name: Table name for display
|
||||
|
||||
Returns:
|
||||
List of handler addresses (as integers)
|
||||
"""
|
||||
pc_addr = snes_to_pc(bank, start_offset)
|
||||
print(f"\n{'='*70}")
|
||||
print(f"{name}")
|
||||
print(f"SNES Address: ${bank:02X}:{start_offset:04X}")
|
||||
print(f"PC Address: 0x{pc_addr:06X}")
|
||||
print(f"{'='*70}")
|
||||
|
||||
handlers = []
|
||||
for i in range(count):
|
||||
entry_pc = pc_addr + (i * 2)
|
||||
if entry_pc + 1 >= len(rom_data):
|
||||
print(f"[ERROR] PC address 0x{entry_pc:06X} out of bounds")
|
||||
break
|
||||
|
||||
# Read 16-bit little-endian pointer
|
||||
handler_offset = struct.unpack_from('<H', rom_data, entry_pc)[0]
|
||||
handlers.append(handler_offset)
|
||||
|
||||
# Convert to full SNES address (same bank)
|
||||
handler_snes = f"${bank:02X}:{handler_offset:04X}"
|
||||
|
||||
# Only print first 20 for Type 1
|
||||
if i < 20 or name != "Type 1 Handler Table":
|
||||
print(f" Object 0x{i:03X}: {handler_snes} (PC: 0x{snes_to_pc(bank, handler_offset):06X})")
|
||||
|
||||
if name == "Type 1 Handler Table" and count > 20:
|
||||
print(f" ... ({count - 20} more entries)")
|
||||
|
||||
return handlers
|
||||
|
||||
|
||||
def analyze_handler_uniqueness(handlers, name):
|
||||
"""Analyze how many unique handlers exist."""
|
||||
unique_handlers = set(handlers)
|
||||
print(f"\n[ANALYSIS] {name}:")
|
||||
print(f" Total objects: {len(handlers)}")
|
||||
print(f" Unique handlers: {len(unique_handlers)}")
|
||||
print(f" Shared handlers: {len(handlers) - len(unique_handlers)}")
|
||||
|
||||
# Find most common handlers
|
||||
from collections import Counter
|
||||
handler_counts = Counter(handlers)
|
||||
most_common = handler_counts.most_common(5)
|
||||
print(f" Most common handlers:")
|
||||
for handler_offset, count in most_common:
|
||||
print(f" ${handler_offset:04X}: used by {count} objects")
|
||||
|
||||
|
||||
def dump_handler_bytes(rom_data, bank, handler_offset, byte_count=32):
|
||||
"""Dump first N bytes of a handler routine."""
|
||||
try:
|
||||
pc_addr = snes_to_pc(bank, handler_offset)
|
||||
if pc_addr + byte_count >= len(rom_data):
|
||||
byte_count = len(rom_data) - pc_addr
|
||||
|
||||
handler_bytes = rom_data[pc_addr:pc_addr + byte_count]
|
||||
print(f"\n[HANDLER DUMP] ${bank:02X}:{handler_offset:04X} (PC: 0x{pc_addr:06X})")
|
||||
print(f" First {byte_count} bytes:")
|
||||
|
||||
# Print in hex rows of 16 bytes
|
||||
for i in range(0, byte_count, 16):
|
||||
row = handler_bytes[i:i+16]
|
||||
hex_str = ' '.join(f'{b:02X}' for b in row)
|
||||
ascii_str = ''.join(chr(b) if 32 <= b < 127 else '.' for b in row)
|
||||
print(f" {i:04X}: {hex_str:<48} {ascii_str}")
|
||||
except ValueError as e:
|
||||
print(f"[ERROR] {e}")
|
||||
|
||||
|
||||
def main():
|
||||
if len(sys.argv) < 2:
|
||||
print("Usage: python3 dump_object_handlers.py <rom_path>")
|
||||
print("Example: python3 dump_object_handlers.py zelda3.sfc")
|
||||
sys.exit(1)
|
||||
|
||||
rom_path = Path(sys.argv[1])
|
||||
if not rom_path.exists():
|
||||
print(f"[ERROR] ROM file not found: {rom_path}")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"[INFO] Reading ROM: {rom_path}")
|
||||
rom_data = read_rom(rom_path)
|
||||
print(f"[INFO] ROM size: {len(rom_data)} bytes ({len(rom_data) / 1024 / 1024:.2f} MB)")
|
||||
|
||||
# Dump handler tables
|
||||
type1_handlers = dump_handler_table(rom_data, 0x01, 0x8200, 256, "Type 1 Handler Table")
|
||||
type2_handlers = dump_handler_table(rom_data, 0x01, 0x8470, 64, "Type 2 Handler Table")
|
||||
type3_handlers = dump_handler_table(rom_data, 0x01, 0x85F0, 128, "Type 3 Handler Table")
|
||||
|
||||
# Analyze handler distribution
|
||||
analyze_handler_uniqueness(type1_handlers, "Type 1")
|
||||
analyze_handler_uniqueness(type2_handlers, "Type 2")
|
||||
analyze_handler_uniqueness(type3_handlers, "Type 3")
|
||||
|
||||
# Dump first handler (object 0x00)
|
||||
if type1_handlers:
|
||||
print(f"\n{'='*70}")
|
||||
print(f"INVESTIGATING OBJECT 0x00 HANDLER")
|
||||
print(f"{'='*70}")
|
||||
dump_handler_bytes(rom_data, 0x01, type1_handlers[0], 64)
|
||||
|
||||
# Dump a few more common handlers
|
||||
print(f"\n{'='*70}")
|
||||
print(f"SAMPLE HANDLER DUMPS")
|
||||
print(f"{'='*70}")
|
||||
|
||||
# Object 0x01 (common wall object)
|
||||
if len(type1_handlers) > 1:
|
||||
dump_handler_bytes(rom_data, 0x01, type1_handlers[1], 32)
|
||||
|
||||
# Type 2 first handler
|
||||
if type2_handlers:
|
||||
dump_handler_bytes(rom_data, 0x01, type2_handlers[0], 32)
|
||||
|
||||
print(f"\n{'='*70}")
|
||||
print(f"SUMMARY")
|
||||
print(f"{'='*70}")
|
||||
print(f"Handler tables successfully read from ROM.")
|
||||
print(f"See documentation at docs/internal/alttp-object-handlers.md")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
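The handler-table dump takes a single ROM path, matching the usage string printed by the script (the ROM filename is only illustrative):

```bash
# Dump Type 1/2/3 handler tables and sample handler bytes from an ALTTP ROM.
python3 scripts/dump_object_handlers.py zelda3.sfc
```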
104
scripts/find-unsafe-array-access.sh
Executable file
@@ -0,0 +1,104 @@
|
||||
#!/bin/bash
|
||||
# Static analysis script to find potentially unsafe array accesses
|
||||
# that could cause "index out of bounds" errors in WASM
|
||||
#
|
||||
# Run from yaze root: ./scripts/find-unsafe-array-access.sh
|
||||
|
||||
set -e
|
||||
|
||||
RED='\033[0;31m'
|
||||
YELLOW='\033[1;33m'
|
||||
GREEN='\033[0;32m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
echo "====================================="
|
||||
echo "YAZE Unsafe Array Access Scanner"
|
||||
echo "====================================="
|
||||
echo ""
|
||||
|
||||
# Directory to scan
|
||||
SCAN_DIR="${1:-src}"
|
||||
|
||||
echo "Scanning: $SCAN_DIR"
|
||||
echo ""
|
||||
|
||||
# Pattern categories - each needs manual review
|
||||
declare -a CRITICAL_PATTERNS=(
|
||||
# Direct buffer access without bounds check
|
||||
'tiledata\[[^]]+\]'
|
||||
'gfx_sheets_\[[^]]+\]'
|
||||
'canvas\[[^]]+\]'
|
||||
'pixels\[[^]]+\]'
|
||||
'buffer_\[[^]]+\]'
|
||||
'tiles_\[[^]]+\]'
|
||||
'\.data\(\)\[[^]]+\]'
|
||||
)
|
||||
|
||||
declare -a HIGH_PATTERNS=(
|
||||
# ROM data access
|
||||
'rom\.data\(\)\[[^]]+\]'
|
||||
'rom_data\[[^]]+\]'
|
||||
# Palette access
|
||||
'palette\[[^]]+\]'
|
||||
'->colors\[[^]]+\]'
|
||||
# Map/room access
|
||||
'overworld_maps_\[[^]]+\]'
|
||||
'rooms_\[[^]]+\]'
|
||||
'sprites_\[[^]]+\]'
|
||||
)
|
||||
|
||||
declare -a MEDIUM_PATTERNS=(
|
||||
# Graphics sheet access
|
||||
'gfx_sheet\([^)]+\)'
|
||||
'mutable_gfx_sheet\([^)]+\)'
|
||||
# VRAM/CGRAM/OAM (usually masked, but worth checking)
|
||||
'vram\[[^]]+\]'
|
||||
'cgram\[[^]]+\]'
|
||||
'oam\[[^]]+\]'
|
||||
)
|
||||
|
||||
echo "=== CRITICAL: Direct buffer access patterns ==="
|
||||
echo "(These are most likely to cause WASM crashes)"
|
||||
echo ""
|
||||
|
||||
for pattern in "${CRITICAL_PATTERNS[@]}"; do
|
||||
echo -e "${RED}Pattern: $pattern${NC}"
|
||||
grep -rn --include="*.cc" --include="*.h" -E "$pattern" "$SCAN_DIR" 2>/dev/null | \
|
||||
grep -v "test/" | grep -v "_test.cc" | head -20 || echo " No matches"
|
||||
echo ""
|
||||
done
|
||||
|
||||
echo "=== HIGH: ROM/Map/Sprite data patterns ==="
|
||||
echo "(These access external data that may be corrupt)"
|
||||
echo ""
|
||||
|
||||
for pattern in "${HIGH_PATTERNS[@]}"; do
|
||||
echo -e "${YELLOW}Pattern: $pattern${NC}"
|
||||
grep -rn --include="*.cc" --include="*.h" -E "$pattern" "$SCAN_DIR" 2>/dev/null | \
|
||||
grep -v "test/" | grep -v "_test.cc" | head -20 || echo " No matches"
|
||||
echo ""
|
||||
done
|
||||
|
||||
echo "=== MEDIUM: Graphics accessor patterns ==="
|
||||
echo "(Usually safe but verify bounds checks exist)"
|
||||
echo ""
|
||||
|
||||
for pattern in "${MEDIUM_PATTERNS[@]}"; do
|
||||
echo -e "${GREEN}Pattern: $pattern${NC}"
|
||||
grep -rn --include="*.cc" --include="*.h" -E "$pattern" "$SCAN_DIR" 2>/dev/null | \
|
||||
grep -v "test/" | grep -v "_test.cc" | head -20 || echo " No matches"
|
||||
echo ""
|
||||
done
|
||||
|
||||
echo "====================================="
|
||||
echo "Analysis complete."
|
||||
echo ""
|
||||
echo "GUIDELINES FOR FIXES:"
|
||||
echo "1. Add bounds validation BEFORE array access"
|
||||
echo "2. Use early return for invalid indices"
|
||||
echo "3. Consider using .at() for checked access in debug builds"
|
||||
echo "4. For tile data: validate tile_id < 0x400 (64 rows * 16 cols)"
|
||||
echo "5. For palettes: validate index < palette_size"
|
||||
echo "6. For graphics sheets: validate index < 223"
|
||||
echo "7. For ROM data: validate offset < rom.size()"
|
||||
echo "====================================="
|
||||
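The scanner defaults to src/ but takes a directory argument, so it can be narrowed to the subsystem under review (the subdirectory below is just an example):

```bash
# Scan only the graphics code and keep the report for triage.
./scripts/find-unsafe-array-access.sh src/app/gfx | tee unsafe-array-report.txt
```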
60
scripts/gemini_build.sh
Executable file
@@ -0,0 +1,60 @@
|
||||
#!/bin/bash
|
||||
# scripts/gemini_build.sh
|
||||
# Build script for Gemini AI agent - builds full yaze with all features
|
||||
# Usage: ./scripts/gemini_build.sh [target] [--fresh]
|
||||
#
|
||||
# Examples:
|
||||
# ./scripts/gemini_build.sh # Build yaze (default)
|
||||
# ./scripts/gemini_build.sh yaze_test # Build tests
|
||||
# ./scripts/gemini_build.sh --fresh # Clean reconfigure and build
|
||||
# ./scripts/gemini_build.sh z3ed # Build CLI tool
|
||||
|
||||
set -e
|
||||
|
||||
# Configuration
|
||||
BUILD_DIR="build_gemini"
|
||||
PRESET="mac-gemini"
|
||||
TARGET="${1:-yaze}"
|
||||
FRESH=""
|
||||
|
||||
# Parse arguments
|
||||
for arg in "$@"; do
|
||||
case $arg in
|
||||
--fresh)
|
||||
FRESH="--fresh"
|
||||
shift
|
||||
;;
|
||||
*)
|
||||
TARGET="$arg"
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
echo "=================================================="
|
||||
echo "Gemini Agent Build System"
|
||||
echo "Build Dir: ${BUILD_DIR}"
|
||||
echo "Preset: ${PRESET}"
|
||||
echo "Target: ${TARGET}"
|
||||
echo "=================================================="
|
||||
|
||||
# Ensure we are in the project root
|
||||
if [ ! -f "CMakePresets.json" ]; then
|
||||
echo "Error: CMakePresets.json not found. Must run from project root."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Configure if needed or if --fresh specified
|
||||
if [ ! -d "${BUILD_DIR}" ] || [ -n "${FRESH}" ]; then
|
||||
echo "Configuring ${PRESET}..."
|
||||
cmake --preset "${PRESET}" ${FRESH}
|
||||
fi
|
||||
|
||||
# Build
|
||||
echo "Building target: ${TARGET}..."
|
||||
cmake --build "${BUILD_DIR}" --target "${TARGET}" -j$(sysctl -n hw.ncpu)
|
||||
|
||||
echo ""
|
||||
echo "Build complete: ${BUILD_DIR}/${TARGET}"
|
||||
echo ""
|
||||
echo "Run tests: ctest --test-dir ${BUILD_DIR} -L stable -j4"
|
||||
echo "Run app: ./${BUILD_DIR}/Debug/yaze"
|
||||
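Combining the positional target and --fresh flag parsed above, a typical cycle is:

```bash
# Reconfigure from scratch, build the CLI tool, then run the stable test label.
./scripts/gemini_build.sh z3ed --fresh
ctest --test-dir build_gemini -L stable -j4
```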
151
scripts/lint.sh
Executable file
@@ -0,0 +1,151 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# Unified linting script for yaze
|
||||
# Wraps clang-format and clang-tidy with project-specific configuration
|
||||
#
|
||||
# Usage:
|
||||
# scripts/lint.sh [check|fix] [files...]
|
||||
#
|
||||
# check (default) - Check for issues without modifying files
|
||||
# fix - Automatically fix formatting and some tidy issues
|
||||
# files... - Optional list of files to process (defaults to all source files)
|
||||
|
||||
set -e
|
||||
|
||||
# Colors
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
BLUE='\033[0;34m'
|
||||
NC='\033[0m'
|
||||
|
||||
# Configuration
|
||||
PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
||||
cd "$PROJECT_ROOT"
|
||||
|
||||
MODE="check"
|
||||
if [[ "$1" == "fix" ]]; then
|
||||
MODE="fix"
|
||||
shift
|
||||
elif [[ "$1" == "check" ]]; then
|
||||
shift
|
||||
fi
|
||||
|
||||
# Files to process
|
||||
FILES="$@"
|
||||
if [[ -z "$FILES" ]]; then
|
||||
# Find all source files, excluding third-party libraries
|
||||
# Using git ls-files if available to respect .gitignore
|
||||
if git rev-parse --is-inside-work-tree >/dev/null 2>&1; then
|
||||
FILES=$(git ls-files 'src/*.cc' 'src/*.h' 'test/*.cc' 'test/*.h' | grep -v "src/lib/")
|
||||
else
|
||||
FILES=$(find src test -name "*.cc" -o -name "*.h" | grep -v "src/lib/")
|
||||
fi
|
||||
fi
|
||||
|
||||
# Find tools
|
||||
find_tool() {
|
||||
local names=("$@")
|
||||
for name in "${names[@]}"; do
|
||||
if command -v "$name" >/dev/null 2>&1; then
|
||||
echo "$name"
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
|
||||
# Check Homebrew LLVM paths on macOS
|
||||
if [[ "$(uname)" == "Darwin" ]]; then
|
||||
local brew_prefix
|
||||
if command -v brew >/dev/null 2>&1; then
|
||||
brew_prefix=$(brew --prefix llvm 2>/dev/null)
|
||||
if [[ -n "$brew_prefix" ]]; then
|
||||
for name in "${names[@]}"; do
|
||||
if [[ -x "$brew_prefix/bin/$name" ]]; then
|
||||
echo "$brew_prefix/bin/$name"
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
return 1
|
||||
}
|
||||
|
||||
CLANG_FORMAT=$(find_tool clang-format-18 clang-format-17 clang-format)
|
||||
CLANG_TIDY=$(find_tool clang-tidy-18 clang-tidy-17 clang-tidy)
|
||||
|
||||
if [[ -z "$CLANG_FORMAT" ]]; then
|
||||
echo -e "${RED}Error: clang-format not found.${NC}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ -z "$CLANG_TIDY" ]]; then
|
||||
echo -e "${YELLOW}Warning: clang-tidy not found. Skipping tidy checks.${NC}"
|
||||
fi
|
||||
|
||||
echo -e "${BLUE}Using clang-format: $CLANG_FORMAT${NC}"
|
||||
[[ -n "$CLANG_TIDY" ]] && echo -e "${BLUE}Using clang-tidy: $CLANG_TIDY${NC}"
|
||||
|
||||
# Run clang-format
|
||||
echo -e "\n${BLUE}=== Running clang-format ===${NC}"
|
||||
if [[ "$MODE" == "fix" ]]; then
|
||||
echo "$FILES" | xargs "$CLANG_FORMAT" -i --style=file
|
||||
echo -e "${GREEN}Formatting applied.${NC}"
|
||||
else
|
||||
# clang-format --dry-run --Werror exits non-zero when any file would be reformatted.
|
||||
|
||||
if echo "$FILES" | xargs "$CLANG_FORMAT" --dry-run --Werror --style=file 2>&1; then
|
||||
echo -e "${GREEN}Format check passed.${NC}"
|
||||
else
|
||||
echo -e "${RED}Format check failed.${NC}"
|
||||
echo -e "Run '${YELLOW}scripts/lint.sh fix${NC}' to apply formatting."
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
# Run clang-tidy
|
||||
if [[ -n "$CLANG_TIDY" ]]; then
|
||||
echo -e "\n${BLUE}=== Running clang-tidy ===${NC}"
|
||||
|
||||
# Build compile_commands.json if missing (needed for clang-tidy)
|
||||
if [[ ! -f "build/compile_commands.json" && ! -f "compile_commands.json" ]]; then
|
||||
echo -e "${YELLOW}compile_commands.json not found. Attempting to generate...${NC}"
|
||||
if command -v cmake >/dev/null; then
|
||||
cmake -S . -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=ON >/dev/null
|
||||
else
|
||||
echo -e "${RED}cmake not found. Cannot generate compile_commands.json.${NC}"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Find compile_commands.json
|
||||
BUILD_PATH=""
|
||||
if [[ -f "build/compile_commands.json" ]]; then
|
||||
BUILD_PATH="build"
|
||||
elif [[ -f "compile_commands.json" ]]; then
|
||||
BUILD_PATH="."
|
||||
fi
|
||||
|
||||
if [[ -n "$BUILD_PATH" ]]; then
|
||||
TIDY_ARGS="-p $BUILD_PATH --quiet"
|
||||
[[ "$MODE" == "fix" ]] && TIDY_ARGS="$TIDY_ARGS --fix"
|
||||
|
||||
# Use parallel if available
|
||||
if command -v parallel >/dev/null 2>&1; then
|
||||
# parallel processing would require a different invocation
|
||||
# For now, just run simple xargs
|
||||
echo "$FILES" | xargs "$CLANG_TIDY" $TIDY_ARGS
|
||||
else
|
||||
echo "$FILES" | xargs "$CLANG_TIDY" $TIDY_ARGS
|
||||
fi
|
||||
|
||||
echo -e "${GREEN}Clang-tidy finished.${NC}"
|
||||
else
|
||||
echo -e "${YELLOW}Skipping clang-tidy (compile_commands.json not found).${NC}"
|
||||
fi
|
||||
fi
|
||||
|
||||
echo -e "\n${GREEN}Linting complete.${NC}"
|
||||
|
||||
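In day-to-day use the lint wrapper is usually pointed at the files that changed rather than the whole tree; the file path below is illustrative:

```bash
# Check one file, and apply fixes only if the check reports problems.
scripts/lint.sh check src/app/editor/overworld_editor.cc || scripts/lint.sh fix src/app/editor/overworld_editor.cc
```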
@@ -179,11 +179,11 @@ main() {
|
||||
print_header "Step 3/3: Code Formatting"
|
||||
|
||||
# Check if format-check target exists
|
||||
if cmake --build "$BUILD_DIR" --target help 2>/dev/null | grep -q "format-check"; then
|
||||
if cmake --build "$BUILD_DIR" --target help 2>/dev/null | grep -q "yaze-format-check"; then
|
||||
print_info "Checking code formatting..."
|
||||
if ! cmake --build "$BUILD_DIR" --target format-check 2>&1 | tail -10; then
|
||||
if ! cmake --build "$BUILD_DIR" --target yaze-format-check 2>&1 | tail -10; then
|
||||
print_error "Code formatting check failed!"
|
||||
print_info "Fix with: cmake --build $BUILD_DIR --target format"
|
||||
print_info "Fix with: scripts/lint.sh fix"
|
||||
exit 3
|
||||
fi
|
||||
print_success "Code formatting passed"
|
||||
|
||||
@@ -23,13 +23,24 @@ if [ ! -f .clang-format ]; then
|
||||
fi
|
||||
|
||||
echo "✅ Code formatting check..."
|
||||
# Check formatting without modifying files
|
||||
FORMATTING_ISSUES=$(find src test -name "*.cc" -o -name "*.h" | head -50 | xargs clang-format --dry-run --Werror --style=Google 2>&1 || true)
|
||||
if [ -n "$FORMATTING_ISSUES" ]; then
|
||||
echo "⚠️ Formatting issues found. Run 'make format' to fix them."
|
||||
echo "$FORMATTING_ISSUES" | head -20
|
||||
# Check formatting using unified lint script if available, otherwise fallback
|
||||
if [ -f "${SCRIPT_DIR}/lint.sh" ]; then
|
||||
if ! "${SCRIPT_DIR}/lint.sh" check >/dev/null 2>&1; then
|
||||
echo "⚠️ Formatting/Linting issues found. Run 'scripts/lint.sh fix' to fix formatting."
|
||||
# We don't exit 1 here to avoid breaking existing workflows immediately,
|
||||
# but we warn.
|
||||
else
|
||||
echo "✅ All files are properly formatted and linted"
|
||||
fi
|
||||
else
|
||||
echo "✅ All files are properly formatted"
|
||||
# Fallback to manual check
|
||||
FORMATTING_ISSUES=$(find src test -name "*.cc" -o -name "*.h" | head -50 | xargs clang-format --dry-run --Werror --style=file 2>&1 || true)
|
||||
if [ -n "$FORMATTING_ISSUES" ]; then
|
||||
echo "⚠️ Formatting issues found. Run 'scripts/lint.sh fix' to fix them."
|
||||
echo "$FORMATTING_ISSUES" | head -20
|
||||
else
|
||||
echo "✅ All files are properly formatted"
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "🔍 Running static analysis..."
|
||||
|
||||
189
scripts/serve-wasm.sh
Executable file
@@ -0,0 +1,189 @@
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
# Local dev server for the WASM build (supports release/debug builds)
|
||||
|
||||
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
|
||||
PROJECT_ROOT="$SCRIPT_DIR/.."
|
||||
|
||||
PORT="8080"
|
||||
MODE="release"
|
||||
DIST_DIR=""
|
||||
FORCE="false"
|
||||
|
||||
usage() {
|
||||
cat <<'EOF'
|
||||
Usage: scripts/serve-wasm.sh [--debug|--release] [--port N] [--dist PATH] [--force]
|
||||
scripts/serve-wasm.sh [port]
|
||||
|
||||
Options:
|
||||
--debug, -d Serve debug build (build-wasm/dist, configured via wasm-debug)
|
||||
--release, -r Serve release build (default)
|
||||
--port, -p N Port to bind (default: 8080). Bare number also works.
|
||||
--dist, --dir Custom dist directory to serve (overrides mode)
|
||||
--force, -f Kill any process already bound to the chosen port
|
||||
--help, -h Show this help text
|
||||
EOF
|
||||
}
|
||||
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case "$1" in
|
||||
-p|--port)
|
||||
[[ $# -lt 2 ]] && { echo "Error: --port requires a value" >&2; exit 1; }
|
||||
PORT="$2"
|
||||
shift
|
||||
;;
|
||||
-d|--debug)
|
||||
MODE="debug"
|
||||
;;
|
||||
-r|--release)
|
||||
MODE="release"
|
||||
;;
|
||||
--dist|--dir)
|
||||
[[ $# -lt 2 ]] && { echo "Error: --dist requires a value" >&2; exit 1; }
|
||||
DIST_DIR="$2"
|
||||
shift
|
||||
;;
|
||||
-h|--help)
|
||||
usage
|
||||
exit 0
|
||||
;;
|
||||
-f|--force)
|
||||
FORCE="true"
|
||||
;;
|
||||
*)
|
||||
if [[ "$1" =~ ^[0-9]+$ ]]; then
|
||||
PORT="$1"
|
||||
else
|
||||
echo "Unknown argument: $1" >&2
|
||||
usage
|
||||
exit 1
|
||||
fi
|
||||
;;
|
||||
esac
|
||||
shift
|
||||
done
|
||||
|
||||
if ! command -v python3 >/dev/null 2>&1; then
|
||||
echo "Error: python3 not found. Install Python 3 to use the dev server." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
find_dist_dir() {
|
||||
for path in "$@"; do
|
||||
if [[ -d "$path" ]]; then
|
||||
echo "$path"
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
return 1
|
||||
}
|
||||
|
||||
DIST_CANDIDATES=(
|
||||
"$PROJECT_ROOT/build-wasm/dist"
|
||||
"$PROJECT_ROOT/build_wasm/dist"
|
||||
)
|
||||
|
||||
# Resolve dist directory
|
||||
if [[ -z "$DIST_DIR" ]]; then
|
||||
if ! DIST_DIR="$(find_dist_dir "${DIST_CANDIDATES[@]}")"; then
|
||||
echo "Error: WASM dist directory not found." >&2
|
||||
echo "Tried:" >&2
|
||||
printf ' - %s\n' "${DIST_CANDIDATES[@]}" >&2
|
||||
echo "Run ./scripts/build-wasm.sh ${MODE} first." >&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ ! -d "$DIST_DIR" ]]; then
|
||||
echo "Error: dist directory not found at $DIST_DIR" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ ! -f "$DIST_DIR/index.html" ]]; then
|
||||
echo "Error: index.html not found in $DIST_DIR" >&2
|
||||
echo "Please run scripts/build-wasm.sh ${MODE}" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Free the port if requested
|
||||
EXISTING_PIDS="$(lsof -ti tcp:"$PORT" 2>/dev/null || true)"
|
||||
if [[ -n "$EXISTING_PIDS" ]]; then
|
||||
if [[ "$FORCE" == "true" ]]; then
|
||||
echo "Port $PORT is in use by PID(s): $EXISTING_PIDS — terminating..."
|
||||
kill $EXISTING_PIDS 2>/dev/null || true
|
||||
sleep 0.5
|
||||
if lsof -ti tcp:"$PORT" >/dev/null 2>&1; then
|
||||
echo "Error: failed to free port $PORT (process still listening)." >&2
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
echo "Error: port $PORT is already in use (PID(s): $EXISTING_PIDS)." >&2
|
||||
echo "Use --force to terminate the existing process, or choose another port with --port N." >&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
# Verify port availability to avoid noisy Python stack traces
|
||||
if ! python3 - "$PORT" <<'PY' >/dev/null 2>&1
|
||||
import socket, sys
|
||||
port = int(sys.argv[1])
|
||||
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
try:
    s.bind(("", port))
finally:
    s.close()
PY
then
    echo "Error: port $PORT is already in use. Pick another port with --port N." >&2
    exit 1
fi

echo "=== Serving YAZE WASM Build (${MODE}) ==="
echo "Directory: $DIST_DIR"
echo "Port: $PORT"
echo ""
echo "Open http://127.0.0.1:$PORT in your browser"
echo "Press Ctrl+C to stop the server"
echo ""

# Use custom server with COOP/COEP headers for SharedArrayBuffer support
python3 - "$PORT" "$DIST_DIR" <<'PYSERVER'
import sys
import os
from http.server import HTTPServer, SimpleHTTPRequestHandler

PORT = int(sys.argv[1])
DIRECTORY = sys.argv[2]

class COOPCOEPHandler(SimpleHTTPRequestHandler):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, directory=DIRECTORY, **kwargs)

    def end_headers(self):
        # Required headers for SharedArrayBuffer support
        self.send_header('Cross-Origin-Opener-Policy', 'same-origin')
        self.send_header('Cross-Origin-Embedder-Policy', 'require-corp')
        self.send_header('Cross-Origin-Resource-Policy', 'same-origin')
        # Prevent caching during development
        self.send_header('Cache-Control', 'no-store')
        super().end_headers()

    def log_message(self, format, *args):
        # Color-coded logging
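        # For the default log_request(), args is (request line, status code, size)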
        status = args[1] if len(args) > 1 else ""
        if status.startswith('2'):
            color = '\033[32m'  # Green
        elif status.startswith('3'):
            color = '\033[33m'  # Yellow
        elif status.startswith('4') or status.startswith('5'):
            color = '\033[31m'  # Red
        else:
            color = ''
        reset = '\033[0m' if color else ''
        print(f"{color}{self.address_string()} - {format % args}{reset}")

print("Server running with COOP/COEP headers enabled")
print("SharedArrayBuffer support: ENABLED")
httpd = HTTPServer(('', PORT), COOPCOEPHandler)
httpd.serve_forever()
PYSERVER
434 scripts/test_runner.py Normal file
@@ -0,0 +1,434 @@
#!/usr/bin/env python3
"""
Advanced test runner with automatic sharding and parallel execution for yaze.
Optimizes test execution time by distributing tests across multiple processes.
"""

import multiprocessing
import json
import subprocess
import time
import argparse
import sys
import os
from pathlib import Path
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass, asdict
from concurrent.futures import ProcessPoolExecutor, as_completed
import hashlib

@dataclass
class TestResult:
    """Container for test execution results."""
    name: str
    status: str  # passed, failed, skipped
    duration: float
    output: str
    shard_id: int

@dataclass
class ShardResult:
    """Results from a single test shard."""
    shard_id: int
    return_code: int
    tests_run: int
    tests_passed: int
    tests_failed: int
    duration: float
    test_results: List[TestResult]

class TestRunner:
    """Advanced test runner with sharding and parallel execution."""

    def __init__(self, test_binary: str, num_shards: int = None,
                 cache_dir: str = None, verbose: bool = False):
        self.test_binary = Path(test_binary).resolve()
        if not self.test_binary.exists():
            raise FileNotFoundError(f"Test binary not found: {test_binary}")

        self.num_shards = num_shards or min(multiprocessing.cpu_count(), 8)
        self.cache_dir = Path(cache_dir or Path.home() / ".yaze_test_cache")
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.verbose = verbose
        self.test_times = self.load_test_times()

    def load_test_times(self) -> Dict[str, float]:
        """Load historical test execution times from cache."""
        cache_file = self.cache_dir / "test_times.json"
        if cache_file.exists():
            try:
                return json.loads(cache_file.read_text())
            except (json.JSONDecodeError, IOError):
                return {}
        return {}

    def save_test_times(self, test_times: Dict[str, float]):
        """Save test execution times to cache."""
        cache_file = self.cache_dir / "test_times.json"

        # Merge with existing times
        existing = self.load_test_times()
        for test, duration in test_times.items():
            # Use exponential moving average for smoothing
            if test in existing:
                existing[test] = 0.7 * existing[test] + 0.3 * duration
            else:
                existing[test] = duration

        cache_file.write_text(json.dumps(existing, indent=2))

    def discover_tests(self, filter_pattern: str = None) -> List[str]:
        """Discover all tests in the binary."""
        cmd = [str(self.test_binary), "--gtest_list_tests"]
        if filter_pattern:
            cmd.append(f"--gtest_filter={filter_pattern}")

        try:
            result = subprocess.run(cmd, capture_output=True, text=True,
                                    timeout=30, check=False)
        except subprocess.TimeoutExpired:
            print("Warning: Test discovery timed out", file=sys.stderr)
            return []

        if result.returncode != 0:
            print(f"Warning: Test discovery failed: {result.stderr}", file=sys.stderr)
            return []

        # Parse gtest output
        tests = []
        current_suite = ""
        for line in result.stdout.splitlines():
            line = line.rstrip()
            if not line or line.startswith("Running main()"):
                continue

            if line and not line.startswith(" "):
                # Test suite name
                current_suite = line.rstrip(".")
            elif line.strip():
                # Test case name
                test_name = line.strip()
                # Remove comments (e.g., " TestName # Comment")
                if "#" in test_name:
                    test_name = test_name.split("#")[0].strip()
                if test_name:
                    tests.append(f"{current_suite}.{test_name}")

        if self.verbose:
            print(f"Discovered {len(tests)} tests")

        return tests

    def create_balanced_shards(self, tests: List[str]) -> List[List[str]]:
        """Create balanced shards based on historical execution times."""
        if not tests:
            return []

        # Sort tests by execution time (longest first)
        # Use historical times or default estimate
        default_time = 0.1  # 100ms default per test
        sorted_tests = sorted(
            tests,
            key=lambda t: self.test_times.get(t, default_time),
            reverse=True
        )

        # Initialize shards
        num_shards = min(self.num_shards, len(tests))
        shards = [[] for _ in range(num_shards)]
        shard_times = [0.0] * num_shards

        # Distribute tests using greedy bin packing
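        # (longest-first greedy assignment, i.e. LPT scheduling, keeps the slowest shard close to optimal)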
        for test in sorted_tests:
            # Find shard with minimum total time
            min_shard_idx = shard_times.index(min(shard_times))
            shards[min_shard_idx].append(test)
            shard_times[min_shard_idx] += self.test_times.get(test, default_time)

        # Remove empty shards
        shards = [s for s in shards if s]

        if self.verbose:
            print(f"Created {len(shards)} shards:")
            for i, shard in enumerate(shards):
                print(f"  Shard {i}: {len(shard)} tests, "
                      f"estimated {shard_times[i]:.2f}s")

        return shards

    def run_shard(self, shard_id: int, tests: List[str],
                  output_dir: Path = None) -> ShardResult:
        """Run a single shard of tests."""
        if not tests:
            return ShardResult(shard_id, 0, 0, 0, 0, 0.0, [])

        filter_str = ":".join(tests)
        output_dir = output_dir or self.cache_dir / "results"
        output_dir.mkdir(parents=True, exist_ok=True)

        # Prepare command
        json_output = output_dir / f"shard_{shard_id}_results.json"
        xml_output = output_dir / f"shard_{shard_id}_results.xml"

        cmd = [
            str(self.test_binary),
            f"--gtest_filter={filter_str}",
            f"--gtest_output=json:{json_output}",
            "--gtest_brief=1"
        ]

        # Run tests
        start_time = time.time()
        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=600  # 10 minute timeout per shard
            )
            duration = time.time() - start_time
        except subprocess.TimeoutExpired:
            print(f"Shard {shard_id} timed out!", file=sys.stderr)
            return ShardResult(shard_id, -1, len(tests), 0, len(tests),
                               600.0, [])

        # Parse results
        test_results = []
        tests_run = 0
        tests_passed = 0
        tests_failed = 0

        if json_output.exists():
            try:
                with open(json_output) as f:
                    data = json.load(f)

                for suite in data.get("testsuites", []):
                    for testcase in suite.get("testsuite", []):
                        test_name = f"{suite['name']}.{testcase['name']}"
                        status = "passed" if testcase.get("result") == "COMPLETED" else "failed"
                        test_duration = float(testcase.get("time", "0").rstrip("s"))

                        test_results.append(TestResult(
                            name=test_name,
                            status=status,
                            duration=test_duration,
                            output=testcase.get("output", ""),
                            shard_id=shard_id
                        ))

                        tests_run += 1
                        if status == "passed":
                            tests_passed += 1
                        else:
                            tests_failed += 1

            except (json.JSONDecodeError, KeyError, IOError) as e:
                print(f"Warning: Failed to parse results for shard {shard_id}: {e}",
                      file=sys.stderr)

        return ShardResult(
            shard_id=shard_id,
            return_code=result.returncode,
            tests_run=tests_run,
            tests_passed=tests_passed,
            tests_failed=tests_failed,
            duration=duration,
            test_results=test_results
        )

    def run_parallel(self, filter_pattern: str = None,
                     output_dir: str = None) -> Tuple[int, Dict]:
        """Run tests in parallel shards."""
        # Discover tests
        tests = self.discover_tests(filter_pattern)
        if not tests:
            print("No tests found to run")
            return 0, {}

        print(f"Running {len(tests)} tests in up to {self.num_shards} shards...")

        # Create shards
        shards = self.create_balanced_shards(tests)
        output_path = Path(output_dir) if output_dir else self.cache_dir / "results"

        # Run shards in parallel
        all_results = []
        start_time = time.time()

        with ProcessPoolExecutor(max_workers=len(shards)) as executor:
            # Submit all shards
            futures = {
                executor.submit(self.run_shard, i, shard, output_path): i
                for i, shard in enumerate(shards)
            }

            # Collect results
            for future in as_completed(futures):
                shard_id = futures[future]
                try:
                    result = future.result()
                    all_results.append(result)

                    if self.verbose:
                        print(f"Shard {shard_id} completed: "
                              f"{result.tests_passed}/{result.tests_run} passed "
                              f"in {result.duration:.2f}s")
                except Exception as e:
                    print(f"Shard {shard_id} failed with exception: {e}",
                          file=sys.stderr)

        total_duration = time.time() - start_time

        # Aggregate results
        total_tests = sum(r.tests_run for r in all_results)
        total_passed = sum(r.tests_passed for r in all_results)
        total_failed = sum(r.tests_failed for r in all_results)
        max_return_code = max((r.return_code for r in all_results), default=0)

        # Update test times cache
        new_times = {}
        for result in all_results:
            for test_result in result.test_results:
                new_times[test_result.name] = test_result.duration
        self.save_test_times(new_times)

        # Generate summary
        summary = {
            "total_tests": total_tests,
            "passed": total_passed,
            "failed": total_failed,
            "duration": total_duration,
            "num_shards": len(shards),
            "parallel_efficiency": (sum(r.duration for r in all_results) /
                                    (total_duration * len(shards)) * 100)
                                   if len(shards) > 0 else 0,
            "shards": [asdict(r) for r in all_results]
        }

        # Save summary
        summary_file = output_path / "summary.json"
        summary_file.write_text(json.dumps(summary, indent=2))

        # Print results
        pass_rate = (total_passed / total_tests * 100) if total_tests else 0.0
        print(f"\n{'=' * 60}")
        print("Test Execution Summary")
        print(f"{'=' * 60}")
        print(f"Total Tests: {total_tests}")
        print(f"Passed: {total_passed} ({pass_rate:.1f}%)")
        print(f"Failed: {total_failed}")
        print(f"Duration: {total_duration:.2f}s")
        print(f"Shards Used: {len(shards)}")
        print(f"Efficiency: {summary['parallel_efficiency']:.1f}%")

        if total_failed > 0:
            print("\nFailed Tests:")
            for result in all_results:
                for test_result in result.test_results:
                    if test_result.status == "failed":
                        print(f"  - {test_result.name}")

        return max_return_code, summary

    def run_with_retry(self, filter_pattern: str = None,
                       max_retries: int = 2) -> int:
        """Run tests with automatic retry for flaky tests."""
        failed_tests = set()
        attempt = 0

        while attempt <= max_retries:
            if attempt > 0:
                # Only retry failed tests
                if not failed_tests:
                    break
                filter_pattern = ":".join(failed_tests)
                print(f"\nRetry attempt {attempt} for {len(failed_tests)} failed tests")

            return_code, summary = self.run_parallel(filter_pattern)

            if return_code == 0:
                if attempt > 0:
                    print(f"All tests passed after {attempt} retries")
                return 0

            # Collect failed tests for retry
            failed_tests.clear()
            for shard in summary.get("shards", []):
                for test_result in shard.get("test_results", []):
                    if test_result.get("status") == "failed":
                        failed_tests.add(test_result.get("name"))

            attempt += 1

        print(f"Tests still failing after {max_retries} retries")
        return return_code


def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(
        description="Advanced test runner with parallel execution"
    )
    parser.add_argument(
        "test_binary",
        help="Path to the test binary"
    )
    parser.add_argument(
        "--shards",
        type=int,
        help="Number of parallel shards (default: CPU count)"
    )
    parser.add_argument(
        "--filter",
        help="Test filter pattern (gtest format)"
    )
    parser.add_argument(
        "--output-dir",
        help="Directory for test results"
    )
    parser.add_argument(
        "--cache-dir",
        help="Directory for test cache (default: ~/.yaze_test_cache)"
    )
    parser.add_argument(
        "--retry",
        type=int,
        default=0,
        help="Number of retries for failed tests"
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose output"
    )

    args = parser.parse_args()

    try:
        runner = TestRunner(
            test_binary=args.test_binary,
            num_shards=args.shards,
            cache_dir=args.cache_dir,
            verbose=args.verbose
        )

        if args.retry > 0:
            return_code = runner.run_with_retry(
                filter_pattern=args.filter,
                max_retries=args.retry
            )
        else:
            return_code, _ = runner.run_parallel(
                filter_pattern=args.filter,
                output_dir=args.output_dir
            )

        sys.exit(return_code)

    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -198,7 +198,21 @@ function Test-VcpkgCache {
}

function Test-CMakeCache {
    $buildDirs = @("build", "build-windows", "build-test", "build-ai", "out/build")
    $buildDirs = @(
        "build",
        "build-wasm",
        "build-windows",
        "build-test",
        "build_ai",
        "build_agent",
        "build_ci",
        "build_fast",
        "build_test",
        "build-wasm-debug",
        "build_wasm_ai",
        "build_wasm",
        "out/build"
    )
    $cacheIssues = $false

    foreach ($dir in $buildDirs) {
@@ -222,7 +236,21 @@ function Clean-CMakeCache {

    Write-Status "Cleaning CMake cache and build directories..." "Step"

    $buildDirs = @("build", "build_test", "build-ai", "build_rooms", "out")
    $buildDirs = @(
        "build",
        "build-wasm",
        "build-test",
        "build_rooms",
        "build_ai",
        "build_agent",
        "build_ci",
        "build_fast",
        "build_test",
        "build-wasm-debug",
        "build_wasm_ai",
        "build_wasm",
        "out"
    )
    $cleaned = $false

    foreach ($dir in $buildDirs) {

@@ -114,7 +114,23 @@ function test_git_submodules() {
}

function test_cmake_cache() {
    local build_dirs=("build" "build_test" "build-test" "build-grpc-test" "build-rooms" "build-windows" "build_ai" "build_ai_claude" "build_agent" "build_ci")
    local build_dirs=(
        "build"
        "build-wasm"
        "build-test"
        "build-grpc-test"
        "build-rooms"
        "build-windows"
        "build_ai"
        "build_ai_claude"
        "build_agent"
        "build_ci"
        "build_fast"
        "build_test"
        "build-wasm-debug"
        "build_wasm_ai"
        "build_wasm"
    )
    local cache_issues=0

    for dir in "${build_dirs[@]}"; do
@@ -139,8 +155,8 @@ function test_agent_folder_structure() {
    local agent_files=(
        "src/app/editor/agent/agent_editor.h"
        "src/app/editor/agent/agent_editor.cc"
        "src/app/editor/agent/agent_chat_widget.h"
        "src/app/editor/agent/agent_chat_widget.cc"
        "src/app/editor/agent/agent_chat.h"
        "src/app/editor/agent/agent_chat.cc"
        "src/app/editor/agent/agent_chat_history_codec.h"
        "src/app/editor/agent/agent_chat_history_codec.cc"
        "src/app/editor/agent/agent_collaboration_coordinator.h"
@@ -148,9 +164,9 @@ function test_agent_folder_structure() {
        "src/app/editor/agent/network_collaboration_coordinator.h"
        "src/app/editor/agent/network_collaboration_coordinator.cc"
    )

    local old_system_files=(
        "src/app/editor/agent/agent_chat_widget.h"
        "src/app/gui/app/agent_chat_widget.h"
        "src/app/editor/agent/agent_collaboration_coordinator.h"
    )

@@ -191,7 +207,23 @@ function test_agent_folder_structure() {
function clean_cmake_cache() {
    write_status "Cleaning CMake cache and build directories..." "Step"

    local build_dirs=("build" "build_test" "build-test" "build-grpc-test" "build-rooms" "build-windows" "build_ai" "build_ai_claude" "build_agent" "build_ci")
    local build_dirs=(
        "build"
        "build-wasm"
        "build-test"
        "build-grpc-test"
        "build-rooms"
        "build-windows"
        "build_ai"
        "build_ai_claude"
        "build_agent"
        "build_ci"
        "build_fast"
        "build_test"
        "build-wasm-debug"
        "build_wasm_ai"
        "build_wasm"
    )
    local cleaned=0

    for dir in "${build_dirs[@]}"; do

@@ -63,7 +63,7 @@ OPTIONS:

EXAMPLES:
    $0                          # Scan default build directory
    $0 --build-dir build_test   # Scan specific build directory
    $0 --build-dir build        # Scan specific build directory
    $0 --verbose                # Show detailed output
    $0 --show-all               # Show all symbols (verbose)