yaze/scripts/extract-symbols.sh

#!/bin/bash
# Symbol Extraction Tool - Extract symbols from compiled object files
# Creates a JSON database of all symbols and their defining object files
#
# Usage: ./scripts/extract-symbols.sh [BUILD_DIR] [OUTPUT_FILE]
#   BUILD_DIR:   Path to CMake build directory (default: current directory, ".")
#   OUTPUT_FILE: Path to output JSON file (default: BUILD_DIR/symbol_database.json)

set -euo pipefail

# Colors for output (ANSI escape sequences; expanded by `echo -e` / printf '%b')
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Configuration
# BUILD_DIR falls back to the current directory; the output database and the
# temporary symbol list both live inside BUILD_DIR unless overridden.
BUILD_DIR="${1:-.}"
OUTPUT_FILE="${2:-${BUILD_DIR}/symbol_database.json}"
TEMP_SYMBOLS="${BUILD_DIR}/.temp_symbols.txt"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "${SCRIPT_DIR}")"

# Platform detection
# Exactly one of the IS_* flags becomes "true" for a recognised kernel name;
# all stay "false" on anything else. The flags are later executed as the
# `true`/`false` commands in `if ${IS_WINDOWS}; then` style tests.
UNAME_S=$(uname -s)
IS_MACOS=false
IS_LINUX=false
IS_WINDOWS=false

case "${UNAME_S}" in
    Darwin*) IS_MACOS=true ;;
    Linux*) IS_LINUX=true ;;
    MINGW*|MSYS*|CYGWIN*) IS_WINDOWS=true ;;
esac

# Validation
# Paths are printed with '%s' so backslashes in user-supplied paths are not
# interpreted as escape sequences; the error goes to stderr.
if [[ ! -d "${BUILD_DIR}" ]]; then
    printf '%bError: Build directory not found: %s%b\n' "${RED}" "${BUILD_DIR}" "${NC}" >&2
    exit 1
fi

printf '%b=== Symbol Extraction Tool ===%b\n' "${BLUE}" "${NC}"
printf 'Build directory: %s\n' "${BUILD_DIR}"
printf 'Output file: %s\n' "${OUTPUT_FILE}"
echo ""

# Function to extract symbols using nm (Unix/macOS)
#
# Arguments: $1 - path to an object file
# Outputs:   one "symbol|object-basename|type" line on stdout for every line
#            of `nm -P` output whose type field is not "U" (undefined)
# Returns:   0 on success, 1 if nm fails on the file
extract_symbols_unix() {
    local obj_file="$1"
    local obj_name="${obj_file##*/}"
    local nm_output

    # Capture nm's output first so its exit status is checked directly rather
    # than depending on the caller having `set -o pipefail` enabled.
    nm_output=$(nm -P "${obj_file}" 2>/dev/null) || return 1

    # `nm -P` prints "name type [value [size]]". One awk pass handles every
    # symbol (no subprocess per line) and compares the type field itself, so a
    # stray "U" elsewhere on the line cannot hide a defined symbol. The object
    # name travels through the environment because `awk -v` would interpret
    # backslash escapes in it.
    printf '%s\n' "${nm_output}" \
        | OBJ_NAME="${obj_name}" awk '
            NF >= 1 && $2 != "U" { print $1 "|" ENVIRON["OBJ_NAME"] "|" $2 }
        '
}

# Function to extract symbols using dumpbin (Windows)
#
# Arguments: $1 - path to an object file
# Outputs:   one "symbol|object-basename|?" line on stdout for every line of
#            `dumpbin /symbols` output that starts with whitespace followed by
#            hex digits; the symbol is the last whitespace-separated field.
#            The type is always "?" because it is not parsed from dumpbin.
# Returns:   0 on success (even when no line matches), 1 if dumpbin fails
extract_symbols_windows() {
    local obj_file="$1"
    local obj_name="${obj_file##*/}"
    local dump_output

    # Success must yield 0 and failure 1, matching extract_symbols_unix; the
    # status is taken from dumpbin itself, so an object whose output has no
    # matching lines is not reported as a failure.
    # NOTE(review): under MSYS/Git Bash the "/symbols" argument may be subject
    # to automatic path conversion - confirm dumpbin receives it unchanged.
    dump_output=$(dumpbin /symbols "${obj_file}" 2>/dev/null) || return 1

    # Single awk pass instead of grep plus one awk subprocess per line.
    # A trailing carriage return is stripped (dumpbin presumably emits CRLF
    # line endings) so it does not end up inside the symbol name.
    printf '%s\n' "${dump_output}" \
        | OBJ_NAME="${obj_name}" awk '
            /^[ \t]+[0-9A-F]+/ {
                sub(/\r$/, "")
                if (NF > 0) print $NF "|" ENVIRON["OBJ_NAME"] "|?"
            }
        '
}

# Build the list of object files found under BUILD_DIR.
#
# Globals:   BUILD_DIR (read), IS_WINDOWS (read)
# Outputs:   prints the path of the list file, "${BUILD_DIR}/.object_files.tmp",
#            which holds one object-file path per line
collect_object_files() {
    local list_path="${BUILD_DIR}/.object_files.tmp"

    # Unix/macOS builds produce .o files; on Windows .obj is accepted as well.
    local -a name_filter=(-name "*.o")
    if ${IS_WINDOWS}; then
        name_filter=(\( -name "*.obj" -o -name "*.o" \))
    fi

    # Start from an empty list, then append whatever find reports. Errors from
    # find are deliberately ignored so a partial scan still yields a list.
    : > "${list_path}"
    find "${BUILD_DIR}" -type f "${name_filter[@]}" 2>/dev/null >> "${list_path}" || true

    echo "${list_path}"
}

# Extract symbols from all object files
echo -e "${BLUE}Scanning for object files...${NC}"
OBJ_LIST=$(collect_object_files)
# Arithmetic expansion normalises the count: some wc implementations pad the
# number with leading spaces, which would otherwise show up in the messages.
OBJ_COUNT=$(( $(wc -l < "${OBJ_LIST}") ))

if [[ ${OBJ_COUNT} -eq 0 ]]; then
    echo -e "${YELLOW}Warning: No object files found in ${BUILD_DIR}${NC}"
    echo "Make sure to build the project first."
    exit 1
fi

echo -e "Found ${GREEN}${OBJ_COUNT}${NC} object files"
echo ""
echo -e "${BLUE}Extracting symbols (this may take a moment)...${NC}"

# Process object files and extract symbols
: > "${TEMP_SYMBOLS}"
PROCESSED=0
FAILED=0

# The extractor depends only on the platform, so pick it once up front.
if ${IS_WINDOWS}; then
    EXTRACT_FN=extract_symbols_windows
else
    EXTRACT_FN=extract_symbols_unix
fi

# Counters are updated with VAR=$((VAR + 1)) on purpose: `((VAR++))` evaluates
# to the old value, so the first increment (0) yields exit status 1 and would
# abort the whole script under `set -e`.
while IFS= read -r obj_file; do
    [[ -z "${obj_file}" ]] && continue

    if [[ ! -f "${obj_file}" ]]; then
        echo -e "${YELLOW}Skipping (not found): ${obj_file}${NC}"
        FAILED=$((FAILED + 1))
        continue
    fi

    # Append this object's symbols to the temp file; extractor diagnostics are
    # suppressed and a non-zero status only bumps the failure counter.
    if "${EXTRACT_FN}" "${obj_file}" >> "${TEMP_SYMBOLS}" 2>/dev/null; then
        PROCESSED=$((PROCESSED + 1))
    else
        FAILED=$((FAILED + 1))
    fi

    # Progress indicator (every 50 successfully processed objects)
    if (( PROCESSED > 0 && PROCESSED % 50 == 0 )); then
        echo -ne "\r  Processed: ${PROCESSED}/${OBJ_COUNT} objects"
    fi
done < "${OBJ_LIST}"

echo -ne "\r  Processed: ${GREEN}${PROCESSED}${NC}/${OBJ_COUNT} objects (${FAILED} failed)     \n"
echo ""

# Generate JSON output
echo -e "${BLUE}Generating symbol database...${NC}"

# Use Python to generate proper JSON (more portable than jq).
#
# Every shell value is handed to Python through the environment and the
# heredoc delimiter is quoted, so paths containing quotes, backslashes, `$` or
# braces are never spliced into Python source code. No placeholder file is
# written beforehand: if Python fails, `set -e` stops the script without
# leaving a valid-looking but empty database behind.
SYMDB_TEMP_SYMBOLS="${TEMP_SYMBOLS}" \
SYMDB_OUTPUT_FILE="${OUTPUT_FILE}" \
SYMDB_BUILD_DIR="${BUILD_DIR}" \
SYMDB_PLATFORM="${UNAME_S}" \
SYMDB_PROCESSED="${PROCESSED}" \
python3 << 'PYTHON_EOF'
import json
import os
import sys
from collections import defaultdict
from datetime import datetime, timezone

temp_symbols = os.environ["SYMDB_TEMP_SYMBOLS"]
output_file = os.environ["SYMDB_OUTPUT_FILE"]

# Read extracted symbols: one "symbol|object_file|type" record per line.
symbol_dict = defaultdict(list)
total_symbols = 0

try:
    with open(temp_symbols, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            parts = line.split("|")
            if len(parts) >= 2:
                symbol = parts[0]
                obj_file = parts[1]
                sym_type = parts[2] if len(parts) > 2 else "?"

                symbol_dict[symbol].append({
                    "object_file": obj_file,
                    "type": sym_type
                })
                total_symbols += 1
except Exception as e:
    print(f"Error reading symbols: {e}", file=sys.stderr)
    sys.exit(1)

# Identify conflicts (symbols defined in multiple object files)
conflicts = [
    {"symbol": symbol, "count": len(definitions), "definitions": definitions}
    for symbol, definitions in symbol_dict.items()
    if len(definitions) > 1
]

# Sort conflicts by count (most duplicated first)
conflicts.sort(key=lambda x: x["count"], reverse=True)

# Naive UTC timestamp with a "Z" suffix; datetime.utcnow() is deprecated, so
# take an aware UTC time and drop tzinfo to keep the same output format.
timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

# Build output JSON
output = {
    "metadata": {
        "platform": os.environ["SYMDB_PLATFORM"],
        "build_dir": os.environ["SYMDB_BUILD_DIR"],
        "timestamp": timestamp,
        "object_files_scanned": int(os.environ["SYMDB_PROCESSED"]),
        "total_symbols": total_symbols,
        "total_conflicts": len(conflicts)
    },
    "conflicts": conflicts,
    # Only conflicted symbols are included, to keep the file small.
    "symbols": {
        symbol: definitions
        for symbol, definitions in symbol_dict.items()
        if len(definitions) > 1
    }
}

# Write JSON
try:
    with open(output_file, "w") as f:
        json.dump(output, f, indent=2)
except Exception as e:
    print(f"Error writing JSON: {e}", file=sys.stderr)
    sys.exit(1)

print(f"Symbol database written to: {output_file}")
print(f"Total symbols: {total_symbols}")
print(f"Conflicts found: {len(conflicts)}")
PYTHON_EOF

# Cleanup
rm -f -- "${TEMP_SYMBOLS}" "${OBJ_LIST}"

# Report results
# Exit status: 0 when the database exists and lists no conflicts, 1 when
# conflicts were found, the database is missing, or its count is unreadable.
if [[ ! -f "${OUTPUT_FILE}" ]]; then
    echo -e "${RED}Failed to generate symbol database${NC}" >&2
    exit 1
fi

echo -e "${GREEN}Success!${NC}"

# The path is passed as an argument (sys.argv[1]) rather than spliced into the
# Python source, so quotes or backslashes in it cannot break the snippet.
CONFLICT_COUNT=$(python3 -c 'import json, sys
with open(sys.argv[1]) as f:
    print(json.load(f)["metadata"].get("total_conflicts", 0))' "${OUTPUT_FILE}" 2>/dev/null || echo "?")

# A non-numeric value means the count could not be read; treat that as an
# error instead of letting the numeric test fail and report "no conflicts".
if [[ ! "${CONFLICT_COUNT}" =~ ^[0-9]+$ ]]; then
    echo -e "${RED}Could not read conflict count from ${OUTPUT_FILE}${NC}" >&2
    exit 1
fi

if (( CONFLICT_COUNT > 0 )); then
    echo -e "${YELLOW}Found ${RED}${CONFLICT_COUNT}${YELLOW} symbol conflicts${NC}"
    exit 1  # Exit with error if conflicts found
else
    echo -e "${GREEN}No symbol conflicts detected!${NC}"
    exit 0
fi