core: add training model and validators

scawful
2025-12-30 13:37:28 -05:00
parent 3de9c302ce
commit a6fd2591dd
9 changed files with 1442 additions and 1 deletion

View File

@@ -3,12 +3,16 @@
from __future__ import annotations
import argparse
import asyncio
import json
from pathlib import Path
from typing import Iterable
from .registry import build_dataset_registry, index_datasets, write_dataset_registry
from .resource_index import ResourceIndexer
from .paths import resolve_datasets_root, resolve_index_root
from .training import TrainingSample
from .validators import default_validators
def _datasets_index_command(args: argparse.Namespace) -> int:
@@ -43,6 +47,45 @@ def _resources_index_command(args: argparse.Namespace) -> int:
return 0
async def _run_validators(sample: TrainingSample, validators) -> list[tuple[str, object]]:
results: list[tuple[str, object]] = []
for validator in validators:
if validator.can_validate(sample):
result = await validator.validate(sample)
results.append((validator.name, result))
return results
def _validators_list_command(args: argparse.Namespace) -> int:
validators = default_validators()
for validator in validators:
print(f"{validator.name}\t{validator.domain}")
return 0
def _validators_run_command(args: argparse.Namespace) -> int:
sample_path = Path(args.sample).expanduser().resolve()
payload = json.loads(sample_path.read_text(encoding="utf-8"))
sample = TrainingSample.from_dict(payload)
validators = default_validators()
if args.name:
validators = [v for v in validators if v.name in args.name]
results = asyncio.run(_run_validators(sample, validators))
if not results:
print("(no validators)")
return 1
overall_ok = True
for name, result in results:
status = "ok" if result.valid else "fail"
if not result.valid:
overall_ok = False
print(f"{name}\t{status}\t{result.score:.2f}")
return 0 if overall_ok else 1
def build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(prog="afs_scawful")
subparsers = parser.add_subparsers(dest="command")
@@ -77,6 +120,21 @@ def build_parser() -> argparse.ArgumentParser:
resources_index.add_argument("--output", help="Output index path.")
resources_index.set_defaults(func=_resources_index_command)
validators_parser = subparsers.add_parser("validators", help="Validation tools.")
validators_sub = validators_parser.add_subparsers(dest="validators_command")
validators_list = validators_sub.add_parser("list", help="List validators.")
validators_list.set_defaults(func=_validators_list_command)
validators_run = validators_sub.add_parser("run", help="Validate a sample JSON.")
validators_run.add_argument("sample", help="Path to sample JSON.")
validators_run.add_argument(
"--name",
action="append",
help="Validator name to run (repeatable).",
)
validators_run.set_defaults(func=_validators_run_command)
return parser
@@ -92,6 +150,9 @@ def main(argv: Iterable[str] | None = None) -> int:
if args.command == "resources" and not getattr(args, "resources_command", None):
parser.print_help()
return 1
if args.command == "validators" and not getattr(args, "validators_command", None):
parser.print_help()
return 1
return args.func(args)
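Usage sketch for the new validators subcommands (not part of the diff). This assumes the CLI module above is importable as afs_scawful.cli, which the diff does not name; adjust the import if main() lives elsewhere.

# Minimal sketch: drive the "validators" subcommands programmatically.
import json
from pathlib import Path

from afs_scawful.cli import main  # assumed module path; not shown in this diff

sample_path = Path("sample.json")
sample_path.write_text(json.dumps({
    "instruction": "Explain this routine.",
    "input": "",
    "output": "LDA #$01\nSTA $7E0000\n",
    "domain": "asm",
}), encoding="utf-8")

# Equivalent to: afs_scawful validators list
main(["validators", "list"])

# Equivalent to: afs_scawful validators run sample.json --name AsmValidator
exit_code = main(["validators", "run", str(sample_path), "--name", "AsmValidator"])
print("ok" if exit_code == 0 else "failed")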

View File

@@ -0,0 +1,73 @@
"""Training sample data models for AFS Scawful."""
from __future__ import annotations
import json
import uuid
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any
@dataclass
class TrainingSample:
instruction: str
input: str
output: str
domain: str
source: str = ""
sample_id: str = ""
timestamp: str = ""
metadata: dict[str, Any] = field(default_factory=dict)
kg_entities: list[str] = field(default_factory=list)
kg_validated: bool = False
def __post_init__(self) -> None:
if not self.sample_id:
self.sample_id = str(uuid.uuid4())
if not self.timestamp:
self.timestamp = datetime.now().isoformat()
def to_dict(self) -> dict[str, Any]:
return {
"instruction": self.instruction,
"input": self.input,
"output": self.output,
"domain": self.domain,
"source": self.source,
"sample_id": self.sample_id,
"timestamp": self.timestamp,
"metadata": self.metadata,
"kg_entities": self.kg_entities,
"kg_validated": self.kg_validated,
}
def to_jsonl_entry(self) -> str:
payload = {
"instruction": self.instruction,
"output": self.output,
}
if self.input:
payload["input"] = self.input
payload["_metadata"] = {
"sample_id": self.sample_id,
"domain": self.domain,
"source": self.source,
"timestamp": self.timestamp,
}
return json.dumps(payload)
@classmethod
def from_dict(cls, data: dict[str, Any]) -> "TrainingSample":
return cls(
instruction=data.get("instruction", ""),
input=data.get("input", ""),
output=data.get("output", ""),
domain=data.get("domain", ""),
source=data.get("source", ""),
sample_id=data.get("sample_id", ""),
timestamp=data.get("timestamp", ""),
metadata=data.get("metadata", {}) or {},
kg_entities=data.get("kg_entities", []) or [],
kg_validated=bool(data.get("kg_validated", False)),
)
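A minimal round-trip sketch for TrainingSample (not part of the diff), using only the fields and methods defined above:

from afs_scawful.training import TrainingSample

sample = TrainingSample(
    instruction="Write a routine that clears the accumulator.",
    input="",
    output="LDA #$00\nRTS\n",
    domain="asm",
    source="manual",
)

# sample_id and timestamp are filled in by __post_init__ when left empty.
as_dict = sample.to_dict()
restored = TrainingSample.from_dict(as_dict)
assert restored.sample_id == sample.sample_id

# to_jsonl_entry() emits the instruction/output pair plus a _metadata block,
# and omits "input" when it is empty.
print(sample.to_jsonl_entry())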

View File

@@ -0,0 +1,27 @@
"""Validator registry for AFS Scawful."""
from .asar_validator import AsarValidator
from .asm_validator import AsmValidator
from .base import CompositeValidator, ValidationResult, Validator
from .cpp_validator import CppValidator
from .kg_validator import KGValidator
__all__ = [
"AsarValidator",
"AsmValidator",
"CppValidator",
"CompositeValidator",
"KGValidator",
"ValidationResult",
"Validator",
"default_validators",
]
def default_validators() -> list[Validator]:
return [
AsmValidator(),
AsarValidator(),
CppValidator(),
KGValidator(),
]
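Sketch of the registry in use (not part of the diff): default_validators() returns one instance of each validator, and can_validate() decides which of them apply to a given sample's domain.

from afs_scawful.training import TrainingSample
from afs_scawful.validators import default_validators

sample = TrainingSample(instruction="", input="", output="RTS\n", domain="asm")

# Note: constructing AsarValidator may log a warning if the asar binary or
# dummy ROM is absent; that does not affect can_validate().
applicable = [v for v in default_validators() if v.can_validate(sample)]
print([v.name for v in applicable])
# Expected: AsmValidator and AsarValidator (domain "asm") plus KGValidator,
# which accepts every domain; CppValidator is filtered out.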

View File

@@ -0,0 +1,127 @@
"""Asar Validator for verifying 65816 assembly code.
Uses the actual 'asar' binary to assemble code snippets against a dummy ROM.
This provides 100% accurate syntax and label validation.
"""
from __future__ import annotations
import asyncio
import logging
import os
import shutil
import tempfile
from pathlib import Path
from typing import Optional
from ..training import TrainingSample
from .base import ValidationResult, Validator
logger = logging.getLogger(__name__)
def _resolve_env_path(env_var: str) -> Path | None:
value = os.environ.get(env_var)
if not value:
return None
return Path(value).expanduser().resolve()
def _default_asar_path() -> Path:
env = _resolve_env_path("AFS_ASAR_PATH")
if env:
return env
found = shutil.which("asar")
if found:
return Path(found)
return Path("asar")
def _default_rom_path() -> Path:
env = _resolve_env_path("AFS_ASAR_ROM")
if env:
return env
candidate = Path.home() / "src" / "training" / "roms" / "dummy.sfc"
if candidate.exists():
return candidate
return Path.home() / ".context" / "training" / "dummy.sfc"
class AsarValidator(Validator):
"""Validates assembly code by running it through Asar."""
def __init__(self, asar_path: Path | None = None, rom_path: Path | None = None):
super().__init__("AsarValidator", "asm")
self.asar_path = asar_path or _default_asar_path()
self.rom_path = rom_path or _default_rom_path()
if not self.asar_path.exists():
logger.warning("Asar binary not found at %s", self.asar_path)
if not self.rom_path.exists():
logger.warning("Dummy ROM not found at %s", self.rom_path)
async def validate(self, sample: TrainingSample) -> ValidationResult:
"""Run asar on the sample output code."""
if not self.asar_path.exists() or not self.rom_path.exists():
return ValidationResult(
valid=True,
score=0.5,
warnings=["Asar validator skipped: binary or ROM missing"],
)
# Extract code (simple heuristic: look for code blocks or use full output)
code = self._extract_code(sample.output)
if not code:
return ValidationResult(valid=False, score=0.0, errors=["No code found"])
with tempfile.TemporaryDirectory() as tmpdir:
tmp_path = Path(tmpdir)
source_file = tmp_path / "test.asm"
rom_file = tmp_path / "test.sfc"
# Copy dummy ROM to temp (to avoid modifying the original)
shutil.copy(self.rom_path, rom_file)
# Wrap code in a safe patch structure
# We assume the code is a snippet, so we hook it into free space
wrapped_code = (
"lorom\n"
"org $008000\n" # Hook into start of ROM
f"{code}\n"
)
source_file.write_text(wrapped_code)
# Run asar
proc = await asyncio.create_subprocess_exec(
str(self.asar_path),
str(source_file),
str(rom_file),
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
stdout, stderr = await proc.communicate()
if proc.returncode == 0:
return ValidationResult(valid=True, score=1.0)
else:
error_msg = stderr.decode() + stdout.decode()
# Clean up error message
lines = [l for l in error_msg.split('\n') if "error:" in l.lower()]
return ValidationResult(
valid=False,
score=0.0,
errors=lines[:3] or ["Asar failed to assemble"],
)
def _extract_code(self, text: str) -> str:
"""Extract ASM code from markdown block or raw text."""
if "```asm" in text:
parts = text.split("```asm")
if len(parts) > 1:
return parts[1].split("```")[0].strip()
if "```" in text:
parts = text.split("```")
if len(parts) > 1:
return parts[1].strip()
return text # Assume raw code if no blocks
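Sketch of running AsarValidator directly (not part of the diff). The paths below are illustrative; without a real asar binary and dummy ROM the validator returns a skipped result (valid=True, score=0.5) rather than failing.

import asyncio
from pathlib import Path

from afs_scawful.training import TrainingSample
from afs_scawful.validators import AsarValidator

validator = AsarValidator(
    asar_path=Path("/usr/local/bin/asar"),        # or set AFS_ASAR_PATH
    rom_path=Path.home() / "roms" / "dummy.sfc",  # or set AFS_ASAR_ROM
)

sample = TrainingSample(
    instruction="",
    input="",
    output="```asm\nLDA #$01\nSTA $7E0000\nRTS\n```",
    domain="asm",
)

result = asyncio.run(validator.validate(sample))
print(result.valid, result.score, result.errors or result.warnings)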

View File

@@ -0,0 +1,342 @@
"""ASM Validator for 65816 assembly training samples.
Validates:
- Instruction mnemonics
- Addressing modes
- Register usage
- Memory addressing patterns
- SNES-specific constructs
"""
from __future__ import annotations
import re
from dataclasses import dataclass
from typing import Optional
from ..training import TrainingSample
from .base import ValidationResult, Validator
@dataclass
class InstructionInfo:
"""Information about a 65816 instruction."""
mnemonic: str
addressing_modes: list[str]
description: str
class AsmValidator(Validator):
"""Validator for 65816 assembly code in training samples."""
# Valid 65816 instruction mnemonics
VALID_MNEMONICS = {
# Load/Store
"LDA", "LDX", "LDY", "STA", "STX", "STY", "STZ",
# Transfer
"TAX", "TAY", "TXA", "TYA", "TXS", "TSX", "TCD", "TDC", "TCS", "TSC", "TXY", "TYX",
# Stack
"PHA", "PHP", "PHX", "PHY", "PHB", "PHD", "PHK",
"PLA", "PLP", "PLX", "PLY", "PLB", "PLD",
"PEA", "PEI", "PER",
# Arithmetic
"ADC", "SBC", "INC", "INX", "INY", "DEC", "DEX", "DEY",
# Comparison
"CMP", "CPX", "CPY",
# Logical
"AND", "ORA", "EOR", "BIT",
# Shift/Rotate
"ASL", "LSR", "ROL", "ROR",
# Branch
"BCC", "BCS", "BEQ", "BMI", "BNE", "BPL", "BVC", "BVS", "BRA", "BRL",
# Jump
"JMP", "JML", "JSR", "JSL", "RTS", "RTL", "RTI",
# Flags
"CLC", "CLD", "CLI", "CLV", "SEC", "SED", "SEI",
"REP", "SEP",
# Processor
"NOP", "WDM", "STP", "WAI", "XBA", "XCE",
# Block Move
"MVP", "MVN",
# Misc
"BRK", "COP", "WDM",
# 65C816 specific
"TRB", "TSB",
}
# Valid addressing mode patterns
ADDRESSING_PATTERNS = {
"immediate_8": r"#\$[0-9A-Fa-f]{1,2}", # #$XX
"immediate_16": r"#\$[0-9A-Fa-f]{3,4}", # #$XXXX
"immediate_symbol": r"#[A-Za-z_]\w*", # #SYMBOL
"direct_page": r"\$[0-9A-Fa-f]{1,2}(?!\w)", # $XX (not followed by more hex)
"absolute": r"\$[0-9A-Fa-f]{4}(?!\w)", # $XXXX
"long": r"\$[0-9A-Fa-f]{6}", # $XXXXXX
"indexed_x": r",\s*[Xx]", # ,X
"indexed_y": r",\s*[Yy]", # ,Y
"indirect": r"\([^)]+\)", # (...)
"stack_relative": r"\$[0-9A-Fa-f]{1,2},\s*[Ss]", # $XX,S
"accumulator": r"[Aa](?:\s|$)", # A
"label": r"[A-Za-z_]\w*", # Labels
}
# SNES-specific registers and addresses
SNES_REGISTERS = {
# PPU Registers
"INIDISP", "OBSEL", "OAMADDL", "OAMADDH", "OAMDATA",
"BGMODE", "MOSAIC", "BG1SC", "BG2SC", "BG3SC", "BG4SC",
"BG12NBA", "BG34NBA", "BG1HOFS", "BG1VOFS", "BG2HOFS", "BG2VOFS",
"BG3HOFS", "BG3VOFS", "BG4HOFS", "BG4VOFS",
"VMAIN", "VMADDL", "VMADDH", "VMDATAL", "VMDATAH",
"M7SEL", "M7A", "M7B", "M7C", "M7D", "M7X", "M7Y",
"CGADD", "CGDATA", "W12SEL", "W34SEL", "WOBJSEL",
"WH0", "WH1", "WH2", "WH3", "WBGLOG", "WOBJLOG",
"TM", "TS", "TMW", "TSW", "CGWSEL", "CGADSUB",
"COLDATA", "SETINI",
# APU Registers
"APUIO0", "APUIO1", "APUIO2", "APUIO3",
# DMA Registers
"MDMAEN", "HDMAEN", "MEMSEL",
# CPU Registers
"NMITIMEN", "WRIO", "WRMPYA", "WRMPYB", "WRDIVL", "WRDIVH",
"WRDIVB", "HTIMEL", "HTIMEH", "VTIMEL", "VTIMEH",
"RDNMI", "TIMEUP", "HVBJOY", "RDIO", "RDDIVL", "RDDIVH",
"RDMPYL", "RDMPYH", "JOY1L", "JOY1H", "JOY2L", "JOY2H",
"JOY3L", "JOY3H", "JOY4L", "JOY4H",
}
# Common ALTTP-specific labels
ALTTP_LABELS = {
"Module", "Submodule", "Link", "Player", "Sprite",
"WRAM", "SRAM", "VRAM", "OAM", "CGRAM",
}
VALID_DOMAINS = {"asm", "hack_curated"}
def __init__(self, strict: bool = False):
"""Initialize ASM validator.
Args:
strict: If True, apply stricter validation rules
"""
super().__init__("AsmValidator", "asm")
self.strict = strict
def can_validate(self, sample: TrainingSample) -> bool:
"""Allow ASM validation for curated hack samples too."""
return sample.domain in self.VALID_DOMAINS or sample.domain.startswith("asm")
async def validate(self, sample: TrainingSample) -> ValidationResult:
"""Validate 65816 assembly in the sample output."""
errors: list[str] = []
warnings: list[str] = []
details: dict = {
"instructions_found": 0,
"valid_instructions": 0,
"invalid_instructions": [],
"snes_registers_used": [],
"addressing_modes": [],
}
# Extract code from output
code = sample.output
# Parse instructions
instructions = self._extract_instructions(code)
details["instructions_found"] = len(instructions)
if len(instructions) == 0:
warnings.append("No assembly instructions found in output")
return ValidationResult(
valid=True,
score=0.5,
warnings=warnings,
details=details,
)
# Validate each instruction
for line_num, instr in instructions:
result = self._validate_instruction(instr)
if result.valid:
details["valid_instructions"] += 1
if result.addressing_mode:
details["addressing_modes"].append(result.addressing_mode)
else:
details["invalid_instructions"].append({
"line": line_num,
"instruction": instr,
"error": result.error,
})
if self.strict:
errors.append(f"Line {line_num}: {result.error}")
else:
warnings.append(f"Line {line_num}: {result.error}")
# Check for SNES registers
for reg in self.SNES_REGISTERS:
if reg in code:
details["snes_registers_used"].append(reg)
# Calculate score
if details["instructions_found"] > 0:
score = details["valid_instructions"] / details["instructions_found"]
else:
score = 0.5
# Boost score if SNES-specific content found
if details["snes_registers_used"]:
score = min(1.0, score + 0.1)
return ValidationResult(
valid=len(errors) == 0,
score=score,
errors=errors,
warnings=warnings,
details=details,
)
def _extract_instructions(self, code: str) -> list[tuple[int, str]]:
"""Extract assembly instructions from code.
Returns:
List of (line_number, instruction) tuples
"""
instructions = []
lines = code.split("\n")
for i, line in enumerate(lines, 1):
# Remove comments
if ";" in line:
line = line[:line.index(";")]
# Remove labels (lines ending with :)
if ":" in line:
# Check if it's a label definition
parts = line.split(":")
if len(parts) > 1:
line = parts[-1]
# Remove address prefixes like #_008000:
line = re.sub(r"#_[0-9A-Fa-f]+:\s*", "", line)
line = line.strip()
if not line:
continue
# Check if line starts with a valid mnemonic
parts = line.split()
if parts:
mnemonic = parts[0].upper()
if mnemonic in self.VALID_MNEMONICS:
instructions.append((i, line))
elif re.match(r"[A-Za-z]{2,4}", mnemonic):
# Might be an instruction-like thing
instructions.append((i, line))
return instructions
def _validate_instruction(self, instruction: str) -> "_InstructionValidation":
"""Validate a single instruction."""
parts = instruction.split(None, 1)
if not parts:
return _InstructionValidation(False, "Empty instruction")
mnemonic = parts[0].upper()
operand = parts[1] if len(parts) > 1 else ""
# Check mnemonic
if mnemonic not in self.VALID_MNEMONICS:
# Check if it's close to a valid mnemonic (typo detection)
close_matches = [m for m in self.VALID_MNEMONICS
if self._levenshtein_distance(mnemonic, m) <= 1]
if close_matches:
return _InstructionValidation(
False,
f"Unknown mnemonic '{mnemonic}' (did you mean {close_matches[0]}?)"
)
return _InstructionValidation(False, f"Unknown mnemonic '{mnemonic}'")
# Validate operand if present
addressing_mode = None
if operand:
addressing_mode = self._detect_addressing_mode(operand)
return _InstructionValidation(True, None, addressing_mode)
def _detect_addressing_mode(self, operand: str) -> Optional[str]:
"""Detect the addressing mode from the operand."""
operand = operand.strip()
# Check patterns in order of specificity
if re.match(r"#", operand):
if re.search(r"#\$[0-9A-Fa-f]{3,4}", operand):
return "immediate_16"
elif re.search(r"#\$[0-9A-Fa-f]{1,2}", operand):
return "immediate_8"
else:
return "immediate_symbol"
if re.search(r",\s*[Ss]", operand):
return "stack_relative"
if re.match(r"\([^)]+\)", operand):
if ",X" in operand.upper():
return "indexed_indirect_x"
elif ",Y" in operand.upper():
return "indirect_indexed_y"
else:
return "indirect"
if re.search(r",\s*[Xx]", operand):
return "indexed_x"
if re.search(r",\s*[Yy]", operand):
return "indexed_y"
if re.match(r"\$[0-9A-Fa-f]{6}", operand):
return "long"
if re.match(r"\$[0-9A-Fa-f]{4}", operand):
return "absolute"
if re.match(r"\$[0-9A-Fa-f]{1,2}(?!\w)", operand):
return "direct_page"
if re.match(r"[Aa]$", operand):
return "accumulator"
if re.match(r"[A-Za-z_]\w*", operand):
return "label"
return None
def _levenshtein_distance(self, s1: str, s2: str) -> int:
"""Calculate Levenshtein distance between two strings."""
if len(s1) < len(s2):
return self._levenshtein_distance(s2, s1)
if len(s2) == 0:
return len(s1)
previous_row = range(len(s2) + 1)
for i, c1 in enumerate(s1):
current_row = [i + 1]
for j, c2 in enumerate(s2):
insertions = previous_row[j + 1] + 1
deletions = current_row[j] + 1
substitutions = previous_row[j] + (c1 != c2)
current_row.append(min(insertions, deletions, substitutions))
previous_row = current_row
return previous_row[-1]
@dataclass
class _InstructionValidation:
"""Internal result of validating a single instruction."""
valid: bool
error: Optional[str] = None
addressing_mode: Optional[str] = None
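A quick look at AsmValidator's heuristics (not part of the diff): the first sample is clean 65816 code, the second contains a typo ("LDQ") that the mnemonic check flags with a did-you-mean suggestion in the warnings when strict is off.

import asyncio

from afs_scawful.training import TrainingSample
from afs_scawful.validators import AsmValidator

validator = AsmValidator(strict=False)

clean = TrainingSample(instruction="", input="",
                       output="REP #$30\nLDA #$1234\nSTA $7E0010\nRTS\n",
                       domain="asm")
typo = TrainingSample(instruction="", input="",
                      output="LDQ #$01\nSTA $7E0000\n",
                      domain="asm")

for label, sample in (("clean", clean), ("typo", typo)):
    result = asyncio.run(validator.validate(sample))
    print(label, result.valid, round(result.score, 2), result.warnings[:1])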

View File

@@ -0,0 +1,90 @@
"""Base validator interfaces for AFS Scawful."""
from __future__ import annotations
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Any
from ..training import TrainingSample
@dataclass
class ValidationResult:
valid: bool
score: float
errors: list[str] = field(default_factory=list)
warnings: list[str] = field(default_factory=list)
details: dict[str, Any] = field(default_factory=dict)
def to_dict(self) -> dict[str, Any]:
return {
"valid": self.valid,
"score": self.score,
"errors": list(self.errors),
"warnings": list(self.warnings),
"details": dict(self.details),
}
class Validator(ABC):
def __init__(self, name: str, domain: str) -> None:
self.name = name
self.domain = domain
@abstractmethod
async def validate(self, sample: TrainingSample) -> ValidationResult:
raise NotImplementedError
def can_validate(self, sample: TrainingSample) -> bool:
return sample.domain == self.domain
async def validate_batch(self, samples: list[TrainingSample]) -> list[ValidationResult]:
results: list[ValidationResult] = []
for sample in samples:
if self.can_validate(sample):
results.append(await self.validate(sample))
else:
results.append(
ValidationResult(
valid=True,
score=1.0,
warnings=[f"{self.name} skipped: domain mismatch"],
)
)
return results
class CompositeValidator(Validator):
def __init__(self, validators: list[Validator]) -> None:
super().__init__("CompositeValidator", "all")
self.validators = validators
def can_validate(self, sample: TrainingSample) -> bool:
return any(validator.can_validate(sample) for validator in self.validators)
async def validate(self, sample: TrainingSample) -> ValidationResult:
applicable = [v for v in self.validators if v.can_validate(sample)]
if not applicable:
return ValidationResult(valid=True, score=1.0, warnings=["No applicable validators"])
errors: list[str] = []
warnings: list[str] = []
details: dict[str, Any] = {}
scores: list[float] = []
for validator in applicable:
result = await validator.validate(sample)
errors.extend(result.errors)
warnings.extend(result.warnings)
details[validator.name] = result.to_dict()
scores.append(result.score)
score = sum(scores) / len(scores) if scores else 1.0
return ValidationResult(
valid=len(errors) == 0,
score=score,
errors=errors,
warnings=warnings,
details=details,
)
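Sketch of composing validators (not part of the diff): CompositeValidator averages the scores of every validator whose can_validate() accepts the sample and merges their errors and warnings into one ValidationResult.

import asyncio

from afs_scawful.training import TrainingSample
from afs_scawful.validators import AsmValidator, CppValidator, CompositeValidator

composite = CompositeValidator([AsmValidator(), CppValidator()])

sample = TrainingSample(
    instruction="",
    input="",
    output="LDA #$01\nRTS\n",
    domain="asm",
)

result = asyncio.run(composite.validate(sample))
# Only AsmValidator applies here, so its score carries through unchanged;
# per-validator breakdowns are kept under result.details[<validator name>].
print(result.valid, result.score, list(result.details))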

View File

@@ -0,0 +1,340 @@
"""C++ Validator for training samples.
Validates:
- Basic syntax checks (brackets, braces, semicolons)
- Keyword usage
- Common patterns
- Optional: Compile check with clang (if available)
"""
from __future__ import annotations
import asyncio
import re
import shutil
import tempfile
from pathlib import Path
from typing import Optional
from ..training import TrainingSample
from .base import ValidationResult, Validator
class CppValidator(Validator):
"""Validator for C++ code in training samples."""
# C++ keywords
KEYWORDS = {
# Storage class
"auto", "register", "static", "extern", "mutable", "thread_local",
# Type specifiers
"void", "bool", "char", "short", "int", "long", "float", "double",
"signed", "unsigned", "wchar_t", "char8_t", "char16_t", "char32_t",
# Type qualifiers
"const", "volatile", "constexpr", "consteval", "constinit",
# Control flow
"if", "else", "switch", "case", "default", "while", "do", "for",
"break", "continue", "return", "goto",
# Declarations
"class", "struct", "union", "enum", "typedef", "using", "namespace",
"template", "typename", "concept", "requires",
# Access specifiers
"public", "private", "protected",
# Other keywords
"virtual", "override", "final", "explicit", "inline", "friend",
"operator", "sizeof", "alignof", "decltype", "typeid",
"new", "delete", "this", "nullptr", "true", "false",
"try", "catch", "throw", "noexcept",
"static_assert", "static_cast", "dynamic_cast", "const_cast", "reinterpret_cast",
"co_await", "co_return", "co_yield",
# Modules (C++20)
"module", "import", "export",
}
# Common C++ standard library types
STD_TYPES = {
"string", "vector", "map", "unordered_map", "set", "unordered_set",
"list", "deque", "array", "pair", "tuple", "optional", "variant",
"shared_ptr", "unique_ptr", "weak_ptr", "function", "any",
"thread", "mutex", "lock_guard", "unique_lock", "condition_variable",
"future", "promise", "async", "atomic",
"ifstream", "ofstream", "fstream", "stringstream", "ostringstream",
"iostream", "cin", "cout", "cerr", "endl",
"size_t", "ptrdiff_t", "nullptr_t", "byte",
"int8_t", "int16_t", "int32_t", "int64_t",
"uint8_t", "uint16_t", "uint32_t", "uint64_t",
}
def __init__(
self,
check_compile: bool = False,
compiler: str = "clang++",
strict: bool = False,
):
"""Initialize C++ validator.
Args:
check_compile: If True, attempt to compile the code
compiler: Compiler to use for compile checks
strict: If True, apply stricter validation
"""
super().__init__("CppValidator", "cpp")
self.check_compile = check_compile
self.compiler = compiler
self.strict = strict
# Check if compiler is available
self._compiler_available = shutil.which(compiler) is not None
async def validate(self, sample: TrainingSample) -> ValidationResult:
"""Validate C++ code in the sample output."""
errors: list[str] = []
warnings: list[str] = []
details: dict = {
"syntax_issues": [],
"keywords_found": [],
"std_types_found": [],
"bracket_balance": True,
"compile_checked": False,
"compile_result": None,
}
code = sample.output
# Basic syntax checks
syntax_result = self._check_syntax(code)
details["syntax_issues"] = syntax_result["issues"]
details["bracket_balance"] = syntax_result["balanced"]
if not syntax_result["balanced"]:
errors.append("Unbalanced brackets/braces/parentheses")
for issue in syntax_result["issues"]:
if self.strict:
errors.append(issue)
else:
warnings.append(issue)
# Check for keywords and types
details["keywords_found"] = self._find_keywords(code)
details["std_types_found"] = self._find_std_types(code)
# Compile check if enabled and available
if self.check_compile and self._compiler_available:
compile_result = await self._check_compile(code)
details["compile_checked"] = True
details["compile_result"] = compile_result
if not compile_result["success"]:
if self.strict:
errors.append(f"Compile error: {compile_result['error'][:200]}")
else:
warnings.append(f"Compile warning: {compile_result['error'][:100]}")
# Calculate score
score = 1.0
# Deduct for syntax issues
score -= len(details["syntax_issues"]) * 0.1
score = max(0.0, score)
# Deduct for bracket imbalance
if not details["bracket_balance"]:
score -= 0.3
# Bonus for using C++ features
if details["keywords_found"]:
score = min(1.0, score + 0.05)
if details["std_types_found"]:
score = min(1.0, score + 0.05)
# Deduct for compile failure
if details["compile_checked"] and not details["compile_result"]["success"]:
score -= 0.2
score = max(0.0, min(1.0, score))
return ValidationResult(
valid=len(errors) == 0,
score=score,
errors=errors,
warnings=warnings,
details=details,
)
def _check_syntax(self, code: str) -> dict:
"""Check basic C++ syntax."""
issues = []
balanced = True
# Check bracket balance
stack = []
pairs = {"(": ")", "[": "]", "{": "}"}
in_string = False
in_char = False
in_comment = False
in_block_comment = False
i = 0
while i < len(code):
c = code[i]
# Handle comments
if not in_string and not in_char:
if i < len(code) - 1:
two_char = code[i:i+2]
if two_char == "//":
# Skip to end of line
while i < len(code) and code[i] != "\n":
i += 1
continue
elif two_char == "/*":
in_block_comment = True
i += 2
continue
elif two_char == "*/" and in_block_comment:
in_block_comment = False
i += 2
continue
if in_block_comment:
i += 1
continue
# Handle strings
if c == '"' and not in_char and (i == 0 or code[i-1] != '\\'):
in_string = not in_string
elif c == "'" and not in_string and (i == 0 or code[i-1] != '\\'):
in_char = not in_char
if not in_string and not in_char:
if c in pairs:
stack.append(c)
elif c in pairs.values():
if not stack:
balanced = False
issues.append(f"Unexpected closing bracket '{c}'")
else:
expected = pairs[stack.pop()]
if c != expected:
balanced = False
issues.append(f"Mismatched brackets: expected '{expected}', got '{c}'")
i += 1
if stack:
balanced = False
issues.append(f"Unclosed brackets: {stack}")
# Check for common issues
# Missing semicolons after statements (heuristic)
lines = code.split("\n")
for i, line in enumerate(lines):
stripped = line.strip()
# Skip empty lines, comments, preprocessor
if not stripped or stripped.startswith("//") or stripped.startswith("#"):
continue
# Skip lines that end with block characters
if stripped.endswith("{") or stripped.endswith("}") or stripped.endswith(":"):
continue
# Skip lines that are likely continuations
if stripped.endswith(",") or stripped.endswith("\\"):
continue
# Check for statements that should end with semicolon
# This is a heuristic and may have false positives
statement_patterns = [
r"return\s+.+[^;]$", # return without semicolon
r"break$", # break without semicolon
r"continue$", # continue without semicolon
]
for pattern in statement_patterns:
if re.search(pattern, stripped):
issues.append(f"Line {i+1}: Possibly missing semicolon")
break
return {"issues": issues, "balanced": balanced}
def _find_keywords(self, code: str) -> list[str]:
"""Find C++ keywords in code."""
found = []
# Use word boundaries to find keywords
for keyword in self.KEYWORDS:
if re.search(rf"\b{keyword}\b", code):
found.append(keyword)
return found
def _find_std_types(self, code: str) -> list[str]:
"""Find standard library types in code."""
found = []
for type_name in self.STD_TYPES:
# Check for std::type or just type in common contexts
if re.search(rf"std::{type_name}\b", code) or re.search(rf"\b{type_name}<", code):
found.append(type_name)
return found
async def _check_compile(self, code: str) -> dict:
"""Attempt to compile the code."""
# Create temporary file
with tempfile.NamedTemporaryFile(
mode="w", suffix=".cpp", delete=False
) as f:
# Add minimal includes for standalone compilation
wrapped_code = """
#include <cstdint>
#include <string>
#include <vector>
#include <memory>
// Sample code below
""" + code
f.write(wrapped_code)
temp_path = Path(f.name)
try:
# Run compiler with syntax-only check
process = await asyncio.create_subprocess_exec(
self.compiler,
"-fsyntax-only",
"-std=c++17",
"-Wall",
str(temp_path),
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
stdout, stderr = await asyncio.wait_for(
process.communicate(), timeout=10.0
)
success = process.returncode == 0
error = stderr.decode("utf-8", errors="replace") if stderr else ""
return {
"success": success,
"error": error,
"returncode": process.returncode,
}
except asyncio.TimeoutError:
return {
"success": False,
"error": "Compilation timed out",
"returncode": -1,
}
except Exception as e:
return {
"success": False,
"error": str(e),
"returncode": -1,
}
finally:
# Clean up temp file
try:
temp_path.unlink()
except Exception:
pass
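Sketch of CppValidator usage (not part of the diff). check_compile=True only takes effect when the configured compiler (clang++ by default) is on PATH; otherwise only the bracket and keyword heuristics run.

import asyncio

from afs_scawful.training import TrainingSample
from afs_scawful.validators import CppValidator

validator = CppValidator(check_compile=False, strict=False)

sample = TrainingSample(
    instruction="",
    input="",
    output="#include <vector>\nint sum(const std::vector<int>& v) {\n  int t = 0;\n  for (int x : v) t += x;\n  return t;\n}\n",
    domain="cpp",
)

result = asyncio.run(validator.validate(sample))
print(result.valid, round(result.score, 2), result.details["std_types_found"])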

View File

@@ -0,0 +1,349 @@
"""Knowledge Graph Validator for training samples.
Validates:
- Entity presence in knowledge graph
- Relationship consistency
- Cross-reference validity
- Domain alignment
"""
from __future__ import annotations
import json
import re
from pathlib import Path
from typing import Any, Optional
from ..training import TrainingSample
from .base import ValidationResult, Validator
def _default_graph_path() -> Path:
candidate = Path.home() / "src" / "context" / "memory" / "knowledge_graph.json"
if candidate.exists():
return candidate
return Path.home() / ".context" / "memory" / "knowledge_graph.json"
class KGValidator(Validator):
"""Validator for knowledge graph consistency in training samples."""
def __init__(
self,
graph_path: Optional[Path] = None,
strict: bool = False,
min_entity_coverage: float = 0.3,
):
"""Initialize KG validator.
Args:
graph_path: Path to knowledge graph JSON. Defaults to ~/src/context/memory/knowledge_graph.json
(fallback: ~/.context/memory/knowledge_graph.json).
strict: If True, apply stricter validation (missing entities are errors)
min_entity_coverage: Minimum fraction of mentioned entities that must be in KG
"""
super().__init__("KGValidator", "all") # Applies to all domains
self.graph_path = graph_path or _default_graph_path()
self.strict = strict
self.min_entity_coverage = min_entity_coverage
# Lazy load graph
self._graph: Optional[dict] = None
self._nodes: dict[str, Any] = {}
self._edges: list[dict[str, Any]] = []
self._node_names: set[str] = set()
self._routines: set[str] = set()
self._symbols: set[str] = set()
def _load_graph(self) -> None:
"""Load knowledge graph from disk."""
if self._graph is not None:
return
if not self.graph_path.exists():
self._graph = {"nodes": {}, "edges": []}
return
try:
data = json.loads(self.graph_path.read_text())
self._graph = data
self._nodes = data.get("nodes", {})
self._edges = data.get("edges", [])
# Build lookup sets
for node_id, node_data in self._nodes.items():
self._node_names.add(node_id.lower())
# Extract name from node data
if isinstance(node_data, dict):
name = node_data.get("name", "")
if name:
self._node_names.add(name.lower())
# Track routines and symbols specifically
node_type = node_data.get("type", "")
if node_type == "routine":
self._routines.add(name.lower())
elif node_type == "symbol":
self._symbols.add(name.lower())
except Exception:
self._graph = {"nodes": {}, "edges": []}
def can_validate(self, sample: TrainingSample) -> bool:
"""KG validator can validate any sample with kg_entities."""
return True # Applies to all domains
async def validate(self, sample: TrainingSample) -> ValidationResult:
"""Validate knowledge graph consistency in the sample."""
self._load_graph()
errors: list[str] = []
warnings: list[str] = []
details: dict = {
"entities_mentioned": [],
"entities_found": [],
"entities_missing": [],
"routines_mentioned": [],
"symbols_mentioned": [],
"relationships_valid": True,
"coverage": 0.0,
}
# Extract entities from sample
text = f"{sample.instruction} {sample.input} {sample.output}"
mentioned = self._extract_entities(text, sample.domain)
details["entities_mentioned"] = mentioned
# Check which entities exist in KG
found = []
missing = []
for entity in mentioned:
entity_lower = entity.lower()
if self._entity_exists(entity_lower):
found.append(entity)
else:
missing.append(entity)
details["entities_found"] = found
details["entities_missing"] = missing
# Calculate coverage
if mentioned:
coverage = len(found) / len(mentioned)
else:
coverage = 1.0 # No entities to validate
details["coverage"] = coverage
# Check for routine/symbol references in ASM samples
if sample.domain.startswith("asm"):
routines = self._extract_routine_references(sample.output)
symbols = self._extract_symbol_references(sample.output)
details["routines_mentioned"] = routines
details["symbols_mentioned"] = symbols
# Check routine validity
for routine in routines:
if routine.lower() not in self._routines and routine.lower() not in self._node_names:
if self.strict:
errors.append(f"Unknown routine: {routine}")
else:
warnings.append(f"Routine not in KG: {routine}")
# Check kg_entities from sample metadata
if sample.kg_entities:
for entity in sample.kg_entities:
if not self._entity_exists(entity.lower()):
if self.strict:
errors.append(f"Tagged entity not in KG: {entity}")
else:
warnings.append(f"Tagged entity not in KG: {entity}")
# Validate coverage threshold
if coverage < self.min_entity_coverage and mentioned:
msg = f"Entity coverage {coverage:.1%} below threshold {self.min_entity_coverage:.1%}"
if self.strict:
errors.append(msg)
else:
warnings.append(msg)
# Calculate score: base 0.3 plus coverage, capped at 1.0
score = min(1.0, coverage + 0.3) # Coverage contributes up to 0.7
# Bonus for having KG entities tagged
if sample.kg_entities and sample.kg_validated:
score = min(1.0, score + 0.1)
# Penalty for missing entities
if missing:
penalty = len(missing) * 0.05
score = max(0.3, score - penalty)
return ValidationResult(
valid=len(errors) == 0,
score=score,
errors=errors,
warnings=warnings,
details=details,
)
def _entity_exists(self, entity: str) -> bool:
"""Check if an entity exists in the knowledge graph."""
entity_lower = entity.lower()
# Direct match
if entity_lower in self._node_names:
return True
# Check with common prefixes
prefixes = ["alttp:", "oracle-of-secrets:", "project:", "routine:", "symbol:"]
for prefix in prefixes:
if f"{prefix}{entity_lower}" in self._node_names:
return True
# Also check node IDs directly
for node_id in self._nodes:
if node_id.lower().endswith(f":{entity_lower}"):
return True
return False
def _extract_entities(self, text: str, domain: str) -> list[str]:
"""Extract potential entity references from text."""
entities = []
# Common patterns for entity references
patterns = [
# Code references like `EntityName` or `RoutineName`
r'`([A-Z][a-zA-Z0-9_]+)`',
# Capitalized terms that look like identifiers
r'\b([A-Z][a-z]+(?:[A-Z][a-z]+)+)\b', # CamelCase
# Routine names (common in ASM)
r'\b(Link_[A-Za-z0-9_]+)\b',
r'\b(Player_[A-Za-z0-9_]+)\b',
r'\b(Sprite_[A-Za-z0-9_]+)\b',
r'\b(Module_[A-Za-z0-9_]+)\b',
# Memory addresses with labels
r'\b([A-Z][A-Za-z0-9]+_[A-Z][A-Za-z0-9]+)\b',
]
for pattern in patterns:
matches = re.findall(pattern, text)
entities.extend(matches)
# Domain-specific extraction
if domain.startswith("asm"):
# Extract ASM-specific references
asm_patterns = [
r'\b([A-Z][a-z]+_[A-Z][a-z_0-9]+)\b', # Link_HandleSword
r'@([A-Za-z_][A-Za-z0-9_]+)', # @Labels
]
for pattern in asm_patterns:
matches = re.findall(pattern, text)
entities.extend(matches)
elif domain == "cpp":
# Extract C++ class/function names
cpp_patterns = [
r'\bclass\s+([A-Z][a-zA-Z0-9_]+)\b',
r'\b([A-Z][a-z]+(?:[A-Z][a-z]+)+)::\w+', # ClassName::method
]
for pattern in cpp_patterns:
matches = re.findall(pattern, text)
entities.extend(matches)
# Deduplicate while preserving order
seen = set()
unique = []
for e in entities:
if e.lower() not in seen:
seen.add(e.lower())
unique.append(e)
return unique
def _extract_routine_references(self, code: str) -> list[str]:
"""Extract routine/label references from ASM code."""
routines = []
# JSR/JSL targets
jsr_pattern = r'\b(?:JSR|JSL|JMP|JML)\s+([A-Za-z_][A-Za-z0-9_]+)\b'
matches = re.findall(jsr_pattern, code, re.IGNORECASE)
routines.extend(matches)
# BRA/BRL targets
branch_pattern = r'\b(?:BRA|BRL|BEQ|BNE|BCC|BCS|BMI|BPL)\s+([A-Za-z_][A-Za-z0-9_]+)\b'
matches = re.findall(branch_pattern, code, re.IGNORECASE)
routines.extend(matches)
return list(set(routines))
def _extract_symbol_references(self, code: str) -> list[str]:
"""Extract symbol/variable references from ASM code."""
symbols = []
# LDA/STA with labels
load_store_pattern = r'\b(?:LDA|LDX|LDY|STA|STX|STY)\s+([A-Za-z_][A-Za-z0-9_]+)\b'
matches = re.findall(load_store_pattern, code, re.IGNORECASE)
symbols.extend(matches)
# Filter out common non-symbol patterns
filtered = []
for sym in symbols:
# Skip if it looks like a routine name
if sym.lower() in self._routines:
continue
# Skip common mnemonics that might be captured
if sym.upper() in {'A', 'X', 'Y', 'S'}:
continue
filtered.append(sym)
return list(set(filtered))
def get_related_entities(self, entity: str) -> list[dict[str, Any]]:
"""Get entities related to a given entity in the KG."""
self._load_graph()
related = []
entity_lower = entity.lower()
for edge in self._edges:
source = str(edge.get("source", "")).lower()
target = str(edge.get("target", "")).lower()
relation = edge.get("relation", "")
if entity_lower in source:
related.append({
"entity": edge.get("target"),
"relation": relation,
"direction": "outgoing",
})
elif entity_lower in target:
related.append({
"entity": edge.get("source"),
"relation": relation,
"direction": "incoming",
})
return related
def suggest_entities(self, partial: str, limit: int = 10) -> list[str]:
"""Suggest entity names matching a partial string."""
self._load_graph()
partial_lower = partial.lower()
matches = []
for node_id in self._nodes:
if partial_lower in node_id.lower():
matches.append(node_id)
if len(matches) >= limit:
break
return matches
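Sketch of KGValidator against a tiny hand-written graph (not part of the diff). The graph path and node layout below are illustrative; the real default lives under ~/src/context/memory/knowledge_graph.json.

import asyncio
import json
import tempfile
from pathlib import Path

from afs_scawful.training import TrainingSample
from afs_scawful.validators import KGValidator

graph = {
    "nodes": {
        "routine:Link_HandleSword": {"name": "Link_HandleSword", "type": "routine"},
    },
    "edges": [],
}

with tempfile.TemporaryDirectory() as tmp:
    graph_path = Path(tmp) / "knowledge_graph.json"
    graph_path.write_text(json.dumps(graph))

    validator = KGValidator(graph_path=graph_path, strict=False)
    sample = TrainingSample(
        instruction="Describe Link_HandleSword.",
        input="",
        output="JSL Link_HandleSword\nRTS\n",
        domain="asm",
        kg_entities=["Link_HandleSword"],
    )

    result = asyncio.run(validator.validate(sample))
    # Link_HandleSword resolves both as a mentioned entity and as a tagged
    # kg_entity, so coverage is 1.0 and no warnings are emitted for it.
    print(result.valid, round(result.score, 2), result.details["entities_found"])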

tests/test_validators.py Normal file
View File

@@ -0,0 +1,32 @@
from __future__ import annotations
import asyncio
from afs_scawful.training import TrainingSample
from afs_scawful.validators import AsmValidator, CppValidator
def test_asm_validator_basic() -> None:
sample = TrainingSample(
instruction="",
input="",
output="LDA #$01\nSTA $7E0000\n",
domain="asm",
source="test",
)
result = asyncio.run(AsmValidator().validate(sample))
assert result.valid
assert result.score > 0.0
def test_cpp_validator_basic() -> None:
sample = TrainingSample(
instruction="",
input="",
output="int main() { return 0; }\n",
domain="cpp",
source="test",
)
result = asyncio.run(CppValidator(check_compile=False).validate(sample))
assert result.valid
assert result.score > 0.0