# YAZE AI Model Evaluation Tasks
#
# This file defines evaluation tasks for comparing different AI models
# used with the z3ed CLI agent system.
#
# Usage:
#   ./scripts/ai/run-model-eval.sh --models "llama3,qwen2.5,codellama" --tasks all
#   ./scripts/ai/run-model-eval.sh --tasks rom_inspection --models "llama3"
#
# Scoring:
#   Each task is scored across multiple dimensions. The first three use the
#   0-10 scale defined under `scoring_rubric` at the end of this file:
#   - accuracy: Did the model answer correctly?
#   - completeness: Did it include all relevant information?
#   - tool_usage: Did it use tools appropriately?
#   - response_time: Measured in seconds (lower is better)
#
# Task layout (as used throughout `categories` below):
#   id / name           - identifier and display name of the task
#   prompt              - single prompt (multi-turn tasks use `prompts`, a list)
#   expected_patterns   - `|`-separated alternatives; presumably applied as
#                         regular expressions (note the escaped dots) - confirm
#                         against the runner script
#   required_tool       - tool name, or null (presumably "no specific tool
#                         required" - confirm against the runner script)
#   scoring             - human-readable criteria for accuracy/completeness
---
version: "1.0"

# Models to evaluate by default
default_models:
  - name: "llama3.2:latest"
    description: "Meta's Llama 3.2 - default baseline"
    type: "baseline"
  - name: "qwen2.5-coder:7b"
    description: "Qwen 2.5 Coder - optimized for code"
    type: "code"
  - name: "codellama:7b"
    description: "Meta's CodeLlama - code generation"
    type: "code"
  - name: "mistral:7b"
    description: "Mistral 7B - general purpose"
    type: "general"
  - name: "phi3:medium"
    description: "Microsoft Phi-3 - efficient"
    type: "efficient"

# Scoring weights for overall score calculation (the four weights sum to 1.0)
scoring_weights:
  accuracy: 0.4
  completeness: 0.3
  tool_usage: 0.2
  response_time: 0.1

# Maximum response time before timeout (seconds)
timeout: 120

# Evaluation task categories
categories:
  rom_inspection:
    description: "Tasks that inspect ROM data structures"
    tasks:
      - id: "list_dungeons"
        name: "List Dungeons"
        prompt: "What dungeons are in this ROM? List their names and IDs."
        expected_patterns:
          - "eastern palace|palace of darkness|desert palace"
          - "tower of hera|swamp palace|skull woods"
          - "thieves|ice palace|misery mire"
        required_tool: null
        scoring:
          accuracy_criteria: "Lists at least 8 dungeons with correct names"
          completeness_criteria: "Includes dungeon IDs or entrance info"

      - id: "describe_overworld"
        name: "Describe Overworld Map"
        prompt: "Describe overworld map 0 (Light World). What areas and features are visible?"
        expected_patterns:
          - "light world|hyrule"
          - "castle|sanctuary|kakariko"
        required_tool: null
        scoring:
          accuracy_criteria: "Correctly identifies the Light World"
          completeness_criteria: "Mentions multiple notable locations"

      - id: "find_sprites"
        name: "Find Sprites in Room"
        prompt: "What sprites are present in dungeon room 0? List their types and positions."
        expected_patterns:
          - "sprite|enemy|npc"
          # NOTE(review): if patterns are applied as unanchored regexes, the
          # bare `x` and `y` alternatives match any response containing those
          # letters - confirm this is intended.
          - "position|coordinate|x|y"
        required_tool: null
        scoring:
          accuracy_criteria: "Lists sprites with correct types"
          completeness_criteria: "Includes position data"

      - id: "entrance_info"
        name: "Get Entrance Information"
        prompt: "Where is the entrance to the Eastern Palace?"
        expected_patterns:
          - "eastern|palace|entrance"
          - "east|light world"
        required_tool: null
        scoring:
          accuracy_criteria: "Correctly identifies entrance location"
          completeness_criteria: "Provides coordinates or map reference"

  code_analysis:
    description: "Tasks that analyze or generate code"
    tasks:
      - id: "explain_function"
        name: "Explain Function"
        prompt: "Explain what the function LoadDungeonRoom does in the codebase."
        expected_patterns:
          - "dungeon|room|load"
          - "tilemap|object|sprite"
        required_tool: "filesystem-read"
        scoring:
          accuracy_criteria: "Correctly describes the function purpose"
          completeness_criteria: "Explains key steps or data flows"

      - id: "find_bugs"
        name: "Find Potential Issues"
        prompt: "Are there any potential issues with how sprite coordinates are handled in room loading?"
        expected_patterns:
          - "bounds|overflow|check"
          - "coordinate|position"
        required_tool: "filesystem-read"
        scoring:
          accuracy_criteria: "Identifies real or plausible issues"
          completeness_criteria: "Explains why the issue matters"

      - id: "suggest_refactor"
        name: "Suggest Refactoring"
        prompt: "How could the dungeon editor's room rendering be improved for performance?"
        expected_patterns:
          - "cache|batch|optimize"
          - "render|draw|update"
        required_tool: "filesystem-read"
        scoring:
          accuracy_criteria: "Suggests valid optimization strategies"
          completeness_criteria: "Explains implementation approach"

  tool_calling:
    description: "Tasks that require proper tool usage"
    tasks:
      - id: "list_files"
        name: "List Source Files"
        prompt: "List all .cc files in src/app/editor/"
        expected_patterns:
          - "\\.cc"
          - "editor"
        required_tool: "filesystem-list"
        scoring:
          accuracy_criteria: "Uses filesystem-list tool correctly"
          completeness_criteria: "Lists files in correct directory"

      - id: "read_file"
        name: "Read File Contents"
        prompt: "What are the first 20 lines of src/app/rom.h?"
        expected_patterns:
          - "#ifndef|#define|#include"
          - "rom|Rom"
        required_tool: "filesystem-read"
        scoring:
          accuracy_criteria: "Uses filesystem-read with correct path"
          completeness_criteria: "Shows actual file content"

      - id: "check_existence"
        name: "Check File Existence"
        prompt: "Does the file src/app/editor/dungeon/dungeon_editor.cc exist?"
        expected_patterns:
          - "exists|found|yes"
        required_tool: "filesystem-exists"
        scoring:
          accuracy_criteria: "Uses filesystem-exists tool"
          completeness_criteria: "Provides clear yes/no answer"

      - id: "build_status"
        name: "Get Build Status"
        prompt: "What build presets are available for macOS?"
        expected_patterns:
          - "mac-dbg|mac-rel|mac-ai|mac-test"
          - "preset|configure"
        required_tool: "build-configure"
        scoring:
          accuracy_criteria: "Lists valid macOS presets"
          completeness_criteria: "Describes preset purposes"

  visual_analysis:
    description: "Tasks for visual analysis and pattern recognition"
    tasks:
      - id: "find_similar_tiles"
        name: "Find Similar Tiles"
        prompt: "Find tiles similar to tile 42 in the ROM. Use a similarity threshold of 85%."
        expected_patterns:
          - "similar|match|tile"
          - "similarity|score|percent"
        required_tool: "visual-find-similar-tiles"
        scoring:
          accuracy_criteria: "Uses visual-find-similar-tiles with correct parameters"
          completeness_criteria: "Returns list of matching tiles with scores"

      - id: "analyze_spritesheet"
        name: "Analyze Spritesheet"
        prompt: "Analyze graphics sheet 10 to find unused regions that could be used for custom graphics."
        expected_patterns:
          - "unused|empty|free"
          - "region|space|tile"
        required_tool: "visual-analyze-spritesheet"
        scoring:
          accuracy_criteria: "Uses visual-analyze-spritesheet tool"
          completeness_criteria: "Reports locations and sizes of free regions"

      - id: "palette_usage"
        name: "Palette Usage Analysis"
        prompt: "Analyze which palettes are used most frequently in the overworld maps."
        expected_patterns:
          - "palette|color"
          - "usage|count|percent"
        required_tool: "visual-palette-usage"
        scoring:
          accuracy_criteria: "Uses visual-palette-usage with overworld type"
          completeness_criteria: "Shows palette usage statistics"

      - id: "tile_histogram"
        name: "Tile Usage Histogram"
        prompt: "Generate a histogram of the top 20 most used tiles in dungeon rooms."
        expected_patterns:
          - "tile|usage|histogram"
          - "count|frequency|top"
        required_tool: "visual-tile-histogram"
        scoring:
          accuracy_criteria: "Uses visual-tile-histogram with dungeon type"
          completeness_criteria: "Lists top tiles with usage counts"

  project_management:
    description: "Tasks for project state and snapshot management"
    tasks:
      - id: "project_status"
        name: "Get Project Status"
        prompt: "What is the current project status? Show me any pending edits and available snapshots."
        expected_patterns:
          - "project|status|snapshot"
          - "edit|pending|initialized"
        required_tool: "project-status"
        scoring:
          accuracy_criteria: "Uses project-status tool correctly"
          completeness_criteria: "Reports project state, snapshots, and ROM checksum"

      - id: "create_snapshot"
        name: "Create Project Snapshot"
        prompt: "Create a snapshot named 'v1.0' with description 'Initial sprite modifications'."
        expected_patterns:
          # The dot is escaped so it matches a literal '.', consistent with
          # the "\\.cc" pattern used by the list_files task.
          - "snapshot|created|v1\\.0"
          - "edit|delta|saved"
        required_tool: "project-snapshot"
        scoring:
          accuracy_criteria: "Uses project-snapshot with correct name parameter"
          completeness_criteria: "Confirms snapshot creation with details"

      - id: "compare_snapshots"
        name: "Compare Snapshots"
        prompt: "Compare snapshots 'before-fix' and 'after-fix' to see what changed."
        expected_patterns:
          - "diff|compare|changed"
          - "added|removed|modified"
        required_tool: "project-diff"
        scoring:
          accuracy_criteria: "Uses project-diff with both snapshot names"
          completeness_criteria: "Shows detailed comparison of edits"

      - id: "restore_checkpoint"
        name: "Restore to Checkpoint"
        prompt: "Restore the ROM to the 'stable' snapshot."
        expected_patterns:
          - "restore|snapshot|stable"
          - "applied|reverted|edit"
        required_tool: "project-restore"
        scoring:
          accuracy_criteria: "Uses project-restore with correct snapshot name"
          completeness_criteria: "Confirms restoration and lists applied edits"

  code_generation:
    description: "Tasks for ASM code generation and patching"
    tasks:
      - id: "generate_hook"
        name: "Generate ASM Hook"
        prompt: "Generate an ASM hook at address $008040 with label MyCustomHook and 2 NOPs for alignment."
        expected_patterns:
          - "hook|JSL|008040"
          - "MyCustomHook|NOP"
        required_tool: "codegen-asm-hook"
        scoring:
          accuracy_criteria: "Uses codegen-asm-hook with correct address and label"
          completeness_criteria: "Generates valid ASM with proper hook structure"

      - id: "find_freespace"
        name: "Find Freespace for Patch"
        prompt: "Generate a freespace patch for 256 bytes of code labeled 'NewSpriteCode', preferring bank $3F."
        expected_patterns:
          - "freespace|org|NewSpriteCode"
          - "1F8000|bank|free"
        required_tool: "codegen-freespace-patch"
        scoring:
          accuracy_criteria: "Uses codegen-freespace-patch with size and label"
          completeness_criteria: "Reports available regions and generates allocation code"

      - id: "sprite_template"
        name: "Generate Sprite Template"
        prompt: "Generate a sprite template named 'FollowerSprite' with init code that sets sprite state and main code that follows the player."
        expected_patterns:
          - "sprite|FollowerSprite|template"
          - "init|main|0DD0"
        required_tool: "codegen-sprite-template"
        scoring:
          accuracy_criteria: "Uses codegen-sprite-template with name and custom code"
          completeness_criteria: "Generates complete sprite with init and main sections"

      - id: "event_handler"
        name: "Generate Event Handler"
        prompt: "Generate an NMI event handler labeled 'FrameCounter' that increments a counter each frame."
        expected_patterns:
          - "NMI|event|handler"
          - "FrameCounter|INC|counter"
        required_tool: "codegen-event-handler"
        scoring:
          accuracy_criteria: "Uses codegen-event-handler with type=nmi and label"
          completeness_criteria: "Generates handler with state preservation and custom code"

  # Multi-turn tasks: these set `multi_turn: true`, provide a `prompts` list
  # instead of a single `prompt`, and do not declare `required_tool`.
  conversation:
    description: "Tasks testing multi-turn dialog and context"
    tasks:
      - id: "follow_up"
        name: "Follow-up Questions"
        multi_turn: true
        prompts:
          - "What is the main purpose of the Rom class?"
          - "What methods does it have for loading data?"
          - "Can you show me an example of using LoadFromFile?"
        expected_patterns:
          - "rom|ROM|file"
          - "load|read|parse"
          - "example|code|usage"
        scoring:
          accuracy_criteria: "Maintains context across turns"
          completeness_criteria: "Each response builds on previous"

      - id: "clarification"
        name: "Handle Clarification"
        multi_turn: true
        prompts:
          - "How do I add a new sprite?"
          - "I mean in the dungeon editor, not the overworld"
        expected_patterns:
          - "sprite|dungeon|editor"
          - "add|create|place"
        scoring:
          accuracy_criteria: "Adjusts response based on clarification"
          completeness_criteria: "Provides dungeon-specific instructions"

# Scoring rubric definitions
# Keys are the unquoted score values (they load as integers, not strings);
# only the even anchor points 0-10 are described.
scoring_rubric:
  accuracy:
    10: "Perfect - completely correct with no errors"
    8: "Excellent - minor inaccuracies that don't affect understanding"
    6: "Good - mostly correct with some notable errors"
    4: "Fair - partially correct but missing key points"
    2: "Poor - significant errors or misunderstandings"
    0: "Incorrect - completely wrong or off-topic"

  completeness:
    10: "Comprehensive - covers all aspects thoroughly"
    8: "Very complete - covers most aspects well"
    6: "Adequate - covers main points but missing some details"
    4: "Partial - covers some points but lacks depth"
    2: "Minimal - barely addresses the question"
    0: "Incomplete - doesn't meaningfully address the question"

  tool_usage:
    10: "Perfect - uses correct tools with proper parameters"
    8: "Good - uses appropriate tools with minor parameter issues"
    6: "Adequate - uses tools but not optimally"
    4: "Fair - attempts tool use but with errors"
    2: "Poor - wrong tool or significant usage errors"
    0: "Failed - doesn't use required tools or fails completely"

# Report configuration
reporting:
  output_format: "table"  # table, json, markdown
  show_individual_scores: true
  show_response_samples: true
  max_sample_length: 500