# boocode/apps/control/data/suite-chat-quality.yaml
#
# Suite definition for chat-assistant quality checks. Each task below pairs
# a prompt with a weighted rubric; per `description`, responses are scored
# by an LLM acting as judge.
---
# Identifier for this suite (kebab-case, matching the task ids below).
id: chat-quality
# Human-readable suite title.
name: Chat Assistant Quality
kind: chat
# Integer version of this suite definition.
version: 1
description: Curated prompts scored by LLM-as-judge using rubric criteria.
# No judge model is pinned by this suite.
# NOTE(review): presumably the consumer falls back to a default judge when
# this is null — confirm against the loader.
judge_model: null
# Ordered list of tasks; every entry carries `id`, `prompt`, and `rubric`.
tasks:
  # Task: explain a recursive Fibonacci function in plain English.
  - id: code-explanation
    # Folded scalar (>-): line breaks fold into single spaces and the
    # trailing newline is stripped, so the prompt is a single line at runtime.
    prompt: >-
      Explain what this function does in plain English:
      function fibonacci(n: number): number {
      if (n <= 1) return n;
      return fibonacci(n - 1) + fibonacci(n - 2);
      }
    rubric:
      # Criterion weights sum to max_score (3 + 2 + 2 = 7).
      criteria:
        - criterion: accuracy
          description: 'Correctly identifies the function computes Fibonacci numbers'
          weight: 3
        - criterion: clarity
          description: 'Explanation is clear and accessible to a non-expert'
          weight: 2
        - criterion: completeness
          description: 'Mentions recursion, base case, and performance concern'
          weight: 2
      max_score: 7
  # Task: diagnose a React component whose useEffect has no dependency
  # array and updates state on every render.
  - id: debugging-help
    # Folded scalar (>-): line breaks fold into single spaces and the
    # trailing newline is stripped, so the prompt is a single line at runtime.
    # Block style also keeps the apostrophes free of any escaping.
    prompt: >-
      My React component re-renders infinitely. Here's the code:
      function Counter() {
      const [count, setCount] = useState(0);
      useEffect(() => { setCount(c => c + 1); });
      return <div>{count}</div>;
      }
      What's wrong and how do I fix it?
    rubric:
      # Criterion weights sum to max_score (3 + 3 + 1 = 7).
      criteria:
        - criterion: accuracy
          description: 'Identifies the useEffect missing dependency array causing infinite loop'
          weight: 3
        - criterion: solution
          description: 'Provides correct fix with dependency array or removed effect'
          weight: 3
        - criterion: explanation
          description: 'Explains why the fix works'
          weight: 1
      max_score: 7
  # Task: short creative piece (haiku) on a software-debugging theme.
  - id: creative-writing
    prompt: 'Write a short haiku about debugging software at 3 AM.'
    rubric:
      # All three criteria carry equal weight; weights sum to max_score
      # (2 + 2 + 2 = 6).
      criteria:
        - criterion: form
          description: 'Follows 5-7-5 syllable structure'
          weight: 2
        - criterion: relevance
          description: 'Topic relates to late-night debugging'
          weight: 2
        - criterion: quality
          description: 'Poetic language, not just literal description'
          weight: 2
      max_score: 6
  # Task: compare Docker containers and VMs for hosting a Node.js API.
  - id: technical-comparison
    # Folded scalar (>-): line breaks fold into single spaces and the
    # trailing newline is stripped, so the prompt is a single line at runtime.
    prompt: >-
      Compare Docker containers vs VMs for running a Node.js API.
      Give me pros and cons of each for this specific use case.
    rubric:
      # Criterion weights sum to max_score (3 + 2 + 2 = 7).
      criteria:
        - criterion: accuracy
          description: 'Technically correct comparison points'
          weight: 3
        - criterion: balance
          description: 'Covers both pros and cons for each option'
          weight: 2
        - criterion: specificity
          description: 'Tailored to Node.js API use case, not generic'
          weight: 2
      max_score: 7
  # Task: write a SQL query ranking users by recent total spending, given
  # the users/orders schema stated in the prompt.
  - id: sql-query-help
    # Folded scalar (>-): line breaks fold into single spaces and the
    # trailing newline is stripped, so the prompt is a single line at runtime.
    prompt: >-
      I have a users table (id, name, created_at) and orders table
      (id, user_id, total, created_at). Write a SQL query to find the
      top 5 users by total spending in the last 30 days.
    rubric:
      # Criterion weights sum to max_score (3 + 2 + 2 = 7).
      criteria:
        - criterion: correctness
          description: 'Query is syntactically valid and produces correct results'
          weight: 3
        - criterion: date-filter
          description: 'Properly filters to last 30 days'
          weight: 2
        - criterion: aggregation
          description: 'Correctly aggregates and orders by total spending'
          weight: 2
      max_score: 7