# feat(booterm): structured pty_exited WS notifications. Plan-validated,
# impl-validated, code-reviewed green (contracts build clean, contracts test
# 29/29, booterm + web typecheck clean).
# wip: in-progress inference/provider refactor (agents.ts, provider.ts, new
# llama-providers.ts, removed llama-args-validator), plus arena, dispatcher,
# compaction, schema changes.
# openspec: pty-exit-notifications complete; x-agent-flags planned (not yet
# implemented).
# 47 lines, 2.4 KiB, YAML
---
# Suite-level metadata for the "Long Context Retrieval" task set.
id: long-context-retrieval
name: Long Context Retrieval
kind: chat
version: 1
description: Needle-in-haystack and document-QA tasks for file-heavy sessions.
# Explicit null: no judge model is set at the suite level.
judge_model: null

# Task entries follow as a block sequence, one "- id: ..." item per task.
tasks:
# Task: find a single planted key/value pair inside one long generated document.
- id: needle-in-haystack
  prompt: >-
    Here is a long document. Find the value for 'target_key' and return
    nothing else.
  # Literal block keeps the blank lines around the {context} placeholder.
  prompt_template: |-
    Here is a long document. Find the value for 'target_key' and return nothing else.

    {context}

    What is the value of target_key?
  context_generator: >-
    Generate ~4000 words of technical documentation about PostgreSQL
    performance tuning. Embed the sentence 'target_key: 42' exactly once
    somewhere in the middle.
  rubric:
    criteria:
    - criterion: accuracy
      description: "Returns exactly '42' or 'target_key: 42'"
      weight: 3
    - criterion: conciseness
      description: 'Answer is brief, not a long explanation'
      weight: 1
    # Equals the sum of the criterion weights above (3 + 1).
    # NOTE(review): nesting was reconstructed; confirm max_score belongs
    # under rubric rather than at task level.
    max_score: 4
# Task: answer a question from three generated documents that disagree.
- id: multi-doc-qa
  prompt: >-
    Based on these three documents, answer: What is the recommended maximum
    heap size for the application?
  # Literal block keeps the blank line before the {context} placeholder.
  prompt_template: |-
    Based on these three documents, answer: What is the recommended maximum heap size for the application?

    {context}
  context_generator: >-
    Generate three ~1000-word technical documents about JVM tuning, with
    conflicting recommendations. The correct answer is 4GB mentioned in
    document 2.
  rubric:
    criteria:
    - criterion: accuracy
      description: 'Identifies 4GB as the recommended value'
      weight: 3
    - criterion: source-attribution
      description: 'References which document contains the answer'
      weight: 2
    # Equals the sum of the criterion weights above (3 + 2).
    # NOTE(review): nesting was reconstructed; confirm max_score belongs
    # under rubric rather than at task level.
    max_score: 5
# Task: locate a specific method in generated source code and describe its
# parameters.
- id: codebase-navigation
  prompt: >-
    In this codebase excerpt, find the function that handles WebSocket
    connections and explain its parameters.
  # Literal block keeps the blank line before the {context} placeholder.
  prompt_template: |-
    In this codebase excerpt, find the function that handles WebSocket connections and explain its parameters.

    {context}
  context_generator: >-
    Generate ~3000 words of TypeScript source code with multiple classes.
    One class contains a 'handleWebSocket' method with (ws, sessionId,
    broker) parameters.
  rubric:
    criteria:
    - criterion: accuracy
      description: 'Correctly identifies the handleWebSocket function'
      weight: 3
    - criterion: parameters
      description: 'Lists all three parameters correctly'
      weight: 2
    # Equals the sum of the criterion weights above (3 + 2).
    # NOTE(review): nesting was reconstructed; confirm max_score belongs
    # under rubric rather than at task level.
    max_score: 5