---
# Eval pack: long-context retrieval.
#
# Every task below carries:
#   prompt            - the bare question, without any document attached
#   prompt_template   - the same question plus a `{context}` placeholder
#   context_generator - instruction describing the document(s) to synthesize
#   rubric            - weighted scoring criteria and the maximum score
#
# NOTE(review): `{context}` is presumably filled with the text produced from
# `context_generator` - confirm against the harness that consumes this file.
# NOTE(review): this file had lost its line breaks and indentation; the nesting
# here (in particular `max_score` as a sibling of `criteria` under `rubric`)
# is reconstructed from key order - confirm against the consumer's schema.
id: long-context-retrieval
name: Long Context Retrieval
kind: chat
version: 1
description: Needle-in-haystack and document-QA tasks for file-heavy sessions.
# Explicit null: no judge model is configured for this pack.
judge_model: null

tasks:
  # Single fact buried once in the middle of a long generated document.
  - id: needle-in-haystack
    prompt: "Here is a long document. Find the value for 'target_key' and return nothing else."
    prompt_template: "Here is a long document. Find the value for 'target_key' and return nothing else.\n\n{context}\n\nWhat is the value of target_key?"
    context_generator: "Generate ~4000 words of technical documentation about PostgreSQL performance tuning. Embed the sentence 'target_key: 42' exactly once somewhere in the middle."
    rubric:
      criteria:
        - criterion: accuracy
          description: "Returns exactly '42' or 'target_key: 42'"
          weight: 3
        - criterion: conciseness
          description: "Answer is brief, not a long explanation"
          weight: 1
      # Equals the sum of the criterion weights above (3 + 1).
      max_score: 4

  # Three documents with conflicting recommendations; only document 2 is right.
  - id: multi-doc-qa
    prompt: "Based on these three documents, answer: What is the recommended maximum heap size for the application?"
    prompt_template: "Based on these three documents, answer: What is the recommended maximum heap size for the application?\n\n{context}"
    context_generator: "Generate three ~1000-word technical documents about JVM tuning, with conflicting recommendations. The correct answer is 4GB mentioned in document 2."
    rubric:
      criteria:
        - criterion: accuracy
          description: "Identifies 4GB as the recommended value"
          weight: 3
        - criterion: source-attribution
          description: "References which document contains the answer"
          weight: 2
      # Equals the sum of the criterion weights above (3 + 2).
      max_score: 5

  # Locate one method inside a multi-class TypeScript excerpt and explain it.
  - id: codebase-navigation
    prompt: "In this codebase excerpt, find the function that handles WebSocket connections and explain its parameters."
    prompt_template: "In this codebase excerpt, find the function that handles WebSocket connections and explain its parameters.\n\n{context}"
    context_generator: "Generate ~3000 words of TypeScript source code with multiple classes. One class contains a 'handleWebSocket' method with (ws, sessionId, broker) parameters."
    rubric:
      criteria:
        - criterion: accuracy
          description: "Correctly identifies the handleWebSocket function"
          weight: 3
        - criterion: parameters
          description: "Lists all three parameters correctly"
          weight: 2
      # Equals the sum of the criterion weights above (3 + 2).
      max_score: 5