v1.9.7: ask_user_input elicitation tool

2026-05-18 02:15:18 +00:00
parent adb5d7b3bb
commit d85b17081e
9 changed files with 710 additions and 4 deletions
--- a/apps/server/src/services/agents.ts
+++ b/apps/server/src/services/agents.ts
@@ -15,9 +15,12 @@ const CACHE_TTL_MS = 60_000;
 // explicit `tools:` field inherit the full default set (which now includes
 // the skill tools); agents with an explicit `tools:` array must list any
 // skill tool they want to use — strict opt-in.
+// Batch 9.7: ask_user_input added — same opt-in semantics. Agents with an
+// explicit tools list that omits it cannot trigger the interactive picker.
 const ALL_TOOL_NAMES = [
  'view_file', 'list_dir', 'grep', 'find_files', 'git_status',
  'skill_find', 'skill_use', 'skill_resource',
+  'ask_user_input',
 ] as const;
 const DEFAULT_TOOLS: string[] = [...ALL_TOOL_NAMES];
 const DEFAULT_TEMPERATURE = 0.7;
--- a/apps/server/src/services/inference.ts
+++ b/apps/server/src/services/inference.ts
@@ -665,6 +665,12 @@ async function executeToolPhase(
    model: session.model,
  });

+  // Batch 9.7: ask_user_input pauses the loop. The tool row is still inserted
+  // (the answer endpoint needs a target row to UPDATE), but tool_results is
+  // pre-stamped with output=null as a "pending" sentinel and no tool_result
+  // frame goes out — the card renders from the tool_call frame alone. Mixed
+  // batches still execute the other tools normally.
+  let pausingForUserInput = false;
  await Promise.all(
    toolCalls.map(async (tc) => {
      const [toolRow] = await ctx.sql<{ id: string }[]>`
@@ -673,6 +679,16 @@ async function executeToolPhase(
        RETURNING id
      `;
      const toolMessageId = toolRow!.id;
+      if (tc.name === 'ask_user_input') {
+        pausingForUserInput = true;
+        const sentinel = { tool_call_id: tc.id, output: null, truncated: false };
+        await ctx.sql`
+          UPDATE messages
+          SET tool_results = ${ctx.sql.json(sentinel as never)}
+          WHERE id = ${toolMessageId}
+        `;
+        return;
+      }
      const tres = await executeToolCall(projectRoot, tc);
      const stored = {
        tool_call_id: tc.id,
@@ -697,6 +713,23 @@ async function executeToolPhase(
    })
  );

+  if (pausingForUserInput) {
+    // Drop the dot back to idle — the card is the actionable surface now.
+    // The next inference turn fires from POST /api/chats/:id/answer_user_input
+    // once the user submits their answers.
+    ctx.publishUser({
+      type: 'chat_status',
+      chat_id: chatId,
+      status: 'idle',
+      at: new Date().toISOString(),
+    });
+    ctx.log.info(
+      { sessionId, chatId, assistantMessageId },
+      'inference paused awaiting user input',
+    );
+    return;
+  }
+
  const [nextAssistant] = await ctx.sql<{ id: string }[]>`
    INSERT INTO messages (session_id, chat_id, role, content, status, created_at)
    VALUES (${sessionId}, ${chatId}, 'assistant', '', 'streaming', clock_timestamp())
--- a/apps/server/src/services/tools.ts
+++ b/apps/server/src/services/tools.ts
@@ -405,6 +405,81 @@ export const skillResource: ToolDef<SkillResourceInputT> = {
  },
 };

+// Batch 9.7: ask_user_input. Interactive elicitation. The model emits a tool
+// call with 1-3 structured questions; the inference loop PAUSES (does not
+// execute the tool server-side, does not recurse) and waits for the frontend
+// to POST /api/chats/:id/answer_user_input with the user's selections. See
+// routes/messages.ts for the resume path and services/inference.ts for the
+// pause branch in executeToolPhase.
+const AskUserInputInput = z.object({
+  questions: z
+    .array(
+      z.object({
+        question: z.string().min(1).max(200),
+        type: z.enum(['single_select', 'multi_select']),
+        options: z.array(z.string().min(1).max(80)).min(2).max(6),
+      }),
+    )
+    .min(1)
+    .max(3),
+});
+type AskUserInputInputT = z.infer<typeof AskUserInputInput>;
+
+export const askUserInput: ToolDef<AskUserInputInputT> = {
+  name: 'ask_user_input',
+  description:
+    "Ask the user 1-3 structured questions through an inline picker UI. Use when you genuinely need a choice the user must make (e.g. scope, options, preferences) before continuing. Each question has 2-6 options and accepts free-text answers in addition. The tool call pauses the conversation until the user submits — the next assistant turn sees their answers as the tool result. Do not use for trivial yes/no clarifications you could infer; prefer it over multi-paragraph speculation about what the user might want.",
+  inputSchema: AskUserInputInput,
+  jsonSchema: {
+    type: 'function',
+    function: {
+      name: 'ask_user_input',
+      description:
+        'Ask the user 1-3 structured questions through an inline picker. Pauses the conversation until the user answers; the next turn sees their selections.',
+      parameters: {
+        type: 'object',
+        properties: {
+          questions: {
+            type: 'array',
+            minItems: 1,
+            maxItems: 3,
+            items: {
+              type: 'object',
+              properties: {
+                question: { type: 'string', description: '<=200 chars, shown to the user' },
+                type: {
+                  type: 'string',
+                  enum: ['single_select', 'multi_select'],
+                  description: 'single_select = at most one option; multi_select = any subset',
+                },
+                options: {
+                  type: 'array',
+                  minItems: 2,
+                  maxItems: 6,
+                  items: { type: 'string' },
+                  description: '2-6 strings, each <=80 chars; free-text input is always available alongside',
+                },
+              },
+              required: ['question', 'type', 'options'],
+              additionalProperties: false,
+            },
+          },
+        },
+        required: ['questions'],
+        additionalProperties: false,
+      },
+    },
+  },
+  // Server-side no-op: the real "execution" is the user's reply, which the
+  // client posts to /api/chats/:id/answer_user_input. executeToolPhase detects
+  // this tool by name and pauses before executeToolCall, so this fallback only
+  // runs if that branch is bypassed. It returns a `_pending` marker with the
+  // questions echoed back — NOT the `output: null` sentinel the pause path stores.
+  async execute(input) {
+    return { _pending: true, questions: input.questions };
+  },
+};
+
 export const ALL_TOOLS: ReadonlyArray<ToolDef<unknown>> = [
  viewFile as ToolDef<unknown>,
  listDir as ToolDef<unknown>,
@@ -414,6 +489,7 @@ export const ALL_TOOLS: ReadonlyArray<ToolDef<unknown>> = [
  skillFind as ToolDef<unknown>,
  skillUse as ToolDef<unknown>,
  skillResource as ToolDef<unknown>,
+  askUserInput as ToolDef<unknown>,
 ];

 // v1.8.2: forward-compatible read-only whitelist. An agent whose `tools` is
@@ -422,6 +498,8 @@ export const ALL_TOOLS: ReadonlyArray<ToolDef<unknown>> = [
 // default (10). Every tool in v1.8.2 happens to be read-only, so the
 // non-RO branch only takes effect once BooCoder lands write tools.
 // Batch 9.6: skill_* added; all still read-only.
+// Batch 9.7: ask_user_input added — it pauses execution but doesn't mutate
+// project state, so it belongs in the read-only set for budget purposes.
 export const READ_ONLY_TOOL_NAMES = [
  'view_file',
  'list_dir',
@@ -431,6 +509,7 @@ export const READ_ONLY_TOOL_NAMES = [
  'skill_find',
  'skill_use',
  'skill_resource',
+  'ask_user_input',
 ] as const;

 export const TOOLS_BY_NAME: Record<string, ToolDef<unknown>> = Object.fromEntries(