v2.5.0-task-model: lightweight task model services + tasks table

Task model infrastructure for cheap LLM calls (auto-naming, search rewrite, tags, summaries) via a dedicated llama-server instance at TASK_MODEL_URL. When TASK_MODEL_URL is unset, calls fall back to LLAMA_SWAP_URL using FAST_MODEL (then a caller-supplied fallback model, then DEFAULT_MODEL). Replaces the inline fetch in auto_name.ts with taskModelCompletion. Adds search query rewriting: on step 0, when web tools are enabled, the latest user message is rewritten into a short search-intent hint that is appended to the system prompt to improve web_search relevance. Schema: tasks table for provider dispatch and arena, plus a sessions.tags column.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-27 21:44:39 +00:00
parent bcfc94fa47
commit fcc7c5a86e
8 changed files with 194 additions and 54 deletions
--- a/apps/server/src/services/auto_name.ts
+++ b/apps/server/src/services/auto_name.ts
@@ -1,9 +1,10 @@
 import type { InferenceContext } from './inference/index.js';
+import { taskModelCompletion } from './task-model.js';

 const NAMING_SYSTEM_PROMPT =
-  'You name chat sessions based on what the assistant did. Summarize the topic or outcome — do NOT copy the first few words verbatim. Reply directly with no thinking, reasoning, or explanation. Output ONLY the title, 4 words max, no quotes, no punctuation, no prefix like "Title:".';
+  'You name chat sessions. Reply with ONLY the title. 4 to 6 words. No quotes, no punctuation, no prefix.';

-const MAX_TITLE_CHARS = 60;
+const MAX_TITLE_CHARS = 80;

 function cleanTitle(raw: string): string {
  let name = raw.trim();
@@ -18,27 +19,7 @@ function cleanTitle(raw: string): string {
  return name;
 }

-interface NamingResponse {
-  choices?: Array<{
-    message?: {
-      content?: string;
-      reasoning_content?: string;
-    };
-  }>;
-}
-
-function pickTitleSource(data: NamingResponse): string {
-  const choice = data.choices?.[0]?.message;
-  if (!choice) return '';
-  if (choice.content && choice.content.trim().length > 0) return choice.content;
-  const reasoning = choice.reasoning_content ?? '';
-  if (reasoning.length === 0) return '';
-  const lines = reasoning
-    .split('\n')
-    .map((l) => l.trim())
-    .filter((l) => l.length > 0);
-  return lines[lines.length - 1] ?? '';
-}
+// TODO: wire suggestTags after task model validation

 export async function maybeAutoNameChat(
  ctx: InferenceContext,
@@ -64,13 +45,6 @@ export async function maybeAutoNameChat(
  if (!chat) return;
  if (chat.name !== null && chat.name !== '') return;

-  const sessionRows = await ctx.sql<{ model: string }[]>`
-    SELECT model FROM sessions WHERE id = ${sessionId}
-  `;
-  // v2.0.5: prefer FAST_MODEL for cheap LLM calls (titles, summaries).
-  const model = ctx.config.FAST_MODEL ?? sessionRows[0]?.model;
-  if (!model) return;
-
  const assistantMsg = await ctx.sql<{ content: string }[]>`
    SELECT content FROM messages
    WHERE chat_id = ${chatId}
@@ -84,32 +58,12 @@ export async function maybeAutoNameChat(

  const assistantText = assistantMsg[0].content.slice(0, 2000);

-  const body = {
-    model,
-    messages: [
-      { role: 'system', content: NAMING_SYSTEM_PROMPT },
-      {
-        role: 'user',
-        content: assistantText,
-      },
-    ],
-    max_tokens: 30,
+  const raw = await taskModelCompletion({
+    system: NAMING_SYSTEM_PROMPT,
+    user: assistantText,
+    maxTokens: 30,
    temperature: 0.3,
-    stream: false,
-    chat_template_kwargs: { enable_thinking: false },
-  };
-
-  const res = await fetch(`${ctx.config.LLAMA_SWAP_URL}/v1/chat/completions`, {
-    method: 'POST',
-    headers: { 'Content-Type': 'application/json' },
-    body: JSON.stringify(body),
  });
-  if (!res.ok) {
-    const text = await res.text().catch(() => '');
-    throw new Error(`naming request failed: ${res.status} ${text.slice(0, 200)}`);
-  }
-  const data = (await res.json()) as NamingResponse;
-  const raw = pickTitleSource(data);
  const name = cleanTitle(raw);
  if (!name) {
    ctx.log.warn({ chatId, raw }, 'auto-name: empty title from model');
--- a/apps/server/src/services/inference/turn.ts
+++ b/apps/server/src/services/inference/turn.ts
@@ -14,6 +14,7 @@ import type {
 import { ALL_TOOLS } from '../tools.js';
 import { resolveProjectRoot } from '../path_guard.js';
 import { maybeAutoNameChat } from '../auto_name.js';
+import { rewriteSearchQuery } from '../task-search-rewrite.js';
 import { getAgentById } from '../agents.js';
 import * as compaction from '../compaction.js';
 import type { Broker } from '../broker.js';
@@ -254,6 +255,16 @@ export async function runAssistantTurn(
    const webToolsEnabled =
      iterSession.web_search_enabled ?? iterProject.default_web_search_enabled ?? false;

+    if (stepNumber === 0 && webToolsEnabled && messages.length >= 2) {
+      const lastUserMsg = [...messages].reverse().find((m) => m.role === 'user');
+      if (lastUserMsg?.content) {
+        const hint = await rewriteSearchQuery(lastUserMsg.content);
+        if (hint && messages[0]?.role === 'system' && messages[0].content) {
+          messages[0].content += `\n\nThe user's search intent can be summarized as: "${hint}"`;
+        }
+      }
+    }
+
    const iterArgs: TurnArgs = { sessionId, chatId, assistantMessageId, toolsUsed, recentToolCalls, signal };
    const state: StreamPhaseState = { accumulated: '', startedAt: null };
    let result: StreamResult;
--- a/apps/server/src/services/task-model.ts
+++ b/apps/server/src/services/task-model.ts
@@ -0,0 +1,108 @@
+import { loadConfig, type Config } from '../config.js';
+
+const TIMEOUT_MS = 10_000;
+
+/** Model name requested when TASK_MODEL_URL is configured. */
+const TASK_MODEL_NAME = 'gemma-3-270m-it';
+
+/** The parts of a chat-completions response body that this module reads. */
+interface CompletionResponse {
+  choices?: Array<{
+    message?: { content?: string | null; reasoning_content?: string | null };
+  }>;
+}
+
+/**
+ * Sends one non-streaming system + user chat completion to the task model and
+ * returns the reply text.
+ *
+ * The request is aborted after TIMEOUT_MS. A non-2xx response, a network or
+ * timeout error, and an unparsable body are all logged with console.warn and
+ * reported as an empty string, so callers only need to check for ''.
+ *
+ * @param opts.system - System prompt.
+ * @param opts.user - User message content.
+ * @param opts.maxTokens - Sent as max_tokens; defaults to 30.
+ * @param opts.temperature - Sampling temperature; defaults to 0.3.
+ * @param opts.fallbackModel - Model used on the LLAMA_SWAP_URL path when
+ *   FAST_MODEL is not set.
+ * @returns The reply text, or '' when the request fails or yields no text.
+ */
+export async function taskModelCompletion(opts: {
+  system: string;
+  user: string;
+  maxTokens?: number;
+  temperature?: number;
+  fallbackModel?: string;
+}): Promise<string> {
+  const config = loadConfig();
+  const { url, model } = resolveEndpoint(config, opts.fallbackModel);
+
+  try {
+    const res = await fetch(`${url}/v1/chat/completions`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({
+        model,
+        messages: [
+          { role: 'system', content: opts.system },
+          { role: 'user', content: opts.user },
+        ],
+        max_tokens: opts.maxTokens ?? 30,
+        temperature: opts.temperature ?? 0.3,
+        stream: false,
+        chat_template_kwargs: { enable_thinking: false },
+      }),
+      signal: AbortSignal.timeout(TIMEOUT_MS),
+    });
+    if (!res.ok) {
+      const text = await res.text().catch(() => '');
+      console.warn(`task-model: ${res.status} ${text.slice(0, 200)}`);
+      return '';
+    }
+    return pickCompletionText((await res.json()) as CompletionResponse);
+  } catch (err: unknown) {
+    console.warn('task-model: request failed', err);
+    return '';
+  }
+}
+
+/**
+ * Extracts the reply from a completion response: the trimmed message content
+ * when non-empty, otherwise the last non-blank line of reasoning_content,
+ * otherwise ''.
+ */
+function pickCompletionText(data: CompletionResponse): string {
+  const message = data.choices?.[0]?.message;
+  if (!message) return '';
+  const content = (message.content ?? '').trim();
+  if (content.length > 0) return content;
+  const lines = (message.reasoning_content ?? '')
+    .split('\n')
+    .map((l) => l.trim())
+    .filter((l) => l.length > 0);
+  return lines[lines.length - 1] ?? '';
+}
+
+/** Removes trailing slashes so `${url}/v1/...` never contains a double slash. */
+function stripTrailingSlashes(url: string): string {
+  return url.replace(/\/+$/, '');
+}
+
+/**
+ * Chooses the server and model for a task call.
+ *
+ * With TASK_MODEL_URL set, that server is used with TASK_MODEL_NAME. Otherwise
+ * the request goes to LLAMA_SWAP_URL with the first defined of FAST_MODEL,
+ * fallbackModel and DEFAULT_MODEL.
+ */
+function resolveEndpoint(
+  config: Config,
+  fallbackModel?: string,
+): { url: string; model: string } {
+  if (config.TASK_MODEL_URL) {
+    return { url: stripTrailingSlashes(config.TASK_MODEL_URL), model: TASK_MODEL_NAME };
+  }
+  const model = config.FAST_MODEL ?? fallbackModel ?? config.DEFAULT_MODEL;
+  return { url: stripTrailingSlashes(config.LLAMA_SWAP_URL), model };
+}
--- a/apps/server/src/services/task-search-rewrite.ts
+++ b/apps/server/src/services/task-search-rewrite.ts
@@ -0,0 +1,49 @@
+import { taskModelCompletion } from './task-model.js';
+
+const SYSTEM_PROMPT =
+  'You rewrite user messages into concise web search queries. Reply with ONLY the search query. 3 to 6 words. No quotes, no explanation.';
+
+const MAX_INPUT_CHARS = 500;
+const FALLBACK_CHARS = 60;
+/** Upper bound on the length of any returned query. */
+const MAX_QUERY_CHARS = 120;
+
+/**
+ * Normalizes a candidate query to a single line: whitespace runs (including
+ * newlines) become one space, wrapping quotes/backticks are removed, and the
+ * result is capped at MAX_QUERY_CHARS.
+ */
+function tidyQuery(raw: string): string {
+  return raw
+    .replace(/\s+/g, ' ')
+    .trim()
+    .replace(/^["'`]+|["'`]+$/g, '')
+    .trim()
+    .slice(0, MAX_QUERY_CHARS);
+}
+
+/**
+ * Rewrites a user message into a short web search query using the task model.
+ *
+ * Only the first MAX_INPUT_CHARS characters of the trimmed message are sent.
+ * If the model yields nothing usable, the first FALLBACK_CHARS characters of
+ * the trimmed message are normalized and returned instead.
+ *
+ * @param userMessage - Raw user message text.
+ * @returns A single-line query, or '' when userMessage is blank.
+ */
+export async function rewriteSearchQuery(userMessage: string): Promise<string> {
+  const input = userMessage.trim().slice(0, MAX_INPUT_CHARS);
+  // A blank message has no intent to extract; skip the model round-trip.
+  if (input.length === 0) return '';
+  const result = tidyQuery(
+    await taskModelCompletion({
+      system: SYSTEM_PROMPT,
+      user: input,
+      maxTokens: 20,
+      temperature: 0.2,
+    }),
+  );
+  if (result.length > 0) return result;
+  return tidyQuery(input.slice(0, FALLBACK_CHARS));
+}
--- a/apps/server/src/services/task-summary.ts
+++ b/apps/server/src/services/task-summary.ts
@@ -0,0 +1,48 @@
+import { taskModelCompletion } from './task-model.js';
+
+const SYSTEM_PROMPT =
+  'Summarize this conversation in one sentence, 15 words max. No quotes, no prefix.';
+
+const MAX_INPUT_CHARS = 1000;
+
+/** How many trailing messages are included in the summary input. */
+const MAX_MESSAGES = 6;
+
+/**
+ * Summarizes the tail of a conversation in one sentence via the task model.
+ *
+ * The last MAX_MESSAGES messages are rendered as "role: content" lines. When
+ * that text exceeds MAX_INPUT_CHARS, each message's content is cut to an equal
+ * share of the budget so a single long message cannot push the most recent
+ * ones out of the input.
+ *
+ * @param messages - Conversation messages; only the last MAX_MESSAGES are
+ *   used (assumed to be the most recent).
+ * @returns The model's summary, or '' when there is no message text to
+ *   summarize or the task model returns nothing.
+ */
+export async function oneLineSummary(
+  messages: Array<{ role: string; content: string }>,
+): Promise<string> {
+  const recent = messages.slice(-MAX_MESSAGES);
+  // Nothing to summarize: skip the model call instead of prompting it with blanks.
+  if (recent.every((m) => m.content.trim().length === 0)) return '';
+
+  const render = (limit: number): string =>
+    recent.map((m) => `${m.role}: ${m.content.slice(0, limit)}`).join('\n');
+
+  let input = render(Infinity);
+  if (input.length > MAX_INPUT_CHARS) {
+    // Characters spent on "role: " prefixes and the newlines between lines.
+    const overhead = recent.reduce((sum, m) => sum + m.role.length + 2, recent.length - 1);
+    const share = Math.max(0, Math.floor((MAX_INPUT_CHARS - overhead) / recent.length));
+    // The trailing slice only matters if the prefixes alone exceed the budget.
+    input = render(share).slice(0, MAX_INPUT_CHARS);
+  }
+  return taskModelCompletion({
+    system: SYSTEM_PROMPT,
+    user: input,
+    maxTokens: 30,
+    temperature: 0.3,
+  });
+}
--- a/apps/server/src/services/task-tags.ts
+++ b/apps/server/src/services/task-tags.ts
@@ -0,0 +1,44 @@
+import { taskModelCompletion } from './task-model.js';
+
+const SYSTEM_PROMPT =
+  'You tag chat sessions. Reply with 1 to 3 lowercase tags separated by commas. Tags should describe the topic. No explanation. Examples: "docker, deployment", "python, debugging", "react, styling".';
+
+/** Upper bound on returned tags; mirrors the "1 to 3" limit stated in SYSTEM_PROMPT. */
+const MAX_TAGS = 3;
+
+/** Tags longer than this many characters are discarded. */
+const MAX_TAG_CHARS = 30;
+
+/**
+ * Asks the task model for topic tags describing one user/assistant exchange.
+ *
+ * Each side of the exchange is cut to 300 characters before being sent. The
+ * reply is split on commas and newlines; every piece is trimmed, stripped of
+ * wrapping quotes (the prompt's examples are quoted, so the model may echo
+ * them), lowercased, de-duplicated and capped at MAX_TAGS.
+ *
+ * @param userMessage - The user's message text.
+ * @param assistantReply - The assistant's reply text.
+ * @returns Cleaned tags in model order, or an empty array when the task model
+ *   returns an empty string.
+ */
+export async function suggestTags(
+  userMessage: string,
+  assistantReply: string,
+): Promise<string[]> {
+  const input = `User: ${userMessage.slice(0, 300)}\nAssistant: ${assistantReply.slice(0, 300)}`;
+  const result = await taskModelCompletion({
+    system: SYSTEM_PROMPT,
+    user: input,
+    maxTokens: 30,
+    temperature: 0.3,
+  });
+  if (result.length === 0) return [];
+  const tags = result
+    .split(/[,\n]/)
+    // Drop quotes/backticks and trailing periods the model wraps around tags.
+    .map((t) => t.trim().replace(/^["'`]+|["'`.]+$/g, '').trim().toLowerCase())
+    .filter((t) => t.length > 0 && t.length <= MAX_TAG_CHARS);
+  // Set preserves first-seen order, so the model's ranking survives de-duplication.
+  return [...new Set(tags)].slice(0, MAX_TAGS);
+}