feat: sampling knobs + live PTY stream-json + token UI (v2.7.3)

Three small wins from boocode_code_review_v2 §1 #11/#7/#8. #11 sampling knobs: top_n_sigma + dry_* family as first-class Agent fields, threaded into the request body via providerOptions.openaiCompatible. Fixes a latent bug — top_k (rejected by the AI-SDK provider) and min_p (never passed to streamText) were dead on the wire; both now route through the same channel. --reasoning-budget documented in data/AGENTS.md. #7 live PTY stream-json: new stream-json-parser.ts line-buffers qwen/claude NDJSON and emits text/reasoning/tool frames live + persists, with a fallback to the old opaque slice. claude gets --output-format stream-json --verbose. #8 token UI: agent_sessions input/output_tokens/cost now flow through the route + type and render beside the AgentComposerBar session chip. Built by 3 parallel agents. Server 523 + coder 245 tests passing; builds + web tsc clean. Builds on v2.7.2. openspec sampling-streamjson-tokens. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-01 12:47:17 +00:00
parent 5651f56039
commit a584dd16b0
15 changed files with 945 additions and 22 deletions
--- a/apps/server/src/services/tests/agents.test.ts
+++ b/apps/server/src/services/tests/agents.test.ts
@@ -1,4 +1,4 @@
-import { describe, it, expect } from 'vitest';
+import { describe, it, expect, vi, afterEach } from 'vitest';
 import { isAgentRegistryMarkdown, parseAgentsMd } from '../agents.js';

 describe('isAgentRegistryMarkdown', () => {
@@ -31,3 +31,87 @@ Start here
    expect(r.errors.length).toBeGreaterThan(0);
  });
 });
+
+// v2.6 sampling-streamjson-tokens (#11): per-agent llama.cpp sampler extensions (top_n_sigma + dry_*) as parsed by parseAgentsMd.
+describe('parseAgentsMd: v2.6 sampling knobs', () => {
+  afterEach(() => {
+    vi.restoreAllMocks();
+  });
+
+  const withFrontmatter = (lines: string) => `# Agents
+
+## Sampler
+---
+temperature: 0.6
+${lines}
+tools: [view_file]
+description: test
+---
+You sample.
+`;
+
+  it('parses top_n_sigma and the dry_* family from frontmatter', () => {
+    const md = withFrontmatter(
+      [
+        'top_n_sigma: 1.5',
+        'dry_multiplier: 0.8',
+        'dry_base: 1.75',
+        'dry_allowed_length: 2',
+        'dry_penalty_last_n: -1',
+      ].join('\n'),
+    );
+    const { agents, errors } = parseAgentsMd(md);
+    expect(errors).toHaveLength(0);
+    expect(agents).toHaveLength(1);
+    const a = agents[0]!;
+    expect(a.top_n_sigma).toBe(1.5);
+    expect(a.dry_multiplier).toBe(0.8);
+    expect(a.dry_base).toBe(1.75);
+    expect(a.dry_allowed_length).toBe(2);
+    expect(a.dry_penalty_last_n).toBe(-1);
+  });
+
+  it('defaults the new sampler fields to null when omitted', () => {
+    const { agents } = parseAgentsMd(withFrontmatter('top_p: 0.95'));
+    const a = agents[0]!;
+    expect(a.top_n_sigma).toBeNull();
+    expect(a.dry_multiplier).toBeNull();
+    expect(a.dry_base).toBeNull();
+    expect(a.dry_allowed_length).toBeNull();
+    expect(a.dry_penalty_last_n).toBeNull();
+  });
+
+  it('warns (does not error) on out-of-range top_n_sigma / dry_* values', () => {
+    const warn = vi.spyOn(console, 'warn').mockImplementation(() => {});
+    const md = withFrontmatter(
+      [
+        'top_n_sigma: -1',
+        'dry_multiplier: -0.5',
+        'dry_base: -2',
+        'dry_allowed_length: -3',
+        'dry_penalty_last_n: -5',
+      ].join('\n'),
+    );
+    const { agents, errors } = parseAgentsMd(md);
+    expect(errors).toHaveLength(0);
+    expect(agents).toHaveLength(1);
+    // Mirrors top_k/min_p: out-of-range is a warning, not an error. Only the warnings are asserted below, not the stored values.
+    expect(warn).toHaveBeenCalled();
+    const warnings = warn.mock.calls.map((c) => String(c[0])).join('\n');
+    expect(warnings).toContain('top_n_sigma');
+    expect(warnings).toContain('dry_multiplier');
+    expect(warnings).toContain('dry_base');
+    expect(warnings).toContain('dry_allowed_length');
+    expect(warnings).toContain('dry_penalty_last_n');
+  });
+
+  it('errors on non-numeric / non-integer sampler values', () => {
+    const md = withFrontmatter(
+      ['top_n_sigma: high', 'dry_allowed_length: 2.5'].join('\n'),
+    );
+    const { errors } = parseAgentsMd(md);
+    const joined = errors.map((e) => e.reason).join('\n');
+    expect(joined).toContain('top_n_sigma must be a number');
+    expect(joined).toContain('dry_allowed_length must be an integer');
+  });
+});
--- a/apps/server/src/services/agents.ts
+++ b/apps/server/src/services/agents.ts
@@ -88,6 +88,12 @@ interface ParsedFrontmatter {
  top_k?: number;
  min_p?: number;
  presence_penalty?: number;
+  // v2.6 sampling-streamjson-tokens (#11): llama.cpp sampler extensions.
+  top_n_sigma?: number;
+  dry_multiplier?: number;
+  dry_base?: number;
+  dry_allowed_length?: number;
+  dry_penalty_last_n?: number;
  tools?: string[];
  description?: string;
  model?: string;
@@ -178,6 +184,63 @@ function parseFrontmatter(yaml: string): { data: ParsedFrontmatter; errors: stri
      } else {
        errors.push(`presence_penalty must be a number (got "${valueRaw}")`);
      }
+    } else if (key === 'top_n_sigma') {
+      // v2.6 #11: llama.cpp top-n-sigma sampler. Float ≥ 0 (typical 0-3).
+      // Mirrors top_p/min_p: out-of-range is stored and only warned about; non-numeric
+      // hard-fails the block. Number('') is 0, hence the explicit empty check (also below).
+      const n = Number(valueRaw);
+      if (valueRaw.trim() !== '' && Number.isFinite(n)) {
+        data.top_n_sigma = n;
+        if (n < 0) {
+          console.warn(`agents: top_n_sigma ${n} out of range (≥0); keeping the configured value`);
+        }
+      } else {
+        errors.push(`top_n_sigma must be a number (got "${valueRaw}")`);
+      }
+    } else if (key === 'dry_multiplier') {
+      // v2.6 #11: DRY repetition-penalty multiplier. Float ≥ 0 (0 disables DRY).
+      const n = Number(valueRaw);
+      if (valueRaw.trim() !== '' && Number.isFinite(n)) {
+        data.dry_multiplier = n;
+        if (n < 0) {
+          console.warn(`agents: dry_multiplier ${n} out of range (≥0); keeping the configured value`);
+        }
+      } else {
+        errors.push(`dry_multiplier must be a number (got "${valueRaw}")`);
+      }
+    } else if (key === 'dry_base') {
+      // v2.6 #11: DRY penalty growth base. Float ≥ 0.
+      const n = Number(valueRaw);
+      if (valueRaw.trim() !== '' && Number.isFinite(n)) {
+        data.dry_base = n;
+        if (n < 0) {
+          console.warn(`agents: dry_base ${n} out of range (≥0); keeping the configured value`);
+        }
+      } else {
+        errors.push(`dry_base must be a number (got "${valueRaw}")`);
+      }
+    } else if (key === 'dry_allowed_length') {
+      // v2.6 #11: DRY max sequence length not penalized. Integer ≥ 0.
+      const n = Number(valueRaw);
+      if (valueRaw.trim() !== '' && Number.isInteger(n)) {
+        data.dry_allowed_length = n;
+        if (n < 0) {
+          console.warn(`agents: dry_allowed_length ${n} out of range (≥0); keeping the configured value`);
+        }
+      } else {
+        errors.push(`dry_allowed_length must be an integer (got "${valueRaw}")`);
+      }
+    } else if (key === 'dry_penalty_last_n') {
+      // v2.6 #11: DRY lookback window. Integer ≥ -1 (-1 = whole context, 0 = off).
+      const n = Number(valueRaw);
+      if (valueRaw.trim() !== '' && Number.isInteger(n)) {
+        data.dry_penalty_last_n = n;
+        if (n < -1) {
+          console.warn(`agents: dry_penalty_last_n ${n} out of range (≥-1); keeping the configured value`);
+        }
+      } else {
+        errors.push(`dry_penalty_last_n must be an integer (got "${valueRaw}")`);
+      }
    } else if (key === 'tools') {
      if (valueRaw === '') {
        data.tools = [];
@@ -354,6 +417,11 @@ function parseAgentSection(section: RawSection): Omit<Agent, 'source'> {
    top_k: typeof fm.top_k === 'number' ? fm.top_k : null,
    min_p: typeof fm.min_p === 'number' ? fm.min_p : null,
    presence_penalty: typeof fm.presence_penalty === 'number' ? fm.presence_penalty : null,
+    top_n_sigma: typeof fm.top_n_sigma === 'number' ? fm.top_n_sigma : null,
+    dry_multiplier: typeof fm.dry_multiplier === 'number' ? fm.dry_multiplier : null,
+    dry_base: typeof fm.dry_base === 'number' ? fm.dry_base : null,
+    dry_allowed_length: typeof fm.dry_allowed_length === 'number' ? fm.dry_allowed_length : null,
+    dry_penalty_last_n: typeof fm.dry_penalty_last_n === 'number' ? fm.dry_penalty_last_n : null,
    tools: filteredTools,
    model: typeof fm.model === 'string' && fm.model.length > 0 ? fm.model : null,
    max_tool_calls: typeof fm.max_tool_calls === 'number' ? fm.max_tool_calls : null,
--- a/apps/server/src/services/inference/sentinel-summaries.ts
+++ b/apps/server/src/services/inference/sentinel-summaries.ts
@@ -86,7 +86,7 @@ export async function runCapHitSummary(
      ctx,
      session.model,
      messages,
-      { tools: null, temperature: agent?.temperature, top_p: agent?.top_p ?? undefined, top_k: agent?.top_k ?? undefined, min_p: agent?.min_p ?? undefined, presence_penalty: agent?.presence_penalty ?? undefined },
+      { tools: null, temperature: agent?.temperature, top_p: agent?.top_p ?? undefined, top_k: agent?.top_k ?? undefined, min_p: agent?.min_p ?? undefined, presence_penalty: agent?.presence_penalty ?? undefined, top_n_sigma: agent?.top_n_sigma ?? undefined, dry_multiplier: agent?.dry_multiplier ?? undefined, dry_base: agent?.dry_base ?? undefined, dry_allowed_length: agent?.dry_allowed_length ?? undefined, dry_penalty_last_n: agent?.dry_penalty_last_n ?? undefined },
      (delta) => {
        accumulated += delta;
        ctx.publish(sessionId, {
@@ -346,7 +346,7 @@ export async function runDoomLoopSummary(
      ctx,
      session.model,
      messages,
-      { tools: null, temperature: agent?.temperature, top_p: agent?.top_p ?? undefined, top_k: agent?.top_k ?? undefined, min_p: agent?.min_p ?? undefined, presence_penalty: agent?.presence_penalty ?? undefined },
+      { tools: null, temperature: agent?.temperature, top_p: agent?.top_p ?? undefined, top_k: agent?.top_k ?? undefined, min_p: agent?.min_p ?? undefined, presence_penalty: agent?.presence_penalty ?? undefined, top_n_sigma: agent?.top_n_sigma ?? undefined, dry_multiplier: agent?.dry_multiplier ?? undefined, dry_base: agent?.dry_base ?? undefined, dry_allowed_length: agent?.dry_allowed_length ?? undefined, dry_penalty_last_n: agent?.dry_penalty_last_n ?? undefined },
      (delta) => {
        accumulated += delta;
        ctx.publish(sessionId, {
@@ -545,7 +545,7 @@ export async function runStepCapSummary(
      ctx,
      session.model,
      messages,
-      { tools: null, temperature: agent?.temperature, top_p: agent?.top_p ?? undefined, top_k: agent?.top_k ?? undefined, min_p: agent?.min_p ?? undefined, presence_penalty: agent?.presence_penalty ?? undefined },
+      { tools: null, temperature: agent?.temperature, top_p: agent?.top_p ?? undefined, top_k: agent?.top_k ?? undefined, min_p: agent?.min_p ?? undefined, presence_penalty: agent?.presence_penalty ?? undefined, top_n_sigma: agent?.top_n_sigma ?? undefined, dry_multiplier: agent?.dry_multiplier ?? undefined, dry_base: agent?.dry_base ?? undefined, dry_allowed_length: agent?.dry_allowed_length ?? undefined, dry_penalty_last_n: agent?.dry_penalty_last_n ?? undefined },
      (delta) => {
        accumulated += delta;
        ctx.publish(sessionId, {
--- a/apps/server/src/services/inference/stream-phase.ts
+++ b/apps/server/src/services/inference/stream-phase.ts
@@ -33,6 +33,39 @@ interface StreamOptions {
  top_k?: number | null;
  min_p?: number | null;
  presence_penalty?: number | null;
+  // v2.6 sampling-streamjson-tokens (#11): llama.cpp sampler extensions. These
+  // are NOT standard AI-SDK streamText options and are NOT serialized by the
+  // openai-compatible provider's standardized-settings path (topK is even
+  // explicitly dropped with an "unsupported feature: topK" warning). They reach
+  // llama-server only via providerOptions.openaiCompatible (see buildSamplerProviderOptions).
+  top_n_sigma?: number | null;
+  dry_multiplier?: number | null;
+  dry_base?: number | null;
+  dry_allowed_length?: number | null;
+  dry_penalty_last_n?: number | null;
+}
+
+// v2.6 #11: build the providerOptions.openaiCompatible extraBody object for the
+// llama.cpp sampler extensions. @ai-sdk/openai-compatible (2.0.47) merges every
+// non-reserved key under providerOptions.openaiCompatible straight into the
+// chat-completion request body (see its getArgs: the Object.fromEntries spread
+// filtered against openaiCompatibleLanguageModelChatOptions.shape). This is the
+// ONLY working passthrough for these params:
+//   - top_k / min_p were latently dropped before this: top_k was passed as the
+//     AI-SDK `topK` setting which the openai-compatible provider rejects as
+//     unsupported; min_p was never passed to streamText at all.
+//   - top_n_sigma + the dry_* family have no AI-SDK equivalent.
+// Keys use llama-server's snake_case body names so they land verbatim. Returns
+// undefined when no knob is set, so the caller can leave providerOptions out.
+function buildSamplerProviderOptions(opts: StreamOptions): Record<string, number> | undefined {
+  const keys = ['top_k', 'min_p', 'top_n_sigma', 'dry_multiplier', 'dry_base', 'dry_allowed_length', 'dry_penalty_last_n'] as const;
+  const body: Record<string, number> = {};
+  for (const key of keys) {
+    const value = opts[key];
+    // typeof alone admits NaN/±Infinity, which JSON.stringify would send as `null`.
+    if (typeof value === 'number' && Number.isFinite(value)) body[key] = value;
+  }
+  return Object.keys(body).length > 0 ? body : undefined;
 }

 // v1.13.1-A: convert BooCode's OpenAI-shaped history into AI SDK
@@ -195,6 +228,14 @@ export async function streamCompletion(
    return toolCall;
  };

+  // v2.6 #11: llama.cpp sampler extensions (top_k, min_p, top_n_sigma, dry_*)
+  // ride providerOptions.openaiCompatible — they are NOT standardized streamText
+  // settings. NB: top_k used to be passed below as the AI-SDK `topK` setting;
+  // the openai-compatible provider dropped it with an "unsupported feature: topK"
+  // warning and min_p was never wired at all, so both were dead on the wire
+  // before this. They now go through the same extraBody path as the new params.
+  const samplerBody = buildSamplerProviderOptions(opts);
+
  const result = streamText({
    model: upstreamModel(ctx.config, model, agent ?? null),
    messages: aiMessages,
@@ -203,8 +244,8 @@ export async function streamCompletion(
      : {}),
    ...(typeof opts.temperature === 'number' ? { temperature: opts.temperature } : {}),
    ...(typeof opts.top_p === 'number' ? { topP: opts.top_p } : {}),
-    ...(typeof opts.top_k === 'number' ? { topK: opts.top_k } : {}),
    ...(typeof opts.presence_penalty === 'number' ? { presencePenalty: opts.presence_penalty } : {}),
+    ...(samplerBody ? { providerOptions: { openaiCompatible: samplerBody } } : {}),
    abortSignal: signal,
  });

@@ -398,6 +439,12 @@ export async function executeStreamPhase(
  const effectiveTopK = agent?.top_k ?? undefined;
  const effectiveMinP = agent?.min_p ?? undefined;
  const effectivePresencePenalty = agent?.presence_penalty ?? undefined;
+  // v2.6 #11: llama.cpp sampler extensions, threaded the same way as top_k/min_p.
+  const effectiveTopNSigma = agent?.top_n_sigma ?? undefined;
+  const effectiveDryMultiplier = agent?.dry_multiplier ?? undefined;
+  const effectiveDryBase = agent?.dry_base ?? undefined;
+  const effectiveDryAllowedLength = agent?.dry_allowed_length ?? undefined;
+  const effectiveDryPenaltyLastN = agent?.dry_penalty_last_n ?? undefined;

  // v1.12.2: ctx_max lookup is cached after the first hit per model, so this
  // is a Map probe in steady state. We capture nCtx once at the top of the
@@ -435,7 +482,19 @@ export async function executeStreamPhase(
      ctx,
      session.model,
      messages,
-      { tools: effectiveTools, temperature: effectiveTemperature, top_p: effectiveTopP, top_k: effectiveTopK, min_p: effectiveMinP, presence_penalty: effectivePresencePenalty },
+      {
+        tools: effectiveTools,
+        temperature: effectiveTemperature,
+        top_p: effectiveTopP,
+        top_k: effectiveTopK,
+        min_p: effectiveMinP,
+        presence_penalty: effectivePresencePenalty,
+        top_n_sigma: effectiveTopNSigma,
+        dry_multiplier: effectiveDryMultiplier,
+        dry_base: effectiveDryBase,
+        dry_allowed_length: effectiveDryAllowedLength,
+        dry_penalty_last_n: effectiveDryPenaltyLastN,
+      },
      (delta) => {
        state.accumulated += delta;
        ctx.publish(sessionId, {
--- a/apps/server/src/types/api.ts
+++ b/apps/server/src/types/api.ts
@@ -117,6 +117,15 @@ export interface Agent {
  top_k: number | null;  // null means omit from request body
  min_p: number | null;  // null means omit from request body
  presence_penalty: number | null;  // null means omit from request body
+  // v2.6 sampling-streamjson-tokens (#11): llama.cpp sampler extensions.
+  // null = omit from request body. top_n_sigma + the DRY repetition family
+  // help the doom-loop-prone local model. All travel via the same
+  // providerOptions.openaiCompatible extraBody channel as top_k/min_p.
+  top_n_sigma: number | null;
+  dry_multiplier: number | null;
+  dry_base: number | null;
+  dry_allowed_length: number | null;
+  dry_penalty_last_n: number | null;
  tools: string[];       // whitelist of tool names; empty = no tools allowed
  model: string | null;  // null means "session.model wins"
  source: AgentSource;