v1.11.3: fix ctx_max capture via /props endpoint
- llama-server does not emit n_ctx in timings (confirmed empirically), so the dead code at inference.ts:479 and compaction.ts:300 never fired
- New model-context.ts: cached fetch of /upstream/<model>/props with a positive cache (no TTL) and a 60s negative cache
- Wired into all 4 ctx_max write sites: 3 in inference.ts (executeToolPhase, finalizeCompletion, runCapHitSummary) and 1 in compaction.ts (summary row INSERT)
- AbortController 3s timeout, lenient parsing with sensible defaults
- 12 new vitest cases for the cache module (59 total)
- 7 historical assistant rows backfilled manually (see notes)
This commit is contained in:
@@ -21,6 +21,7 @@ import type { Sql } from '../db.js';
|
||||
import type { Config } from '../config.js';
|
||||
import type { Broker } from './broker.js';
|
||||
import { SUMMARY_TEMPLATE } from './compaction-prompt.js';
|
||||
import * as modelContextLookup from './model-context.js';
|
||||
|
||||
const COMPACTION_BUFFER = 20_000;
|
||||
const MIN_PRESERVE_RECENT_TOKENS = 2_000;
|
||||
@@ -271,7 +272,6 @@ interface CompletionResult {
|
||||
content: string;
|
||||
promptTokens: number;
|
||||
completionTokens: number;
|
||||
nCtx: number | null;
|
||||
}
|
||||
|
||||
async function callLlamaSwap(
|
||||
@@ -292,14 +292,15 @@ async function callLlamaSwap(
|
||||
const json = (await res.json()) as {
|
||||
choices?: Array<{ message?: { content?: string } }>;
|
||||
usage?: { prompt_tokens?: number; completion_tokens?: number };
|
||||
timings?: { n_ctx?: number };
|
||||
};
|
||||
// v1.11.3: removed the dead `json.timings?.n_ctx` read — llama-server's
|
||||
// completions don't emit n_ctx in timings. ctx_max on the summary row
|
||||
// comes from model-context.getModelContext below in process().
|
||||
const content = json.choices?.[0]?.message?.content ?? '';
|
||||
const promptTokens = json.usage?.prompt_tokens ?? 0;
|
||||
const completionTokens = json.usage?.completion_tokens ?? 0;
|
||||
const nCtx = typeof json.timings?.n_ctx === 'number' ? json.timings.n_ctx : null;
|
||||
log.debug({ promptTokens, completionTokens, nCtx, chars: content.length }, 'compaction llm complete');
|
||||
return { content, promptTokens, completionTokens, nCtx };
|
||||
log.debug({ promptTokens, completionTokens, chars: content.length }, 'compaction llm complete');
|
||||
return { content, promptTokens, completionTokens };
|
||||
}
|
||||
|
||||
// === entry point ===
|
||||
@@ -422,6 +423,12 @@ export async function process(input: ProcessInput): Promise<void> {
|
||||
// 7. Single completion (no tools). Throws on llama-swap failure.
|
||||
result = await callLlamaSwap(config, session.model, payload, log);
|
||||
|
||||
// 7b. v1.11.3: fetch the model's true context window from llama-swap's
|
||||
// /upstream/<model>/props (the streaming completion doesn't carry it).
|
||||
// Same pattern as inference.ts; the cache makes repeated calls free.
|
||||
const mctx = await modelContextLookup.getModelContext(session.model);
|
||||
const nCtx = mctx?.n_ctx ?? null;
|
||||
|
||||
// 8. Insert the new anchored summary row. role='assistant' per spec; the
|
||||
// UI distinguishes via summary=true. tail_start_id points at the first
|
||||
// preserved tail message so debug surfaces / future tools can reason
|
||||
@@ -436,7 +443,7 @@ export async function process(input: ProcessInput): Promise<void> {
|
||||
VALUES (
|
||||
${sessionId}, ${chatId}, 'assistant', ${result.content}, 'message', 'complete',
|
||||
true, ${sel.tail_start_id},
|
||||
${result.completionTokens}, ${result.promptTokens}, ${result.nCtx},
|
||||
${result.completionTokens}, ${result.promptTokens}, ${nCtx},
|
||||
clock_timestamp(), clock_timestamp()
|
||||
)
|
||||
RETURNING id
|
||||
|
||||
Reference in New Issue
Block a user