From a8e475fdf4150baaec5d326582e1115af80cef26 Mon Sep 17 00:00:00 2001 From: indifferentketchup Date: Sun, 7 Jun 2026 22:40:23 +0000 Subject: [PATCH] perf(llama): unshadow cache-type + spec-decoding flags for agent opt-in KV cache quantization (--cache-type-k q4_0) and ngram speculative decoding (--spec-type ngram-mod) are high-value llama.cpp features that improve VRAM usage and tokens/sec. Removing them from the shadowing lists allows agents to enable them via llama_extra_args. --- .../__tests__/llama-args-validator.test.ts | 8 +++---- .../inference/llama-args-validator.ts | 23 +++++-------------- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/apps/server/src/services/__tests__/llama-args-validator.test.ts b/apps/server/src/services/__tests__/llama-args-validator.test.ts index b1c2792..3794198 100644 --- a/apps/server/src/services/__tests__/llama-args-validator.test.ts +++ b/apps/server/src/services/__tests__/llama-args-validator.test.ts @@ -112,14 +112,14 @@ describe('stripShadowingFlags', () => { expect(result).toEqual(['-c', '4096']); }); - it('strips cache flags by default', () => { + it('passes through cache flags (no longer shadowed)', () => { const result = stripShadowingFlags(['--cache-type-k', 'q8_0']); - expect(result).toEqual([]); + expect(result).toEqual(['--cache-type-k', 'q8_0']); }); - it('strips spec flags by default', () => { + it('passes through spec flags (no longer shadowed)', () => { const result = stripShadowingFlags(['--spec-draft-n-max', '16']); - expect(result).toEqual([]); + expect(result).toEqual(['--spec-draft-n-max', '16']); }); }); diff --git a/apps/server/src/services/inference/llama-args-validator.ts b/apps/server/src/services/inference/llama-args-validator.ts index 78bd86f..127c408 100644 --- a/apps/server/src/services/inference/llama-args-validator.ts +++ b/apps/server/src/services/inference/llama-args-validator.ts @@ -131,23 +131,13 @@ export function isManagedFlag(flag: string): boolean { const SHADOW_CONTEXT = 
['-c', '--ctx-size']; -const SHADOW_CACHE = ['-ctk', '--cache-type-k', '-ctv', '--cache-type-v']; +// Empty: agents should be able to opt into cache-type flags (lift analysis +// found these are high-value features, not safety concerns). +const SHADOW_CACHE: string[] = []; -const SHADOW_SPEC = [ - '--spec-default', - '--spec-type', - '--spec-ngram-size-n', - '--spec-ngram-size', - '--draft-min', - '--draft-max', - '--spec-draft-n-max', - '--spec-draft-n-min', - '--spec-draft-p-min', - '--spec-draft-p-split', - '--spec-ngram-mod-n-match', - '--spec-ngram-mod-n-min', - '--spec-ngram-mod-n-max', -]; +// Empty: ngram speculative decoding is a performance feature agents should +// be able to enable. +const SHADOW_SPEC: string[] = []; const SHADOW_TEMPLATE = [ '--chat-template', @@ -160,7 +150,6 @@ const SHADOW_TEMPLATE = [ // Shadowing flags that take no value — a boolean switch — so the stripper must // not also drop the following token. const VALUELESS_SHADOW_FLAGS: ReadonlySet<string> = new Set([ - '--spec-default', '--jinja', '--no-jinja', ]);