perf(llama): unshadow cache-type + spec-decoding flags for agent opt-in

KV cache quantization (e.g. --cache-type-k q4_0) and n-gram speculative decoding (e.g. --spec-type ngram-mod) are high-value llama.cpp features: the former reduces VRAM usage and the latter increases tokens/sec. Removing their flags from the shadowing lists lets agents opt in to them via llama_extra_args.
2026-06-07 22:40:23 +00:00
parent 02063072ab
commit a8e475fdf4
2 changed files with 10 additions and 21 deletions
--- a/apps/server/src/services/tests/llama-args-validator.test.ts
+++ b/apps/server/src/services/tests/llama-args-validator.test.ts
@@ -112,14 +112,14 @@ describe('stripShadowingFlags', () => {
    expect(result).toEqual(['-c', '4096']);
  });

-  it('strips cache flags by default', () => {
+  it('passes through cache flags (no longer shadowed)', () => {
    const result = stripShadowingFlags(['--cache-type-k', 'q8_0']);
-    expect(result).toEqual([]);
+    expect(result).toEqual(['--cache-type-k', 'q8_0']);
  });

-  it('strips spec flags by default', () => {
+  it('passes through spec flags (no longer shadowed)', () => {
    const result = stripShadowingFlags(['--spec-draft-n-max', '16']);
-    expect(result).toEqual([]);
+    expect(result).toEqual(['--spec-draft-n-max', '16']);
  });
 });