perf(llama): unshadow cache-type + spec-decoding flags for agent opt-in
KV cache quantization (--cache-type-k q4_0) and n-gram speculative decoding (--spec-type ngram-mod) are high-value llama.cpp features that reduce VRAM usage and increase tokens/sec. Removing these flags from the shadowing lists lets agents opt in to them via llama_extra_args.
This commit is contained in:
@@ -112,14 +112,14 @@ describe('stripShadowingFlags', () => {
|
||||
expect(result).toEqual(['-c', '4096']);
|
||||
});
|
||||
|
||||
it('strips cache flags by default', () => {
|
||||
it('passes through cache flags (no longer shadowed)', () => {
|
||||
const result = stripShadowingFlags(['--cache-type-k', 'q8_0']);
|
||||
expect(result).toEqual([]);
|
||||
expect(result).toEqual(['--cache-type-k', 'q8_0']);
|
||||
});
|
||||
|
||||
it('strips spec flags by default', () => {
|
||||
it('passes through spec flags (no longer shadowed)', () => {
|
||||
const result = stripShadowingFlags(['--spec-draft-n-max', '16']);
|
||||
expect(result).toEqual([]);
|
||||
expect(result).toEqual(['--spec-draft-n-max', '16']);
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
Reference in New Issue
Block a user