// boocode/apps/server/src/services/inference/provider.ts

import { createOpenAICompatible } from '@ai-sdk/openai-compatible';
import type { LanguageModel } from 'ai';

// v1.13.1-A: AI SDK provider against llama-swap. baseURL is threaded from
// config.LLAMA_SWAP_URL at call time (not module-load) so tests can stub the
// upstream without touching env vars. No apiKey — llama-swap is unauth in our
// Tailscale topology and exposing it over the public internet is gated by
// Authelia at the Caddy layer, not by API keys.

// Provider instances memoized by canonical base URL (the output of
// normalizeBaseURL), so every spelling of the same upstream — with or without
// a trailing slash, with or without the `/v1` suffix — shares one provider.
const cache = new Map<string, ReturnType<typeof createOpenAICompatible>>();

/**
 * Canonicalizes an upstream URL: strips any trailing slashes, then appends
 * `/v1` unless the URL already ends with it.
 *
 * Without the strip, `http://host/` turned into `http://host//v1` and
 * `http://host/v1/` turned into `http://host/v1//v1`.
 *
 * @param baseURL Upstream URL as supplied by the caller.
 * @returns The URL ending in exactly one `/v1` segment and no trailing slash.
 */
function normalizeBaseURL(baseURL: string): string {
  const withoutTrailingSlash = baseURL.replace(/\/+$/, '');
  return withoutTrailingSlash.endsWith('/v1')
    ? withoutTrailingSlash
    : `${withoutTrailingSlash}/v1`;
}

/**
 * Returns the llama-swap provider for `baseURL`, creating it on first use and
 * reusing the cached instance on every later call for the same upstream.
 *
 * @param baseURL Upstream URL; the `/v1` suffix and trailing slashes are optional.
 * @returns The provider bound to the canonical form of `baseURL`.
 */
function getProvider(baseURL: string): ReturnType<typeof createOpenAICompatible> {
  // Key the cache by the canonical URL, not the raw argument, so equivalent
  // spellings do not each allocate their own provider.
  const normalized = normalizeBaseURL(baseURL);
  let provider = cache.get(normalized);
  if (provider === undefined) {
    provider = createOpenAICompatible({
      name: 'llama-swap',
      baseURL: normalized,
      // v1.13.7: @ai-sdk/openai-compatible defaults includeUsage=false, which
      // omits `stream_options.include_usage` from the request body. Without
      // it, llama.cpp / llama-swap never emits the trailing usage block, so
      // `result.usage` resolves with inputTokens=outputTokens=undefined and
      // tokens_used / ctx_used land as NULL in every messages row. Setting
      // true here re-enables the per-stream usage payload across all models
      // served via the llama-swap provider.
      includeUsage: true,
    });
    cache.set(normalized, provider);
  }
  return provider;
}

/**
 * Looks up the provider that `getProvider` returns for `baseURL` and asks it
 * for the chat model identified by `modelId`.
 *
 * @param baseURL Upstream URL, forwarded unchanged to `getProvider`.
 * @param modelId Model identifier, forwarded unchanged to `chatModel`.
 * @returns Whatever the provider's `chatModel(modelId)` call returns.
 */
export function upstreamModel(baseURL: string, modelId: string): LanguageModel {
  const provider = getProvider(baseURL);
  return provider.chatModel(modelId);
}