// boocode/apps/server/src/services/inference/provider.ts

import { createOpenAICompatible } from '@ai-sdk/openai-compatible';
import type { LanguageModel } from 'ai';

// v1.13.1-A: AI SDK provider against llama-swap. baseURL is threaded from
// config.LLAMA_SWAP_URL at call time (not module-load) so tests can stub the
// upstream without touching env vars. No apiKey — llama-swap is unauth in our
// Tailscale topology and exposing it over the public internet is gated by
// Authelia at the Caddy layer, not by API keys.
//
// v2.4.1-sidecar: when the agent has llama_extra_args, route through
// llama-sidecar instead. A fresh provider is created per call (not cached)
// because the X-Agent-Flags header varies per agent. The llama-swap path
// stays cached since it has no per-request headers.

/**
 * Cache of llama-swap providers, keyed by the resolved endpoint URL (trailing
 * slashes stripped, `/v1` suffix applied) so that equivalent spellings of the
 * same upstream share a single provider instance.
 */
const swapCache = new Map<string, ReturnType<typeof createOpenAICompatible>>();

/**
 * Returns the cached llama-swap provider for `baseURL`, creating it on first use.
 *
 * @param baseURL - Upstream base URL, with or without a `/v1` suffix and with
 *   or without trailing slashes.
 * @returns The provider bound to the normalized `<base>/v1` endpoint.
 */
function getSwapProvider(baseURL: string): ReturnType<typeof createOpenAICompatible> {
  // Strip trailing slashes first: otherwise "http://host/" would become
  // "http://host//v1" and "http://host/v1/" would become "http://host/v1//v1".
  const trimmed = baseURL.replace(/\/+$/, '');
  const endpoint = trimmed.endsWith('/v1') ? trimmed : `${trimmed}/v1`;

  const cached = swapCache.get(endpoint);
  if (cached) {
    return cached;
  }

  const provider = createOpenAICompatible({
    name: 'llama-swap',
    baseURL: endpoint,
    includeUsage: true,
  });
  swapCache.set(endpoint, provider);
  return provider;
}

/**
 * Builds a fresh llama-sidecar provider that sends the given flags on every
 * request via the `X-Agent-Flags` header. Nothing is cached here: each call
 * constructs a new provider.
 *
 * @param baseURL - Sidecar base URL, with or without a `/v1` suffix and with
 *   or without trailing slashes.
 * @param flags - Flags to forward; joined with single spaces into the header
 *   value (an empty list yields an empty header value).
 * @returns A provider bound to the normalized `<base>/v1` endpoint.
 */
function sidecarProvider(
  baseURL: string,
  flags: string[],
): ReturnType<typeof createOpenAICompatible> {
  // Strip trailing slashes first: otherwise "http://host/" would become
  // "http://host//v1" and "http://host/v1/" would become "http://host/v1//v1".
  const trimmed = baseURL.replace(/\/+$/, '');
  const endpoint = trimmed.endsWith('/v1') ? trimmed : `${trimmed}/v1`;

  return createOpenAICompatible({
    name: 'llama-sidecar',
    baseURL: endpoint,
    includeUsage: true,
    headers: {
      'X-Agent-Flags': flags.join(' '),
    },
  });
}

/**
 * Which upstream a request is sent to: `'swap'` selects the llama-swap
 * provider, `'sidecar'` selects the llama-sidecar provider.
 */
export type InferenceRoute = 'swap' | 'sidecar';

/** Routing decision produced by `resolveRoute`. */
export interface RoutingInfo {
  /** Upstream selected for the request. */
  route: InferenceRoute;
  /**
   * Flags to forward on the sidecar route. `resolveRoute` returns `null` for
   * the swap route, and an array (possibly empty) for the sidecar route.
   */
  flags: string[] | null;
}

/** Minimal agent shape needed for routing. */
interface AgentLike {
  /**
   * Per-agent extra flags. A non-empty array makes `resolveRoute` pick the
   * sidecar route; `null` or an empty array does not.
   */
  llama_extra_args: string[] | null;
}

/** Minimal config shape needed for routing and provider construction. */
interface ConfigLike {
  /** Base URL used for the llama-swap provider. */
  LLAMA_SWAP_URL: string;
  /**
   * Base URL used for the llama-sidecar provider. Optional; when it is set
   * (truthy), `resolveRoute` picks the sidecar route even without agent flags.
   */
  LLAMA_SIDECAR_URL?: string;
}

/**
 * Decides which upstream a request should use.
 *
 * Precedence:
 * 1. The agent carries a non-empty `llama_extra_args` -> sidecar, forwarding
 *    that same array.
 * 2. `config.LLAMA_SIDECAR_URL` is set -> sidecar with an empty flag list, so
 *    the sidecar's default base args (cache quant, spec decoding, slot save,
 *    etc.) still apply.
 * 3. Otherwise -> llama-swap with `flags: null`.
 *
 * @param agent - Agent whose extra args are inspected, or `null` for none.
 * @param config - Optional config; only `LLAMA_SIDECAR_URL` is consulted.
 * @returns The selected route and the flags to forward.
 */
export function resolveRoute(
  agent: AgentLike | null,
  config?: ConfigLike,
): RoutingInfo {
  const agentFlags = agent?.llama_extra_args ?? null;

  // Explicit per-agent flags always win and are passed through untouched.
  if (agentFlags !== null && agentFlags.length > 0) {
    return { route: 'sidecar', flags: agentFlags };
  }

  // No per-agent flags: use the sidecar only when it is configured.
  const sidecarConfigured = Boolean(config?.LLAMA_SIDECAR_URL);
  return sidecarConfigured
    ? { route: 'sidecar', flags: [] }
    : { route: 'swap', flags: null };
}

/**
 * Builds the chat model for `modelId` against whichever upstream
 * `resolveRoute` selects for this agent and config.
 *
 * @param config - Supplies the llama-swap URL and, optionally, the sidecar URL.
 * @param modelId - Model identifier handed to the provider's `chatModel`.
 * @param agent - Optional agent; `undefined` is treated the same as `null`.
 * @returns A chat model bound to the selected provider.
 * @throws Error when the sidecar route is selected but
 *   `config.LLAMA_SIDECAR_URL` is not set.
 */
export function upstreamModel(
  config: ConfigLike,
  modelId: string,
  agent?: AgentLike | null,
): LanguageModel {
  const routing = resolveRoute(agent ?? null, config);

  // Default path: the cached llama-swap provider.
  if (routing.route !== 'sidecar') {
    return getSwapProvider(config.LLAMA_SWAP_URL).chatModel(modelId);
  }

  // Agent flags can select the sidecar even when no sidecar URL is configured.
  const sidecarURL = config.LLAMA_SIDECAR_URL;
  if (!sidecarURL) {
    throw new Error('Sidecar route selected but LLAMA_SIDECAR_URL is not set');
  }

  const forwardedFlags = routing.flags ?? [];
  return sidecarProvider(sidecarURL, forwardedFlags).chatModel(modelId);
}