Go daemon (cmd/llama-sidecar): per-agent llama-server process pool with LRU eviction, OpenAI-compatible proxy, flag validation (Unsloth port), deterministic hash-keyed sidecar reuse. Windows service support via schtasks/NSSM with DETACHED_PROCESS, stdout pipe drain, and request-ctx decoupled child lifetime. Bug fixes (3b.1–3b5): -c flag drop from StripShadowingFlags, UTF-8 BOM in JSON config, -fa → --flash-attn on default, child process exit after one request (stdin devnull, stdout pipe, CREATE_NO_WINDOW → DETACHED, context.Background for child lifetime, background reaper goroutine). bench/: MTP on/off throughput sweep across 8 GGUFs via SSH+schtasks automation to sam-desktop. Per-GGUF production flags from llama-swap config with --ctx-size 32768 override. eval/: accuracy benchmarks (MMLU 100q, GSM8K 50q, HumanEval 164) + A/B model comparison (14 agent-typed prompts × 8 models). All scripts resumable at individual question level. 94 Go tests, race detector clean. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
57 lines
1.3 KiB
Go
57 lines
1.3 KiB
Go
package server
|
|
|
|
import (
|
|
"log/slog"
|
|
"net/http"
|
|
"time"
|
|
|
|
"github.com/indifferentketchup/llama-sidecar/internal/config"
|
|
"github.com/indifferentketchup/llama-sidecar/internal/pool"
|
|
)
|
|
|
|
func New(cfg *config.Config, p *pool.Pool, startedAt time.Time) *http.Server {
|
|
mux := http.NewServeMux()
|
|
mux.HandleFunc("GET /health", healthHandler(p, cfg, startedAt))
|
|
mux.HandleFunc("GET /sidecars", listSidecarsHandler(p))
|
|
mux.HandleFunc("DELETE /sidecars/{hash}", deleteSidecarHandler(p))
|
|
mux.HandleFunc("POST /v1/chat/completions", proxyHandler(p))
|
|
mux.HandleFunc("POST /v1/completions", proxyHandler(p))
|
|
|
|
handler := requestLogger(mux)
|
|
|
|
return &http.Server{
|
|
Addr: cfg.Bind,
|
|
Handler: handler,
|
|
}
|
|
}
|
|
|
|
func requestLogger(next http.Handler) http.Handler {
|
|
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
start := time.Now()
|
|
rw := &statusRecorder{ResponseWriter: w, status: 200}
|
|
next.ServeHTTP(rw, r)
|
|
slog.Info("request",
|
|
"method", r.Method,
|
|
"path", r.URL.Path,
|
|
"status", rw.status,
|
|
"duration_ms", time.Since(start).Milliseconds(),
|
|
)
|
|
})
|
|
}
|
|
|
|
// statusRecorder wraps an http.ResponseWriter and remembers the status code
// of the response so it can be reported after the handler returns.
//
// Only the first final status is recorded. net/http ignores any WriteHeader
// call made after the header has been committed, so recording a later code
// would report a status the client never received.
type statusRecorder struct {
	http.ResponseWriter
	status      int  // recorded status; the creator seeds it with its default
	wroteHeader bool // true once a final (non-1xx) status has been committed
}

// commit records code as the final status unless one was already committed.
func (sr *statusRecorder) commit(code int) {
	if sr.wroteHeader {
		return
	}
	sr.status = code
	sr.wroteHeader = true
}

// WriteHeader records code and forwards the call to the wrapped writer.
//
// Informational 1xx codes other than 101 Switching Protocols are forwarded
// but not recorded, because a final status still follows them.
func (sr *statusRecorder) WriteHeader(code int) {
	informational := code >= 100 && code <= 199 && code != http.StatusSwitchingProtocols
	if !informational {
		sr.commit(code)
	}
	sr.ResponseWriter.WriteHeader(code)
}

// Write forwards b to the wrapped writer. A Write before any WriteHeader
// implicitly commits 200 OK, so that is what gets recorded.
func (sr *statusRecorder) Write(b []byte) (int, error) {
	sr.commit(http.StatusOK)
	return sr.ResponseWriter.Write(b)
}

// Flush forwards to the wrapped writer when it implements http.Flusher and
// is a no-op otherwise. Flushing before any WriteHeader implicitly commits
// 200 OK, mirroring Write.
func (sr *statusRecorder) Flush() {
	f, ok := sr.ResponseWriter.(http.Flusher)
	if !ok {
		return
	}
	sr.commit(http.StatusOK)
	f.Flush()
}

// Unwrap returns the wrapped writer so http.ResponseController can reach
// optional interfaces that statusRecorder does not re-export.
func (sr *statusRecorder) Unwrap() http.ResponseWriter {
	return sr.ResponseWriter
}
|