llama-sidecar v0.1.0: daemon + benchmarks + eval suite
Go daemon (cmd/llama-sidecar): per-agent llama-server process pool with LRU eviction, OpenAI-compatible proxy, flag validation (Unsloth port), deterministic hash-keyed sidecar reuse. Windows service support via schtasks/NSSM with DETACHED_PROCESS, stdout pipe drain, and request-ctx decoupled child lifetime. Bug fixes (3b.1–3b.5): -c flag drop from StripShadowingFlags, UTF-8 BOM in JSON config, -fa → --flash-attn on default, child process exit after one request (stdin devnull, stdout pipe, CREATE_NO_WINDOW → DETACHED, context.Background for child lifetime, background reaper goroutine). bench/: MTP on/off throughput sweep across 8 GGUFs via SSH+schtasks automation to sam-desktop. Per-GGUF production flags from llama-swap config with --ctx-size 32768 override. eval/: accuracy benchmarks (MMLU 100q, GSM8K 50q, HumanEval 164) + A/B model comparison (14 agent-typed prompts × 8 models). All scripts resumable at individual question level. 94 Go tests, race detector clean. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
42
internal/server/admin.go
Normal file
42
internal/server/admin.go
Normal file
@@ -0,0 +1,42 @@
|
||||
package server
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"time"
|
||||
|
||||
"github.com/indifferentketchup/llama-sidecar/internal/config"
|
||||
"github.com/indifferentketchup/llama-sidecar/internal/pool"
|
||||
)
|
||||
|
||||
func healthHandler(p *pool.Pool, cfg *config.Config, startedAt time.Time) http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
sidecars := p.List()
|
||||
writeJSON(w, http.StatusOK, map[string]any{
|
||||
"status": "ok",
|
||||
"sidecars": len(sidecars),
|
||||
"max": cfg.MaxSidecars,
|
||||
"uptime_seconds": int(time.Since(startedAt).Seconds()),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func listSidecarsHandler(p *pool.Pool) http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
writeJSON(w, http.StatusOK, p.List())
|
||||
}
|
||||
}
|
||||
|
||||
func deleteSidecarHandler(p *pool.Pool) http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
hash := r.PathValue("hash")
|
||||
if hash == "" {
|
||||
writeJSON(w, http.StatusBadRequest, map[string]string{"error": "hash required"})
|
||||
return
|
||||
}
|
||||
if err := p.Remove(hash); err != nil {
|
||||
writeJSON(w, http.StatusNotFound, map[string]string{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
writeJSON(w, http.StatusOK, map[string]string{"status": "removed"})
|
||||
}
|
||||
}
|
||||
111
internal/server/proxy.go
Normal file
111
internal/server/proxy.go
Normal file
@@ -0,0 +1,111 @@
|
||||
package server
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"net/http/httputil"
|
||||
"net/url"
|
||||
"strings"
|
||||
|
||||
"github.com/indifferentketchup/llama-sidecar/internal/pool"
|
||||
)
|
||||
|
||||
// shellUnsafe deletes every backtick, dollar sign, pipe, semicolon, ampersand
// and newline from the string it is applied to.
var shellUnsafe = strings.NewReplacer(
	"`", "",
	"$", "",
	"|", "",
	";", "",
	"&", "",
	"\n", "",
)
|
||||
|
||||
func parseFlags(raw string) ([]string, error) {
|
||||
cleaned := shellUnsafe.Replace(raw)
|
||||
if cleaned != raw {
|
||||
return nil, fmt.Errorf("flags contain unsafe characters")
|
||||
}
|
||||
return splitArgs(strings.TrimSpace(raw)), nil
|
||||
}
|
||||
|
||||
// splitArgs breaks s into whitespace-separated fields. An empty string yields
// a nil slice; anything else is split with strings.Fields, so quoting is not
// interpreted.
func splitArgs(s string) []string {
	var args []string
	if len(s) > 0 {
		args = strings.Fields(s)
	}
	return args
}
|
||||
|
||||
func proxyHandler(p *pool.Pool) http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
flagsRaw := r.Header.Get("X-Agent-Flags")
|
||||
var flags []string
|
||||
if flagsRaw != "" {
|
||||
var err error
|
||||
flags, err = parseFlags(flagsRaw)
|
||||
if err != nil {
|
||||
writeJSON(w, http.StatusBadRequest, map[string]string{
|
||||
"error": err.Error(),
|
||||
})
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
modelID := r.Header.Get("X-Model-Id")
|
||||
if modelID == "" {
|
||||
body, err := io.ReadAll(io.LimitReader(r.Body, 1<<20))
|
||||
if err != nil {
|
||||
writeJSON(w, http.StatusBadRequest, map[string]string{"error": "failed to read body"})
|
||||
return
|
||||
}
|
||||
var req struct {
|
||||
Model string `json:"model"`
|
||||
}
|
||||
if err := json.Unmarshal(body, &req); err == nil && req.Model != "" {
|
||||
modelID = req.Model
|
||||
}
|
||||
r.Body = io.NopCloser(strings.NewReader(string(body)))
|
||||
r.ContentLength = int64(len(body))
|
||||
}
|
||||
if modelID == "" {
|
||||
writeJSON(w, http.StatusBadRequest, map[string]string{"error": "model not specified (X-Model-Id header or body.model)"})
|
||||
return
|
||||
}
|
||||
|
||||
sidecar, err := p.Acquire(r.Context(), modelID, flags)
|
||||
if err != nil {
|
||||
errMsg := err.Error()
|
||||
status := http.StatusInternalServerError
|
||||
if strings.Contains(errMsg, "validation:") {
|
||||
status = http.StatusBadRequest
|
||||
} else if strings.Contains(errMsg, "unknown model:") {
|
||||
status = http.StatusNotFound
|
||||
} else if strings.Contains(errMsg, "port allocation:") {
|
||||
status = http.StatusServiceUnavailable
|
||||
}
|
||||
writeJSON(w, status, map[string]string{"error": errMsg})
|
||||
return
|
||||
}
|
||||
|
||||
target := &url.URL{
|
||||
Scheme: "http",
|
||||
Host: fmt.Sprintf("127.0.0.1:%d", sidecar.Port),
|
||||
}
|
||||
proxy := httputil.NewSingleHostReverseProxy(target)
|
||||
proxy.ErrorHandler = func(rw http.ResponseWriter, req *http.Request, err error) {
|
||||
slog.Error("upstream error", "hash", sidecar.Hash, "port", sidecar.Port, "err", err)
|
||||
writeJSON(rw, http.StatusBadGateway, map[string]any{
|
||||
"error": "upstream unavailable",
|
||||
"error_detail": err.Error(),
|
||||
"sidecar_hash": sidecar.Hash,
|
||||
"sidecar_port": sidecar.Port,
|
||||
"last_stderr": sidecar.LastStderr(),
|
||||
})
|
||||
}
|
||||
|
||||
sidecar.TouchLastUsed()
|
||||
proxy.ServeHTTP(w, r)
|
||||
}
|
||||
}
|
||||
|
||||
// writeJSON sends v as a JSON response with the given HTTP status and an
// application/json content type.
//
// The value is marshaled before any header is written. If it cannot be
// encoded, the failure is logged and the client receives a 500 with a generic
// JSON error instead of the requested status with an empty body.
func writeJSON(w http.ResponseWriter, status int, v any) {
	payload, err := json.Marshal(v)
	if err != nil {
		slog.Error("encode json response", "err", err)
		status = http.StatusInternalServerError
		payload = []byte(`{"error":"failed to encode response"}`)
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(status)

	// The trailing newline keeps the wire format identical to what
	// json.Encoder.Encode produces.
	if _, err := w.Write(append(payload, '\n')); err != nil {
		// Usually the client went away; nothing more can be sent to it.
		slog.Debug("write json response", "err", err)
	}
}
|
||||
56
internal/server/server.go
Normal file
56
internal/server/server.go
Normal file
@@ -0,0 +1,56 @@
|
||||
package server
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"time"
|
||||
|
||||
"github.com/indifferentketchup/llama-sidecar/internal/config"
|
||||
"github.com/indifferentketchup/llama-sidecar/internal/pool"
|
||||
)
|
||||
|
||||
func New(cfg *config.Config, p *pool.Pool, startedAt time.Time) *http.Server {
|
||||
mux := http.NewServeMux()
|
||||
mux.HandleFunc("GET /health", healthHandler(p, cfg, startedAt))
|
||||
mux.HandleFunc("GET /sidecars", listSidecarsHandler(p))
|
||||
mux.HandleFunc("DELETE /sidecars/{hash}", deleteSidecarHandler(p))
|
||||
mux.HandleFunc("POST /v1/chat/completions", proxyHandler(p))
|
||||
mux.HandleFunc("POST /v1/completions", proxyHandler(p))
|
||||
|
||||
handler := requestLogger(mux)
|
||||
|
||||
return &http.Server{
|
||||
Addr: cfg.Bind,
|
||||
Handler: handler,
|
||||
}
|
||||
}
|
||||
|
||||
func requestLogger(next http.Handler) http.Handler {
|
||||
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
start := time.Now()
|
||||
rw := &statusRecorder{ResponseWriter: w, status: 200}
|
||||
next.ServeHTTP(rw, r)
|
||||
slog.Info("request",
|
||||
"method", r.Method,
|
||||
"path", r.URL.Path,
|
||||
"status", rw.status,
|
||||
"duration_ms", time.Since(start).Milliseconds(),
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
// statusRecorder wraps an http.ResponseWriter and remembers the status code
// of the response so it can be logged after the handler returns.
type statusRecorder struct {
	http.ResponseWriter
	// status is the final status code of the response. Callers seed it with
	// the implicit default (200) before serving.
	status int
	// wroteHeader is set once the final status is decided, so that later
	// superfluous WriteHeader calls cannot overwrite what was recorded.
	wroteHeader bool
}

// WriteHeader records code as the response status and forwards the call.
//
// Only the first final status is recorded. A repeated call (for example an
// error handler firing after the response has started) is still forwarded to
// the underlying writer, but it no longer changes the logged status.
// Informational 1xx codes other than 101 are forwarded without being recorded,
// because a final status follows them.
func (sr *statusRecorder) WriteHeader(code int) {
	informational := code >= 100 && code <= 199 && code != http.StatusSwitchingProtocols
	if !sr.wroteHeader && !informational {
		sr.status = code
		sr.wroteHeader = true
	}
	sr.ResponseWriter.WriteHeader(code)
}

// Write forwards b to the underlying writer. A Write before any WriteHeader
// implies a 200 response, which is recorded here.
func (sr *statusRecorder) Write(b []byte) (int, error) {
	if !sr.wroteHeader {
		sr.status = http.StatusOK
		sr.wroteHeader = true
	}
	return sr.ResponseWriter.Write(b)
}

// Flush forwards to the underlying writer when it implements http.Flusher and
// is a no-op otherwise, so streamed responses still flush through the wrapper.
func (sr *statusRecorder) Flush() {
	if f, ok := sr.ResponseWriter.(http.Flusher); ok {
		f.Flush()
	}
}

// Unwrap exposes the wrapped writer so http.ResponseController can reach
// optional interfaces that statusRecorder does not implement itself.
func (sr *statusRecorder) Unwrap() http.ResponseWriter {
	return sr.ResponseWriter
}
|
||||
Reference in New Issue
Block a user