diff --git a/apps/coder/src/routes/arena.ts b/apps/coder/src/routes/arena.ts index 7a481fc..ecff236 100644 --- a/apps/coder/src/routes/arena.ts +++ b/apps/coder/src/routes/arena.ts @@ -205,7 +205,7 @@ export function registerArenaRoutes( const contestants = await sql` SELECT id, battle_id, identity, model, lane, task_id, worktree_id, - status, duration_ms, tokens_per_sec, cost_tokens, result_path, error, + status, duration_ms, tokens_per_sec, cost_tokens, token_breakdown, result_path, error, created_at, updated_at FROM contestants WHERE battle_id = ${id} diff --git a/apps/server/CLAUDE.md b/apps/server/CLAUDE.md index f3dca50..9aa95fe 100644 --- a/apps/server/CLAUDE.md +++ b/apps/server/CLAUDE.md @@ -17,6 +17,8 @@ - **Tools have NO `execute` field.** BooCode dispatches tools in tool-phase.ts, not the AI SDK loop — only `description` + `inputSchema: jsonSchema(parameters)`. - **`includeUsage: true` MUST be set on `createOpenAICompatible`** in `provider.ts`. The adapter defaults it false → no `stream_options.include_usage` → llama-swap emits no usage block → `result.usage` resolves `undefined` (NULL token counts). Don't remove during refactor. - **Tool-call-only turns may emit a leading `\n` text-delta.** `MessageList.flatten`'s `hasText` and `MessageBubble`'s `hasContent` both `.trim()` before the length check, else whitespace-only content renders an empty bubble + ActionRow between tool calls. `buildMessagesPayload` also skips `status='failed'` and complete-but-empty assistant rows (avoids "Cannot have 2 or more assistant messages at the end of the list" upstream rejection after cap-hit + Continue). +- **`services/inference/tool-shim.ts`** — Recovers structured tool calls from plain-text model output. Some models (notably Qwen) emit XML-style tool-call markup (e.g. `<tool_call>...</tool_call>`) as inline text instead of structured JSON. `extractToolCalls(text)` parses both XML and JSON inline formats. `hasToolCallMarkup(text)` is a fast pre-check. 
Used as a fallback in the stream phase when structured `tool_calls` parse fails. Does NOT require `FAST_MODEL` — operates on the existing turn's output text. +- **`services/inference/loop-detectors.ts`** — Six detectors that catch repetitive model behavior, including `detectContentRepeat` (same content N times) and `detectToolLoop` (same tool called consecutively); `detectDoomLoop` combines both. These are additive to the existing `sentinels.ts` doom-loop detection. - **AI SDK ModelMessage conversion** (`toModelMessages` in stream-phase.ts). Tool messages need a `toolName` for `ToolResultPart`; BooCode's OpenAI-shape history lacks it, so a forward-scan builds a `tool_call_id → toolName` map from prior assistant `tool_calls`. Tool outputs wrapped as `{ type: 'json' | 'text', value }` (v6 `ToolResultOutput`). Reasoning emits a `ReasoningPart` first in the content array. - **`experimental_repairToolCall`** wired into `streamText` to keep the stream alive when qwen3.6 emits malformed tool args. Pass-through: logs the bad call, returns it unmodified; `executeToolPhase`'s zod-reject path routes it back to the model next turn. - **`chat_status` frame** (via `broker.publishUser`) — `status: 'streaming' | 'tool_running' | 'waiting_for_input' | 'idle' | 'error'`. Frontend `useChatStatus` derives `idle_warm` (<30s since idle) vs `idle_cold`. `ChatThroughput` renders beside `StatusDot` only when streaming/tool_running, fed by 500ms-throttled `'usage'` frames (`completion_tokens` + `ctx_used` + `ctx_max`). `POST /api/chats/:id/discard_stale` marks a stuck-streaming row `failed` when the frontend's 60s no-token timer gives up. diff --git a/apps/web/src/components/panes/ArenaPane.tsx b/apps/web/src/components/panes/ArenaPane.tsx index 66897f5..066b174 100644 --- a/apps/web/src/components/panes/ArenaPane.tsx +++ b/apps/web/src/components/panes/ArenaPane.tsx @@ -218,6 +218,16 @@ function ContestantRow({ {isExpanded && (
+ {data.token_breakdown && ( +
+ {data.token_breakdown.system > 0 && {data.token_breakdown.system}s} + {data.token_breakdown.user > 0 && {data.token_breakdown.user}u} + {data.token_breakdown.assistant > 0 && {data.token_breakdown.assistant}a} + {data.token_breakdown.tools > 0 && {data.token_breakdown.tools}t} + {data.token_breakdown.reasoning > 0 && {data.token_breakdown.reasoning}r} + {data.token_breakdown.total > 0 && ∑{data.token_breakdown.total}} +
+ )} {output.length === 0 ? (
{data.status === 'queued'