diff --git a/.claude/learnings.md b/.claude/learnings.md index ab7a529..890f278 100644 --- a/.claude/learnings.md +++ b/.claude/learnings.md @@ -48,3 +48,8 @@ In flow-editor.tsx (FlowCanvas), touch drag/pan on iOS jumped ~5px then stopped, ## 2026-06-05 — CORRECTION: real root cause was onPointerLeave, not page-scroll On-device HUD instrumentation proved the actual cause of the iOS canvas drag bug: the canvas had `onPointerLeave={onPointerUp}`, and iOS Safari spuriously fires `pointerleave` mid-gesture (finger still down and inside the element, even with pointer capture set). That ended the drag one move in — node/canvas jumped ~5px then froze while the finger kept moving. Fix: gate it to non-touch only `onPointerLeave={(e)=>{ if(e.pointerType!=="touch") onPointerUp(e); }}` — pointer capture guarantees a real pointerup/cancel, so pointerleave is only needed as the desktop mouse-left-window fallback. The non-passive touchmove preventDefault was kept (prevents page-scroll hijack / pointercancel) but was NOT the primary fix. Lesson: when reasoning about WebKit event quirks fails twice, add an on-screen HUD logging the raw pointer event stream instead of guessing. + +## Arthur GenAI Engine — eval read path (for /evals page) +- Arthur is integrated **write-only** in the worker today: `arthur-tracer.ts` ships OpenInference traces via `POST /api/v1/traces`; `arthur-client.ts` only does tasks/prompts (`/api/v2/tasks*`, `/api/v1/tasks/{id}/prompts*`). Nothing reads evals back. +- Arthur DOES expose a read API (same `Bearer GENAI_ENGINE_API_KEY`, needs `INFERENCE_READ`, org-scoped → matches single-tenant). Primary for fleet eval health: `POST /api/v1/traces/overview` { task_ids, start_time, end_time } → overviews of { trace_count, eval_count, continuous_eval_success_rate, ... }. Per-metric detail: span `metric_results` (metric_type enum is ONLY QueryRelevance|ResponseRelevance|ToolSelection; `details` is opaque JSON, no flat score/pass-fail — parse + threshold yourself). 
Timeseries: `POST /api/v1/traces/overview/timeseries` (single task per call). +- CRITICAL: the rich rule families the mock shows (hallucination/PII/toxicity/prompt-injection) come ONLY from the legacy `/validate_prompt` + `/validate_response` write path, which we never call. `GET /api/v2/inferences/query` is empty for us. Our trace path yields only success-rate + eval/trace counts + the 3 relevance/tool metric types (and only if continuous evals are configured on the task; else eval_count=0). diff --git a/apps/dashboard/app/(cockpit)/cost/page.tsx b/apps/dashboard/app/(cockpit)/cost/page.tsx index 4ff4588..556bcef 100644 --- a/apps/dashboard/app/(cockpit)/cost/page.tsx +++ b/apps/dashboard/app/(cockpit)/cost/page.tsx @@ -1,5 +1,12 @@ // apps/dashboard/app/(cockpit)/cost/page.tsx — Cost & usage ("/cost") -import { CostScreen } from "@/components/cockpit/screens/cost"; +import { Suspense } from "react"; +import { CostData } from "@/app/cost-data"; +import { CostSkeleton } from "@/app/cost-skeleton"; + export default function CostPage() { - return ; + return ( + }> + + + ); } diff --git a/apps/dashboard/app/(cockpit)/evals/page.tsx b/apps/dashboard/app/(cockpit)/evals/page.tsx index 24320a5..108427f 100644 --- a/apps/dashboard/app/(cockpit)/evals/page.tsx +++ b/apps/dashboard/app/(cockpit)/evals/page.tsx @@ -1,5 +1,13 @@ // apps/dashboard/app/(cockpit)/evals/page.tsx — Arthur evals ("/evals") -import { EvalsScreen } from "@/components/cockpit/screens/evals"; +import { Suspense } from "react"; + +import { EvalsData } from "@/app/evals-data"; +import { EvalsSkeleton } from "@/app/evals-skeleton"; + export default function EvalsPage() { - return ; + return ( + }> + + + ); } diff --git a/apps/dashboard/app/(cockpit)/prompts/page.tsx b/apps/dashboard/app/(cockpit)/prompts/page.tsx index d3f5a66..d44f3bb 100644 --- a/apps/dashboard/app/(cockpit)/prompts/page.tsx +++ b/apps/dashboard/app/(cockpit)/prompts/page.tsx @@ -1,5 +1,13 @@ // 
apps/dashboard/app/(cockpit)/prompts/page.tsx — Prompts ("/prompts") -import { PromptsScreen } from "@/components/cockpit/screens/prompts"; +import { Suspense } from "react"; + +import { PromptsData } from "@/app/prompts-data"; +import { PromptsSkeleton } from "@/app/prompts-skeleton"; + export default function PromptsPage() { - return ; + return ( + }> + + + ); } diff --git a/apps/dashboard/app/api/prompts/[name]/versions/[version]/route.ts b/apps/dashboard/app/api/prompts/[name]/versions/[version]/route.ts new file mode 100644 index 0000000..3979644 --- /dev/null +++ b/apps/dashboard/app/api/prompts/[name]/versions/[version]/route.ts @@ -0,0 +1,18 @@ +// apps/dashboard/app/api/prompts/[name]/versions/[version]/route.ts +// Same-origin proxy so the client can lazily fetch a historical prompt-version +// body without the server-only WORKER_API_TOKEN ever reaching the browser. +import { NextResponse } from "next/server"; +import { getJSON } from "@/lib/api/server"; +import type { PromptVersionBodyResponse } from "@shared/contracts"; + +export async function GET( + _req: Request, + { params }: { params: Promise<{ name: string; version: string }> }, +) { + const { name, version } = await params; + const now = new Date().toISOString(); + const data = await getJSON( + `/api/v1/prompts/${encodeURIComponent(name)}/versions/${encodeURIComponent(version)}`, + ).catch(() => ({ generatedAt: now, available: false, body: null })); + return NextResponse.json(data); +} diff --git a/apps/dashboard/app/cost-data.tsx b/apps/dashboard/app/cost-data.tsx new file mode 100644 index 0000000..6620777 --- /dev/null +++ b/apps/dashboard/app/cost-data.tsx @@ -0,0 +1,13 @@ +// apps/dashboard/app/cost-data.tsx +import { getJSON } from "@/lib/api/server"; +import { CostScreen } from "@/components/cockpit/screens/cost"; +import type { CostResponse } from "@shared/contracts"; +import { costFallback } from "@/lib/api/fallbacks"; + +export async function CostData() { + const now = new 
Date().toISOString(); + const data = await getJSON("/api/v1/cost").catch(() => + costFallback(now), + ); + return ; +} diff --git a/apps/dashboard/app/cost-skeleton.tsx b/apps/dashboard/app/cost-skeleton.tsx new file mode 100644 index 0000000..1a95fa2 --- /dev/null +++ b/apps/dashboard/app/cost-skeleton.tsx @@ -0,0 +1,18 @@ +// apps/dashboard/app/cost-skeleton.tsx +import { Block } from "./skeleton-block"; + +export function CostSkeleton() { + return ( +
+
+ {Array.from({ length: 3 }, (_, i) => )} +
+
+ + +
+ + +
+ ); +} diff --git a/apps/dashboard/app/evals-data.tsx b/apps/dashboard/app/evals-data.tsx new file mode 100644 index 0000000..fd2cd2f --- /dev/null +++ b/apps/dashboard/app/evals-data.tsx @@ -0,0 +1,13 @@ +// apps/dashboard/app/evals-data.tsx +import { getJSON } from "@/lib/api/server"; +import { EvalsScreen } from "@/components/cockpit/screens/evals"; +import type { EvalsResponse } from "@shared/contracts"; +import { evalsFallback } from "@/lib/api/fallbacks"; + +export async function EvalsData() { + const now = new Date().toISOString(); + const data = await getJSON("/api/v1/evals").catch(() => + evalsFallback(now), + ); + return ; +} diff --git a/apps/dashboard/app/evals-skeleton.tsx b/apps/dashboard/app/evals-skeleton.tsx new file mode 100644 index 0000000..86ba867 --- /dev/null +++ b/apps/dashboard/app/evals-skeleton.tsx @@ -0,0 +1,16 @@ +// apps/dashboard/app/evals-skeleton.tsx +import { Block } from "./skeleton-block"; + +export function EvalsSkeleton() { + return ( +
+ {/* Header (eyebrow + title, chip) */} +
+ + +
+ {/* Quality group card */} + +
+ ); +} diff --git a/apps/dashboard/app/prompts-data.tsx b/apps/dashboard/app/prompts-data.tsx new file mode 100644 index 0000000..34d7504 --- /dev/null +++ b/apps/dashboard/app/prompts-data.tsx @@ -0,0 +1,13 @@ +// apps/dashboard/app/prompts-data.tsx +import { getJSON } from "@/lib/api/server"; +import { PromptsScreen } from "@/components/cockpit/screens/prompts"; +import type { PromptsResponse } from "@shared/contracts"; +import { promptsFallback } from "@/lib/api/fallbacks"; + +export async function PromptsData() { + const now = new Date().toISOString(); + const data = await getJSON("/api/v1/prompts").catch(() => + promptsFallback(now), + ); + return ; +} diff --git a/apps/dashboard/app/prompts-skeleton.tsx b/apps/dashboard/app/prompts-skeleton.tsx new file mode 100644 index 0000000..feb45c0 --- /dev/null +++ b/apps/dashboard/app/prompts-skeleton.tsx @@ -0,0 +1,22 @@ +// apps/dashboard/app/prompts-skeleton.tsx +import { Block } from "./skeleton-block"; + +export function PromptsSkeleton() { + return ( +
+
+ + +
+
+ {Array.from({ length: 2 }, (_, i) => ( + + ))} +
+
+ + +
+
+ ); +} diff --git a/apps/dashboard/app/skeleton-block.tsx b/apps/dashboard/app/skeleton-block.tsx new file mode 100644 index 0000000..03c2647 --- /dev/null +++ b/apps/dashboard/app/skeleton-block.tsx @@ -0,0 +1,4 @@ +// apps/dashboard/app/skeleton-block.tsx +export function Block({ className = "" }: { className?: string }) { + return
; +} diff --git a/apps/dashboard/components/cockpit/screens/cost.tsx b/apps/dashboard/components/cockpit/screens/cost.tsx index 499a78a..470378b 100644 --- a/apps/dashboard/components/cockpit/screens/cost.tsx +++ b/apps/dashboard/components/cockpit/screens/cost.tsx @@ -1,141 +1,169 @@ "use client"; import React from "react"; -import { CkCard, CkKPI, CkChip, CkTabs, CkDot } from "@/components/ui"; -import { Spark, AreaChart, Donut } from "@/components/charts"; -import { AIWF_DATA } from "@/lib/data/mock"; -import { sparkSeries } from "@/lib/rng"; +import { CkCard, CkKPI, CkDot } from "@/components/ui"; +import { AreaChart, Donut } from "@/components/charts"; +import type { CostResponse } from "@shared/contracts"; -const D = AIWF_DATA; +const DONUT_COLORS = ["#3C43E7", "#FD6027", "#FFC800", "#181B20", "#8FC548"]; + +/** Short label from an ISO/bucket date string for the daily-spend x-axis. */ +function shortDate(date: string): string { + const d = new Date(date); + if (Number.isNaN(d.getTime())) return date; + return d.toLocaleDateString("en-US", { month: "short", day: "numeric" }); +} + +export function CostScreen({ data }: { data: CostResponse }) { + if (!data.available) { + return ( +
+
+
+
Arthur · token usage
+

Cost & token usage

+
+
+
+ Cost data is unavailable — Arthur GenAI Engine is not configured or unreachable. +
+
+ ); + } + + const { totals, byModel, byWorkflow, daily } = data; + const total = totals.totalTokenCost; + const modelCostTotal = byModel.reduce((a, m) => a + m.cost, 0); -export function CostScreen() { - const total = D.COST_BY_MODEL.reduce((a, m) => a + m.cost, 0); - const tokensTotal = D.COST_BY_MODEL.reduce((a, m) => a + m.tokens, 0); return (
-
Vercel ai gateway · billing
+
Arthur · token usage

Cost & token usage

-
- {}} tabs={[ - { id: "model", label: "By model" }, { id: "wf", label: "By workflow" }, { id: "actor", label: "By actor" }] - } /> - -
-
- - - - +
+ + +
- {}} tabs={[{ id: "cost", label: "Cost" }, { id: "tokens", label: "Tokens" }]} />}> -
- h.cost * 24)} w={680} h={200} stroke="#FD6027" fill="#FD6027" labels={D.HOURS24.map((_, i) => "D" + (i + 1))} valueFmt={(v) => "$" + Math.round(v)} /> -
+ + {daily.length > 0 ? ( +
+ d.cost)} + w={680} + h={200} + stroke="#FD6027" + fill="#FD6027" + labels={daily.map((d) => shortDate(d.date))} + valueFmt={(v) => "$" + Math.round(Number(v))} + /> +
+ ) : ( +
No spend data
+ )}
- -
- m.share)} size={140} thickness={22} colors={["#3C43E7", "#FD6027", "#FFC800", "#181B20", "#8FC548"]} centerLabel={"$" + Math.round(total)} centerSub="MTD" /> -
- {D.COST_BY_MODEL.map((m, i) => -
- - {m.model} - ${m.cost.toFixed(0)} -
- )} + + {byModel.length > 0 ? ( +
+ (modelCostTotal ? m.cost / modelCostTotal : 0))} + size={140} + thickness={22} + colors={DONUT_COLORS} + centerLabel={"$" + Math.round(total)} + centerSub="MTD" + /> +
+ {byModel.map((m, i) => +
+ + {m.model} + ${m.cost.toFixed(0)} +
+ )} +
-
+ ) : ( +
No model data
+ )}
-
- - - - {["Model", "Vendor", "Tokens", "Cost", "Share", "Trend"].map((h, i) => - - )} - - - - {D.COST_BY_MODEL.map((m, i) => - - - - - - - + {byModel.length > 0 ? ( +
+
= 2 ? "text-right" : "text-left"}`}>{h}
{m.model}{m.vendor}{(m.tokens / 1_000_000).toFixed(2)}M${m.cost.toFixed(2)} -
-
-
-
- {(m.share * 100).toFixed(0)}% -
-
- -
+ + + {["Model", "Tokens", "Cost", "Share"].map((h, i) => + + )} - )} - -
= 1 ? "text-right" : "text-left"}`}>{h}
-
+ + + {byModel.map((m, i) => { + const share = modelCostTotal ? m.cost / modelCostTotal : 0; + return ( + + {m.model} + {(m.tokens / 1_000_000).toFixed(2)}M + ${m.cost.toFixed(2)} + +
+
+
+
+ {(share * 100).toFixed(0)}% +
+ + + ); + })} + + +
+ ) : ( +
No model breakdown available
+ )}
-
- - - - {["Workflow", "Runs 24h", "Tokens", "Cost today", "$/run", "Trend"].map((h, i) => - - )} - - - - {D.WORKFLOWS.slice().sort((a, b) => b.costToday - a.costToday).map((w, i, arr) => { - const tokens = Math.round(w.runs24h * 2400); - const perRun = w.costToday / Math.max(1, w.runs24h); - const trendUp = i % 2 === 0; - return ( - + {byWorkflow.length > 0 ? ( +
+
= 1 ? "text-right" : "text-left"}`}>{h}
+ + + {["Workflow", "Runs", "Tokens", "Cost", "$/run"].map((h, i) => + + )} + + + + {[...byWorkflow].sort((a, b) => b.cost - a.cost).map((w, i, arr) => + - - - - - - ); - - })} - -
= 1 ? "text-right" : "text-left"}`}>{h}
-
- {w.name} - {w.primary && primary} -
-
{w.id} · gateway: {w.gateway}
+ {w.name} +
{w.taskId}
{w.runs24h.toLocaleString("en-US")}{(tokens / 1000).toFixed(0)}k -
-
-
-
- ${w.costToday.toFixed(2)} -
-
${perRun.toFixed(3)} - -
-
+ {w.runs.toLocaleString("en-US")} + {(w.tokens / 1000).toFixed(0)}k + ${w.cost.toFixed(2)} + ${w.costPerRun.toFixed(3)} + + )} + + +
+ ) : ( +
No workflow breakdown available
+ )}
); diff --git a/apps/dashboard/components/cockpit/screens/evals.tsx b/apps/dashboard/components/cockpit/screens/evals.tsx index 8bc30fe..dbb8d9b 100644 --- a/apps/dashboard/components/cockpit/screens/evals.tsx +++ b/apps/dashboard/components/cockpit/screens/evals.tsx @@ -1,69 +1,65 @@ "use client"; import { CkCard, CkChip } from "@/components/ui"; -import { Spark } from "@/components/charts"; -import { AIWF_DATA } from "@/lib/data/mock"; -import { jitterSeries } from "@/lib/rng"; +import type { EvalsResponse } from "@shared/contracts"; -const D = AIWF_DATA; +const QUALITY_ACCENT = "#3C43E7"; /* ───────────────────── ARTHUR EVALS ───────────────────── */ -export function EvalsScreen() { - const groups = ["safety", "quality", "ops"]; +function Header({ chip }: { chip: React.ReactNode }) { return ( -
-
-
-
Arthur engine · continuous evaluation
-

Evaluations & guardrails

-
-
- Live · 12,408 spans · 24h - +
+
+
Arthur engine · continuous evaluation
+

Evaluations & guardrails

+
+
{chip}
+
+ ); +} + +export function EvalsScreen({ data }: { data: EvalsResponse }) { + if (!data.available) { + return ( +
+
No data} /> +
+ {data.reason}
+ ); + } - {groups.map((g) => { - const list = D.EVALS.filter((e) => e.axis === g); - const titles: Record = { safety: "Safety", quality: "Quality", ops: "Operations" }; - const accents: Record = { safety: "#FD6027", quality: "#3C43E7", ops: "#181B20" }; - return ( - {list.length} evaluators} - style={{ borderLeft: "3px solid " + accents[g] }} - pad={0}> + return ( +
+
+ Live · {data.spansGraded.toLocaleString("en-US")} spans · {data.windowHours}h + + } + /> -
- {list.map((e, i) => -
= list.length - (list.length % 2 === 0 ? 2 : 1) ? "lg:border-b-0" : ""} ${i % 2 === 0 ? "lg:border-r lg:border-neutral-200" : ""}`}> -
- {e.metric} - {e.status === "pass" ? Pass : - e.status === "warn" ? Warn : - Fail} -
-
- - {typeof e.value === "number" ? e.value < 1 ? e.value.toFixed(3) : e.value : e.value} - - {e.unit && {e.unit}} - 0 ? "text-fail-fg" : "text-neutral-500"}`}> - {e.trend > 0 ? "↗" : e.trend < 0 ? "↘" : "→"} {Math.abs(e.trend).toFixed(3)} - -
-
- - target {e.target} -
-
- )} -
- ); + + {data.score.toFixed(1)}% pass + + } + style={{ borderLeft: "3px solid " + QUALITY_ACCENT }}> - })} +
+ + {data.score.toFixed(1)}% + + + {data.spansGraded.toLocaleString("en-US")} spans graded · {data.traceCount.toLocaleString("en-US")} traces · {data.windowHours}h + +
+
); } diff --git a/apps/dashboard/components/cockpit/screens/prompts.tsx b/apps/dashboard/components/cockpit/screens/prompts.tsx index 03a1af6..b236e60 100644 --- a/apps/dashboard/components/cockpit/screens/prompts.tsx +++ b/apps/dashboard/components/cockpit/screens/prompts.tsx @@ -1,11 +1,8 @@ "use client"; import React, { useState, useEffect } from "react"; -import { CkCard, CkKPI, CkChip } from "@/components/ui"; -import { AIWF_DATA } from "@/lib/data/mock"; -import type { Prompt, PromptVersion, PromptTag } from "@/lib/types"; - -const D = AIWF_DATA; +import { CkCard, CkKPI } from "@/components/ui"; +import type { PromptsResponse, PromptDef, PromptVersion } from "@shared/contracts"; const PROMPT_STATUS_COLOR: Record = { production: { bg: "#EAF7E0", fg: "#3F6B1E", dot: "#5BB04A" }, @@ -13,7 +10,8 @@ const PROMPT_STATUS_COLOR: Record v.tags.includes("production")); +} + /* ───── Prompts list (left rail) ───── */ -function PromptList({ active, onSelect }: { active: string; onSelect: (id: string) => void }) { +function PromptList({ + rows, + active, + onSelect, + arthurEnabled, +}: { + rows: PromptDef[]; + active: string; + onSelect: (name: string) => void; + arthurEnabled: boolean; +}) { const [filter, setFilter] = useState("all"); - const list = filter === "all" ? D.PROMPTS : D.PROMPTS.filter(p => p.tags.includes(filter as PromptTag)); + // Derive the tag filter set from tags that actually occur across all versions. + const allTags = Array.from( + new Set(rows.flatMap((p) => p.versions.flatMap((v) => v.tags))), + ); + const filters = ["all", ...allTags]; + const list = + filter === "all" + ? rows + : rows.filter((p) => p.versions.some((v) => v.tags.includes(filter))); + return ( - } pad={0} className="lg:h-full" style={{ display: "flex", flexDirection: "column" }} > -
- {["all","production","staging","draft","locked"].map(t => ( - - ))} -
+ {filters.length > 1 && ( +
+ {filters.map((t) => ( + + ))} +
+ )}
{list.map((p, i) => { - const on = active === p.id; + const on = active === p.name; + const prod = productionVersion(p); return ( ); @@ -88,19 +104,30 @@ function PromptList({ active, onSelect }: { active: string; onSelect: (id: strin ); } +/* ───── Mini stat (used in prompt header) ───── */ +function Stat({ label, value, sub }: { label: React.ReactNode; value: React.ReactNode; sub?: React.ReactNode }) { + return ( +
+
{label}
+
{value}
+ {sub &&
{sub}
} +
+ ); +} + /* ───── Selected-prompt detail (right pane) ───── */ -function PromptDetail({ promptId }: { promptId: string }) { - const p = D.PROMPTS.find((x: Prompt) => x.id === promptId); - const versions: PromptVersion[] = D.PROMPT_VERSIONS[promptId] || []; - const [selA, setSelA] = useState(versions[0]?.v || null); - const [selB, setSelB] = useState(versions[1]?.v || null); +function PromptDetail({ prompt }: { prompt: PromptDef | undefined }) { + const [selectedVersion, setSelectedVersion] = useState(null); + const [bodyCache, setBodyCache] = useState>({}); + const [loading, setLoading] = useState(false); + + // Reset the selected historical version whenever the active prompt changes — + // default view is always the resolved production body. useEffect(() => { - setSelA(versions[0]?.v || null); - setSelB(versions[1]?.v || null); - // eslint-disable-next-line react-hooks/exhaustive-deps - }, [promptId]); + setSelectedVersion(null); + }, [prompt?.name]); - if (!p) { + if (!prompt) { return (
Select a prompt to inspect.
@@ -108,228 +135,137 @@ function PromptDetail({ promptId }: { promptId: string }) { ); } - if (!versions.length) { - return ( - - {p.tags.map(t => )} -
- } - style={{ height: "100%" }} - > -
- Detailed version history not yet captured for this prompt.
- Current: {p.current} · {p.versionCount} versions total -
-
- ); + async function showVersion(v: PromptVersion) { + if (!prompt) return; + setSelectedVersion(v.version); + if (v.body !== undefined) { + setBodyCache((c) => ({ ...c, [v.version]: v.body! })); + return; + } + if (bodyCache[v.version] !== undefined) return; + setLoading(true); + try { + const res = await fetch( + `/api/prompts/${encodeURIComponent(prompt.name)}/versions/${v.version}`, + ); + const json = (await res.json()) as { body: string | null }; + setBodyCache((c) => ({ ...c, [v.version]: json.body ?? "(version body unavailable)" })); + } catch { + setBodyCache((c) => ({ ...c, [v.version]: "(version body unavailable)" })); + } finally { + setLoading(false); + } } + const shownBody = + selectedVersion != null + ? bodyCache[selectedVersion] ?? (loading ? "Loading…" : "") + : prompt.body; + const shownLabel = selectedVersion != null ? `v${selectedVersion}` : "production"; + return (
- {p.tags.map(t => )} +
} > -
- - - 0 ? "↗" : "↘"} ${Math.abs(p.evalDelta).toFixed(3)} vs prev`} tone={p.evalDelta > 0 ? "good" : "bad"} /> - v + " " + (s*100).toFixed(0) + "%").join(" / ")} /> +
+ + + +
- {/* Version timeline */} - 0 && ( + + Click to inspect + + } + > +
+ {prompt.versions.map((v, i) => { + const on = selectedVersion === v.version; + const notLast = i < prompt.versions.length - 1; + const dropDesktopRight = notLast && !on; + return ( + + ); + })} +
+
+ )} + + {/* Body panel (single column, read-only) */} + - Click to inspect · ⇧-click to compare - + selectedVersion != null ? ( + + ) : undefined } > -
- {versions.map((v, i) => { - const isA = selA === v.v; - const isB = selB === v.v; - const borderClass = isA ? "border-[#3C43E7]" : isB ? "border-[#FD6027]" : "border-[#E6E8EB]"; - const notLast = i < versions.length - 1; - // Mobile (stacked): drop bottom border on all but the last so stacked - // buttons share one horizontal divider. - // Desktop (row, lg): drop right border on interior neutral buttons so - // side-by-side buttons share one vertical divider (selected keep full border). - const dropMobileBottom = notLast; - const dropDesktopRight = notLast && !isA && !isB; - return ( - - ); - })} +
+
+ {shownBody} +
- - {/* Diff + metrics */} -
- - -
-
- ); -} - -/* ───── Mini stat (used in prompt header) ───── */ -function Stat({ label, value, sub, tone }: { label: React.ReactNode; value: React.ReactNode; sub?: React.ReactNode; tone?: "good" | "bad" }) { - return ( -
-
{label}
-
{value}
- {sub &&
{sub}
}
); } -/* ───── Diff viewer ───── */ -function PromptDiff({ a, b, versions }: { a: string | null; b: string | null; versions: PromptVersion[] }) { - const bodyA = (a && D.PROMPT_BODIES[a]) || `# ${a}\n(prompt body not captured in mock)`; - const bodyB = (b && D.PROMPT_BODIES[b]) || `# ${b}\n(prompt body not captured in mock)`; - // Naive line-diff: pair lines by index, mark added/removed/equal. - const linesA = bodyA.split("\n"); - const linesB = bodyB.split("\n"); - const max = Math.max(linesA.length, linesB.length); - return ( - - −{linesB.length} from {b} - +{linesA.length} into {a} -
- } - > -
-
- {Array.from({ length: max }).map((_, i) => { - const la = linesA[i] ?? ""; - const lb = linesB[i] ?? ""; - const same = la === lb; - return ( -
-
- {lb ? (i + 1) : ""} - {same ? " " : (lb ? "−" : " ")} - {lb} -
-
- {la ? (i + 1) : ""} - {same ? " " : (la ? "+" : " ")} - {la} -
-
- ); - })} -
-
-
- ); -} - -/* ───── Metrics comparison ───── */ -function PromptMetrics({ versions, selA, selB }: { versions: PromptVersion[]; selA: string | null; selB: string | null }) { - const a = versions.find(v => v.v === selA); - const b = versions.find(v => v.v === selB); - const rows: { k: keyof PromptVersion; l: string; fmt: (v: number) => string; better: "higher" | "lower" | null }[] = [ - { k: "evalScore", l: "Eval score", fmt: (v) => (v*100).toFixed(0), better: "higher" }, - { k: "halluc", l: "Hallucination", fmt: (v) => v.toFixed(3), better: "lower" }, - { k: "p95", l: "p95 latency", fmt: (v) => v.toFixed(1) + "s", better: "lower" }, - { k: "costAvg", l: "Cost / run", fmt: (v) => "$" + v.toFixed(3), better: "lower" }, - { k: "runs", l: "Runs (lifetime)", fmt: (v) => v.toLocaleString("en-US"), better: null }, - ]; - return ( - -
- - A · {selA} - B · {selB} - {rows.map(r => { - const av = a ? (a[r.k] as number) : null; - const bv = b ? (b[r.k] as number) : null; - let aWins = false, bWins = false; - if (av !== null && bv !== null && r.better) { - if (r.better === "higher") { - aWins = av > bv; - bWins = bv > av; - } - if (r.better === "lower") { - aWins = av < bv; - bWins = bv < av; - } - } - return ( - - {r.l} - - {av != null ? r.fmt(av) : "—"} - {aWins && } - - - {bv != null ? r.fmt(bv) : "—"} - {bWins && } - - - ); - })} -
-
- Sample size: {a?.runs.toLocaleString("en-US")} vs {b?.runs.toLocaleString("en-US")} runs -
-
- ); -} - /* ───── Top-level screen ───── */ -export function PromptsScreen() { - const [active, setActive] = useState(D.PROMPTS[0]?.id ?? ""); +export function PromptsScreen({ data }: { data: PromptsResponse }) { + const [active, setActive] = useState(data.rows[0]?.name ?? ""); + const selected = data.rows.find((p) => p.name === active); + const inProd = data.rows.filter((p) => + p.versions.some((v) => v.tags.includes("production")), + ).length; + return (
-
Arthur engine · prompt versioning
+
+ {data.arthurEnabled ? "Arthur engine · prompt versioning" : "In-code defaults · prompt versioning"} +

Prompt registry

@@ -338,16 +274,18 @@ export function PromptsScreen() {
-
- - p.tags.includes("production")).length.toString()} sub="serving traffic" /> - p.tags.includes("ab-test")).length.toString()} sub="live experiments" /> - +
+ +
- - + +
); diff --git a/apps/dashboard/lib/api/fallbacks.ts b/apps/dashboard/lib/api/fallbacks.ts index d7c81cb..d0eec95 100644 --- a/apps/dashboard/lib/api/fallbacks.ts +++ b/apps/dashboard/lib/api/fallbacks.ts @@ -1,6 +1,9 @@ import type { KpisResponse, EvalHealthResponse, + EvalsResponse, + CostResponse, + PromptsResponse, RunsResponse, RunDetailResponse, LiveRunsResponse, @@ -42,3 +45,23 @@ export function liveRunsFallback(now: string): LiveRunsResponse { export function workflowsFallback(now: string): WorkflowsResponse { return { generatedAt: now, rows: [], total: 0 }; } + +export function evalsFallback(now: string): EvalsResponse { + return { available: false, generatedAt: now, reason: "Worker unavailable." }; +} + +export function costFallback(now: string): CostResponse { + return { + generatedAt: now, + available: false, + window: { start: now, end: now }, + totals: { totalTokenCost: 0, totalTokens: 0, traceCount: 0, costPerRun: 0 }, + byModel: [], + byWorkflow: [], + daily: [], + }; +} + +export function promptsFallback(now: string): PromptsResponse { + return { generatedAt: now, available: false, arthurEnabled: false, rows: [], total: 0 }; +} diff --git a/apps/shared/contracts/api.ts b/apps/shared/contracts/api.ts index 32eecbc..4f3d171 100644 --- a/apps/shared/contracts/api.ts +++ b/apps/shared/contracts/api.ts @@ -1,4 +1,4 @@ -import type { Run, RunDetail, RunStep, Workflow } from "./domain.js"; +import type { PromptDef, Run, RunDetail, RunStep, Workflow } from "./domain.js"; export interface ErrorEnvelope { error: { code: string; message: string; details?: unknown }; @@ -24,6 +24,90 @@ export type EvalHealthResponse = } | { available: false; reason: string }; +export type EvalsResponse = + | { + available: true; + generatedAt: string; + windowHours: number; + /** continuous_eval_success_rate × 100, fleet-wide. */ + score: number; + /** Σ eval_count across tasks — "spans graded" in the window. */ + spansGraded: number; + /** Σ trace_count across tasks. 
*/ + traceCount: number; + } + | { available: false; generatedAt: string; reason: string }; + +export interface CostByModelEntry { + /** Arthur span model_name. */ + model: string; + /** USD, summed total_token_cost over the window. */ + cost: number; + /** Summed total_token_count over the window. */ + tokens: number; +} + +export interface CostByWorkflowEntry { + /** Arthur task_id (per ticket-run, e.g. "AWT-42" / "AWT-42.1"). */ + taskId: string; + /** Arthur task name (= the ticket-run identifier). */ + name: string; + /** trace_count for the task. */ + runs: number; + /** trace_token_count. */ + tokens: number; + /** trace_token_cost (USD). */ + cost: number; + /** cost / max(1, runs). */ + costPerRun: number; +} + +export interface CostResponse { + generatedAt: string; + /** + * false when Arthur is unconfigured/unreachable or returns nothing. The + * screen renders its empty/N-A state. + */ + available: boolean; + /** Window the figures cover (the request's start_time/end_time). ISO. */ + window: { start: string; end: string }; + totals: { + /** USD, Σ overviews[].trace_token_cost. */ + totalTokenCost: number; + /** Σ overviews[].trace_token_count. */ + totalTokens: number; + /** Σ overviews[].trace_count. */ + traceCount: number; + /** totalTokenCost / max(1, traceCount). */ + costPerRun: number; + }; + byModel: CostByModelEntry[]; + /** Per-task (= per ticket-run) breakdown from /traces/overview. */ + byWorkflow: CostByWorkflowEntry[]; + /** Per-day spend, oldest→newest, merged across tasks from the timeseries. */ + daily: { date: string; cost: number; tokens: number }[]; +} + +export interface PromptsResponse { + generatedAt: string; + /** `false` when the worker can't resolve prompts (degrades to empty list). */ + available: boolean; + /** + * Whether Arthur is configured (key + endpoint + task id all set). When + * false, every prompt's `source` is "fallback" and `versions` is empty. 
+ */ + arthurEnabled: boolean; + rows: PromptDef[]; + total: number; +} + +/** On-demand body for a single historical Arthur version. */ +export interface PromptVersionBodyResponse { + generatedAt: string; + available: boolean; + body: string | null; +} + export interface LiveRunsResponse { generatedAt: string; rows: Run[]; diff --git a/apps/shared/contracts/domain.ts b/apps/shared/contracts/domain.ts index 6aba292..d4868a4 100644 --- a/apps/shared/contracts/domain.ts +++ b/apps/shared/contracts/domain.ts @@ -133,3 +133,36 @@ export interface HourPoint { p95: number; errors: number; } + +/** One Arthur version of a named prompt (metadata; body fetched on demand). */ +export interface PromptVersion { + /** Arthur integer version number. */ + version: number; + /** ISO timestamp the version was created. */ + createdAt: string; + /** Real Arthur tags on this version, e.g. ["production"]. */ + tags: string[]; + modelProvider: string; + modelName: string; + numMessages: number; + numTools: number; + /** Body text. Present only for the production version (eager); other + * versions are fetched on demand via the by-version endpoint. */ + body?: string; +} + +/** A workflow phase prompt as resolved by the worker at runtime. */ +export interface PromptDef { + /** Stable Arthur/fallback key: "research-plan" | "implement" | "review". */ + name: string; + /** Human label for the workflow phase, e.g. "Research & Plan". */ + phase: string; + /** Resolved production prompt body (Arthur production tag, or in-code fallback). */ + body: string; + /** Where the resolved `body` came from. */ + source: "arthur" | "fallback"; + /** Model the agent runs this prompt with (env-derived). */ + model: string; + /** Real Arthur version history, newest first. Empty when source is "fallback". 
*/ + versions: PromptVersion[]; +} diff --git a/apps/worker/src/lib/overview/collect-cost.test.ts b/apps/worker/src/lib/overview/collect-cost.test.ts new file mode 100644 index 0000000..f0d9736 --- /dev/null +++ b/apps/worker/src/lib/overview/collect-cost.test.ts @@ -0,0 +1,186 @@ +import { describe, it, expect, vi } from "vitest"; +import { collectCost, type CostArthurClient } from "./collect-cost.js"; +import type { + TraceOverviewListResponse, + TraceTimeseriesPoint, + ModelTokenCost, +} from "../../sandbox/arthur-client.js"; + +const NOW = new Date("2026-06-08T12:00:00.000Z"); + +function makeClient(opts: { + overview: TraceOverviewListResponse; + timeseries: Record; + byModel: ModelTokenCost[]; +}): CostArthurClient { + return { + getTracesOverview: vi.fn().mockResolvedValue(opts.overview), + getTracesTimeseries: vi + .fn() + .mockImplementation((taskId: string) => + Promise.resolve(opts.timeseries[taskId] ?? []), + ), + aggregateSpanTokensByModel: vi.fn().mockResolvedValue(opts.byModel), + }; +} + +describe("collectCost", () => { + it("aggregates totals, per-task breakdown, by-model, and merged daily series", async () => { + const client = makeClient({ + overview: { + count: 2, + overviews: [ + { + task_id: "t1", + trace_count: 4, + trace_token_count: 1000, + trace_token_cost: 2.0, + eval_count: 0, + continuous_eval_success_rate: 1, + last_active: "2026-06-08", + }, + { + task_id: "t2", + trace_count: 6, + trace_token_count: 3000, + trace_token_cost: 4.0, + eval_count: 0, + continuous_eval_success_rate: 1, + }, + ], + }, + timeseries: { + t1: [ + { timestamp: "2026-06-06", trace_count: 2, trace_token_count: 500, trace_token_cost: 1.0 }, + { timestamp: "2026-06-07", trace_count: 2, trace_token_count: 500, trace_token_cost: 1.0 }, + ], + t2: [ + { timestamp: "2026-06-07", trace_count: 3, trace_token_count: 1500, trace_token_cost: 2.0 }, + { timestamp: "2026-06-08", trace_count: 3, trace_token_count: 1500, trace_token_cost: 2.0 }, + ], + }, + byModel: [ + { 
model: "claude-opus-4-6", tokens: 3000, cost: 5.0 }, + { model: "claude-haiku", tokens: 1000, cost: 1.0 }, + ], + }); + + const data = await collectCost(client, { now: NOW, bucketSize: "day" }); + + // totals + expect(data.totals).toEqual({ + totalTokenCost: 6.0, + totalTokens: 4000, + traceCount: 10, + costPerRun: 0.6, + }); + + // window = calendar MTD + expect(data.window.start).toBe("2026-06-01T00:00:00.000Z"); + expect(data.window.end).toBe(NOW.toISOString()); + + // byWorkflow = per-task, with costPerRun guarded + expect(data.byWorkflow).toEqual([ + { taskId: "t1", name: "t1", runs: 4, tokens: 1000, cost: 2.0, costPerRun: 0.5 }, + { taskId: "t2", name: "t2", runs: 6, tokens: 3000, cost: 4.0, costPerRun: 4 / 6 }, + ]); + + // byModel passthrough mapped to contract shape + expect(data.byModel).toEqual([ + { model: "claude-opus-4-6", cost: 5.0, tokens: 3000 }, + { model: "claude-haiku", cost: 1.0, tokens: 1000 }, + ]); + + // daily merged by timestamp, oldest -> newest + expect(data.daily).toEqual([ + { date: "2026-06-06", cost: 1.0, tokens: 500 }, + { date: "2026-06-07", cost: 3.0, tokens: 2000 }, + { date: "2026-06-08", cost: 2.0, tokens: 1500 }, + ]); + }); + + it("treats null trace_token_cost as 0 and guards divide-by-zero", async () => { + const client = makeClient({ + overview: { + count: 1, + overviews: [ + { + task_id: "t1", + trace_count: 0, + trace_token_count: 0, + trace_token_cost: null, + eval_count: 0, + continuous_eval_success_rate: 0, + }, + ], + }, + timeseries: { t1: [] }, + byModel: [], + }); + + const data = await collectCost(client, { now: NOW, bucketSize: "day" }); + + expect(data.totals).toEqual({ + totalTokenCost: 0, + totalTokens: 0, + traceCount: 0, + costPerRun: 0, + }); + expect(data.byWorkflow).toEqual([ + { taskId: "t1", name: "t1", runs: 0, tokens: 0, cost: 0, costPerRun: 0 }, + ]); + expect(data.byModel).toEqual([]); + expect(data.daily).toEqual([]); + }); + + it("returns empty aggregates when Arthur has no tasks", async () => { + 
const client = makeClient({ + overview: { count: 0, overviews: [] }, + timeseries: {}, + byModel: [], + }); + + const data = await collectCost(client, { now: NOW, bucketSize: "day" }); + + expect(data.totals).toEqual({ + totalTokenCost: 0, + totalTokens: 0, + traceCount: 0, + costPerRun: 0, + }); + expect(data.byWorkflow).toEqual([]); + expect(data.byModel).toEqual([]); + expect(data.daily).toEqual([]); + // No tasks -> no per-task timeseries fan-out. + expect(client.getTracesTimeseries).not.toHaveBeenCalled(); + }); + + it("caps the daily timeseries fan-out to the 50 most-active tasks", async () => { + // 60 tasks, each with a distinct trace_count so the top-50 are deterministic. + const overviews = Array.from({ length: 60 }, (_, i) => ({ + task_id: `t${i}`, + trace_count: i, // t59 most active, t0 least + trace_token_count: 0, + trace_token_cost: 0, + eval_count: 0, + continuous_eval_success_rate: 0, + })); + const client = makeClient({ + overview: { count: overviews.length, overviews }, + timeseries: {}, + byModel: [], + }); + + await collectCost(client, { now: NOW, bucketSize: "day" }); + + // Only the 50 highest-trace_count tasks are queried (t10..t59). 
+ expect(client.getTracesTimeseries).toHaveBeenCalledTimes(50); + const queried = (client.getTracesTimeseries as ReturnType).mock.calls.map( + (c) => c[0], + ); + expect(queried).not.toContain("t0"); + expect(queried).not.toContain("t9"); + expect(queried).toContain("t10"); + expect(queried).toContain("t59"); + }); +}); diff --git a/apps/worker/src/lib/overview/collect-cost.ts b/apps/worker/src/lib/overview/collect-cost.ts new file mode 100644 index 0000000..258cb0e --- /dev/null +++ b/apps/worker/src/lib/overview/collect-cost.ts @@ -0,0 +1,142 @@ +import type { CostResponse } from "@shared/contracts"; +import { logger } from "../logger.js"; +import type { + TraceOverviewListResponse, + TraceTimeseriesPoint, + ModelTokenCost, +} from "../../sandbox/arthur-client.js"; + +/** + * The slice of `ArthurClient` the cost collector depends on. The real object is + * an `ArthurClient`; this narrow interface keeps the aggregation testable with a + * fake (mirrors `RunsLister` for the run-store collectors). + */ +export interface CostArthurClient { + getTracesOverview( + taskIds: string[], + startTime: string, + endTime: string, + ): Promise; + getTracesTimeseries( + taskId: string, + startTime: string, + endTime: string, + bucketSize: string, + ): Promise; + aggregateSpanTokensByModel( + taskIds: string[], + startTime: string, + endTime: string, + ): Promise; +} + +export interface CollectCostOptions { + now: Date; + /** Bucket granularity for the daily-spend timeseries. */ + bucketSize: string; +} + +/** + * Shapes a `CostResponse` (minus `generatedAt`/`available`) from Arthur's + * pre-aggregated token/cost data. Cost comes straight from Arthur's + * `*_token_cost` fields — no client-side pricing. + * + * - `totals` + `byWorkflow` come from one `getTracesOverview` call. Arthur tasks + * ARE the workflow grouping (per ticket-run), so each overview row is one + * `byWorkflow` entry. 
/**
 * Shapes a `CostResponse` (minus `generatedAt`/`available`) from Arthur's
 * pre-aggregated token/cost data. Cost comes straight from Arthur's
 * `*_token_cost` fields — no client-side pricing.
 *
 * - `totals` + `byWorkflow` come from one `getTracesOverview` call; every
 *   overview row becomes exactly one `byWorkflow` entry.
 * - `byModel` comes from `aggregateSpanTokensByModel` and is mapped 1:1.
 * - `daily` fans out one `getTracesTimeseries` call per task (capped to the
 *   most-active tasks) and merges the points by bucket timestamp.
 *
 * @param client Narrow Arthur read client (the real client or a test fake).
 * @param opts `now` is the window end and selects the month whose first UTC
 *   day is the window start; `bucketSize` is forwarded to the timeseries call.
 * @returns The cost payload without `generatedAt`/`available`; `daily` is
 *   sorted oldest -> newest by its `date` string.
 * @throws Rejects with the underlying error if any client call rejects.
 */
export async function collectCost(
  client: CostArthurClient,
  opts: CollectCostOptions,
): Promise<Omit<CostResponse, "generatedAt" | "available">> {
  const { now, bucketSize } = opts;
  // Assumption: calendar month-to-date (matches the original "MTD" framing).
  // TODO(arthur-verify): confirm the intended window (calendar MTD vs rolling 30d/24h).
  const start = startOfMonthUTC(now).toISOString();
  const end = now.toISOString();

  // TODO(arthur-verify): empty `task_ids` is assumed to mean org-wide. If Arthur
  // requires explicit ids, enumerate the org's tasks and pass them instead.
  // The overview and the per-model aggregation take the same inputs and do not
  // depend on each other, so they are issued concurrently.
  const [{ overviews }, byModelRaw] = await Promise.all([
    client.getTracesOverview([], start, end),
    client.aggregateSpanTokensByModel([], start, end),
  ]);

  const byWorkflow = overviews.map((o) => {
    // trace_token_cost is null when Arthur has no cost data — treat as 0.
    const cost = o.trace_token_cost ?? 0;
    return {
      taskId: o.task_id,
      // Arthur task name = the ticket-run identifier; overview omits it, so the
      // task_id (which IS that identifier) doubles as the display name.
      // TODO(arthur-verify): task->workflow mapping — rows stay per-task.
      name: o.task_id,
      runs: o.trace_count,
      tokens: o.trace_token_count,
      cost,
      // Guard divide-by-zero for tasks with no traces in the window.
      costPerRun: o.trace_count > 0 ? cost / o.trace_count : 0,
    };
  });

  let totalTokenCost = 0;
  let totalTokens = 0;
  let traceCount = 0;
  for (const row of byWorkflow) {
    totalTokenCost += row.cost;
    totalTokens += row.tokens;
    traceCount += row.runs;
  }
  const totals = {
    totalTokenCost,
    totalTokens,
    traceCount,
    costPerRun: traceCount > 0 ? totalTokenCost / traceCount : 0,
  };

  const byModel = byModelRaw.map((m) => ({
    model: m.model,
    cost: m.cost,
    tokens: m.tokens,
  }));

  // Fan out one timeseries call per task that has data, then merge by bucket.
  // Tasks are per-ticket-run, so a busy month can be hundreds — cap the fan-out
  // to the most-active tasks to avoid an unbounded burst of requests.
  // TODO(arthur-verify): cap is by trace_count, on the assumption the highest-
  // traffic tasks dominate the daily-spend curve; revisit if the chart looks short.
  const DAILY_FANOUT_CAP = 50;
  // Copy before sorting so the caller-visible overview order is untouched.
  const sortedByActivity = [...overviews].sort((a, b) => b.trace_count - a.trace_count);
  const fanoutTasks = sortedByActivity.slice(0, DAILY_FANOUT_CAP);
  if (sortedByActivity.length > DAILY_FANOUT_CAP) {
    logger.info(
      {
        total: sortedByActivity.length,
        capped: DAILY_FANOUT_CAP,
        dropped: sortedByActivity.slice(DAILY_FANOUT_CAP).map((o) => o.task_id),
      },
      "cost_daily_fanout_capped",
    );
  }

  const series = await Promise.all(
    fanoutTasks.map((o) => client.getTracesTimeseries(o.task_id, start, end, bucketSize)),
  );
  const merged = new Map<string, { cost: number; tokens: number }>();
  for (const points of series) {
    for (const p of points) {
      const row = merged.get(p.timestamp) ?? { cost: 0, tokens: 0 };
      row.cost += p.trace_token_cost ?? 0;
      row.tokens += p.trace_token_count;
      merged.set(p.timestamp, row);
    }
  }
  // Plain code-unit comparison keeps the ordering locale-independent.
  const daily = [...merged.entries()]
    .map(([date, v]) => ({ date, cost: v.cost, tokens: v.tokens }))
    .sort((a, b) => (a.date < b.date ? -1 : a.date > b.date ? 1 : 0));

  return { window: { start, end }, totals, byModel, byWorkflow, daily };
}

/** Returns midnight UTC on the first day of `now`'s UTC month. */
function startOfMonthUTC(now: Date): Date {
  return new Date(Date.UTC(now.getUTCFullYear(), now.getUTCMonth(), 1));
}
expect(result.spansGraded).toBe(0); + expect(result.traceCount).toBe(5); + expect(result.score).toBe(0); + }); + + it("computes the window start from windowHours and passes the ISO range to the client", async () => { + const client = makeClient([]); + + await collectEvals({ + client, + taskIds: ["x", "y"], + windowHours: 24, + now: NOW, + }); + + expect(client.getTracesOverview).toHaveBeenCalledWith( + ["x", "y"], + "2026-06-07T12:00:00.000Z", + "2026-06-08T12:00:00.000Z", + ); + }); + + it("returns zeroed aggregates when no overviews are returned", async () => { + const client = makeClient([]); + + const result = await collectEvals({ + client, + taskIds: [], + windowHours: 24, + now: NOW, + }); + + expect(result).toEqual({ + windowHours: 24, + score: 0, + spansGraded: 0, + traceCount: 0, + }); + }); +}); diff --git a/apps/worker/src/lib/overview/collect-evals.ts b/apps/worker/src/lib/overview/collect-evals.ts new file mode 100644 index 0000000..b144232 --- /dev/null +++ b/apps/worker/src/lib/overview/collect-evals.ts @@ -0,0 +1,74 @@ +import type { EvalsResponse } from "@shared/contracts"; +import type { TraceOverview } from "../../sandbox/arthur-client.js"; + +const HOUR = 3_600_000; + +/** Fleet aggregate fields the route spreads onto an `available: true` response. */ +export type EvalsAggregate = Pick< + Extract, + "windowHours" | "score" | "spansGraded" | "traceCount" +>; + +/** + * The slice of `ArthurClient` the eval collector depends on. The real object is + * an `ArthurClient`; this narrow interface keeps the aggregation testable with a + * fake (mirrors `CostArthurClient` for the cost collector). + */ +export interface EvalsArthurClient { + getTracesOverview( + taskIds: string[], + startTime: string, + endTime: string, + ): Promise<{ overviews: TraceOverview[] }>; +} + +export interface CollectEvalsOptions { + client: EvalsArthurClient; + // TODO(arthur-verify): unconfirmed whether `taskIds: []` means "all org tasks" + // on POST /api/v1/traces/overview. 
/**
 * Aggregates Arthur's per-task trace overviews into fleet-wide eval health
 * over the trailing `windowHours` ending at `now`.
 *
 * - `spansGraded` is the sum of `eval_count`; `traceCount` the sum of
 *   `trace_count`.
 * - `score` is the eval-count-weighted `continuous_eval_success_rate` × 100.
 *   When `spansGraded` sums to 0 the score is 0 instead of dividing by zero.
 *
 * @param opts Client, task ids (forwarded untouched), window length in hours,
 *   and the reference time that ends the window.
 * @returns The four aggregate fields; `windowHours` is echoed back.
 * @throws Rejects with the underlying error if the overview call rejects.
 */
export async function collectEvals(
  opts: CollectEvalsOptions,
): Promise<EvalsAggregate> {
  const { client, taskIds, windowHours, now } = opts;
  const endTime = now.toISOString();
  const startTime = new Date(now.getTime() - windowHours * HOUR).toISOString();

  const { overviews } = await client.getTracesOverview(taskIds, startTime, endTime);

  const spansGraded = sum(overviews, (o) => o.eval_count);
  const traceCount = sum(overviews, (o) => o.trace_count);
  // Weight each task's success rate by how many evals produced it, so a task
  // with two graded spans cannot outvote one with two hundred.
  const weightedPasses = sum(
    overviews,
    (o) => o.continuous_eval_success_rate * o.eval_count,
  );
  const score = spansGraded === 0 ? 0 : (weightedPasses / spansGraded) * 100;

  return { windowHours, score, spansGraded, traceCount };
}

/**
 * Sums `pick(item)` over `items`. Falsy picks (including `NaN`) count as 0 —
 * `||` is deliberate here because `??` would let `NaN` poison the total.
 */
function sum<T>(items: readonly T[], pick: (item: T) => number): number {
  return items.reduce((acc, item) => acc + (pick(item) || 0), 0);
}
PROMPT_FALLBACKS } from "../prompts.js"; + +async function setEnv(partial: Record) { + const mod = (await import("../../../env.js")) as unknown as { + env: Record; + }; + mod.env = { ...mod.env, ...partial }; +} + +function arthurVersion(version: number, tags: string[]) { + return { + version, + created_at: `2026-06-0${version}T00:00:00.000Z`, + deleted_at: null, + model_provider: "anthropic", + model_name: "claude-opus-4-6", + tags, + num_messages: 1, + num_tools: 0, + }; +} + +describe("resolvePrompts", () => { + beforeEach(async () => { + mockGetPromptByTag.mockReset(); + mockListPromptVersions.mockReset(); + await setEnv({ + AGENT_KIND: "claude", + CLAUDE_MODEL: "claude-opus-4-6", + CODEX_MODEL: "gpt-5-codex", + GENAI_ENGINE_API_KEY: undefined, + GENAI_ENGINE_TRACE_ENDPOINT: undefined, + GENAI_ENGINE_PROMPT_TASK_ID: undefined, + }); + }); + + it("returns fallbacks with empty versions when Arthur is disabled", async () => { + const { arthurEnabled, prompts } = await resolvePrompts({ withVersions: true }); + expect(arthurEnabled).toBe(false); + expect(prompts).toHaveLength(3); + expect(prompts.map((p) => p.name)).toEqual(["research-plan", "implement", "review"]); + for (const p of prompts) { + expect(p.source).toBe("fallback"); + expect(p.versions).toEqual([]); + expect(p.model).toBe("claude-opus-4-6"); + } + expect(prompts[0].body).toBe(PROMPT_FALLBACKS["research-plan"]); + expect(prompts[0].phase).toBe("Research & Plan"); + expect(mockGetPromptByTag).not.toHaveBeenCalled(); + }); + + it("returns fallbacks when PROMPT_TASK_ID is missing even if key+endpoint are set", async () => { + await setEnv({ + GENAI_ENGINE_API_KEY: "k", + GENAI_ENGINE_TRACE_ENDPOINT: "https://host/api/v1/traces", + GENAI_ENGINE_PROMPT_TASK_ID: undefined, + }); + const { arthurEnabled, prompts } = await resolvePrompts({ withVersions: true }); + expect(arthurEnabled).toBe(false); + expect(prompts[0].source).toBe("fallback"); + expect(mockGetPromptByTag).not.toHaveBeenCalled(); + }); + + 
it("resolves Arthur bodies + version history when enabled, attaching the production body", async () => { + await setEnv({ + GENAI_ENGINE_API_KEY: "k", + GENAI_ENGINE_TRACE_ENDPOINT: "https://host/api/v1/traces", + GENAI_ENGINE_PROMPT_TASK_ID: "00000000-0000-0000-0000-000000000000", + }); + mockGetPromptByTag.mockResolvedValue("arthur body"); + mockListPromptVersions.mockResolvedValue([ + arthurVersion(2, ["production"]), + arthurVersion(1, []), + ]); + + const { arthurEnabled, prompts } = await resolvePrompts({ withVersions: true }); + expect(arthurEnabled).toBe(true); + expect(mockGetPromptByTag).toHaveBeenCalledTimes(3); + const research = prompts[0]; + expect(research.source).toBe("arthur"); + expect(research.body).toBe("arthur body"); + expect(research.versions).toHaveLength(2); + expect(research.versions[0]).toMatchObject({ + version: 2, + createdAt: "2026-06-02T00:00:00.000Z", + tags: ["production"], + modelProvider: "anthropic", + modelName: "claude-opus-4-6", + numMessages: 1, + numTools: 0, + }); + // production version carries the eager body; the other does not + expect(research.versions[0].body).toBe("arthur body"); + expect(research.versions[1].body).toBeUndefined(); + }); + + it("falls back per-prompt when the production body is missing but keeps versions", async () => { + await setEnv({ + GENAI_ENGINE_API_KEY: "k", + GENAI_ENGINE_TRACE_ENDPOINT: "https://host/api/v1/traces", + GENAI_ENGINE_PROMPT_TASK_ID: "00000000-0000-0000-0000-000000000000", + }); + mockGetPromptByTag.mockResolvedValue(null); + mockListPromptVersions.mockResolvedValue([arthurVersion(1, [])]); + + const { prompts } = await resolvePrompts({ withVersions: true }); + expect(prompts[0].source).toBe("fallback"); + expect(prompts[0].body).toBe(PROMPT_FALLBACKS["research-plan"]); + expect(prompts[0].versions).toHaveLength(1); + }); + + it("degrades a prompt to fallback with empty versions when the body fetch throws", async () => { + await setEnv({ + GENAI_ENGINE_API_KEY: "k", + 
GENAI_ENGINE_TRACE_ENDPOINT: "https://host/api/v1/traces", + GENAI_ENGINE_PROMPT_TASK_ID: "00000000-0000-0000-0000-000000000000", + }); + mockGetPromptByTag.mockRejectedValue(new Error("boom")); + mockListPromptVersions.mockResolvedValue([]); + + const { prompts } = await resolvePrompts({ withVersions: true }); + expect(prompts[0].source).toBe("fallback"); + expect(prompts[0].body).toBe(PROMPT_FALLBACKS["research-plan"]); + expect(prompts[0].versions).toEqual([]); + }); + + it("skips the version fan-out and resolves empty versions when withVersions is false", async () => { + await setEnv({ + GENAI_ENGINE_API_KEY: "k", + GENAI_ENGINE_TRACE_ENDPOINT: "https://host/api/v1/traces", + GENAI_ENGINE_PROMPT_TASK_ID: "00000000-0000-0000-0000-000000000000", + }); + mockGetPromptByTag.mockResolvedValue("arthur body"); + + const { prompts } = await resolvePrompts({ withVersions: false }); + expect(mockGetPromptByTag).toHaveBeenCalledTimes(3); + expect(mockListPromptVersions).not.toHaveBeenCalled(); + expect(prompts[0].source).toBe("arthur"); + expect(prompts[0].body).toBe("arthur body"); + expect(prompts[0].versions).toEqual([]); + }); + + it("uses the codex model when AGENT_KIND=codex", async () => { + await setEnv({ AGENT_KIND: "codex" }); + const { prompts } = await resolvePrompts({ withVersions: true }); + expect(prompts[0].model).toBe("gpt-5-codex"); + }); +}); diff --git a/apps/worker/src/lib/overview/collect-prompts.ts b/apps/worker/src/lib/overview/collect-prompts.ts new file mode 100644 index 0000000..b36ee40 --- /dev/null +++ b/apps/worker/src/lib/overview/collect-prompts.ts @@ -0,0 +1,113 @@ +import type { PromptVersion } from "@shared/contracts"; +import { env } from "../../../env.js"; +import { logger } from "../logger.js"; +import { PROMPT_FALLBACKS, PROMPT_NAMES, type PromptName } from "../prompts.js"; + +const PHASE_LABEL: Record = { + "research-plan": "Research & Plan", + "implement": "Implement", + "review": "Review", +}; + +export interface ResolvedPrompt { 
export interface ResolvePromptsResult {
  arthurEnabled: boolean;
  prompts: ResolvedPrompt[];
}

/**
 * Resolve each workflow phase prompt to its production body + (optionally) real
 * Arthur version history. Shared by the durable `loadPrompts()` step and the
 * `GET /api/v1/prompts` route so the two never drift.
 *
 * Version history is a dashboard-only concern, so `withVersions` lets the
 * durable step skip the per-prompt `listPromptVersions` fan-out it would
 * otherwise discard. When false, `versions` resolves to `[]` and only the
 * production body is fetched.
 *
 * When Arthur is unconfigured (any of `GENAI_ENGINE_API_KEY`,
 * `GENAI_ENGINE_TRACE_ENDPOINT`, `GENAI_ENGINE_PROMPT_TASK_ID` unset) every
 * prompt resolves to its in-code `PROMPT_FALLBACKS` string with
 * `source: "fallback"` and an empty version history.
 *
 * Failures are isolated per prompt: a failed body fetch degrades that prompt
 * to its fallback with no versions; a failed version listing keeps the body
 * and yields an empty history.
 *
 * @param opts `withVersions` toggles the version-history fan-out.
 * @returns `arthurEnabled` plus one `ResolvedPrompt` per `PROMPT_NAMES` entry,
 *   in `PROMPT_NAMES` order.
 */
export async function resolvePrompts(opts: { withVersions: boolean }): Promise<ResolvePromptsResult> {
  const { withVersions } = opts;
  const model = env.AGENT_KIND === "codex" ? env.CODEX_MODEL : env.CLAUDE_MODEL;
  // Read once into consts so the guard below narrows them for the closures.
  const apiKey = env.GENAI_ENGINE_API_KEY;
  const traceEndpoint = env.GENAI_ENGINE_TRACE_ENDPOINT;
  const taskId = env.GENAI_ENGINE_PROMPT_TASK_ID;

  const base = (
    name: PromptName,
    body: string,
    source: "arthur" | "fallback",
    versions: PromptVersion[] = [],
  ): ResolvedPrompt => ({ name, phase: PHASE_LABEL[name], body, source, model, versions });

  if (!apiKey || !traceEndpoint || !taskId) {
    logger.info({ source: "fallback", reason: "arthur_prompts_disabled" }, "prompts_resolved");
    return {
      arthurEnabled: false,
      prompts: PROMPT_NAMES.map((n) => base(n, PROMPT_FALLBACKS[n], "fallback")),
    };
  }

  // Loaded lazily so the disabled path never pulls in the client module.
  const { ArthurClient } = await import("../../sandbox/arthur-client.js");
  const client = ArthurClient.fromTraceEndpoint(traceEndpoint, apiKey);
  const TAG = "production";

  // Arrow function (not a hoisted declaration) so the narrowed consts above
  // keep their non-undefined types inside the closure.
  const resolveOne = async (name: PromptName): Promise<ResolvedPrompt> => {
    try {
      // TODO(arthur-verify): version-list pagination depth — first page only.
      let body: string | null;
      let versions: PromptVersion[] = [];
      if (withVersions) {
        const [rawBody, rawVersions] = await Promise.all([
          client.getPromptByTag(taskId, name, TAG),
          // Version history is best-effort: keep the body, but leave a trace
          // in the logs instead of swallowing the failure silently.
          client.listPromptVersions(taskId, name).catch((err: unknown) => {
            logger.warn(
              {
                name,
                reason: "arthur_versions_unavailable",
                err: err instanceof Error ? err.message : String(err),
              },
              "prompts_resolved",
            );
            return [];
          }),
        ]);
        body = rawBody;
        versions = rawVersions.map((v) => ({
          version: v.version,
          createdAt: v.created_at,
          tags: v.tags,
          modelProvider: v.model_provider,
          modelName: v.model_name,
          numMessages: v.num_messages,
          numTools: v.num_tools,
        }));
        // Attach the eager production body to its matching version entry; other
        // version bodies are fetched on demand via the by-version route.
        const prodVersion = versions.find((v) => v.tags.includes(TAG));
        if (prodVersion && body !== null) prodVersion.body = body;
      } else {
        body = await client.getPromptByTag(taskId, name, TAG);
      }

      if (body === null) {
        logger.info({ name, source: "fallback", reason: "arthur_prompt_missing" }, "prompts_resolved");
        return base(name, PROMPT_FALLBACKS[name], "fallback", versions);
      }
      logger.info({ name, source: "arthur", versions: versions.length }, "prompts_resolved");
      return base(name, body, "arthur", versions);
    } catch (err: unknown) {
      logger.warn(
        { name, source: "fallback", err: err instanceof Error ? err.message : String(err) },
        "prompts_resolved",
      );
      return base(name, PROMPT_FALLBACKS[name], "fallback");
    }
  };

  const prompts = await Promise.all(PROMPT_NAMES.map((name) => resolveOne(name)));
  return { arthurEnabled: true, prompts };
}
/** Trailing window, in hours, over which fleet eval health is aggregated. */
const WINDOW_HOURS = 24;

/**
 * `GET /api/v1/evals` — fleet-wide eval health from Arthur trace overviews.
 *
 * Never throws to the caller: an unconfigured Arthur, a window with nothing
 * graded, and any failure of the Arthur call all resolve to an
 * `available: false` payload carrying a human-readable `reason`.
 */
export default defineEventHandler(async (event): Promise<EvalsResponse> => {
  setResponseHeader(
    event,
    "Cache-Control",
    "private, max-age=15, stale-while-revalidate=60",
  );

  // One clock reading, so `generatedAt` and the query window end agree.
  const now = new Date();
  const generatedAt = now.toISOString();

  const apiKey = env.GENAI_ENGINE_API_KEY;
  const traceEndpoint = env.GENAI_ENGINE_TRACE_ENDPOINT;
  if (!apiKey || !traceEndpoint) {
    return {
      available: false,
      generatedAt,
      reason: "Arthur GenAI Engine not configured.",
    };
  }

  try {
    const client = ArthurClient.fromTraceEndpoint(traceEndpoint, apiKey);
    // TODO(arthur-verify): pass [] if empty task_ids === all org tasks on
    // POST /api/v1/traces/overview; otherwise enumerate via /api/v2/tasks/search.
    const taskIds: string[] = [];

    const { windowHours, score, spansGraded, traceCount } = await collectEvals({
      client,
      taskIds,
      windowHours: WINDOW_HOURS,
      now,
    });

    if (spansGraded === 0) {
      return {
        available: false,
        generatedAt,
        // Derived from the constant so the message cannot drift from the window.
        reason: `No graded evals in the last ${WINDOW_HOURS}h.`,
      };
    }

    return {
      available: true,
      generatedAt,
      windowHours,
      score,
      spansGraded,
      traceCount,
    };
  } catch (err: unknown) {
    logger.warn(
      { err: err instanceof Error ? err.message : String(err) },
      "evals_list_failed",
    );
    // NOTE(review): this reason predates the read path; it is also what callers
    // see when Arthur is unreachable or rejects the request — confirm the
    // dashboard copy before rewording it.
    return {
      available: false,
      generatedAt,
      reason: "Eval grading not wired up yet.",
    };
  }
});
// TODO(arthur-verify): lazy-vs-eager body — historical bodies are fetched on
// demand here; the production body ships eagerly on the list route.
/**
 * `GET /api/v1/prompts/[name]/versions/[version]` — body of one prompt version.
 *
 * Resolves to `available: false, body: null` when Arthur is not fully
 * configured, when `name` is not one of `PROMPT_NAMES`, when `version` is
 * empty, or when the Arthur call fails. Otherwise `available` reflects whether
 * the client returned a non-null body.
 */
export default defineEventHandler(async (event): Promise<PromptVersionBodyResponse> => {
  setResponseHeader(
    event,
    "Cache-Control",
    "private, max-age=15, stale-while-revalidate=60",
  );
  const generatedAt = new Date().toISOString();
  const unavailable = (): PromptVersionBodyResponse => ({
    generatedAt,
    available: false,
    body: null,
  });

  const name = getRouterParam(event, "name") ?? "";
  const version = getRouterParam(event, "version") ?? "";
  // Read once into consts so the guard narrows them for the calls below.
  const apiKey = env.GENAI_ENGINE_API_KEY;
  const traceEndpoint = env.GENAI_ENGINE_TRACE_ENDPOINT;
  const taskId = env.GENAI_ENGINE_PROMPT_TASK_ID;

  if (
    !apiKey ||
    !traceEndpoint ||
    !taskId ||
    !PROMPT_NAMES.includes(name as PromptName) ||
    !version
  ) {
    return unavailable();
  }

  try {
    // Loaded lazily so the disabled path never pulls in the client module.
    const { ArthurClient } = await import("../../../../../../sandbox/arthur-client.js");
    const client = ArthurClient.fromTraceEndpoint(traceEndpoint, apiKey);
    const body = await client.getPromptVersionBody(taskId, name, version);
    return { generatedAt, available: body !== null, body };
  } catch (err: unknown) {
    logger.warn(
      { name, version, err: err instanceof Error ? err.message : String(err) },
      "prompt_version_body_failed",
    );
    return unavailable();
  }
});
expect(init.method).toBe("POST"); + expect(init.headers.Authorization).toBe("Bearer secret"); + expect(JSON.parse(init.body)).toEqual({ + task_ids: ["AWT-42"], + start_time: "2026-06-01T00:00:00Z", + end_time: "2026-06-08T00:00:00Z", + }); + }); + }); + + describe("getTracesTimeseries", () => { + it("POSTs single task_id + bucket_size and unwraps the { points } envelope", async () => { + mockFetch.mockResolvedValueOnce(jsonResponse({ + points: [ + { timestamp: "2026-06-07T00:00:00Z", trace_count: 1, trace_token_count: 400, trace_token_cost: 0.1 }, + ], + })); + const client = new ArthurClient("http://host", "k"); + const points = await client.getTracesTimeseries("AWT-42", "s", "e", "day"); + + expect(points).toHaveLength(1); + expect(points[0].trace_token_cost).toBe(0.1); + const [url, init] = mockFetch.mock.calls[0]; + expect(url).toBe("http://host/api/v1/traces/overview/timeseries"); + expect(init.method).toBe("POST"); + expect(JSON.parse(init.body)).toEqual({ + task_id: "AWT-42", + start_time: "s", + end_time: "e", + bucket_size: "day", + }); + }); + + it("accepts a bare array response", async () => { + mockFetch.mockResolvedValueOnce(jsonResponse([ + { timestamp: "t", trace_count: 2, trace_token_count: 10, trace_token_cost: null }, + ])); + const client = new ArthurClient("http://host", "k"); + const points = await client.getTracesTimeseries("AWT-42", "s", "e", "day"); + expect(points).toHaveLength(1); + }); + }); + + describe("aggregateSpanTokensByModel", () => { + it("sums tokens/cost grouped by model_name and skips null models", async () => { + mockFetch.mockResolvedValueOnce(jsonResponse({ + spans: [ + { model_name: "claude-opus-4-6", total_token_count: 100, total_token_cost: 0.5 }, + { model_name: "claude-opus-4-6", total_token_count: 50, total_token_cost: 0.25 }, + { model_name: "gpt-5", total_token_count: 200, total_token_cost: 1.0 }, + { model_name: null, total_token_count: 999, total_token_cost: 9.0 }, + ], + })); + const client = new 
ArthurClient("http://host", "k"); + const rows = await client.aggregateSpanTokensByModel(["AWT-42"], "s", "e"); + + expect(rows).toEqual([ + { model: "claude-opus-4-6", tokens: 150, cost: 0.75 }, + { model: "gpt-5", tokens: 200, cost: 1.0 }, + ]); + const [url, init] = mockFetch.mock.calls[0]; + expect(url).toBe("http://host/api/v1/traces/spans"); + expect(JSON.parse(init.body)).toEqual({ + task_ids: ["AWT-42"], + start_time: "s", + end_time: "e", + limit: 1000, + }); + }); + + it("treats null token/cost as 0", async () => { + mockFetch.mockResolvedValueOnce(jsonResponse([ + { model_name: "m", total_token_count: null, total_token_cost: null }, + ])); + const client = new ArthurClient("http://host", "k"); + const rows = await client.aggregateSpanTokensByModel([], "s", "e"); + expect(rows).toEqual([{ model: "m", tokens: 0, cost: 0 }]); + }); + }); + + describe("listPromptVersions", () => { + it("GETs the versions endpoint and sorts newest-first", async () => { + mockFetch.mockResolvedValueOnce(jsonResponse({ + count: 2, + versions: [ + { + version: 1, + created_at: "2026-06-01T00:00:00Z", + deleted_at: null, + model_provider: "anthropic", + model_name: "claude-opus-4-6", + tags: [], + num_messages: 1, + num_tools: 0, + }, + { + version: 2, + created_at: "2026-06-02T00:00:00Z", + deleted_at: null, + model_provider: "anthropic", + model_name: "claude-opus-4-6", + tags: ["production"], + num_messages: 1, + num_tools: 0, + }, + ], + })); + const client = new ArthurClient("http://host", "k"); + const versions = await client.listPromptVersions("task-uuid", "research-plan"); + + expect(versions.map((v) => v.version)).toEqual([2, 1]); + const [url, init] = mockFetch.mock.calls[0]; + expect(url).toBe("http://host/api/v1/tasks/task-uuid/prompts/research-plan/versions"); + expect(init.method).toBe("GET"); + expect(init.headers.Authorization).toBe("Bearer k"); + }); + + it("returns [] on 404", async () => { + mockFetch.mockResolvedValueOnce(new Response("not found", { status: 
404 })); + const client = new ArthurClient("http://host", "k"); + expect(await client.listPromptVersions("t", "research-plan")).toEqual([]); + }); + + it("throws on 5xx", async () => { + mockFetch.mockResolvedValueOnce(new Response("boom", { status: 500 })); + const client = new ArthurClient("http://host", "k"); + await expect(client.listPromptVersions("t", "x")).rejects.toThrow(/500/); + }); + }); + + describe("getPromptVersionBody", () => { + it("GETs the by-version endpoint and returns messages[0].content", async () => { + mockFetch.mockResolvedValueOnce(jsonResponse({ + name: "research-plan", + version: 3, + messages: [{ role: "user", content: "v3 body" }], + })); + const client = new ArthurClient("http://host", "k"); + const body = await client.getPromptVersionBody("task-uuid", "research-plan", 3); + expect(body).toBe("v3 body"); + const [url, init] = mockFetch.mock.calls[0]; + expect(url).toBe("http://host/api/v1/tasks/task-uuid/prompts/research-plan/versions/3"); + expect(init.method).toBe("GET"); + }); + + it("accepts a string version specifier (latest/tag/datetime)", async () => { + mockFetch.mockResolvedValueOnce(jsonResponse({ + name: "implement", + messages: [{ role: "user", content: "latest body" }], + })); + const client = new ArthurClient("http://host", "k"); + const body = await client.getPromptVersionBody("t", "implement", "latest"); + expect(body).toBe("latest body"); + const [url] = mockFetch.mock.calls[0]; + expect(url).toBe("http://host/api/v1/tasks/t/prompts/implement/versions/latest"); + }); + + it("returns null on 404", async () => { + mockFetch.mockResolvedValueOnce(new Response("not found", { status: 404 })); + const client = new ArthurClient("http://host", "k"); + expect(await client.getPromptVersionBody("t", "x", 1)).toBeNull(); + }); + + it("throws on 5xx", async () => { + mockFetch.mockResolvedValueOnce(new Response("boom", { status: 500 })); + const client = new ArthurClient("http://host", "k"); + await 
expect(client.getPromptVersionBody("t", "x", 1)).rejects.toThrow(/500/); + }); + }); }); diff --git a/apps/worker/src/sandbox/arthur-client.ts b/apps/worker/src/sandbox/arthur-client.ts index 8afc77c..66d8b8f 100644 --- a/apps/worker/src/sandbox/arthur-client.ts +++ b/apps/worker/src/sandbox/arthur-client.ts @@ -23,6 +23,68 @@ interface SearchResponse { tasks: ArthurTask[]; } +/** + * Per-task aggregate over a window from `POST /api/v1/traces/overview`. + * Token/cost fields come from Arthur's `TokenCountCostSchema`; `trace_token_cost` + * may be null when cost is unavailable. Typed per the documented shape — these + * read endpoints are UNVERIFIED against a live instance, so parsing stays + * defensive (callers treat null cost as 0). + */ +export interface TraceOverview { + task_id: string; + trace_count: number; + trace_token_count: number; + trace_token_cost: number | null; + eval_count: number; + continuous_eval_success_rate: number; + last_active?: string; +} + +export interface TraceOverviewListResponse { + count: number; + overviews: TraceOverview[]; +} + +/** One bucket from `POST /api/v1/traces/overview/timeseries` (single task). */ +export interface TraceTimeseriesPoint { + timestamp: string; + trace_count: number; + trace_token_count: number; + trace_token_cost: number | null; + continuous_eval_success_rate?: number; +} + +/** Token/cost-by-model aggregation result (one row per Arthur `model_name`). */ +export interface ModelTokenCost { + model: string; + tokens: number; + cost: number; +} + +/** A span row from `GET /api/v1/traces/spans` carrying model + token/cost fields. */ +interface SpanTokenCost { + model_name: string | null; + total_token_count: number | null; + total_token_cost: number | null; +} + +/** One Arthur prompt version's metadata (no message body). 
*/ +export interface ArthurPromptVersion { + version: number; + created_at: string; + deleted_at: string | null; + model_provider: string; + model_name: string; + tags: string[]; + num_messages: number; + num_tools: number; +} + +interface AgenticPromptVersionListResponse { + count: number; + versions: ArthurPromptVersion[]; +} + export class ArthurClient { constructor( private readonly baseUrl: string, @@ -56,6 +118,20 @@ export class ArthurClient { return (await res.json()) as T; } + /** GET that treats 404 as "absent" (returns null) instead of throwing — for the prompt read paths. */ + private async getAllowing404(path: string): Promise { + const res = await fetch(`${this.baseUrl}${path}`, { + method: "GET", + headers: { Authorization: `Bearer ${this.apiKey}`, "ngrok-skip-browser-warning": "true" }, + }); + if (res.status === 404) return null; + if (!res.ok) { + const body = await res.text().catch(() => ""); + throw new Error(`Arthur GET ${path} → ${res.status}: ${body.slice(0, 300)}`); + } + return (await res.json()) as T; + } + /** * Return tasks whose name equals `prefix` or matches `^prefix\.\d+$`. * Arthur's `task_name` search is substring-based, so we post-filter to @@ -124,21 +200,8 @@ export class ArthurClient { /** Fetch a tagged prompt version. Returns the first message's content, or null if 404. 
*/ async getPromptByTag(taskId: string, name: string, tag: string): Promise { const path = `/api/v1/tasks/${encodeURIComponent(taskId)}/prompts/${encodeURIComponent(name)}/versions/tags/${encodeURIComponent(tag)}`; - const res = await fetch(`${this.baseUrl}${path}`, { - method: "GET", - headers: { - "Authorization": `Bearer ${this.apiKey}`, - "ngrok-skip-browser-warning": "true", - }, - }); - if (res.status === 404) return null; - if (!res.ok) { - const body = await res.text().catch(() => ""); - throw new Error(`Arthur GET ${path} → ${res.status}: ${body.slice(0, 300)}`); - } - const prompt = (await res.json()) as AgenticPrompt; - const first = prompt.messages?.[0]; - return first?.content ?? null; + const prompt = await this.getAllowing404(path); + return prompt?.messages?.[0]?.content ?? null; } /** Create a new version of a named prompt on a task. Content is sent as a single user message. */ @@ -171,4 +234,110 @@ export class ArthurClient { }, ); } + + /** + * Fleet eval/cost aggregate over a window. One call covers multiple tasks; + * sum across `overviews` for fleet totals. `taskIds` may be empty (see the + * empty-means-all-org open question in the specs). Shared by /evals + /cost. + */ + async getTracesOverview( + taskIds: string[], + startTime: string, + endTime: string, + ): Promise { + return this.request("/api/v1/traces/overview", { + method: "POST", + body: JSON.stringify({ + task_ids: taskIds, + start_time: startTime, + end_time: endTime, + }), + }); + } + + /** + * Per-bucket timeseries for a single task. The caller fans out one call per + * task and merges points by timestamp. The response envelope key is + * unverified, so accept both a bare array and a `{ points }` wrapper. 
+ */ + async getTracesTimeseries( + taskId: string, + startTime: string, + endTime: string, + bucketSize: string, + ): Promise { + const res = await this.request<{ points?: TraceTimeseriesPoint[] } | TraceTimeseriesPoint[]>( + "/api/v1/traces/overview/timeseries", + { + method: "POST", + body: JSON.stringify({ + task_id: taskId, + start_time: startTime, + end_time: endTime, + bucket_size: bucketSize, + }), + }, + ); + return Array.isArray(res) ? res : (res.points ?? []); + } + + /** + * By-model token/cost aggregation — Arthur has no per-model overview, so we + * fetch span rows (which carry `model_name` + token/cost fields) and sum + * grouped by `model_name`. Spans with a null `model_name` are skipped. + */ + async aggregateSpanTokensByModel( + taskIds: string[], + startTime: string, + endTime: string, + ): Promise { + // TODO(arthur-verify): pagination — first page only, bounded to N spans. The + // read endpoints are unverified, so we send a bounded `limit` rather than + // looping pages; this makes the ceiling explicit instead of pulling an + // unbounded result set and summing it silently in memory. + const res = await this.request<{ spans?: SpanTokenCost[] } | SpanTokenCost[]>( + "/api/v1/traces/spans", + { + method: "POST", + body: JSON.stringify({ + task_ids: taskIds, + start_time: startTime, + end_time: endTime, + limit: 1000, + }), + }, + ); + const spans = Array.isArray(res) ? res : (res.spans ?? []); + const byModel = new Map(); + for (const span of spans) { + if (!span.model_name) continue; + const row = byModel.get(span.model_name) ?? { + model: span.model_name, + tokens: 0, + cost: 0, + }; + row.tokens += span.total_token_count ?? 0; + row.cost += span.total_token_cost ?? 0; + byModel.set(span.model_name, row); + } + return [...byModel.values()]; + } + + /** List version metadata for a named prompt (newest first). First page only. Empty on 404. 
*/ + async listPromptVersions(taskId: string, name: string): Promise { + const path = `/api/v1/tasks/${encodeURIComponent(taskId)}/prompts/${encodeURIComponent(name)}/versions`; + const data = await this.getAllowing404(path); + return [...(data?.versions ?? [])].sort((a, b) => b.version - a.version); + } + + /** + * Fetch the body of a specific version. `version` accepts an integer, + * `"latest"`, an ISO datetime, or a tag. Returns the first message's content, + * or null on 404. Generalizes the by-version GET that `getPromptByTag` uses. + */ + async getPromptVersionBody(taskId: string, name: string, version: number | string): Promise { + const path = `/api/v1/tasks/${encodeURIComponent(taskId)}/prompts/${encodeURIComponent(name)}/versions/${encodeURIComponent(String(version))}`; + const prompt = await this.getAllowing404(path); + return prompt?.messages?.[0]?.content ?? null; + } } diff --git a/apps/worker/src/workflows/prompts-step.test.ts b/apps/worker/src/workflows/prompts-step.test.ts index ab09934..4350061 100644 --- a/apps/worker/src/workflows/prompts-step.test.ts +++ b/apps/worker/src/workflows/prompts-step.test.ts @@ -3,9 +3,13 @@ import { describe, it, expect, vi, beforeEach } from "vitest"; vi.mock("../../env.js", () => ({ env: {} })); const mockGetPromptByTag = vi.fn(); +const mockListPromptVersions = vi.fn(); vi.mock("../sandbox/arthur-client.js", () => ({ ArthurClient: { - fromTraceEndpoint: vi.fn(() => ({ getPromptByTag: mockGetPromptByTag })), + fromTraceEndpoint: vi.fn(() => ({ + getPromptByTag: mockGetPromptByTag, + listPromptVersions: mockListPromptVersions, + })), }, })); @@ -20,6 +24,8 @@ async function setEnv(partial: Record) { describe("loadPrompts", () => { beforeEach(async () => { mockGetPromptByTag.mockReset(); + mockListPromptVersions.mockReset(); + mockListPromptVersions.mockResolvedValue([]); await setEnv({ GENAI_ENGINE_API_KEY: undefined, GENAI_ENGINE_TRACE_ENDPOINT: undefined, @@ -65,6 +71,9 @@ describe("loadPrompts", () => { 
expect(mockGetPromptByTag).toHaveBeenCalledTimes(3); const names = mockGetPromptByTag.mock.calls.map((c) => c[1]); expect(names).toEqual(["research-plan", "implement", "review"]); + // The step throws version metadata away, so it must not pay for the + // dashboard-only listPromptVersions fan-out. + expect(mockListPromptVersions).not.toHaveBeenCalled(); }); it("falls back per-prompt when Arthur returns null or throws", async () => { diff --git a/apps/worker/src/workflows/prompts-step.ts b/apps/worker/src/workflows/prompts-step.ts index 9baae40..bc4a44f 100644 --- a/apps/worker/src/workflows/prompts-step.ts +++ b/apps/worker/src/workflows/prompts-step.ts @@ -6,53 +6,18 @@ export interface LoadedPrompts { export async function loadPrompts(): Promise { "use step"; - const { env } = await import("../../env.js"); - const { logger } = await import("../lib/logger.js"); - const { PROMPT_FALLBACKS } = await import("../lib/prompts.js"); - type PromptName = keyof typeof PROMPT_FALLBACKS; - - const arthurEnabled = - !!env.GENAI_ENGINE_API_KEY && - !!env.GENAI_ENGINE_TRACE_ENDPOINT && - !!env.GENAI_ENGINE_PROMPT_TASK_ID; - - if (!arthurEnabled) { - logger.info({ source: "fallback", reason: "arthur_prompts_disabled" }, "prompts_loaded"); - return { - research: PROMPT_FALLBACKS["research-plan"], - implement: PROMPT_FALLBACKS["implement"], - review: PROMPT_FALLBACKS["review"], - }; - } - - const { ArthurClient } = await import("../sandbox/arthur-client.js"); - const client = ArthurClient.fromTraceEndpoint( - env.GENAI_ENGINE_TRACE_ENDPOINT!, - env.GENAI_ENGINE_API_KEY!, - ); - const taskId = env.GENAI_ENGINE_PROMPT_TASK_ID!; - const TAG = "production"; - - async function one(name: PromptName): Promise { - try { - const body = await client.getPromptByTag(taskId, name, TAG); - if (body === null) { - logger.info({ name, source: "fallback", reason: "arthur_prompt_missing" }, "prompts_loaded"); - return PROMPT_FALLBACKS[name]; - } - logger.info({ name, source: "arthur" }, 
"prompts_loaded"); - return body; - } catch (err) { - logger.warn({ name, source: "fallback", err: (err as Error).message }, "prompts_loaded"); - return PROMPT_FALLBACKS[name]; - } - } - - const [research, implement, review] = await Promise.all([ - one("research-plan"), - one("implement"), - one("review"), - ]); - return { research, implement, review }; + // Delegate to the shared resolver so the durable step and the + // GET /api/v1/prompts route share one source of truth. The resolver carries + // the same logger.info/logger.warn (fallback / arthur / per-prompt error) + // calls the step used to make. Version history is dashboard-only, so skip the + // listPromptVersions fan-out here — the step only consumes prompt bodies. + const { resolvePrompts } = await import("../lib/overview/collect-prompts.js"); + const { prompts } = await resolvePrompts({ withVersions: false }); + const byName = Object.fromEntries(prompts.map((p) => [p.name, p.body])); + return { + research: byName["research-plan"], + implement: byName["implement"], + review: byName["review"], + }; } loadPrompts.maxRetries = 0; diff --git a/docs/superpowers/plans/2026-06-08-cost-real-data.md b/docs/superpowers/plans/2026-06-08-cost-real-data.md new file mode 100644 index 0000000..091c029 --- /dev/null +++ b/docs/superpowers/plans/2026-06-08-cost-real-data.md @@ -0,0 +1,329 @@ +# `/cost` Real-Data Conversion Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Convert the `/cost` (Cost & Usage) dashboard page from mock data to live worker data, mirroring the overview/runs server-component fetch pattern. Cost + token usage come from **Arthur** (the GenAI Engine), which already aggregates token/cost from the OpenInference traces the workflow ships in. **Single PR** — no persistence, no capture. 
+ +**Architecture:** New Arthur read methods (`getTracesOverview`, `getTracesTimeseries`, `aggregateSpanTokensByModel`) on the existing `ArthurClient`. A worker collector `collect-cost.ts` calls them and shapes a `CostResponse` (totals, by-task breakdown, by-model breakdown, merged daily series). A new route `GET /api/v1/cost` exposes it, degrading to empty when Arthur is unconfigured/unreachable. The dashboard fetches it server-side via `getJSON`, falls back to an empty `CostResponse`, and passes `data` to the `CostScreen` client presenter. Thin `page.tsx` wraps `cost-data.tsx` in `<Suspense>`. Identical read-path shape to `overview-data.tsx` / `runs-data.tsx`. + +**Tech Stack:** Next.js App Router, React, TypeScript, `@shared/contracts`, h3 worker routes, existing `ArthurClient` (fetch + Bearer). Worker has Vitest (`*.test.ts`); dashboard has none — dashboard verification is `npx tsc --noEmit`, `next lint`, and a manual browser check. + +**Spec:** `docs/superpowers/specs/2026-06-08-cost-real-data-design.md` + +**Note on commits:** This repo's owner stages commits manually. Do NOT commit unless the user explicitly asks. The final task lists the commit command for when they do. + +**Live open questions (resolve with the user; the plan assumes the spec's defaults):** `bucket_size` allowed values for the timeseries; whether empty `task_ids` means org-wide (else enumerate tasks); by-model client aggregation acceptable; task→workflow mapping (breakdown stays per-task); window = calendar MTD. See the spec's "Open questions". 
+ +--- + +### Task 1: Add Arthur read methods + types + +**Files:** +- Modify: `apps/worker/src/sandbox/arthur-client.ts` +- Modify: `apps/worker/src/sandbox/arthur-client.test.ts` + +- [ ] **Step 1: Add response types** + +Add interfaces mirroring Arthur's shapes: + +```ts +export interface TraceOverviewEntry { + task_id: string; + trace_count: number; + trace_token_count: number; + trace_token_cost: number | null; + last_active?: string; +} +export interface TraceTimeseriesPoint { + timestamp: string; + trace_count: number; + trace_token_count: number; + trace_token_cost: number | null; +} +export interface SpanTokenCost { + model_name: string | null; + total_token_count: number | null; + total_token_cost: number | null; +} +``` + +- [ ] **Step 2: Add `getTracesOverview`** + +```ts +async getTracesOverview(taskIds: string[], startTime: string, endTime: string): Promise<TraceOverviewEntry[]> { + const { overviews } = await this.request<{ count: number; overviews: TraceOverviewEntry[] }>( + "/api/v1/traces/overview", + { method: "POST", body: JSON.stringify({ task_ids: taskIds, start_time: startTime, end_time: endTime }) }, + ); + return overviews; +} +``` + +- [ ] **Step 3: Add `getTracesTimeseries`** (single task per call; caller fans out + merges) + +```ts +async getTracesTimeseries(taskId: string, startTime: string, endTime: string, bucketSize: string): Promise<TraceTimeseriesPoint[]> { + const res = await this.request<{ points?: TraceTimeseriesPoint[] } | TraceTimeseriesPoint[]>( + "/api/v1/traces/overview/timeseries", + { method: "POST", body: JSON.stringify({ task_id: taskId, start_time: startTime, end_time: endTime, bucket_size: bucketSize }) }, + ); + return Array.isArray(res) ? res : (res.points ?? []); +} +``` + +> The response envelope key is unconfirmed — handle both array and `{ points }`. Confirm against a live call. 
+ +- [ ] **Step 4: Add `aggregateSpanTokensByModel`** (the one client-side aggregation) + +Fetch span rows for the window via `GET /api/v1/traces/spans` (paginate if the API requires it), then sum `total_token_count`/`total_token_cost` grouped by `model_name`. Return `Array<{ model: string; tokens: number; cost: number }>`. Skip rows with null `model_name`. + +- [ ] **Step 5: Test** + +Run: `cd apps/worker && pnpm vitest run src/sandbox/arthur-client.test.ts` +Expected: add tests with a stubbed `fetch` asserting each method posts the right body and parses the response (mirror the existing client tests). PASS. + +--- + +### Task 2: Add the `CostResponse` contract + +**Files:** +- Modify: `apps/shared/contracts/api.ts` + +- [ ] **Step 1: Add the interfaces** + +Add `CostByModelEntry`, `CostByWorkflowEntry`, and `CostResponse` exactly as specified in the spec ("Proposed contract"). + +- [ ] **Step 2: Typecheck shared** + +Run: `cd apps/shared && npx tsc --noEmit` (or root `pnpm -w typecheck` if defined) +Expected: PASS. + +--- + +### Task 3: Add the `collectCost` aggregator + worker route + +**Files:** +- Create: `apps/worker/src/lib/overview/collect-cost.ts` +- Create: `apps/worker/src/lib/overview/collect-cost.test.ts` +- Create: `apps/worker/src/routes/api/v1/cost.get.ts` + +- [ ] **Step 1: Write `collectCost`** + +Signature: `collectCost(client: ArthurClient, opts: { now: Date; bucketSize: string }): Promise<Omit<CostResponse, "generatedAt" | "available">>`. + +Logic: +1. Resolve the window: `start = startOfMonth(now)`, `end = now` (ISO). (Assumption: calendar MTD — see open Q5.) +2. Resolve `taskIds`: enumerate the org's tasks (assumption from open Q2 — pass ids explicitly). Reuse/extend the client's task listing (`/api/v2/tasks/search`); if a true org-wide overview via empty `task_ids` is confirmed, pass `[]` instead. +3. `overviews = await client.getTracesOverview(taskIds, start, end)`. 
+ - `totals`: sum `trace_token_cost` (→ `totalTokenCost`), `trace_token_count` (→ `totalTokens`), `trace_count` (→ `traceCount`); `costPerRun = totalTokenCost / max(1, traceCount)`. Treat null `trace_token_cost` as 0. + - `byWorkflow`: one entry per overview → `{ taskId, name, runs, tokens, cost, costPerRun }`. `name` from the task listing (task name = ticket-run id). +4. `byModel = await client.aggregateSpanTokensByModel(...)` → map to `{ model, cost, tokens }`. +5. `daily`: fan out `getTracesTimeseries(taskId, start, end, bucketSize)` per task; **merge points by `timestamp`** summing cost/tokens; sort oldest→newest → `{ date, cost, tokens }[]`. + +Keep I/O behind the injected `client` so the aggregation is unit-testable with a fake client (mirror how `collect-runs.ts` takes a `RunsLister`). + +- [ ] **Step 2: Write the route** + +Mirror `workflows.get.ts`: +```ts +setResponseHeader(event, "Cache-Control", "private, max-age=15, stale-while-revalidate=60"); +const generatedAt = new Date().toISOString(); +if (!env.GENAI_ENGINE_API_KEY || !env.GENAI_ENGINE_TRACE_ENDPOINT) { + return { generatedAt, available: false, ...EMPTY }; +} +try { + const client = ArthurClient.fromTraceEndpoint(env.GENAI_ENGINE_TRACE_ENDPOINT, env.GENAI_ENGINE_API_KEY); + const data = await collectCost(client, { now: new Date(), bucketSize: "day" }); + return { generatedAt, available: true, ...data }; +} catch (err) { + logger.warn({ err: (err as Error).message }, "cost_collect_failed"); + return { generatedAt, available: false, ...EMPTY }; +} +``` +`EMPTY` = the empty totals/arrays/window matching `costFallback`. + +- [ ] **Step 3: Test the aggregator** + +Run: `cd apps/worker && pnpm vitest run src/lib/overview/collect-cost.test.ts` +Expected: with a fake client returning fixtures (2 tasks, 2 models, multi-day timeseries), assert totals, `byWorkflow` rows + `costPerRun`, `byModel` grouping, and merged-by-timestamp `daily`. Empty/null inputs → zeros/empty arrays. PASS. 
+ +- [ ] **Step 4: Worker typecheck** + +Run: `cd apps/worker && npx tsc --noEmit` +Expected: PASS. + +--- + +### Task 4: Add the dashboard fallback + +**Files:** +- Modify: `apps/dashboard/lib/api/fallbacks.ts` + +- [ ] **Step 1: Add `costFallback`** + +```ts +export function costFallback(now: string): CostResponse { + return { + generatedAt: now, + available: false, + window: { start: now, end: now }, + totals: { totalTokenCost: 0, totalTokens: 0, traceCount: 0, costPerRun: 0 }, + byModel: [], + byWorkflow: [], + daily: [], + }; +} +``` + +Add `CostResponse` to the existing `@shared/contracts` import. + +- [ ] **Step 2: Typecheck** + +Run: `cd apps/dashboard && npx tsc --noEmit` +Expected: PASS (no consumers yet). + +--- + +### Task 5: Add the skeleton + server data component, and convert `CostScreen` + +**Files:** +- Create: `apps/dashboard/app/cost-skeleton.tsx` +- Create: `apps/dashboard/app/cost-data.tsx` +- Modify: `apps/dashboard/components/cockpit/screens/cost.tsx` + +- [ ] **Step 1: Create the skeleton** + +Mirror `overview-skeleton.tsx`, shaped to the cost layout (after embellishments are stripped: 3 KPI blocks, a chart+donut row, two table blocks): + +```tsx +// apps/dashboard/app/cost-skeleton.tsx +function Block({ className = "" }: { className?: string }) { + return
; +} +export function CostSkeleton() { + return ( +
+
+ {Array.from({ length: 3 }, (_, i) => )} +
+
+ +
+ + +
+ ); +} +``` + +- [ ] **Step 2: Create the server data component** + +```tsx +// apps/dashboard/app/cost-data.tsx +import { getJSON } from "@/lib/api/server"; +import { CostScreen } from "@/components/cockpit/screens/cost"; +import type { CostResponse } from "@shared/contracts"; +import { costFallback } from "@/lib/api/fallbacks"; + +export async function CostData() { + const now = new Date().toISOString(); + const data = await getJSON("/api/v1/cost").catch(() => + costFallback(now), + ); + return ; +} +``` + +> Will not typecheck until Step 3 changes `CostScreen`'s signature. The full gate is in Task 6. + +- [ ] **Step 3: Convert `CostScreen` to consume `data` and strip embellishments** + +In `components/cockpit/screens/cost.tsx`: +- Remove `import { AIWF_DATA } from "@/lib/data/mock"`, `import { sparkSeries } from "@/lib/rng"`, the `Spark` import (no longer used), and `const D = AIWF_DATA`. +- Add `import type { CostResponse } from "@shared/contracts";`. +- Signature → `export function CostScreen({ data }: { data: CostResponse })`. +- KPIs: `total = data.totals.totalTokenCost`; tokens = `data.totals.totalTokens`; "Cost / run avg" = `$${data.totals.costPerRun.toFixed(2)}`. **Remove** the "Projection · EoM" KPI tile, the `of $1,200 budget` sub, and all `delta`/`deltaTone` props (no source). +- Header: **remove** the `` and the `Export CSV` button. +- Area chart: feed `data.daily.map(d => d.cost)` and labels `data.daily.map(d => d.date)` (format the ISO date to a short label in-screen); **remove** the inner Cost/Tokens `CkTabs` action. +- Donut: shares computed in-screen from `byModel` — `const totalCost = data.byModel.reduce((a,m)=>a+m.cost,0); shares = data.byModel.map(m => totalCost ? m.cost/totalCost : 0)`; center = `"$" + Math.round(total)`. +- Per-model table: map `data.byModel` → columns `{ m.model, m.tokens, m.cost, share }`. **Remove** the `Vendor` column (not in contract) and the `Trend`/`Spark` column. 
+- Per-workflow table: map `data.byWorkflow` (already aggregated) → `{ w.name, w.taskId, w.runs, w.tokens, w.cost, w.costPerRun }`. **Remove** the in-component `tokens = runs24h*2400`/`perRun` derivations, the `primary` chip / `gateway` line (not in contract), and the `Trend`/`Spark` column. Header label can stay "Per-workflow breakdown" (rows are per task — see spec mapping note). + +- [ ] **Step 4: Verify no mock/embellishment refs remain** + +Run: `grep -nE "\bD\.|AIWF_DATA|sparkSeries|Spark|COST_BY_MODEL|HOURS24|Export CSV|deltaTone|By actor" apps/dashboard/components/cockpit/screens/cost.tsx` +Expected: no matches. + +--- + +### Task 6: Rewrite the route + full verification + +**Files:** +- Modify: `apps/dashboard/app/(cockpit)/cost/page.tsx` + +- [ ] **Step 1: Replace the page with the Suspense + server-component pattern** + +```tsx +// apps/dashboard/app/(cockpit)/cost/page.tsx — Cost & usage ("/cost") +import { Suspense } from "react"; +import { CostData } from "@/app/cost-data"; +import { CostSkeleton } from "@/app/cost-skeleton"; + +export default function CostPage() { + return ( + <Suspense fallback={<CostSkeleton />}> + <CostData /> + </Suspense> + ); +} +``` + +- [ ] **Step 2: Typecheck both apps** + +Run: `cd apps/worker && npx tsc --noEmit && cd ../dashboard && npx tsc --noEmit` +Expected: PASS, no errors. + +- [ ] **Step 3: Lint the changed dashboard files** + +Run: `cd apps/dashboard && npx next lint --file app/cost-data.tsx --file app/cost-skeleton.tsx --file "app/(cockpit)/cost/page.tsx" --file components/cockpit/screens/cost.tsx` +Expected: no errors. + +- [ ] **Step 4: Visual check** + +Run: `cd apps/dashboard && pnpm dev`, open `http://localhost:3001/cost`. +Expected: +- With Arthur configured + traces present: real spend, token totals, per-model donut/table, per-task table, and per-day spend chart render. +- With Arthur unconfigured (env unset) or unreachable: zero/empty state — KPIs `$0.00`/`0`, empty tables, empty chart — no crash. 
+ +- [ ] **Step 5: Commit (ONLY if the user asks)** + +```bash +git add apps/shared/contracts/api.ts \ + apps/worker/src/sandbox/arthur-client.ts \ + apps/worker/src/lib/overview/collect-cost.ts apps/worker/src/routes/api/v1/cost.get.ts \ + apps/dashboard/lib/api/fallbacks.ts \ + apps/dashboard/app/cost-data.tsx apps/dashboard/app/cost-skeleton.tsx \ + "apps/dashboard/app/(cockpit)/cost/page.tsx" \ + apps/dashboard/components/cockpit/screens/cost.tsx +git commit -m "feat: wire /cost to real Arthur usage data" +``` + +--- + +## Self-Review + +**Spec coverage:** +- Arthur read methods (`getTracesOverview`, `getTracesTimeseries`, `aggregateSpanTokensByModel`) → Task 1. ✓ +- `CostResponse` contract with field-level types → Task 2 (from spec). ✓ +- `collectCost` aggregator (totals / byWorkflow=per-task / byModel / merged daily) + `/api/v1/cost` route with Arthur-unconfigured degrade → Task 3. ✓ +- `costFallback` empty state → Task 4. ✓ +- `cost-data.tsx` + `cost-skeleton.tsx` + `CostScreen` swap with embellishments **removed** (budget, deltas, EoM projection, tabs, CSV, sparklines, vendor/primary/gateway) → Task 5. ✓ +- Thin Suspense page → Task 6. ✓ +- Arthur-down / unconfigured empty state → fallback (Task 4), route degrade (Task 3), verified (Task 6 Step 4). ✓ +- Single PR, no Redis/persistence/capture → no such tasks. ✓ + +**Reuse check:** Read methods extend the existing `ArthurClient` (same `request` + Bearer auth + `fromTraceEndpoint`). Cost comes straight from Arthur's `*_token_cost` — no client-side pricing, the `pricing.ts`/`usage.ts` Slack path is untouched. Read path reuses `getJSON`/fallback/Suspense. Only new infra is one collector + one route — consistent with runs/overview. ✓ + +**Placeholder scan:** No TBD/TODO; the only deferred items are the spec's flagged open questions (`bucket_size`, empty `task_ids`, by-model aggregation, task→workflow, window) and the explicitly-removed embellishments. 
✓ + +**Type consistency:** `CostResponse` imported from `@shared/contracts` in `cost-data.tsx` (Task 5), `fallbacks.ts` (Task 4), and the route (Task 3). `CostScreen` accepts `{ data: CostResponse }` (Task 5) matching the call site (Task 5 Step 2). Arthur response types (Task 1) feed `collectCost` (Task 3). ✓ diff --git a/docs/superpowers/plans/2026-06-08-evals-real-data.md b/docs/superpowers/plans/2026-06-08-evals-real-data.md new file mode 100644 index 0000000..cff77e1 --- /dev/null +++ b/docs/superpowers/plans/2026-06-08-evals-real-data.md @@ -0,0 +1,421 @@ +# `/evals` Real-Data Conversion Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Convert the `/evals` dashboard page from mock data to live worker data, mirroring the overview/runs server-component fetch pattern. Because no evals list endpoint or eval-read path exists yet, this plan also builds the worker contract, route, and Arthur read path as a prerequisite. + +**Architecture:** New worker route `GET /api/v1/evals` → `EvalsResponse` (discriminated union, same `available` pattern as `EvalHealthResponse`). A new collector `collect-evals.ts` calls the **confirmed** Arthur read endpoint `POST /api/v1/traces/overview` via a new `getTracesOverview()` method on `ArthurClient`, sums the per-task overviews into a fleet `score`/`spansGraded`/`traceCount`, and degrades to `available: false` when Arthur is unconfigured, unreachable, or nothing is graded. On the dashboard, a thin server route (`page.tsx`) wraps a server component (`evals-data.tsx`) in `<Suspense>`; that component fetches via `getJSON`, falls back to `evalsFallback`, and passes `data` to the client presenter `EvalsScreen`. Identical in shape to `runs-data.tsx` / `RunsScreen`. 
+ +**Scope note (read first):** Arthur's read API is confirmed (auth = same `Bearer GENAI_ENGINE_API_KEY`, org-scoped). Our trace path (`POST /api/v1/traces`) only produces `continuous_eval_success_rate`, `eval_count`, `trace_count`, and the three relevance/tool metric types — **and only if continuous evals are configured on the task.** The mock's rule families (hallucination/PII/toxicity/prompt-injection) come from Arthur's `/validate_*` write path, which **we do not call** — they are **out of scope** and dropped from this page. The first increment ships the **fleet aggregate** (score + graded count + window); the per-metric relevance/tool breakdown and trend/sparkline are optional follow-ons (Tasks 3b/3c). + +**Tech Stack:** Worker = h3 + Nitro routes, `@shared/contracts` types, Vitest. Dashboard = Next.js App Router, React 19, TypeScript. Dashboard has no test framework — verification is `npx tsc --noEmit`, `next lint`, and a manual browser check. + +**Spec:** `docs/superpowers/specs/2026-06-08-evals-real-data-design.md` + +**Required env vars (worker):** `GENAI_ENGINE_API_KEY`, `GENAI_ENGINE_TRACE_ENDPOINT` (both already declared optional in `apps/worker/env.ts`; the base read URL is derived from the trace endpoint via `ArthurClient.fromTraceEndpoint`). Reads need the `INFERENCE_READ` permission on the key. No new dashboard env vars — `/evals` reuses `WORKER_BASE_URL` / `WORKER_API_TOKEN` via `getJSON`. + +**Remaining open items (non-blocking — see spec Open Questions):** (1) `bucket_size` values for the optional timeseries call; (2) whether empty `task_ids` on `/traces/overview` means "all org tasks" (else enumerate via `/api/v2/tasks/search`); (3) whether continuous evals are actually configured on our live tasks (if not, the page legitimately shows "No graded evals"). None block the aggregate-only increment. + +**Note on commits:** This repo's owner stages commits manually. Do NOT commit unless the user explicitly asks. 
The final task lists the commit command for when they do. + +--- + +### Task 1: Add the `EvalsResponse` contract + +**Files:** +- Modify: `apps/shared/contracts/api.ts` + +- [ ] **Step 1: Add `EvalMetricRow` and `EvalsResponse`** + +Append after the existing `EvalHealthResponse` union: + +```ts +export interface EvalMetricRow { + metric: string; + metricType: "QueryRelevance" | "ResponseRelevance" | "ToolSelection"; + value: number; + status: "pass" | "warn" | "fail"; + axis: "quality"; + trend?: number | null; // only if timeseries wired (Task 3c) + spark?: number[]; // only if timeseries wired (Task 3c) +} + +export type EvalsResponse = + | { + available: true; + generatedAt: string; + windowHours: number; + score: number; // continuous_eval_success_rate × 100, fleet-wide + spansGraded: number; // Σ eval_count + traceCount: number; // Σ trace_count + rows: EvalMetricRow[]; // [] in the aggregate-only first cut + } + | { available: false; generatedAt: string; reason: string }; +``` + +- [ ] **Step 2: Typecheck shared** + +Run: `cd apps/shared && npx tsc --noEmit` +Expected: PASS. + +--- + +### Task 2: Add the dashboard fallback + +**Files:** +- Modify: `apps/dashboard/lib/api/fallbacks.ts` + +- [ ] **Step 1: Import the type and add the fallback** + +Add `EvalsResponse` to the existing `@shared/contracts` import block, then add: + +```ts +export function evalsFallback(now: string): EvalsResponse { + return { available: false, generatedAt: now, reason: "Worker unavailable." }; +} +``` + +- [ ] **Step 2: Typecheck dashboard** + +Run: `cd apps/dashboard && npx tsc --noEmit` +Expected: PASS (the new export is unused so far, but valid). + +--- + +### Task 3: Build the Arthur read path + collector (fleet aggregate) + +This is the first, shippable increment: fleet `score` / `spansGraded` / `traceCount`, `rows: []`. The per-metric breakdown (3b) and trend/sparkline (3c) are optional follow-ons below. 
+ +**Files:** +- Modify: `apps/worker/src/sandbox/arthur-client.ts` (add a read method) +- Create: `apps/worker/src/lib/overview/collect-evals.ts` +- Create: `apps/worker/src/lib/overview/collect-evals.test.ts` + +- [ ] **Step 1: Add `getTracesOverview()` to `ArthurClient`** + +Add a method reusing the existing private `request` helper and bearer auth: + +```ts +export interface TraceOverview { + task_id: string; + trace_count: number; + trace_token_count: number; + trace_token_cost: number; + eval_count: number; + continuous_eval_success_rate: number; + last_active: string; +} +interface TraceOverviewListResponse { count: number; overviews: TraceOverview[]; } + +async getTracesOverview(opts: { + taskIds: string[]; // may be empty — see Open Q2 + startTime: string; // ISO + endTime: string; // ISO +}): Promise<TraceOverviewListResponse> { + return this.request("/api/v1/traces/overview", { + method: "POST", + body: JSON.stringify({ + task_ids: opts.taskIds, + start_time: opts.startTime, + end_time: opts.endTime, + }), + }); +} +``` + +Keep the raw Arthur types in the client module (export `TraceOverview` so the collector can type its injected fetcher); do not leak them into `@shared/contracts`. + +> **Task-id enumeration (Open Q2):** if `task_ids: []` is confirmed to mean "all org tasks", pass `[]`. Otherwise enumerate the org's tasks first. The client already searches tasks via `POST /api/v2/tasks/search` (`findTicketTasks`); add a thin `listAllTasks()` if a full enumeration is needed, or have the collector accept a pre-resolved `taskIds`. Default the collector to receive `taskIds` so the route owns the enumeration policy.
+ +- [ ] **Step 2: Write `collect-evals.ts`** + +Mirror `collect-runs.ts`/`collect-kpis.ts` — accept an injected fetcher and resolve to the `available: true` fields minus `generatedAt`: + +```ts +export interface CollectEvalsOptions { + fetchOverview: (o: { taskIds: string[]; startTime: string; endTime: string }) + => Promise<{ overviews: TraceOverview[] }>; + taskIds: string[]; + windowHours: number; + now: Date; +} + +// Always returns { windowHours, score, spansGraded, traceCount, rows }. When nothing +// is graded, spansGraded and score are both 0 and the route emits available:false. +export async function collectEvals(opts: CollectEvalsOptions) { + const endTime = opts.now.toISOString(); + const startTime = new Date(opts.now.getTime() - opts.windowHours * 3_600_000).toISOString(); + const { overviews } = await opts.fetchOverview({ taskIds: opts.taskIds, startTime, endTime }); + + const spansGraded = sum(overviews, o => o.eval_count); + const traceCount = sum(overviews, o => o.trace_count); + // weight success rate by eval_count; 0 graded → caller emits unavailable + const score = spansGraded === 0 + ? 0 + : (sum(overviews, o => o.continuous_eval_success_rate * o.eval_count) / spansGraded) * 100; + + return { windowHours: opts.windowHours, score, spansGraded, traceCount, rows: [] }; +} +``` + +The injected-fetcher boundary keeps the Arthur shape isolated and unit-testable. `TraceOverview` is the overview row type declared in `arthur-client.ts` (type-only import), and `sum(items, pick)` is a small local reducer (`items.reduce((acc, x) => acc + pick(x), 0)`). + +- [ ] **Step 3: Unit test the collector** + +In `collect-evals.test.ts`, feed stubbed `overviews` and assert: `spansGraded`/`traceCount` are summed, `score` is the eval-count-weighted success rate × 100, and `spansGraded === 0` yields `score === 0` (route turns this into `available:false`). Mirror the style of the existing `collect-*` tests. + +Run: `cd apps/worker && npx vitest run src/lib/overview/collect-evals.test.ts` +Expected: PASS. + +- [ ] **Step 3b (optional follow-on): per-metric relevance/tool breakdown** + +Only the three Arthur metric types exist on our path.
To populate `rows`: list spans (`GET /api/v1/traces/spans`), fetch each span's `metric_results` (`GET /api/v1/traces/spans/{span_id}` → `SpanWithMetricsResponse.metric_results`), parse the opaque `details` JSON string per `metric_type` (e.g. relevance → `llm_relevance_score`), aggregate per metric type, and apply a worker-owned pass/warn/fail threshold. Map each to `EvalMetricRow { metric, metricType, value, status, axis: "quality" }`. Add this behind the same collector with extra fetchers; keep `rows: []` until implemented. + +- [ ] **Step 3c (optional follow-on): trend/sparkline** + +Wire `POST /api/v1/traces/overview/timeseries` (single task per call) to populate `EvalMetricRow.trend`/`spark` from `continuous_eval_success_rate` buckets. **Confirm `bucket_size` allowed values first (Open Q1).** Until wired, omit `trend`/`spark` entirely — no synthetic series. + +--- + +### Task 4: Add the worker route `GET /api/v1/evals` + +**Files:** +- Create: `apps/worker/src/routes/api/v1/evals.get.ts` + +- [ ] **Step 1: Create the route** + +Mirror `apps/worker/src/routes/api/v1/runs.get.ts`: + +```ts +import { defineEventHandler, setResponseHeader } from "h3"; +import type { EvalsResponse } from "@shared/contracts"; +import { env } from "../../../../env.js"; +import { ArthurClient } from "../../../sandbox/arthur-client.js"; +import { collectEvals } from "../../../lib/overview/collect-evals.js"; +import { logger } from "../../../lib/logger.js"; + +const WINDOW_HOURS = 24; + +export default defineEventHandler(async (event): Promise<EvalsResponse> => { + setResponseHeader( + event, + "Cache-Control", + "private, max-age=15, stale-while-revalidate=60", + ); + const generatedAt = new Date().toISOString(); + + if (!env.GENAI_ENGINE_API_KEY || !env.GENAI_ENGINE_TRACE_ENDPOINT) { + return { available: false, generatedAt, reason: "Arthur GenAI Engine not configured."
}; + } + + try { + const client = ArthurClient.fromTraceEndpoint( + env.GENAI_ENGINE_TRACE_ENDPOINT, + env.GENAI_ENGINE_API_KEY, + ); + // Open Q2: pass [] if empty === all org tasks; else enumerate via tasks/search. + const taskIds: string[] = []; + const { windowHours, score, spansGraded, traceCount, rows } = await collectEvals({ + fetchOverview: (o) => client.getTracesOverview(o), + taskIds, + windowHours: WINDOW_HOURS, + now: new Date(), + }); + if (spansGraded === 0) { + return { available: false, generatedAt, reason: "No graded evals in the last 24h." }; + } + return { available: true, generatedAt, windowHours, score, spansGraded, traceCount, rows }; + } catch (err) { + logger.warn({ err: (err as Error).message }, "evals_list_failed"); + return { available: false, generatedAt, reason: "Arthur GenAI Engine request failed." }; + } +}); +``` + +- [ ] **Step 2: Typecheck worker** + +Run: `cd apps/worker && npx tsc --noEmit` +Expected: PASS. + +- [ ] **Step 3: Hit the route** + +Run the worker locally and `curl -H "Authorization: Bearer $WORKER_API_TOKEN" localhost:<port>/api/v1/evals`. +Expected: +- Arthur unconfigured → `{ available: false, ..., reason: "Arthur GenAI Engine not configured." }`. +- Configured but nothing graded → `{ available: false, ..., reason: "No graded evals in the last 24h." }`. +- Configured + graded → `available: true` with `score` / `spansGraded` / `traceCount` (and `rows` once 3b is built). + +--- + +### Task 5: Add the loading skeleton + +**Files:** +- Create: `apps/dashboard/app/evals-skeleton.tsx` + +- [ ] **Step 1: Create the skeleton** + +Mirror `apps/dashboard/app/overview-skeleton.tsx` — header + one card-shaped block (the Quality group): + +```tsx +// apps/dashboard/app/evals-skeleton.tsx +function Block({ className = "" }: { className?: string }) { + return
; +} + +export function EvalsSkeleton() { + return ( +
+
+ + +
+ +
+ ); +} +``` + +- [ ] **Step 2: Typecheck** + +Run: `cd apps/dashboard && npx tsc --noEmit` +Expected: PASS. + +--- + +### Task 6: Add the server data component + +**Files:** +- Create: `apps/dashboard/app/evals-data.tsx` + +- [ ] **Step 1: Create the server component** + +Mirror `apps/dashboard/app/runs-data.tsx`: + +```tsx +import { getJSON } from "@/lib/api/server"; +import { EvalsScreen } from "@/components/cockpit/screens/evals"; +import type { EvalsResponse } from "@shared/contracts"; +import { evalsFallback } from "@/lib/api/fallbacks"; + +export async function EvalsData() { + const now = new Date().toISOString(); + const data = await getJSON<EvalsResponse>("/api/v1/evals").catch(() => + evalsFallback(now), + ); + return <EvalsScreen data={data} />; +} +``` + +> This will not typecheck until Task 7 changes `EvalsScreen`'s signature. Expected; full typecheck gate is Task 8. + +--- + +### Task 7: Convert `EvalsScreen` to consume real data + +**Files:** +- Modify: `apps/dashboard/components/cockpit/screens/evals.tsx` + +- [ ] **Step 1: Replace imports and signature** + +- Remove `import { AIWF_DATA } from "@/lib/data/mock"` and `const D = AIWF_DATA`. +- Add `import type { EvalsResponse, EvalMetricRow } from "@shared/contracts"`. +- Change `export function EvalsScreen()` → `export function EvalsScreen({ data }: { data: EvalsResponse })`. + +Also remove `import { jitterSeries } from "@/lib/rng"` (synthetic sparklines are dropped) and the `groups`/`accents`/`titles` axis-map scaffolding — only the single Quality group remains. + +- [ ] **Step 2: Handle the unavailable branch** + +When `data.available === false`, render the existing header block (eyebrow + title) but replace the chip with a neutral one and the metric cards with a single panel showing `data.reason`. Mirror the reason path in `EvalHealthKPI` (`overview.tsx`). This covers unconfigured, "no graded evals", and worker-down.
+ +- [ ] **Step 3: Drive the available branch** + +- Drive the live chip from `data.spansGraded.toLocaleString("en-US")` + `data.windowHours` instead of the hardcoded `12,408 spans · 24h`; surface `data.score` (e.g. as the headline number). +- Render a single **Quality** `CkCard` over `data.rows` (all `axis: "quality"`). If `data.rows` is empty (aggregate-only first cut), render just the score + graded-count header, no per-metric grid. +- Per row: show `metric`, formatted `value`, and the pass/warn/fail `CkChip`. +- Trend/sparkline: render `e.trend` / the `e.spark` sparkline **only when present**; otherwise render neither. No `jitterSeries`. + +- [ ] **Step 4: Verify no mock/jitter references remain** + +Run: `grep -nE "AIWF_DATA|\bD\.|jitterSeries" apps/dashboard/components/cockpit/screens/evals.tsx` +Expected: no matches. + +--- + +### Task 8: Rewrite the route to the server pattern + verify + +**Files:** +- Modify: `apps/dashboard/app/(cockpit)/evals/page.tsx` + +- [ ] **Step 1: Replace the page with the Suspense + server-component pattern** + +```tsx +// apps/dashboard/app/(cockpit)/evals/page.tsx — Arthur evals ("/evals") +import { Suspense } from "react"; + +import { EvalsData } from "@/app/evals-data"; +import { EvalsSkeleton } from "@/app/evals-skeleton"; + +export default function EvalsPage() { + return ( + <Suspense fallback={<EvalsSkeleton />}> + <EvalsData /> + </Suspense> + ); +} +``` + +- [ ] **Step 2: Typecheck the whole app** + +Run: `cd apps/dashboard && npx tsc --noEmit` and `cd apps/worker && npx tsc --noEmit` +Expected: PASS, no errors. + +- [ ] **Step 3: Lint the changed dashboard files** + +Run: `cd apps/dashboard && npx next lint --file app/evals-data.tsx --file app/evals-skeleton.tsx --file "app/(cockpit)/evals/page.tsx" --file components/cockpit/screens/evals.tsx` +Expected: no errors. + +- [ ] **Step 4: Visual check** + +Run: `cd apps/dashboard && pnpm dev` (port 3001), open `http://localhost:3001/evals`.
+Expected: +- With the worker unreachable or Arthur unconfigured: header chrome renders + a single reason panel ("Worker unavailable." / "Arthur GenAI Engine not configured."), no crash. +- With Arthur configured but nothing graded (`eval_count = 0`): the "No graded evals in the last 24h." panel. +- With Arthur configured + graded: the real fleet `score` + spans-graded count over the 24h window render; the Quality breakdown appears once Task 3b is built (else just the aggregate header). No sparklines unless Task 3c is wired. + +- [ ] **Step 5: Commit (ONLY if the user asks)** + +```bash +git add apps/shared/contracts/api.ts \ + apps/worker/src/sandbox/arthur-client.ts \ + apps/worker/src/lib/overview/collect-evals.ts \ + apps/worker/src/lib/overview/collect-evals.test.ts \ + apps/worker/src/routes/api/v1/evals.get.ts \ + apps/dashboard/lib/api/fallbacks.ts \ + apps/dashboard/app/evals-data.tsx \ + apps/dashboard/app/evals-skeleton.tsx \ + "apps/dashboard/app/(cockpit)/evals/page.tsx" \ + apps/dashboard/components/cockpit/screens/evals.tsx +git commit -m "feat: wire /evals to real Arthur eval data" +``` + +--- + +## Self-Review + +**Spec coverage:** +- `EvalsResponse` / `EvalMetricRow` contract (mapped to `TraceOverviewResponse`; rule families dropped) → Task 1. ✓ +- Worker Arthur read path `getTracesOverview()` + `collect-evals.ts` (+ test) → Task 3; optional breakdown/timeseries → 3b/3c. ✓ +- Worker route `GET /api/v1/evals` with config-check, `eval_count=0` degrade, error degrade → Task 4. ✓ +- `evalsFallback` → Task 2. ✓ +- `evals-data.tsx` server component → Task 6. ✓ +- `evals-skeleton.tsx` (single Quality block) → Task 5. ✓ +- `EvalsScreen` swap (signature, single Quality group, score + spansGraded chip, optional rows/trend/spark, drop `jitterSeries`) → Task 7. ✓ +- `page.tsx` server route → Task 8. ✓ +- Unavailable / no-graded / worker-down states → Tasks 2, 4, 7; verified in Task 8 Step 4. 
✓ +- Out-of-scope (New eval button, overview tile, per-span drill-down, synthetic sparklines, `/validate_*` rule families) → not in any task. ✓ + +**Confirmed dependency:** Arthur read API is ground-truthed (`POST /api/v1/traces/overview`, bearer auth, org-scoped). First increment ships fleet aggregate; per-metric breakdown (3b) and trend (3c) are optional follow-ons. Non-blocking open items (bucket_size, empty-task_ids semantics, whether continuous evals are configured live) noted at top and at their tasks. ✓ + +**Placeholder scan:** No TBD/TODO; remaining unknowns are the three non-blocking open items, explicitly flagged. ✓ + +**Type consistency:** `EvalsResponse` imported from `@shared/contracts` in Tasks 2, 4, 6, 7. `EvalsScreen` accepts `{ data: EvalsResponse }` (Task 7) — matches the call site in Task 6. `collectEvals` returns the `available: true` fields (`windowHours`/`score`/`spansGraded`/`traceCount`/`rows`) the route spreads in Task 4. `EvalsSkeleton` (Task 5) matches the import in Task 8. ✓ diff --git a/docs/superpowers/plans/2026-06-08-prompts-real-data.md b/docs/superpowers/plans/2026-06-08-prompts-real-data.md new file mode 100644 index 0000000..267b1aa --- /dev/null +++ b/docs/superpowers/plans/2026-06-08-prompts-real-data.md @@ -0,0 +1,690 @@ +# `/prompts` Real-Data Conversion Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Convert the `/prompts` dashboard page from mock data to live worker data, mirroring the `/runs` server-component fetch pattern. Read-only display of the three workflow prompts the worker actually resolves at runtime, **including real Arthur version history**. 
+ +**Architecture:** New worker route `GET /api/v1/prompts` returns a typed `PromptsResponse` built from the same resolution logic the durable `loadPrompts()` step uses (Arthur `production` tags with in-code fallbacks), plus each prompt's real Arthur version-history metadata. A second route `GET /api/v1/prompts/[name]/versions/[version]` returns a single historical version's body on demand. Thin server route (`page.tsx`) wraps a server component (`prompts-data.tsx`) in `<Suspense>`; it fetches the list via `getJSON`, falls back to an empty `PromptsResponse`, and passes `data` to the client presenter `PromptsScreen`. The client fetches historical version bodies lazily through a same-origin Next route handler that proxies the worker (keeps the bearer token server-side). Shape mirrors `runs.get.ts` / `runs-data.tsx` / `RunsScreen`. + +**Tech stack:** h3 worker (`@apps/worker`), Next.js App Router dashboard (`@apps/dashboard`), shared `@shared/contracts`. Worker has vitest tests; dashboard has none — dashboard verification is `npx tsc --noEmit`, `next lint`, and a manual browser check. + +**Spec:** `docs/superpowers/specs/2026-06-08-prompts-real-data-design.md` + +**Scope decisions baked in (confirmed by user + Arthur API ground-truthing):** +- Read-only display. No write/edit endpoints. Action buttons left inert. +- **Real Arthur version history is in scope** (version-list metadata + on-demand bodies). Arthur's version list is metadata only, so per-version eval/halluc/p95/cost metrics and the A/B text diff have **no source** — that markup is **removed**, not stubbed with placeholders. +- Tags are real (`AgenticPromptVersionResponse.tags`); the `production` badge and tag filter stay, backed by data. +- Worker route reuses a shared extracted `resolvePrompts()` helper (option A) called by both `loadPrompts()` and the route. Confirmed OK to touch `prompts-step.ts`. +- Body fetch: production body eager (already resolved); historical bodies lazy via the on-demand route.
+ +**Note on commits:** This repo's owner stages commits manually. Do NOT commit unless the user explicitly asks. The final task lists the command for when they do. + +--- + +### Task 1: Add the shared `PromptVersion` + `PromptDef` entities + response contracts + +**Files:** +- Modify: `apps/shared/contracts/domain.ts` +- Modify: `apps/shared/contracts/api.ts` + +- [ ] **Step 1: Add `PromptVersion` + `PromptDef` to `domain.ts`** + +```ts +/** One Arthur version of a named prompt (metadata; body fetched on demand). */ +export interface PromptVersion { + /** Arthur integer version number. */ + version: number; + /** ISO timestamp the version was created. */ + createdAt: string; + /** Real Arthur tags on this version, e.g. ["production"]. */ + tags: string[]; + modelProvider: string; + modelName: string; + numMessages: number; + numTools: number; + /** Body text. Present only for the production version (eager); other + * versions are fetched on demand. */ + body?: string; +} + +/** A workflow phase prompt as resolved by the worker at runtime. */ +export interface PromptDef { + /** Stable Arthur/fallback key: "research-plan" | "implement" | "review". */ + name: string; + /** Human label for the workflow phase, e.g. "Research & Plan". */ + phase: string; + /** Resolved production prompt body (Arthur production tag, or in-code fallback). */ + body: string; + /** Where the resolved `body` came from. */ + source: "arthur" | "fallback"; + /** Model the agent runs this prompt with (env-derived). */ + model: string; + /** Real Arthur version history, newest first. Empty when source is "fallback". */ + versions: PromptVersion[]; +} +``` + +- [ ] **Step 2: Add `PromptsResponse` + `PromptVersionBodyResponse` to `api.ts`** + +Add `PromptDef` to the existing `import type { ... 
} from "./domain.js"` line (note: `PromptVersion` is only referenced transitively through `PromptDef`, so it need not be imported in `api.ts`), then append: + +```ts +export interface PromptsResponse { + generatedAt: string; + /** `false` when the worker can't resolve prompts (degrades to empty list). */ + available: boolean; + /** Whether Arthur is configured (key + endpoint + task id all set). When + * false, every prompt's `source` is "fallback" and `versions` is empty. */ + arthurEnabled: boolean; + rows: PromptDef[]; + total: number; +} + +/** On-demand body for a single historical Arthur version. */ +export interface PromptVersionBodyResponse { + generatedAt: string; + available: boolean; + body: string | null; +} +``` + +- [ ] **Step 3: Typecheck shared** + +Run: `pnpm -F @apps/shared exec tsc --noEmit` (or repo-root `pnpm typecheck` if that's the established command — match how the runs plan was verified). +Expected: PASS. + +--- + +### Task 2: Add Arthur version-list + by-version read methods to `ArthurClient` + +**Files:** +- Modify: `apps/worker/src/sandbox/arthur-client.ts` +- Modify: `apps/worker/src/sandbox/arthur-client.test.ts` (add coverage for the new methods, matching the file's existing fetch-mock style) + +**Context:** `ArthurClient` already has `getPromptByTag` (fetches a tagged version's body). Add two read methods, ground-truthed against `arthur-ai/arthur-engine` `main`. Both reuse the existing `this.baseUrl` + bearer header convention. + +- [ ] **Step 1: Add types + `listPromptVersions`** + +```ts +export interface ArthurPromptVersion { + version: number; + created_at: string; + deleted_at: string | null; + model_provider: string; + model_name: string; + tags: string[]; + num_messages: number; + num_tools: number; +} +interface AgenticPromptVersionListResponse { + count: number; + versions: ArthurPromptVersion[]; +} + +/** List version metadata for a named prompt (newest first). First page only. 
*/ +async listPromptVersions(taskId: string, name: string): Promise<ArthurPromptVersion[]> { + const path = `/api/v1/tasks/${encodeURIComponent(taskId)}/prompts/${encodeURIComponent(name)}/versions`; + const res = await fetch(`${this.baseUrl}${path}`, { + method: "GET", + headers: { Authorization: `Bearer ${this.apiKey}`, "ngrok-skip-browser-warning": "true" }, + }); + if (res.status === 404) return []; + if (!res.ok) { + const body = await res.text().catch(() => ""); + throw new Error(`Arthur GET ${path} → ${res.status}: ${body.slice(0, 300)}`); + } + const data = (await res.json()) as AgenticPromptVersionListResponse; + return [...data.versions].sort((a, b) => b.version - a.version); +} +``` + +> Assumption (open Q in spec): first page only — sufficient for the timeline. If deep history is required later, add pagination params here. + +- [ ] **Step 2: Add `getPromptVersionBody`** + +`getPromptByTag` already parses the by-version endpoint's `AgenticPrompt.messages[0].content` shape (passing a tag as `{prompt_version}`). Generalize it to accept any version specifier (integer / `latest` / ISO datetime / tag): + +```ts +/** Fetch the body of a specific version (int | "latest" | ISO datetime | tag). Null on 404. */ +async getPromptVersionBody(taskId: string, name: string, version: number | string): Promise<string | null> { + const path = `/api/v1/tasks/${encodeURIComponent(taskId)}/prompts/${encodeURIComponent(name)}/versions/${encodeURIComponent(String(version))}`; + const res = await fetch(`${this.baseUrl}${path}`, { + method: "GET", + headers: { Authorization: `Bearer ${this.apiKey}`, "ngrok-skip-browser-warning": "true" }, + }); + if (res.status === 404) return null; + if (!res.ok) { + const body = await res.text().catch(() => ""); + throw new Error(`Arthur GET ${path} → ${res.status}: ${body.slice(0, 300)}`); + } + const prompt = (await res.json()) as AgenticPrompt; + return prompt.messages?.[0]?.content ??
null; +} +``` + +> `getPromptByTag` can optionally be refactored to delegate to `getPromptVersionBody(taskId, name, tag)` to remove duplication — low risk, but keep it a separate optional cleanup so the existing `loadPrompts` path is untouched if you skip it. + +- [ ] **Step 3: Typecheck + test the worker** + +Run: `pnpm -F @apps/worker exec tsc --noEmit` then `pnpm -F @apps/worker exec vitest run src/sandbox/arthur-client.test.ts` +Expected: PASS, including the new method tests. + +--- + +### Task 3: Extract a reusable `resolvePrompts()` helper in the worker + +**Files:** +- Create: `apps/worker/src/lib/prompts/resolve.ts` (or `apps/worker/src/lib/resolve-prompts.ts` — match existing `lib/` layout) +- Modify: `apps/worker/src/workflows/prompts-step.ts` + +**Context:** `loadPrompts()` (`workflows/prompts-step.ts`) is a `"use step"` durable step returning `{ research, implement, review }`. The Arthur-vs-fallback resolution inside it is what we want to share. Extract the *pure* logic (no `"use step"`) so a plain h3 route can call it too, and have it also collect real version history. `loadPrompts()` then maps the helper's result back to its `{ research, implement, review }` shape so the workflow contract is unchanged. 
+ +- [ ] **Step 1: Create the helper (resolves production body + version history per prompt)** + +```ts +// apps/worker/src/lib/prompts/resolve.ts +import type { PromptVersion } from "@shared/contracts"; +import { env } from "../../../env.js"; +import { logger } from "../logger.js"; +import { PROMPT_FALLBACKS, PROMPT_NAMES, type PromptName } from "../prompts.js"; + +const PHASE_LABEL: Record<PromptName, string> = { + "research-plan": "Research & Plan", + "implement": "Implement", + "review": "Review", +}; + +export interface ResolvedPrompt { + name: PromptName; + phase: string; + body: string; + source: "arthur" | "fallback"; + model: string; + versions: PromptVersion[]; +} + +export interface ResolvePromptsResult { + arthurEnabled: boolean; + prompts: ResolvedPrompt[]; +} + +export async function resolvePrompts(): Promise<ResolvePromptsResult> { + const model = env.AGENT_KIND === "codex" ? env.CODEX_MODEL : env.CLAUDE_MODEL; + const arthurEnabled = + !!env.GENAI_ENGINE_API_KEY && + !!env.GENAI_ENGINE_TRACE_ENDPOINT && + !!env.GENAI_ENGINE_PROMPT_TASK_ID; + + const base = ( + name: PromptName, body: string, source: "arthur" | "fallback", versions: PromptVersion[] = [], + ): ResolvedPrompt => ({ name, phase: PHASE_LABEL[name], body, source, model, versions }); + + if (!arthurEnabled) { + logger.info({ source: "fallback", reason: "arthur_prompts_disabled" }, "prompts_resolved"); + return { + arthurEnabled, + prompts: PROMPT_NAMES.map((n) => base(n, PROMPT_FALLBACKS[n], "fallback")), + }; + } + + const { ArthurClient } = await import("../../sandbox/arthur-client.js"); + const client = ArthurClient.fromTraceEndpoint( + env.GENAI_ENGINE_TRACE_ENDPOINT!, + env.GENAI_ENGINE_API_KEY!, + ); + const taskId = env.GENAI_ENGINE_PROMPT_TASK_ID!; + + async function one(name: PromptName): Promise<ResolvedPrompt> { + try { + const [body, rawVersions] = await Promise.all([ + client.getPromptByTag(taskId, name, "production"), + client.listPromptVersions(taskId, name).catch(() => []), + ]); + const versions: PromptVersion[] =
rawVersions.map((v) => ({ + version: v.version, + createdAt: v.created_at, + tags: v.tags, + modelProvider: v.model_provider, + modelName: v.model_name, + numMessages: v.num_messages, + numTools: v.num_tools, + })); + // Attach the eager production body to its matching version entry. + const prodVersion = versions.find((v) => v.tags.includes("production")); + if (prodVersion && body !== null) prodVersion.body = body; + + if (body === null) { + logger.info({ name, source: "fallback", reason: "arthur_prompt_missing" }, "prompts_resolved"); + return base(name, PROMPT_FALLBACKS[name], "fallback", versions); + } + logger.info({ name, source: "arthur", versions: versions.length }, "prompts_resolved"); + return base(name, body, "arthur", versions); + } catch (err) { + logger.warn({ name, source: "fallback", err: (err as Error).message }, "prompts_resolved"); + return base(name, PROMPT_FALLBACKS[name], "fallback"); + } + } + + const prompts = await Promise.all(PROMPT_NAMES.map(one)); + return { arthurEnabled, prompts }; +} +``` + +> Verify the import depth (`../../../env.js`, `../logger.js`, `../prompts.js`, `../../sandbox/arthur-client.js`) against the file's actual location before finalizing — adjust to wherever you place it. The originals in `prompts-step.ts` import `../../env.js`, `./lib/logger.js`, `./lib/prompts.js` from `workflows/`. `@shared/contracts` is the same alias the routes use. + +- [ ] **Step 2: Rewrite `loadPrompts()` to delegate to the helper** + +Keep the `"use step"` directive, `maxRetries = 0`, and the `{ research, implement, review }` return shape. 
Replace the body with a call to `resolvePrompts()` and a map by name: + +```ts +export async function loadPrompts(): Promise<{ research: string; implement: string; review: string }> { + "use step"; + const { resolvePrompts } = await import("../lib/prompts/resolve.js"); + const { prompts } = await resolvePrompts(); + const byName = Object.fromEntries(prompts.map((p) => [p.name, p.body])); + return { + research: byName["research-plan"], + implement: byName["implement"], + review: byName["review"], + }; +} +loadPrompts.maxRetries = 0; +``` + +- [ ] **Step 3: Run the existing prompts-step tests** + +Run: `pnpm -F @apps/worker exec vitest run src/workflows/prompts-step.test.ts` +Expected: PASS. The test mocks `../sandbox/arthur-client.js` and `../../env.js`; if the helper's import paths differ, update the test's mock paths to match (the behavior — fallbacks when disabled, Arthur when enabled — is unchanged). + +--- + +### Task 4: Add the worker routes (`GET /api/v1/prompts` + on-demand version body) + +**Files:** +- Create: `apps/worker/src/routes/api/v1/prompts.get.ts` +- Create: `apps/worker/src/routes/api/v1/prompts/[name]/versions/[version].get.ts` + +- [ ] **Step 1: Create the list route (mirror `runs.get.ts`)** + +```ts +import { defineEventHandler, setResponseHeader } from "h3"; +import type { PromptsResponse } from "@shared/contracts"; +import { resolvePrompts } from "../../../lib/prompts/resolve.js"; +import { logger } from "../../../lib/logger.js"; + +export default defineEventHandler(async (event): Promise<PromptsResponse> => { + setResponseHeader( + event, + "Cache-Control", + "private, max-age=15, stale-while-revalidate=60", + ); + + const generatedAt = new Date().toISOString(); + try { + const { arthurEnabled, prompts } = await resolvePrompts(); + return { + generatedAt, + available: true, + arthurEnabled, + rows: prompts, + total: prompts.length, + }; + } catch (err) { + logger.warn({ err: (err as Error).message }, "prompts_resolve_failed"); + return { generatedAt, available: false, arthurEnabled: false, rows: [], total: 0 }; +
} +}); +``` + +> `ResolvedPrompt` is structurally assignable to `PromptDef` (same fields incl. `versions`). If TS complains about the `PromptName` vs `string` `name` field, widen via `rows: prompts as PromptDef[]`. Confirm the auth gate that protects `/api/v1/*` (`lib/api-auth.ts`) is applied route-table-wide (not per-file) — no extra wiring needed. + +- [ ] **Step 2: Create the on-demand version-body route (mirror `runs/[runId].get.ts`)** + +```ts +// apps/worker/src/routes/api/v1/prompts/[name]/versions/[version].get.ts +import { defineEventHandler, setResponseHeader, getRouterParam } from "h3"; +import type { PromptVersionBodyResponse } from "@shared/contracts"; +import { env } from "../../../../../../../env.js"; +import { PROMPT_NAMES, type PromptName } from "../../../../../../lib/prompts.js"; +import { logger } from "../../../../../../lib/logger.js"; + +export default defineEventHandler(async (event): Promise<PromptVersionBodyResponse> => { + setResponseHeader(event, "Cache-Control", "private, max-age=15, stale-while-revalidate=60"); + const generatedAt = new Date().toISOString(); + + const name = getRouterParam(event, "name") ?? ""; + const version = getRouterParam(event, "version") ?? 
""; + const arthurEnabled = + !!env.GENAI_ENGINE_API_KEY && !!env.GENAI_ENGINE_TRACE_ENDPOINT && !!env.GENAI_ENGINE_PROMPT_TASK_ID; + + if (!arthurEnabled || !PROMPT_NAMES.includes(name as PromptName) || !version) { + return { generatedAt, available: false, body: null }; + } + try { + const { ArthurClient } = await import("../../../../../../sandbox/arthur-client.js"); + const client = ArthurClient.fromTraceEndpoint(env.GENAI_ENGINE_TRACE_ENDPOINT!, env.GENAI_ENGINE_API_KEY!); + const body = await client.getPromptVersionBody(env.GENAI_ENGINE_PROMPT_TASK_ID!, name, version); + return { generatedAt, available: body !== null, body }; + } catch (err) { + logger.warn({ name, version, err: (err as Error).message }, "prompt_version_body_failed"); + return { generatedAt, available: false, body: null }; + } +}); +``` + +> Verify the relative import depth for this nested route path against the repo's actual `tsconfig`/route layout — count segments from `routes/api/v1/prompts/[name]/versions/` back to `apps/worker/{env.ts,src/lib,src/sandbox}`. Adjust `../` counts accordingly (the `env.ts` lives at `apps/worker/env.ts`, not under `src/`). Confirm h3's file-based dynamic-segment convention uses `[name]`/`[version]` here the same way `runs/[runId].get.ts` does. + +- [ ] **Step 3: Typecheck the worker** + +Run: `pnpm -F @apps/worker exec tsc --noEmit` +Expected: PASS. + +- [ ] **Step 4: Smoke the endpoints locally (optional but recommended)** + +Start the worker, then: +`curl -s -H "Authorization: Bearer $WORKER_API_TOKEN" http://localhost:<port>/api/v1/prompts | jq` +Expected: `{ available: true, arthurEnabled: <bool>, total: 3, rows: [3 prompts; each has body, source, model, versions[]] }`. With Arthur on, `versions` is non-empty and one entry carries `body`. +`curl -s -H "Authorization: Bearer $WORKER_API_TOKEN" http://localhost:<port>/api/v1/prompts/research-plan/versions/1 | jq` +Expected (Arthur on): `{ available: true, body: "..." }`; (Arthur off / missing): `{ available: false, body: null }`. 
+ +--- + +### Task 5: Add the dashboard fallback + +**Files:** +- Modify: `apps/dashboard/lib/api/fallbacks.ts` + +- [ ] **Step 1: Add `promptsFallback`** + +Add `PromptsResponse` to the existing `import type { ... } from "@shared/contracts"`, then append: + +```ts +export function promptsFallback(now: string): PromptsResponse { + return { generatedAt: now, available: false, arthurEnabled: false, rows: [], total: 0 }; +} +``` + +- [ ] **Step 2: Typecheck dashboard** + +Run: `cd apps/dashboard && npx tsc --noEmit` +Expected: PASS (no new errors from this file). + +--- + +### Task 6: Add the loading skeleton + +**Files:** +- Create: `apps/dashboard/app/prompts-skeleton.tsx` + +- [ ] **Step 1: Create the skeleton (mirror `overview-skeleton.tsx`)** + +Header + 4-up KPI row + two-column (rail + detail) block matching the `/prompts` layout: + +```tsx +// apps/dashboard/app/prompts-skeleton.tsx +function Block({ className = "" }: { className?: string }) { + return
; +} + +export function PromptsSkeleton() { + return ( +
+
+ + +
+
+ {Array.from({ length: 4 }, (_, i) => ( + + ))} +
+
+ + +
+
+ ); +} +``` + +- [ ] **Step 2: Typecheck** + +Run: `cd apps/dashboard && npx tsc --noEmit` +Expected: PASS. + +--- + +### Task 7: Add the server data component + the client-side version-body proxy route + +**Files:** +- Create: `apps/dashboard/app/prompts-data.tsx` +- Create: `apps/dashboard/app/api/prompts/[name]/versions/[version]/route.ts` + +- [ ] **Step 1: Create the server component (mirror `runs-data.tsx`)** + +```tsx +// apps/dashboard/app/prompts-data.tsx +import { getJSON } from "@/lib/api/server"; +import { PromptsScreen } from "@/components/cockpit/screens/prompts"; +import type { PromptsResponse } from "@shared/contracts"; +import { promptsFallback } from "@/lib/api/fallbacks"; + +export async function PromptsData() { + const now = new Date().toISOString(); + const data = await getJSON<PromptsResponse>("/api/v1/prompts").catch(() => + promptsFallback(now), + ); + return <PromptsScreen data={data} />; +} +``` + +> This won't typecheck until Task 8 changes `PromptsScreen`'s signature. Expected; the full gate is in Task 9. + +- [ ] **Step 2: Create the same-origin proxy route for lazy version bodies** + +`PromptsScreen` is a client component; the bearer-gated worker API can't be hit from the browser (the token is server-only). 
Add a Next route handler that proxies the worker server-side: + +```ts +// apps/dashboard/app/api/prompts/[name]/versions/[version]/route.ts +import { NextResponse } from "next/server"; +import { getJSON } from "@/lib/api/server"; +import type { PromptVersionBodyResponse } from "@shared/contracts"; + +export async function GET( + _req: Request, + { params }: { params: Promise<{ name: string; version: string }> }, +) { + const { name, version } = await params; + const now = new Date().toISOString(); + const data = await getJSON( + `/api/v1/prompts/${encodeURIComponent(name)}/versions/${encodeURIComponent(version)}`, + ).catch(() => ({ generatedAt: now, available: false, body: null })); + return NextResponse.json(data); +} +``` + +> `params` is a Promise in Next 15 route handlers — confirm against the repo's Next version and existing route-handler conventions (check whether other `app/api/**/route.ts` files already exist to mirror their `params` typing). If none exist, this is the first; that's fine. + +- [ ] **Step 3: Typecheck dashboard** + +Run: `cd apps/dashboard && npx tsc --noEmit` +Expected: PASS for the route handler (the `prompts-data.tsx` line still fails until Task 8; full gate in Task 9). + +--- + +### Task 8: Convert `PromptsScreen` to consume real data (with real version history) + +**Files:** +- Modify: `apps/dashboard/components/cockpit/screens/prompts.tsx` + +Keep the read-only registry + version-timeline shape, now backed by real data. **Remove** the per-version metrics grid and the two-column A/B diff (no Arthur source). Reuse existing `CkCard`, `CkKPI`, `Stat`, the chip styling, and the single-column mono body markup lifted from the old `PromptDiff`. 
+ +- [ ] **Step 1: Replace imports and remove mock dependency** + +```tsx +"use client"; + +import React, { useState, useEffect } from "react"; +import { CkCard, CkKPI } from "@/components/ui"; +import type { PromptsResponse, PromptDef, PromptVersion } from "@shared/contracts"; +``` + +Remove: `AIWF_DATA`, and the mock `Prompt`/`PromptVersion`/`PromptTag` imports from `@/lib/types` (the `PromptVersion` now comes from `@shared/contracts`). Remove `const D = AIWF_DATA`. Keep `useEffect` (used to reset/lazy-load the selected version body when the active prompt changes). `CkChip` stays if still used. + +- [ ] **Step 2: Repurpose `PromptStatusChip` for real tags + source** + +`PromptStatusChip` keys off a status string. Real statuses now are: the production tag (`production`) on a version, and the resolution `source` (`arthur`/`fallback`). Add `arthur`/`fallback` keys to `PROMPT_STATUS_COLOR` and keep the existing `production`/`staging`/`draft`/`archived`/`locked` keys (real Arthur `tags` may include any string — unknown tags fall through to the default style already coded). + +- [ ] **Step 3: Rewrite `PromptList` to consume `PromptDef[]`** + +- Signature: `function PromptList({ rows, active, onSelect }: { rows: PromptDef[]; active: string; onSelect: (name: string) => void })`. +- Tag filter pills: derive the option set from the tags that actually occur across `rows[].versions[].tags` (e.g. `["all", ...uniqueTags]`); filter rows by whether any of their versions carries the selected tag. (If no versions/tags exist — Arthur off — render just `all` or hide the pill row.) +- Each row keyed by `p.name`; show `p.name`, `p.phase`, `p.model`, the production-tag chip (from the version tagged `production`), and a `source` chip. Remove the eval score/delta figure. +- `eyebrow`: `` `${arthurEnabled ? "Arthur" : "In-code"} · ${rows.length} prompts` `` — thread `arthurEnabled` through as a prop. 
+ +- [ ] **Step 4: Rewrite `PromptDetail` — body panel + real version timeline** + +- Signature: `function PromptDetail({ prompt }: { prompt: PromptDef | undefined })`. +- Keep the "Select a prompt to inspect." empty state when `prompt` is undefined. +- Header eyebrow: `{prompt.source === "arthur" ? "Arthur" : "In-code"} · {prompt.phase}`. Title: `prompt.name`. Action chips: the `source` chip. Leave the `+ New version` / `Deploy` buttons inert (read-only). +- Replace the four mock `Stat`s with real ones: `Phase` = `prompt.phase`, `Source` = `prompt.source`, `Model` = `prompt.model`, `Versions` = `prompt.versions.length`. +- **Version timeline (real):** map `prompt.versions` (newest first). Each card shows: `v{version}`, `createdAt` (format as-is or relative), tag chips (`v.tags`), `modelName`, and `numMessages`/`numTools` counts. **Delete** the mock per-card eval/halluc/p95/cost rows and the `traffic` bar. Clicking a version selects it for the body panel. +- **Body panel (single column, read-only):** lift the inner mono `
` markup from the old `PromptDiff` (drop the two-column diff). Default shows `prompt.body` (the production version). When the user selects a non-production version, fetch its body once via the proxy route and render it: + ```tsx + const [selectedVersion, setSelectedVersion] = useState<PromptVersion["version"] | null>(null); + const [bodyCache, setBodyCache] = useState<Record<string, string>>({}); + const [loading, setLoading] = useState(false); + // reset selection and cached bodies when the prompt changes (the cache is keyed by version only) + useEffect(() => { setSelectedVersion(null); setBodyCache({}); }, [prompt?.name]); + async function showVersion(v: PromptVersion) { + setSelectedVersion(v.version); + if (v.body) { setBodyCache((c) => ({ ...c, [v.version]: v.body! })); return; } + if (bodyCache[v.version] !== undefined) return; + setLoading(true); + try { + const res = await fetch(`/api/prompts/${prompt!.name}/versions/${v.version}`); + const json = (await res.json()) as { body: string | null }; + setBodyCache((c) => ({ ...c, [v.version]: json.body ?? "(version body unavailable)" })); + } catch { + // network/parse failure: show the same inline note instead of an unhandled rejection + setBodyCache((c) => ({ ...c, [v.version]: "(version body unavailable)" })); + } finally { setLoading(false); } + } + const shownBody = selectedVersion != null ? (bodyCache[selectedVersion] ?? (loading ? "Loading…" : "")) : prompt!.body; + ``` +- Delete the now-unused `PromptDiff` and `PromptMetrics` functions. + +- [ ] **Step 5: Rewrite the top-level `PromptsScreen`** + +```tsx +export function PromptsScreen({ data }: { data: PromptsResponse }) { + const [active, setActive] = useState(data.rows[0]?.name ?? ""); + const selected = data.rows.find((p) => p.name === active); + const inProd = data.rows.filter((p) => p.versions.some((v) => v.tags.includes("production"))).length; + return ( +
+ {/* header — keep the title; leave the inert Import/New buttons */} +
+ + + {/* A/B + avg-Δ tiles removed — no real source */} +
+
+ + +
+
+ ); +} +``` + +> Reduced from 4 KPI tiles to 2 because the A/B-test and avg-eval-Δ tiles have no real source (removed, not stubbed). Adjust the grid (`lg:grid-cols-2`) accordingly. + +- [ ] **Step 6: Verify no mock references remain** + +Run: `grep -nE "AIWF_DATA|\\bD\\.|PROMPT_BODIES|PromptTag|from \"@/lib/types\"" apps/dashboard/components/cockpit/screens/prompts.tsx` +Expected: no matches (note `PromptVersion` now legitimately appears via `@shared/contracts`, so it's excluded from this grep). + +--- + +### Task 9: Rewrite the route to the server pattern + verify + +**Files:** +- Modify: `apps/dashboard/app/(cockpit)/prompts/page.tsx` + +- [ ] **Step 1: Replace the page with the Suspense + server-component pattern** + +```tsx +// apps/dashboard/app/(cockpit)/prompts/page.tsx — Prompts ("/prompts") +import { Suspense } from "react"; + +import { PromptsData } from "@/app/prompts-data"; +import { PromptsSkeleton } from "@/app/prompts-skeleton"; + +export default function PromptsPage() { + return ( + }> + + + ); +} +``` + +- [ ] **Step 2: Typecheck the whole dashboard** + +Run: `cd apps/dashboard && npx tsc --noEmit` +Expected: PASS, no errors. + +- [ ] **Step 3: Lint the changed files** + +Run: `cd apps/dashboard && npx next lint --file app/prompts-data.tsx --file app/prompts-skeleton.tsx --file "app/api/prompts/[name]/versions/[version]/route.ts" --file "app/(cockpit)/prompts/page.tsx" --file components/cockpit/screens/prompts.tsx` +Expected: no errors. + +- [ ] **Step 4: Visual check** + +Run: `cd apps/dashboard && pnpm dev`, open `/prompts`. +Expected: +- Three prompts listed (`research-plan`, `implement`, `review`) by phase + model. +- Selecting one shows its production body. With Arthur enabled, the version timeline lists real Arthur versions (version number, created-at, tags, model); clicking a historical version fetches and shows that version's body via `/api/prompts/{name}/versions/{version}`. 
+- With Arthur disabled, `source` chip reads `fallback`, the timeline is empty, and bodies match `apps/worker/src/lib/prompts.ts`. +- With the worker unreachable (`WORKER_BASE_URL` unset), the page shows the empty state (`0 prompts`), no crash. A failed version-body fetch shows an inline "version body unavailable" note, no crash. + +- [ ] **Step 5: Commit (ONLY if the user asks)** + +```bash +git add apps/shared/contracts/api.ts apps/shared/contracts/domain.ts \ + apps/worker/src/sandbox/arthur-client.ts apps/worker/src/sandbox/arthur-client.test.ts \ + apps/worker/src/lib/prompts/resolve.ts apps/worker/src/workflows/prompts-step.ts \ + apps/worker/src/routes/api/v1/prompts.get.ts \ + "apps/worker/src/routes/api/v1/prompts/[name]/versions/[version].get.ts" \ + apps/dashboard/lib/api/fallbacks.ts apps/dashboard/app/prompts-data.tsx \ + "apps/dashboard/app/api/prompts/[name]/versions/[version]/route.ts" \ + apps/dashboard/app/prompts-skeleton.tsx "apps/dashboard/app/(cockpit)/prompts/page.tsx" \ + apps/dashboard/components/cockpit/screens/prompts.tsx +git commit -m "feat: wire /prompts to real worker data with Arthur version history" +``` + +--- + +## Self-Review + +**Spec coverage:** +- `PromptVersion` + `PromptDef` + `PromptsResponse` + `PromptVersionBodyResponse` contracts → Task 1. ✓ +- Arthur read methods (`listPromptVersions`, `getPromptVersionBody`) → Task 2. ✓ +- Real data source (Arthur production tags + in-code fallbacks) + version history via shared `resolvePrompts()` → Task 3. ✓ +- Worker list route `GET /api/v1/prompts` + on-demand body route `GET /api/v1/prompts/[name]/versions/[version]` → Task 4. ✓ +- Dashboard `promptsFallback` → Task 5. ✓ +- `prompts-skeleton.tsx` → Task 6. ✓ +- `prompts-data.tsx` server component + client-side version-body proxy route → Task 7. ✓ +- `PromptsScreen` swap to read-only real-data view with real version timeline; per-version metrics + A/B diff markup removed → Task 8. ✓ +- Page route → server pattern → Task 9. 
✓ +- Worker-down empty state → `promptsFallback` (Task 5) + route catch (Task 4), verified in Task 9 Step 4. ✓ +- Embellishment removal (per-version eval/halluc/p95/cost, traffic split, eval Δ, A/B test KPI) — markup deleted, not stubbed (Task 8). ✓ + +**Decisions resolved (no longer open):** read-only confirmed; real version history in scope (metadata + on-demand bodies); tags are real; `resolvePrompts()` extraction confirmed OK; production-body eager / historical lazy. + +**Still-open items (flagged in spec, do not block execution):** +1. Lazy vs eager historical body fetch — plan implements eager-production / lazy-history; switch if the user prefers otherwise. +2. Version-list pagination depth — plan fetches first page only; add pagination if deep history is required. + +**Type consistency:** `PromptsResponse`/`PromptDef`/`PromptVersion`/`PromptVersionBodyResponse` imported from `@shared/contracts` across Tasks 3, 4, 5, 7, 8. `PromptsScreen` accepts `{ data: PromptsResponse }` (Task 8) — matches the call site (Task 7). `ResolvedPrompt` (worker) is structurally assignable to `PromptDef` (incl. `versions: PromptVersion[]`); widen the `name` field if TS narrows on the literal union. `ArthurPromptVersion` (snake_case Arthur shape) is mapped to the camelCase `PromptVersion` inside `resolvePrompts()`. `PromptsSkeleton` (Task 6) matches the import in Task 9. ✓ + +**Placeholder scan:** No TBD/TODO. Verify, when executing: worker route import depths (esp. the nested `prompts/[name]/versions/[version].get.ts` path), the Next route-handler `params` Promise convention against the repo's Next version, and the worker dev-run command — all flagged inline. 
✓ + diff --git a/docs/superpowers/specs/2026-06-08-cost-real-data-design.md b/docs/superpowers/specs/2026-06-08-cost-real-data-design.md new file mode 100644 index 0000000..657bfa2 --- /dev/null +++ b/docs/superpowers/specs/2026-06-08-cost-real-data-design.md @@ -0,0 +1,166 @@ +# `/cost` Real-Data Conversion — Design + +**Date:** 2026-06-08 +**Status:** Draft — has open questions (see end) +**Scope:** Convert the `/cost` (Cost & Usage) dashboard page from mock data to live worker data, mirroring the overview/runs server-component fetch pattern. Cost + token usage come from **Arthur** (the GenAI Engine), which already aggregates token counts and USD cost from the OpenInference traces the workflow ships in. + +## Problem + +The `/cost` page (`apps/dashboard/app/(cockpit)/cost/page.tsx`) renders a complete UI — spend / token KPIs, a daily-spend area chart, a per-model donut + breakdown table, and a per-workflow/task breakdown table — entirely from mock data (`AIWF_DATA.COST_BY_MODEL`, `AIWF_DATA.HOURS24`, `AIWF_DATA.WORKFLOWS`). The overview and `/runs` pages already fetch real data from the worker; `/cost` should do the same. + +The overview's `cost24h` (`KpisResponse`), `Run.cost`, `Run.tokens`, and `WorkflowRow.costToday` are all hardcoded `null` (`collect-kpis.ts:69`, `collect-runs.ts:171-172`, `collect-workflows.ts:81`, `derive-kpis.ts:49`) because the Vercel Workflow run store carries no usage. But the workflow already ships OpenInference traces to Arthur (per-ticket task, `apps/worker/src/sandbox/arthur-tracer.ts` + `arthur-client.ts`), and **Arthur aggregates token + cost data first-class** on those traces. So the real source already exists and is queryable — no new capture or persistence is needed. 
+ +## Current state + +### What the screen needs (exact data shape) + +Read from `apps/dashboard/components/cockpit/screens/cost.tsx`: + +| UI element | Mock source | Real source after this change | +| --- | --- | --- | +| KPI: spend | `sum(COST_BY_MODEL.cost)` | `totals.totalTokenCost` (USD) | +| KPI: Tokens | `sum(COST_BY_MODEL.tokens)` | `totals.totalTokens` | +| KPI: Cost/run avg | hardcoded `$0.41` | `totals.costPerRun` | +| KPI: Projection EoM | hardcoded `$1,184` | **removed** (no source) | +| Area chart "Daily spend" | `HOURS24.map(h => h.cost*24)` | `daily[].cost` + `daily[].date` (Arthur timeseries) | +| Donut "Model mix" | `COST_BY_MODEL[].share` + center | `byModel[].cost` → shares computed in-screen; center = `totalTokenCost` | +| Table "Per-model breakdown" | `COST_BY_MODEL[]` | `byModel[] { model, cost, tokens }` (span-level aggregation) | +| Table "Per-workflow breakdown" | `WORKFLOWS[]` sorted by `costToday` | `byWorkflow[]` (= per-Arthur-task; see mapping note) | +| Header tabs "By model / workflow / actor" | inert | **removed** | +| "Export CSV" button | inert | **removed** | +| Sparklines (`Spark`, random `sparkSeries`) | mock RNG | **removed** | +| Budget `$1,200`, MoM/WoW deltas | hardcoded | **removed** | + +Mock shapes (replaced): `CostByModel { model, vendor, cost, tokens, share }` (`apps/dashboard/lib/types.ts:36`); `HourPoint` (`apps/shared/contracts/domain.ts:129`). + +### How real data flows (the template — overview/runs) + +1. Worker route `apps/worker/src/routes/api/v1/...` returns a typed `@shared/contracts` response; wraps the collector in try/catch and degrades to an empty payload on failure (see `runs.get.ts`, `workflows.get.ts`). Sends `Cache-Control: private, max-age=15, swr=60`. +2. Response interface declared in `apps/shared/contracts/api.ts`. +3. Dashboard fetches server-side via `getJSON(path)` (`apps/dashboard/lib/api/server.ts`) — bearer `WORKER_API_TOKEN`, `cache: "no-store"`. +4. 
A `*-data.tsx` server component calls `getJSON`, `.catch()`s to a fallback in `apps/dashboard/lib/api/fallbacks.ts`, passes a `data` prop to the client screen. +5. The page is a thin `<Suspense fallback={<…Skeleton />}>` route wrapping that server component. + +This is a **single-PR conversion** — no persistence layer, no two-step rollout. + +## The real data source — Arthur GenAI Engine + +The worker already holds an Arthur client. `ArthurClient.fromTraceEndpoint(env.GENAI_ENGINE_TRACE_ENDPOINT, env.GENAI_ENGINE_API_KEY)` (`arthur-client.ts:37`) builds a client whose `request` helper sends `Authorization: Bearer <GENAI_ENGINE_API_KEY>`. Both env vars are optional (`apps/worker/env.ts:83-84`) → when unset, the route falls back to the empty state. Reads require the `INFERENCE_READ` permission on the key. Arthur is org-scoped (the single deployment sees its own org) — consistent with this project's single-tenant deployment model. + +### Token + cost are first-class on Arthur traces + +Traces/spans extend `TokenCountCostSchema`: +`{ prompt_token_count, completion_token_count, total_token_count, prompt_token_cost, completion_token_cost, total_token_cost }` (cost in USD floats, `null` if unavailable). Responses also carry `display_currency` (defaults USD). + +### Endpoints used + +1. **Totals + per-task breakdown (one call):** `POST /api/v1/traces/overview` + body `{ task_ids, start_time, end_time }` → + `{ count, overviews: [{ task_id, trace_count, trace_token_count, trace_token_cost, eval_count, continuous_eval_success_rate, last_active }] }`. + Multi-task in one call gives fleet totals (sum across `overviews`) **and** the per-task breakdown over a window. + +2. **Daily-spend chart:** `POST /api/v1/traces/overview/timeseries` + body `{ task_id, start_time, end_time, bucket_size }` (**single task per call**) → + points `{ timestamp, trace_count, trace_token_count, trace_token_cost, continuous_eval_success_rate }`. 
+ For a fleet daily-spend chart, fan out one call per task and **merge points by bucket timestamp**, summing `trace_token_cost`/`trace_token_count`. (`bucket_size` allowed values are unconfirmed — see open questions.) + +3. **By-model breakdown (the one manual aggregation):** `GET /api/v1/traces/spans` (and/or `GET /api/v1/traces`) extend `TokenCountCostSchema`, and spans carry `model_name`. The overview endpoint is per-**task**, not per-model, so a by-model table requires fetching span rows for the window and **summing token/cost client-side grouped by `model_name`**. This is the only client-side aggregation; flagged below. + +### How usage→cost is computed + +No client-side pricing. Arthur returns USD cost directly (`*_token_cost`), already derived from the traces. The worker just sums Arthur's pre-aggregated numbers (for totals/timeseries) or groups span rows by `model_name` (for the by-model table). The pricing table (`apps/worker/src/sandbox/agents/pricing.ts`) and the Slack `usageReport` path are untouched and not on this read path. + +### Reconciliation with the overview KPI (out of scope, noted) + +The overview's `cost24h` / `WorkflowRow.costToday` / `Run.cost` are hardcoded `null` today. The same Arthur source could backfill those so cost is computed in exactly one place going forward (e.g. `collectKpis`/`collectWorkflows` querying `/traces/overview` for the matching task/window). Out of scope for this PR, but called out so the `null` placeholders aren't reinvented elsewhere. + +## Proposed contract (`apps/shared/contracts/api.ts`) + +```ts +export interface CostByModelEntry { + model: string; // Arthur span model_name + cost: number; // USD, summed total_token_cost over the window + tokens: number; // summed total_token_count over the window +} + +export interface CostByWorkflowEntry { + /** Arthur task_id (per ticket-run, e.g. "AWT-42" / "AWT-42.1"). */ + taskId: string; + /** Arthur task name (= the ticket-run identifier). 
*/ + name: string; + runs: number; // trace_count for the task + tokens: number; // trace_token_count + cost: number; // trace_token_cost (USD) + costPerRun: number; // cost / max(1, runs) +} + +export interface CostResponse { + generatedAt: string; + /** false when Arthur is unconfigured/unreachable or returns nothing. The + * screen renders its empty/N-A state. */ + available: boolean; + /** Window the figures cover (the request's start_time/end_time). */ + window: { start: string; end: string }; // ISO + totals: { + totalTokenCost: number; // USD, Σ overviews[].trace_token_cost + totalTokens: number; // Σ overviews[].trace_token_count + traceCount: number; // Σ overviews[].trace_count + costPerRun: number; // totalTokenCost / max(1, traceCount) + }; + byModel: CostByModelEntry[]; + /** Per-task (= per ticket-run) breakdown from /traces/overview. */ + byWorkflow: CostByWorkflowEntry[]; + /** Per-day spend, oldest→newest, merged across tasks from the timeseries. */ + daily: { date: string; cost: number; tokens: number }[]; // date = bucket ISO timestamp +} +``` + +Notes: +- `byWorkflow` is named to match the screen's "Per-workflow breakdown" section, but its entries are **per Arthur task** (per ticket-run), since that's the natural grain of `/traces/overview`. See the mapping open question. +- Stripped from the contract/screen (no real source, per user decision): budget, MoM/WoW deltas, EoM projection, "By actor" tab, decorative sparklines, "Export CSV". 
+ +## Fallback / unavailable state + +Add `costFallback(now)` to `apps/dashboard/lib/api/fallbacks.ts`: + +```ts +export function costFallback(now: string): CostResponse { + return { + generatedAt: now, + available: false, + window: { start: now, end: now }, + totals: { totalTokenCost: 0, totalTokens: 0, traceCount: 0, costPerRun: 0 }, + byModel: [], + byWorkflow: [], + daily: [], + }; +} +``` + +The worker route degrades to the same empty payload (`available:false`) when `GENAI_ENGINE_API_KEY`/`GENAI_ENGINE_TRACE_ENDPOINT` are unset or any Arthur call throws — matching `runs.get.ts`/`workflows.get.ts`. The screen renders `$0.00` / `0` / empty tables — never crashes. + +## Behavior + +- **Happy path:** `/cost` shows real spend, token totals, per-model and per-task breakdowns, and a per-day spend chart, all from Arthur over the chosen window. +- **Arthur unconfigured / unreachable / 401:** `getJSON` returns (or the worker degrades to) `available:false` → empty/zero state. No crash. + +## Out of scope + +- Wiring tabs / "Export CSV" (removed). +- Backfilling the overview's `cost24h`/`costToday`/`Run.cost` from Arthur (mentioned above). +- A task→workflow mapping for a true by-workflow rollup (breakdown stays per-task). + +## Open questions / assumptions + +1. **`bucket_size` values.** `/traces/overview/timeseries` takes a `bucket_size`, but the allowed values (e.g. `"day"` vs a duration vs an enum) are unconfirmed. **Assumption:** a day-granularity bucket exists for the daily chart; confirm the exact value. +2. **Empty `task_ids`.** Does `/traces/overview` with an empty/omitted `task_ids` return org-wide totals, or is `task_ids` required? If required, the worker must first list the org's tasks (the client already lists tasks via `/api/v2/tasks/search`) and pass their ids. **Assumption:** we enumerate tasks and pass ids explicitly. +3. 
**By-model client aggregation.** Per-model totals require fetching span rows and summing by `model_name` client-side (Arthur has no per-model overview). Acceptable, given span volume per window? Or drop the by-model table for v1? +4. **Task→workflow mapping.** Arthur tasks are per ticket-run (`AWT-42`, `AWT-42.1`). The "by workflow" section therefore shows **per-task** rows unless we add a task→workflow mapping. Stated, not blocking; per-task is the natural breakdown. +5. **Window.** Which window do the KPIs cover — calendar MTD, rolling 30d, or 24h? Drives `start_time`/`end_time`. **Assumption:** calendar month-to-date (matches the original "MTD" framing); confirm. + +## Verification + +1. Worker + dashboard typecheck pass. +2. `GET /api/v1/cost` returns non-empty `totals`/`byWorkflow` for a window with real Arthur traces. +3. `/cost` renders those figures (spend, tokens, breakdowns, daily chart). +4. With Arthur unconfigured (env unset) or unreachable, `/cost` shows the zero/empty state — no crash. diff --git a/docs/superpowers/specs/2026-06-08-evals-real-data-design.md b/docs/superpowers/specs/2026-06-08-evals-real-data-design.md new file mode 100644 index 0000000..95d683a --- /dev/null +++ b/docs/superpowers/specs/2026-06-08-evals-real-data-design.md @@ -0,0 +1,227 @@ +# `/evals` Real-Data Conversion — Design + +**Date:** 2026-06-08 +**Status:** Draft (has open questions — see end) +**Scope:** Convert the `/evals` page from mock data to live data, mirroring the overview/runs server-component fetch pattern. Unlike `/runs`, the worker does **not** yet expose an evals list endpoint and the underlying eval results are **not yet read from anywhere** — so this design also covers the prerequisite of producing/reading eval results, with the data-source decision flagged explicitly. + +## Problem + +The `/evals` dashboard page (`apps/dashboard/app/(cockpit)/evals/page.tsx`) is a 4-line stub that renders `EvalsScreen` with no data fetch. 
`EvalsScreen` (`apps/dashboard/components/cockpit/screens/evals.tsx`) is a `"use client"` component that reads the hardcoded `AIWF_DATA.EVALS` mock slice and draws synthetic sparklines via `jitterSeries`. Nothing on this page is real. + +We want `/evals` to fetch real data from the worker through the same three-layer pattern the overview and runs pages use: +1. thin server route (`page.tsx`) → `<Suspense>` + server data component; +2. `evals-data.tsx` server component calling `getJSON` with a `.catch()` fallback; +3. client presenter `EvalsScreen` receiving a typed `data` prop. + +## Current state + +### Mock (what the screen renders today) + +`apps/dashboard/lib/data/mock.ts` exports `EVALS: EvalMetric[]` (the "Arthur evals" slice, lines ~82–93). The shape is `EvalMetric` from `apps/dashboard/lib/types.ts`: + +```ts +export interface EvalMetric { + metric: string; // "Hallucination", "PII Detection", … + value: number; // numeric reading + target: string; // human string, e.g. "< 0.05", "= 0", "flags" + status: "pass" | "warn" | "fail"; + trend: number; // signed delta vs prior window + axis: "safety" | "quality" | "ops"; // grouping bucket + family: string; // "output" | "agent" | "input" | "rag" | "runtime" + unit?: string; // optional, e.g. "flags/24h" +} +``` + +`EvalsScreen` renders, per `axis` group ("safety", "quality", "ops"): +- a `CkCard` with eyebrow=axis, title from a fixed map, a left-border accent color, and an action label `{list.length} evaluators`; +- one cell per metric containing: `metric` name, a `pass`/`warn`/`fail` `CkChip`, the formatted `value` (`<1` → `toFixed(3)`, else as-is), optional `unit`, a `trend` indicator (`↗`/`↘`/`→` + `Math.abs(trend).toFixed(3)`; **negative trend renders green, positive red** — i.e. "down is good" by current convention), a `Spark` sparkline, and `target {e.target}`. 
+ +Header chrome is decorative/hardcoded: the eyebrow "Arthur engine · continuous evaluation", the title "Evaluations & guardrails", a `CkChip` "Live · 12,408 spans · 24h", and a `+ New eval` button. + +**The sparkline is fake:** `Spark data={jitterSeries(...)}`. There is no per-metric time series in the mock or anywhere else. + +### Existing eval scaffold + +`apps/shared/contracts/api.ts` already declares a discriminated union: + +```ts +export type EvalHealthResponse = + | { available: true; score: number; pass: number; warn: number; fail: number; + spansGraded: number; windowHours: number } + | { available: false; reason: string }; +``` + +The worker route `apps/worker/src/routes/api/v1/overview/eval-health.get.ts` is a hardcoded stub returning `{ available: false, reason: "Eval grading not wired up yet." }`. The overview page already consumes it: `overview-data.tsx` fetches `/api/v1/overview/eval-health` (falls back to `evalHealthFallback()` → `{ available: false, reason: "Worker unavailable." }`), and `EvalHealthKPI` in `overview.tsx` renders a donut of pass/warn/fail + score + `spansGraded`/`windowHours` when `available`, else the `reason` string. This is a **summary** KPI tile, not the per-metric breakdown the `/evals` page needs. + +### Where eval results actually originate (the real data source — CONFIRMED) + +Arthur is integrated **write-only** today: +- `apps/worker/src/sandbox/arthur-client.ts` — a client for the Arthur GenAI Engine **tasks/prompts** API (`/api/v2/tasks*`, `/api/v1/tasks/{id}/prompts*`). It creates one task per ticket run and hosts/tags prompt versions. It has **no** read method yet. +- `apps/worker/src/sandbox/arthur-tracer.ts` — a bundled Python OpenInference tracer that **ships traces/spans into** Arthur Engine from inside each sandbox via `POST /api/v1/traces`. Data flows out of the worker; nothing reads it back. 
+- Wiring lives in `apps/worker/src/workflows/agent.ts` (`ensureArthurTaskForTicket`, gated on `env.GENAI_ENGINE_API_KEY` + `env.GENAI_ENGINE_TRACE_ENDPOINT`). + +**The Arthur GenAI Engine DOES expose a read API** (ground-truthed from `arthur-ai/arthur-engine` + `arthur-common` on `main`). Auth is the **same** `Authorization: Bearer GENAI_ENGINE_API_KEY` used for writes; reads require the `INFERENCE_READ` permission. All reads are **org-scoped** — a deployment's key sees its whole org, which matches our single-tenant-per-deployment model. The relevant endpoints: + +- **Fleet aggregate (primary source for this page) — one call, multi-task:** + `POST /api/v1/traces/overview` body `TraceOverviewRequest { task_ids, start_time, end_time }` → `TraceOverviewListResponse { count, overviews: TraceOverviewResponse[] }`. Each `TraceOverviewResponse` = `{ task_id, trace_count, trace_token_count, trace_token_cost, eval_count, continuous_eval_success_rate, last_active }`. This yields fleet-wide eval health (success rate + trace/eval counts) over a 24h window with no per-task fan-out at the result-shaping layer. +- **Per-metric breakdown (optional):** `GET /api/v1/traces/spans` (list, metadata only) → `GET /api/v1/traces/spans/{span_id}` → `SpanWithMetricsResponse.metric_results: MetricResultResponse[]` where each = `{ id, metric_type, details, prompt_tokens, completion_tokens, latency_ms, span_id, metric_id, created_at }`. `metric_type` is an enum of **only** `QueryRelevance | ResponseRelevance | ToolSelection`. `details` is an opaque JSON string (e.g. relevance → `{ bert_f_score, reranker_relevance_score, llm_relevance_score, reason }`). **There is no flat numeric score or pass/fail on a metric result** — we parse `details` and apply our own threshold. 
+- **Trend/timeseries (optional):** `POST /api/v1/traces/overview/timeseries` body `{ task_id, start_time, end_time, bucket_size }` (**single task per call**) → points `{ timestamp, trace_count, trace_token_count, trace_token_cost, continuous_eval_success_rate }`. + +#### CRITICAL CAVEAT — what our trace path actually yields + +The rich rule-based evals the mock screen implies — **hallucination, PII, toxicity, prompt-injection** Pass/Fail — live in Arthur's **legacy inference/rule model**, populated **only** by the `/validate_prompt` + `/validate_response` write path. **We never call that path; we only ship OpenInference traces (`POST /api/v1/traces`).** Therefore `GET /api/v2/inferences/query` and those rule families are **empty for us**. + +What our trace path actually produces: +- `continuous_eval_success_rate`, `eval_count` (spans graded), `trace_count` — from `/traces/overview`; +- the three relevance/tool metric types — and **only if continuous evals are configured on the task**; otherwise `eval_count = 0`. + +So the realistic `/evals` page = an overall **eval-health score** (`continuous_eval_success_rate × 100`), the **graded count + window**, and a **relevance / tool-selection breakdown**. The hallucination/toxicity/PII/prompt-injection families the mock shows are **dropped** from this page. Adopting Arthur's `validate_*` API to populate them is a **separate future prerequisite, explicitly out of scope** here. + +**Conclusion:** evals are now reachable via a confirmed read API, so this is no longer blocked. Conversion's prerequisite is to add a worker-side read path (`getTracesOverview()` on `ArthurClient` + a `collect-evals.ts` collector). When Arthur is unconfigured, or when `eval_count = 0` (no continuous evals configured / no graded spans in window), the page degrades to the documented unavailable state — exactly like `eval-health` does today. + +## Proposed data contract + +Add to `apps/shared/contracts/api.ts`. 
The shape now maps directly to `TraceOverviewResponse` (the fleet aggregate) plus the relevance/tool-selection breakdown. We reuse the **same discriminated-union shape** as `EvalHealthResponse` so the page handles "not wired up" / "nothing graded" identically to overview. Fields with no real source on our trace-only path are **dropped** (no synthetic sparklines, no rule families). + +```ts +/** One evaluator's aggregate reading over the window. Limited to the metric + * types Arthur computes from our OpenInference trace path: + * ResponseRelevance / QueryRelevance / ToolSelection. */ +export interface EvalMetricRow { + metric: string; // display name, e.g. "Response Relevance" + metricType: // Arthur metric_type enum + | "QueryRelevance" + | "ResponseRelevance" + | "ToolSelection"; + value: number; // aggregate score parsed from metric_results.details + status: "pass" | "warn" | "fail"; // computed against our own threshold + axis: "quality"; // all three are quality-axis on our path + // Only present when /traces/overview/timeseries is wired (see Open Q1). + trend?: number | null; // signed delta vs window start; omitted if not wired + spark?: number[]; // success-rate buckets; omitted if not wired +} + +export type EvalsResponse = + | { + available: true; + generatedAt: string; + windowHours: number; + /** continuous_eval_success_rate × 100, fleet-wide. */ + score: number; + /** Σ eval_count across tasks — "spans graded" in the window. */ + spansGraded: number; + /** Σ trace_count across tasks. */ + traceCount: number; + /** Per-metric-type breakdown; empty if no continuous evals configured. */ + rows: EvalMetricRow[]; + } + | { available: false; generatedAt: string; reason: string }; +``` + +Notes: +- `spansGraded`/`traceCount` come straight from summing `eval_count`/`trace_count` across the returned overviews. `score` is a rate, so it is aggregated rather than summed (e.g. `continuous_eval_success_rate` weighted by each task's `eval_count`, × 100). `windowHours` is the requested `start_time`→`end_time` window, not an Arthur field. +- `EvalMetricRow.value`/`status` require the **optional** per-span breakdown (the `metric_results` path under "Where eval results actually originate").
If we ship the aggregate-only first cut, `rows` is `[]` and the page renders the score + graded count without the per-metric grid. This keeps the first increment small. +- `target`/`family`/`unit` from the old draft are **removed** — they were presentation metadata for rule families we cannot populate. `axis` collapses to the single `"quality"` literal because only relevance/tool metrics exist on our path. +- `trend`/`spark` are present **only** if `/traces/overview/timeseries` is wired (Open Q1); otherwise omitted entirely (no static placeholders). + +**Assumption:** the `/evals` page consumes only this trace-derived data; the existing `EvalHealthResponse` summary tile on overview is left untouched. We do **not** consolidate the two endpoints in this change (though `EvalsResponse.score`/`spansGraded` could later feed it). + +## Real data source & how it's obtained (worker side) + +New worker route `GET /api/v1/evals` → `EvalsResponse`, structured like `runs.get.ts`: +- sets `Cache-Control: private, max-age=15, stale-while-revalidate=60`; +- if `env.GENAI_ENGINE_API_KEY` / `env.GENAI_ENGINE_TRACE_ENDPOINT` are unset, returns `{ available: false, reason: "Arthur GenAI Engine not configured." }` (no throw); +- otherwise builds an `ArthurClient` (via the existing `ArthurClient.fromTraceEndpoint`) and calls a new read method `getTracesOverview({ taskIds, startTime, endTime })` → `POST /api/v1/traces/overview`. The new `apps/worker/src/lib/overview/collect-evals.ts` collector sums the returned `overviews` into `score`/`spansGraded`/`traceCount`, and (optionally) shapes `rows` from the per-span metric breakdown. Returns `available: true`; +- if `eval_count` sums to `0` (no continuous evals configured on our tasks, or nothing graded in window), return `{ available: false, reason: "No graded evals in the last 24h." }` — there is genuinely nothing to show; +- on any error, logs `evals_list_failed` and returns `{ available: false, reason: "Eval grading not wired up yet." 
}` — same degrade behavior as the other routes. + +**Task-id enumeration:** `/traces/overview` takes `task_ids`. It is **unconfirmed** whether an empty/omitted `task_ids` means "all org tasks" (Open Q2). If it does, we pass none. If it does not, we first enumerate the org's tasks via the existing `/api/v2/tasks/search` path (the `ArthurClient` already does substring search there) and pass their ids. The collector boundary (`collect-evals.ts` taking an injected fetcher) keeps this isolated and testable, matching `collect-runs.ts`/`collect-kpis.ts`. + +## Dashboard changes + +### 1. `app/(cockpit)/evals/page.tsx` (rewrite) +Thin server route, drops the direct screen import: +```tsx +import { Suspense } from "react"; +import { EvalsData } from "@/app/evals-data"; +import { EvalsSkeleton } from "@/app/evals-skeleton"; + +export default function EvalsPage() { + return ( + <Suspense fallback={<EvalsSkeleton />}> + <EvalsData /> + </Suspense> + ); +} +``` + +### 2. `app/evals-data.tsx` (new server component) +Mirrors `runs-data.tsx`: +```tsx +import { getJSON } from "@/lib/api/server"; +import { EvalsScreen } from "@/components/cockpit/screens/evals"; +import type { EvalsResponse } from "@shared/contracts"; +import { evalsFallback } from "@/lib/api/fallbacks"; + +export async function EvalsData() { + const now = new Date().toISOString(); + const data = await getJSON<EvalsResponse>("/api/v1/evals").catch(() => + evalsFallback(now), + ); + return <EvalsScreen data={data} />; +} +``` + +### 3. `lib/api/fallbacks.ts` (add) +```ts +export function evalsFallback(now: string): EvalsResponse { + return { available: false, generatedAt: now, reason: "Worker unavailable." }; +} +``` + +### 4. `components/cockpit/screens/evals.tsx` (modify) +- Signature `EvalsScreen()` → `EvalsScreen({ data }: { data: EvalsResponse })`. +- Remove `import { AIWF_DATA } from "@/lib/data/mock"`, `const D = AIWF_DATA`, and `import { jitterSeries } from "@/lib/rng"` (synthetic sparklines are dropped — no static placeholders).
+- Import `EvalsResponse`/`EvalMetricRow` from `@shared/contracts` (drop the mock `EvalMetric` reliance). +- When `data.available === false`, render the existing header chrome but replace the metric cards with a single empty/unavailable panel showing `data.reason` (mirroring `EvalHealthKPI`'s reason path). This is also the state when nothing is graded. +- When `available`: + - Drive the "Live · N spans · 24h" chip from `data.spansGraded` / `data.windowHours` instead of the hardcoded "12,408 spans · 24h"; optionally show `data.score`. + - The mock's three axis groups (safety/quality/ops) collapse to a single **Quality** group, since only relevance/tool metrics exist on our path. Render `data.rows` (all `axis: "quality"`) in one card. + - Each row shows `metric`, the formatted `value`, and the pass/warn/fail `CkChip`. + - Sparkline / trend: render `e.spark` / `e.trend` **only when present** (timeseries wired); otherwise render neither. Drop the `Spark`/`jitterSeries` usage when not wired. + - If `rows` is empty (aggregate-only first cut), render just the score + graded-count header — no per-metric grid. + +### 5. `app/evals-skeleton.tsx` (new) +Loading fallback styled like `overview-skeleton.tsx` — header placeholder + one card-shaped block (the Quality group). + +## Behavior + +- **Happy path (Arthur configured, continuous evals graded):** `/evals` renders the fleet eval-health score + spans-graded count over the real 24h window, and (if the per-span breakdown is wired) a Quality card of relevance/tool-selection metrics. Trend/sparkline appear only when the timeseries call is wired. +- **Arthur not configured:** worker returns `available: false`, reason "Arthur GenAI Engine not configured." Page shows header chrome + reason panel. No crash. +- **Nothing graded (`eval_count = 0`):** worker returns `available: false`, reason "No graded evals in the last 24h." Same panel. 
+- **Worker down / 401:** `getJSON` throws → `evalsFallback` → `available: false`, reason "Worker unavailable." Same silent-degrade as overview/runs. + +## Out of scope + +- Wiring up the `+ New eval` button. +- The `EvalHealthResponse` overview tile (left as-is; could later be derived from `EvalsResponse` but not in this change). +- **Adopting Arthur's `/validate_prompt` + `/validate_response` write path** to populate the legacy rule families (hallucination, PII, toxicity, prompt-injection). This is the prerequisite for those metrics and is a **separate future effort** — those families are simply absent from this page. +- Per-span drill-down / individual inference detail views. +- Synthetic sparklines — removed entirely (no static placeholders). + +## Open questions / assumptions (need user decision) + +The Arthur read API is now **confirmed** (see "Where eval results actually originate"). Remaining genuinely-open items: + +1. **`/traces/overview/timeseries` `bucket_size` values.** The allowed `bucket_size` values are unconfirmed. Needed only if we wire trend/sparkline; the aggregate-only first cut does not require it. **Assumption:** trend/sparkline are deferred to a second increment. +2. **Empty `task_ids` semantics.** Does `POST /api/v1/traces/overview` treat an empty/omitted `task_ids` as "all org tasks"? If yes, one call with no ids suffices. If no, the collector must first enumerate tasks via `/api/v2/tasks/search`. **Assumption:** unconfirmed → plan covers both paths; default to enumerating tasks if empty-means-all is not verified. +3. **Are continuous evals actually configured on our tasks in the live instance?** If continuous evals are not enabled on the per-ticket tasks, `eval_count = 0` and the page legitimately shows the "No graded evals" state. Confirming this is what determines whether the happy path ever fires today. 
+ +Resolved (no longer open): read-API existence/shape, auth, org-scope/single-tenant aggregation, and the metric-family set (only relevance/tool on our path; rule families dropped). + +## Verification + +1. Shared + worker + dashboard typecheck pass (`npx tsc --noEmit`) with `EvalsResponse` imported in the route, `evals-data.tsx`, and `evals.tsx`. +2. With the worker unreachable (or Arthur unconfigured), `/evals` renders header chrome + the reason panel, no crash. +3. With Arthur configured and continuous evals graded, `/evals` renders the real fleet score + spans-graded count over the 24h window (and the Quality breakdown if wired). +4. With Arthur configured but `eval_count = 0`, `/evals` shows the "No graded evals in the last 24h." panel. diff --git a/docs/superpowers/specs/2026-06-08-prompts-real-data-design.md b/docs/superpowers/specs/2026-06-08-prompts-real-data-design.md new file mode 100644 index 0000000..49be94c --- /dev/null +++ b/docs/superpowers/specs/2026-06-08-prompts-real-data-design.md @@ -0,0 +1,223 @@ +# `/prompts` Real-Data Conversion — Design + +**Date:** 2026-06-08 +**Status:** Approved +**Scope:** Swap the existing `/prompts` page from mock data to live worker data, mirroring the `/runs` and overview pattern. **Read-only display, including real Arthur version history.** No write/edit endpoints. Embellishment fields with no real backing are removed (markup deleted, not stubbed with placeholders). + +## Problem + +The `/prompts` dashboard page (`apps/dashboard/app/(cockpit)/prompts/page.tsx`) renders a full prompt-registry UI but is wired entirely to mock data (`AIWF_DATA.PROMPTS`, `PROMPT_VERSIONS`, `PROMPT_BODIES` from `@/lib/data/mock`). The overview and `/runs` pages already fetch real data from the worker through a server-component pattern. We want `/prompts` to show the prompts the worker actually drives the AI workflow with. 
+ +## Real data source (the important finding) + +In this project, "prompts" are the three system prompts that drive each workflow phase. They live in the worker, not in a CMS: + +- **Static fallbacks (source of truth in code):** `apps/worker/src/lib/prompts.ts` defines three constant strings — `researchPlanPrompt`, `implementPrompt`, `reviewPrompt` — exported as `PROMPT_FALLBACKS: Record` keyed by `PROMPT_NAMES = ["research-plan", "implement", "review"]`. +- **Optional runtime override (Arthur GenAI Engine):** `apps/worker/src/workflows/prompts-step.ts`'s `loadPrompts()` step checks whether `GENAI_ENGINE_API_KEY`, `GENAI_ENGINE_TRACE_ENDPOINT`, and `GENAI_ENGINE_PROMPT_TASK_ID` are all set. If so, it fetches the `production`-tagged version of each prompt from Arthur via `ArthurClient.getPromptByTag(taskId, name, "production")` (`apps/worker/src/sandbox/arthur-client.ts`). On 404 / error / Arthur disabled it falls back to the in-code `PROMPT_FALLBACKS` string for that name. +- **Seeding:** `apps/worker/scripts/setup-arthur-prompts.ts` is a one-shot script that pushes the three fallback strings into a single Arthur task named `ai-workflow-prompts` and tags each `production`. This is the only writer; nothing in the request/runtime path writes prompts. + +**Arthur read API (ground-truthed against `arthur-ai/arthur-engine` `main`).** Auth is the same `Authorization: Bearer GENAI_ENGINE_API_KEY`; prompt reads require the `TASK_READ` scope. Three endpoints are relevant: + +- **List versions (metadata only):** `GET /api/v1/tasks/{task_id}/prompts/{prompt_name}/versions` → `AgenticPromptVersionListResponse { count, versions: AgenticPromptVersionResponse[] }`. Each `AgenticPromptVersionResponse`: `{ version (int), created_at, deleted_at (nullable), model_provider, model_name, tags: string[], num_messages, num_tools }`. 
**No message body and no per-version eval metrics.** +- **Fetch a version body:** `GET /api/v1/tasks/{task_id}/prompts/{prompt_name}/versions/{prompt_version}` where `{prompt_version}` accepts `latest` | an integer | an ISO datetime | a tag → `AgenticPrompt { messages }`. This is the endpoint the existing `ArthurClient.getPromptByTag` already uses (it passes a tag). We use it to fetch the body of any specific version (the `production`-tagged one eagerly; an arbitrary version on demand). +- **List all prompts on a task:** `GET /api/v1/tasks/{task_id}/prompts` → `LLMGetAllMetadataListResponse { count, llm_metadata: [{ name, versions, tags, created_at, latest_version_created_at, deleted_versions }] }`. Not strictly needed — our three phase-prompt names are fixed — so we don't use it. + +**Conclusion:** there is no editable prompt *registry* in this app, and the worker never persists prompt metadata locally — but Arthur **does** expose real version history (version number, created-at, tags, model) per named prompt, plus on-demand bodies. So the real, available data per phase prompt is: a stable name, the human phase label, the resolved **production body**, the resolved `source` (`arthur` | `fallback`), the model, and a list of **real Arthur versions** (`{ version, createdAt, tags, modelProvider, modelName, numMessages, numTools }`). + +This makes the conversion a faithful read-only swap **with real version history**. The mock-only fields that have **no Arthur source** — per-version eval/halluc/p95/cost metrics, traffic split, KPI deltas, `lastEditedBy`, the two-version A/B text diff — are **removed** (markup deleted, not replaced with static placeholders). Tags are real (`AgenticPromptVersionResponse.tags`), so a `production` badge and a tag filter are backed by data and kept. + +## Current state (mock) + +`apps/dashboard/components/cockpit/screens/prompts.tsx` (`PromptsScreen`) consumes three mock slices via `const D = AIWF_DATA`: + +1. 
`D.PROMPTS: Prompt[]` — 7 entries. Per the mock `Prompt` type (`apps/dashboard/lib/types.ts:64`): + `id`, `name`, `workflow`, `workflowName`, `span`, `versionCount`, `current`, `trafficSplit: Record`, `evalScore`, `evalDelta`, `lastEditedBy`, `lastEditedAtMin`, `tags: PromptTag[]`, `model`. +2. `D.PROMPT_VERSIONS: Record<string, PromptVersion[]>` — only `p_plan_changes` has history. Per `PromptVersion` (`types.ts:81`): + `v`, `deployedAt`, `by`, `status: PromptTag`, `traffic`, `evalScore`, `runs`, `costAvg`, `p95`, `halluc`, `change`. +3. `D.PROMPT_BODIES: Record<string, string>` — body text keyed by version label (`v12`, `v11`). + +`PromptTag = "production" | "staging" | "draft" | "archived" | "locked" | "ab-test"`. + +What the screen renders from these: +- **Header KPIs** (`CkKPI`): total prompts, count in `production`, count of `ab-test`, and a hardcoded `"+0.4%"` avg eval delta. +- **Left rail `PromptList`:** tag filter pills (`all/production/staging/draft/locked`), per-prompt row showing `name`, `current` version, `workflowName`, tag chips, and an `evalScore`/`evalDelta` figure. +- **Right pane `PromptDetail`:** header eyebrow `Arthur · {workflowName} → {span}`, `+ New version` / `Deploy` buttons, four `Stat`s (current version, version count, eval score, traffic split), a **version timeline** of `PromptVersion[]`, a two-column **text diff** between two selected versions (`PromptDiff`, reads `PROMPT_BODIES`), and a **side-by-side metrics** table (`PromptMetrics`: evalScore/halluc/p95/costAvg/runs). It already has graceful empty states: "Select a prompt to inspect." and "Detailed version history not yet captured for this prompt." (rendered when `versions.length === 0`). + +The page (`app/(cockpit)/prompts/page.tsx`) is a 4-line stub that renders `<PromptsScreen />` with no data fetch. + +## Existing pattern (template) + +Real data flows through three layers (see `app/overview-data.tsx`, `app/runs-data.tsx`): + +1. `app/(cockpit)/<name>/page.tsx` — thin server route: `<Suspense fallback={<Skeleton />}><Data /></Suspense>`. +2.
`app/<name>-data.tsx` — **server component**: calls `getJSON<T>(path)` (`lib/api/server.ts`, server-only fetch with `Bearer WORKER_API_TOKEN`, `cache: "no-store"`, 10s timeout), `.catch()`es to a fallback in `lib/api/fallbacks.ts`, passes a `data` prop to the client screen. +3. `components/cockpit/screens/<name>.tsx` — **client presenter**: receives `data`, renders. Untracked metrics arrive `null`/empty and render as `—` or an empty state. + +Worker routes live under `apps/worker/src/routes/api/v1/*.get.ts` as h3 `defineEventHandler`s returning a typed `@shared/contracts` response, gated by the shared bearer token (`apps/worker/src/lib/api-auth.ts`). Response interfaces are declared in `apps/shared/contracts/api.ts`; row/entity types in `apps/shared/contracts/domain.ts`. + +## Proposed data contract + +Add to `apps/shared/contracts/api.ts`. Entity type goes in `domain.ts` (currently has no prompt type). + +### `apps/shared/contracts/domain.ts` (new entities) + +```ts +/** One Arthur version of a named prompt (metadata; body fetched on demand). */ +export interface PromptVersion { + /** Arthur integer version number. */ + version: number; + /** ISO timestamp the version was created. */ + createdAt: string; + /** Real Arthur tags on this version, e.g. ["production"]. */ + tags: string[]; + modelProvider: string; + modelName: string; + numMessages: number; + numTools: number; + /** Body text. Present only for the production version (eager); other + * versions are fetched on demand via the by-version endpoint. */ + body?: string; +} + +/** A workflow phase prompt as resolved by the worker at runtime. */ +export interface PromptDef { + /** Stable Arthur/fallback key: "research-plan" | "implement" | "review". */ + name: string; + /** Human label for the workflow phase, e.g. "Research & Plan". */ + phase: string; + /** Resolved production prompt body (Arthur production tag, or in-code fallback). */ + body: string; + /** Where the resolved `body` came from.
*/ + source: "arthur" | "fallback"; + /** Model the agent runs this prompt with (env-derived). */ + model: string; + /** Real Arthur version history, newest first. Empty when source is "fallback". */ + versions: PromptVersion[]; +} +``` + +### `apps/shared/contracts/api.ts` (new response) + +```ts +export interface PromptsResponse { + generatedAt: string; + /** `false` when the worker can't resolve prompts (degrades to empty list). */ + available: boolean; + /** Whether Arthur is configured (key + endpoint + task id all set). When + * false, every prompt's `source` is "fallback" and `versions` is empty. */ + arthurEnabled: boolean; + rows: PromptDef[]; + total: number; +} + +/** On-demand body for a single historical Arthur version. */ +export interface PromptVersionBodyResponse { + generatedAt: string; + available: boolean; + body: string | null; +} +``` + +**Body fetch strategy — decided: eager for the production version, lazy for the rest.** The list response carries every phase prompt with its full `versions` metadata array and the **production body eagerly** on `PromptDef.body` (we already fetch it to resolve what the workflow uses, so it's free). Non-production version bodies are NOT shipped in this response — `PromptVersion.body` is `undefined` for them. When the user expands a historical version, the screen fetches that single body on demand through a second worker route (see "Worker routes"). This keeps the list response small (3 bodies, not N) and avoids fanning out an unbounded number of Arthur body calls per page load. + +Notes: +- `available` follows the `RunsResponse`/`RunDetailResponse` convention: `true` on a successful resolve, `false` in the fallback object. +- `arthurEnabled` lets the screen honestly say "showing in-code defaults" vs "showing production prompts from Arthur". +- Per-version eval/halluc/p95/cost metrics, traffic split, and `lastEditedBy` are **not** in the contract — Arthur's version list is metadata only and has no such source. 
The screen markup that rendered them is removed. + +## Worker routes + +### `GET /api/v1/prompts` — list (new file `apps/worker/src/routes/api/v1/prompts.get.ts`, mirrors `runs.get.ts`) + +- `defineEventHandler` returning `PromptsResponse`, same `Cache-Control: private, max-age=15, stale-while-revalidate=60` header. +- Resolve all three phase prompts via a shared helper `resolvePrompts()`. The exact production-body resolution already lives in `loadPrompts()` (`workflows/prompts-step.ts`), which is a `"use step"` durable step returning `{ research, implement, review }` — not callable from a plain h3 route. **Decision (option A, confirmed OK to touch the step):** extract the pure resolution into `apps/worker/src/lib/prompts/resolve.ts`, returning `PromptDef[]` + `arthurEnabled`, and have **both** `loadPrompts()` and the route call it. Single source of truth, no drift. +- Per prompt, `resolvePrompts()` does: + - `model` = `env.AGENT_KIND === "codex" ? env.CODEX_MODEL : env.CLAUDE_MODEL` (same expression as `runs.get.ts`). + - `phase` from a static label map: `research-plan → "Research & Plan"`, `implement → "Implement"`, `review → "Review"`. + - When Arthur is enabled: fetch the `production`-tagged body via the existing `ArthurClient.getPromptByTag(taskId, name, "production")` (→ `body`, `source: "arthur"`), AND fetch the version list via a new `ArthurClient.listPromptVersions(taskId, name)` (→ `versions: PromptVersion[]`, newest first). Any single failure degrades that prompt to its in-code fallback body, `source: "fallback"`, `versions: []` — same per-prompt try/catch the current step already has. + - When Arthur is disabled: `body` = `PROMPT_FALLBACKS[name]`, `source: "fallback"`, `versions: []`. +- `available: true` on success; the `catch` returns the empty `available:false` object (matching `runs.get.ts`). Resolution rarely fully throws because each prompt independently falls back, so the happy path always has three rows. 
+ +### `GET /api/v1/prompts/[name]/versions/[version]` — on-demand body (new file) + +Backs lazy body fetching for historical versions the user expands. New file `apps/worker/src/routes/api/v1/prompts/[name]/versions/[version].get.ts` (h3 dynamic-segment pattern, same as the existing `runs/[runId].get.ts`): + +- Reads route params `name` and `version`, validates `name` against `PROMPT_NAMES` (404/empty otherwise), calls a new `ArthurClient.getPromptVersionBody(taskId, name, version)` which hits `GET /api/v1/tasks/{task_id}/prompts/{name}/versions/{version}` and returns the first message content (the existing `getPromptByTag` already parses this `AgenticPrompt.messages[0].content` shape — generalize it to accept any `{prompt_version}`). +- Returns a small typed response `PromptVersionBodyResponse { generatedAt; available: boolean; body: string | null }` (add to `api.ts`). When Arthur is disabled or the version is missing → `available:false, body:null`. +- Same `Cache-Control` header and bearer gate as the other v1 routes. + +## Dashboard wiring + +1. **`lib/api/fallbacks.ts`** — add `promptsFallback(now)`: + ```ts + export function promptsFallback(now: string): PromptsResponse { + return { generatedAt: now, available: false, arthurEnabled: false, rows: [], total: 0 }; + } + ``` +2. **`app/prompts-data.tsx`** (new server component), single fetch like `runs-data.tsx`: + ```ts + const data = await getJSON<PromptsResponse>("/api/v1/prompts").catch(() => promptsFallback(now)); + return <PromptsScreen data={data} />; + ``` +3. **`app/prompts-skeleton.tsx`** (new) — header + KPI row + two-column (rail + detail) block, styled like `overview-skeleton.tsx`. +4. **`app/(cockpit)/prompts/page.tsx`** — rewrite to the `<Suspense fallback={<PromptsSkeleton />}><PromptsData /></Suspense>` shape. +5. **`components/cockpit/screens/prompts.tsx`** — change `PromptsScreen()` to `PromptsScreen({ data }: { data: PromptsResponse })`. Map the real `PromptDef[]` onto the existing UI.
Keep the tag filter and version timeline (now real), but **delete** the per-version metrics grid and the two-column A/B diff (no backing data). Historical-version body expansion fetches lazily from the on-demand route. +6. **On-demand version-body fetch (client).** `PromptsScreen` is a `"use client"` presenter, so expanding a historical version does a client-side `fetch`. The bearer-gated worker API is not directly reachable from the browser (the `WORKER_API_TOKEN` is server-only — see `lib/api/server.ts`). So add a thin Next route handler `app/api/prompts/[name]/versions/[version]/route.ts` that re-uses `getJSON("/api/v1/prompts/<name>/versions/<version>")` server-side and returns it to the client. The screen fetches `/api/prompts/{name}/versions/{version}` (same-origin, no token exposure). Cache the resolved body in component state so re-expanding doesn't refetch. + +### Screen mapping (mock field → real field / behavior) + +| Mock usage | Real replacement | +|---|---| +| `D.PROMPTS` list | `data.rows` (3 `PromptDef`) | +| `p.id` (row key, selection) | `p.name` (stable key) | +| `p.name` | `p.name` | +| `p.workflowName` / `p.span` (eyebrow) | `p.phase` (eyebrow `{data.arthurEnabled ? "Arthur" : "In-code"} · {p.phase}`) | +| `p.current` version badge | real: highest `p.versions[].version`, or the production-tagged version number; show `source` chip alongside | +| `p.tags` chips + tag filter pills | **kept, real** — derive the row's tags from its production version's `tags` (`p.versions.find(v => v.tags.includes("production"))?.tags`), and per-version `tags` in the timeline. Filter pills reduced to tags that actually occur (e.g. `all` + `production`). | +| `p.evalScore` / `p.evalDelta` | **removed** (no Arthur source — markup deleted) | +| `D.PROMPT_VERSIONS[id]` timeline | **kept, real** — `p.versions` (`{version, createdAt, tags, modelName, numMessages, numTools}`), newest first. Each entry shows version number, `createdAt`, tag chips, `modelName`, message/tool counts.
The mock's eval/halluc/p95/cost rows in each timeline card are **removed**. | +| `D.PROMPT_BODIES[v]` two-column diff (`PromptDiff`) | **removed** — replaced by a single read-only body panel. Shows `p.body` (production) by default; clicking a timeline version fetches that version's body via the on-demand route and renders it in the same panel. | +| `PromptMetrics` side-by-side table | **removed** (no per-version metrics) | +| Header KPIs (total / production / ab-test / avg Δ) | total = `data.total`; "In production" = count of rows whose versions include a `production` tag; ab-test and avg-Δ tiles **removed** (no source) | +| `+ New version` / `Deploy` / `Import from prod` / `+ New prompt` buttons | left inert (read-only), matching how `/runs` left its `+ Filter` / `Export` buttons | + +Faithful render: left rail lists the 3 prompts by `name` + `phase` + `model` + production tag chip; right pane shows a read-only body panel (production body by default, swappable to a selected historical version fetched on demand) plus the real version timeline. Reuses `CkCard`/`CkKPI`/`Stat`, the chip styling (repurposed for real `tags`), and the single-column body markup lifted from the old `PromptDiff`. + +## Behavior + +- **Happy path (Arthur disabled — current production reality):** `/prompts` lists the 3 workflow prompts with their in-code fallback bodies, `source: "fallback"`, `arthurEnabled: false`, `versions: []`. Eyebrow reflects "In-code". The version timeline section is empty (no markup, since there are no versions). Bodies are exactly what the agent runs. +- **Happy path (Arthur enabled):** each prompt's production body and full real version history come from Arthur (`source: "arthur"`). The timeline lists every Arthur version with its real `version`, `createdAt`, `tags`, and `modelName`. Expanding a historical version fetches its body on demand via `GET /api/v1/prompts/[name]/versions/[version]`. 
A prompt that fails to resolve from Arthur degrades to its fallback body with `versions: []`. +- **Worker down / 401:** `getJSON` throws → `promptsFallback` → empty list, `available:false`. The screen shows its "Select a prompt to inspect." empty state with `0 prompts`. No crash. Same silent-fallback as `/runs`. An on-demand body fetch that fails renders an inline "version body unavailable" note, not a page crash. + +## Out of scope + +- Editing, creating, deploying, or version-bumping prompts (the `+ New version` / `Deploy` / `Import from prod` / `+ New prompt` buttons stay inert). +- Per-version eval/halluc/p95/cost metrics and the two-version A/B text diff — no Arthur source; markup removed. +- Traffic split, `lastEditedBy`, eval deltas — no source; markup removed. +- Wiring the `/editor` view (separate `workflow-editor` screen). + +## Open questions / assumptions + +Resolved by user decisions and Arthur API ground-truthing: + +- **Read-only — confirmed.** No write endpoints; action buttons stay inert. +- **Version history — confirmed in scope.** Real Arthur version history (metadata + on-demand bodies) is included. Per-version eval metrics are NOT available from Arthur's version-list endpoint (metadata only: `{version, created_at, tags, model_name, num_messages, num_tools}`), so the mock's per-version metrics are dropped — confirmed acceptable. +- **Tags are real.** The `production` badge and the tag filter are backed by `AgenticPromptVersionResponse.tags`; kept. +- **Resolution-helper extraction — confirmed.** Shared `resolvePrompts()` used by both `loadPrompts()` and the route; OK to touch `prompts-step.ts`. +- **Embellishment fields — removed, not stubbed.** Per the user decision, fields with no real backing have their markup deleted rather than rendered as static placeholders. + +Still open: + +1. 
**Lazy vs eager body fetch — proposed eager-for-production, lazy-for-history.** Stated above; flagged here in case you'd rather ship all version bodies eagerly (simpler client, larger/slower response) or fetch even the production body lazily (smaller list response, extra round-trip on first view). +2. **Version pagination depth.** Arthur's `…/versions` endpoint is paginated. Assumption: fetch the first page only (newest N, e.g. default page size) and not the full history — sufficient for the timeline. Confirm whether deep history (all pages) is required. + +## Verification + +1. `apps/shared` + `apps/worker` typecheck (`pnpm -F @apps/worker typecheck` or `npx tsc --noEmit`). +2. Worker `GET /api/v1/prompts` returns 3 rows with non-empty `body`, correct `source`, `arthurEnabled` reflecting env, and (Arthur on) a non-empty `versions[]` with real `version`/`createdAt`/`tags`. Existing `prompts-step` tests still pass. +3. Worker `GET /api/v1/prompts/research-plan/versions/{version}` returns that version's `body` (Arthur on) or `available:false` (Arthur off / missing). +4. Dashboard typecheck passes. +5. `/prompts` renders the 3 real prompts; selecting one shows its production body; the timeline lists real Arthur versions; expanding one fetches and shows that version's body. With Arthur disabled, `source` is `fallback`, the timeline is empty, and bodies match `apps/worker/src/lib/prompts.ts`. +6. With the worker unreachable, `/prompts` shows the empty state (`0 prompts`), not an error. + +