-
Arthur engine · prompt versioning
+
+ {data.arthurEnabled ? "Arthur engine · prompt versioning" : "In-code defaults · prompt versioning"}
+
Prompt registry
@@ -338,16 +274,18 @@ export function PromptsScreen() {
-
-
-
p.tags.includes("production")).length.toString()} sub="serving traffic" />
- p.tags.includes("ab-test")).length.toString()} sub="live experiments" />
-
+
+
+
);
diff --git a/apps/dashboard/lib/api/fallbacks.ts b/apps/dashboard/lib/api/fallbacks.ts
index d7c81cb..d0eec95 100644
--- a/apps/dashboard/lib/api/fallbacks.ts
+++ b/apps/dashboard/lib/api/fallbacks.ts
@@ -1,6 +1,9 @@
import type {
KpisResponse,
EvalHealthResponse,
+ EvalsResponse,
+ CostResponse,
+ PromptsResponse,
RunsResponse,
RunDetailResponse,
LiveRunsResponse,
@@ -42,3 +45,23 @@ export function liveRunsFallback(now: string): LiveRunsResponse {
export function workflowsFallback(now: string): WorkflowsResponse {
return { generatedAt: now, rows: [], total: 0 };
}
+
+export function evalsFallback(now: string): EvalsResponse {
+ return { available: false, generatedAt: now, reason: "Worker unavailable." };
+}
+
+export function costFallback(now: string): CostResponse {
+ return {
+ generatedAt: now,
+ available: false,
+ window: { start: now, end: now },
+ totals: { totalTokenCost: 0, totalTokens: 0, traceCount: 0, costPerRun: 0 },
+ byModel: [],
+ byWorkflow: [],
+ daily: [],
+ };
+}
+
+export function promptsFallback(now: string): PromptsResponse {
+ return { generatedAt: now, available: false, arthurEnabled: false, rows: [], total: 0 };
+}
diff --git a/apps/shared/contracts/api.ts b/apps/shared/contracts/api.ts
index 32eecbc..4f3d171 100644
--- a/apps/shared/contracts/api.ts
+++ b/apps/shared/contracts/api.ts
@@ -1,4 +1,4 @@
-import type { Run, RunDetail, RunStep, Workflow } from "./domain.js";
+import type { PromptDef, Run, RunDetail, RunStep, Workflow } from "./domain.js";
export interface ErrorEnvelope {
error: { code: string; message: string; details?: unknown };
@@ -24,6 +24,90 @@ export type EvalHealthResponse =
}
| { available: false; reason: string };
+export type EvalsResponse =
+ | {
+ available: true;
+ generatedAt: string;
+ windowHours: number;
+ /** eval_count-weighted mean of continuous_eval_success_rate × 100, fleet-wide. */
+ score: number;
+ /** Σ eval_count across tasks — "spans graded" in the window. */
+ spansGraded: number;
+ /** Σ trace_count across tasks. */
+ traceCount: number;
+ }
+ | { available: false; generatedAt: string; reason: string };
+
+export interface CostByModelEntry {
+ /** Arthur span model_name. */
+ model: string;
+ /** USD, summed total_token_cost over the window. */
+ cost: number;
+ /** Summed total_token_count over the window. */
+ tokens: number;
+}
+
+export interface CostByWorkflowEntry {
+ /** Arthur task_id (per ticket-run, e.g. "AWT-42" / "AWT-42.1"). */
+ taskId: string;
+ /** Display name; the cost collector currently reuses task_id (the overview omits the task name). */
+ name: string;
+ /** trace_count for the task. */
+ runs: number;
+ /** trace_token_count. */
+ tokens: number;
+ /** trace_token_cost (USD); a null cost from Arthur is reported as 0. */
+ cost: number;
+ /** cost / runs, or 0 when runs is 0. */
+ costPerRun: number;
+}
+
+export interface CostResponse {
+ generatedAt: string;
+ /**
+ * false when Arthur is unconfigured or the cost collection throws (an empty
+ * but successful result still reports true). The screen renders its empty/N-A state.
+ */
+ available: boolean;
+ /** Window the figures cover (the request's start_time/end_time). ISO. */
+ window: { start: string; end: string };
+ totals: {
+ /** USD, Σ overviews[].trace_token_cost (null counted as 0). */
+ totalTokenCost: number;
+ /** Σ overviews[].trace_token_count. */
+ totalTokens: number;
+ /** Σ overviews[].trace_count. */
+ traceCount: number;
+ /** totalTokenCost / traceCount, or 0 when traceCount is 0. */
+ costPerRun: number;
+ };
+ byModel: CostByModelEntry[];
+ /** Per-task (= per ticket-run) breakdown from /traces/overview. */
+ byWorkflow: CostByWorkflowEntry[];
+ /** Per-bucket spend, oldest→newest, merged from the timeseries of at most the 50 most-active tasks. */
+ daily: { date: string; cost: number; tokens: number }[];
+}
+
+export interface PromptsResponse {
+ generatedAt: string;
+ /** `false` when the worker can't resolve prompts (degrades to empty list). */
+ available: boolean;
+ /**
+ * Whether Arthur is configured (key + endpoint + task id all set). When
+ * false, every prompt's `source` is "fallback" and `versions` is empty.
+ */
+ arthurEnabled: boolean;
+ rows: PromptDef[];
+ total: number;
+}
+
+/** On-demand body for a single historical Arthur version. */
+export interface PromptVersionBodyResponse {
+ generatedAt: string;
+ available: boolean;
+ body: string | null;
+}
+
export interface LiveRunsResponse {
generatedAt: string;
rows: Run[];
diff --git a/apps/shared/contracts/domain.ts b/apps/shared/contracts/domain.ts
index 6aba292..d4868a4 100644
--- a/apps/shared/contracts/domain.ts
+++ b/apps/shared/contracts/domain.ts
@@ -133,3 +133,36 @@ export interface HourPoint {
p95: number;
errors: number;
}
+
+/** One Arthur version of a named prompt (metadata; body fetched on demand). */
+export interface PromptVersion {
+ /** Arthur integer version number. */
+ version: number;
+ /** ISO timestamp the version was created. */
+ createdAt: string;
+ /** Real Arthur tags on this version, e.g. ["production"]. */
+ tags: string[];
+ modelProvider: string;
+ modelName: string;
+ numMessages: number;
+ numTools: number;
+ /** Body text. Present only for the production version (eager); other
+ * versions are fetched on demand via the by-version endpoint. */
+ body?: string;
+}
+
+/** A workflow phase prompt as resolved by the worker at runtime. */
+export interface PromptDef {
+ /** Stable Arthur/fallback key: "research-plan" | "implement" | "review". */
+ name: string;
+ /** Human label for the workflow phase, e.g. "Research & Plan". */
+ phase: string;
+ /** Resolved production prompt body (Arthur production tag, or in-code fallback). */
+ body: string;
+ /** Where the resolved `body` came from. */
+ source: "arthur" | "fallback";
+ /** Model the agent runs this prompt with (env-derived). */
+ model: string;
+ /** Arthur version history in the order Arthur lists it (presumably newest first — confirm). Empty when Arthur is disabled, versions were not requested, or resolution throws; may be non-empty with source "fallback" when no production-tagged body exists. */
+ versions: PromptVersion[];
+}
diff --git a/apps/worker/src/lib/overview/collect-cost.test.ts b/apps/worker/src/lib/overview/collect-cost.test.ts
new file mode 100644
index 0000000..f0d9736
--- /dev/null
+++ b/apps/worker/src/lib/overview/collect-cost.test.ts
@@ -0,0 +1,186 @@
+import { describe, it, expect, vi } from "vitest";
+import { collectCost, type CostArthurClient } from "./collect-cost.js";
+import type {
+ TraceOverviewListResponse,
+ TraceTimeseriesPoint,
+ ModelTokenCost,
+} from "../../sandbox/arthur-client.js";
+
+const NOW = new Date("2026-06-08T12:00:00.000Z");
+
+/** Builds a fake Arthur client whose three methods resolve to the canned `opts` data. */
+function makeClient(opts: {
+  overview: TraceOverviewListResponse;
+  timeseries: Record<string, TraceTimeseriesPoint[]>;
+  byModel: ModelTokenCost[];
+}): CostArthurClient {
+  return {
+    getTracesOverview: vi.fn().mockResolvedValue(opts.overview),
+    getTracesTimeseries: vi
+      .fn()
+      // Task ids without a canned series resolve to an empty one.
+      .mockImplementation((taskId: string) => Promise.resolve(opts.timeseries[taskId] ?? [])),
+    aggregateSpanTokensByModel: vi.fn().mockResolvedValue(opts.byModel),
+  };
+}
+
+describe("collectCost", () => {
+ it("aggregates totals, per-task breakdown, by-model, and merged daily series", async () => {
+ const client = makeClient({
+ overview: {
+ count: 2,
+ overviews: [
+ {
+ task_id: "t1",
+ trace_count: 4,
+ trace_token_count: 1000,
+ trace_token_cost: 2.0,
+ eval_count: 0,
+ continuous_eval_success_rate: 1,
+ last_active: "2026-06-08",
+ },
+ {
+ task_id: "t2",
+ trace_count: 6,
+ trace_token_count: 3000,
+ trace_token_cost: 4.0,
+ eval_count: 0,
+ continuous_eval_success_rate: 1,
+ },
+ ],
+ },
+ timeseries: {
+ t1: [
+ { timestamp: "2026-06-06", trace_count: 2, trace_token_count: 500, trace_token_cost: 1.0 },
+ { timestamp: "2026-06-07", trace_count: 2, trace_token_count: 500, trace_token_cost: 1.0 },
+ ],
+ t2: [
+ { timestamp: "2026-06-07", trace_count: 3, trace_token_count: 1500, trace_token_cost: 2.0 },
+ { timestamp: "2026-06-08", trace_count: 3, trace_token_count: 1500, trace_token_cost: 2.0 },
+ ],
+ },
+ byModel: [
+ { model: "claude-opus-4-6", tokens: 3000, cost: 5.0 },
+ { model: "claude-haiku", tokens: 1000, cost: 1.0 },
+ ],
+ });
+
+ const data = await collectCost(client, { now: NOW, bucketSize: "day" });
+
+ // totals
+ expect(data.totals).toEqual({
+ totalTokenCost: 6.0,
+ totalTokens: 4000,
+ traceCount: 10,
+ costPerRun: 0.6,
+ });
+
+ // window = calendar MTD
+ expect(data.window.start).toBe("2026-06-01T00:00:00.000Z");
+ expect(data.window.end).toBe(NOW.toISOString());
+
+ // byWorkflow = per-task, with costPerRun guarded
+ expect(data.byWorkflow).toEqual([
+ { taskId: "t1", name: "t1", runs: 4, tokens: 1000, cost: 2.0, costPerRun: 0.5 },
+ { taskId: "t2", name: "t2", runs: 6, tokens: 3000, cost: 4.0, costPerRun: 4 / 6 },
+ ]);
+
+ // byModel passthrough mapped to contract shape
+ expect(data.byModel).toEqual([
+ { model: "claude-opus-4-6", cost: 5.0, tokens: 3000 },
+ { model: "claude-haiku", cost: 1.0, tokens: 1000 },
+ ]);
+
+ // daily merged by timestamp, oldest -> newest
+ expect(data.daily).toEqual([
+ { date: "2026-06-06", cost: 1.0, tokens: 500 },
+ { date: "2026-06-07", cost: 3.0, tokens: 2000 },
+ { date: "2026-06-08", cost: 2.0, tokens: 1500 },
+ ]);
+ });
+
+ it("treats null trace_token_cost as 0 and guards divide-by-zero", async () => {
+ const client = makeClient({
+ overview: {
+ count: 1,
+ overviews: [
+ {
+ task_id: "t1",
+ trace_count: 0,
+ trace_token_count: 0,
+ trace_token_cost: null,
+ eval_count: 0,
+ continuous_eval_success_rate: 0,
+ },
+ ],
+ },
+ timeseries: { t1: [] },
+ byModel: [],
+ });
+
+ const data = await collectCost(client, { now: NOW, bucketSize: "day" });
+
+ expect(data.totals).toEqual({
+ totalTokenCost: 0,
+ totalTokens: 0,
+ traceCount: 0,
+ costPerRun: 0,
+ });
+ expect(data.byWorkflow).toEqual([
+ { taskId: "t1", name: "t1", runs: 0, tokens: 0, cost: 0, costPerRun: 0 },
+ ]);
+ expect(data.byModel).toEqual([]);
+ expect(data.daily).toEqual([]);
+ });
+
+ it("returns empty aggregates when Arthur has no tasks", async () => {
+ const client = makeClient({
+ overview: { count: 0, overviews: [] },
+ timeseries: {},
+ byModel: [],
+ });
+
+ const data = await collectCost(client, { now: NOW, bucketSize: "day" });
+
+ expect(data.totals).toEqual({
+ totalTokenCost: 0,
+ totalTokens: 0,
+ traceCount: 0,
+ costPerRun: 0,
+ });
+ expect(data.byWorkflow).toEqual([]);
+ expect(data.byModel).toEqual([]);
+ expect(data.daily).toEqual([]);
+ // No tasks -> no per-task timeseries fan-out.
+ expect(client.getTracesTimeseries).not.toHaveBeenCalled();
+ });
+
+  it("caps the daily timeseries fan-out to the 50 most-active tasks", async () => {
+    // 60 tasks, each with a distinct trace_count so the top-50 are deterministic.
+    const overviews = Array.from({ length: 60 }, (_, i) => ({
+      task_id: `t${i}`,
+      trace_count: i, // t59 most active, t0 least
+      trace_token_count: 0,
+      trace_token_cost: 0,
+      eval_count: 0,
+      continuous_eval_success_rate: 0,
+    }));
+    const client = makeClient({
+      overview: { count: overviews.length, overviews },
+      timeseries: {},
+      byModel: [],
+    });
+
+    await collectCost(client, { now: NOW, bucketSize: "day" });
+
+    // Only the 50 highest-trace_count tasks are queried (t10..t59).
+    expect(client.getTracesTimeseries).toHaveBeenCalledTimes(50);
+    const queried = (client.getTracesTimeseries as ReturnType<typeof vi.fn>).mock.calls.map(
+      (c) => c[0], // first positional argument = task id
+    );
+    expect(queried).not.toContain("t0");
+    expect(queried).not.toContain("t9");
+    expect(queried).toContain("t10");
+    expect(queried).toContain("t59");
+  });
+});
diff --git a/apps/worker/src/lib/overview/collect-cost.ts b/apps/worker/src/lib/overview/collect-cost.ts
new file mode 100644
index 0000000..258cb0e
--- /dev/null
+++ b/apps/worker/src/lib/overview/collect-cost.ts
@@ -0,0 +1,142 @@
+import type { CostResponse } from "@shared/contracts";
+import { logger } from "../logger.js";
+import type {
+ TraceOverviewListResponse,
+ TraceTimeseriesPoint,
+ ModelTokenCost,
+} from "../../sandbox/arthur-client.js";
+
+/**
+ * The slice of `ArthurClient` the cost collector depends on. The real object is
+ * an `ArthurClient`; this narrow interface keeps the aggregation testable with a
+ * fake (mirrors `RunsLister` for the run-store collectors).
+ */
+export interface CostArthurClient {
+  getTracesOverview(
+    taskIds: string[],
+    startTime: string,
+    endTime: string,
+  ): Promise<TraceOverviewListResponse>;
+  getTracesTimeseries(
+    taskId: string,
+    startTime: string,
+    endTime: string,
+    bucketSize: string,
+  ): Promise<TraceTimeseriesPoint[]>;
+  aggregateSpanTokensByModel(
+    taskIds: string[],
+    startTime: string,
+    endTime: string,
+  ): Promise<ModelTokenCost[]>;
+}
+
+export interface CollectCostOptions {
+ now: Date;
+ /** Bucket granularity for the daily-spend timeseries. */
+ bucketSize: string;
+}
+
+/**
+ * Shapes a `CostResponse` (minus `generatedAt`/`available`) from Arthur's
+ * pre-aggregated token/cost data. Cost comes straight from Arthur's
+ * `*_token_cost` fields — no client-side pricing.
+ *
+ * - `totals` + `byWorkflow` come from one `getTracesOverview` call. Arthur tasks
+ *   ARE the workflow grouping (per ticket-run), so each overview row is one
+ *   `byWorkflow` entry.
+ * - `byModel` comes from `aggregateSpanTokensByModel` (the one client-side
+ *   grouping, since Arthur has no per-model overview).
+ * - `daily` fans out one `getTracesTimeseries` call per task (capped at the 50
+ *   most-active overview rows) and merges points by bucket timestamp.
+ */
+export async function collectCost(
+  client: CostArthurClient,
+  opts: CollectCostOptions,
+): Promise<Omit<CostResponse, "generatedAt" | "available">> {
+  const { now, bucketSize } = opts;
+  // Assumption: calendar month-to-date (matches the original "MTD" framing).
+  // TODO(arthur-verify): confirm the intended window (calendar MTD vs rolling 30d/24h).
+  const start = startOfMonthUTC(now).toISOString();
+  const end = now.toISOString();
+
+  // TODO(arthur-verify): empty `task_ids` is assumed to mean org-wide. If Arthur
+  // requires explicit ids, enumerate the org's tasks and pass them instead.
+  const { overviews } = await client.getTracesOverview([], start, end);
+
+  let totalTokenCost = 0;
+  let totalTokens = 0;
+  let traceCount = 0;
+  const byWorkflow = overviews.map((o) => {
+    // trace_token_cost is null when Arthur has no cost data — treat as 0.
+    const cost = o.trace_token_cost ?? 0;
+    totalTokenCost += cost;
+    totalTokens += o.trace_token_count;
+    traceCount += o.trace_count;
+    return {
+      taskId: o.task_id,
+      // Arthur task name = the ticket-run identifier; overview omits it, so the
+      // task_id (which IS that identifier) doubles as the display name.
+      // TODO(arthur-verify): task->workflow mapping — rows stay per-task.
+      name: o.task_id,
+      runs: o.trace_count,
+      tokens: o.trace_token_count,
+      cost,
+      costPerRun: o.trace_count > 0 ? cost / o.trace_count : 0,
+    };
+  });
+
+  const totals = {
+    totalTokenCost,
+    totalTokens,
+    traceCount,
+    costPerRun: traceCount > 0 ? totalTokenCost / traceCount : 0,
+  };
+
+  const byModelRaw = await client.aggregateSpanTokensByModel([], start, end);
+  const byModel = byModelRaw.map((m) => ({
+    model: m.model,
+    cost: m.cost,
+    tokens: m.tokens,
+  }));
+
+  // Fan out one timeseries call per task that has data, then merge by bucket.
+  // Tasks are per-ticket-run, so a busy month can be hundreds — cap the fan-out
+  // to the most-active tasks to avoid an unbounded burst of requests.
+  // TODO(arthur-verify): cap is by trace_count, on the assumption the highest-
+  // traffic tasks dominate the daily-spend curve; revisit if the chart looks short.
+  const DAILY_FANOUT_CAP = 50;
+  const sortedByActivity = [...overviews].sort((a, b) => b.trace_count - a.trace_count);
+  const fanoutTasks = sortedByActivity.slice(0, DAILY_FANOUT_CAP);
+  if (sortedByActivity.length > DAILY_FANOUT_CAP) {
+    logger.info(
+      {
+        total: sortedByActivity.length,
+        capped: DAILY_FANOUT_CAP,
+        dropped: sortedByActivity.slice(DAILY_FANOUT_CAP).map((o) => o.task_id),
+      },
+      "cost_daily_fanout_capped",
+    );
+  }
+  const taskIds = fanoutTasks.map((o) => o.task_id);
+  const series = await Promise.all(
+    taskIds.map((id) => client.getTracesTimeseries(id, start, end, bucketSize)),
+  );
+  const merged = new Map<string, { cost: number; tokens: number }>();
+  for (const points of series) {
+    for (const p of points) {
+      const row = merged.get(p.timestamp) ?? { cost: 0, tokens: 0 };
+      row.cost += p.trace_token_cost ?? 0;
+      row.tokens += p.trace_token_count;
+      merged.set(p.timestamp, row);
+    }
+  }
+  const daily = [...merged.entries()]
+    .map(([date, v]) => ({ date, cost: v.cost, tokens: v.tokens }))
+    .sort((a, b) => (a.date < b.date ? -1 : a.date > b.date ? 1 : 0));
+
+  return { window: { start, end }, totals, byModel, byWorkflow, daily };
+}
+
+function startOfMonthUTC(now: Date): Date {
+ return new Date(Date.UTC(now.getUTCFullYear(), now.getUTCMonth(), 1));
+}
diff --git a/apps/worker/src/lib/overview/collect-evals.test.ts b/apps/worker/src/lib/overview/collect-evals.test.ts
new file mode 100644
index 0000000..1b40a6f
--- /dev/null
+++ b/apps/worker/src/lib/overview/collect-evals.test.ts
@@ -0,0 +1,95 @@
+import { describe, it, expect, vi } from "vitest";
+import { collectEvals } from "./collect-evals.js";
+import type { TraceOverview } from "../../sandbox/arthur-client.js";
+
+const NOW = new Date("2026-06-08T12:00:00.000Z");
+
+function makeClient(overviews: TraceOverview[]) {
+ return { getTracesOverview: vi.fn().mockResolvedValue({ overviews }) };
+}
+
+function overview(over: Partial<TraceOverview>): TraceOverview {
+  return {
+    task_id: "t",
+    trace_count: 0,
+    trace_token_count: 0,
+    trace_token_cost: 0,
+    eval_count: 0,
+    continuous_eval_success_rate: 0,
+    ...over, // caller-supplied fields win over the zeroed defaults
+  };
+}
+
+describe("collectEvals", () => {
+ it("sums spansGraded/traceCount and eval-count-weights the score", async () => {
+ const client = makeClient([
+ overview({ task_id: "a", trace_count: 10, eval_count: 8, continuous_eval_success_rate: 1.0 }),
+ overview({ task_id: "b", trace_count: 4, eval_count: 2, continuous_eval_success_rate: 0.5 }),
+ ]);
+
+ const result = await collectEvals({
+ client,
+ taskIds: [],
+ windowHours: 24,
+ now: NOW,
+ });
+
+ expect(result.spansGraded).toBe(10);
+ expect(result.traceCount).toBe(14);
+ // (1.0*8 + 0.5*2) / 10 * 100 = (8 + 1) / 10 * 100 = 90
+ expect(result.score).toBe(90);
+ expect(result.windowHours).toBe(24);
+ });
+
+ it("yields score 0 when nothing is graded (eval_count sums to 0)", async () => {
+ const client = makeClient([
+ overview({ task_id: "a", trace_count: 5, eval_count: 0 }),
+ ]);
+
+ const result = await collectEvals({
+ client,
+ taskIds: [],
+ windowHours: 24,
+ now: NOW,
+ });
+
+ expect(result.spansGraded).toBe(0);
+ expect(result.traceCount).toBe(5);
+ expect(result.score).toBe(0);
+ });
+
+ it("computes the window start from windowHours and passes the ISO range to the client", async () => {
+ const client = makeClient([]);
+
+ await collectEvals({
+ client,
+ taskIds: ["x", "y"],
+ windowHours: 24,
+ now: NOW,
+ });
+
+ expect(client.getTracesOverview).toHaveBeenCalledWith(
+ ["x", "y"],
+ "2026-06-07T12:00:00.000Z",
+ "2026-06-08T12:00:00.000Z",
+ );
+ });
+
+ it("returns zeroed aggregates when no overviews are returned", async () => {
+ const client = makeClient([]);
+
+ const result = await collectEvals({
+ client,
+ taskIds: [],
+ windowHours: 24,
+ now: NOW,
+ });
+
+ expect(result).toEqual({
+ windowHours: 24,
+ score: 0,
+ spansGraded: 0,
+ traceCount: 0,
+ });
+ });
+});
diff --git a/apps/worker/src/lib/overview/collect-evals.ts b/apps/worker/src/lib/overview/collect-evals.ts
new file mode 100644
index 0000000..b144232
--- /dev/null
+++ b/apps/worker/src/lib/overview/collect-evals.ts
@@ -0,0 +1,74 @@
+import type { EvalsResponse } from "@shared/contracts";
+import type { TraceOverview } from "../../sandbox/arthur-client.js";
+
+const HOUR = 3_600_000;
+
+/** Fleet aggregate fields the route spreads onto an `available: true` response. */
+export type EvalsAggregate = Pick<
+  Extract<EvalsResponse, { available: true }>,
+  "windowHours" | "score" | "spansGraded" | "traceCount"
+>;
+
+/**
+ * The slice of `ArthurClient` the eval collector depends on. The real object is
+ * an `ArthurClient`; this narrow interface keeps the aggregation testable with a
+ * fake (mirrors `CostArthurClient` for the cost collector).
+ */
+export interface EvalsArthurClient {
+ getTracesOverview(
+ taskIds: string[],
+ startTime: string,
+ endTime: string,
+ ): Promise<{ overviews: TraceOverview[] }>;
+}
+
+export interface CollectEvalsOptions {
+ client: EvalsArthurClient;
+ // TODO(arthur-verify): unconfirmed whether `taskIds: []` means "all org tasks"
+ // on POST /api/v1/traces/overview. If not, the route must enumerate tasks first.
+ taskIds: string[];
+ windowHours: number;
+ now: Date;
+}
+
+/**
+ * Aggregates Arthur's per-task trace overviews into fleet-wide eval health:
+ * eval-count-weighted success rate × 100, summed spans-graded and trace counts
+ * over the window. When `spansGraded` sums to 0 (no continuous evals configured
+ * / nothing graded), `score` is 0; the calling route is expected to turn that
+ * into `available: false` (route not shown here — confirm).
+ */
+export async function collectEvals(
+  opts: CollectEvalsOptions,
+): Promise<EvalsAggregate> {
+  const endTime = opts.now.toISOString();
+  const startTime = new Date(
+    opts.now.getTime() - opts.windowHours * HOUR,
+  ).toISOString();
+
+  const { overviews } = await opts.client.getTracesOverview(
+    opts.taskIds,
+    startTime,
+    endTime,
+  );
+
+  const spansGraded = sum(overviews, (o) => o.eval_count);
+  const traceCount = sum(overviews, (o) => o.trace_count);
+  const score =
+    spansGraded === 0
+      ? 0
+      : (sum(overviews, (o) => o.continuous_eval_success_rate * o.eval_count) /
+          spansGraded) *
+        100;
+
+  return {
+    windowHours: opts.windowHours,
+    score,
+    spansGraded,
+    traceCount,
+  };
+}
+
+function sum<T>(items: T[], pick: (item: T) => number): number {
+  return items.reduce((acc, item) => acc + (pick(item) || 0), 0); // `|| 0`: NaN/undefined picks add nothing
+}
diff --git a/apps/worker/src/lib/overview/collect-prompts.test.ts b/apps/worker/src/lib/overview/collect-prompts.test.ts
new file mode 100644
index 0000000..8d382cb
--- /dev/null
+++ b/apps/worker/src/lib/overview/collect-prompts.test.ts
@@ -0,0 +1,164 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+
+vi.mock("../../../env.js", () => ({ env: {} }));
+
+const mockGetPromptByTag = vi.fn();
+const mockListPromptVersions = vi.fn();
+vi.mock("../../sandbox/arthur-client.js", () => ({
+ ArthurClient: {
+ fromTraceEndpoint: vi.fn(() => ({
+ getPromptByTag: mockGetPromptByTag,
+ listPromptVersions: mockListPromptVersions,
+ })),
+ },
+}));
+
+import { resolvePrompts } from "./collect-prompts.js";
+import { PROMPT_FALLBACKS } from "../prompts.js";
+
+async function setEnv(partial: Record<string, string | undefined>) {
+  const mod = (await import("../../../env.js")) as unknown as {
+    env: Record<string, string | undefined>;
+  };
+  mod.env = { ...mod.env, ...partial }; // overlays `partial` onto the mocked module's `env` export
+}
+
+function arthurVersion(version: number, tags: string[]) {
+ return {
+ version,
+ created_at: `2026-06-0${version}T00:00:00.000Z`,
+ deleted_at: null,
+ model_provider: "anthropic",
+ model_name: "claude-opus-4-6",
+ tags,
+ num_messages: 1,
+ num_tools: 0,
+ };
+}
+
+describe("resolvePrompts", () => {
+ beforeEach(async () => {
+ mockGetPromptByTag.mockReset();
+ mockListPromptVersions.mockReset();
+ await setEnv({
+ AGENT_KIND: "claude",
+ CLAUDE_MODEL: "claude-opus-4-6",
+ CODEX_MODEL: "gpt-5-codex",
+ GENAI_ENGINE_API_KEY: undefined,
+ GENAI_ENGINE_TRACE_ENDPOINT: undefined,
+ GENAI_ENGINE_PROMPT_TASK_ID: undefined,
+ });
+ });
+
+ it("returns fallbacks with empty versions when Arthur is disabled", async () => {
+ const { arthurEnabled, prompts } = await resolvePrompts({ withVersions: true });
+ expect(arthurEnabled).toBe(false);
+ expect(prompts).toHaveLength(3);
+ expect(prompts.map((p) => p.name)).toEqual(["research-plan", "implement", "review"]);
+ for (const p of prompts) {
+ expect(p.source).toBe("fallback");
+ expect(p.versions).toEqual([]);
+ expect(p.model).toBe("claude-opus-4-6");
+ }
+ expect(prompts[0].body).toBe(PROMPT_FALLBACKS["research-plan"]);
+ expect(prompts[0].phase).toBe("Research & Plan");
+ expect(mockGetPromptByTag).not.toHaveBeenCalled();
+ });
+
+ it("returns fallbacks when PROMPT_TASK_ID is missing even if key+endpoint are set", async () => {
+ await setEnv({
+ GENAI_ENGINE_API_KEY: "k",
+ GENAI_ENGINE_TRACE_ENDPOINT: "https://host/api/v1/traces",
+ GENAI_ENGINE_PROMPT_TASK_ID: undefined,
+ });
+ const { arthurEnabled, prompts } = await resolvePrompts({ withVersions: true });
+ expect(arthurEnabled).toBe(false);
+ expect(prompts[0].source).toBe("fallback");
+ expect(mockGetPromptByTag).not.toHaveBeenCalled();
+ });
+
+ it("resolves Arthur bodies + version history when enabled, attaching the production body", async () => {
+ await setEnv({
+ GENAI_ENGINE_API_KEY: "k",
+ GENAI_ENGINE_TRACE_ENDPOINT: "https://host/api/v1/traces",
+ GENAI_ENGINE_PROMPT_TASK_ID: "00000000-0000-0000-0000-000000000000",
+ });
+ mockGetPromptByTag.mockResolvedValue("arthur body");
+ mockListPromptVersions.mockResolvedValue([
+ arthurVersion(2, ["production"]),
+ arthurVersion(1, []),
+ ]);
+
+ const { arthurEnabled, prompts } = await resolvePrompts({ withVersions: true });
+ expect(arthurEnabled).toBe(true);
+ expect(mockGetPromptByTag).toHaveBeenCalledTimes(3);
+ const research = prompts[0];
+ expect(research.source).toBe("arthur");
+ expect(research.body).toBe("arthur body");
+ expect(research.versions).toHaveLength(2);
+ expect(research.versions[0]).toMatchObject({
+ version: 2,
+ createdAt: "2026-06-02T00:00:00.000Z",
+ tags: ["production"],
+ modelProvider: "anthropic",
+ modelName: "claude-opus-4-6",
+ numMessages: 1,
+ numTools: 0,
+ });
+ // production version carries the eager body; the other does not
+ expect(research.versions[0].body).toBe("arthur body");
+ expect(research.versions[1].body).toBeUndefined();
+ });
+
+ it("falls back per-prompt when the production body is missing but keeps versions", async () => {
+ await setEnv({
+ GENAI_ENGINE_API_KEY: "k",
+ GENAI_ENGINE_TRACE_ENDPOINT: "https://host/api/v1/traces",
+ GENAI_ENGINE_PROMPT_TASK_ID: "00000000-0000-0000-0000-000000000000",
+ });
+ mockGetPromptByTag.mockResolvedValue(null);
+ mockListPromptVersions.mockResolvedValue([arthurVersion(1, [])]);
+
+ const { prompts } = await resolvePrompts({ withVersions: true });
+ expect(prompts[0].source).toBe("fallback");
+ expect(prompts[0].body).toBe(PROMPT_FALLBACKS["research-plan"]);
+ expect(prompts[0].versions).toHaveLength(1);
+ });
+
+ it("degrades a prompt to fallback with empty versions when the body fetch throws", async () => {
+ await setEnv({
+ GENAI_ENGINE_API_KEY: "k",
+ GENAI_ENGINE_TRACE_ENDPOINT: "https://host/api/v1/traces",
+ GENAI_ENGINE_PROMPT_TASK_ID: "00000000-0000-0000-0000-000000000000",
+ });
+ mockGetPromptByTag.mockRejectedValue(new Error("boom"));
+ mockListPromptVersions.mockResolvedValue([]);
+
+ const { prompts } = await resolvePrompts({ withVersions: true });
+ expect(prompts[0].source).toBe("fallback");
+ expect(prompts[0].body).toBe(PROMPT_FALLBACKS["research-plan"]);
+ expect(prompts[0].versions).toEqual([]);
+ });
+
+ it("skips the version fan-out and resolves empty versions when withVersions is false", async () => {
+ await setEnv({
+ GENAI_ENGINE_API_KEY: "k",
+ GENAI_ENGINE_TRACE_ENDPOINT: "https://host/api/v1/traces",
+ GENAI_ENGINE_PROMPT_TASK_ID: "00000000-0000-0000-0000-000000000000",
+ });
+ mockGetPromptByTag.mockResolvedValue("arthur body");
+
+ const { prompts } = await resolvePrompts({ withVersions: false });
+ expect(mockGetPromptByTag).toHaveBeenCalledTimes(3);
+ expect(mockListPromptVersions).not.toHaveBeenCalled();
+ expect(prompts[0].source).toBe("arthur");
+ expect(prompts[0].body).toBe("arthur body");
+ expect(prompts[0].versions).toEqual([]);
+ });
+
+ it("uses the codex model when AGENT_KIND=codex", async () => {
+ await setEnv({ AGENT_KIND: "codex" });
+ const { prompts } = await resolvePrompts({ withVersions: true });
+ expect(prompts[0].model).toBe("gpt-5-codex");
+ });
+});
diff --git a/apps/worker/src/lib/overview/collect-prompts.ts b/apps/worker/src/lib/overview/collect-prompts.ts
new file mode 100644
index 0000000..b36ee40
--- /dev/null
+++ b/apps/worker/src/lib/overview/collect-prompts.ts
@@ -0,0 +1,113 @@
+import type { PromptVersion } from "@shared/contracts";
+import { env } from "../../../env.js";
+import { logger } from "../logger.js";
+import { PROMPT_FALLBACKS, PROMPT_NAMES, type PromptName } from "../prompts.js";
+
+const PHASE_LABEL: Record<PromptName, string> = {
+  "research-plan": "Research & Plan",
+  "implement": "Implement",
+  "review": "Review",
+};
+
+export interface ResolvedPrompt {
+ name: PromptName;
+ phase: string;
+ body: string;
+ source: "arthur" | "fallback";
+ model: string;
+ versions: PromptVersion[];
+}
+
+export interface ResolvePromptsResult {
+ arthurEnabled: boolean;
+ prompts: ResolvedPrompt[];
+}
+
+/**
+ * Resolve each workflow phase prompt to its production body + (optionally) real
+ * Arthur version history. Shared by the durable `loadPrompts()` step and the
+ * `GET /api/v1/prompts` route so the two never drift.
+ *
+ * Version history is a dashboard-only concern, so `withVersions` lets the
+ * durable step skip the per-prompt `listPromptVersions` fan-out it would
+ * otherwise discard. When false, `versions` resolves to `[]` and only the
+ * production body is fetched.
+ *
+ * When Arthur is unconfigured (`GENAI_ENGINE_*`, incl. `GENAI_ENGINE_PROMPT_TASK_ID`,
+ * unset) every prompt resolves to its in-code `PROMPT_FALLBACKS` string with
+ * `source: "fallback"` and an empty version history.
+ */
+export async function resolvePrompts(opts: { withVersions: boolean }): Promise<ResolvePromptsResult> {
+  const { withVersions } = opts;
+  const model = env.AGENT_KIND === "codex" ? env.CODEX_MODEL : env.CLAUDE_MODEL;
+  const arthurEnabled =
+    !!env.GENAI_ENGINE_API_KEY && !!env.GENAI_ENGINE_TRACE_ENDPOINT && !!env.GENAI_ENGINE_PROMPT_TASK_ID;
+
+  const base = (
+    name: PromptName,
+    body: string,
+    source: "arthur" | "fallback",
+    versions: PromptVersion[] = [],
+  ): ResolvedPrompt => ({ name, phase: PHASE_LABEL[name], body, source, model, versions });
+
+  if (!arthurEnabled) {
+    logger.info({ source: "fallback", reason: "arthur_prompts_disabled" }, "prompts_resolved");
+    return {
+      arthurEnabled,
+      prompts: PROMPT_NAMES.map((n) => base(n, PROMPT_FALLBACKS[n], "fallback")),
+    };
+  }
+
+  const { ArthurClient } = await import("../../sandbox/arthur-client.js");
+  const client = ArthurClient.fromTraceEndpoint(env.GENAI_ENGINE_TRACE_ENDPOINT!, env.GENAI_ENGINE_API_KEY!);
+  const taskId = env.GENAI_ENGINE_PROMPT_TASK_ID!;
+  const TAG = "production";
+
+  async function one(name: PromptName): Promise<ResolvedPrompt> {
+    try {
+      let body: string | null;
+      let versions: PromptVersion[] = [];
+      if (withVersions) {
+        const [rawBody, rawVersions] = await Promise.all([
+          client.getPromptByTag(taskId, name, TAG),
+          // TODO(arthur-verify): version-list pagination depth — first page only.
+          client.listPromptVersions(taskId, name).catch((err: unknown) => {
+            // Best-effort: a failed listing must not cost us the body, but it should be visible in logs.
+            logger.warn({ name, err: err instanceof Error ? err.message : String(err) }, "prompt_versions_list_failed");
+            return [];
+          }),
+        ]);
+        body = rawBody;
+        versions = rawVersions.map((v) => ({
+          version: v.version,
+          createdAt: v.created_at,
+          tags: v.tags,
+          modelProvider: v.model_provider,
+          modelName: v.model_name,
+          numMessages: v.num_messages,
+          numTools: v.num_tools,
+        }));
+        // Attach the eager production body to its matching version entry; other
+        // version bodies are fetched on demand via the by-version route.
+        const prodVersion = versions.find((v) => v.tags.includes(TAG));
+        if (prodVersion && body !== null) prodVersion.body = body;
+      } else {
+        body = await client.getPromptByTag(taskId, name, TAG);
+      }
+
+      if (body === null) {
+        logger.info({ name, source: "fallback", reason: "arthur_prompt_missing" }, "prompts_resolved");
+        return base(name, PROMPT_FALLBACKS[name], "fallback", versions);
+      }
+      logger.info({ name, source: "arthur", versions: versions.length }, "prompts_resolved");
+      return base(name, body, "arthur", versions);
+    } catch (err) {
+      // Body lookup failed: degrade this prompt only; the other prompts resolve independently.
+      logger.warn({ name, source: "fallback", err: err instanceof Error ? err.message : String(err) }, "prompts_resolved");
+      return base(name, PROMPT_FALLBACKS[name], "fallback");
+    }
+  }
+
+  const prompts = await Promise.all(PROMPT_NAMES.map(one));
+  return { arthurEnabled, prompts };
+}
diff --git a/apps/worker/src/routes/api/v1/cost.get.ts b/apps/worker/src/routes/api/v1/cost.get.ts
new file mode 100644
index 0000000..6c51680
--- /dev/null
+++ b/apps/worker/src/routes/api/v1/cost.get.ts
@@ -0,0 +1,62 @@
+import { defineEventHandler, setResponseHeader } from "h3";
+import type { CostResponse } from "@shared/contracts";
+import { env } from "../../../../env.js";
+import { ArthurClient } from "../../../sandbox/arthur-client.js";
+import { collectCost } from "../../../lib/overview/collect-cost.js";
+import { logger } from "../../../lib/logger.js";
+
+/**
+ * Documented empty state for `GET /api/v1/cost`: zeroed totals, no breakdown
+ * rows, and a zero-length window pinned to `generatedAt`.
+ */
+function unavailable(generatedAt: string): CostResponse {
+  return {
+    generatedAt,
+    available: false,
+    window: { start: generatedAt, end: generatedAt },
+    totals: { totalTokenCost: 0, totalTokens: 0, traceCount: 0, costPerRun: 0 },
+    byModel: [],
+    byWorkflow: [],
+    daily: [],
+  };
+}
+
+/**
+ * GET /api/v1/cost — cost and token usage collected from Arthur.
+ *
+ * Always answers with a `CostResponse`: `available: true` plus the collector's
+ * data on success, or the empty state when Arthur is not configured or the
+ * collection throws (the failure is logged as `cost_collect_failed`).
+ */
+export default defineEventHandler(async (event): Promise<CostResponse> => {
+  setResponseHeader(
+    event,
+    "Cache-Control",
+    "private, max-age=15, stale-while-revalidate=60",
+  );
+
+  // Read the clock once so `generatedAt` and the collector's `now` agree.
+  const now = new Date();
+  const generatedAt = now.toISOString();
+
+  // Arthur unconfigured — degrade to the documented empty state (no crash).
+  if (!env.GENAI_ENGINE_API_KEY || !env.GENAI_ENGINE_TRACE_ENDPOINT) {
+    return unavailable(generatedAt);
+  }
+
+  try {
+    const client = ArthurClient.fromTraceEndpoint(
+      env.GENAI_ENGINE_TRACE_ENDPOINT,
+      env.GENAI_ENGINE_API_KEY,
+    );
+    // TODO(arthur-verify): bucket_size value ("day") is unconfirmed against a live instance.
+    const data = await collectCost(client, { now, bucketSize: "day" });
+    return { generatedAt, available: true, ...data };
+  } catch (err) {
+    // Arthur unreachable / 401 / unexpected shape — degrade like runs.get.ts.
+    // A non-Error throw has no `.message`, so stringify it instead of logging undefined.
+    const message = err instanceof Error ? err.message : String(err);
+    logger.warn({ err: message }, "cost_collect_failed");
+    return unavailable(generatedAt);
+  }
+});
diff --git a/apps/worker/src/routes/api/v1/evals.get.ts b/apps/worker/src/routes/api/v1/evals.get.ts
new file mode 100644
index 0000000..54300d5
--- /dev/null
+++ b/apps/worker/src/routes/api/v1/evals.get.ts
@@ -0,0 +1,81 @@
+import { defineEventHandler, setResponseHeader } from "h3";
+import type { EvalsResponse } from "@shared/contracts";
+import { env } from "../../../../env.js";
+import { ArthurClient } from "../../../sandbox/arthur-client.js";
+import { collectEvals } from "../../../lib/overview/collect-evals.js";
+import { logger } from "../../../lib/logger.js";
+
+/** Look-back window, in hours, passed to the eval collector. */
+const WINDOW_HOURS = 24;
+
+/**
+ * GET /api/v1/evals — eval score from Arthur over the last `WINDOW_HOURS`.
+ *
+ * Answers `available: false` with a `reason` when Arthur is not configured,
+ * when no spans were graded in the window, or when collection throws (logged
+ * as `evals_list_failed`); otherwise returns the collector's score and counts.
+ */
+export default defineEventHandler(async (event): Promise<EvalsResponse> => {
+  setResponseHeader(
+    event,
+    "Cache-Control",
+    "private, max-age=15, stale-while-revalidate=60",
+  );
+
+  // Read the clock once so `generatedAt` and the collector's `now` agree.
+  const now = new Date();
+  const generatedAt = now.toISOString();
+
+  if (!env.GENAI_ENGINE_API_KEY || !env.GENAI_ENGINE_TRACE_ENDPOINT) {
+    return {
+      available: false,
+      generatedAt,
+      reason: "Arthur GenAI Engine not configured.",
+    };
+  }
+
+  try {
+    const client = ArthurClient.fromTraceEndpoint(
+      env.GENAI_ENGINE_TRACE_ENDPOINT,
+      env.GENAI_ENGINE_API_KEY,
+    );
+    // TODO(arthur-verify): pass [] if empty task_ids === all org tasks on
+    // POST /api/v1/traces/overview; otherwise enumerate via /api/v2/tasks/search.
+    const taskIds: string[] = [];
+
+    const { windowHours, score, spansGraded, traceCount } =
+      await collectEvals({
+        client,
+        taskIds,
+        windowHours: WINDOW_HOURS,
+        now,
+      });
+
+    if (spansGraded === 0) {
+      return {
+        available: false,
+        generatedAt,
+        // Derived from the constant so the message cannot drift from the window.
+        reason: `No graded evals in the last ${WINDOW_HOURS}h.`,
+      };
+    }
+
+    return {
+      available: true,
+      generatedAt,
+      windowHours,
+      score,
+      spansGraded,
+      traceCount,
+    };
+  } catch (err) {
+    // A non-Error throw has no `.message`, so stringify it instead of logging undefined.
+    const message = err instanceof Error ? err.message : String(err);
+    logger.warn({ err: message }, "evals_list_failed");
+    return {
+      available: false,
+      generatedAt,
+      reason: "Eval grading not wired up yet.",
+    };
+  }
+});
diff --git a/apps/worker/src/routes/api/v1/prompts.get.ts b/apps/worker/src/routes/api/v1/prompts.get.ts
new file mode 100644
index 0000000..d0686d4
--- /dev/null
+++ b/apps/worker/src/routes/api/v1/prompts.get.ts
@@ -0,0 +1,38 @@
+import { defineEventHandler, setResponseHeader } from "h3";
+import type { PromptsResponse } from "@shared/contracts";
+import { resolvePrompts } from "../../../lib/overview/collect-prompts.js";
+import { logger } from "../../../lib/logger.js";
+
+/**
+ * GET /api/v1/prompts — resolved prompts with their version metadata.
+ *
+ * Returns every prompt from `resolvePrompts({ withVersions: true })`; if that
+ * call throws, logs `prompts_resolve_failed` and answers with the empty
+ * `available: false` state instead of an error.
+ */
+export default defineEventHandler(async (event): Promise<PromptsResponse> => {
+  setResponseHeader(
+    event,
+    "Cache-Control",
+    "private, max-age=15, stale-while-revalidate=60",
+  );
+
+  const generatedAt = new Date().toISOString();
+  try {
+    const { arthurEnabled, prompts } = await resolvePrompts({ withVersions: true });
+    return {
+      generatedAt,
+      available: true,
+      arthurEnabled,
+      rows: prompts,
+      total: prompts.length,
+    };
+  } catch (err) {
+    // Arthur unreachable / unexpected failure — degrade to the documented empty
+    // state so the dashboard renders its N/A view instead of a 500.
+    // A non-Error throw has no `.message`, so stringify it instead of logging undefined.
+    const message = err instanceof Error ? err.message : String(err);
+    logger.warn({ err: message }, "prompts_resolve_failed");
+    return { generatedAt, available: false, arthurEnabled: false, rows: [], total: 0 };
+  }
+});
diff --git a/apps/worker/src/routes/api/v1/prompts/[name]/versions/[version].get.ts b/apps/worker/src/routes/api/v1/prompts/[name]/versions/[version].get.ts
new file mode 100644
index 0000000..c30fffd
--- /dev/null
+++ b/apps/worker/src/routes/api/v1/prompts/[name]/versions/[version].get.ts
@@ -0,0 +1,53 @@
+import { defineEventHandler, getRouterParam, setResponseHeader } from "h3";
+import type { PromptVersionBodyResponse } from "@shared/contracts";
+import { env } from "../../../../../../../env.js";
+import { PROMPT_NAMES, type PromptName } from "../../../../../../lib/prompts.js";
+import { logger } from "../../../../../../lib/logger.js";
+
+// TODO(arthur-verify): lazy-vs-eager body — historical bodies are fetched on
+// demand here; the production body ships eagerly on the list route.
+/**
+ * GET /api/v1/prompts/:name/versions/:version — body of one prompt version.
+ *
+ * Answers `available: false` with a null body when the Arthur prompt settings
+ * are missing, `name` is not in `PROMPT_NAMES`, `version` is empty, or the
+ * lookup throws (logged as `prompt_version_body_failed`).
+ */
+export default defineEventHandler(async (event): Promise<PromptVersionBodyResponse> => {
+  setResponseHeader(
+    event,
+    "Cache-Control",
+    "private, max-age=15, stale-while-revalidate=60",
+  );
+  const generatedAt = new Date().toISOString();
+
+  const name = getRouterParam(event, "name") ?? "";
+  const version = getRouterParam(event, "version") ?? "";
+  // Copy the settings into locals so the guard below narrows away `undefined`
+  // and no non-null assertions are needed further down.
+  const apiKey = env.GENAI_ENGINE_API_KEY;
+  const traceEndpoint = env.GENAI_ENGINE_TRACE_ENDPOINT;
+  const taskId = env.GENAI_ENGINE_PROMPT_TASK_ID;
+
+  if (
+    !apiKey ||
+    !traceEndpoint ||
+    !taskId ||
+    !PROMPT_NAMES.includes(name as PromptName) ||
+    !version
+  ) {
+    return { generatedAt, available: false, body: null };
+  }
+
+  try {
+    const { ArthurClient } = await import("../../../../../../sandbox/arthur-client.js");
+    const client = ArthurClient.fromTraceEndpoint(traceEndpoint, apiKey);
+    const body = await client.getPromptVersionBody(taskId, name, version);
+    return { generatedAt, available: body !== null, body };
+  } catch (err) {
+    // A non-Error throw has no `.message`, so stringify it instead of logging undefined.
+    const message = err instanceof Error ? err.message : String(err);
+    logger.warn({ name, version, err: message }, "prompt_version_body_failed");
+    return { generatedAt, available: false, body: null };
+  }
+});
diff --git a/apps/worker/src/sandbox/arthur-client.test.ts b/apps/worker/src/sandbox/arthur-client.test.ts
index a5e4a80..bccd57b 100644
--- a/apps/worker/src/sandbox/arthur-client.test.ts
+++ b/apps/worker/src/sandbox/arthur-client.test.ts
@@ -239,4 +239,197 @@ describe("ArthurClient", () => {
await expect(client.getPromptByTag("t", "x", "production")).rejects.toThrow(/500/);
});
});
+
+ describe("getTracesOverview", () => {
+ it("POSTs task_ids/start/end and returns the parsed list response", async () => {
+ mockFetch.mockResolvedValueOnce(jsonResponse({
+ count: 1,
+ overviews: [
+ {
+ task_id: "AWT-42",
+ trace_count: 3,
+ trace_token_count: 1200,
+ trace_token_cost: 0.42,
+ eval_count: 6,
+ continuous_eval_success_rate: 0.9,
+ last_active: "2026-06-08T00:00:00Z",
+ },
+ ],
+ }));
+ const client = new ArthurClient("http://host", "secret");
+ const res = await client.getTracesOverview(["AWT-42"], "2026-06-01T00:00:00Z", "2026-06-08T00:00:00Z");
+
+ expect(res.count).toBe(1);
+ expect(res.overviews[0].task_id).toBe("AWT-42");
+ const [url, init] = mockFetch.mock.calls[0];
+ expect(url).toBe("http://host/api/v1/traces/overview");
+ expect(init.method).toBe("POST");
+ expect(init.headers.Authorization).toBe("Bearer secret");
+ expect(JSON.parse(init.body)).toEqual({
+ task_ids: ["AWT-42"],
+ start_time: "2026-06-01T00:00:00Z",
+ end_time: "2026-06-08T00:00:00Z",
+ });
+ });
+ });
+
+ describe("getTracesTimeseries", () => {
+ it("POSTs single task_id + bucket_size and unwraps the { points } envelope", async () => {
+ mockFetch.mockResolvedValueOnce(jsonResponse({
+ points: [
+ { timestamp: "2026-06-07T00:00:00Z", trace_count: 1, trace_token_count: 400, trace_token_cost: 0.1 },
+ ],
+ }));
+ const client = new ArthurClient("http://host", "k");
+ const points = await client.getTracesTimeseries("AWT-42", "s", "e", "day");
+
+ expect(points).toHaveLength(1);
+ expect(points[0].trace_token_cost).toBe(0.1);
+ const [url, init] = mockFetch.mock.calls[0];
+ expect(url).toBe("http://host/api/v1/traces/overview/timeseries");
+ expect(init.method).toBe("POST");
+ expect(JSON.parse(init.body)).toEqual({
+ task_id: "AWT-42",
+ start_time: "s",
+ end_time: "e",
+ bucket_size: "day",
+ });
+ });
+
+ it("accepts a bare array response", async () => {
+ mockFetch.mockResolvedValueOnce(jsonResponse([
+ { timestamp: "t", trace_count: 2, trace_token_count: 10, trace_token_cost: null },
+ ]));
+ const client = new ArthurClient("http://host", "k");
+ const points = await client.getTracesTimeseries("AWT-42", "s", "e", "day");
+ expect(points).toHaveLength(1);
+ });
+ });
+
+ describe("aggregateSpanTokensByModel", () => {
+ it("sums tokens/cost grouped by model_name and skips null models", async () => {
+ mockFetch.mockResolvedValueOnce(jsonResponse({
+ spans: [
+ { model_name: "claude-opus-4-6", total_token_count: 100, total_token_cost: 0.5 },
+ { model_name: "claude-opus-4-6", total_token_count: 50, total_token_cost: 0.25 },
+ { model_name: "gpt-5", total_token_count: 200, total_token_cost: 1.0 },
+ { model_name: null, total_token_count: 999, total_token_cost: 9.0 },
+ ],
+ }));
+ const client = new ArthurClient("http://host", "k");
+ const rows = await client.aggregateSpanTokensByModel(["AWT-42"], "s", "e");
+
+ expect(rows).toEqual([
+ { model: "claude-opus-4-6", tokens: 150, cost: 0.75 },
+ { model: "gpt-5", tokens: 200, cost: 1.0 },
+ ]);
+ const [url, init] = mockFetch.mock.calls[0];
+ expect(url).toBe("http://host/api/v1/traces/spans");
+ expect(JSON.parse(init.body)).toEqual({
+ task_ids: ["AWT-42"],
+ start_time: "s",
+ end_time: "e",
+ limit: 1000,
+ });
+ });
+
+ it("treats null token/cost as 0", async () => {
+ mockFetch.mockResolvedValueOnce(jsonResponse([
+ { model_name: "m", total_token_count: null, total_token_cost: null },
+ ]));
+ const client = new ArthurClient("http://host", "k");
+ const rows = await client.aggregateSpanTokensByModel([], "s", "e");
+ expect(rows).toEqual([{ model: "m", tokens: 0, cost: 0 }]);
+ });
+ });
+
+ describe("listPromptVersions", () => {
+ it("GETs the versions endpoint and sorts newest-first", async () => {
+ mockFetch.mockResolvedValueOnce(jsonResponse({
+ count: 2,
+ versions: [
+ {
+ version: 1,
+ created_at: "2026-06-01T00:00:00Z",
+ deleted_at: null,
+ model_provider: "anthropic",
+ model_name: "claude-opus-4-6",
+ tags: [],
+ num_messages: 1,
+ num_tools: 0,
+ },
+ {
+ version: 2,
+ created_at: "2026-06-02T00:00:00Z",
+ deleted_at: null,
+ model_provider: "anthropic",
+ model_name: "claude-opus-4-6",
+ tags: ["production"],
+ num_messages: 1,
+ num_tools: 0,
+ },
+ ],
+ }));
+ const client = new ArthurClient("http://host", "k");
+ const versions = await client.listPromptVersions("task-uuid", "research-plan");
+
+ expect(versions.map((v) => v.version)).toEqual([2, 1]);
+ const [url, init] = mockFetch.mock.calls[0];
+ expect(url).toBe("http://host/api/v1/tasks/task-uuid/prompts/research-plan/versions");
+ expect(init.method).toBe("GET");
+ expect(init.headers.Authorization).toBe("Bearer k");
+ });
+
+ it("returns [] on 404", async () => {
+ mockFetch.mockResolvedValueOnce(new Response("not found", { status: 404 }));
+ const client = new ArthurClient("http://host", "k");
+ expect(await client.listPromptVersions("t", "research-plan")).toEqual([]);
+ });
+
+ it("throws on 5xx", async () => {
+ mockFetch.mockResolvedValueOnce(new Response("boom", { status: 500 }));
+ const client = new ArthurClient("http://host", "k");
+ await expect(client.listPromptVersions("t", "x")).rejects.toThrow(/500/);
+ });
+ });
+
+ describe("getPromptVersionBody", () => {
+ it("GETs the by-version endpoint and returns messages[0].content", async () => {
+ mockFetch.mockResolvedValueOnce(jsonResponse({
+ name: "research-plan",
+ version: 3,
+ messages: [{ role: "user", content: "v3 body" }],
+ }));
+ const client = new ArthurClient("http://host", "k");
+ const body = await client.getPromptVersionBody("task-uuid", "research-plan", 3);
+ expect(body).toBe("v3 body");
+ const [url, init] = mockFetch.mock.calls[0];
+ expect(url).toBe("http://host/api/v1/tasks/task-uuid/prompts/research-plan/versions/3");
+ expect(init.method).toBe("GET");
+ });
+
+ it("accepts a string version specifier (latest/tag/datetime)", async () => {
+ mockFetch.mockResolvedValueOnce(jsonResponse({
+ name: "implement",
+ messages: [{ role: "user", content: "latest body" }],
+ }));
+ const client = new ArthurClient("http://host", "k");
+ const body = await client.getPromptVersionBody("t", "implement", "latest");
+ expect(body).toBe("latest body");
+ const [url] = mockFetch.mock.calls[0];
+ expect(url).toBe("http://host/api/v1/tasks/t/prompts/implement/versions/latest");
+ });
+
+ it("returns null on 404", async () => {
+ mockFetch.mockResolvedValueOnce(new Response("not found", { status: 404 }));
+ const client = new ArthurClient("http://host", "k");
+ expect(await client.getPromptVersionBody("t", "x", 1)).toBeNull();
+ });
+
+ it("throws on 5xx", async () => {
+ mockFetch.mockResolvedValueOnce(new Response("boom", { status: 500 }));
+ const client = new ArthurClient("http://host", "k");
+ await expect(client.getPromptVersionBody("t", "x", 1)).rejects.toThrow(/500/);
+ });
+ });
});
diff --git a/apps/worker/src/sandbox/arthur-client.ts b/apps/worker/src/sandbox/arthur-client.ts
index 8afc77c..66d8b8f 100644
--- a/apps/worker/src/sandbox/arthur-client.ts
+++ b/apps/worker/src/sandbox/arthur-client.ts
@@ -23,6 +23,68 @@ interface SearchResponse {
tasks: ArthurTask[];
}
+/**
+ * Per-task aggregate over a window from `POST /api/v1/traces/overview`.
+ * Token/cost fields come from Arthur's `TokenCountCostSchema`; `trace_token_cost`
+ * may be null when cost is unavailable. Typed per the documented shape — these
+ * read endpoints are UNVERIFIED against a live instance, so parsing stays
+ * defensive (callers treat null cost as 0).
+ */
+export interface TraceOverview {
+ task_id: string;
+ trace_count: number;
+ trace_token_count: number;
+ trace_token_cost: number | null;
+ eval_count: number;
+ continuous_eval_success_rate: number;
+ last_active?: string;
+}
+
+export interface TraceOverviewListResponse {
+ count: number;
+ overviews: TraceOverview[];
+}
+
+/** One bucket from `POST /api/v1/traces/overview/timeseries` (single task). */
+export interface TraceTimeseriesPoint {
+ timestamp: string;
+ trace_count: number;
+ trace_token_count: number;
+ trace_token_cost: number | null;
+ continuous_eval_success_rate?: number;
+}
+
+/** Token/cost-by-model aggregation result (one row per Arthur `model_name`). */
+export interface ModelTokenCost {
+ model: string;
+ tokens: number;
+ cost: number;
+}
+
+/** A span row from `POST /api/v1/traces/spans` carrying model + token/cost fields. */
+interface SpanTokenCost {
+ model_name: string | null;
+ total_token_count: number | null;
+ total_token_cost: number | null;
+}
+
+/** One Arthur prompt version's metadata (no message body). */
+export interface ArthurPromptVersion {
+ version: number;
+ created_at: string;
+ deleted_at: string | null;
+ model_provider: string;
+ model_name: string;
+ tags: string[];
+ num_messages: number;
+ num_tools: number;
+}
+
+interface AgenticPromptVersionListResponse {
+ count: number;
+ versions: ArthurPromptVersion[];
+}
+
export class ArthurClient {
constructor(
private readonly baseUrl: string,
@@ -56,6 +118,20 @@ export class ArthurClient {
return (await res.json()) as T;
}
+ /** GET that treats 404 as "absent" (returns null) instead of throwing — for the prompt read paths. */
+ private async getAllowing404(path: string): Promise {
+ const res = await fetch(`${this.baseUrl}${path}`, {
+ method: "GET",
+ headers: { Authorization: `Bearer ${this.apiKey}`, "ngrok-skip-browser-warning": "true" },
+ });
+ if (res.status === 404) return null;
+ if (!res.ok) {
+ const body = await res.text().catch(() => "");
+ throw new Error(`Arthur GET ${path} → ${res.status}: ${body.slice(0, 300)}`);
+ }
+ return (await res.json()) as T;
+ }
+
/**
* Return tasks whose name equals `prefix` or matches `^prefix\.\d+$`.
* Arthur's `task_name` search is substring-based, so we post-filter to
@@ -124,21 +200,8 @@ export class ArthurClient {
/** Fetch a tagged prompt version. Returns the first message's content, or null if 404. */
async getPromptByTag(taskId: string, name: string, tag: string): Promise {
const path = `/api/v1/tasks/${encodeURIComponent(taskId)}/prompts/${encodeURIComponent(name)}/versions/tags/${encodeURIComponent(tag)}`;
- const res = await fetch(`${this.baseUrl}${path}`, {
- method: "GET",
- headers: {
- "Authorization": `Bearer ${this.apiKey}`,
- "ngrok-skip-browser-warning": "true",
- },
- });
- if (res.status === 404) return null;
- if (!res.ok) {
- const body = await res.text().catch(() => "");
- throw new Error(`Arthur GET ${path} → ${res.status}: ${body.slice(0, 300)}`);
- }
- const prompt = (await res.json()) as AgenticPrompt;
- const first = prompt.messages?.[0];
- return first?.content ?? null;
+ const prompt = await this.getAllowing404(path);
+ return prompt?.messages?.[0]?.content ?? null;
}
/** Create a new version of a named prompt on a task. Content is sent as a single user message. */
@@ -171,4 +234,110 @@ export class ArthurClient {
},
);
}
+
+ /**
+ * Fleet eval/cost aggregate over a window. One call covers multiple tasks;
+ * sum across `overviews` for fleet totals. `taskIds` may be empty (see the
+ * empty-means-all-org open question in the specs). Shared by /evals + /cost.
+ */
+ async getTracesOverview(
+ taskIds: string[],
+ startTime: string,
+ endTime: string,
+ ): Promise {
+ return this.request("/api/v1/traces/overview", {
+ method: "POST",
+ body: JSON.stringify({
+ task_ids: taskIds,
+ start_time: startTime,
+ end_time: endTime,
+ }),
+ });
+ }
+
+ /**
+ * Per-bucket timeseries for a single task. The caller fans out one call per
+ * task and merges points by timestamp. The response envelope key is
+ * unverified, so accept both a bare array and a `{ points }` wrapper.
+ */
+ async getTracesTimeseries(
+ taskId: string,
+ startTime: string,
+ endTime: string,
+ bucketSize: string,
+ ): Promise {
+ const res = await this.request<{ points?: TraceTimeseriesPoint[] } | TraceTimeseriesPoint[]>(
+ "/api/v1/traces/overview/timeseries",
+ {
+ method: "POST",
+ body: JSON.stringify({
+ task_id: taskId,
+ start_time: startTime,
+ end_time: endTime,
+ bucket_size: bucketSize,
+ }),
+ },
+ );
+ return Array.isArray(res) ? res : (res.points ?? []);
+ }
+
+ /**
+ * By-model token/cost aggregation — Arthur has no per-model overview, so we
+ * fetch span rows (which carry `model_name` + token/cost fields) and sum
+ * grouped by `model_name`. Spans with a null `model_name` are skipped.
+ */
+ async aggregateSpanTokensByModel(
+ taskIds: string[],
+ startTime: string,
+ endTime: string,
+ ): Promise {
+ // TODO(arthur-verify): pagination — first page only, bounded to N spans. The
+ // read endpoints are unverified, so we send a bounded `limit` rather than
+ // looping pages; this makes the ceiling explicit instead of pulling an
+ // unbounded result set and summing it silently in memory.
+ const res = await this.request<{ spans?: SpanTokenCost[] } | SpanTokenCost[]>(
+ "/api/v1/traces/spans",
+ {
+ method: "POST",
+ body: JSON.stringify({
+ task_ids: taskIds,
+ start_time: startTime,
+ end_time: endTime,
+ limit: 1000,
+ }),
+ },
+ );
+ const spans = Array.isArray(res) ? res : (res.spans ?? []);
+ const byModel = new Map();
+ for (const span of spans) {
+ if (!span.model_name) continue;
+ const row = byModel.get(span.model_name) ?? {
+ model: span.model_name,
+ tokens: 0,
+ cost: 0,
+ };
+ row.tokens += span.total_token_count ?? 0;
+ row.cost += span.total_token_cost ?? 0;
+ byModel.set(span.model_name, row);
+ }
+ return [...byModel.values()];
+ }
+
+ /** List version metadata for a named prompt (newest first). First page only. Empty on 404. */
+ async listPromptVersions(taskId: string, name: string): Promise {
+ const path = `/api/v1/tasks/${encodeURIComponent(taskId)}/prompts/${encodeURIComponent(name)}/versions`;
+ const data = await this.getAllowing404(path);
+ return [...(data?.versions ?? [])].sort((a, b) => b.version - a.version);
+ }
+
+ /**
+ * Fetch the body of a specific version. `version` accepts an integer,
+ * `"latest"`, an ISO datetime, or a tag. Returns the first message's content,
+ * or null on 404. Generalizes the by-version GET that `getPromptByTag` uses.
+ */
+ async getPromptVersionBody(taskId: string, name: string, version: number | string): Promise {
+ const path = `/api/v1/tasks/${encodeURIComponent(taskId)}/prompts/${encodeURIComponent(name)}/versions/${encodeURIComponent(String(version))}`;
+ const prompt = await this.getAllowing404(path);
+ return prompt?.messages?.[0]?.content ?? null;
+ }
}
diff --git a/apps/worker/src/workflows/prompts-step.test.ts b/apps/worker/src/workflows/prompts-step.test.ts
index ab09934..4350061 100644
--- a/apps/worker/src/workflows/prompts-step.test.ts
+++ b/apps/worker/src/workflows/prompts-step.test.ts
@@ -3,9 +3,13 @@ import { describe, it, expect, vi, beforeEach } from "vitest";
vi.mock("../../env.js", () => ({ env: {} }));
const mockGetPromptByTag = vi.fn();
+const mockListPromptVersions = vi.fn();
vi.mock("../sandbox/arthur-client.js", () => ({
ArthurClient: {
- fromTraceEndpoint: vi.fn(() => ({ getPromptByTag: mockGetPromptByTag })),
+ fromTraceEndpoint: vi.fn(() => ({
+ getPromptByTag: mockGetPromptByTag,
+ listPromptVersions: mockListPromptVersions,
+ })),
},
}));
@@ -20,6 +24,8 @@ async function setEnv(partial: Record) {
describe("loadPrompts", () => {
beforeEach(async () => {
mockGetPromptByTag.mockReset();
+ mockListPromptVersions.mockReset();
+ mockListPromptVersions.mockResolvedValue([]);
await setEnv({
GENAI_ENGINE_API_KEY: undefined,
GENAI_ENGINE_TRACE_ENDPOINT: undefined,
@@ -65,6 +71,9 @@ describe("loadPrompts", () => {
expect(mockGetPromptByTag).toHaveBeenCalledTimes(3);
const names = mockGetPromptByTag.mock.calls.map((c) => c[1]);
expect(names).toEqual(["research-plan", "implement", "review"]);
+ // The step throws version metadata away, so it must not pay for the
+ // dashboard-only listPromptVersions fan-out.
+ expect(mockListPromptVersions).not.toHaveBeenCalled();
});
it("falls back per-prompt when Arthur returns null or throws", async () => {
diff --git a/apps/worker/src/workflows/prompts-step.ts b/apps/worker/src/workflows/prompts-step.ts
index 9baae40..bc4a44f 100644
--- a/apps/worker/src/workflows/prompts-step.ts
+++ b/apps/worker/src/workflows/prompts-step.ts
@@ -6,53 +6,18 @@ export interface LoadedPrompts {
export async function loadPrompts(): Promise {
"use step";
- const { env } = await import("../../env.js");
- const { logger } = await import("../lib/logger.js");
- const { PROMPT_FALLBACKS } = await import("../lib/prompts.js");
- type PromptName = keyof typeof PROMPT_FALLBACKS;
-
- const arthurEnabled =
- !!env.GENAI_ENGINE_API_KEY &&
- !!env.GENAI_ENGINE_TRACE_ENDPOINT &&
- !!env.GENAI_ENGINE_PROMPT_TASK_ID;
-
- if (!arthurEnabled) {
- logger.info({ source: "fallback", reason: "arthur_prompts_disabled" }, "prompts_loaded");
- return {
- research: PROMPT_FALLBACKS["research-plan"],
- implement: PROMPT_FALLBACKS["implement"],
- review: PROMPT_FALLBACKS["review"],
- };
- }
-
- const { ArthurClient } = await import("../sandbox/arthur-client.js");
- const client = ArthurClient.fromTraceEndpoint(
- env.GENAI_ENGINE_TRACE_ENDPOINT!,
- env.GENAI_ENGINE_API_KEY!,
- );
- const taskId = env.GENAI_ENGINE_PROMPT_TASK_ID!;
- const TAG = "production";
-
- async function one(name: PromptName): Promise {
- try {
- const body = await client.getPromptByTag(taskId, name, TAG);
- if (body === null) {
- logger.info({ name, source: "fallback", reason: "arthur_prompt_missing" }, "prompts_loaded");
- return PROMPT_FALLBACKS[name];
- }
- logger.info({ name, source: "arthur" }, "prompts_loaded");
- return body;
- } catch (err) {
- logger.warn({ name, source: "fallback", err: (err as Error).message }, "prompts_loaded");
- return PROMPT_FALLBACKS[name];
- }
- }
-
- const [research, implement, review] = await Promise.all([
- one("research-plan"),
- one("implement"),
- one("review"),
- ]);
- return { research, implement, review };
+ // Delegate to the shared resolver so the durable step and the
+ // GET /api/v1/prompts route share one source of truth. The resolver carries
+ // the same logger.info/logger.warn (fallback / arthur / per-prompt error)
+ // calls the step used to make. Version history is dashboard-only, so skip the
+ // listPromptVersions fan-out here — the step only consumes prompt bodies.
+ const { resolvePrompts } = await import("../lib/overview/collect-prompts.js");
+ const { prompts } = await resolvePrompts({ withVersions: false });
+ const byName = Object.fromEntries(prompts.map((p) => [p.name, p.body]));
+ return {
+ research: byName["research-plan"],
+ implement: byName["implement"],
+ review: byName["review"],
+ };
}
loadPrompts.maxRetries = 0;
diff --git a/docs/superpowers/plans/2026-06-08-cost-real-data.md b/docs/superpowers/plans/2026-06-08-cost-real-data.md
new file mode 100644
index 0000000..091c029
--- /dev/null
+++ b/docs/superpowers/plans/2026-06-08-cost-real-data.md
@@ -0,0 +1,329 @@
+# `/cost` Real-Data Conversion Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Convert the `/cost` (Cost & Usage) dashboard page from mock data to live worker data, mirroring the overview/runs server-component fetch pattern. Cost + token usage come from **Arthur** (the GenAI Engine), which already aggregates token/cost from the OpenInference traces the workflow ships in. **Single PR** — no persistence, no capture.
+
+**Architecture:** New Arthur read methods (`getTracesOverview`, `getTracesTimeseries`, `aggregateSpanTokensByModel`) on the existing `ArthurClient`. A worker collector `collect-cost.ts` calls them and shapes a `CostResponse` (totals, by-task breakdown, by-model breakdown, merged daily series). A new route `GET /api/v1/cost` exposes it, degrading to empty when Arthur is unconfigured/unreachable. The dashboard fetches it server-side via `getJSON`, falls back to an empty `CostResponse`, and passes `data` to the `CostScreen` client presenter. Thin `page.tsx` wraps `cost-data.tsx` in `<Suspense>`. Identical read-path shape to `overview-data.tsx` / `runs-data.tsx`.
+
+**Tech Stack:** Next.js App Router, React, TypeScript, `@shared/contracts`, h3 worker routes, existing `ArthurClient` (fetch + Bearer). Worker has Vitest (`*.test.ts`); dashboard has none — dashboard verification is `npx tsc --noEmit`, `next lint`, and a manual browser check.
+
+**Spec:** `docs/superpowers/specs/2026-06-08-cost-real-data-design.md`
+
+**Note on commits:** This repo's owner stages commits manually. Do NOT commit unless the user explicitly asks. The final task lists the commit command for when they do.
+
+**Live open questions (resolve with the user; the plan assumes the spec's defaults):** `bucket_size` allowed values for the timeseries; whether empty `task_ids` means org-wide (else enumerate tasks); by-model client aggregation acceptable; task→workflow mapping (breakdown stays per-task); window = calendar MTD. See the spec's "Open questions".
+
+---
+
+### Task 1: Add Arthur read methods + types
+
+**Files:**
+- Modify: `apps/worker/src/sandbox/arthur-client.ts`
+- Modify: `apps/worker/src/sandbox/arthur-client.test.ts`
+
+- [ ] **Step 1: Add response types**
+
+Add interfaces mirroring Arthur's shapes:
+
+```ts
+export interface TraceOverviewEntry {
+ task_id: string;
+ trace_count: number;
+ trace_token_count: number;
+ trace_token_cost: number | null;
+ last_active?: string;
+}
+export interface TraceTimeseriesPoint {
+ timestamp: string;
+ trace_count: number;
+ trace_token_count: number;
+ trace_token_cost: number | null;
+}
+export interface SpanTokenCost {
+ model_name: string | null;
+ total_token_count: number | null;
+ total_token_cost: number | null;
+}
+```
+
+- [ ] **Step 2: Add `getTracesOverview`**
+
+```ts
+async getTracesOverview(taskIds: string[], startTime: string, endTime: string): Promise<TraceOverviewEntry[]> {
+ const { overviews } = await this.request<{ count: number; overviews: TraceOverviewEntry[] }>(
+ "/api/v1/traces/overview",
+ { method: "POST", body: JSON.stringify({ task_ids: taskIds, start_time: startTime, end_time: endTime }) },
+ );
+ return overviews;
+}
+```
+
+- [ ] **Step 3: Add `getTracesTimeseries`** (single task per call; caller fans out + merges)
+
+```ts
+async getTracesTimeseries(taskId: string, startTime: string, endTime: string, bucketSize: string): Promise<TraceTimeseriesPoint[]> {
+ const res = await this.request<{ points?: TraceTimeseriesPoint[] } | TraceTimeseriesPoint[]>(
+ "/api/v1/traces/overview/timeseries",
+ { method: "POST", body: JSON.stringify({ task_id: taskId, start_time: startTime, end_time: endTime, bucket_size: bucketSize }) },
+ );
+ return Array.isArray(res) ? res : (res.points ?? []);
+}
+```
+
+> The response envelope key is unconfirmed — handle both array and `{ points }`. Confirm against a live call.
+
+- [ ] **Step 4: Add `aggregateSpanTokensByModel`** (the one client-side aggregation)
+
+Fetch span rows for the window via `GET /api/v1/traces/spans` (paginate if the API requires it), then sum `total_token_count`/`total_token_cost` grouped by `model_name`. Return `Array<{ model: string; tokens: number; cost: number }>`. Skip rows with null `model_name`.
+
+- [ ] **Step 5: Test**
+
+Add tests with a stubbed `fetch` asserting each method sends the right request (method, path, body) and parses the response (mirror the existing client tests), then run: `cd apps/worker && pnpm vitest run src/sandbox/arthur-client.test.ts`
+Expected: PASS.
+
+---
+
+### Task 2: Add the `CostResponse` contract
+
+**Files:**
+- Modify: `apps/shared/contracts/api.ts`
+
+- [ ] **Step 1: Add the interfaces**
+
+Add `CostByModelEntry`, `CostByWorkflowEntry`, and `CostResponse` exactly as specified in the spec ("Proposed contract").
+
+- [ ] **Step 2: Typecheck shared**
+
+Run: `cd apps/shared && npx tsc --noEmit` (or root `pnpm -w typecheck` if defined)
+Expected: PASS.
+
+---
+
+### Task 3: Add the `collectCost` aggregator + worker route
+
+**Files:**
+- Create: `apps/worker/src/lib/overview/collect-cost.ts`
+- Create: `apps/worker/src/lib/overview/collect-cost.test.ts`
+- Create: `apps/worker/src/routes/api/v1/cost.get.ts`
+
+- [ ] **Step 1: Write `collectCost`**
+
+Signature: `collectCost(client: ArthurClient, opts: { now: Date; bucketSize: string }): Promise<Omit<CostResponse, "generatedAt" | "available">>`.
+
+Logic:
+1. Resolve the window: `start = startOfMonth(now)`, `end = now` (ISO). (Assumption: calendar MTD — see open Q5.)
+2. Resolve `taskIds`: enumerate the org's tasks (assumption from open Q2 — pass ids explicitly). Reuse/extend the client's task listing (`/api/v2/tasks/search`); if a true org-wide overview via empty `task_ids` is confirmed, pass `[]` instead.
+3. `overviews = await client.getTracesOverview(taskIds, start, end)`.
+ - `totals`: sum `trace_token_cost` (→ `totalTokenCost`), `trace_token_count` (→ `totalTokens`), `trace_count` (→ `traceCount`); `costPerRun = totalTokenCost / max(1, traceCount)`. Treat null `trace_token_cost` as 0.
+ - `byWorkflow`: one entry per overview → `{ taskId, name, runs, tokens, cost, costPerRun }`. `name` from the task listing (task name = ticket-run id).
+4. `byModel = await client.aggregateSpanTokensByModel(...)` → map to `{ model, cost, tokens }`.
+5. `daily`: fan out `getTracesTimeseries(taskId, start, end, bucketSize)` per task; **merge points by `timestamp`** summing cost/tokens; sort oldest→newest → `{ date, cost, tokens }[]`.
+
+Keep I/O behind the injected `client` so the aggregation is unit-testable with a fake client (mirror how `collect-runs.ts` takes a `RunsLister`).
+
+- [ ] **Step 2: Write the route**
+
+Mirror `workflows.get.ts`:
+```ts
+setResponseHeader(event, "Cache-Control", "private, max-age=15, stale-while-revalidate=60");
+const generatedAt = new Date().toISOString();
+if (!env.GENAI_ENGINE_API_KEY || !env.GENAI_ENGINE_TRACE_ENDPOINT) {
+ return { generatedAt, available: false, ...EMPTY };
+}
+try {
+ const client = ArthurClient.fromTraceEndpoint(env.GENAI_ENGINE_TRACE_ENDPOINT, env.GENAI_ENGINE_API_KEY);
+ const data = await collectCost(client, { now: new Date(), bucketSize: "day" });
+ return { generatedAt, available: true, ...data };
+} catch (err) {
+ logger.warn({ err: (err as Error).message }, "cost_collect_failed");
+ return { generatedAt, available: false, ...EMPTY };
+}
+```
+`EMPTY` = the empty totals/arrays/window matching `costFallback`.
+
+- [ ] **Step 3: Test the aggregator**
+
+Run: `cd apps/worker && pnpm vitest run src/lib/overview/collect-cost.test.ts`
+Expected: with a fake client returning fixtures (2 tasks, 2 models, multi-day timeseries), assert totals, `byWorkflow` rows + `costPerRun`, `byModel` grouping, and merged-by-timestamp `daily`. Empty/null inputs → zeros/empty arrays. PASS.
+
+- [ ] **Step 4: Worker typecheck**
+
+Run: `cd apps/worker && npx tsc --noEmit`
+Expected: PASS.
+
+---
+
+### Task 4: Add the dashboard fallback
+
+**Files:**
+- Modify: `apps/dashboard/lib/api/fallbacks.ts`
+
+- [ ] **Step 1: Add `costFallback`**
+
+```ts
+export function costFallback(now: string): CostResponse {
+ return {
+ generatedAt: now,
+ available: false,
+ window: { start: now, end: now },
+ totals: { totalTokenCost: 0, totalTokens: 0, traceCount: 0, costPerRun: 0 },
+ byModel: [],
+ byWorkflow: [],
+ daily: [],
+ };
+}
+```
+
+Add `CostResponse` to the existing `@shared/contracts` import.
+
+- [ ] **Step 2: Typecheck**
+
+Run: `cd apps/dashboard && npx tsc --noEmit`
+Expected: PASS (no consumers yet).
+
+---
+
+### Task 5: Add the skeleton + server data component, and convert `CostScreen`
+
+**Files:**
+- Create: `apps/dashboard/app/cost-skeleton.tsx`
+- Create: `apps/dashboard/app/cost-data.tsx`
+- Modify: `apps/dashboard/components/cockpit/screens/cost.tsx`
+
+- [ ] **Step 1: Create the skeleton**
+
+Mirror `overview-skeleton.tsx`, shaped to the cost layout (after embellishments are stripped: 3 KPI blocks, a chart+donut row, two table blocks):
+
+```tsx
+// apps/dashboard/app/cost-skeleton.tsx
+function Block({ className = "" }: { className?: string }) {
+ return ;
+}
+export function CostSkeleton() {
+ return (
+
+
+ {Array.from({ length: 3 }, (_, i) => )}
+
+
+
+
+
+
+
+ );
+}
+```
+
+- [ ] **Step 2: Create the server data component**
+
+```tsx
+// apps/dashboard/app/cost-data.tsx
+import { getJSON } from "@/lib/api/server";
+import { CostScreen } from "@/components/cockpit/screens/cost";
+import type { CostResponse } from "@shared/contracts";
+import { costFallback } from "@/lib/api/fallbacks";
+
+export async function CostData() {
+ const now = new Date().toISOString();
+ const data = await getJSON<CostResponse>("/api/v1/cost").catch(() =>
+ costFallback(now),
+ );
+ return <CostScreen data={data} />;
+}
+```
+
+> Will not typecheck until Step 3 changes `CostScreen`'s signature. The full gate is in Task 6.
+
+- [ ] **Step 3: Convert `CostScreen` to consume `data` and strip embellishments**
+
+In `components/cockpit/screens/cost.tsx`:
+- Remove `import { AIWF_DATA } from "@/lib/data/mock"`, `import { sparkSeries } from "@/lib/rng"`, the `Spark` import (no longer used), and `const D = AIWF_DATA`.
+- Add `import type { CostResponse } from "@shared/contracts";`.
+- Signature → `export function CostScreen({ data }: { data: CostResponse })`.
+- KPIs: `total = data.totals.totalTokenCost`; tokens = `data.totals.totalTokens`; "Cost / run avg" = `$${data.totals.costPerRun.toFixed(2)}`. **Remove** the "Projection · EoM" KPI tile, the `of $1,200 budget` sub, and all `delta`/`deltaTone` props (no source).
+- Header: **remove** the `` and the `Export CSV` button.
+- Area chart: feed `data.daily.map(d => d.cost)` and labels `data.daily.map(d => d.date)` (format the ISO date to a short label in-screen); **remove** the inner Cost/Tokens `CkTabs` action.
+- Donut: shares computed in-screen from `byModel` — `const totalCost = data.byModel.reduce((a,m)=>a+m.cost,0); shares = data.byModel.map(m => totalCost ? m.cost/totalCost : 0)`; center = `"$" + Math.round(total)`.
+- Per-model table: map `data.byModel` → columns `{ m.model, m.tokens, m.cost, share }`. **Remove** the `Vendor` column (not in contract) and the `Trend`/`Spark` column.
+- Per-workflow table: map `data.byWorkflow` (already aggregated) → `{ w.name, w.taskId, w.runs, w.tokens, w.cost, w.costPerRun }`. **Remove** the in-component `tokens = runs24h*2400`/`perRun` derivations, the `primary` chip / `gateway` line (not in contract), and the `Trend`/`Spark` column. Header label can stay "Per-workflow breakdown" (rows are per task — see spec mapping note).
+
+- [ ] **Step 4: Verify no mock/embellishment refs remain**
+
+Run: `grep -nE "\bD\.|AIWF_DATA|sparkSeries|Spark|COST_BY_MODEL|HOURS24|Export CSV|deltaTone|By actor" apps/dashboard/components/cockpit/screens/cost.tsx`
+Expected: no matches.
+
+---
+
+### Task 6: Rewrite the route + full verification
+
+**Files:**
+- Modify: `apps/dashboard/app/(cockpit)/cost/page.tsx`
+
+- [ ] **Step 1: Replace the page with the Suspense + server-component pattern**
+
+```tsx
+// apps/dashboard/app/(cockpit)/cost/page.tsx — Cost & usage ("/cost")
+import { Suspense } from "react";
+import { CostData } from "@/app/cost-data";
+import { CostSkeleton } from "@/app/cost-skeleton";
+
+export default function CostPage() {
+ return (
+   <Suspense fallback={<CostSkeleton />}>
+     <CostData />
+   </Suspense>
+ );
+}
+```
+
+- [ ] **Step 2: Typecheck both apps**
+
+Run: `cd apps/worker && npx tsc --noEmit && cd ../dashboard && npx tsc --noEmit`
+Expected: PASS, no errors.
+
+- [ ] **Step 3: Lint the changed dashboard files**
+
+Run: `cd apps/dashboard && npx next lint --file app/cost-data.tsx --file app/cost-skeleton.tsx --file "app/(cockpit)/cost/page.tsx" --file components/cockpit/screens/cost.tsx`
+Expected: no errors.
+
+- [ ] **Step 4: Visual check**
+
+Run: `cd apps/dashboard && pnpm dev`, open `http://localhost:3001/cost`.
+Expected:
+- With Arthur configured + traces present: real spend, token totals, per-model donut/table, per-task table, and per-day spend chart render.
+- With Arthur unconfigured (env unset) or unreachable: zero/empty state — KPIs `$0.00`/`0`, empty tables, empty chart — no crash.
+
+- [ ] **Step 5: Commit (ONLY if the user asks)**
+
+```bash
+git add apps/shared/contracts/api.ts \
+ apps/worker/src/sandbox/arthur-client.ts apps/worker/src/sandbox/arthur-client.test.ts \
+ apps/worker/src/lib/overview/collect-cost.ts apps/worker/src/lib/overview/collect-cost.test.ts apps/worker/src/routes/api/v1/cost.get.ts \
+ apps/dashboard/lib/api/fallbacks.ts \
+ apps/dashboard/app/cost-data.tsx apps/dashboard/app/cost-skeleton.tsx \
+ "apps/dashboard/app/(cockpit)/cost/page.tsx" \
+ apps/dashboard/components/cockpit/screens/cost.tsx
+git commit -m "feat: wire /cost to real Arthur usage data"
+```
+
+---
+
+## Self-Review
+
+**Spec coverage:**
+- Arthur read methods (`getTracesOverview`, `getTracesTimeseries`, `aggregateSpanTokensByModel`) → Task 1. ✓
+- `CostResponse` contract with field-level types → Task 2 (from spec). ✓
+- `collectCost` aggregator (totals / byWorkflow=per-task / byModel / merged daily) + `/api/v1/cost` route with Arthur-unconfigured degrade → Task 3. ✓
+- `costFallback` empty state → Task 4. ✓
+- `cost-data.tsx` + `cost-skeleton.tsx` + `CostScreen` swap with embellishments **removed** (budget, deltas, EoM projection, tabs, CSV, sparklines, vendor/primary/gateway) → Task 5. ✓
+- Thin Suspense page → Task 6. ✓
+- Arthur-down / unconfigured empty state → fallback (Task 4), route degrade (Task 3), verified (Task 6 Step 4). ✓
+- Single PR, no Redis/persistence/capture → no such tasks. ✓
+
+**Reuse check:** Read methods extend the existing `ArthurClient` (same `request` + Bearer auth + `fromTraceEndpoint`). Cost comes straight from Arthur's `*_token_cost` — no client-side pricing, the `pricing.ts`/`usage.ts` Slack path is untouched. Read path reuses `getJSON`/fallback/Suspense. Only new infra is one collector + one route — consistent with runs/overview. ✓
+
+**Placeholder scan:** No TBD/TODO; the only deferred items are the spec's flagged open questions (`bucket_size`, empty `task_ids`, by-model aggregation, task→workflow, window) and the explicitly-removed embellishments. ✓
+
+**Type consistency:** `CostResponse` imported from `@shared/contracts` in `cost-data.tsx` (Task 5), `fallbacks.ts` (Task 4), and the route (Task 3). `CostScreen` accepts `{ data: CostResponse }` (Task 5) matching the call site (Task 5 Step 2). Arthur response types (Task 1) feed `collectCost` (Task 3). ✓
diff --git a/docs/superpowers/plans/2026-06-08-evals-real-data.md b/docs/superpowers/plans/2026-06-08-evals-real-data.md
new file mode 100644
index 0000000..cff77e1
--- /dev/null
+++ b/docs/superpowers/plans/2026-06-08-evals-real-data.md
@@ -0,0 +1,421 @@
+# `/evals` Real-Data Conversion Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Convert the `/evals` dashboard page from mock data to live worker data, mirroring the overview/runs server-component fetch pattern. Because no evals list endpoint or eval-read path exists yet, this plan also builds the worker contract, route, and Arthur read path as a prerequisite.
+
+**Architecture:** New worker route `GET /api/v1/evals` → `EvalsResponse` (discriminated union, same `available` pattern as `EvalHealthResponse`). A new collector `collect-evals.ts` calls the **confirmed** Arthur read endpoint `POST /api/v1/traces/overview` via a new `getTracesOverview()` method on `ArthurClient`, aggregates the per-task overviews into a fleet `score`/`spansGraded`/`traceCount`, and degrades to `available: false` when Arthur is unconfigured, unreachable, or nothing is graded. On the dashboard, a thin server route (`page.tsx`) wraps a server component (`evals-data.tsx`) in `<Suspense>`; that component fetches via `getJSON`, falls back to `evalsFallback`, and passes `data` to the client presenter `EvalsScreen`. Identical in shape to `runs-data.tsx` / `RunsScreen`.
+
+**Scope note (read first):** Arthur's read API is confirmed (auth = same `Bearer GENAI_ENGINE_API_KEY`, org-scoped). Our trace path (`POST /api/v1/traces`) only produces `continuous_eval_success_rate`, `eval_count`, `trace_count`, and the three relevance/tool metric types — **and only if continuous evals are configured on the task.** The mock's rule families (hallucination/PII/toxicity/prompt-injection) come from Arthur's `/validate_*` write path, which **we do not call** — they are **out of scope** and dropped from this page. The first increment ships the **fleet aggregate** (score + graded count + window); the per-metric relevance/tool breakdown and trend/sparkline are optional follow-ons (Tasks 3b/3c).
+
+**Tech Stack:** Worker = h3 + Nitro routes, `@shared/contracts` types, Vitest. Dashboard = Next.js App Router, React 19, TypeScript. Dashboard has no test framework — verification is `npx tsc --noEmit`, `next lint`, and a manual browser check.
+
+**Spec:** `docs/superpowers/specs/2026-06-08-evals-real-data-design.md`
+
+**Required env vars (worker):** `GENAI_ENGINE_API_KEY`, `GENAI_ENGINE_TRACE_ENDPOINT` (both already declared optional in `apps/worker/env.ts`; the base read URL is derived from the trace endpoint via `ArthurClient.fromTraceEndpoint`). Reads need the `INFERENCE_READ` permission on the key. No new dashboard env vars — `/evals` reuses `WORKER_BASE_URL` / `WORKER_API_TOKEN` via `getJSON`.
+
+**Remaining open items (non-blocking — see spec Open Questions):** (1) `bucket_size` values for the optional timeseries call; (2) whether empty `task_ids` on `/traces/overview` means "all org tasks" (else enumerate via `/api/v2/tasks/search`); (3) whether continuous evals are actually configured on our live tasks (if not, the page legitimately shows "No graded evals"). None block the aggregate-only increment.
+
+**Note on commits:** This repo's owner stages commits manually. Do NOT commit unless the user explicitly asks. The final task lists the commit command for when they do.
+
+---
+
+### Task 1: Add the `EvalsResponse` contract
+
+**Files:**
+- Modify: `apps/shared/contracts/api.ts`
+
+- [ ] **Step 1: Add `EvalMetricRow` and `EvalsResponse`**
+
+Append after the existing `EvalHealthResponse` union:
+
+```ts
+export interface EvalMetricRow {
+ metric: string;
+ metricType: "QueryRelevance" | "ResponseRelevance" | "ToolSelection";
+ value: number;
+ status: "pass" | "warn" | "fail";
+ axis: "quality";
+ trend?: number | null; // only if timeseries wired (Task 3c)
+ spark?: number[]; // only if timeseries wired (Task 3c)
+}
+
+export type EvalsResponse =
+ | {
+ available: true;
+ generatedAt: string;
+ windowHours: number;
+ score: number; // continuous_eval_success_rate × 100, fleet-wide
+ spansGraded: number; // Σ eval_count
+ traceCount: number; // Σ trace_count
+ rows: EvalMetricRow[]; // [] in the aggregate-only first cut
+ }
+ | { available: false; generatedAt: string; reason: string };
+```
+
+- [ ] **Step 2: Typecheck shared**
+
+Run: `cd apps/shared && npx tsc --noEmit`
+Expected: PASS.
+
+---
+
+### Task 2: Add the dashboard fallback
+
+**Files:**
+- Modify: `apps/dashboard/lib/api/fallbacks.ts`
+
+- [ ] **Step 1: Import the type and add the fallback**
+
+Add `EvalsResponse` to the existing `@shared/contracts` import block, then add:
+
+```ts
+export function evalsFallback(now: string): EvalsResponse {
+ return { available: false, generatedAt: now, reason: "Worker unavailable." };
+}
+```
+
+- [ ] **Step 2: Typecheck dashboard**
+
+Run: `cd apps/dashboard && npx tsc --noEmit`
+Expected: PASS (the new export is unused so far, but valid).
+
+---
+
+### Task 3: Build the Arthur read path + collector (fleet aggregate)
+
+This is the first, shippable increment: fleet `score` / `spansGraded` / `traceCount`, `rows: []`. The per-metric breakdown (3b) and trend/sparkline (3c) are optional follow-ons below.
+
+**Files:**
+- Modify: `apps/worker/src/sandbox/arthur-client.ts` (add a read method)
+- Create: `apps/worker/src/lib/overview/collect-evals.ts`
+- Create: `apps/worker/src/lib/overview/collect-evals.test.ts`
+
+- [ ] **Step 1: Add `getTracesOverview()` to `ArthurClient`**
+
+Add a method reusing the existing private `request` helper and bearer auth:
+
+```ts
+interface TraceOverview {
+ task_id: string;
+ trace_count: number;
+ trace_token_count: number;
+ trace_token_cost: number;
+ eval_count: number;
+ continuous_eval_success_rate: number;
+ last_active: string;
+}
+interface TraceOverviewListResponse { count: number; overviews: TraceOverview[]; }
+
+async getTracesOverview(opts: {
+ taskIds: string[]; // may be empty — see Open Q2
+ startTime: string; // ISO
+ endTime: string; // ISO
+}): Promise<TraceOverviewListResponse> {
+ return this.request<TraceOverviewListResponse>("/api/v1/traces/overview", {
+ method: "POST",
+ body: JSON.stringify({
+ task_ids: opts.taskIds,
+ start_time: opts.startTime,
+ end_time: opts.endTime,
+ }),
+ });
+}
+```
+
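+Keep the raw Arthur types local to the client; do not leak them into `@shared/contracts`. Note: the `/cost` plan adds a `getTracesOverview` to the same `ArthurClient` with positional arguments, returning the bare `overviews` array and typing `trace_token_cost` as `number | null` — reconcile the two into a single method signature and overview type before both plans land.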
+
+> **Task-id enumeration (Open Q2):** if `task_ids: []` is confirmed to mean "all org tasks", pass `[]`. Otherwise enumerate the org's tasks first. The client already searches tasks via `POST /api/v2/tasks/search` (`findTicketTasks`); add a thin `listAllTasks()` if a full enumeration is needed, or have the collector accept a pre-resolved `taskIds`. Default the collector to receive `taskIds` so the route owns the enumeration policy.
+
+- [ ] **Step 2: Write `collect-evals.ts`**
+
+Mirror `collect-runs.ts`/`collect-kpis.ts` — accept an injected fetcher and resolve to the `available: true` fields minus `generatedAt`:
+
+```ts
+export interface CollectEvalsOptions {
+ fetchOverview: (o: { taskIds: string[]; startTime: string; endTime: string })
+ => Promise<{ overviews: TraceOverview[] }>;
+ taskIds: string[];
+ windowHours: number;
+ now: Date;
+}
+
+// Returns { windowHours, score, spansGraded, traceCount, rows }. When nothing is
+// graded, spansGraded and score are both 0 — the route maps that to available:false.
+export async function collectEvals(opts: CollectEvalsOptions) {
+ const endTime = opts.now.toISOString();
+ const startTime = new Date(opts.now.getTime() - opts.windowHours * 3_600_000).toISOString();
+ const { overviews } = await opts.fetchOverview({ taskIds: opts.taskIds, startTime, endTime });
+
+ const spansGraded = sum(overviews, o => o.eval_count);
+ const traceCount = sum(overviews, o => o.trace_count);
+ // weight success rate by eval_count; 0 graded → caller emits unavailable
+ const score = spansGraded === 0
+ ? 0
+ : (sum(overviews, o => o.continuous_eval_success_rate * o.eval_count) / spansGraded) * 100;
+
+ return { windowHours: opts.windowHours, score, spansGraded, traceCount, rows: [] };
+}
+```
+
+The injected-fetcher boundary keeps the Arthur shape isolated and unit-testable.
+
+- [ ] **Step 3: Unit test the collector**
+
+In `collect-evals.test.ts`, feed stubbed `overviews` and assert: `spansGraded`/`traceCount` are summed, `score` is the eval-count-weighted success rate × 100, and `spansGraded === 0` yields `score === 0` (route turns this into `available:false`). Mirror the style of the existing `collect-*` tests.
+
+Run: `cd apps/worker && npx vitest run src/lib/overview/collect-evals.test.ts`
+Expected: PASS.
+
+- [ ] **Step 3b (optional follow-on): per-metric relevance/tool breakdown**
+
+Only the three Arthur metric types exist on our path. To populate `rows`: list spans (`GET /api/v1/traces/spans`), fetch each span's `metric_results` (`GET /api/v1/traces/spans/{span_id}` → `SpanWithMetricsResponse.metric_results`), parse the opaque `details` JSON string per `metric_type` (e.g. relevance → `llm_relevance_score`), aggregate per metric type, and apply a worker-owned pass/warn/fail threshold. Map each to `EvalMetricRow { metric, metricType, value, status, axis: "quality" }`. Add this behind the same collector with extra fetchers; keep `rows: []` until implemented.
+
+- [ ] **Step 3c (optional follow-on): trend/sparkline**
+
+Wire `POST /api/v1/traces/overview/timeseries` (single task per call) to populate `EvalMetricRow.trend`/`spark` from `continuous_eval_success_rate` buckets. **Confirm `bucket_size` allowed values first (Open Q1).** Until wired, omit `trend`/`spark` entirely — no synthetic series.
+
+---
+
+### Task 4: Add the worker route `GET /api/v1/evals`
+
+**Files:**
+- Create: `apps/worker/src/routes/api/v1/evals.get.ts`
+
+- [ ] **Step 1: Create the route**
+
+Mirror `apps/worker/src/routes/api/v1/runs.get.ts`:
+
+```ts
+import { defineEventHandler, setResponseHeader } from "h3";
+import type { EvalsResponse } from "@shared/contracts";
+import { env } from "../../../../env.js";
+import { ArthurClient } from "../../../sandbox/arthur-client.js";
+import { collectEvals } from "../../../lib/overview/collect-evals.js";
+import { logger } from "../../../lib/logger.js";
+
+const WINDOW_HOURS = 24;
+
+export default defineEventHandler(async (event): Promise<EvalsResponse> => {
+ setResponseHeader(
+ event,
+ "Cache-Control",
+ "private, max-age=15, stale-while-revalidate=60",
+ );
+ const generatedAt = new Date().toISOString();
+
+ if (!env.GENAI_ENGINE_API_KEY || !env.GENAI_ENGINE_TRACE_ENDPOINT) {
+ return { available: false, generatedAt, reason: "Arthur GenAI Engine not configured." };
+ }
+
+ try {
+ const client = ArthurClient.fromTraceEndpoint(
+ env.GENAI_ENGINE_TRACE_ENDPOINT,
+ env.GENAI_ENGINE_API_KEY,
+ );
+ // Open Q2: pass [] if empty === all org tasks; else enumerate via tasks/search.
+ const taskIds: string[] = [];
+ const { windowHours, score, spansGraded, traceCount, rows } = await collectEvals({
+ fetchOverview: (o) => client.getTracesOverview(o),
+ taskIds,
+ windowHours: WINDOW_HOURS,
+ now: new Date(),
+ });
+ if (spansGraded === 0) {
+ return { available: false, generatedAt, reason: "No graded evals in the last 24h." };
+ }
+ return { available: true, generatedAt, windowHours, score, spansGraded, traceCount, rows };
+ } catch (err) {
+ logger.warn({ err: (err as Error).message }, "evals_list_failed");
+ return { available: false, generatedAt, reason: "Eval grading not wired up yet." };
+ }
+});
+```
+
+- [ ] **Step 2: Typecheck worker**
+
+Run: `cd apps/worker && npx tsc --noEmit`
+Expected: PASS.
+
+- [ ] **Step 3: Hit the route**
+
+Run the worker locally and `curl -H "Authorization: Bearer $WORKER_API_TOKEN" localhost:<port>/api/v1/evals` (substitute the worker's local port).
+Expected:
+- Arthur unconfigured → `{ available: false, ..., reason: "Arthur GenAI Engine not configured." }`.
+- Configured but nothing graded → `{ available: false, ..., reason: "No graded evals in the last 24h." }`.
+- Configured + graded → `available: true` with `score` / `spansGraded` / `traceCount` (and `rows` once 3b is built).
+
+---
+
+### Task 5: Add the loading skeleton
+
+**Files:**
+- Create: `apps/dashboard/app/evals-skeleton.tsx`
+
+- [ ] **Step 1: Create the skeleton**
+
+Mirror `apps/dashboard/app/overview-skeleton.tsx` — header + one card-shaped block (the Quality group):
+
+```tsx
+// apps/dashboard/app/evals-skeleton.tsx
+function Block({ className = "" }: { className?: string }) {
+ return ;
+}
+
+export function EvalsSkeleton() {
+ return (
+
+ );
+}
+```
+
+- [ ] **Step 2: Typecheck**
+
+Run: `cd apps/dashboard && npx tsc --noEmit`
+Expected: PASS.
+
+---
+
+### Task 6: Add the server data component
+
+**Files:**
+- Create: `apps/dashboard/app/evals-data.tsx`
+
+- [ ] **Step 1: Create the server component**
+
+Mirror `apps/dashboard/app/runs-data.tsx`:
+
+```tsx
+import { getJSON } from "@/lib/api/server";
+import { EvalsScreen } from "@/components/cockpit/screens/evals";
+import type { EvalsResponse } from "@shared/contracts";
+import { evalsFallback } from "@/lib/api/fallbacks";
+
+export async function EvalsData() {
+ const now = new Date().toISOString();
+ const data = await getJSON<EvalsResponse>("/api/v1/evals").catch(() =>
+ evalsFallback(now),
+ );
+ return <EvalsScreen data={data} />;
+}
+```
+
+> This will not typecheck until Task 7 changes `EvalsScreen`'s signature. Expected; full typecheck gate is Task 8.
+
+---
+
+### Task 7: Convert `EvalsScreen` to consume real data
+
+**Files:**
+- Modify: `apps/dashboard/components/cockpit/screens/evals.tsx`
+
+- [ ] **Step 1: Replace imports and signature**
+
+- Remove `import { AIWF_DATA } from "@/lib/data/mock"` and `const D = AIWF_DATA`.
+- Add `import type { EvalsResponse, EvalMetricRow } from "@shared/contracts"`.
+- Change `export function EvalsScreen()` → `export function EvalsScreen({ data }: { data: EvalsResponse })`.
+
+Also remove `import { jitterSeries } from "@/lib/rng"` (synthetic sparklines are dropped) and the `groups`/`accents`/`titles` axis-map scaffolding — only the single Quality group remains.
+
+- [ ] **Step 2: Handle the unavailable branch**
+
+When `data.available === false`, render the existing header block (eyebrow + title) but replace the chip with a neutral one and the metric cards with a single panel showing `data.reason`. Mirror the reason path in `EvalHealthKPI` (`overview.tsx`). This covers unconfigured, "no graded evals", and worker-down.
+
+- [ ] **Step 3: Drive the available branch**
+
+- Drive the live chip from `data.spansGraded.toLocaleString("en-US")` + `data.windowHours` instead of the hardcoded `12,408 spans · 24h`; surface `data.score` (e.g. as the headline number).
+- Render a single **Quality** `CkCard` over `data.rows` (all `axis: "quality"`). If `data.rows` is empty (aggregate-only first cut), render just the score + graded-count header, no per-metric grid.
+- Per row: show `metric`, formatted `value`, and the pass/warn/fail `CkChip`.
+- Trend/sparkline: render `e.trend` / the `e.spark` sparkline **only when present**; otherwise render neither. No `jitterSeries`.
+
+- [ ] **Step 4: Verify no mock/jitter references remain**
+
+Run: `grep -nE "AIWF_DATA|\bD\.|jitterSeries" apps/dashboard/components/cockpit/screens/evals.tsx`
+Expected: no matches.
+
+---
+
+### Task 8: Rewrite the route to the server pattern + verify
+
+**Files:**
+- Modify: `apps/dashboard/app/(cockpit)/evals/page.tsx`
+
+- [ ] **Step 1: Replace the page with the Suspense + server-component pattern**
+
+```tsx
+// apps/dashboard/app/(cockpit)/evals/page.tsx — Arthur evals ("/evals")
+import { Suspense } from "react";
+
+import { EvalsData } from "@/app/evals-data";
+import { EvalsSkeleton } from "@/app/evals-skeleton";
+
+export default function EvalsPage() {
+ return (
+   <Suspense fallback={<EvalsSkeleton />}>
+     <EvalsData />
+   </Suspense>
+ );
+}
+```
+
+- [ ] **Step 2: Typecheck the whole app**
+
+Run: `cd apps/dashboard && npx tsc --noEmit` and `cd apps/worker && npx tsc --noEmit`
+Expected: PASS, no errors.
+
+- [ ] **Step 3: Lint the changed dashboard files**
+
+Run: `cd apps/dashboard && npx next lint --file app/evals-data.tsx --file app/evals-skeleton.tsx --file "app/(cockpit)/evals/page.tsx" --file components/cockpit/screens/evals.tsx`
+Expected: no errors.
+
+- [ ] **Step 4: Visual check**
+
+Run: `cd apps/dashboard && pnpm dev` (port 3001), open `http://localhost:3001/evals`.
+Expected:
+- With the worker unreachable or Arthur unconfigured: header chrome renders + a single reason panel ("Worker unavailable." / "Arthur GenAI Engine not configured."), no crash.
+- With Arthur configured but nothing graded (`eval_count = 0`): the "No graded evals in the last 24h." panel.
+- With Arthur configured + graded: the real fleet `score` + spans-graded count over the 24h window render; the Quality breakdown appears once Task 3b is built (else just the aggregate header). No sparklines unless Task 3c is wired.
+
+- [ ] **Step 5: Commit (ONLY if the user asks)**
+
+```bash
+git add apps/shared/contracts/api.ts \
+ apps/worker/src/sandbox/arthur-client.ts \
+ apps/worker/src/lib/overview/collect-evals.ts \
+ apps/worker/src/lib/overview/collect-evals.test.ts \
+ apps/worker/src/routes/api/v1/evals.get.ts \
+ apps/dashboard/lib/api/fallbacks.ts \
+ apps/dashboard/app/evals-data.tsx \
+ apps/dashboard/app/evals-skeleton.tsx \
+ "apps/dashboard/app/(cockpit)/evals/page.tsx" \
+ apps/dashboard/components/cockpit/screens/evals.tsx
+git commit -m "feat: wire /evals to real Arthur eval data"
+```
+
+---
+
+## Self-Review
+
+**Spec coverage:**
+- `EvalsResponse` / `EvalMetricRow` contract (mapped to `TraceOverviewResponse`; rule families dropped) → Task 1. ✓
+- Worker Arthur read path `getTracesOverview()` + `collect-evals.ts` (+ test) → Task 3; optional breakdown/timeseries → 3b/3c. ✓
+- Worker route `GET /api/v1/evals` with config-check, `eval_count=0` degrade, error degrade → Task 4. ✓
+- `evalsFallback` → Task 2. ✓
+- `evals-data.tsx` server component → Task 6. ✓
+- `evals-skeleton.tsx` (single Quality block) → Task 5. ✓
+- `EvalsScreen` swap (signature, single Quality group, score + spansGraded chip, optional rows/trend/spark, drop `jitterSeries`) → Task 7. ✓
+- `page.tsx` server route → Task 8. ✓
+- Unavailable / no-graded / worker-down states → Tasks 2, 4, 7; verified in Task 8 Step 4. ✓
+- Out-of-scope (New eval button, overview tile, per-span drill-down, synthetic sparklines, `/validate_*` rule families) → not in any task. ✓
+
+**Confirmed dependency:** Arthur read API is ground-truthed (`POST /api/v1/traces/overview`, bearer auth, org-scoped). First increment ships fleet aggregate; per-metric breakdown (3b) and trend (3c) are optional follow-ons. Non-blocking open items (bucket_size, empty-task_ids semantics, whether continuous evals are configured live) noted at top and at their tasks. ✓
+
+**Placeholder scan:** No TBD/TODO; remaining unknowns are the three non-blocking open items, explicitly flagged. ✓
+
+**Type consistency:** `EvalsResponse` imported from `@shared/contracts` in Tasks 2, 4, 6, 7. `EvalsScreen` accepts `{ data: EvalsResponse }` (Task 7) — matches the call site in Task 6. `collectEvals` returns the `available: true` fields (`windowHours`/`score`/`spansGraded`/`traceCount`/`rows`) the route spreads in Task 4. `EvalsSkeleton` (Task 5) matches the import in Task 8. ✓
diff --git a/docs/superpowers/plans/2026-06-08-prompts-real-data.md b/docs/superpowers/plans/2026-06-08-prompts-real-data.md
new file mode 100644
index 0000000..267b1aa
--- /dev/null
+++ b/docs/superpowers/plans/2026-06-08-prompts-real-data.md
@@ -0,0 +1,690 @@
+# `/prompts` Real-Data Conversion Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Convert the `/prompts` dashboard page from mock data to live worker data, mirroring the `/runs` server-component fetch pattern. Read-only display of the three workflow prompts the worker actually resolves at runtime, **including real Arthur version history**.
+
+**Architecture:** New worker route `GET /api/v1/prompts` returns a typed `PromptsResponse` built from the same resolution logic the durable `loadPrompts()` step uses (Arthur `production` tags with in-code fallbacks), plus each prompt's real Arthur version-history metadata. A second route `GET /api/v1/prompts/[name]/versions/[version]` returns a single historical version's body on demand. Thin server route (`page.tsx`) wraps a server component (`prompts-data.tsx`) in `<Suspense>`; the server component fetches the list via `getJSON`, falls back to an empty `PromptsResponse`, and passes `data` to the client presenter `PromptsScreen`. The client fetches historical version bodies lazily through a same-origin Next route handler that proxies the worker (keeps the bearer token server-side). Shape mirrors `runs.get.ts` / `runs-data.tsx` / `RunsScreen`.
+
+**Tech stack:** h3 worker (`@apps/worker`), Next.js App Router dashboard (`@apps/dashboard`), shared `@shared/contracts`. Worker has vitest tests; dashboard has none — dashboard verification is `npx tsc --noEmit`, `next lint`, and a manual browser check.
+
+**Spec:** `docs/superpowers/specs/2026-06-08-prompts-real-data-design.md`
+
+**Scope decisions baked in (confirmed by user + Arthur API ground-truthing):**
+- Read-only display. No write/edit endpoints. Action buttons left inert.
+- **Real Arthur version history is in scope** (version-list metadata + on-demand bodies). Arthur's version list is metadata only, so per-version eval/halluc/p95/cost metrics and the A/B text diff have **no source** — that markup is **removed**, not stubbed with placeholders.
+- Tags are real (`AgenticPromptVersionResponse.tags`); the `production` badge and tag filter stay, backed by data.
+- Worker route reuses a shared extracted `resolvePrompts()` helper (option A) called by both `loadPrompts()` and the route. Confirmed OK to touch `prompts-step.ts`.
+- Body fetch: production body eager (already resolved); historical bodies lazy via the on-demand route.
+
+**Note on commits:** This repo's owner stages commits manually. Do NOT commit unless the user explicitly asks. The final task lists the command for when they do.
+
+---
+
+### Task 1: Add the shared `PromptVersion` + `PromptDef` entities + response contracts
+
+**Files:**
+- Modify: `apps/shared/contracts/domain.ts`
+- Modify: `apps/shared/contracts/api.ts`
+
+- [ ] **Step 1: Add `PromptVersion` + `PromptDef` to `domain.ts`**
+
+```ts
+/** One Arthur version of a named prompt (metadata; body fetched on demand). */
+export interface PromptVersion {
+ /** Arthur integer version number. */
+ version: number;
+ /** ISO timestamp the version was created. */
+ createdAt: string;
+ /** Real Arthur tags on this version, e.g. ["production"]. */
+ tags: string[];
+ modelProvider: string;
+ modelName: string;
+ numMessages: number;
+ numTools: number;
+ /** Body text. Present only for the production version (eager); other
+ * versions are fetched on demand. */
+ body?: string;
+}
+
+/** A workflow phase prompt as resolved by the worker at runtime. */
+export interface PromptDef {
+ /** Stable Arthur/fallback key: "research-plan" | "implement" | "review". */
+ name: string;
+ /** Human label for the workflow phase, e.g. "Research & Plan". */
+ phase: string;
+ /** Resolved production prompt body (Arthur production tag, or in-code fallback). */
+ body: string;
+ /** Where the resolved `body` came from. */
+ source: "arthur" | "fallback";
+ /** Model the agent runs this prompt with (env-derived). */
+ model: string;
+ /** Real Arthur version history, newest first. Empty when source is "fallback". */
+ versions: PromptVersion[];
+}
+```
+
+- [ ] **Step 2: Add `PromptsResponse` + `PromptVersionBodyResponse` to `api.ts`**
+
+Add `PromptDef` to the existing `import type { ... } from "./domain.js"` line (note: `PromptVersion` is only referenced transitively through `PromptDef`, so it need not be imported in `api.ts`), then append:
+
+```ts
+export interface PromptsResponse {
+ generatedAt: string;
+ /** `false` when the worker can't resolve prompts (degrades to empty list). */
+ available: boolean;
+ /** Whether Arthur is configured (key + endpoint + task id all set). When
+ * false, every prompt's `source` is "fallback" and `versions` is empty. */
+ arthurEnabled: boolean;
+ rows: PromptDef[];
+ total: number;
+}
+
+/** On-demand body for a single historical Arthur version. */
+export interface PromptVersionBodyResponse {
+ generatedAt: string;
+ available: boolean;
+ body: string | null;
+}
+```
+
+- [ ] **Step 3: Typecheck shared**
+
+Run: `pnpm -F @apps/shared exec tsc --noEmit` (or repo-root `pnpm typecheck` if that's the established command — match how the runs plan was verified).
+Expected: PASS.
+
+---
+
+### Task 2: Add Arthur version-list + by-version read methods to `ArthurClient`
+
+**Files:**
+- Modify: `apps/worker/src/sandbox/arthur-client.ts`
+- Modify: `apps/worker/src/sandbox/arthur-client.test.ts` (add coverage for the new methods, matching the file's existing fetch-mock style)
+
+**Context:** `ArthurClient` already has `getPromptByTag` (fetches a tagged version's body). Add two read methods, ground-truthed against `arthur-ai/arthur-engine` `main`. Both reuse the existing `this.baseUrl` + bearer header convention.
+
+- [ ] **Step 1: Add types + `listPromptVersions`**
+
+```ts
+export interface ArthurPromptVersion {
+ version: number;
+ created_at: string;
+ deleted_at: string | null;
+ model_provider: string;
+ model_name: string;
+ tags: string[];
+ num_messages: number;
+ num_tools: number;
+}
+interface AgenticPromptVersionListResponse {
+ count: number;
+ versions: ArthurPromptVersion[];
+}
+
+/** List version metadata for a named prompt (newest first). First page only. */
+async listPromptVersions(taskId: string, name: string): Promise<ArthurPromptVersion[]> {
+ const path = `/api/v1/tasks/${encodeURIComponent(taskId)}/prompts/${encodeURIComponent(name)}/versions`;
+ const res = await fetch(`${this.baseUrl}${path}`, {
+ method: "GET",
+ headers: { Authorization: `Bearer ${this.apiKey}`, "ngrok-skip-browser-warning": "true" },
+ });
+ if (res.status === 404) return [];
+ if (!res.ok) {
+ const body = await res.text().catch(() => "");
+ throw new Error(`Arthur GET ${path} → ${res.status}: ${body.slice(0, 300)}`);
+ }
+ const data = (await res.json()) as AgenticPromptVersionListResponse;
+ return [...data.versions].sort((a, b) => b.version - a.version);
+}
+```
+
+> Assumption (open Q in spec): first page only — sufficient for the timeline. If deep history is required later, add pagination params here.
+
+- [ ] **Step 2: Add `getPromptVersionBody`**
+
+`getPromptByTag` already parses the by-version endpoint's `AgenticPrompt.messages[0].content` shape (passing a tag as `{prompt_version}`). Generalize it to accept any version specifier (integer / `latest` / ISO datetime / tag):
+
+```ts
+/** Fetch the body of a specific version (int | "latest" | ISO datetime | tag). Null on 404. */
+async getPromptVersionBody(taskId: string, name: string, version: number | string): Promise<string | null> {
+ const path = `/api/v1/tasks/${encodeURIComponent(taskId)}/prompts/${encodeURIComponent(name)}/versions/${encodeURIComponent(String(version))}`;
+ const res = await fetch(`${this.baseUrl}${path}`, {
+ method: "GET",
+ headers: { Authorization: `Bearer ${this.apiKey}`, "ngrok-skip-browser-warning": "true" },
+ });
+ if (res.status === 404) return null;
+ if (!res.ok) {
+ const body = await res.text().catch(() => "");
+ throw new Error(`Arthur GET ${path} → ${res.status}: ${body.slice(0, 300)}`);
+ }
+ const prompt = (await res.json()) as AgenticPrompt;
+ return prompt.messages?.[0]?.content ?? null;
+}
+```
+
+> `getPromptByTag` can optionally be refactored to delegate to `getPromptVersionBody(taskId, name, tag)` to remove duplication — low risk, but keep it a separate optional cleanup so the existing `loadPrompts` path is untouched if you skip it.
+
+- [ ] **Step 3: Typecheck + test the worker**
+
+Run: `pnpm -F @apps/worker exec tsc --noEmit` then `pnpm -F @apps/worker exec vitest run src/sandbox/arthur-client.test.ts`
+Expected: PASS, including the new method tests.
+
+---
+
+### Task 3: Extract a reusable `resolvePrompts()` helper in the worker
+
+**Files:**
+- Create: `apps/worker/src/lib/prompts/resolve.ts` (or `apps/worker/src/lib/resolve-prompts.ts` — match existing `lib/` layout)
+- Modify: `apps/worker/src/workflows/prompts-step.ts`
+
+**Context:** `loadPrompts()` (`workflows/prompts-step.ts`) is a `"use step"` durable step returning `{ research, implement, review }`. The Arthur-vs-fallback resolution inside it is what we want to share. Extract the *pure* logic (no `"use step"`) so a plain h3 route can call it too, and have it also collect real version history. `loadPrompts()` then maps the helper's result back to its `{ research, implement, review }` shape so the workflow contract is unchanged.
+
+- [ ] **Step 1: Create the helper (resolves production body + version history per prompt)**
+
+```ts
+// apps/worker/src/lib/prompts/resolve.ts
+import type { PromptVersion } from "@shared/contracts";
+import { env } from "../../../env.js";
+import { logger } from "../logger.js";
+import { PROMPT_FALLBACKS, PROMPT_NAMES, type PromptName } from "../prompts.js";
+
+const PHASE_LABEL: Record<PromptName, string> = {
+ "research-plan": "Research & Plan",
+ "implement": "Implement",
+ "review": "Review",
+};
+
+export interface ResolvedPrompt {
+ name: PromptName;
+ phase: string;
+ body: string;
+ source: "arthur" | "fallback";
+ model: string;
+ versions: PromptVersion[];
+}
+
+export interface ResolvePromptsResult {
+ arthurEnabled: boolean;
+ prompts: ResolvedPrompt[];
+}
+
+export async function resolvePrompts(): Promise<ResolvePromptsResult> {
+ const model = env.AGENT_KIND === "codex" ? env.CODEX_MODEL : env.CLAUDE_MODEL;
+ const arthurEnabled =
+ !!env.GENAI_ENGINE_API_KEY &&
+ !!env.GENAI_ENGINE_TRACE_ENDPOINT &&
+ !!env.GENAI_ENGINE_PROMPT_TASK_ID;
+
+ const base = (
+ name: PromptName, body: string, source: "arthur" | "fallback", versions: PromptVersion[] = [],
+ ): ResolvedPrompt => ({ name, phase: PHASE_LABEL[name], body, source, model, versions });
+
+ if (!arthurEnabled) {
+ logger.info({ source: "fallback", reason: "arthur_prompts_disabled" }, "prompts_resolved");
+ return {
+ arthurEnabled,
+ prompts: PROMPT_NAMES.map((n) => base(n, PROMPT_FALLBACKS[n], "fallback")),
+ };
+ }
+
+ const { ArthurClient } = await import("../../sandbox/arthur-client.js");
+ const client = ArthurClient.fromTraceEndpoint(
+ env.GENAI_ENGINE_TRACE_ENDPOINT!,
+ env.GENAI_ENGINE_API_KEY!,
+ );
+ const taskId = env.GENAI_ENGINE_PROMPT_TASK_ID!;
+
+ async function one(name: PromptName): Promise<ResolvedPrompt> {
+ try {
+ const [body, rawVersions] = await Promise.all([
+ client.getPromptByTag(taskId, name, "production"),
+ client.listPromptVersions(taskId, name).catch(() => []),
+ ]);
+ const versions: PromptVersion[] = rawVersions.map((v) => ({
+ version: v.version,
+ createdAt: v.created_at,
+ tags: v.tags,
+ modelProvider: v.model_provider,
+ modelName: v.model_name,
+ numMessages: v.num_messages,
+ numTools: v.num_tools,
+ }));
+ // Attach the eager production body to its matching version entry.
+ const prodVersion = versions.find((v) => v.tags.includes("production"));
+ if (prodVersion && body !== null) prodVersion.body = body;
+
+ if (body === null) {
+ logger.info({ name, source: "fallback", reason: "arthur_prompt_missing" }, "prompts_resolved");
+ return base(name, PROMPT_FALLBACKS[name], "fallback", versions);
+ }
+ logger.info({ name, source: "arthur", versions: versions.length }, "prompts_resolved");
+ return base(name, body, "arthur", versions);
+ } catch (err) {
+ logger.warn({ name, source: "fallback", err: (err as Error).message }, "prompts_resolved");
+ return base(name, PROMPT_FALLBACKS[name], "fallback");
+ }
+ }
+
+ const prompts = await Promise.all(PROMPT_NAMES.map(one));
+ return { arthurEnabled, prompts };
+}
+```
+
+> Verify the import depth (`../../../env.js`, `../logger.js`, `../prompts.js`, `../../sandbox/arthur-client.js`) against the file's actual location before finalizing — adjust to wherever you place it. The originals in `prompts-step.ts` import `../../env.js`, `./lib/logger.js`, `./lib/prompts.js` from `workflows/`. `@shared/contracts` is the same alias the routes use.
+
+- [ ] **Step 2: Rewrite `loadPrompts()` to delegate to the helper**
+
+Keep the `"use step"` directive, `maxRetries = 0`, and the `{ research, implement, review }` return shape. Replace the body with a call to `resolvePrompts()` and a map by name:
+
+```ts
+export async function loadPrompts(): Promise<{ research: string; implement: string; review: string }> {
+ "use step";
+ const { resolvePrompts } = await import("../lib/prompts/resolve.js");
+ const { prompts } = await resolvePrompts();
+ const byName = Object.fromEntries(prompts.map((p) => [p.name, p.body]));
+ return {
+ research: byName["research-plan"],
+ implement: byName["implement"],
+ review: byName["review"],
+ };
+}
+loadPrompts.maxRetries = 0;
+```
+
+- [ ] **Step 3: Run the existing prompts-step tests**
+
+Run: `pnpm -F @apps/worker exec vitest run src/workflows/prompts-step.test.ts`
+Expected: PASS. The test mocks `../sandbox/arthur-client.js` and `../../env.js`; if the helper's import paths differ, update the test's mock paths to match (the behavior — fallbacks when disabled, Arthur when enabled — is unchanged).
+
+---
+
+### Task 4: Add the worker routes (`GET /api/v1/prompts` + on-demand version body)
+
+**Files:**
+- Create: `apps/worker/src/routes/api/v1/prompts.get.ts`
+- Create: `apps/worker/src/routes/api/v1/prompts/[name]/versions/[version].get.ts`
+
+- [ ] **Step 1: Create the list route (mirror `runs.get.ts`)**
+
+```ts
+import { defineEventHandler, setResponseHeader } from "h3";
+import type { PromptsResponse } from "@shared/contracts";
+import { resolvePrompts } from "../../../lib/prompts/resolve.js";
+import { logger } from "../../../lib/logger.js";
+
+export default defineEventHandler(async (event): Promise<PromptsResponse> => {
+ setResponseHeader(
+ event,
+ "Cache-Control",
+ "private, max-age=15, stale-while-revalidate=60",
+ );
+
+ const generatedAt = new Date().toISOString();
+ try {
+ const { arthurEnabled, prompts } = await resolvePrompts();
+ return {
+ generatedAt,
+ available: true,
+ arthurEnabled,
+ rows: prompts,
+ total: prompts.length,
+ };
+ } catch (err) {
+ logger.warn({ err: (err as Error).message }, "prompts_resolve_failed");
+ return { generatedAt, available: false, arthurEnabled: false, rows: [], total: 0 };
+ }
+});
+```
+
+> `ResolvedPrompt` is structurally assignable to `PromptDef` (same fields incl. `versions`). If TS complains about the `PromptName` vs `string` `name` field, widen via `rows: prompts as PromptDef[]`. Confirm the auth gate that protects `/api/v1/*` (`lib/api-auth.ts`) is applied route-table-wide (not per-file) — no extra wiring needed.
+
+- [ ] **Step 2: Create the on-demand version-body route (mirror `runs/[runId].get.ts`)**
+
+```ts
+// apps/worker/src/routes/api/v1/prompts/[name]/versions/[version].get.ts
+import { defineEventHandler, setResponseHeader, getRouterParam } from "h3";
+import type { PromptVersionBodyResponse } from "@shared/contracts";
+import { env } from "../../../../../../../env.js";
+import { PROMPT_NAMES, type PromptName } from "../../../../../../lib/prompts.js";
+import { logger } from "../../../../../../lib/logger.js";
+
+export default defineEventHandler(async (event): Promise<PromptVersionBodyResponse> => {
+ setResponseHeader(event, "Cache-Control", "private, max-age=15, stale-while-revalidate=60");
+ const generatedAt = new Date().toISOString();
+
+ const name = getRouterParam(event, "name") ?? "";
+ const version = getRouterParam(event, "version") ?? "";
+ const arthurEnabled =
+ !!env.GENAI_ENGINE_API_KEY && !!env.GENAI_ENGINE_TRACE_ENDPOINT && !!env.GENAI_ENGINE_PROMPT_TASK_ID;
+
+ if (!arthurEnabled || !PROMPT_NAMES.includes(name as PromptName) || !version) {
+ return { generatedAt, available: false, body: null };
+ }
+ try {
+ const { ArthurClient } = await import("../../../../../../sandbox/arthur-client.js");
+ const client = ArthurClient.fromTraceEndpoint(env.GENAI_ENGINE_TRACE_ENDPOINT!, env.GENAI_ENGINE_API_KEY!);
+ const body = await client.getPromptVersionBody(env.GENAI_ENGINE_PROMPT_TASK_ID!, name, version);
+ return { generatedAt, available: body !== null, body };
+ } catch (err) {
+ logger.warn({ name, version, err: (err as Error).message }, "prompt_version_body_failed");
+ return { generatedAt, available: false, body: null };
+ }
+});
+```
+
+> Verify the relative import depth for this nested route path against the repo's actual `tsconfig`/route layout — count segments from `routes/api/v1/prompts/[name]/versions/` back to `apps/worker/{env.ts,src/lib,src/sandbox}`. Adjust `../` counts accordingly (the `env.ts` lives at `apps/worker/env.ts`, not under `src/`). Confirm h3's file-based dynamic-segment convention uses `[name]`/`[version]` here the same way `runs/[runId].get.ts` does.
+
+- [ ] **Step 3: Typecheck the worker**
+
+Run: `pnpm -F @apps/worker exec tsc --noEmit`
+Expected: PASS.
+
+- [ ] **Step 4: Smoke the endpoints locally (optional but recommended)**
+
+Start the worker, then:
+`curl -s -H "Authorization: Bearer $WORKER_API_TOKEN" http://localhost:<port>/api/v1/prompts | jq`
+Expected: `{ available: true, arthurEnabled: <bool>, total: 3, rows: [3 prompts; each has body, source, model, versions[]] }`. With Arthur on, `versions` is non-empty and one entry carries `body`.
+`curl -s -H "Authorization: Bearer $WORKER_API_TOKEN" http://localhost:<port>/api/v1/prompts/research-plan/versions/1 | jq`
+Expected (Arthur on): `{ available: true, body: "..." }`; (Arthur off / missing): `{ available: false, body: null }`.
+
+---
+
+### Task 5: Add the dashboard fallback
+
+**Files:**
+- Modify: `apps/dashboard/lib/api/fallbacks.ts`
+
+- [ ] **Step 1: Add `promptsFallback`**
+
+Add `PromptsResponse` to the existing `import type { ... } from "@shared/contracts"`, then append:
+
+```ts
+export function promptsFallback(now: string): PromptsResponse {
+ return { generatedAt: now, available: false, arthurEnabled: false, rows: [], total: 0 };
+}
+```
+
+- [ ] **Step 2: Typecheck dashboard**
+
+Run: `cd apps/dashboard && npx tsc --noEmit`
+Expected: PASS (no new errors from this file).
+
+---
+
+### Task 6: Add the loading skeleton
+
+**Files:**
+- Create: `apps/dashboard/app/prompts-skeleton.tsx`
+
+- [ ] **Step 1: Create the skeleton (mirror `overview-skeleton.tsx`)**
+
+Header + 4-up KPI row + two-column (rail + detail) block matching the `/prompts` layout:
+
+```tsx
+// apps/dashboard/app/prompts-skeleton.tsx
+function Block({ className = "" }: { className?: string }) {
+ return ;
+}
+
+export function PromptsSkeleton() {
+ return (
+
+
+
+
+
+
+ {Array.from({ length: 4 }, (_, i) => (
+
+ ))}
+
+
+
+
+
+
+ );
+}
+```
+
+- [ ] **Step 2: Typecheck**
+
+Run: `cd apps/dashboard && npx tsc --noEmit`
+Expected: PASS.
+
+---
+
+### Task 7: Add the server data component + the client-side version-body proxy route
+
+**Files:**
+- Create: `apps/dashboard/app/prompts-data.tsx`
+- Create: `apps/dashboard/app/api/prompts/[name]/versions/[version]/route.ts`
+
+- [ ] **Step 1: Create the server component (mirror `runs-data.tsx`)**
+
+```tsx
+// apps/dashboard/app/prompts-data.tsx
+import { getJSON } from "@/lib/api/server";
+import { PromptsScreen } from "@/components/cockpit/screens/prompts";
+import type { PromptsResponse } from "@shared/contracts";
+import { promptsFallback } from "@/lib/api/fallbacks";
+
+export async function PromptsData() {
+ const now = new Date().toISOString();
+ const data = await getJSON<PromptsResponse>("/api/v1/prompts").catch(() =>
+ promptsFallback(now),
+ );
+ return <PromptsScreen data={data} />;
+}
+```
+
+> This won't typecheck until Task 8 changes `PromptsScreen`'s signature. Expected; the full gate is in Task 9.
+
+- [ ] **Step 2: Create the same-origin proxy route for lazy version bodies**
+
+`PromptsScreen` is a client component; the bearer-gated worker API can't be hit from the browser (the token is server-only). Add a Next route handler that proxies the worker server-side:
+
+```ts
+// apps/dashboard/app/api/prompts/[name]/versions/[version]/route.ts
+import { NextResponse } from "next/server";
+import { getJSON } from "@/lib/api/server";
+import type { PromptVersionBodyResponse } from "@shared/contracts";
+
+export async function GET(
+ _req: Request,
+ { params }: { params: Promise<{ name: string; version: string }> },
+) {
+ const { name, version } = await params;
+ const now = new Date().toISOString();
+ const data = await getJSON<PromptVersionBodyResponse>(
+ `/api/v1/prompts/${encodeURIComponent(name)}/versions/${encodeURIComponent(version)}`,
+ ).catch(() => ({ generatedAt: now, available: false, body: null }));
+ return NextResponse.json(data);
+}
+```
+
+> `params` is a Promise in Next 15 route handlers — confirm against the repo's Next version and existing route-handler conventions (check whether other `app/api/**/route.ts` files already exist to mirror their `params` typing). If none exist, this is the first; that's fine.
+
+- [ ] **Step 3: Typecheck dashboard**
+
+Run: `cd apps/dashboard && npx tsc --noEmit`
+Expected: PASS for the route handler (the `prompts-data.tsx` line still fails until Task 8; full gate in Task 9).
+
+---
+
+### Task 8: Convert `PromptsScreen` to consume real data (with real version history)
+
+**Files:**
+- Modify: `apps/dashboard/components/cockpit/screens/prompts.tsx`
+
+Keep the read-only registry + version-timeline shape, now backed by real data. **Remove** the per-version metrics grid and the two-column A/B diff (no Arthur source). Reuse existing `CkCard`, `CkKPI`, `Stat`, the chip styling, and the single-column mono body markup lifted from the old `PromptDiff`.
+
+- [ ] **Step 1: Replace imports and remove mock dependency**
+
+```tsx
+"use client";
+
+import React, { useState, useEffect } from "react";
+import { CkCard, CkKPI } from "@/components/ui";
+import type { PromptsResponse, PromptDef, PromptVersion } from "@shared/contracts";
+```
+
+Remove: `AIWF_DATA`, and the mock `Prompt`/`PromptVersion`/`PromptTag` imports from `@/lib/types` (the `PromptVersion` now comes from `@shared/contracts`). Remove `const D = AIWF_DATA`. Keep `useEffect` (used to reset/lazy-load the selected version body when the active prompt changes). `CkChip` stays if still used.
+
+- [ ] **Step 2: Repurpose `PromptStatusChip` for real tags + source**
+
+`PromptStatusChip` keys off a status string. Real statuses now are: the production tag (`production`) on a version, and the resolution `source` (`arthur`/`fallback`). Add `arthur`/`fallback` keys to `PROMPT_STATUS_COLOR` and keep the existing `production`/`staging`/`draft`/`archived`/`locked` keys (real Arthur `tags` may include any string — unknown tags fall through to the default style already coded).
+
+- [ ] **Step 3: Rewrite `PromptList` to consume `PromptDef[]`**
+
+- Signature: `function PromptList({ rows, active, onSelect }: { rows: PromptDef[]; active: string; onSelect: (name: string) => void })`.
+- Tag filter pills: derive the option set from the tags that actually occur across `rows[].versions[].tags` (e.g. `["all", ...uniqueTags]`); filter rows by whether any of their versions carries the selected tag. (If no versions/tags exist — Arthur off — render just `all` or hide the pill row.)
+- Each row keyed by `p.name`; show `p.name`, `p.phase`, `p.model`, the production-tag chip (from the version tagged `production`), and a `source` chip. Remove the eval score/delta figure.
+- `eyebrow`: `` `${arthurEnabled ? "Arthur" : "In-code"} · ${rows.length} prompts` `` — thread `arthurEnabled` through as a prop.
+
+- [ ] **Step 4: Rewrite `PromptDetail` — body panel + real version timeline**
+
+- Signature: `function PromptDetail({ prompt }: { prompt: PromptDef | undefined })`.
+- Keep the "Select a prompt to inspect." empty state when `prompt` is undefined.
+- Header eyebrow: `{prompt.source === "arthur" ? "Arthur" : "In-code"} · {prompt.phase}`. Title: `prompt.name`. Action chips: the `source` chip. Leave the `+ New version` / `Deploy` buttons inert (read-only).
+- Replace the four mock `Stat`s with real ones: `Phase` = `prompt.phase`, `Source` = `prompt.source`, `Model` = `prompt.model`, `Versions` = `prompt.versions.length`.
+- **Version timeline (real):** map `prompt.versions` (newest first). Each card shows: `v{version}`, `createdAt` (format as-is or relative), tag chips (`v.tags`), `modelName`, and `numMessages`/`numTools` counts. **Delete** the mock per-card eval/halluc/p95/cost rows and the `traffic` bar. Clicking a version selects it for the body panel.
+- **Body panel (single column, read-only):** lift the inner mono `<pre>` markup from the old `PromptDiff` (drop the two-column diff). Default shows `prompt.body` (the production version). When the user selects a non-production version, fetch its body once via the proxy route and render it:
+ ```tsx
+ const [selectedVersion, setSelectedVersion] = useState<number | null>(null);
+ const [bodyCache, setBodyCache] = useState<Record<number, string>>({});
+ const [loading, setLoading] = useState(false);
+ // reset selection when the prompt changes
+ useEffect(() => { setSelectedVersion(null); }, [prompt?.name]);
+ async function showVersion(v: PromptVersion) {
+ setSelectedVersion(v.version);
+ if (v.body) { setBodyCache((c) => ({ ...c, [v.version]: v.body! })); return; }
+ if (bodyCache[v.version] !== undefined) return;
+ setLoading(true);
+ try {
+ const res = await fetch(`/api/prompts/${prompt!.name}/versions/${v.version}`);
+ const json = (await res.json()) as { body: string | null };
+ setBodyCache((c) => ({ ...c, [v.version]: json.body ?? "(version body unavailable)" }));
+ } finally { setLoading(false); }
+ }
+ const shownBody = selectedVersion != null ? (bodyCache[selectedVersion] ?? (loading ? "Loading…" : "")) : prompt!.body;
+ ```
+- Delete the now-unused `PromptDiff` and `PromptMetrics` functions.
+
+- [ ] **Step 5: Rewrite the top-level `PromptsScreen`**
+
+```tsx
+export function PromptsScreen({ data }: { data: PromptsResponse }) {
+ const [active, setActive] = useState(data.rows[0]?.name ?? "");
+ const selected = data.rows.find((p) => p.name === active);
+ const inProd = data.rows.filter((p) => p.versions.some((v) => v.tags.includes("production"))).length;
+ return (
+
+ {/* header — keep the title; leave the inert Import/New buttons */}
+
+
+
+ {/* A/B + avg-Δ tiles removed — no real source */}
+
+
+
+ );
+}
+```
+
+> Reduced from 4 KPI tiles to 2 because the A/B-test and avg-eval-Δ tiles have no real source (removed, not stubbed). Adjust the grid (`lg:grid-cols-2`) accordingly.
+
+- [ ] **Step 6: Verify no mock references remain**
+
+Run: `grep -nE "AIWF_DATA|\\bD\\.|PROMPT_BODIES|PromptTag|from \"@/lib/types\"" apps/dashboard/components/cockpit/screens/prompts.tsx`
+Expected: no matches (note `PromptVersion` now legitimately appears via `@shared/contracts`, so it's excluded from this grep).
+
+---
+
+### Task 9: Rewrite the route to the server pattern + verify
+
+**Files:**
+- Modify: `apps/dashboard/app/(cockpit)/prompts/page.tsx`
+
+- [ ] **Step 1: Replace the page with the Suspense + server-component pattern**
+
+```tsx
+// apps/dashboard/app/(cockpit)/prompts/page.tsx — Prompts ("/prompts")
+import { Suspense } from "react";
+
+import { PromptsData } from "@/app/prompts-data";
+import { PromptsSkeleton } from "@/app/prompts-skeleton";
+
+export default function PromptsPage() {
+ return (
+ <Suspense fallback={<PromptsSkeleton />}>
+ <PromptsData />
+ </Suspense>
+ );
+}
+```
+
+- [ ] **Step 2: Typecheck the whole dashboard**
+
+Run: `cd apps/dashboard && npx tsc --noEmit`
+Expected: PASS, no errors.
+
+- [ ] **Step 3: Lint the changed files**
+
+Run: `cd apps/dashboard && npx next lint --file app/prompts-data.tsx --file app/prompts-skeleton.tsx --file "app/api/prompts/[name]/versions/[version]/route.ts" --file "app/(cockpit)/prompts/page.tsx" --file components/cockpit/screens/prompts.tsx`
+Expected: no errors.
+
+- [ ] **Step 4: Visual check**
+
+Run: `cd apps/dashboard && pnpm dev`, open `/prompts`.
+Expected:
+- Three prompts listed (`research-plan`, `implement`, `review`) by phase + model.
+- Selecting one shows its production body. With Arthur enabled, the version timeline lists real Arthur versions (version number, created-at, tags, model); clicking a historical version fetches and shows that version's body via `/api/prompts/{name}/versions/{version}`.
+- With Arthur disabled, `source` chip reads `fallback`, the timeline is empty, and bodies match `apps/worker/src/lib/prompts.ts`.
+- With the worker unreachable (`WORKER_BASE_URL` unset), the page shows the empty state (`0 prompts`), no crash. A failed version-body fetch shows an inline "version body unavailable" note, no crash.
+
+- [ ] **Step 5: Commit (ONLY if the user asks)**
+
+```bash
+git add apps/shared/contracts/api.ts apps/shared/contracts/domain.ts \
+ apps/worker/src/sandbox/arthur-client.ts apps/worker/src/sandbox/arthur-client.test.ts \
+ apps/worker/src/lib/prompts/resolve.ts apps/worker/src/workflows/prompts-step.ts \
+ apps/worker/src/routes/api/v1/prompts.get.ts \
+ "apps/worker/src/routes/api/v1/prompts/[name]/versions/[version].get.ts" \
+ apps/dashboard/lib/api/fallbacks.ts apps/dashboard/app/prompts-data.tsx \
+ "apps/dashboard/app/api/prompts/[name]/versions/[version]/route.ts" \
+ apps/dashboard/app/prompts-skeleton.tsx "apps/dashboard/app/(cockpit)/prompts/page.tsx" \
+ apps/dashboard/components/cockpit/screens/prompts.tsx
+git commit -m "feat: wire /prompts to real worker data with Arthur version history"
+```
+
+---
+
+## Self-Review
+
+**Spec coverage:**
+- `PromptVersion` + `PromptDef` + `PromptsResponse` + `PromptVersionBodyResponse` contracts → Task 1. ✓
+- Arthur read methods (`listPromptVersions`, `getPromptVersionBody`) → Task 2. ✓
+- Real data source (Arthur production tags + in-code fallbacks) + version history via shared `resolvePrompts()` → Task 3. ✓
+- Worker list route `GET /api/v1/prompts` + on-demand body route `GET /api/v1/prompts/[name]/versions/[version]` → Task 4. ✓
+- Dashboard `promptsFallback` → Task 5. ✓
+- `prompts-skeleton.tsx` → Task 6. ✓
+- `prompts-data.tsx` server component + client-side version-body proxy route → Task 7. ✓
+- `PromptsScreen` swap to read-only real-data view with real version timeline; per-version metrics + A/B diff markup removed → Task 8. ✓
+- Page route → server pattern → Task 9. ✓
+- Worker-down empty state → `promptsFallback` (Task 5) + route catch (Task 4), verified in Task 9 Step 4. ✓
+- Embellishment removal (per-version eval/halluc/p95/cost, traffic split, eval Δ, A/B test KPI) — markup deleted, not stubbed (Task 8). ✓
+
+**Decisions resolved (no longer open):** read-only confirmed; real version history in scope (metadata + on-demand bodies); tags are real; `resolvePrompts()` extraction confirmed OK; production-body eager / historical lazy.
+
+**Still-open items (flagged in spec, do not block execution):**
+1. Lazy vs eager historical body fetch — plan implements eager-production / lazy-history; switch if the user prefers otherwise.
+2. Version-list pagination depth — plan fetches first page only; add pagination if deep history is required.
+
+**Type consistency:** `PromptsResponse`/`PromptDef`/`PromptVersion`/`PromptVersionBodyResponse` imported from `@shared/contracts` across Tasks 3, 4, 5, 7, 8. `PromptsScreen` accepts `{ data: PromptsResponse }` (Task 8) — matches the call site (Task 7). `ResolvedPrompt` (worker) is structurally assignable to `PromptDef` (incl. `versions: PromptVersion[]`); widen the `name` field if TS narrows on the literal union. `ArthurPromptVersion` (snake_case Arthur shape) is mapped to the camelCase `PromptVersion` inside `resolvePrompts()`. `PromptsSkeleton` (Task 6) matches the import in Task 9. ✓
+
+**Placeholder scan:** No TBD/TODO. Verify, when executing: worker route import depths (esp. the nested `prompts/[name]/versions/[version].get.ts` path), the Next route-handler `params` Promise convention against the repo's Next version, and the worker dev-run command — all flagged inline. ✓
+
diff --git a/docs/superpowers/specs/2026-06-08-cost-real-data-design.md b/docs/superpowers/specs/2026-06-08-cost-real-data-design.md
new file mode 100644
index 0000000..657bfa2
--- /dev/null
+++ b/docs/superpowers/specs/2026-06-08-cost-real-data-design.md
@@ -0,0 +1,166 @@
+# `/cost` Real-Data Conversion — Design
+
+**Date:** 2026-06-08
+**Status:** Draft — has open questions (see end)
+**Scope:** Convert the `/cost` (Cost & Usage) dashboard page from mock data to live worker data, mirroring the overview/runs server-component fetch pattern. Cost + token usage come from **Arthur** (the GenAI Engine), which already aggregates token counts and USD cost from the OpenInference traces the workflow ships in.
+
+## Problem
+
+The `/cost` page (`apps/dashboard/app/(cockpit)/cost/page.tsx`) renders a complete UI — spend / token KPIs, a daily-spend area chart, a per-model donut + breakdown table, and a per-workflow/task breakdown table — entirely from mock data (`AIWF_DATA.COST_BY_MODEL`, `AIWF_DATA.HOURS24`, `AIWF_DATA.WORKFLOWS`). The overview and `/runs` pages already fetch real data from the worker; `/cost` should do the same.
+
+The overview's `cost24h` (`KpisResponse`), `Run.cost`, `Run.tokens`, and `WorkflowRow.costToday` are all hardcoded `null` (`collect-kpis.ts:69`, `collect-runs.ts:171-172`, `collect-workflows.ts:81`, `derive-kpis.ts:49`) because the Vercel Workflow run store carries no usage. But the workflow already ships OpenInference traces to Arthur (per-ticket task, `apps/worker/src/sandbox/arthur-tracer.ts` + `arthur-client.ts`), and **Arthur aggregates token + cost data first-class** on those traces. So the real source already exists and is queryable — no new capture or persistence is needed.
+
+## Current state
+
+### What the screen needs (exact data shape)
+
+Read from `apps/dashboard/components/cockpit/screens/cost.tsx`:
+
+| UI element | Mock source | Real source after this change |
+| --- | --- | --- |
+| KPI: spend | `sum(COST_BY_MODEL.cost)` | `totals.totalTokenCost` (USD) |
+| KPI: Tokens | `sum(COST_BY_MODEL.tokens)` | `totals.totalTokens` |
+| KPI: Cost/run avg | hardcoded `$0.41` | `totals.costPerRun` |
+| KPI: Projection EoM | hardcoded `$1,184` | **removed** (no source) |
+| Area chart "Daily spend" | `HOURS24.map(h => h.cost*24)` | `daily[].cost` + `daily[].date` (Arthur timeseries) |
+| Donut "Model mix" | `COST_BY_MODEL[].share` + center | `byModel[].cost` → shares computed in-screen; center = `totalTokenCost` |
+| Table "Per-model breakdown" | `COST_BY_MODEL[]` | `byModel[] { model, cost, tokens }` (span-level aggregation) |
+| Table "Per-workflow breakdown" | `WORKFLOWS[]` sorted by `costToday` | `byWorkflow[]` (= per-Arthur-task; see mapping note) |
+| Header tabs "By model / workflow / actor" | inert | **removed** |
+| "Export CSV" button | inert | **removed** |
+| Sparklines (`Spark`, random `sparkSeries`) | mock RNG | **removed** |
+| Budget `$1,200`, MoM/WoW deltas | hardcoded | **removed** |
+
+Mock shapes (replaced): `CostByModel { model, vendor, cost, tokens, share }` (`apps/dashboard/lib/types.ts:36`); `HourPoint` (`apps/shared/contracts/domain.ts:129`).
+
+### How real data flows (the template — overview/runs)
+
+1. Worker route `apps/worker/src/routes/api/v1/...` returns a typed `@shared/contracts` response; wraps the collector in try/catch and degrades to an empty payload on failure (see `runs.get.ts`, `workflows.get.ts`). Sends `Cache-Control: private, max-age=15, stale-while-revalidate=60`.
+2. Response interface declared in `apps/shared/contracts/api.ts`.
+3. Dashboard fetches server-side via `getJSON(path)` (`apps/dashboard/lib/api/server.ts`) — bearer `WORKER_API_TOKEN`, `cache: "no-store"`.
+4. A `*-data.tsx` server component calls `getJSON`, `.catch()`s to a fallback in `apps/dashboard/lib/api/fallbacks.ts`, passes a `data` prop to the client screen.
+5. The page is a thin `<Suspense fallback={<…Skeleton />}>` route wrapping the server data component.
+
+This is a **single-PR conversion** — no persistence layer, no two-step rollout.
+
+## The real data source — Arthur GenAI Engine
+
+The worker already holds an Arthur client. `ArthurClient.fromTraceEndpoint(env.GENAI_ENGINE_TRACE_ENDPOINT, env.GENAI_ENGINE_API_KEY)` (`arthur-client.ts:37`) builds a client whose `request` helper sends `Authorization: Bearer <GENAI_ENGINE_API_KEY>`. Both env vars are optional (`apps/worker/env.ts:83-84`) → when unset, the route falls back to the empty state. Reads require the `INFERENCE_READ` permission on the key. Arthur is org-scoped (the single deployment sees its own org) — consistent with this project's single-tenant deployment model.
+
+### Token + cost are first-class on Arthur traces
+
+Traces/spans extend `TokenCountCostSchema`:
+`{ prompt_token_count, completion_token_count, total_token_count, prompt_token_cost, completion_token_cost, total_token_cost }` (cost in USD floats, `null` if unavailable). Responses also carry `display_currency` (defaults USD).
+
+### Endpoints used
+
+1. **Totals + per-task breakdown (one call):** `POST /api/v1/traces/overview`
+ body `{ task_ids, start_time, end_time }` →
+ `{ count, overviews: [{ task_id, trace_count, trace_token_count, trace_token_cost, eval_count, continuous_eval_success_rate, last_active }] }`.
+ Multi-task in one call gives fleet totals (sum across `overviews`) **and** the per-task breakdown over a window.
+
+2. **Daily-spend chart:** `POST /api/v1/traces/overview/timeseries`
+ body `{ task_id, start_time, end_time, bucket_size }` (**single task per call**) →
+ points `{ timestamp, trace_count, trace_token_count, trace_token_cost, continuous_eval_success_rate }`.
+ For a fleet daily-spend chart, fan out one call per task and **merge points by bucket timestamp**, summing `trace_token_cost`/`trace_token_count`. (`bucket_size` allowed values are unconfirmed — see open questions.)
+
+3. **By-model breakdown (the one manual aggregation):** rows returned by `GET /api/v1/traces/spans` (and/or `GET /api/v1/traces`) extend `TokenCountCostSchema`, and spans carry `model_name`. The overview endpoint is per-**task**, not per-model, so a by-model table requires fetching span rows for the window and **summing token/cost client-side grouped by `model_name`**. This is the only client-side aggregation; flagged below.
+
+### How usage→cost is computed
+
+No client-side pricing. Arthur returns USD cost directly (`*_token_cost`), already derived from the traces. The worker just sums Arthur's pre-aggregated numbers (for totals/timeseries) or groups span rows by `model_name` (for the by-model table). The pricing table (`apps/worker/src/sandbox/agents/pricing.ts`) and the Slack `usageReport` path are untouched and not on this read path.
+
+### Reconciliation with the overview KPI (out of scope, noted)
+
+The overview's `cost24h` / `WorkflowRow.costToday` / `Run.cost` are hardcoded `null` today. The same Arthur source could backfill those so cost is computed in exactly one place going forward (e.g. `collectKpis`/`collectWorkflows` querying `/traces/overview` for the matching task/window). Out of scope for this PR, but called out so the `null` placeholders aren't reinvented elsewhere.
+
+## Proposed contract (`apps/shared/contracts/api.ts`)
+
+```ts
+export interface CostByModelEntry {
+ model: string; // Arthur span model_name
+ cost: number; // USD, summed total_token_cost over the window
+ tokens: number; // summed total_token_count over the window
+}
+
+export interface CostByWorkflowEntry {
+ /** Arthur task_id (per ticket-run, e.g. "AWT-42" / "AWT-42.1"). */
+ taskId: string;
+ /** Arthur task name (= the ticket-run identifier). */
+ name: string;
+ runs: number; // trace_count for the task
+ tokens: number; // trace_token_count
+ cost: number; // trace_token_cost (USD)
+ costPerRun: number; // cost / max(1, runs)
+}
+
+export interface CostResponse {
+ generatedAt: string;
+ /** false when Arthur is unconfigured/unreachable or returns nothing. The
+ * screen renders its empty/N-A state. */
+ available: boolean;
+ /** Window the figures cover (the request's start_time/end_time). */
+ window: { start: string; end: string }; // ISO
+ totals: {
+ totalTokenCost: number; // USD, Σ overviews[].trace_token_cost
+ totalTokens: number; // Σ overviews[].trace_token_count
+ traceCount: number; // Σ overviews[].trace_count
+ costPerRun: number; // totalTokenCost / max(1, traceCount)
+ };
+ byModel: CostByModelEntry[];
+ /** Per-task (= per ticket-run) breakdown from /traces/overview. */
+ byWorkflow: CostByWorkflowEntry[];
+ /** Per-day spend, oldest→newest, merged across tasks from the timeseries. */
+ daily: { date: string; cost: number; tokens: number }[]; // date = bucket ISO timestamp
+}
+```
+
+Notes:
+- `byWorkflow` is named to match the screen's "Per-workflow breakdown" section, but its entries are **per Arthur task** (per ticket-run), since that's the natural grain of `/traces/overview`. See the mapping open question.
+- Stripped from the contract/screen (no real source, per user decision): budget, MoM/WoW deltas, EoM projection, "By actor" tab, decorative sparklines, "Export CSV".
+
+## Fallback / unavailable state
+
+Add `costFallback(now)` to `apps/dashboard/lib/api/fallbacks.ts`:
+
+```ts
+export function costFallback(now: string): CostResponse {
+ return {
+ generatedAt: now,
+ available: false,
+ window: { start: now, end: now },
+ totals: { totalTokenCost: 0, totalTokens: 0, traceCount: 0, costPerRun: 0 },
+ byModel: [],
+ byWorkflow: [],
+ daily: [],
+ };
+}
+```
+
+The worker route degrades to the same empty payload (`available:false`) when `GENAI_ENGINE_API_KEY`/`GENAI_ENGINE_TRACE_ENDPOINT` are unset or any Arthur call throws — matching `runs.get.ts`/`workflows.get.ts`. The screen renders `$0.00` / `0` / empty tables — never crashes.
+
+## Behavior
+
+- **Happy path:** `/cost` shows real spend, token totals, per-model and per-task breakdowns, and a per-day spend chart, all from Arthur over the chosen window.
+- **Arthur unconfigured / unreachable / 401:** the worker degrades to `available:false`, or `getJSON` throws and the data component falls back to `costFallback` (also `available:false`) → empty/zero state. No crash.
+
+## Out of scope
+
+- Wiring tabs / "Export CSV" (removed).
+- Backfilling the overview's `cost24h`/`costToday`/`Run.cost` from Arthur (mentioned above).
+- A task→workflow mapping for a true by-workflow rollup (breakdown stays per-task).
+
+## Open questions / assumptions
+
+1. **`bucket_size` values.** `/traces/overview/timeseries` takes a `bucket_size`, but the allowed values (e.g. `"day"` vs a duration vs an enum) are unconfirmed. **Assumption:** a day-granularity bucket exists for the daily chart; confirm the exact value.
+2. **Empty `task_ids`.** Does `/traces/overview` with an empty/omitted `task_ids` return org-wide totals, or is `task_ids` required? If required, the worker must first list the org's tasks (the client already lists tasks via `/api/v2/tasks/search`) and pass their ids. **Assumption:** we enumerate tasks and pass ids explicitly.
+3. **By-model client aggregation.** Per-model totals require fetching span rows and summing by `model_name` client-side (Arthur has no per-model overview). Acceptable, given span volume per window? Or drop the by-model table for v1?
+4. **Task→workflow mapping.** Arthur tasks are per ticket-run (`AWT-42`, `AWT-42.1`). The "by workflow" section therefore shows **per-task** rows unless we add a task→workflow mapping. Stated, not blocking; per-task is the natural breakdown.
+5. **Window.** Which window do the KPIs cover — calendar MTD, rolling 30d, or 24h? Drives `start_time`/`end_time`. **Assumption:** calendar month-to-date (matches the original "MTD" framing); confirm.
+
+## Verification
+
+1. Worker + dashboard typecheck pass.
+2. `GET /api/v1/cost` returns non-empty `totals`/`byWorkflow` for a window with real Arthur traces.
+3. `/cost` renders those figures (spend, tokens, breakdowns, daily chart).
+4. With Arthur unconfigured (env unset) or unreachable, `/cost` shows the zero/empty state — no crash.
diff --git a/docs/superpowers/specs/2026-06-08-evals-real-data-design.md b/docs/superpowers/specs/2026-06-08-evals-real-data-design.md
new file mode 100644
index 0000000..95d683a
--- /dev/null
+++ b/docs/superpowers/specs/2026-06-08-evals-real-data-design.md
@@ -0,0 +1,227 @@
+# `/evals` Real-Data Conversion — Design
+
+**Date:** 2026-06-08
+**Status:** Draft (has open questions — see end)
+**Scope:** Convert the `/evals` page from mock data to live data, mirroring the overview/runs server-component fetch pattern. Unlike `/runs`, the worker does **not** yet expose an evals list endpoint and the underlying eval results are **not yet read from anywhere** — so this design also covers the prerequisite of producing/reading eval results, with the data-source decision flagged explicitly.
+
+## Problem
+
+The `/evals` dashboard page (`apps/dashboard/app/(cockpit)/evals/page.tsx`) is a 4-line stub that renders `EvalsScreen` with no data fetch. `EvalsScreen` (`apps/dashboard/components/cockpit/screens/evals.tsx`) is a `"use client"` component that reads the hardcoded `AIWF_DATA.EVALS` mock slice and draws synthetic sparklines via `jitterSeries`. Nothing on this page is real.
+
+We want `/evals` to fetch real data from the worker through the same three-layer pattern the overview and runs pages use:
+1. thin server route (`page.tsx`) → `<Suspense>` + server data component;
+2. `evals-data.tsx` server component calling `getJSON` with a `.catch()` fallback;
+3. client presenter `EvalsScreen` receiving a typed `data` prop.
+
+## Current state
+
+### Mock (what the screen renders today)
+
+`apps/dashboard/lib/data/mock.ts` exports `EVALS: EvalMetric[]` (the "Arthur evals" slice, lines ~82–93). The shape is `EvalMetric` from `apps/dashboard/lib/types.ts`:
+
+```ts
+export interface EvalMetric {
+ metric: string; // "Hallucination", "PII Detection", …
+ value: number; // numeric reading
+ target: string; // human string, e.g. "< 0.05", "= 0", "flags"
+ status: "pass" | "warn" | "fail";
+ trend: number; // signed delta vs prior window
+ axis: "safety" | "quality" | "ops"; // grouping bucket
+ family: string; // "output" | "agent" | "input" | "rag" | "runtime"
+ unit?: string; // optional, e.g. "flags/24h"
+}
+```
+
+`EvalsScreen` renders, per `axis` group ("safety", "quality", "ops"):
+- a `CkCard` with eyebrow=axis, title from a fixed map, a left-border accent color, and an action label `{list.length} evaluators`;
+- one cell per metric containing: `metric` name, a `pass`/`warn`/`fail` `CkChip`, the formatted `value` (`<1` → `toFixed(3)`, else as-is), optional `unit`, a `trend` indicator (`↗`/`↘`/`→` + `Math.abs(trend).toFixed(3)`; **negative trend renders green, positive red** — i.e. "down is good" by current convention), a `Spark` sparkline, and `target {e.target}`.
+
+Header chrome is decorative/hardcoded: the eyebrow "Arthur engine · continuous evaluation", the title "Evaluations & guardrails", a `CkChip` "Live · 12,408 spans · 24h", and a `+ New eval` button.
+
+**The sparkline is fake:** `Spark data={jitterSeries(...)}`. There is no per-metric time series in the mock or anywhere else.
+
+### Existing eval scaffold
+
+`apps/shared/contracts/api.ts` already declares a discriminated union:
+
+```ts
+export type EvalHealthResponse =
+ | { available: true; score: number; pass: number; warn: number; fail: number;
+ spansGraded: number; windowHours: number }
+ | { available: false; reason: string };
+```
+
+The worker route `apps/worker/src/routes/api/v1/overview/eval-health.get.ts` is a hardcoded stub returning `{ available: false, reason: "Eval grading not wired up yet." }`. The overview page already consumes it: `overview-data.tsx` fetches `/api/v1/overview/eval-health` (falls back to `evalHealthFallback()` → `{ available: false, reason: "Worker unavailable." }`), and `EvalHealthKPI` in `overview.tsx` renders a donut of pass/warn/fail + score + `spansGraded`/`windowHours` when `available`, else the `reason` string. This is a **summary** KPI tile, not the per-metric breakdown the `/evals` page needs.
+
+### Where eval results actually originate (the real data source — CONFIRMED)
+
+Arthur is integrated **write-only** today:
+- `apps/worker/src/sandbox/arthur-client.ts` — a client for the Arthur GenAI Engine **tasks/prompts** API (`/api/v2/tasks*`, `/api/v1/tasks/{id}/prompts*`). It creates one task per ticket run and hosts/tags prompt versions. It has **no** read method yet.
+- `apps/worker/src/sandbox/arthur-tracer.ts` — a bundled Python OpenInference tracer that **ships traces/spans into** Arthur Engine from inside each sandbox via `POST /api/v1/traces`. Data flows out of the worker; nothing reads it back.
+- Wiring lives in `apps/worker/src/workflows/agent.ts` (`ensureArthurTaskForTicket`, gated on `env.GENAI_ENGINE_API_KEY` + `env.GENAI_ENGINE_TRACE_ENDPOINT`).
+
+**The Arthur GenAI Engine DOES expose a read API** (ground-truthed from `arthur-ai/arthur-engine` + `arthur-common` on `main`). Auth is the **same** `Authorization: Bearer GENAI_ENGINE_API_KEY` used for writes; reads require the `INFERENCE_READ` permission. All reads are **org-scoped** — a deployment's key sees its whole org, which matches our single-tenant-per-deployment model. The relevant endpoints:
+
+- **Fleet aggregate (primary source for this page) — one call, multi-task:**
+ `POST /api/v1/traces/overview` body `TraceOverviewRequest { task_ids, start_time, end_time }` → `TraceOverviewListResponse { count, overviews: TraceOverviewResponse[] }`. Each `TraceOverviewResponse` = `{ task_id, trace_count, trace_token_count, trace_token_cost, eval_count, continuous_eval_success_rate, last_active }`. This yields fleet-wide eval health (success rate + trace/eval counts) over a 24h window with no per-task fan-out at the result-shaping layer.
+- **Per-metric breakdown (optional):** `GET /api/v1/traces/spans` (list, metadata only) → `GET /api/v1/traces/spans/{span_id}` → `SpanWithMetricsResponse.metric_results: MetricResultResponse[]` where each = `{ id, metric_type, details, prompt_tokens, completion_tokens, latency_ms, span_id, metric_id, created_at }`. `metric_type` is an enum of **only** `QueryRelevance | ResponseRelevance | ToolSelection`. `details` is an opaque JSON string (e.g. relevance → `{ bert_f_score, reranker_relevance_score, llm_relevance_score, reason }`). **There is no flat numeric score or pass/fail on a metric result** — we parse `details` and apply our own threshold.
+- **Trend/timeseries (optional):** `POST /api/v1/traces/overview/timeseries` body `{ task_id, start_time, end_time, bucket_size }` (**single task per call**) → points `{ timestamp, trace_count, trace_token_count, trace_token_cost, continuous_eval_success_rate }`.
+
+#### CRITICAL CAVEAT — what our trace path actually yields
+
+The rich rule-based evals the mock screen implies — **hallucination, PII, toxicity, prompt-injection** Pass/Fail — live in Arthur's **legacy inference/rule model**, populated **only** by the `/validate_prompt` + `/validate_response` write path. **We never call that path; we only ship OpenInference traces (`POST /api/v1/traces`).** Therefore `GET /api/v2/inferences/query` and those rule families are **empty for us**.
+
+What our trace path actually produces:
+- `continuous_eval_success_rate`, `eval_count` (spans graded), `trace_count` — from `/traces/overview`;
+- the three relevance/tool metric types — and **only if continuous evals are configured on the task**; otherwise `eval_count = 0`.
+
+So the realistic `/evals` page = an overall **eval-health score** (`continuous_eval_success_rate × 100`), the **graded count + window**, and a **relevance / tool-selection breakdown**. The hallucination/toxicity/PII/prompt-injection families the mock shows are **dropped** from this page. Adopting Arthur's `validate_*` API to populate them is a **separate future prerequisite, explicitly out of scope** here.
+
+**Conclusion:** evals are now reachable via a confirmed read API, so this is no longer blocked. Conversion's prerequisite is to add a worker-side read path (`getTracesOverview()` on `ArthurClient` + a `collect-evals.ts` collector). When Arthur is unconfigured, or when `eval_count = 0` (no continuous evals configured / no graded spans in window), the page degrades to the documented unavailable state — exactly like `eval-health` does today.
+
+## Proposed data contract
+
+Add to `apps/shared/contracts/api.ts`. The shape now maps directly to `TraceOverviewResponse` (the fleet aggregate) plus the relevance/tool-selection breakdown. We reuse the **same discriminated-union shape** as `EvalHealthResponse` so the page handles "not wired up" / "nothing graded" identically to overview. Fields with no real source on our trace-only path are **dropped** (no synthetic sparklines, no rule families).
+
+```ts
+/** One evaluator's aggregate reading over the window. Limited to the metric
+ * types Arthur computes from our OpenInference trace path:
+ * ResponseRelevance / QueryRelevance / ToolSelection. */
+export interface EvalMetricRow {
+ metric: string; // display name, e.g. "Response Relevance"
+ metricType: // Arthur metric_type enum
+ | "QueryRelevance"
+ | "ResponseRelevance"
+ | "ToolSelection";
+ value: number; // aggregate score parsed from metric_results.details
+ status: "pass" | "warn" | "fail"; // computed against our own threshold
+ axis: "quality"; // all three are quality-axis on our path
+ // Only present when /traces/overview/timeseries is wired (see Open Q1).
+ trend?: number | null; // signed delta vs window start; omitted if not wired
+ spark?: number[]; // success-rate buckets; omitted if not wired
+}
+
+export type EvalsResponse =
+ | {
+ available: true;
+ generatedAt: string;
+ windowHours: number;
+ /** continuous_eval_success_rate × 100, fleet-wide. */
+ score: number;
+ /** Σ eval_count across tasks — "spans graded" in the window. */
+ spansGraded: number;
+ /** Σ trace_count across tasks. */
+ traceCount: number;
+ /** Per-metric-type breakdown; empty if no continuous evals configured. */
+ rows: EvalMetricRow[];
+ }
+ | { available: false; generatedAt: string; reason: string };
+```
+
+Notes:
+- `score`/`spansGraded`/`traceCount`/`windowHours` come straight from summing `TraceOverviewResponse` fields across the returned overviews.
+- `EvalMetricRow.value`/`status` require the **optional** per-span breakdown (Open Q below). If we ship the aggregate-only first cut, `rows` is `[]` and the page renders the score + graded count without the per-metric grid. This keeps the first increment small.
+- `target`/`family`/`unit` from the old draft are **removed** — they were presentation metadata for rule families we cannot populate. `axis` collapses to the single `"quality"` literal because only relevance/tool metrics exist on our path.
+- `trend`/`spark` are present **only** if `/traces/overview/timeseries` is wired (Open Q1); otherwise omitted entirely (no static placeholders).
+
+**Assumption:** the `/evals` page consumes only this trace-derived data; the existing `EvalHealthResponse` summary tile on overview is left untouched. We do **not** consolidate the two endpoints in this change (though `EvalsResponse.score`/`spansGraded` could later feed it).
+
+## Real data source & how it's obtained (worker side)
+
+New worker route `GET /api/v1/evals` → `EvalsResponse`, structured like `runs.get.ts`:
+- sets `Cache-Control: private, max-age=15, stale-while-revalidate=60`;
+- if `env.GENAI_ENGINE_API_KEY` / `env.GENAI_ENGINE_TRACE_ENDPOINT` are unset, returns `{ available: false, reason: "Arthur GenAI Engine not configured." }` (no throw);
+- otherwise builds an `ArthurClient` (via the existing `ArthurClient.fromTraceEndpoint`) and calls a new read method `getTracesOverview({ taskIds, startTime, endTime })` → `POST /api/v1/traces/overview`. The new `apps/worker/src/lib/overview/collect-evals.ts` collector sums the returned `overviews` into `score`/`spansGraded`/`traceCount`, and (optionally) shapes `rows` from the per-span metric breakdown. Returns `available: true`;
+- if `eval_count` sums to `0` (no continuous evals configured on our tasks, or nothing graded in window), return `{ available: false, reason: "No graded evals in the last 24h." }` — there is genuinely nothing to show;
+- on any error, logs `evals_list_failed` and returns `{ available: false, reason: "Eval grading not wired up yet." }` — same degrade behavior as the other routes.
+
+**Task-id enumeration:** `/traces/overview` takes `task_ids`. It is **unconfirmed** whether an empty/omitted `task_ids` means "all org tasks" (Open Q2). If it does, we pass none. If it does not, we first enumerate the org's tasks via the existing `/api/v2/tasks/search` path (the `ArthurClient` already does substring search there) and pass their ids. The collector boundary (`collect-evals.ts` taking an injected fetcher) keeps this isolated and testable, matching `collect-runs.ts`/`collect-kpis.ts`.
+
+## Dashboard changes
+
+### 1. `app/(cockpit)/evals/page.tsx` (rewrite)
+Thin server route, drops the direct screen import:
+```tsx
+import { Suspense } from "react";
+import { EvalsData } from "@/app/evals-data";
+import { EvalsSkeleton } from "@/app/evals-skeleton";
+
+export default function EvalsPage() {
+ return (
+    <Suspense fallback={<EvalsSkeleton />}>
+      <EvalsData />
+    </Suspense>
+ );
+}
+```
+
+### 2. `app/evals-data.tsx` (new server component)
+Mirrors `runs-data.tsx`:
+```tsx
+import { getJSON } from "@/lib/api/server";
+import { EvalsScreen } from "@/components/cockpit/screens/evals";
+import type { EvalsResponse } from "@shared/contracts";
+import { evalsFallback } from "@/lib/api/fallbacks";
+
+export async function EvalsData() {
+ const now = new Date().toISOString();
+  const data = await getJSON<EvalsResponse>("/api/v1/evals").catch(() =>
+ evalsFallback(now),
+ );
+  return <EvalsScreen data={data} />;
+}
+```
+
+### 3. `lib/api/fallbacks.ts` (add)
+```ts
+export function evalsFallback(now: string): EvalsResponse {
+ return { available: false, generatedAt: now, reason: "Worker unavailable." };
+}
+```
+
+### 4. `components/cockpit/screens/evals.tsx` (modify)
+- Signature `EvalsScreen()` → `EvalsScreen({ data }: { data: EvalsResponse })`.
+- Remove `import { AIWF_DATA } from "@/lib/data/mock"`, `const D = AIWF_DATA`, and `import { jitterSeries } from "@/lib/rng"` (synthetic sparklines are dropped — no static placeholders).
+- Import `EvalsResponse`/`EvalMetricRow` from `@shared/contracts` (drop the mock `EvalMetric` reliance).
+- When `data.available === false`, render the existing header chrome but replace the metric cards with a single empty/unavailable panel showing `data.reason` (mirroring `EvalHealthKPI`'s reason path). This is also the state when nothing is graded.
+- When `available`:
+ - Drive the "Live · N spans · 24h" chip from `data.spansGraded` / `data.windowHours` instead of the hardcoded "12,408 spans · 24h"; optionally show `data.score`.
+ - The mock's three axis groups (safety/quality/ops) collapse to a single **Quality** group, since only relevance/tool metrics exist on our path. Render `data.rows` (all `axis: "quality"`) in one card.
+ - Each row shows `metric`, the formatted `value`, and the pass/warn/fail `CkChip`.
+ - Sparkline / trend: render `e.spark` / `e.trend` **only when present** (timeseries wired); otherwise render neither. Drop the `Spark`/`jitterSeries` usage when not wired.
+ - If `rows` is empty (aggregate-only first cut), render just the score + graded-count header — no per-metric grid.
+
+### 5. `app/evals-skeleton.tsx` (new)
+Loading fallback styled like `overview-skeleton.tsx` — header placeholder + one card-shaped block (the Quality group).
+
+## Behavior
+
+- **Happy path (Arthur configured, continuous evals graded):** `/evals` renders the fleet eval-health score + spans-graded count over the real 24h window, and (if the per-span breakdown is wired) a Quality card of relevance/tool-selection metrics. Trend/sparkline appear only when the timeseries call is wired.
+- **Arthur not configured:** worker returns `available: false`, reason "Arthur GenAI Engine not configured." Page shows header chrome + reason panel. No crash.
+- **Nothing graded (`eval_count = 0`):** worker returns `available: false`, reason "No graded evals in the last 24h." Same panel.
+- **Worker down / 401:** `getJSON` throws → `evalsFallback` → `available: false`, reason "Worker unavailable." Same silent-degrade as overview/runs.
+
+## Out of scope
+
+- Wiring up the `+ New eval` button.
+- The `EvalHealthResponse` overview tile (left as-is; could later be derived from `EvalsResponse` but not in this change).
+- **Adopting Arthur's `/validate_prompt` + `/validate_response` write path** to populate the legacy rule families (hallucination, PII, toxicity, prompt-injection). This is the prerequisite for those metrics and is a **separate future effort** — those families are simply absent from this page.
+- Per-span drill-down / individual inference detail views.
+- Synthetic sparklines — removed entirely (no static placeholders).
+
+## Open questions / assumptions (need user decision)
+
+The Arthur read API is now **confirmed** (see "Where eval results actually originate"). Remaining genuinely-open items:
+
+1. **`/traces/overview/timeseries` `bucket_size` values.** The allowed `bucket_size` values are unconfirmed. Needed only if we wire trend/sparkline; the aggregate-only first cut does not require it. **Assumption:** trend/sparkline are deferred to a second increment.
+2. **Empty `task_ids` semantics.** Does `POST /api/v1/traces/overview` treat an empty/omitted `task_ids` as "all org tasks"? If yes, one call with no ids suffices. If no, the collector must first enumerate tasks via `/api/v2/tasks/search`. **Assumption:** unconfirmed → plan covers both paths; default to enumerating tasks if empty-means-all is not verified.
+3. **Are continuous evals actually configured on our tasks in the live instance?** If continuous evals are not enabled on the per-ticket tasks, `eval_count = 0` and the page legitimately shows the "No graded evals" state. Confirming this is what determines whether the happy path ever fires today.
+
+Resolved (no longer open): read-API existence/shape, auth, org-scope/single-tenant aggregation, and the metric-family set (only relevance/tool on our path; rule families dropped).
+
+## Verification
+
+1. Shared + worker + dashboard typecheck pass (`npx tsc --noEmit`) with `EvalsResponse` imported in the route, `evals-data.tsx`, and `evals.tsx`.
+2. With the worker unreachable (or Arthur unconfigured), `/evals` renders header chrome + the reason panel, no crash.
+3. With Arthur configured and continuous evals graded, `/evals` renders the real fleet score + spans-graded count over the 24h window (and the Quality breakdown if wired).
+4. With Arthur configured but `eval_count = 0`, `/evals` shows the "No graded evals in the last 24h." panel.
diff --git a/docs/superpowers/specs/2026-06-08-prompts-real-data-design.md b/docs/superpowers/specs/2026-06-08-prompts-real-data-design.md
new file mode 100644
index 0000000..49be94c
--- /dev/null
+++ b/docs/superpowers/specs/2026-06-08-prompts-real-data-design.md
@@ -0,0 +1,223 @@
+# `/prompts` Real-Data Conversion — Design
+
+**Date:** 2026-06-08
+**Status:** Approved
+**Scope:** Swap the existing `/prompts` page from mock data to live worker data, mirroring the `/runs` and overview pattern. **Read-only display, including real Arthur version history.** No write/edit endpoints. Embellishment fields with no real backing are removed (markup deleted, not stubbed with placeholders).
+
+## Problem
+
+The `/prompts` dashboard page (`apps/dashboard/app/(cockpit)/prompts/page.tsx`) renders a full prompt-registry UI but is wired entirely to mock data (`AIWF_DATA.PROMPTS`, `PROMPT_VERSIONS`, `PROMPT_BODIES` from `@/lib/data/mock`). The overview and `/runs` pages already fetch real data from the worker through a server-component pattern. We want `/prompts` to show the prompts the worker actually drives the AI workflow with.
+
+## Real data source (the important finding)
+
+In this project, "prompts" are the three system prompts that drive each workflow phase. They live in the worker, not in a CMS:
+
+- **Static fallbacks (source of truth in code):** `apps/worker/src/lib/prompts.ts` defines three constant strings — `researchPlanPrompt`, `implementPrompt`, `reviewPrompt` — exported as `PROMPT_FALLBACKS`, a `Record` mapping each prompt name to its prompt string, keyed by `PROMPT_NAMES = ["research-plan", "implement", "review"]`.
+- **Optional runtime override (Arthur GenAI Engine):** `apps/worker/src/workflows/prompts-step.ts`'s `loadPrompts()` step checks whether `GENAI_ENGINE_API_KEY`, `GENAI_ENGINE_TRACE_ENDPOINT`, and `GENAI_ENGINE_PROMPT_TASK_ID` are all set. If so, it fetches the `production`-tagged version of each prompt from Arthur via `ArthurClient.getPromptByTag(taskId, name, "production")` (`apps/worker/src/sandbox/arthur-client.ts`). On 404 / error / Arthur disabled it falls back to the in-code `PROMPT_FALLBACKS` string for that name.
+- **Seeding:** `apps/worker/scripts/setup-arthur-prompts.ts` is a one-shot script that pushes the three fallback strings into a single Arthur task named `ai-workflow-prompts` and tags each `production`. This is the only writer; nothing in the request/runtime path writes prompts.
+
+**Arthur read API (ground-truthed against `arthur-ai/arthur-engine` `main`).** Auth is the same `Authorization: Bearer GENAI_ENGINE_API_KEY`; prompt reads require the `TASK_READ` scope. Three endpoints are relevant:
+
+- **List versions (metadata only):** `GET /api/v1/tasks/{task_id}/prompts/{prompt_name}/versions` → `AgenticPromptVersionListResponse { count, versions: AgenticPromptVersionResponse[] }`. Each `AgenticPromptVersionResponse`: `{ version (int), created_at, deleted_at (nullable), model_provider, model_name, tags: string[], num_messages, num_tools }`. **No message body and no per-version eval metrics.**
+- **Fetch a version body:** `GET /api/v1/tasks/{task_id}/prompts/{prompt_name}/versions/{prompt_version}` where `{prompt_version}` accepts `latest` | an integer | an ISO datetime | a tag → `AgenticPrompt { messages }`. This is the endpoint the existing `ArthurClient.getPromptByTag` already uses (it passes a tag). We use it to fetch the body of any specific version (the `production`-tagged one eagerly; an arbitrary version on demand).
+- **List all prompts on a task:** `GET /api/v1/tasks/{task_id}/prompts` → `LLMGetAllMetadataListResponse { count, llm_metadata: [{ name, versions, tags, created_at, latest_version_created_at, deleted_versions }] }`. Not strictly needed — our three phase-prompt names are fixed — so we don't use it.
+
+**Conclusion:** there is no editable prompt *registry* in this app, and the worker never persists prompt metadata locally — but Arthur **does** expose real version history (version number, created-at, tags, model) per named prompt, plus on-demand bodies. So the real, available data per phase prompt is: a stable name, the human phase label, the resolved **production body**, the resolved `source` (`arthur` | `fallback`), the model, and a list of **real Arthur versions** (`{ version, createdAt, tags, modelProvider, modelName, numMessages, numTools }`).
+
+This makes the conversion a faithful read-only swap **with real version history**. The mock-only fields that have **no Arthur source** — per-version eval/halluc/p95/cost metrics, traffic split, KPI deltas, `lastEditedBy`, the two-version A/B text diff — are **removed** (markup deleted, not replaced with static placeholders). Tags are real (`AgenticPromptVersionResponse.tags`), so a `production` badge and a tag filter are backed by data and kept.
+
+## Current state (mock)
+
+`apps/dashboard/components/cockpit/screens/prompts.tsx` (`PromptsScreen`) consumes three mock slices via `const D = AIWF_DATA`:
+
+1. `D.PROMPTS: Prompt[]` — 7 entries. Per the mock `Prompt` type (`apps/dashboard/lib/types.ts:64`):
+ `id`, `name`, `workflow`, `workflowName`, `span`, `versionCount`, `current`, `trafficSplit: Record`, `evalScore`, `evalDelta`, `lastEditedBy`, `lastEditedAtMin`, `tags: PromptTag[]`, `model`.
+2. `D.PROMPT_VERSIONS: Record<string, PromptVersion[]>` — only `p_plan_changes` has history. Per `PromptVersion` (`types.ts:81`):
+ `v`, `deployedAt`, `by`, `status: PromptTag`, `traffic`, `evalScore`, `runs`, `costAvg`, `p95`, `halluc`, `change`.
+3. `D.PROMPT_BODIES: Record<string, string>` — body text keyed by version label (`v12`, `v11`).
+
+`PromptTag = "production" | "staging" | "draft" | "archived" | "locked" | "ab-test"`.
+
+What the screen renders from these:
+- **Header KPIs** (`CkKPI`): total prompts, count in `production`, count of `ab-test`, and a hardcoded `"+0.4%"` avg eval delta.
+- **Left rail `PromptList`:** tag filter pills (`all/production/staging/draft/locked`), per-prompt row showing `name`, `current` version, `workflowName`, tag chips, and an `evalScore`/`evalDelta` figure.
+- **Right pane `PromptDetail`:** header eyebrow `Arthur · {workflowName} → {span}`, `+ New version` / `Deploy` buttons, four `Stat`s (current version, version count, eval score, traffic split), a **version timeline** of `PromptVersion[]`, a two-column **text diff** between two selected versions (`PromptDiff`, reads `PROMPT_BODIES`), and a **side-by-side metrics** table (`PromptMetrics`: evalScore/halluc/p95/costAvg/runs). It already has graceful empty states: "Select a prompt to inspect." and "Detailed version history not yet captured for this prompt." (rendered when `versions.length === 0`).
+
+The page (`app/(cockpit)/prompts/page.tsx`) is a 4-line stub that renders `<PromptsScreen />` with no data fetch.
+
+## Existing pattern (template)
+
+Real data flows through three layers (see `app/overview-data.tsx`, `app/runs-data.tsx`):
+
+1. `app/(cockpit)/<name>/page.tsx` — thin server route: `<Suspense fallback={<NameSkeleton />}><NameData /></Suspense>`.
+2. `app/<name>-data.tsx` — **server component**: calls `getJSON(path)` (`lib/api/server.ts`, server-only fetch with `Bearer WORKER_API_TOKEN`, `cache: "no-store"`, 10s timeout), `.catch()`es to a fallback in `lib/api/fallbacks.ts`, passes a `data` prop to the client screen.
+3. `components/cockpit/screens/<name>.tsx` — **client presenter**: receives `data`, renders. Untracked metrics arrive `null`/empty and render as `—` or an empty state.
+
+Worker routes live under `apps/worker/src/routes/api/v1/*.get.ts` as h3 `defineEventHandler`s returning a typed `@shared/contracts` response, gated by the shared bearer token (`apps/worker/src/lib/api-auth.ts`). Response interfaces are declared in `apps/shared/contracts/api.ts`; row/entity types in `apps/shared/contracts/domain.ts`.
+
+## Proposed data contract
+
+Add to `apps/shared/contracts/api.ts`. Entity type goes in `domain.ts` (currently has no prompt type).
+
+### `apps/shared/contracts/domain.ts` (new entities)
+
+```ts
+/** One Arthur version of a named prompt (metadata; body fetched on demand). */
+export interface PromptVersion {
+ /** Arthur integer version number. */
+ version: number;
+ /** ISO timestamp the version was created. */
+ createdAt: string;
+ /** Real Arthur tags on this version, e.g. ["production"]. */
+ tags: string[];
+ modelProvider: string;
+ modelName: string;
+ numMessages: number;
+ numTools: number;
+ /** Body text. Present only for the production version (eager); other
+ * versions are fetched on demand via the by-version endpoint. */
+ body?: string;
+}
+
+/** A workflow phase prompt as resolved by the worker at runtime. */
+export interface PromptDef {
+ /** Stable Arthur/fallback key: "research-plan" | "implement" | "review". */
+ name: string;
+ /** Human label for the workflow phase, e.g. "Research & Plan". */
+ phase: string;
+ /** Resolved production prompt body (Arthur production tag, or in-code fallback). */
+ body: string;
+ /** Where the resolved `body` came from. */
+ source: "arthur" | "fallback";
+ /** Model the agent runs this prompt with (env-derived). */
+ model: string;
+ /** Real Arthur version history, newest first. Empty when source is "fallback". */
+ versions: PromptVersion[];
+}
+```
+
+### `apps/shared/contracts/api.ts` (new response)
+
+```ts
+export interface PromptsResponse {
+ generatedAt: string;
+ /** `false` when the worker can't resolve prompts (degrades to empty list). */
+ available: boolean;
+ /** Whether Arthur is configured (key + endpoint + task id all set). When
+ * false, every prompt's `source` is "fallback" and `versions` is empty. */
+ arthurEnabled: boolean;
+ rows: PromptDef[];
+ total: number;
+}
+
+/** On-demand body for a single historical Arthur version. */
+export interface PromptVersionBodyResponse {
+ generatedAt: string;
+ available: boolean;
+ body: string | null;
+}
+```
+
+**Body fetch strategy — decided: eager for the production version, lazy for the rest.** The list response carries every phase prompt with its full `versions` metadata array and the **production body eagerly** on `PromptDef.body` (we already fetch it to resolve what the workflow uses, so it's free). Non-production version bodies are NOT shipped in this response — `PromptVersion.body` is `undefined` for them. When the user expands a historical version, the screen fetches that single body on demand through a second worker route (see "Worker routes"). This keeps the list response small (3 bodies, not N) and avoids fanning out an unbounded number of Arthur body calls per page load.
+
+Notes:
+- `available` follows the `RunsResponse`/`RunDetailResponse` convention: `true` on a successful resolve, `false` in the fallback object.
+- `arthurEnabled` lets the screen honestly say "showing in-code defaults" vs "showing production prompts from Arthur".
+- Per-version eval/halluc/p95/cost metrics, traffic split, and `lastEditedBy` are **not** in the contract — Arthur's version list is metadata only and has no such source. The screen markup that rendered them is removed.
+
+## Worker routes
+
+### `GET /api/v1/prompts` — list (new file `apps/worker/src/routes/api/v1/prompts.get.ts`, mirrors `runs.get.ts`)
+
+- `defineEventHandler` returning `PromptsResponse`, same `Cache-Control: private, max-age=15, stale-while-revalidate=60` header.
+- Resolve all three phase prompts via a shared helper `resolvePrompts()`. The exact production-body resolution already lives in `loadPrompts()` (`workflows/prompts-step.ts`), which is a `"use step"` durable step returning `{ research, implement, review }` — not callable from a plain h3 route. **Decision (option A, confirmed OK to touch the step):** extract the pure resolution into `apps/worker/src/lib/prompts/resolve.ts`, returning `PromptDef[]` + `arthurEnabled`, and have **both** `loadPrompts()` and the route call it. Single source of truth, no drift.
+- Per prompt, `resolvePrompts()` does:
+ - `model` = `env.AGENT_KIND === "codex" ? env.CODEX_MODEL : env.CLAUDE_MODEL` (same expression as `runs.get.ts`).
+ - `phase` from a static label map: `research-plan → "Research & Plan"`, `implement → "Implement"`, `review → "Review"`.
+ - When Arthur is enabled: fetch the `production`-tagged body via the existing `ArthurClient.getPromptByTag(taskId, name, "production")` (→ `body`, `source: "arthur"`), AND fetch the version list via a new `ArthurClient.listPromptVersions(taskId, name)` (→ `versions: PromptVersion[]`, newest first). Any single failure degrades that prompt to its in-code fallback body, `source: "fallback"`, `versions: []` — same per-prompt try/catch the current step already has.
+ - When Arthur is disabled: `body` = `PROMPT_FALLBACKS[name]`, `source: "fallback"`, `versions: []`.
+- `available: true` on success; the `catch` returns the empty `available:false` object (matching `runs.get.ts`). Resolution rarely fully throws because each prompt independently falls back, so the happy path always has three rows.
+
+### `GET /api/v1/prompts/[name]/versions/[version]` — on-demand body (new file)
+
+Backs lazy body fetching for historical versions the user expands. New file `apps/worker/src/routes/api/v1/prompts/[name]/versions/[version].get.ts` (h3 dynamic-segment pattern, same as the existing `runs/[runId].get.ts`):
+
+- Reads route params `name` and `version`, validates `name` against `PROMPT_NAMES` (404/empty otherwise), calls a new `ArthurClient.getPromptVersionBody(taskId, name, version)` which hits `GET /api/v1/tasks/{task_id}/prompts/{name}/versions/{version}` and returns the first message content (the existing `getPromptByTag` already parses this `AgenticPrompt.messages[0].content` shape — generalize it to accept any `{prompt_version}`).
+- Returns a small typed response `PromptVersionBodyResponse { generatedAt; available: boolean; body: string | null }` (add to `api.ts`). When Arthur is disabled or the version is missing → `available:false, body:null`.
+- Same `Cache-Control` header and bearer gate as the other v1 routes.
+
+## Dashboard wiring
+
+1. **`lib/api/fallbacks.ts`** — add `promptsFallback(now)`:
+ ```ts
+ export function promptsFallback(now: string): PromptsResponse {
+ return { generatedAt: now, available: false, arthurEnabled: false, rows: [], total: 0 };
+ }
+ ```
+2. **`app/prompts-data.tsx`** (new server component), single fetch like `runs-data.tsx`:
+ ```ts
+ const data = await getJSON("/api/v1/prompts").catch(() => promptsFallback(now));
+   return <PromptsScreen data={data} />;
+ ```
+3. **`app/prompts-skeleton.tsx`** (new) — header + KPI row + two-column (rail + detail) block, styled like `overview-skeleton.tsx`.
+4. **`app/(cockpit)/prompts/page.tsx`** — rewrite to the `<Suspense fallback={<PromptsSkeleton />}><PromptsData /></Suspense>` shape.
+5. **`components/cockpit/screens/prompts.tsx`** — change `PromptsScreen()` to `PromptsScreen({ data }: { data: PromptsResponse })`. Map the real `PromptDef[]` onto the existing UI. Keep the tag filter and version timeline (now real), but **delete** the per-version metrics grid and the two-column A/B diff (no backing data). Historical-version body expansion fetches lazily from the on-demand route.
+6. **On-demand version-body fetch (client).** `PromptsScreen` is a `"use client"` presenter, so expanding a historical version does a client-side `fetch`. The bearer-gated worker API is not directly reachable from the browser (the `WORKER_API_TOKEN` is server-only — see `lib/api/server.ts`). So add a thin Next route handler `app/api/prompts/[name]/versions/[version]/route.ts` that re-uses `getJSON("/api/v1/prompts/<name>/versions/<version>")` server-side and returns it to the client. The screen fetches `/api/prompts/{name}/versions/{version}` (same-origin, no token exposure). Cache the resolved body in component state so re-expanding doesn't refetch.
+
+### Screen mapping (mock field → real field / behavior)
+
+| Mock usage | Real replacement |
+|---|---|
+| `D.PROMPTS` list | `data.rows` (3 `PromptDef`) |
+| `p.id` (row key, selection) | `p.name` (stable key) |
+| `p.name` | `p.name` |
+| `p.workflowName` / `p.span` (eyebrow) | `p.phase` (eyebrow `{data.arthurEnabled ? "Arthur" : "In-code"} · {p.phase}`) |
+| `p.current` version badge | real: highest `p.versions[].version`, or the production-tagged version number; show `source` chip alongside |
+| `p.tags` chips + tag filter pills | **kept, real** — derive the row's tags from its production version's `tags` (`p.versions.find(v => v.tags.includes("production"))?.tags`), and per-version `tags` in the timeline. Filter pills reduced to tags that actually occur (e.g. `all` + `production`). |
+| `p.evalScore` / `p.evalDelta` | **removed** (no Arthur source — markup deleted) |
+| `D.PROMPT_VERSIONS[id]` timeline | **kept, real** — `p.versions` (`{version, createdAt, tags, modelName, numMessages, numTools}`), newest first. Each entry shows version number, `createdAt`, tag chips, `modelName`, message/tool counts. The mock's eval/halluc/p95/cost rows in each timeline card are **removed**. |
+| `D.PROMPT_BODIES[v]` two-column diff (`PromptDiff`) | **removed** — replaced by a single read-only body panel. Shows `p.body` (production) by default; clicking a timeline version fetches that version's body via the on-demand route and renders it in the same panel. |
+| `PromptMetrics` side-by-side table | **removed** (no per-version metrics) |
+| Header KPIs (total / production / ab-test / avg Δ) | total = `data.total`; "In production" = count of rows whose versions include a `production` tag; ab-test and avg-Δ tiles **removed** (no source) |
+| `+ New version` / `Deploy` / `Import from prod` / `+ New prompt` buttons | left inert (read-only), matching how `/runs` left its `+ Filter` / `Export` buttons |
+
+Faithful render: left rail lists the 3 prompts by `name` + `phase` + `model` + production tag chip; right pane shows a read-only body panel (production body by default, swappable to a selected historical version fetched on demand) plus the real version timeline. Reuses `CkCard`/`CkKPI`/`Stat`, the chip styling (repurposed for real `tags`), and the single-column body markup lifted from the old `PromptDiff`.
+
+## Behavior
+
+- **Happy path (Arthur disabled — current production reality):** `/prompts` lists the 3 workflow prompts with their in-code fallback bodies, `source: "fallback"`, `arthurEnabled: false`, `versions: []`. Eyebrow reflects "In-code". The version timeline section is empty (no markup, since there are no versions). Bodies are exactly what the agent runs.
+- **Happy path (Arthur enabled):** each prompt's production body and full real version history come from Arthur (`source: "arthur"`). The timeline lists every Arthur version with its real `version`, `createdAt`, `tags`, and `modelName`. Expanding a historical version fetches its body on demand via `GET /api/v1/prompts/[name]/versions/[version]`. A prompt that fails to resolve from Arthur degrades to its fallback body with `versions: []`.
+- **Worker down / 401:** `getJSON` throws → `promptsFallback` → empty list, `available:false`. The screen shows its "Select a prompt to inspect." empty state with `0 prompts`. No crash. Same silent-fallback as `/runs`. An on-demand body fetch that fails renders an inline "version body unavailable" note, not a page crash.
+
+## Out of scope
+
+- Editing, creating, deploying, or version-bumping prompts (the `+ New version` / `Deploy` / `Import from prod` / `+ New prompt` buttons stay inert).
+- Per-version eval/halluc/p95/cost metrics and the two-version A/B text diff — no Arthur source; markup removed.
+- Traffic split, `lastEditedBy`, eval deltas — no source; markup removed.
+- Wiring the `/editor` view (separate `workflow-editor` screen).
+
+## Open questions / assumptions
+
+Resolved by user decisions and Arthur API ground-truthing:
+
+- **Read-only — confirmed.** No write endpoints; action buttons stay inert.
+- **Version history — confirmed in scope.** Real Arthur version history (metadata + on-demand bodies) is included. Per-version eval metrics are NOT available from Arthur's version-list endpoint (metadata only: `{version, created_at, tags, model_name, num_messages, num_tools}`), so the mock's per-version metrics are dropped — confirmed acceptable.
+- **Tags are real.** The `production` badge and the tag filter are backed by `AgenticPromptVersionResponse.tags`; kept.
+- **Resolution-helper extraction — confirmed.** Shared `resolvePrompts()` used by both `loadPrompts()` and the route; OK to touch `prompts-step.ts`.
+- **Embellishment fields — removed, not stubbed.** Per the user decision, fields with no real backing have their markup deleted rather than rendered as static placeholders.
+
+Still open:
+
+1. **Lazy vs eager body fetch — proposed eager-for-production, lazy-for-history.** Stated above; flagged here in case you'd rather ship all version bodies eagerly (simpler client, larger/slower response) or fetch even the production body lazily (smaller list response, extra round-trip on first view).
+2. **Version pagination depth.** Arthur's `…/versions` endpoint is paginated. Assumption: fetch the first page only (newest N, e.g. default page size) and not the full history — sufficient for the timeline. Confirm whether deep history (all pages) is required.
+
+## Verification
+
+1. `apps/shared` + `apps/worker` typecheck (`pnpm -F @apps/worker typecheck` or `npx tsc --noEmit`).
+2. Worker `GET /api/v1/prompts` returns 3 rows with non-empty `body`, correct `source`, `arthurEnabled` reflecting env, and (Arthur on) a non-empty `versions[]` with real `version`/`createdAt`/`tags`. Existing `prompts-step` tests still pass.
+3. Worker `GET /api/v1/prompts/research-plan/versions/<version>` returns that version's `body` (Arthur on) or `available:false` (Arthur off / missing).
+4. Dashboard typecheck passes.
+5. `/prompts` renders the 3 real prompts; selecting one shows its production body; the timeline lists real Arthur versions; expanding one fetches and shows that version's body. With Arthur disabled, `source` is `fallback`, the timeline is empty, and bodies match `apps/worker/src/lib/prompts.ts`.
+6. With the worker unreachable, `/prompts` shows the empty state (`0 prompts`), not an error.
+
+