diff --git a/.claude/learnings.md b/.claude/learnings.md
index ab7a529..890f278 100644
--- a/.claude/learnings.md
+++ b/.claude/learnings.md
@@ -48,3 +48,8 @@ In flow-editor.tsx (FlowCanvas), touch drag/pan on iOS jumped ~5px then stopped,
 
 ## 2026-06-05 — CORRECTION: real root cause was onPointerLeave, not page-scroll
 On-device HUD instrumentation proved the actual cause of the iOS canvas drag bug: the canvas had `onPointerLeave={onPointerUp}`, and iOS Safari spuriously fires `pointerleave` mid-gesture (finger still down and inside the element, even with pointer capture set). That ended the drag one move in — node/canvas jumped ~5px then froze while the finger kept moving. Fix: gate it to non-touch only `onPointerLeave={(e)=>{ if(e.pointerType!=="touch") onPointerUp(e); }}` — pointer capture guarantees a real pointerup/cancel, so pointerleave is only needed as the desktop mouse-left-window fallback. The non-passive touchmove preventDefault was kept (prevents page-scroll hijack / pointercancel) but was NOT the primary fix. Lesson: when reasoning about WebKit event quirks fails twice, add an on-screen HUD logging the raw pointer event stream instead of guessing.
+
+## Arthur GenAI Engine — eval read path (for /evals page)
+- Arthur is integrated **write-only** in the worker today: `arthur-tracer.ts` ships OpenInference traces via `POST /api/v1/traces`; `arthur-client.ts` only does tasks/prompts (`/api/v2/tasks*`, `/api/v1/tasks/{id}/prompts*`). Nothing reads evals back.
+- Arthur DOES expose a read API (same `Bearer GENAI_ENGINE_API_KEY`, needs `INFERENCE_READ`, org-scoped → matches single-tenant). Primary for fleet eval health: `POST /api/v1/traces/overview` { task_ids, start_time, end_time } → overviews of { trace_count, eval_count, continuous_eval_success_rate, ... }. Per-metric detail: span `metric_results` (metric_type enum is ONLY QueryRelevance|ResponseRelevance|ToolSelection; `details` is opaque JSON, no flat score/pass-fail — parse + threshold yourself). Timeseries: `POST /api/v1/traces/overview/timeseries` (single task per call).
+- CRITICAL: the rich rule families the mock shows (hallucination/PII/toxicity/prompt-injection) come ONLY from the legacy `/validate_prompt` + `/validate_response` write path, which we never call. `GET /api/v2/inferences/query` is empty for us. Our trace path yields only success-rate + eval/trace counts + the 3 relevance/tool metric types (and only if continuous evals are configured on the task; else eval_count=0).
diff --git a/apps/dashboard/app/(cockpit)/cost/page.tsx b/apps/dashboard/app/(cockpit)/cost/page.tsx
index 4ff4588..556bcef 100644
--- a/apps/dashboard/app/(cockpit)/cost/page.tsx
+++ b/apps/dashboard/app/(cockpit)/cost/page.tsx
@@ -1,5 +1,12 @@
 // apps/dashboard/app/(cockpit)/cost/page.tsx — Cost & usage ("/cost")
-import { CostScreen } from "@/components/cockpit/screens/cost";
+import { Suspense } from "react";
+import { CostData } from "@/app/cost-data";
+import { CostSkeleton } from "@/app/cost-skeleton";
+
 export default function CostPage() {
-  return <CostScreen />;
+  return ( // CostSkeleton is rendered as the fallback until the async CostData component resolves
+    <Suspense fallback={<CostSkeleton />}>
+      <CostData />
+    </Suspense>
+  );
 }
diff --git a/apps/dashboard/app/(cockpit)/evals/page.tsx b/apps/dashboard/app/(cockpit)/evals/page.tsx
index 24320a5..108427f 100644
--- a/apps/dashboard/app/(cockpit)/evals/page.tsx
+++ b/apps/dashboard/app/(cockpit)/evals/page.tsx
@@ -1,5 +1,13 @@
 // apps/dashboard/app/(cockpit)/evals/page.tsx — Arthur evals ("/evals")
-import { EvalsScreen } from "@/components/cockpit/screens/evals";
+import { Suspense } from "react";
+
+import { EvalsData } from "@/app/evals-data";
+import { EvalsSkeleton } from "@/app/evals-skeleton";
+
 export default function EvalsPage() {
-  return <EvalsScreen />;
+  return ( // EvalsSkeleton is rendered as the fallback until the async EvalsData component resolves
+    <Suspense fallback={<EvalsSkeleton />}>
+      <EvalsData />
+    </Suspense>
+  );
 }
diff --git a/apps/dashboard/app/(cockpit)/prompts/page.tsx b/apps/dashboard/app/(cockpit)/prompts/page.tsx
index d3f5a66..d44f3bb 100644
--- a/apps/dashboard/app/(cockpit)/prompts/page.tsx
+++ b/apps/dashboard/app/(cockpit)/prompts/page.tsx
@@ -1,5 +1,13 @@
 // apps/dashboard/app/(cockpit)/prompts/page.tsx — Prompts ("/prompts")
-import { PromptsScreen } from "@/components/cockpit/screens/prompts";
+import { Suspense } from "react";
+
+import { PromptsData } from "@/app/prompts-data";
+import { PromptsSkeleton } from "@/app/prompts-skeleton";
+
 export default function PromptsPage() {
-  return <PromptsScreen />;
+  return ( // PromptsSkeleton is rendered as the fallback until the async PromptsData component resolves
+    <Suspense fallback={<PromptsSkeleton />}>
+      <PromptsData />
+    </Suspense>
+  );
 }
diff --git a/apps/dashboard/app/api/prompts/[name]/versions/[version]/route.ts b/apps/dashboard/app/api/prompts/[name]/versions/[version]/route.ts
new file mode 100644
index 0000000..3979644
--- /dev/null
+++ b/apps/dashboard/app/api/prompts/[name]/versions/[version]/route.ts
@@ -0,0 +1,18 @@
+// apps/dashboard/app/api/prompts/[name]/versions/[version]/route.ts
+// Same-origin proxy so the client can lazily fetch a historical prompt-version
+// body without the server-only WORKER_API_TOKEN ever reaching the browser.
+import { NextResponse } from "next/server";
+import { getJSON } from "@/lib/api/server";
+import type { PromptVersionBodyResponse } from "@shared/contracts";
+
+export async function GET(
+  _req: Request,
+  { params }: { params: Promise<{ name: string; version: string }> },
+) {
+  const route = await params;
+  const unavailable = { generatedAt: new Date().toISOString(), available: false, body: null };
+  const path = `/api/v1/prompts/${encodeURIComponent(route.name)}/versions/${encodeURIComponent(route.version)}`;
+  // Any getJSON rejection resolves to the "unavailable" payload rather than propagating.
+  const payload = await getJSON<PromptVersionBodyResponse>(path).catch(() => unavailable);
+  return NextResponse.json(payload);
+}
diff --git a/apps/dashboard/app/cost-data.tsx b/apps/dashboard/app/cost-data.tsx
new file mode 100644
index 0000000..6620777
--- /dev/null
+++ b/apps/dashboard/app/cost-data.tsx
@@ -0,0 +1,13 @@
+// apps/dashboard/app/cost-data.tsx
+import { getJSON } from "@/lib/api/server";
+import { CostScreen } from "@/components/cockpit/screens/cost";
+import type { CostResponse } from "@shared/contracts";
+import { costFallback } from "@/lib/api/fallbacks";
+
+export async function CostData() {
+  const requestedAt = new Date().toISOString();
+  const pending = getJSON<CostResponse>("/api/v1/cost");
+  // A rejected fetch falls back to costFallback(requestedAt) so the screen always gets a payload.
+  const payload = await pending.catch(() => costFallback(requestedAt));
+  return <CostScreen data={payload} />;
+}
diff --git a/apps/dashboard/app/cost-skeleton.tsx b/apps/dashboard/app/cost-skeleton.tsx
new file mode 100644
index 0000000..1a95fa2
--- /dev/null
+++ b/apps/dashboard/app/cost-skeleton.tsx
@@ -0,0 +1,18 @@
+// apps/dashboard/app/cost-skeleton.tsx — Suspense fallback for /cost; padding and KPI-grid breakpoints mirror CostScreen so swapping in real content does not shift layout on narrow viewports
+import { Block } from "./skeleton-block";
+
+export function CostSkeleton() {
+  return (
+    <div className="px-4 lg:px-6 pt-5 pb-8 flex flex-col gap-4">
+      <div className="grid grid-cols-1 lg:grid-cols-3 gap-2.5 lg:gap-3">
+        {Array.from({ length: 3 }, (_, i) => <Block key={i} className="h-[100px]" />)}
+      </div>
+      <div className="grid lg:grid-cols-[1.5fr_1fr] gap-3">
+        <Block className="h-[260px]" />
+        <Block className="h-[260px]" />
+      </div>
+      <Block className="h-[300px]" />
+      <Block className="h-[300px]" />
+    </div>
+  );
+}
diff --git a/apps/dashboard/app/evals-data.tsx b/apps/dashboard/app/evals-data.tsx
new file mode 100644
index 0000000..fd2cd2f
--- /dev/null
+++ b/apps/dashboard/app/evals-data.tsx
@@ -0,0 +1,13 @@
+// apps/dashboard/app/evals-data.tsx
+import { getJSON } from "@/lib/api/server";
+import { EvalsScreen } from "@/components/cockpit/screens/evals";
+import type { EvalsResponse } from "@shared/contracts";
+import { evalsFallback } from "@/lib/api/fallbacks";
+
+export async function EvalsData() {
+  const requestedAt = new Date().toISOString();
+  const pending = getJSON<EvalsResponse>("/api/v1/evals");
+  // A rejected fetch falls back to evalsFallback(requestedAt) so the screen always gets a payload.
+  const payload = await pending.catch(() => evalsFallback(requestedAt));
+  return <EvalsScreen data={payload} />;
+}
diff --git a/apps/dashboard/app/evals-skeleton.tsx b/apps/dashboard/app/evals-skeleton.tsx
new file mode 100644
index 0000000..86ba867
--- /dev/null
+++ b/apps/dashboard/app/evals-skeleton.tsx
@@ -0,0 +1,16 @@
+// apps/dashboard/app/evals-skeleton.tsx — placeholder layout (header row plus one card-sized block) used as the Suspense fallback on the /evals page
+import { Block } from "./skeleton-block";
+
+export function EvalsSkeleton() {
+  return (
+    <div className="px-4 lg:px-6 pt-5 pb-8 flex flex-col gap-4">
+      {/* Header (eyebrow + title, chip) */}
+      <div className="flex items-center justify-between">
+        <Block className="h-10 w-72" />
+        <Block className="h-8 w-64" />
+      </div>
+      {/* Quality group card */}
+      <Block className="h-[200px]" />
+    </div>
+  );
+}
diff --git a/apps/dashboard/app/prompts-data.tsx b/apps/dashboard/app/prompts-data.tsx
new file mode 100644
index 0000000..34d7504
--- /dev/null
+++ b/apps/dashboard/app/prompts-data.tsx
@@ -0,0 +1,13 @@
+// apps/dashboard/app/prompts-data.tsx
+import { getJSON } from "@/lib/api/server";
+import { PromptsScreen } from "@/components/cockpit/screens/prompts";
+import type { PromptsResponse } from "@shared/contracts";
+import { promptsFallback } from "@/lib/api/fallbacks";
+
+export async function PromptsData() {
+  const requestedAt = new Date().toISOString();
+  const pending = getJSON<PromptsResponse>("/api/v1/prompts");
+  // A rejected fetch falls back to promptsFallback(requestedAt) so the screen always gets a payload.
+  const payload = await pending.catch(() => promptsFallback(requestedAt));
+  return <PromptsScreen data={payload} />;
+}
diff --git a/apps/dashboard/app/prompts-skeleton.tsx b/apps/dashboard/app/prompts-skeleton.tsx
new file mode 100644
index 0000000..feb45c0
--- /dev/null
+++ b/apps/dashboard/app/prompts-skeleton.tsx
@@ -0,0 +1,22 @@
+// apps/dashboard/app/prompts-skeleton.tsx — placeholder layout (header row, two tiles, then a two-pane area) used as the Suspense fallback on the /prompts page
+import { Block } from "./skeleton-block";
+
+export function PromptsSkeleton() {
+  return (
+    <div className="px-4 lg:px-6 pt-5 pb-8 flex flex-col gap-4">
+      <div className="flex items-end justify-between">
+        <Block className="h-10 w-56" />
+        <Block className="h-9 w-64" />
+      </div>
+      <div className="grid grid-cols-1 lg:grid-cols-2 gap-3">
+        {Array.from({ length: 2 }, (_, i) => (
+          <Block key={i} className="h-[124px]" />
+        ))}
+      </div>
+      <div className="flex flex-col lg:grid lg:grid-cols-[340px_1fr] gap-3 lg:min-h-[720px]">
+        <Block className="lg:h-full h-[300px]" />
+        <Block className="lg:h-full h-[400px]" />
+      </div>
+    </div>
+  );
+}
diff --git a/apps/dashboard/app/skeleton-block.tsx b/apps/dashboard/app/skeleton-block.tsx
new file mode 100644
index 0000000..03c2647
--- /dev/null
+++ b/apps/dashboard/app/skeleton-block.tsx
@@ -0,0 +1,4 @@
+// apps/dashboard/app/skeleton-block.tsx — shared pulsing placeholder rectangle for the loading skeletons; callers size it through className, which is appended to the base classes
+export function Block({ className = "" }: { className?: string }) {
+  return <div className={`bg-neutral-200/60 rounded-sm animate-pulse ${className}`} />;
+}
diff --git a/apps/dashboard/components/cockpit/screens/cost.tsx b/apps/dashboard/components/cockpit/screens/cost.tsx
index 499a78a..470378b 100644
--- a/apps/dashboard/components/cockpit/screens/cost.tsx
+++ b/apps/dashboard/components/cockpit/screens/cost.tsx
@@ -1,141 +1,169 @@
 "use client";
 
 import React from "react";
-import { CkCard, CkKPI, CkChip, CkTabs, CkDot } from "@/components/ui";
-import { Spark, AreaChart, Donut } from "@/components/charts";
-import { AIWF_DATA } from "@/lib/data/mock";
-import { sparkSeries } from "@/lib/rng";
+import { CkCard, CkKPI, CkDot } from "@/components/ui";
+import { AreaChart, Donut } from "@/components/charts";
+import type { CostResponse } from "@shared/contracts";
 
-const D = AIWF_DATA;
+const DONUT_COLORS = ["#3C43E7", "#FD6027", "#FFC800", "#181B20", "#8FC548"]; // model-mix palette: passed to Donut and cycled by index for the legend dots
+
+/** Short "Mon D" label for the daily-spend x-axis, formatted in UTC so a date-only bucket (parsed as UTC midnight) never slips to the previous day in the viewer's zone or differs between server and client render; unparseable input is returned unchanged. */
+function shortDate(date: string): string {
+  const d = new Date(date);
+  if (Number.isNaN(d.getTime())) return date;
+  return d.toLocaleDateString("en-US", { month: "short", day: "numeric", timeZone: "UTC" });
+}
+
+export function CostScreen({ data }: { data: CostResponse }) {
+  if (!data.available) {
+    return (
+      <div className="flex flex-col gap-4 px-4 lg:px-6 pt-5 pb-8">
+        <div className="flex items-end justify-between">
+          <div>
+            <div className="font-mono text-[10px] uppercase tracking-[0.06em] text-neutral-500">Arthur · token usage</div>
+            <h2 className="font-display text-2xl font-medium leading-[1.2] text-neutral-900 m-0">Cost & token usage</h2>
+          </div>
+        </div>
+        <div className="bg-panel border border-neutral-200 rounded-sm px-5 py-8 font-body text-sm text-neutral-500">
+          Cost data is unavailable — Arthur GenAI Engine is not configured or unreachable.
+        </div>
+      </div>
+    );
+  }
+
+  const { totals, byModel, byWorkflow, daily } = data;
+  const total = totals.totalTokenCost;
+  const modelCostTotal = byModel.reduce((a, m) => a + m.cost, 0);
 
-export function CostScreen() {
-  const total = D.COST_BY_MODEL.reduce((a, m) => a + m.cost, 0);
-  const tokensTotal = D.COST_BY_MODEL.reduce((a, m) => a + m.tokens, 0);
   return (
     <div className="flex flex-col gap-4 px-4 lg:px-6 pt-5 pb-8">
       <div className="flex items-end justify-between">
         <div>
-          <div className="font-mono text-[10px] uppercase tracking-[0.06em] text-neutral-500">Vercel ai gateway · billing</div>
+          <div className="font-mono text-[10px] uppercase tracking-[0.06em] text-neutral-500">Arthur · token usage</div>
           <h2 className="font-display text-2xl font-medium leading-[1.2] text-neutral-900 m-0">Cost & token usage</h2>
         </div>
-        <div className="flex gap-2">
-          <CkTabs active="model" onChange={() => {}} tabs={[
-            { id: "model", label: "By model" }, { id: "wf", label: "By workflow" }, { id: "actor", label: "By actor" }]
-          } />
-          <button className="appearance-none border border-neutral-200 bg-panel px-3 py-1.5 rounded-[3px] font-mono text-[11px] text-neutral-900 uppercase tracking-[0.04em] cursor-pointer">Export CSV</button>
-        </div>
       </div>
 
-      <div className="grid grid-cols-2 lg:grid-cols-4 gap-2.5 lg:gap-3">
-        <CkKPI label="MTD spend" value={"$" + total.toFixed(2)} sub="of $1,200 budget" delta="↗ +18% MoM" deltaTone="bad" />
-        <CkKPI label="Tokens · MTD" value={(tokensTotal / 1_000_000).toFixed(2) + "M"} delta="↗ +24% MoM" deltaTone="bad" />
-        <CkKPI label="Cost / run avg" value="$0.41" sub="all workflows" delta="↘ −$0.03 WoW" deltaTone="good" />
-        <CkKPI label="Projection · EoM" value="$1,184" sub="98.7% of budget" delta="⚠ tight" deltaTone="bad" />
+      <div className="grid grid-cols-1 lg:grid-cols-3 gap-2.5 lg:gap-3">
+        <CkKPI label="MTD spend" value={"$" + total.toFixed(2)} />
+        <CkKPI label="Tokens · MTD" value={(totals.totalTokens / 1_000_000).toFixed(2) + "M"} />
+        <CkKPI label="Cost / run avg" value={"$" + totals.costPerRun.toFixed(2)} sub="all workflows" />
       </div>
 
       <div className="flex flex-col lg:grid lg:grid-cols-[1.5fr_1fr] gap-3">
-        <CkCard eyebrow="Spend trajectory" title="Daily spend · MTD"
-          action={<CkTabs active="cost" onChange={() => {}} tabs={[{ id: "cost", label: "Cost" }, { id: "tokens", label: "Tokens" }]} />}>
-          <div className="overflow-x-auto">
-            <AreaChart data={D.HOURS24.map((h) => h.cost * 24)} w={680} h={200} stroke="#FD6027" fill="#FD6027" labels={D.HOURS24.map((_, i) => "D" + (i + 1))} valueFmt={(v) => "$" + Math.round(v)} />
-          </div>
+        <CkCard eyebrow="Spend trajectory" title="Daily spend · MTD">
+          {daily.length > 0 ? (
+            <div className="overflow-x-auto">
+              <AreaChart
+                data={daily.map((d) => d.cost)}
+                w={680}
+                h={200}
+                stroke="#FD6027"
+                fill="#FD6027"
+                labels={daily.map((d) => shortDate(d.date))}
+                valueFmt={(v) => "$" + Math.round(Number(v))}
+              />
+            </div>
+          ) : (
+            <div className="px-5 py-10 text-center text-neutral-500 text-sm">No spend data</div>
+          )}
         </CkCard>
 
-        <CkCard eyebrow="Vercel AI Gateway" title="Model mix">
-          <div className="flex items-center gap-[18px]">
-            <Donut shares={D.COST_BY_MODEL.map((m) => m.share)} size={140} thickness={22} colors={["#3C43E7", "#FD6027", "#FFC800", "#181B20", "#8FC548"]} centerLabel={"$" + Math.round(total)} centerSub="MTD" />
-            <div className="flex flex-1 flex-col gap-2.5">
-              {D.COST_BY_MODEL.map((m, i) =>
-                <div key={m.model} className="flex items-center gap-2 font-body text-xs">
-                  <CkDot color={["#3C43E7", "#FD6027", "#FFC800", "#181B20", "#8FC548"][i]} />
-                  <span className="flex-1 font-mono text-neutral-900">{m.model}</span>
-                  <span className="font-mono font-medium text-neutral-700">${m.cost.toFixed(0)}</span>
-                </div>
-              )}
+        <CkCard eyebrow="Arthur" title="Model mix">
+          {byModel.length > 0 ? (
+            <div className="flex items-center gap-[18px]">
+              <Donut
+                shares={byModel.map((m) => (modelCostTotal ? m.cost / modelCostTotal : 0))}
+                size={140}
+                thickness={22}
+                colors={DONUT_COLORS}
+                centerLabel={"$" + Math.round(total)}
+                centerSub="MTD"
+              />
+              <div className="flex flex-1 flex-col gap-2.5">
+                {byModel.map((m, i) =>
+                  <div key={m.model} className="flex items-center gap-2 font-body text-xs">
+                    <CkDot color={DONUT_COLORS[i % DONUT_COLORS.length]} />
+                    <span className="flex-1 font-mono text-neutral-900">{m.model}</span>
+                    <span className="font-mono font-medium text-neutral-700">${m.cost.toFixed(0)}</span>
+                  </div>
+                )}
+              </div>
             </div>
-          </div>
+          ) : (
+            <div className="px-5 py-10 text-center text-neutral-500 text-sm">No model data</div>
+          )}
         </CkCard>
       </div>
 
       <CkCard eyebrow="Per-model breakdown" title="Spend & throughput" pad={0}>
-        <div className="overflow-x-auto">
-        <table className="w-full border-collapse font-body text-[13px]">
-          <thead>
-            <tr className="bg-neutral-100 text-neutral-700 font-mono text-[10px] uppercase tracking-[0.06em]">
-              {["Model", "Vendor", "Tokens", "Cost", "Share", "Trend"].map((h, i) =>
-                <th key={i} className={`px-4 py-2.5 font-medium border-b border-neutral-200 ${i >= 2 ? "text-right" : "text-left"}`}>{h}</th>
-              )}
-            </tr>
-          </thead>
-          <tbody>
-            {D.COST_BY_MODEL.map((m, i) =>
-              <tr key={m.model} className={i < D.COST_BY_MODEL.length - 1 ? "border-b border-neutral-200" : ""}>
-                <td className="px-4 py-3 font-mono font-medium text-neutral-900">{m.model}</td>
-                <td className="px-4 py-3 font-body text-neutral-700">{m.vendor}</td>
-                <td className="px-4 py-3 text-right font-mono">{(m.tokens / 1_000_000).toFixed(2)}M</td>
-                <td className="px-4 py-3 text-right font-mono font-semibold">${m.cost.toFixed(2)}</td>
-                <td className="px-4 py-3 text-right">
-                  <div className="inline-flex items-center gap-2">
-                    <div className="w-20 h-1.5 bg-app-bg rounded-[1px]">
-                      <div className="h-full bg-mariner rounded-[1px]" style={{ width: m.share * 100 + "%" }} />
-                    </div>
-                    <span className="font-mono text-[11px] w-9 text-right">{(m.share * 100).toFixed(0)}%</span>
-                  </div>
-                </td>
-                <td className="px-4 py-3 text-right text-neutral-700">
-                  <Spark data={sparkSeries(i + 1, 14, 0.5, 1)} w={80} h={20} stroke="#3C43E7" />
-                </td>
+        {byModel.length > 0 ? (
+          <div className="overflow-x-auto">
+          <table className="w-full border-collapse font-body text-[13px]">
+            <thead>
+              <tr className="bg-neutral-100 text-neutral-700 font-mono text-[10px] uppercase tracking-[0.06em]">
+                {["Model", "Tokens", "Cost", "Share"].map((h, i) =>
+                  <th key={i} className={`px-4 py-2.5 font-medium border-b border-neutral-200 ${i >= 1 ? "text-right" : "text-left"}`}>{h}</th>
+                )}
               </tr>
-            )}
-          </tbody>
-        </table>
-        </div>
+            </thead>
+            <tbody>
+              {byModel.map((m, i) => {
+                const share = modelCostTotal ? m.cost / modelCostTotal : 0;
+                return (
+                  <tr key={m.model} className={i < byModel.length - 1 ? "border-b border-neutral-200" : ""}>
+                    <td className="px-4 py-3 font-mono font-medium text-neutral-900">{m.model}</td>
+                    <td className="px-4 py-3 text-right font-mono">{(m.tokens / 1_000_000).toFixed(2)}M</td>
+                    <td className="px-4 py-3 text-right font-mono font-semibold">${m.cost.toFixed(2)}</td>
+                    <td className="px-4 py-3 text-right">
+                      <div className="inline-flex items-center gap-2">
+                        <div className="w-20 h-1.5 bg-app-bg rounded-[1px]">
+                          <div className="h-full bg-mariner rounded-[1px]" style={{ width: share * 100 + "%" }} />
+                        </div>
+                        <span className="font-mono text-[11px] w-9 text-right">{(share * 100).toFixed(0)}%</span>
+                      </div>
+                    </td>
+                  </tr>
+                );
+              })}
+            </tbody>
+          </table>
+          </div>
+        ) : (
+          <div className="px-5 py-10 text-center text-neutral-500 text-sm">No model breakdown available</div>
+        )}
       </CkCard>
 
       <CkCard eyebrow="Per-workflow breakdown" title="Where the spend is going" pad={0}>
-        <div className="overflow-x-auto">
-        <table className="w-full border-collapse font-body text-[13px]">
-          <thead>
-            <tr className="bg-neutral-100 text-neutral-700 font-mono text-[10px] uppercase tracking-[0.06em]">
-              {["Workflow", "Runs 24h", "Tokens", "Cost today", "$/run", "Trend"].map((h, i) =>
-                <th key={i} className={`px-4 py-2.5 font-medium border-b border-neutral-200 ${i >= 1 ? "text-right" : "text-left"}`}>{h}</th>
-              )}
-            </tr>
-          </thead>
-          <tbody>
-            {D.WORKFLOWS.slice().sort((a, b) => b.costToday - a.costToday).map((w, i, arr) => {
-              const tokens = Math.round(w.runs24h * 2400);
-              const perRun = w.costToday / Math.max(1, w.runs24h);
-              const trendUp = i % 2 === 0;
-              return (
-                <tr key={w.id} className={i < arr.length - 1 ? "border-b border-neutral-200" : ""}>
+        {byWorkflow.length > 0 ? (
+          <div className="overflow-x-auto">
+          <table className="w-full border-collapse font-body text-[13px]">
+            <thead>
+              <tr className="bg-neutral-100 text-neutral-700 font-mono text-[10px] uppercase tracking-[0.06em]">
+                {["Workflow", "Runs", "Tokens", "Cost", "$/run"].map((h, i) =>
+                  <th key={i} className={`px-4 py-2.5 font-medium border-b border-neutral-200 ${i >= 1 ? "text-right" : "text-left"}`}>{h}</th>
+                )}
+              </tr>
+            </thead>
+            <tbody>
+              {[...byWorkflow].sort((a, b) => b.cost - a.cost).map((w, i, arr) =>
+                <tr key={w.taskId} className={i < arr.length - 1 ? "border-b border-neutral-200" : ""}>
                   <td className="px-4 py-3">
-                    <div className="flex items-center gap-2">
-                      <span className="font-semibold text-neutral-900">{w.name}</span>
-                      {w.primary && <CkChip tone="mariner">primary</CkChip>}
-                    </div>
-                    <div className="text-[11px] text-neutral-500 font-mono mt-0.5">{w.id} · gateway: {w.gateway}</div>
+                    <span className="font-semibold text-neutral-900">{w.name}</span>
+                    <div className="text-[11px] text-neutral-500 font-mono mt-0.5">{w.taskId}</div>
                   </td>
-                  <td className="px-4 py-3 text-right font-mono">{w.runs24h.toLocaleString("en-US")}</td>
-                  <td className="px-4 py-3 text-right font-mono text-neutral-700">{(tokens / 1000).toFixed(0)}k</td>
-                  <td className="px-4 py-3 text-right">
-                    <div className="inline-flex items-center gap-2 justify-end">
-                      <div className="w-[100px] h-1.5 bg-app-bg rounded-[1px]">
-                        <div className="h-full bg-burnt-orange rounded-[1px]" style={{ width: Math.min(100, w.costToday / 200 * 100) + "%" }} />
-                      </div>
-                      <span className="font-mono font-semibold w-16 text-right">${w.costToday.toFixed(2)}</span>
-                    </div>
-                  </td>
-                  <td className="px-4 py-3 text-right font-mono text-neutral-700">${perRun.toFixed(3)}</td>
-                  <td className="px-4 py-3 text-right text-neutral-700">
-                    <Spark data={sparkSeries(100 + i, 14, 0.4, 0.8)} w={80} h={20} stroke={trendUp ? "#D14343" : "#5BB04A"} />
-                  </td>
-                </tr>);
-
-            })}
-          </tbody>
-        </table>
-        </div>
+                  <td className="px-4 py-3 text-right font-mono">{w.runs.toLocaleString("en-US")}</td>
+                  <td className="px-4 py-3 text-right font-mono text-neutral-700">{(w.tokens / 1000).toFixed(0)}k</td>
+                  <td className="px-4 py-3 text-right font-mono font-semibold">${w.cost.toFixed(2)}</td>
+                  <td className="px-4 py-3 text-right font-mono text-neutral-700">${w.costPerRun.toFixed(3)}</td>
+                </tr>
+              )}
+            </tbody>
+          </table>
+          </div>
+        ) : (
+          <div className="px-5 py-10 text-center text-neutral-500 text-sm">No workflow breakdown available</div>
+        )}
       </CkCard>
     </div>
   );
diff --git a/apps/dashboard/components/cockpit/screens/evals.tsx b/apps/dashboard/components/cockpit/screens/evals.tsx
index 8bc30fe..dbb8d9b 100644
--- a/apps/dashboard/components/cockpit/screens/evals.tsx
+++ b/apps/dashboard/components/cockpit/screens/evals.tsx
@@ -1,69 +1,65 @@
 "use client";
 
 import { CkCard, CkChip } from "@/components/ui";
-import { Spark } from "@/components/charts";
-import { AIWF_DATA } from "@/lib/data/mock";
-import { jitterSeries } from "@/lib/rng";
+import type { EvalsResponse } from "@shared/contracts";
 
-const D = AIWF_DATA;
+const QUALITY_ACCENT = "#3C43E7"; // left-border colour of the Quality card
 
 /* ───────────────────── ARTHUR EVALS ───────────────────── */
 
-export function EvalsScreen() {
-  const groups = ["safety", "quality", "ops"];
+function Header({ chip }: { chip: React.ReactNode }) { // shared eyebrow + title row; `chip` is rendered in the right-hand slot
   return (
-    <div className="flex flex-col gap-4 px-4 lg:px-6 pt-5 pb-8">
-      <div className="flex items-end justify-between">
-        <div>
-          <div className="font-mono text-[10px] uppercase tracking-[0.06em] text-neutral-500">Arthur engine · continuous evaluation</div>
-          <h2 className="font-display text-2xl font-medium leading-[1.2] text-neutral-900 m-0">Evaluations & guardrails</h2>
-        </div>
-        <div className="flex gap-2">
-          <CkChip tone="success">Live · 12,408 spans · 24h</CkChip>
-          <button className="appearance-none border border-neutral-200 bg-panel px-3.5 py-2 rounded-[3px] font-mono text-[11px] text-neutral-900 uppercase tracking-[0.04em] cursor-pointer">+ New eval</button>
+    <div className="flex items-end justify-between">
+      <div>
+        <div className="font-mono text-[10px] uppercase tracking-[0.06em] text-neutral-500">Arthur engine · continuous evaluation</div>
+        <h2 className="font-display text-2xl font-medium leading-[1.2] text-neutral-900 m-0">Evaluations & guardrails</h2>
+      </div>
+      <div className="flex gap-2">{chip}</div>
+    </div>
+  );
+}
+
+export function EvalsScreen({ data }: { data: EvalsResponse }) { // renders data.reason when data.available is false, otherwise the single Quality score card
+  if (!data.available) {
+    return (
+      <div className="flex flex-col gap-4 px-4 lg:px-6 pt-5 pb-8">
+        <Header chip={<CkChip tone="neutral">No data</CkChip>} />
+        <div className="bg-panel border border-neutral-200 rounded-sm px-5 py-8 font-body text-sm text-neutral-500">
+          {data.reason}
         </div>
       </div>
+    );
+  }
 
-      {groups.map((g) => {
-        const list = D.EVALS.filter((e) => e.axis === g);
-        const titles: Record<string, string> = { safety: "Safety", quality: "Quality", ops: "Operations" };
-        const accents: Record<string, string> = { safety: "#FD6027", quality: "#3C43E7", ops: "#181B20" };
-        return (
-          <CkCard key={g}
-          eyebrow={g}
-          title={titles[g]}
-          action={<span className="font-mono text-[11px] text-neutral-700 uppercase tracking-[0.04em]">{list.length} evaluators</span>}
-          style={{ borderLeft: "3px solid " + accents[g] }}
-          pad={0}>
+  return (
+    <div className="flex flex-col gap-4 px-4 lg:px-6 pt-5 pb-8">
+      <Header
+        chip={
+          <CkChip tone="success">
+            Live · {data.spansGraded.toLocaleString("en-US")} spans · {data.windowHours}h
+          </CkChip>
+        }
+      />
 
-            <div className="grid grid-cols-1 lg:grid-cols-2">
-              {list.map((e, i) =>
-              <div key={e.metric} className={`flex flex-col gap-2.5 px-5 py-4 ${i < list.length - 1 ? "border-b border-neutral-200" : ""} ${i >= list.length - (list.length % 2 === 0 ? 2 : 1) ? "lg:border-b-0" : ""} ${i % 2 === 0 ? "lg:border-r lg:border-neutral-200" : ""}`}>
-                  <div className="flex items-center justify-between">
-                    <span className="font-body text-sm font-medium text-neutral-900">{e.metric}</span>
-                    {e.status === "pass" ? <CkChip tone="success">Pass</CkChip> :
-                  e.status === "warn" ? <CkChip tone="warn">Warn</CkChip> :
-                  <CkChip tone="failed">Fail</CkChip>}
-                  </div>
-                  <div className="flex items-baseline gap-2.5">
-                    <span className="font-display text-[28px] font-semibold leading-none tracking-[-0.02em] text-neutral-900">
-                      {typeof e.value === "number" ? e.value < 1 ? e.value.toFixed(3) : e.value : e.value}
-                    </span>
-                    {e.unit && <span className="font-mono text-[11px] text-neutral-500">{e.unit}</span>}
-                    <span className={`font-mono text-[11px] ml-auto ${e.trend < 0 ? "text-success-fg" : e.trend > 0 ? "text-fail-fg" : "text-neutral-500"}`}>
-                      {e.trend > 0 ? "↗" : e.trend < 0 ? "↘" : "→"} {Math.abs(e.trend).toFixed(3)}
-                    </span>
-                  </div>
-                  <div className="flex items-center gap-2">
-                    <Spark data={jitterSeries(i + 1, 24, (typeof e.value === "number" ? e.value : 0.5), 0.05)} w={140} h={22} stroke={accents[g]} fill={accents[g]} />
-                    <span className="ml-auto font-mono text-[11px] text-neutral-500">target {e.target}</span>
-                  </div>
-                </div>
-              )}
-            </div>
-          </CkCard>);
+      <CkCard
+        eyebrow="quality"
+        title="Quality"
+        action={
+          <span className="font-mono text-[11px] text-neutral-700 uppercase tracking-[0.04em]">
+            {data.score.toFixed(1)}% pass
+          </span>
+        }
+        style={{ borderLeft: "3px solid " + QUALITY_ACCENT }}>
 
-      })}
+        <div className="flex items-baseline gap-2.5">
+          <span className="font-display text-[28px] font-semibold leading-none tracking-[-0.02em] text-neutral-900">
+            {data.score.toFixed(1)}%
+          </span>
+          <span className="font-mono text-[11px] text-neutral-500">
+            {data.spansGraded.toLocaleString("en-US")} spans graded · {data.traceCount.toLocaleString("en-US")} traces · {data.windowHours}h
+          </span>
+        </div>
+      </CkCard>
     </div>);
 
 }
diff --git a/apps/dashboard/components/cockpit/screens/prompts.tsx b/apps/dashboard/components/cockpit/screens/prompts.tsx
index 03a1af6..b236e60 100644
--- a/apps/dashboard/components/cockpit/screens/prompts.tsx
+++ b/apps/dashboard/components/cockpit/screens/prompts.tsx
@@ -1,11 +1,8 @@
 "use client";
 
 import React, { useState, useEffect } from "react";
-import { CkCard, CkKPI, CkChip } from "@/components/ui";
-import { AIWF_DATA } from "@/lib/data/mock";
-import type { Prompt, PromptVersion, PromptTag } from "@/lib/types";
-
-const D = AIWF_DATA;
+import { CkCard, CkKPI } from "@/components/ui";
+import type { PromptsResponse, PromptDef, PromptVersion } from "@shared/contracts";
 
 const PROMPT_STATUS_COLOR: Record<string, { bg: string; fg: string; dot: string }> = {
   production: { bg: "#EAF7E0", fg: "#3F6B1E", dot: "#5BB04A" },
@@ -13,7 +10,8 @@ const PROMPT_STATUS_COLOR: Record<string, { bg: string; fg: string; dot: string
   draft:      { bg: "#FFF4CC", fg: "#7A5A00", dot: "#FFC800" },
   archived:   { bg: "#F2F4F6", fg: "#5F666F", dot: "#9EA3AA" },
   locked:     { bg: "#181B20", fg: "#fff",    dot: "#fff"    },
-  "ab-test":  { bg: "#FFEFE9", fg: "#A2351C", dot: "#FD6027" },
+  arthur:     { bg: "#ECECFD", fg: "#3C43E7", dot: "#3C43E7" },
+  fallback:   { bg: "#F2F4F6", fg: "#5F666F", dot: "#9EA3AA" },
 };
 
 function PromptStatusChip({ status }: { status: string }) {
@@ -29,56 +27,74 @@ function PromptStatusChip({ status }: { status: string }) {
   );
 }
 
+/** First entry of `p.versions` whose tags contain "production", or `undefined` when none does (drives the row's tag chip). */
+function productionVersion(p: PromptDef): PromptVersion | undefined {
+  return p.versions.find(({ tags }) => tags.includes("production"));
+}
+
 /* ───── Prompts list (left rail) ───── */
-function PromptList({ active, onSelect }: { active: string; onSelect: (id: string) => void }) {
+function PromptList({
+  rows,
+  active,
+  onSelect,
+  arthurEnabled,
+}: {
+  rows: PromptDef[];
+  active: string;
+  onSelect: (name: string) => void;
+  arthurEnabled: boolean;
+}) {
   const [filter, setFilter] = useState("all");
-  const list = filter === "all" ? D.PROMPTS : D.PROMPTS.filter(p => p.tags.includes(filter as PromptTag));
+  // Derive the tag filter set from tags that actually occur across all versions.
+  const allTags = Array.from(
+    new Set(rows.flatMap((p) => p.versions.flatMap((v) => v.tags))),
+  );
+  const filters = ["all", ...allTags];
+  const list =
+    filter === "all"
+      ? rows
+      : rows.filter((p) => p.versions.some((v) => v.tags.includes(filter)));
+
   return (
     <CkCard
-      eyebrow={`Arthur · ${D.PROMPTS.length} prompts`}
+      eyebrow={`${arthurEnabled ? "Arthur" : "In-code"} · ${rows.length} prompts`}
       title="Registry"
-      action={
-        <input
-          placeholder="Search…"
-          className="h-6 px-2 border border-neutral-200 rounded-xs font-mono text-[11px] text-neutral-900 outline-none bg-off-white w-full lg:w-[120px]"
-        />
-      }
       pad={0}
       className="lg:h-full"
       style={{ display: "flex", flexDirection: "column" }}
     >
-      <div className="px-3.5 py-2 border-b border-neutral-200 flex gap-1 flex-wrap">
-        {["all","production","staging","draft","locked"].map(t => (
-          <button
-            key={t}
-            onClick={() => setFilter(t)}
-            className={`appearance-none cursor-pointer px-2 py-1 rounded-xs font-mono text-[9px] font-medium tracking-[0.04em] uppercase border ${filter === t ? "border-coal bg-coal text-white" : "border-neutral-200 bg-panel text-neutral-700"}`}
-          >
-            {t}
-          </button>
-        ))}
-      </div>
+      {filters.length > 1 && (
+        <div className="px-3.5 py-2 border-b border-neutral-200 flex gap-1 flex-wrap">
+          {filters.map((t) => (
+            <button
+              key={t}
+              onClick={() => setFilter(t)}
+              className={`appearance-none cursor-pointer px-2 py-1 rounded-xs font-mono text-[9px] font-medium tracking-[0.04em] uppercase border ${filter === t ? "border-coal bg-coal text-white" : "border-neutral-200 bg-panel text-neutral-700"}`}
+            >
+              {t}
+            </button>
+          ))}
+        </div>
+      )}
       <div className="flex-1 overflow-auto">
         {list.map((p, i) => {
-          const on = active === p.id;
+          const on = active === p.name;
+          const prod = productionVersion(p);
           return (
             <button
               type="button"
-              key={p.id}
-              onClick={() => onSelect(p.id)}
+              key={p.name}
+              onClick={() => onSelect(p.name)}
               className={`block w-full appearance-none text-left px-4 py-[14px] cursor-pointer transition-all duration-100 border-l-[3px] focus-visible:outline focus-visible:outline-2 focus-visible:outline-mariner focus-visible:outline-offset-[-2px] ${i < list.length - 1 ? "border-b border-b-neutral-200" : ""} ${on ? "border-l-mariner bg-off-white" : "border-l-transparent bg-panel hover:bg-[#FAFBFC]"}`}
             >
               <div className="flex items-center justify-between gap-2">
                 <span className="font-mono text-[13px] font-semibold text-neutral-900">{p.name}</span>
-                <span className="font-mono text-[10px] text-mariner font-semibold">{p.current}</span>
+                <span className="font-mono text-[10px] text-mariner font-semibold">{p.model}</span>
               </div>
-              <div className="text-[11px] text-neutral-500 mt-[3px]">{p.workflowName}</div>
+              <div className="text-[11px] text-neutral-500 mt-[3px]">{p.phase}</div>
               <div className="flex items-center gap-1.5 mt-1.5">
-                {p.tags.map(t => <PromptStatusChip key={t} status={t} />)}
-                <span className={`ml-auto inline-flex items-center gap-1.5 font-mono text-[10px] ${p.evalDelta > 0 ? "text-[#3F6B1E]" : p.evalDelta < 0 ? "text-[#A2351C]" : "text-neutral-500"}`}>
-                  {(p.evalScore * 100).toFixed(0)}
-                  <span>{p.evalDelta > 0 ? "↗" : p.evalDelta < 0 ? "↘" : "→"}</span>
-                </span>
+                {prod && <PromptStatusChip status="production" />}
+                <PromptStatusChip status={p.source} />
               </div>
             </button>
           );
@@ -88,19 +104,30 @@ function PromptList({ active, onSelect }: { active: string; onSelect: (id: strin
   );
 }
 
+/* ───── Mini stat: uppercase label, large value, optional muted sub-line rendered only when `sub` is truthy (used in prompt header) ───── */
+function Stat({ label, value, sub }: { label: React.ReactNode; value: React.ReactNode; sub?: React.ReactNode }) {
+  return (
+    <div>
+      <div className="font-mono text-[10px] text-neutral-700 tracking-[0.06em] uppercase">{label}</div>
+      <div className="font-display font-medium text-[26px] leading-[1.1] tracking-[-0.02em] text-neutral-900 mt-1">{value}</div>
+      {sub && <div className="font-mono text-[11px] mt-0.5 text-neutral-500">{sub}</div>}
+    </div>
+  );
+}
+
 /* ───── Selected-prompt detail (right pane) ───── */
-function PromptDetail({ promptId }: { promptId: string }) {
-  const p = D.PROMPTS.find((x: Prompt) => x.id === promptId);
-  const versions: PromptVersion[] = D.PROMPT_VERSIONS[promptId] || [];
-  const [selA, setSelA] = useState<string | null>(versions[0]?.v || null);
-  const [selB, setSelB] = useState<string | null>(versions[1]?.v || null);
+function PromptDetail({ prompt }: { prompt: PromptDef | undefined }) {
+  const [selectedVersion, setSelectedVersion] = useState<number | null>(null);
+  const [bodyCache, setBodyCache] = useState<Record<string, string>>({});
+  const [loading, setLoading] = useState(false);
+
+  // Reset the selected historical version whenever the active prompt changes —
+  // default view is always the resolved production body.
   useEffect(() => {
-    setSelA(versions[0]?.v || null);
-    setSelB(versions[1]?.v || null);
-    // eslint-disable-next-line react-hooks/exhaustive-deps
-  }, [promptId]);
+    setSelectedVersion(null);
+  }, [prompt?.name]);
 
-  if (!p) {
+  if (!prompt) {
     return (
       <CkCard style={{ height: "100%" }}>
         <div className="p-10 text-center text-neutral-500 font-body">Select a prompt to inspect.</div>
@@ -108,228 +135,137 @@ function PromptDetail({ promptId }: { promptId: string }) {
     );
   }
 
-  if (!versions.length) {
-    return (
-      <CkCard
-        eyebrow={`Arthur · ${p.workflowName} → ${p.span}`}
-        title={p.name}
-        action={
-          <div className="flex gap-1.5">
-            {p.tags.map(t => <PromptStatusChip key={t} status={t} />)}
-          </div>
-        }
-        style={{ height: "100%" }}
-      >
-        <div className="py-10 text-center text-neutral-500 font-body">
-          Detailed version history not yet captured for this prompt.<br/>
-          <span className="font-mono text-[11px] text-neutral-700">Current: {p.current} · {p.versionCount} versions total</span>
-        </div>
-      </CkCard>
-    );
+  /** Select version `v`. Bodies are cached per prompt name + version, since version numbers can collide across prompts. */
+  async function showVersion(v: PromptVersion) {
+    if (!prompt) return;
+    const key = `${prompt.name}#${v.version}`;
+    const eager = v.body;
+    setSelectedVersion(v.version);
+    if (eager !== undefined) setBodyCache((c) => ({ ...c, [key]: eager }));
+    if (eager !== undefined || bodyCache[key] !== undefined) return;
+    setLoading(true);
+    try {
+      const res = await fetch(`/api/prompts/${encodeURIComponent(prompt.name)}/versions/${v.version}`);
+      // A non-2xx reply carries no version body; route it to the placeholder in `catch`.
+      if (!res.ok) throw new Error(`HTTP ${res.status}`);
+      const json = (await res.json()) as { body: string | null };
+      setBodyCache((c) => ({ ...c, [key]: json.body ?? "(version body unavailable)" }));
+    } catch {
+      setBodyCache((c) => ({ ...c, [key]: "(version body unavailable)" }));
+    } finally {
+      setLoading(false);
+    }
   }
 
+  const shownBody =
+    selectedVersion != null
+      ? bodyCache[`${prompt.name}#${selectedVersion}`] ?? (loading ? "Loading…" : "")
+      : prompt.body;
+  const shownLabel = selectedVersion != null ? `v${selectedVersion}` : "production";
+
   return (
     <div className="flex flex-col gap-3 lg:h-full">
       <CkCard
-        eyebrow={`Arthur · ${p.workflowName} → ${p.span}`}
-        title={p.name}
+        eyebrow={`${prompt.source === "arthur" ? "Arthur" : "In-code"} · ${prompt.phase}`}
+        title={prompt.name}
         action={
           <div className="flex items-center gap-2">
-            {p.tags.map(t => <PromptStatusChip key={t} status={t} />)}
+            <PromptStatusChip status={prompt.source} />
             <span className="w-px h-4 bg-neutral-200" />
             <button className="appearance-none border border-neutral-200 bg-panel px-3 py-1.5 rounded-[3px] font-mono text-[11px] text-neutral-900 uppercase tracking-[0.04em] cursor-pointer">+ New version</button>
             <button className="appearance-none border border-coal bg-coal text-white px-3 py-1.5 rounded-[3px] font-mono text-[11px] uppercase tracking-[0.04em] cursor-pointer">Deploy</button>
           </div>
         }
       >
-        <div className="grid grid-cols-1 lg:grid-cols-4 gap-4">
-          <Stat label="Current version" value={p.current} sub={`by ${p.lastEditedBy} · ${(p.lastEditedAtMin/60).toFixed(0)}h ago`} />
-          <Stat label="Versions"        value={p.versionCount} sub="lifetime" />
-          <Stat label="Eval score"      value={(p.evalScore*100).toFixed(0)} sub={`${p.evalDelta > 0 ? "↗" : "↘"} ${Math.abs(p.evalDelta).toFixed(3)} vs prev`} tone={p.evalDelta > 0 ? "good" : "bad"} />
-          <Stat label="Traffic split"   value={Object.keys(p.trafficSplit).length + "-way"} sub={Object.entries(p.trafficSplit).map(([v,s]) => v + " " + (s*100).toFixed(0) + "%").join(" / ")} />
+        <div className="grid grid-cols-2 lg:grid-cols-4 gap-4">
+          <Stat label="Phase" value={prompt.phase} />
+          <Stat label="Source" value={prompt.source} />
+          <Stat label="Model" value={prompt.model} />
+          <Stat label="Versions" value={prompt.versions.length} sub="in Arthur" />
         </div>
       </CkCard>
 
-      {/* Version timeline */}
-      <CkCard eyebrow="Version timeline" title="History"
+      {/* Version timeline (real Arthur metadata) */}
+      {prompt.versions.length > 0 && (
+        <CkCard
+          eyebrow="Version timeline"
+          title="History"
+          action={
+            <span className="font-mono text-[10px] text-neutral-700 tracking-[0.04em] uppercase">
+              Click to inspect
+            </span>
+          }
+        >
+          <div className="flex flex-col lg:flex-row lg:items-stretch gap-0">
+            {prompt.versions.map((v, i) => {
+              const on = selectedVersion === v.version;
+              const notLast = i < prompt.versions.length - 1;
+              const dropDesktopRight = notLast && !on;
+              return (
+                <button
+                  key={v.version}
+                  onClick={() => showVersion(v)}
+                  className={`lg:flex-1 appearance-none cursor-pointer text-left px-4 py-[14px] relative border ${on ? "border-[#3C43E7]" : "border-[#E6E8EB]"} ${notLast ? "border-b-0" : ""} lg:border-b ${dropDesktopRight ? "lg:border-r-0" : "lg:border-r"} ${on ? "bg-mariner-100" : "bg-panel"}`}
+                >
+                  <div className="flex items-center justify-between mb-1.5">
+                    <span className="font-mono text-sm font-semibold text-neutral-900">v{v.version}</span>
+                    <div className="flex gap-1">
+                      {v.tags.map((t) => <PromptStatusChip key={t} status={t} />)}
+                    </div>
+                  </div>
+                  <div className="font-mono text-[10px] text-neutral-500 mb-2">{v.createdAt}</div>
+                  <div className="grid grid-cols-2 gap-1 font-mono text-[10px]">
+                    <span className="text-neutral-700">model</span><span className="text-neutral-900 text-right">{v.modelName}</span>
+                    <span className="text-neutral-700">messages</span><span className="text-neutral-900 text-right">{v.numMessages}</span>
+                    <span className="text-neutral-700">tools</span><span className="text-neutral-900 text-right">{v.numTools}</span>
+                  </div>
+                </button>
+              );
+            })}
+          </div>
+        </CkCard>
+      )}
+
+      {/* Body panel (single column, read-only) */}
+      <CkCard
+        eyebrow="Prompt body · text"
+        title={shownLabel}
         action={
-          <span className="font-mono text-[10px] text-neutral-700 tracking-[0.04em] uppercase">
-            Click to inspect · ⇧-click to compare
-          </span>
+          selectedVersion != null ? (
+            <button
+              onClick={() => setSelectedVersion(null)}
+              className="appearance-none border border-neutral-200 bg-panel px-3 py-1.5 rounded-[3px] font-mono text-[11px] text-neutral-900 uppercase tracking-[0.04em] cursor-pointer"
+            >
+              Show production
+            </button>
+          ) : undefined
         }
       >
-        <div className="flex flex-col lg:flex-row lg:items-stretch gap-0">
-          {versions.map((v, i) => {
-            const isA = selA === v.v;
-            const isB = selB === v.v;
-            const borderClass = isA ? "border-[#3C43E7]" : isB ? "border-[#FD6027]" : "border-[#E6E8EB]";
-            const notLast = i < versions.length - 1;
-            // Mobile (stacked): drop bottom border on all but the last so stacked
-            // buttons share one horizontal divider.
-            // Desktop (row, lg): drop right border on interior neutral buttons so
-            // side-by-side buttons share one vertical divider (selected keep full border).
-            const dropMobileBottom = notLast;
-            const dropDesktopRight = notLast && !isA && !isB;
-            return (
-              <button
-                key={v.v}
-                onClick={(e) => { if (e.shiftKey) setSelB(v.v); else setSelA(v.v); }}
-                className={`lg:flex-1 appearance-none cursor-pointer text-left px-4 py-[14px] relative border ${borderClass} ${dropMobileBottom ? "border-b-0" : ""} lg:border-b ${dropDesktopRight ? "lg:border-r-0" : "lg:border-r"} ${isA ? "bg-mariner-100" : isB ? "bg-[#FFEFE9]" : "bg-panel"}`}
-              >
-                <div className="flex items-center justify-between mb-1.5">
-                  <span className="font-mono text-sm font-semibold text-neutral-900">{v.v}</span>
-                  <PromptStatusChip status={v.status} />
-                </div>
-                <div className="font-mono text-[10px] text-neutral-500 mb-2">{v.deployedAt} · {v.by}</div>
-                <div className="grid grid-cols-2 gap-1 font-mono text-[10px]">
-                  <span className="text-neutral-700">eval</span><span className="text-neutral-900 font-semibold text-right">{(v.evalScore*100).toFixed(0)}</span>
-                  <span className="text-neutral-700">halluc</span><span className="text-neutral-900 text-right">{v.halluc.toFixed(3)}</span>
-                  <span className="text-neutral-700">p95</span><span className="text-neutral-900 text-right">{v.p95}s</span>
-                  <span className="text-neutral-700">$/run</span><span className="text-neutral-900 text-right">${v.costAvg.toFixed(3)}</span>
-                </div>
-                {v.traffic > 0 && (
-                  <div className="mt-2 h-1 bg-app-bg rounded-[1px]">
-                    <div className={`h-full rounded-[1px] ${v.status === "production" ? "bg-[#5BB04A]" : "bg-mariner"}`} style={{ width: (v.traffic*100) + "%" }} />
-                  </div>
-                )}
-                {(isA || isB) && (
-                  <span className={`absolute -top-2 left-3 px-1.5 py-px rounded-full text-white font-mono text-[9px] tracking-[0.04em] uppercase ${isA ? "bg-mariner" : "bg-burnt-orange"}`}>
-                    {isA ? "A" : "B"}
-                  </span>
-                )}
-              </button>
-            );
-          })}
+        <div className="border border-neutral-200 rounded-xs overflow-hidden max-h-[420px]">
+          <div className="overflow-auto max-h-[420px] font-mono text-[11px] leading-[1.55] bg-panel text-neutral-900 whitespace-pre-wrap break-words p-3">
+            {shownBody}
+          </div>
         </div>
       </CkCard>
-
-      {/* Diff + metrics */}
-      <div className="flex flex-col lg:grid lg:grid-cols-[1.6fr_1fr] gap-3">
-        <PromptDiff a={selA} b={selB} versions={versions} />
-        <PromptMetrics versions={versions} selA={selA} selB={selB} />
-      </div>
-    </div>
-  );
-}
-
-/* ───── Mini stat (used in prompt header) ───── */
-function Stat({ label, value, sub, tone }: { label: React.ReactNode; value: React.ReactNode; sub?: React.ReactNode; tone?: "good" | "bad" }) {
-  return (
-    <div>
-      <div className="font-mono text-[10px] text-neutral-700 tracking-[0.06em] uppercase">{label}</div>
-      <div className="font-display font-medium text-[26px] leading-[1.1] tracking-[-0.02em] text-neutral-900 mt-1">{value}</div>
-      {sub && <div className={`font-mono text-[11px] mt-0.5 ${tone === "good" ? "text-[#3F6B1E]" : tone === "bad" ? "text-[#A2351C]" : "text-neutral-500"}`}>{sub}</div>}
     </div>
   );
 }
 
-/* ───── Diff viewer ───── */
-function PromptDiff({ a, b, versions }: { a: string | null; b: string | null; versions: PromptVersion[] }) {
-  const bodyA = (a && D.PROMPT_BODIES[a]) || `# ${a}\n(prompt body not captured in mock)`;
-  const bodyB = (b && D.PROMPT_BODIES[b]) || `# ${b}\n(prompt body not captured in mock)`;
-  // Naive line-diff: pair lines by index, mark added/removed/equal.
-  const linesA = bodyA.split("\n");
-  const linesB = bodyB.split("\n");
-  const max = Math.max(linesA.length, linesB.length);
-  return (
-    <CkCard
-      eyebrow="Prompt diff · text"
-      title={`${b} → ${a}`}
-      action={
-        <div className="flex gap-2">
-          <CkChip style={{ background: "#FFEFE9", color: "#A2351C" }}>−{linesB.length} from {b}</CkChip>
-          <CkChip style={{ background: "#EAF7E0", color: "#3F6B1E" }}>+{linesA.length} into {a}</CkChip>
-        </div>
-      }
-    >
-      <div className="border border-neutral-200 rounded-xs overflow-hidden max-h-[340px]">
-        <div className="overflow-auto max-h-[340px] font-mono text-[11px] leading-[1.55]">
-          {Array.from({ length: max }).map((_, i) => {
-            const la = linesA[i] ?? "";
-            const lb = linesB[i] ?? "";
-            const same = la === lb;
-            return (
-              <div key={i} className="grid grid-cols-2">
-                <div className={`flex border-r border-neutral-200 ${same ? "bg-panel text-neutral-700" : (lb ? "bg-[#FCE6E2] text-[#80261C]" : "bg-off-white text-neutral-700")}`}>
-                  <span className="flex-[0_0_36px] text-right px-2 py-px text-[#D2D6DA] select-none">{lb ? (i + 1) : ""}</span>
-                  <span className={`flex-[0_0_14px] text-center font-semibold ${same ? "text-[#D2D6DA]" : "text-[#A2351C]"}`}>{same ? " " : (lb ? "−" : " ")}</span>
-                  <span className="flex-1 px-1.5 py-px whitespace-pre-wrap break-words">{lb}</span>
-                </div>
-                <div className={`flex ${same ? "bg-panel text-neutral-900" : (la ? "bg-[#EAF7E0] text-[#1C4A0E]" : "bg-off-white text-neutral-900")}`}>
-                  <span className="flex-[0_0_36px] text-right px-2 py-px text-[#D2D6DA] select-none">{la ? (i + 1) : ""}</span>
-                  <span className={`flex-[0_0_14px] text-center font-semibold ${same ? "text-[#D2D6DA]" : "text-[#3F6B1E]"}`}>{same ? " " : (la ? "+" : " ")}</span>
-                  <span className="flex-1 px-1.5 py-px whitespace-pre-wrap break-words">{la}</span>
-                </div>
-              </div>
-            );
-          })}
-        </div>
-      </div>
-    </CkCard>
-  );
-}
-
-/* ───── Metrics comparison ───── */
-function PromptMetrics({ versions, selA, selB }: { versions: PromptVersion[]; selA: string | null; selB: string | null }) {
-  const a = versions.find(v => v.v === selA);
-  const b = versions.find(v => v.v === selB);
-  const rows: { k: keyof PromptVersion; l: string; fmt: (v: number) => string; better: "higher" | "lower" | null }[] = [
-    { k: "evalScore",  l: "Eval score",       fmt: (v) => (v*100).toFixed(0), better: "higher" },
-    { k: "halluc",     l: "Hallucination",    fmt: (v) => v.toFixed(3),        better: "lower"  },
-    { k: "p95",        l: "p95 latency",      fmt: (v) => v.toFixed(1) + "s",  better: "lower"  },
-    { k: "costAvg",    l: "Cost / run",       fmt: (v) => "$" + v.toFixed(3),  better: "lower"  },
-    { k: "runs",       l: "Runs (lifetime)",  fmt: (v) => v.toLocaleString("en-US"),  better: null     },
-  ];
-  return (
-    <CkCard eyebrow="Side-by-side" title="Metrics">
-      <div className="grid grid-cols-[1fr_auto_auto] gap-y-3 gap-x-4 items-center">
-        <span />
-        <span className="font-mono text-[10px] text-mariner font-semibold tracking-[0.06em] uppercase">A · {selA}</span>
-        <span className="font-mono text-[10px] text-burnt-orange font-semibold tracking-[0.06em] uppercase">B · {selB}</span>
-        {rows.map(r => {
-          const av = a ? (a[r.k] as number) : null;
-          const bv = b ? (b[r.k] as number) : null;
-          let aWins = false, bWins = false;
-          if (av !== null && bv !== null && r.better) {
-            if (r.better === "higher") {
-              aWins = av > bv;
-              bWins = bv > av;
-            }
-            if (r.better === "lower") {
-              aWins = av < bv;
-              bWins = bv < av;
-            }
-          }
-          return (
-            <React.Fragment key={r.k}>
-              <span className="font-body font-medium text-[13px] leading-none text-neutral-800">{r.l}</span>
-              <span className={`font-mono text-sm font-semibold text-right ${aWins ? "text-[#3F6B1E]" : "text-neutral-900"}`}>
-                {av != null ? r.fmt(av) : "—"}
-                {aWins && <span className="ml-1 text-[#3F6B1E] text-[11px]">✓</span>}
-              </span>
-              <span className={`font-mono text-sm font-semibold text-right ${bWins ? "text-[#3F6B1E]" : "text-neutral-500"}`}>
-                {bv != null ? r.fmt(bv) : "—"}
-                {bWins && <span className="ml-1 text-[#3F6B1E] text-[11px]">✓</span>}
-              </span>
-            </React.Fragment>
-          );
-        })}
-      </div>
-      <div className="mt-4 pt-3 border-t border-neutral-200 font-mono text-[10px] text-neutral-700 tracking-[0.04em] uppercase">
-        Sample size: {a?.runs.toLocaleString("en-US")} vs {b?.runs.toLocaleString("en-US")} runs
-      </div>
-    </CkCard>
-  );
-}
-
 /* ───── Top-level screen ───── */
-export function PromptsScreen() {
-  const [active, setActive] = useState(D.PROMPTS[0]?.id ?? "");
+export function PromptsScreen({ data }: { data: PromptsResponse }) {
+  const [active, setActive] = useState(data.rows[0]?.name ?? "");
+  const selected = data.rows.find((p) => p.name === active);
+  const inProd = data.rows.filter((p) =>
+    p.versions.some((v) => v.tags.includes("production")),
+  ).length;
+
   return (
     <div className="px-4 lg:px-6 pt-5 pb-8 flex flex-col gap-4">
       <div className="flex items-end justify-between">
         <div>
-          <div className="font-mono text-[10px] text-neutral-500 tracking-[0.06em] uppercase">Arthur engine · prompt versioning</div>
+          <div className="font-mono text-[10px] text-neutral-500 tracking-[0.06em] uppercase">
+            {data.arthurEnabled ? "Arthur engine · prompt versioning" : "In-code defaults · prompt versioning"}
+          </div>
           <h2 className="font-display font-medium text-2xl leading-[1.2] m-0 text-neutral-900">Prompt registry</h2>
         </div>
         <div className="flex gap-2">
@@ -338,16 +274,18 @@ export function PromptsScreen() {
         </div>
       </div>
 
-      <div className="grid grid-cols-1 lg:grid-cols-4 gap-3">
-        <CkKPI label="Prompts"         value={D.PROMPTS.length.toString()}                                    sub="across 6 workflows" />
-        <CkKPI label="In production"   value={D.PROMPTS.filter(p => p.tags.includes("production")).length.toString()} sub="serving traffic" />
-        <CkKPI label="A/B tests"       value={D.PROMPTS.filter(p => p.tags.includes("ab-test")).length.toString()}    sub="live experiments" />
-        <CkKPI label="Avg eval Δ · 7d" value="+0.4%"                                                          sub="across all prompts" delta="↗ improving" deltaTone="good" />
+      <div className="grid grid-cols-1 lg:grid-cols-2 gap-3">
+        <CkKPI label="Prompts" value={data.total.toString()} sub="workflow phases" />
+        <CkKPI
+          label="In production"
+          value={inProd.toString()}
+          sub={data.arthurEnabled ? "tagged in Arthur" : "in-code defaults"}
+        />
       </div>
 
       <div className="flex flex-col lg:grid lg:grid-cols-[340px_1fr] gap-3 lg:min-h-[720px]">
-        <PromptList active={active} onSelect={setActive} />
-        <PromptDetail promptId={active} />
+        <PromptList rows={data.rows} active={active} onSelect={setActive} arthurEnabled={data.arthurEnabled} />
+        <PromptDetail prompt={selected} />
       </div>
     </div>
   );
diff --git a/apps/dashboard/lib/api/fallbacks.ts b/apps/dashboard/lib/api/fallbacks.ts
index d7c81cb..d0eec95 100644
--- a/apps/dashboard/lib/api/fallbacks.ts
+++ b/apps/dashboard/lib/api/fallbacks.ts
@@ -1,6 +1,9 @@
 import type {
   KpisResponse,
   EvalHealthResponse,
+  EvalsResponse,
+  CostResponse,
+  PromptsResponse,
   RunsResponse,
   RunDetailResponse,
   LiveRunsResponse,
@@ -42,3 +45,23 @@ export function liveRunsFallback(now: string): LiveRunsResponse {
 export function workflowsFallback(now: string): WorkflowsResponse {
   return { generatedAt: now, rows: [], total: 0 };
 }
+
+export function evalsFallback(now: string): EvalsResponse {
+  return { available: false, generatedAt: now, reason: "Worker unavailable." };
+}
+
+export function costFallback(now: string): CostResponse {
+  return {
+    generatedAt: now,
+    available: false,
+    window: { start: now, end: now },
+    totals: { totalTokenCost: 0, totalTokens: 0, traceCount: 0, costPerRun: 0 },
+    byModel: [],
+    byWorkflow: [],
+    daily: [],
+  };
+}
+
+export function promptsFallback(now: string): PromptsResponse {
+  return { generatedAt: now, available: false, arthurEnabled: false, rows: [], total: 0 };
+}
diff --git a/apps/shared/contracts/api.ts b/apps/shared/contracts/api.ts
index 32eecbc..4f3d171 100644
--- a/apps/shared/contracts/api.ts
+++ b/apps/shared/contracts/api.ts
@@ -1,4 +1,4 @@
-import type { Run, RunDetail, RunStep, Workflow } from "./domain.js";
+import type { PromptDef, Run, RunDetail, RunStep, Workflow } from "./domain.js";
 
 export interface ErrorEnvelope {
   error: { code: string; message: string; details?: unknown };
@@ -24,6 +24,90 @@ export type EvalHealthResponse =
     }
   | { available: false; reason: string };
 
+export type EvalsResponse =
+  | {
+      available: true;
+      generatedAt: string;
+      windowHours: number;
+      /** continuous_eval_success_rate × 100, fleet-wide. */
+      score: number;
+      /** Σ eval_count across tasks — "spans graded" in the window. */
+      spansGraded: number;
+      /** Σ trace_count across tasks. */
+      traceCount: number;
+    }
+  | { available: false; generatedAt: string; reason: string };
+
+export interface CostByModelEntry {
+  /** Arthur span model_name. */
+  model: string;
+  /** USD, summed total_token_cost over the window. */
+  cost: number;
+  /** Summed total_token_count over the window. */
+  tokens: number;
+}
+
+export interface CostByWorkflowEntry {
+  /** Arthur task_id (per ticket-run, e.g. "AWT-42" / "AWT-42.1"). */
+  taskId: string;
+  /** Arthur task name (= the ticket-run identifier). */
+  name: string;
+  /** trace_count for the task. */
+  runs: number;
+  /** trace_token_count. */
+  tokens: number;
+  /** trace_token_cost (USD). */
+  cost: number;
+  /** cost / max(1, runs). */
+  costPerRun: number;
+}
+
+export interface CostResponse {
+  generatedAt: string;
+  /**
+   * false when Arthur is unconfigured/unreachable or returns nothing. The
+   * screen renders its empty/N-A state.
+   */
+  available: boolean;
+  /** Window the figures cover (the request's start_time/end_time). ISO. */
+  window: { start: string; end: string };
+  totals: {
+    /** USD, Σ overviews[].trace_token_cost. */
+    totalTokenCost: number;
+    /** Σ overviews[].trace_token_count. */
+    totalTokens: number;
+    /** Σ overviews[].trace_count. */
+    traceCount: number;
+    /** totalTokenCost / max(1, traceCount). */
+    costPerRun: number;
+  };
+  byModel: CostByModelEntry[];
+  /** Per-task (= per ticket-run) breakdown from /traces/overview. */
+  byWorkflow: CostByWorkflowEntry[];
+  /** Per-day spend, oldest→newest, merged across tasks from the timeseries. */
+  daily: { date: string; cost: number; tokens: number }[];
+}
+
+export interface PromptsResponse {
+  generatedAt: string;
+  /** `false` when the worker can't resolve prompts (degrades to empty list). */
+  available: boolean;
+  /**
+   * Whether Arthur is configured (key + endpoint + task id all set). When
+   * false, every prompt's `source` is "fallback" and `versions` is empty.
+   */
+  arthurEnabled: boolean;
+  rows: PromptDef[];
+  total: number;
+}
+
+/** On-demand body for a single historical Arthur version. */
+export interface PromptVersionBodyResponse {
+  generatedAt: string;
+  available: boolean;
+  body: string | null;
+}
+
 export interface LiveRunsResponse {
   generatedAt: string;
   rows: Run[];
diff --git a/apps/shared/contracts/domain.ts b/apps/shared/contracts/domain.ts
index 6aba292..d4868a4 100644
--- a/apps/shared/contracts/domain.ts
+++ b/apps/shared/contracts/domain.ts
@@ -133,3 +133,36 @@ export interface HourPoint {
   p95: number;
   errors: number;
 }
+
+/** One Arthur version of a named prompt (metadata; body fetched on demand). */
+export interface PromptVersion {
+  /** Arthur integer version number. */
+  version: number;
+  /** ISO timestamp the version was created. */
+  createdAt: string;
+  /** Real Arthur tags on this version, e.g. ["production"]. */
+  tags: string[];
+  modelProvider: string; // Arthur `model_provider`
+  modelName: string; // Arthur `model_name`
+  numMessages: number; // Arthur `num_messages`
+  numTools: number; // Arthur `num_tools`
+  /** Body text. Set only on the version tagged "production", and only when that
+   *  body resolved; other versions are fetched on demand via the by-version endpoint. */
+  body?: string;
+}
+
+/** A workflow phase prompt as resolved by the worker at runtime. */
+export interface PromptDef {
+  /** Stable Arthur/fallback key: "research-plan" | "implement" | "review". */
+  name: string;
+  /** Human label for the workflow phase, e.g. "Research & Plan". */
+  phase: string;
+  /** Resolved production prompt body (Arthur production tag, or in-code fallback). */
+  body: string;
+  /** Where the resolved `body` came from; "fallback" does not imply `versions` is empty. */
+  source: "arthur" | "fallback";
+  /** Model the agent runs this prompt with (env-derived). */
+  model: string;
+  /** Real Arthur version history, newest first. Empty when Arthur is disabled or its fetch fails. */
+  versions: PromptVersion[];
+}
diff --git a/apps/worker/src/lib/overview/collect-cost.test.ts b/apps/worker/src/lib/overview/collect-cost.test.ts
new file mode 100644
index 0000000..f0d9736
--- /dev/null
+++ b/apps/worker/src/lib/overview/collect-cost.test.ts
@@ -0,0 +1,186 @@
+import { describe, it, expect, vi } from "vitest";
+import { collectCost, type CostArthurClient } from "./collect-cost.js";
+import type {
+  TraceOverviewListResponse,
+  TraceTimeseriesPoint,
+  ModelTokenCost,
+} from "../../sandbox/arthur-client.js";
+
+const NOW = new Date("2026-06-08T12:00:00.000Z"); // fixed clock so the window assertions are deterministic
+
+function makeClient(opts: {
+  overview: TraceOverviewListResponse;
+  timeseries: Record<string, TraceTimeseriesPoint[]>;
+  byModel: ModelTokenCost[];
+}): CostArthurClient {
+  // Canned Arthur fake: every method is a vi.fn so tests can inspect the calls.
+  const { overview, timeseries, byModel } = opts;
+  const getTracesOverview = vi.fn().mockResolvedValue(overview);
+  const aggregateSpanTokensByModel = vi.fn().mockResolvedValue(byModel);
+  // Task ids without a canned series resolve to an empty one instead of rejecting.
+  const getTracesTimeseries = vi
+    .fn()
+    .mockImplementation(async (taskId: string) => timeseries[taskId] ?? []);
+  return { getTracesOverview, getTracesTimeseries, aggregateSpanTokensByModel };
+}
+
+describe("collectCost", () => {
+  it("aggregates totals, per-task breakdown, by-model, and merged daily series", async () => {
+    const client = makeClient({
+      overview: {
+        count: 2,
+        overviews: [
+          {
+            task_id: "t1",
+            trace_count: 4,
+            trace_token_count: 1000,
+            trace_token_cost: 2.0,
+            eval_count: 0,
+            continuous_eval_success_rate: 1,
+            last_active: "2026-06-08",
+          },
+          {
+            task_id: "t2",
+            trace_count: 6,
+            trace_token_count: 3000,
+            trace_token_cost: 4.0,
+            eval_count: 0,
+            continuous_eval_success_rate: 1,
+          },
+        ],
+      },
+      timeseries: {
+        t1: [
+          { timestamp: "2026-06-06", trace_count: 2, trace_token_count: 500, trace_token_cost: 1.0 },
+          { timestamp: "2026-06-07", trace_count: 2, trace_token_count: 500, trace_token_cost: 1.0 },
+        ],
+        t2: [
+          { timestamp: "2026-06-07", trace_count: 3, trace_token_count: 1500, trace_token_cost: 2.0 },
+          { timestamp: "2026-06-08", trace_count: 3, trace_token_count: 1500, trace_token_cost: 2.0 },
+        ],
+      },
+      byModel: [
+        { model: "claude-opus-4-6", tokens: 3000, cost: 5.0 },
+        { model: "claude-haiku", tokens: 1000, cost: 1.0 },
+      ],
+    });
+
+    const data = await collectCost(client, { now: NOW, bucketSize: "day" });
+
+    // totals: 2.0 + 4.0 USD over 4 + 6 traces -> 0.6 USD per run
+    expect(data.totals).toEqual({
+      totalTokenCost: 6.0,
+      totalTokens: 4000,
+      traceCount: 10,
+      costPerRun: 0.6,
+    });
+
+    // window = calendar MTD: first of NOW's UTC month through NOW
+    expect(data.window.start).toBe("2026-06-01T00:00:00.000Z");
+    expect(data.window.end).toBe(NOW.toISOString());
+
+    // byWorkflow = per-task, with costPerRun guarded
+    expect(data.byWorkflow).toEqual([
+      { taskId: "t1", name: "t1", runs: 4, tokens: 1000, cost: 2.0, costPerRun: 0.5 },
+      { taskId: "t2", name: "t2", runs: 6, tokens: 3000, cost: 4.0, costPerRun: 4 / 6 },
+    ]);
+
+    // byModel passthrough mapped to contract shape
+    expect(data.byModel).toEqual([
+      { model: "claude-opus-4-6", cost: 5.0, tokens: 3000 },
+      { model: "claude-haiku", cost: 1.0, tokens: 1000 },
+    ]);
+
+    // daily merged by timestamp, oldest -> newest (06-07 is in both tasks, so it sums)
+    expect(data.daily).toEqual([
+      { date: "2026-06-06", cost: 1.0, tokens: 500 },
+      { date: "2026-06-07", cost: 3.0, tokens: 2000 },
+      { date: "2026-06-08", cost: 2.0, tokens: 1500 },
+    ]);
+  });
+
+  it("treats null trace_token_cost as 0 and guards divide-by-zero", async () => {
+    const client = makeClient({
+      overview: {
+        count: 1,
+        overviews: [
+          {
+            task_id: "t1",
+            trace_count: 0,
+            trace_token_count: 0,
+            trace_token_cost: null,
+            eval_count: 0,
+            continuous_eval_success_rate: 0,
+          },
+        ],
+      },
+      timeseries: { t1: [] },
+      byModel: [],
+    });
+
+    const data = await collectCost(client, { now: NOW, bucketSize: "day" });
+
+    expect(data.totals).toEqual({
+      totalTokenCost: 0,
+      totalTokens: 0,
+      traceCount: 0,
+      costPerRun: 0,
+    });
+    expect(data.byWorkflow).toEqual([
+      { taskId: "t1", name: "t1", runs: 0, tokens: 0, cost: 0, costPerRun: 0 },
+    ]);
+    expect(data.byModel).toEqual([]);
+    expect(data.daily).toEqual([]);
+  });
+
+  it("returns empty aggregates when Arthur has no tasks", async () => {
+    const client = makeClient({
+      overview: { count: 0, overviews: [] },
+      timeseries: {},
+      byModel: [],
+    });
+
+    const data = await collectCost(client, { now: NOW, bucketSize: "day" });
+
+    expect(data.totals).toEqual({
+      totalTokenCost: 0,
+      totalTokens: 0,
+      traceCount: 0,
+      costPerRun: 0,
+    });
+    expect(data.byWorkflow).toEqual([]);
+    expect(data.byModel).toEqual([]);
+    expect(data.daily).toEqual([]);
+    // No tasks -> no per-task timeseries fan-out.
+    expect(client.getTracesTimeseries).not.toHaveBeenCalled();
+  });
+
+  it("caps the daily timeseries fan-out to the 50 most-active tasks", async () => {
+    // 60 tasks, each with a distinct trace_count so the top-50 are deterministic.
+    const overviews = Array.from({ length: 60 }, (_, i) => ({
+      task_id: `t${i}`,
+      trace_count: i, // t59 most active, t0 least
+      trace_token_count: 0,
+      trace_token_cost: 0,
+      eval_count: 0,
+      continuous_eval_success_rate: 0,
+    }));
+    const client = makeClient({
+      overview: { count: overviews.length, overviews },
+      timeseries: {},
+      byModel: [],
+    });
+
+    await collectCost(client, { now: NOW, bucketSize: "day" });
+
+    // Only the 50 highest-trace_count tasks are queried (t10..t59).
+    expect(client.getTracesTimeseries).toHaveBeenCalledTimes(50);
+    const queried = (client.getTracesTimeseries as ReturnType<typeof vi.fn>).mock.calls.map(
+      (c) => c[0],
+    );
+    expect(queried).not.toContain("t0");
+    expect(queried).not.toContain("t9");
+    expect(queried).toContain("t10");
+    expect(queried).toContain("t59");
+  });
+});
diff --git a/apps/worker/src/lib/overview/collect-cost.ts b/apps/worker/src/lib/overview/collect-cost.ts
new file mode 100644
index 0000000..258cb0e
--- /dev/null
+++ b/apps/worker/src/lib/overview/collect-cost.ts
@@ -0,0 +1,142 @@
+import type { CostResponse } from "@shared/contracts";
+import { logger } from "../logger.js";
+import type {
+  TraceOverviewListResponse,
+  TraceTimeseriesPoint,
+  ModelTokenCost,
+} from "../../sandbox/arthur-client.js";
+
+/**
+ * The slice of `ArthurClient` the cost collector depends on; kept narrow so the
+ * aggregation is testable with a fake (mirrors `RunsLister` for the run-store
+ * collectors). Times are the ISO strings `collectCost` builds via `toISOString`.
+ */
+export interface CostArthurClient {
+  getTracesOverview(
+    taskIds: string[], // collectCost passes [] (assumed to mean org-wide)
+    startTime: string,
+    endTime: string,
+  ): Promise<TraceOverviewListResponse>;
+  getTracesTimeseries(
+    taskId: string, // one call per task; the collector merges the results
+    startTime: string,
+    endTime: string,
+    bucketSize: string,
+  ): Promise<TraceTimeseriesPoint[]>;
+  aggregateSpanTokensByModel(
+    taskIds: string[], // collectCost passes [] here as well
+    startTime: string,
+    endTime: string,
+  ): Promise<ModelTokenCost[]>;
+}
+
+export interface CollectCostOptions {
+  now: Date; // end of the window; the window starts at the first of now's UTC month
+  /** Bucket granularity forwarded to `getTracesTimeseries` (the cost route passes "day"). */
+  bucketSize: string;
+}
+
+/**
+ * Shapes a `CostResponse` (minus `generatedAt`/`available`) from Arthur's
+ * pre-aggregated token/cost data. Cost comes straight from Arthur's
+ * `*_token_cost` fields — no client-side pricing.
+ *
+ * - `totals` + `byWorkflow` come from one `getTracesOverview` call. Arthur tasks
+ *   ARE the workflow grouping (per ticket-run), so each overview row is one
+ *   `byWorkflow` entry.
+ * - `byModel` comes from `aggregateSpanTokensByModel` (the one client-side
+ *   grouping, since Arthur has no per-model overview); it is requested
+ *   concurrently with the overview because neither call needs the other.
+ * - `daily` fans out one `getTracesTimeseries` call per overview task (at most
+ *   the 50 with the highest `trace_count`) and merges points by bucket timestamp.
+ * Rejects with whatever the client rejects with; nothing is caught here.
+ */
+export async function collectCost(
+  client: CostArthurClient,
+  opts: CollectCostOptions,
+): Promise<Omit<CostResponse, "generatedAt" | "available">> {
+  const { now, bucketSize } = opts;
+  // Assumption: calendar month-to-date (matches the original "MTD" framing).
+  // TODO(arthur-verify): confirm the intended window (calendar MTD vs rolling 30d/24h).
+  const start = startOfMonthUTC(now).toISOString();
+  const end = now.toISOString();
+
+  // TODO(arthur-verify): empty `task_ids` is assumed to mean org-wide. If Arthur
+  // requires explicit ids, enumerate the org's tasks and pass them instead.
+  const [{ overviews }, byModelRaw] = await Promise.all([
+    client.getTracesOverview([], start, end),
+    client.aggregateSpanTokensByModel([], start, end),
+  ]);
+
+  let totalTokenCost = 0;
+  let totalTokens = 0;
+  let traceCount = 0;
+  const byWorkflow = overviews.map((o) => {
+    // trace_token_cost is null when Arthur has no cost data — treat as 0.
+    const cost = o.trace_token_cost ?? 0;
+    totalTokenCost += cost;
+    totalTokens += o.trace_token_count;
+    traceCount += o.trace_count;
+    return {
+      taskId: o.task_id,
+      // Arthur task name = the ticket-run identifier; overview omits it, so the
+      // task_id (which IS that identifier) doubles as the display name.
+      // TODO(arthur-verify): task->workflow mapping — rows stay per-task.
+      name: o.task_id,
+      runs: o.trace_count,
+      tokens: o.trace_token_count,
+      cost,
+      costPerRun: o.trace_count > 0 ? cost / o.trace_count : 0,
+    };
+  });
+
+  const totals = {
+    totalTokenCost,
+    totalTokens,
+    traceCount,
+    costPerRun: traceCount > 0 ? totalTokenCost / traceCount : 0,
+  };
+
+  const byModel = byModelRaw.map(({ model, cost, tokens }) => ({ model, cost, tokens }));
+
+  // Fan out one timeseries call per task that has data, then merge by bucket.
+  // Tasks are per-ticket-run, so a busy month can be hundreds — cap the fan-out
+  // to the most-active tasks to avoid an unbounded burst of requests.
+  // TODO(arthur-verify): cap is by trace_count, on the assumption the highest-
+  // traffic tasks dominate the daily-spend curve; revisit if the chart looks short.
+  const DAILY_FANOUT_CAP = 50;
+  const sortedByActivity = [...overviews].sort((a, b) => b.trace_count - a.trace_count);
+  const fanoutTasks = sortedByActivity.slice(0, DAILY_FANOUT_CAP);
+  if (sortedByActivity.length > DAILY_FANOUT_CAP) {
+    logger.info(
+      {
+        total: sortedByActivity.length,
+        capped: DAILY_FANOUT_CAP,
+        dropped: sortedByActivity.slice(DAILY_FANOUT_CAP).map((o) => o.task_id),
+      },
+      "cost_daily_fanout_capped",
+    );
+  }
+  const taskIds = fanoutTasks.map((o) => o.task_id);
+  const series = await Promise.all(
+    taskIds.map((id) => client.getTracesTimeseries(id, start, end, bucketSize)),
+  );
+  const merged = new Map<string, { cost: number; tokens: number }>();
+  for (const points of series) {
+    for (const p of points) {
+      const row = merged.get(p.timestamp) ?? { cost: 0, tokens: 0 };
+      row.cost += p.trace_token_cost ?? 0;
+      row.tokens += p.trace_token_count;
+      merged.set(p.timestamp, row);
+    }
+  }
+  const daily = [...merged.entries()]
+    .map(([date, v]) => ({ date, cost: v.cost, tokens: v.tokens }))
+    .sort((a, b) => (a.date < b.date ? -1 : a.date > b.date ? 1 : 0));
+
+  return { window: { start, end }, totals, byModel, byWorkflow, daily };
+}
+
+function startOfMonthUTC(now: Date): Date {
+  return new Date(Date.UTC(now.getUTCFullYear(), now.getUTCMonth(), 1, 0, 0, 0, 0)); // day 1, 00:00:00.000Z
+}
diff --git a/apps/worker/src/lib/overview/collect-evals.test.ts b/apps/worker/src/lib/overview/collect-evals.test.ts
new file mode 100644
index 0000000..1b40a6f
--- /dev/null
+++ b/apps/worker/src/lib/overview/collect-evals.test.ts
@@ -0,0 +1,95 @@
+import { describe, it, expect, vi } from "vitest";
+import { collectEvals } from "./collect-evals.js";
+import type { TraceOverview } from "../../sandbox/arthur-client.js";
+
+const NOW = new Date("2026-06-08T12:00:00.000Z"); // fixed clock so the window assertions are deterministic
+
+function makeClient(overviews: TraceOverview[]) {
+  return { getTracesOverview: vi.fn(async () => ({ overviews })) }; // always resolves the canned rows
+}
+
+function overview(over: Partial<TraceOverview>): TraceOverview {
+  const zeroed: TraceOverview = {
+    task_id: "t",
+    trace_count: 0,
+    trace_token_count: 0,
+    trace_token_cost: 0,
+    eval_count: 0,
+    continuous_eval_success_rate: 0,
+  };
+  return { ...zeroed, ...over }; // caller-supplied fields win over the zeroed defaults
+}
+
+describe("collectEvals", () => {
+  it("sums spansGraded/traceCount and eval-count-weights the score", async () => {
+    const client = makeClient([
+      overview({ task_id: "a", trace_count: 10, eval_count: 8, continuous_eval_success_rate: 1.0 }),
+      overview({ task_id: "b", trace_count: 4, eval_count: 2, continuous_eval_success_rate: 0.5 }),
+    ]);
+
+    const result = await collectEvals({
+      client,
+      taskIds: [],
+      windowHours: 24,
+      now: NOW,
+    });
+
+    expect(result.spansGraded).toBe(10); // 8 + 2 eval_count
+    expect(result.traceCount).toBe(14); // 10 + 4 trace_count
+    // (1.0*8 + 0.5*2) / 10 * 100 = (8 + 1) / 10 * 100 = 90
+    expect(result.score).toBe(90);
+    expect(result.windowHours).toBe(24);
+  });
+
+  it("yields score 0 when nothing is graded (eval_count sums to 0)", async () => {
+    const client = makeClient([
+      overview({ task_id: "a", trace_count: 5, eval_count: 0 }),
+    ]);
+
+    const result = await collectEvals({
+      client,
+      taskIds: [],
+      windowHours: 24,
+      now: NOW,
+    });
+
+    expect(result.spansGraded).toBe(0);
+    expect(result.traceCount).toBe(5);
+    expect(result.score).toBe(0); // guarded division, not NaN
+  });
+
+  it("computes the window start from windowHours and passes the ISO range to the client", async () => {
+    const client = makeClient([]);
+
+    await collectEvals({
+      client,
+      taskIds: ["x", "y"],
+      windowHours: 24,
+      now: NOW,
+    });
+
+    expect(client.getTracesOverview).toHaveBeenCalledWith(
+      ["x", "y"],
+      "2026-06-07T12:00:00.000Z", // NOW minus 24h
+      "2026-06-08T12:00:00.000Z", // NOW
+    );
+  });
+
+  it("returns zeroed aggregates when no overviews are returned", async () => {
+    const client = makeClient([]);
+
+    const result = await collectEvals({
+      client,
+      taskIds: [],
+      windowHours: 24,
+      now: NOW,
+    });
+
+    expect(result).toEqual({
+      windowHours: 24,
+      score: 0,
+      spansGraded: 0,
+      traceCount: 0,
+    });
+  });
+});
diff --git a/apps/worker/src/lib/overview/collect-evals.ts b/apps/worker/src/lib/overview/collect-evals.ts
new file mode 100644
index 0000000..b144232
--- /dev/null
+++ b/apps/worker/src/lib/overview/collect-evals.ts
@@ -0,0 +1,74 @@
+import type { EvalsResponse } from "@shared/contracts";
+import type { TraceOverview } from "../../sandbox/arthur-client.js";
+
+const HOUR = 3_600_000; // milliseconds in one hour
+
+/** Fleet aggregate fields the route copies onto an `available: true` response. */
+export type EvalsAggregate = Pick<
+  Extract<EvalsResponse, { available: true }>, // the success arm of the response union
+  "windowHours" | "score" | "spansGraded" | "traceCount"
+>;
+
+/**
+ * The slice of `ArthurClient` the eval collector depends on (only `overviews`
+ * is read from the reply). Kept narrow so the aggregation is testable with a
+ * fake (mirrors `CostArthurClient` for the cost collector).
+ */
+export interface EvalsArthurClient {
+  getTracesOverview(
+    taskIds: string[], // forwarded unchanged from CollectEvalsOptions.taskIds
+    startTime: string, // ISO-8601, now minus windowHours
+    endTime: string, // ISO-8601, now
+  ): Promise<{ overviews: TraceOverview[] }>;
+}
+
+export interface CollectEvalsOptions {
+  client: EvalsArthurClient;
+  // TODO(arthur-verify): unconfirmed whether `taskIds: []` means "all org tasks"
+  // on POST /api/v1/traces/overview. If not, the route must enumerate tasks first.
+  taskIds: string[];
+  windowHours: number; // window length; the query covers [now - windowHours, now]
+  now: Date; // end of the window
+}
+
+/**
+ * Rolls Arthur's per-task trace overviews up into fleet-wide eval health for
+ * the `windowHours` ending at `now`.
+ *
+ * `spansGraded` and `traceCount` are plain sums over the overview rows.
+ * `score` is the eval-count-weighted mean of `continuous_eval_success_rate`,
+ * scaled ×100; with nothing graded it is 0, which the route reports as
+ * `available: false`.
+ */
+export async function collectEvals(
+  opts: CollectEvalsOptions,
+): Promise<EvalsAggregate> {
+  const { client, taskIds, windowHours, now } = opts;
+
+  // Query range is [now - windowHours, now], handed to the client as
+  // ISO-8601 strings.
+  const endTime = now.toISOString();
+  const startTime = new Date(now.getTime() - windowHours * HOUR).toISOString();
+
+  const response = await client.getTracesOverview(taskIds, startTime, endTime);
+  const rows = response.overviews;
+
+  const spansGraded = sum(rows, (row) => row.eval_count);
+  const traceCount = sum(rows, (row) => row.trace_count);
+
+  // A weighted mean over zero graded spans is undefined; report 0, not NaN.
+  let score = 0;
+  if (spansGraded !== 0) {
+    const weightedPasses = sum(
+      rows,
+      (row) => row.continuous_eval_success_rate * row.eval_count,
+    );
+    score = (weightedPasses / spansGraded) * 100;
+  }
+
+  return { windowHours, score, spansGraded, traceCount };
+}
+
+function sum<T>(items: T[], pick: (item: T) => number): number {
+  return items.map((item) => pick(item) || 0).reduce((total, value) => total + value, 0); // falsy picks (NaN/null) count as 0
+}
diff --git a/apps/worker/src/lib/overview/collect-prompts.test.ts b/apps/worker/src/lib/overview/collect-prompts.test.ts
new file mode 100644
index 0000000..8d382cb
--- /dev/null
+++ b/apps/worker/src/lib/overview/collect-prompts.test.ts
@@ -0,0 +1,164 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+
+vi.mock("../../../env.js", () => ({ env: {} })); // start from an empty env; setEnv() fills it per test
+
+const mockGetPromptByTag = vi.fn(); // stands in for the client's getPromptByTag
+const mockListPromptVersions = vi.fn(); // stands in for the client's listPromptVersions
+vi.mock("../../sandbox/arthur-client.js", () => ({
+  ArthurClient: {
+    fromTraceEndpoint: vi.fn(() => ({ // every client built in a test shares the two mocks above
+      getPromptByTag: mockGetPromptByTag,
+      listPromptVersions: mockListPromptVersions,
+    })),
+  },
+}));
+
+import { resolvePrompts } from "./collect-prompts.js";
+import { PROMPT_FALLBACKS } from "../prompts.js";
+
+async function setEnv(partial: Record<string, string | undefined>) {
+  type EnvModule = { env: Record<string, string | undefined> };
+  // Relies on the vi.mock'd module object accepting reassignment of `env`.
+  const envModule = (await import("../../../env.js")) as unknown as EnvModule;
+  envModule.env = Object.assign({}, envModule.env, partial);
+}
+
+function arthurVersion(version: number, tags: string[]) { // snake_case row, shaped like a listPromptVersions entry
+  return {
+    version,
+    created_at: `2026-06-${String(version).padStart(2, "0")}T00:00:00.000Z`, // zero-padded day; valid for 1..30
+    deleted_at: null,
+    model_provider: "anthropic",
+    model_name: "claude-opus-4-6",
+    tags,
+    num_messages: 1,
+    num_tools: 0,
+  };
+}
+
+describe("resolvePrompts", () => {
+  beforeEach(async () => {
+    mockGetPromptByTag.mockReset();
+    mockListPromptVersions.mockReset();
+    await setEnv({
+      AGENT_KIND: "claude",
+      CLAUDE_MODEL: "claude-opus-4-6",
+      CODEX_MODEL: "gpt-5-codex",
+      GENAI_ENGINE_API_KEY: undefined, // Arthur off by default; individual tests opt in
+      GENAI_ENGINE_TRACE_ENDPOINT: undefined,
+      GENAI_ENGINE_PROMPT_TASK_ID: undefined,
+    });
+  });
+
+  it("returns fallbacks with empty versions when Arthur is disabled", async () => {
+    const { arthurEnabled, prompts } = await resolvePrompts({ withVersions: true });
+    expect(arthurEnabled).toBe(false);
+    expect(prompts).toHaveLength(3);
+    expect(prompts.map((p) => p.name)).toEqual(["research-plan", "implement", "review"]);
+    for (const p of prompts) {
+      expect(p.source).toBe("fallback");
+      expect(p.versions).toEqual([]);
+      expect(p.model).toBe("claude-opus-4-6");
+    }
+    expect(prompts[0].body).toBe(PROMPT_FALLBACKS["research-plan"]);
+    expect(prompts[0].phase).toBe("Research & Plan");
+    expect(mockGetPromptByTag).not.toHaveBeenCalled();
+  });
+
+  it("returns fallbacks when PROMPT_TASK_ID is missing even if key+endpoint are set", async () => {
+    await setEnv({
+      GENAI_ENGINE_API_KEY: "k",
+      GENAI_ENGINE_TRACE_ENDPOINT: "https://host/api/v1/traces",
+      GENAI_ENGINE_PROMPT_TASK_ID: undefined,
+    });
+    const { arthurEnabled, prompts } = await resolvePrompts({ withVersions: true });
+    expect(arthurEnabled).toBe(false);
+    expect(prompts[0].source).toBe("fallback");
+    expect(mockGetPromptByTag).not.toHaveBeenCalled();
+  });
+
+  it("resolves Arthur bodies + version history when enabled, attaching the production body", async () => {
+    await setEnv({
+      GENAI_ENGINE_API_KEY: "k",
+      GENAI_ENGINE_TRACE_ENDPOINT: "https://host/api/v1/traces",
+      GENAI_ENGINE_PROMPT_TASK_ID: "00000000-0000-0000-0000-000000000000",
+    });
+    mockGetPromptByTag.mockResolvedValue("arthur body");
+    mockListPromptVersions.mockResolvedValue([
+      arthurVersion(2, ["production"]),
+      arthurVersion(1, []),
+    ]);
+
+    const { arthurEnabled, prompts } = await resolvePrompts({ withVersions: true });
+    expect(arthurEnabled).toBe(true);
+    expect(mockGetPromptByTag).toHaveBeenCalledTimes(3); // once per prompt name
+    const research = prompts[0];
+    expect(research.source).toBe("arthur");
+    expect(research.body).toBe("arthur body");
+    expect(research.versions).toHaveLength(2);
+    expect(research.versions[0]).toMatchObject({
+      version: 2,
+      createdAt: "2026-06-02T00:00:00.000Z",
+      tags: ["production"],
+      modelProvider: "anthropic",
+      modelName: "claude-opus-4-6",
+      numMessages: 1,
+      numTools: 0,
+    });
+    // production version carries the eager body; the other does not
+    expect(research.versions[0].body).toBe("arthur body");
+    expect(research.versions[1].body).toBeUndefined();
+  });
+
+  it("falls back per-prompt when the production body is missing but keeps versions", async () => {
+    await setEnv({
+      GENAI_ENGINE_API_KEY: "k",
+      GENAI_ENGINE_TRACE_ENDPOINT: "https://host/api/v1/traces",
+      GENAI_ENGINE_PROMPT_TASK_ID: "00000000-0000-0000-0000-000000000000",
+    });
+    mockGetPromptByTag.mockResolvedValue(null);
+    mockListPromptVersions.mockResolvedValue([arthurVersion(1, [])]);
+
+    const { prompts } = await resolvePrompts({ withVersions: true });
+    expect(prompts[0].source).toBe("fallback");
+    expect(prompts[0].body).toBe(PROMPT_FALLBACKS["research-plan"]);
+    expect(prompts[0].versions).toHaveLength(1); // history survives a missing production tag
+  });
+
+  it("degrades a prompt to fallback with empty versions when the body fetch throws", async () => {
+    await setEnv({
+      GENAI_ENGINE_API_KEY: "k",
+      GENAI_ENGINE_TRACE_ENDPOINT: "https://host/api/v1/traces",
+      GENAI_ENGINE_PROMPT_TASK_ID: "00000000-0000-0000-0000-000000000000",
+    });
+    mockGetPromptByTag.mockRejectedValue(new Error("boom"));
+    mockListPromptVersions.mockResolvedValue([]);
+
+    const { prompts } = await resolvePrompts({ withVersions: true });
+    expect(prompts[0].source).toBe("fallback");
+    expect(prompts[0].body).toBe(PROMPT_FALLBACKS["research-plan"]);
+    expect(prompts[0].versions).toEqual([]);
+  });
+
+  it("skips the version fan-out and resolves empty versions when withVersions is false", async () => {
+    await setEnv({
+      GENAI_ENGINE_API_KEY: "k",
+      GENAI_ENGINE_TRACE_ENDPOINT: "https://host/api/v1/traces",
+      GENAI_ENGINE_PROMPT_TASK_ID: "00000000-0000-0000-0000-000000000000",
+    });
+    mockGetPromptByTag.mockResolvedValue("arthur body");
+
+    const { prompts } = await resolvePrompts({ withVersions: false });
+    expect(mockGetPromptByTag).toHaveBeenCalledTimes(3);
+    expect(mockListPromptVersions).not.toHaveBeenCalled();
+    expect(prompts[0].source).toBe("arthur");
+    expect(prompts[0].body).toBe("arthur body");
+    expect(prompts[0].versions).toEqual([]);
+  });
+
+  it("uses the codex model when AGENT_KIND=codex", async () => {
+    await setEnv({ AGENT_KIND: "codex" });
+    const { prompts } = await resolvePrompts({ withVersions: true });
+    expect(prompts[0].model).toBe("gpt-5-codex");
+  });
+});
diff --git a/apps/worker/src/lib/overview/collect-prompts.ts b/apps/worker/src/lib/overview/collect-prompts.ts
new file mode 100644
index 0000000..b36ee40
--- /dev/null
+++ b/apps/worker/src/lib/overview/collect-prompts.ts
@@ -0,0 +1,113 @@
+import type { PromptVersion } from "@shared/contracts";
+import { env } from "../../../env.js";
+import { logger } from "../logger.js";
+import { PROMPT_FALLBACKS, PROMPT_NAMES, type PromptName } from "../prompts.js";
+
+const PHASE_LABEL: Record<PromptName, string> = { // display label for each prompt key
+  "research-plan": "Research & Plan",
+  "implement": "Implement",
+  "review": "Review",
+};
+
+export interface ResolvedPrompt {
+  name: PromptName; // stable prompt key, also the PROMPT_FALLBACKS lookup key
+  phase: string; // human label taken from PHASE_LABEL
+  body: string; // Arthur production body, or the in-code fallback
+  source: "arthur" | "fallback"; // where `body` came from
+  model: string; // CODEX_MODEL when AGENT_KIND is "codex", otherwise CLAUDE_MODEL
+  versions: PromptVersion[]; // [] unless version history was requested and fetched
+}
+
+export interface ResolvePromptsResult {
+  arthurEnabled: boolean; // true only when API key, trace endpoint and prompt task id are all set
+  prompts: ResolvedPrompt[]; // one per PROMPT_NAMES entry, in that order
+}
+
+/**
+ * Resolve each workflow phase prompt to its production body + (optionally) real
+ * Arthur version history. Shared by the durable `loadPrompts()` step and the
+ * `GET /api/v1/prompts` route so the two never drift.
+ *
+ * Version history is a dashboard-only concern, so `withVersions: false` skips
+ * the per-prompt `listPromptVersions` fan-out and `versions` resolves to `[]`.
+ * A failed listing is logged and degrades to `[]` without losing the body.
+ *
+ * When Arthur is unconfigured (`GENAI_ENGINE_*`, incl. `GENAI_ENGINE_PROMPT_TASK_ID`,
+ * unset) every prompt resolves to its in-code `PROMPT_FALLBACKS` string with
+ * `source: "fallback"` and an empty version history.
+ */
+export async function resolvePrompts(opts: { withVersions: boolean }): Promise<ResolvePromptsResult> {
+  const { withVersions } = opts;
+  const model = env.AGENT_KIND === "codex" ? env.CODEX_MODEL : env.CLAUDE_MODEL;
+  const apiKey = env.GENAI_ENGINE_API_KEY;
+  const endpoint = env.GENAI_ENGINE_TRACE_ENDPOINT;
+  const taskId = env.GENAI_ENGINE_PROMPT_TASK_ID;
+  const messageOf = (err: unknown): string => (err instanceof Error ? err.message : String(err));
+
+  const base = (
+    name: PromptName,
+    body: string,
+    source: "arthur" | "fallback",
+    versions: PromptVersion[] = [],
+  ): ResolvedPrompt => ({ name, phase: PHASE_LABEL[name], body, source, model, versions });
+
+  // Guarding on the locals narrows all three to `string` for the code below.
+  if (!apiKey || !endpoint || !taskId) {
+    logger.info({ source: "fallback", reason: "arthur_prompts_disabled" }, "prompts_resolved");
+    return {
+      arthurEnabled: false,
+      prompts: PROMPT_NAMES.map((n) => base(n, PROMPT_FALLBACKS[n], "fallback")),
+    };
+  }
+
+  const { ArthurClient } = await import("../../sandbox/arthur-client.js");
+  const client = ArthurClient.fromTraceEndpoint(endpoint, apiKey);
+  const TAG = "production";
+
+  const one = async (name: PromptName): Promise<ResolvedPrompt> => {
+    try {
+      // TODO(arthur-verify): version-list pagination depth — first page only.
+      let body: string | null;
+      let versions: PromptVersion[] = [];
+      if (withVersions) {
+        const [rawBody, rawVersions] = await Promise.all([
+          client.getPromptByTag(taskId, name, TAG),
+          client.listPromptVersions(taskId, name).catch((err: unknown) => {
+            logger.warn({ name, err: messageOf(err) }, "prompt_versions_list_failed");
+            return [];
+          }),
+        ]);
+        body = rawBody;
+        versions = rawVersions.map((v) => ({
+          version: v.version,
+          createdAt: v.created_at,
+          tags: v.tags,
+          modelProvider: v.model_provider,
+          modelName: v.model_name,
+          numMessages: v.num_messages,
+          numTools: v.num_tools,
+        }));
+        versions.sort((a, b) => b.version - a.version); // contract: newest first (higher = newer, assumed)
+        // Attach the eager production body to its matching version entry; other
+        // version bodies are fetched on demand via the by-version route.
+        const prodVersion = versions.find((v) => v.tags.includes(TAG));
+        if (prodVersion && body !== null) prodVersion.body = body;
+      } else {
+        body = await client.getPromptByTag(taskId, name, TAG);
+      }
+
+      if (body === null) {
+        logger.info({ name, source: "fallback", reason: "arthur_prompt_missing" }, "prompts_resolved");
+        return base(name, PROMPT_FALLBACKS[name], "fallback", versions);
+      }
+      logger.info({ name, source: "arthur", versions: versions.length }, "prompts_resolved");
+      return base(name, body, "arthur", versions);
+    } catch (err) {
+      logger.warn({ name, source: "fallback", err: messageOf(err) }, "prompts_resolved");
+      return base(name, PROMPT_FALLBACKS[name], "fallback");
+    }
+  };
+
+  const prompts = await Promise.all(PROMPT_NAMES.map(one));
+  return { arthurEnabled: true, prompts };
+}
diff --git a/apps/worker/src/routes/api/v1/cost.get.ts b/apps/worker/src/routes/api/v1/cost.get.ts
new file mode 100644
index 0000000..6c51680
--- /dev/null
+++ b/apps/worker/src/routes/api/v1/cost.get.ts
@@ -0,0 +1,43 @@
+import { defineEventHandler, setResponseHeader } from "h3";
+import type { CostResponse } from "@shared/contracts";
+import { env } from "../../../../env.js";
+import { ArthurClient } from "../../../sandbox/arthur-client.js";
+import { collectCost } from "../../../lib/overview/collect-cost.js";
+import { logger } from "../../../lib/logger.js";
+
+const EMPTY: Omit<CostResponse, "generatedAt" | "available"> = { // zeroed payload for the unavailable state
+  window: { start: "", end: "" }, // placeholder; the handler always overrides it with generatedAt
+  totals: { totalTokenCost: 0, totalTokens: 0, traceCount: 0, costPerRun: 0 },
+  byModel: [],
+  byWorkflow: [],
+  daily: [],
+};
+
+// GET cost overview. Every failure mode degrades to `available: false` with
+// zeroed figures (see EMPTY) rather than surfacing an error to the client.
+export default defineEventHandler(async (event): Promise<CostResponse> => {
+  setResponseHeader(event, "Cache-Control", "private, max-age=15, stale-while-revalidate=60");
+
+  // One clock read so `generatedAt` and the collection window's end agree.
+  const now = new Date();
+  const generatedAt = now.toISOString();
+  const unavailable = (): CostResponse => {
+    return { generatedAt, available: false, ...EMPTY, window: { start: generatedAt, end: generatedAt } };
+  };
+
+  // Arthur unconfigured — degrade to the documented empty state (no crash).
+  const { GENAI_ENGINE_API_KEY: apiKey, GENAI_ENGINE_TRACE_ENDPOINT: endpoint } = env;
+  if (!apiKey || !endpoint) return unavailable();
+
+  try {
+    const client = ArthurClient.fromTraceEndpoint(endpoint, apiKey);
+    // TODO(arthur-verify): bucket_size value ("day") is unconfirmed against a live instance.
+    const data = await collectCost(client, { now, bucketSize: "day" });
+    return { generatedAt, available: true, ...data };
+  } catch (err) {
+    // Arthur unreachable / 401 / unexpected shape — degrade like runs.get.ts.
+    const message = err instanceof Error ? err.message : String(err);
+    logger.warn({ err: message }, "cost_collect_failed");
+    return unavailable();
+  }
+});
diff --git a/apps/worker/src/routes/api/v1/evals.get.ts b/apps/worker/src/routes/api/v1/evals.get.ts
new file mode 100644
index 0000000..54300d5
--- /dev/null
+++ b/apps/worker/src/routes/api/v1/evals.get.ts
@@ -0,0 +1,68 @@
+import { defineEventHandler, setResponseHeader } from "h3";
+import type { EvalsResponse } from "@shared/contracts";
+import { env } from "../../../../env.js";
+import { ArthurClient } from "../../../sandbox/arthur-client.js";
+import { collectEvals } from "../../../lib/overview/collect-evals.js";
+import { logger } from "../../../lib/logger.js";
+
+const WINDOW_HOURS = 24;
+
+/**
+ * GET /api/v1/evals — eval health for the trailing WINDOW_HOURS window.
+ * Unconfigured Arthur, zero graded spans, and errors thrown while collecting
+ * each resolve to `available: false` with a `reason` instead of an error status.
+ */
+export default defineEventHandler(async (event): Promise<EvalsResponse> => {
+  setResponseHeader(
+    event,
+    "Cache-Control",
+    "private, max-age=15, stale-while-revalidate=60",
+  );
+
+  // Read the clock once so `generatedAt` and the collector's `now` cannot drift.
+  const now = new Date();
+  const generatedAt = now.toISOString();
+  const unavailable = (reason: string): EvalsResponse => ({
+    available: false,
+    generatedAt,
+    reason,
+  });
+
+  if (!env.GENAI_ENGINE_API_KEY || !env.GENAI_ENGINE_TRACE_ENDPOINT) {
+    return unavailable("Arthur GenAI Engine not configured.");
+  }
+
+  try {
+    const client = ArthurClient.fromTraceEndpoint(
+      env.GENAI_ENGINE_TRACE_ENDPOINT,
+      env.GENAI_ENGINE_API_KEY,
+    );
+    // TODO(arthur-verify): pass [] if empty task_ids === all org tasks on
+    // POST /api/v1/traces/overview; otherwise enumerate via /api/v2/tasks/search.
+    const taskIds: string[] = [];
+
+    const { windowHours, score, spansGraded, traceCount } =
+      await collectEvals({
+        client,
+        taskIds,
+        windowHours: WINDOW_HOURS,
+        now,
+      });
+
+    if (spansGraded === 0) {
+      // Built from WINDOW_HOURS so the text cannot drift from the queried window.
+      return unavailable(`No graded evals in the last ${WINDOW_HOURS}h.`);
+    }
+
+    return { available: true, generatedAt, windowHours, score, spansGraded, traceCount };
+  } catch (err) {
+    // `err` is `unknown`: narrow instead of casting so non-Error throws still log text.
+    const message = err instanceof Error ? err.message : String(err);
+    logger.warn({ err: message }, "evals_list_failed");
+    // NOTE(review): this reason is returned for ANY error thrown above (e.g. a
+    // failed Arthur request), where "not wired up" reads as misleading. The
+    // wording is kept byte-for-byte because consumers or tests may depend on
+    // it — confirm, then replace it with an "unreachable"-style message.
+    return unavailable("Eval grading not wired up yet.");
+  }
+});
diff --git a/apps/worker/src/routes/api/v1/prompts.get.ts b/apps/worker/src/routes/api/v1/prompts.get.ts
new file mode 100644
index 0000000..d0686d4
--- /dev/null
+++ b/apps/worker/src/routes/api/v1/prompts.get.ts
@@ -0,0 +1,29 @@
+import { defineEventHandler, setResponseHeader } from "h3";
+import type { PromptsResponse } from "@shared/contracts";
+import { resolvePrompts } from "../../../lib/overview/collect-prompts.js";
+import { logger } from "../../../lib/logger.js";
+
+/**
+ * GET /api/v1/prompts — rows from `resolvePrompts({ withVersions: true })`;
+ * a resolver failure degrades to an empty, `available: false` list.
+ */
+export default defineEventHandler(async (event): Promise<PromptsResponse> => {
+  setResponseHeader(
+    event,
+    "Cache-Control",
+    "private, max-age=15, stale-while-revalidate=60",
+  );
+
+  const generatedAt = new Date().toISOString();
+  try {
+    const { arthurEnabled, prompts: rows } = await resolvePrompts({ withVersions: true });
+    return { generatedAt, available: true, arthurEnabled, rows, total: rows.length };
+  } catch (err) {
+    // Arthur unreachable / unexpected failure — degrade to the documented empty
+    // state so the dashboard renders its N/A view instead of a 500.
+    // `err` is `unknown`: narrow instead of casting so non-Error throws still log text.
+    const message = err instanceof Error ? err.message : String(err);
+    logger.warn({ err: message }, "prompts_resolve_failed");
+    return { generatedAt, available: false, arthurEnabled: false, rows: [], total: 0 };
+  }
+});
diff --git a/apps/worker/src/routes/api/v1/prompts/[name]/versions/[version].get.ts b/apps/worker/src/routes/api/v1/prompts/[name]/versions/[version].get.ts
new file mode 100644
index 0000000..c30fffd
--- /dev/null
+++ b/apps/worker/src/routes/api/v1/prompts/[name]/versions/[version].get.ts
@@ -0,0 +1,44 @@
+import { defineEventHandler, getRouterParam, setResponseHeader } from "h3";
+import type { PromptVersionBodyResponse } from "@shared/contracts";
+import { env } from "../../../../../../../env.js";
+import { PROMPT_NAMES, type PromptName } from "../../../../../../lib/prompts.js";
+import { logger } from "../../../../../../lib/logger.js";
+
+// TODO(arthur-verify): lazy-vs-eager body — historical bodies are fetched on
+// demand here; the production body ships eagerly on the list route.
+/** GET /api/v1/prompts/:name/versions/:version — one prompt version's body, or `available: false` with a null body. */
+export default defineEventHandler(async (event): Promise<PromptVersionBodyResponse> => {
+  setResponseHeader(
+    event,
+    "Cache-Control",
+    "private, max-age=15, stale-while-revalidate=60",
+  );
+  const generatedAt = new Date().toISOString();
+  const unavailable: PromptVersionBodyResponse = { generatedAt, available: false, body: null };
+
+  // NOTE(review): params are forwarded as h3 returns them and the client then
+  // applies encodeURIComponent — confirm an already-encoded `version` is not double-encoded.
+  const name = getRouterParam(event, "name") ?? "";
+  const version = getRouterParam(event, "version") ?? "";
+  // Copy the env values into locals so the guard below narrows them to
+  // `string`, removing the need for non-null assertions at the call sites.
+  const apiKey = env.GENAI_ENGINE_API_KEY;
+  const traceEndpoint = env.GENAI_ENGINE_TRACE_ENDPOINT;
+  const promptTaskId = env.GENAI_ENGINE_PROMPT_TASK_ID;
+
+  // Bail out before any network call: Arthur not fully configured, a prompt
+  // name outside PROMPT_NAMES, or an empty version segment.
+  if (!apiKey || !traceEndpoint || !promptTaskId) return unavailable;
+  if (!PROMPT_NAMES.includes(name as PromptName) || !version) return unavailable;
+
+  try {
+    const { ArthurClient } = await import("../../../../../../sandbox/arthur-client.js");
+    const client = ArthurClient.fromTraceEndpoint(traceEndpoint, apiKey);
+    const body = await client.getPromptVersionBody(promptTaskId, name, version);
+    return { generatedAt, available: body !== null, body };
+  } catch (err) {
+    const message = err instanceof Error ? err.message : String(err);
+    logger.warn({ name, version, err: message }, "prompt_version_body_failed");
+    return unavailable;
+  }
+});
diff --git a/apps/worker/src/sandbox/arthur-client.test.ts b/apps/worker/src/sandbox/arthur-client.test.ts
index a5e4a80..bccd57b 100644
--- a/apps/worker/src/sandbox/arthur-client.test.ts
+++ b/apps/worker/src/sandbox/arthur-client.test.ts
@@ -239,4 +239,197 @@ describe("ArthurClient", () => {
       await expect(client.getPromptByTag("t", "x", "production")).rejects.toThrow(/500/);
     });
   });
+
+  describe("getTracesOverview", () => {
+    it("POSTs task_ids/start/end and returns the parsed list response", async () => {
+      mockFetch.mockResolvedValueOnce(jsonResponse({
+        count: 1,
+        overviews: [
+          {
+            task_id: "AWT-42",
+            trace_count: 3,
+            trace_token_count: 1200,
+            trace_token_cost: 0.42,
+            eval_count: 6,
+            continuous_eval_success_rate: 0.9,
+            last_active: "2026-06-08T00:00:00Z",
+          },
+        ],
+      }));
+      const client = new ArthurClient("http://host", "secret");
+      const res = await client.getTracesOverview(["AWT-42"], "2026-06-01T00:00:00Z", "2026-06-08T00:00:00Z");
+
+      expect(res.count).toBe(1);
+      expect(res.overviews[0].task_id).toBe("AWT-42");
+      const [url, init] = mockFetch.mock.calls[0];
+      expect(url).toBe("http://host/api/v1/traces/overview");
+      expect(init.method).toBe("POST");
+      expect(init.headers.Authorization).toBe("Bearer secret");
+      expect(JSON.parse(init.body)).toEqual({
+        task_ids: ["AWT-42"],
+        start_time: "2026-06-01T00:00:00Z",
+        end_time: "2026-06-08T00:00:00Z",
+      });
+    });
+  });
+
+  describe("getTracesTimeseries", () => {
+    it("POSTs single task_id + bucket_size and unwraps the { points } envelope", async () => {
+      mockFetch.mockResolvedValueOnce(jsonResponse({
+        points: [
+          { timestamp: "2026-06-07T00:00:00Z", trace_count: 1, trace_token_count: 400, trace_token_cost: 0.1 },
+        ],
+      }));
+      const client = new ArthurClient("http://host", "k");
+      const points = await client.getTracesTimeseries("AWT-42", "s", "e", "day");
+
+      expect(points).toHaveLength(1);
+      expect(points[0].trace_token_cost).toBe(0.1);
+      const [url, init] = mockFetch.mock.calls[0];
+      expect(url).toBe("http://host/api/v1/traces/overview/timeseries");
+      expect(init.method).toBe("POST");
+      expect(JSON.parse(init.body)).toEqual({
+        task_id: "AWT-42",
+        start_time: "s",
+        end_time: "e",
+        bucket_size: "day",
+      });
+    });
+
+    it("accepts a bare array response", async () => {
+      mockFetch.mockResolvedValueOnce(jsonResponse([
+        { timestamp: "t", trace_count: 2, trace_token_count: 10, trace_token_cost: null },
+      ]));
+      const client = new ArthurClient("http://host", "k");
+      const points = await client.getTracesTimeseries("AWT-42", "s", "e", "day");
+      expect(points).toHaveLength(1);
+    });
+  });
+
+  describe("aggregateSpanTokensByModel", () => {
+    it("sums tokens/cost grouped by model_name and skips null models", async () => {
+      mockFetch.mockResolvedValueOnce(jsonResponse({
+        spans: [
+          { model_name: "claude-opus-4-6", total_token_count: 100, total_token_cost: 0.5 },
+          { model_name: "claude-opus-4-6", total_token_count: 50, total_token_cost: 0.25 },
+          { model_name: "gpt-5", total_token_count: 200, total_token_cost: 1.0 },
+          { model_name: null, total_token_count: 999, total_token_cost: 9.0 },
+        ],
+      }));
+      const client = new ArthurClient("http://host", "k");
+      const rows = await client.aggregateSpanTokensByModel(["AWT-42"], "s", "e");
+
+      expect(rows).toEqual([
+        { model: "claude-opus-4-6", tokens: 150, cost: 0.75 },
+        { model: "gpt-5", tokens: 200, cost: 1.0 },
+      ]);
+      const [url, init] = mockFetch.mock.calls[0];
+      expect(url).toBe("http://host/api/v1/traces/spans");
+      expect(JSON.parse(init.body)).toEqual({
+        task_ids: ["AWT-42"],
+        start_time: "s",
+        end_time: "e",
+        limit: 1000,
+      });
+    });
+
+    it("treats null token/cost as 0", async () => {
+      mockFetch.mockResolvedValueOnce(jsonResponse([
+        { model_name: "m", total_token_count: null, total_token_cost: null },
+      ]));
+      const client = new ArthurClient("http://host", "k");
+      const rows = await client.aggregateSpanTokensByModel([], "s", "e");
+      expect(rows).toEqual([{ model: "m", tokens: 0, cost: 0 }]);
+    });
+  });
+
+  describe("listPromptVersions", () => {
+    it("GETs the versions endpoint and sorts newest-first", async () => {
+      mockFetch.mockResolvedValueOnce(jsonResponse({
+        count: 2,
+        versions: [
+          {
+            version: 1,
+            created_at: "2026-06-01T00:00:00Z",
+            deleted_at: null,
+            model_provider: "anthropic",
+            model_name: "claude-opus-4-6",
+            tags: [],
+            num_messages: 1,
+            num_tools: 0,
+          },
+          {
+            version: 2,
+            created_at: "2026-06-02T00:00:00Z",
+            deleted_at: null,
+            model_provider: "anthropic",
+            model_name: "claude-opus-4-6",
+            tags: ["production"],
+            num_messages: 1,
+            num_tools: 0,
+          },
+        ],
+      }));
+      const client = new ArthurClient("http://host", "k");
+      const versions = await client.listPromptVersions("task-uuid", "research-plan");
+
+      expect(versions.map((v) => v.version)).toEqual([2, 1]);
+      const [url, init] = mockFetch.mock.calls[0];
+      expect(url).toBe("http://host/api/v1/tasks/task-uuid/prompts/research-plan/versions");
+      expect(init.method).toBe("GET");
+      expect(init.headers.Authorization).toBe("Bearer k");
+    });
+
+    it("returns [] on 404", async () => {
+      mockFetch.mockResolvedValueOnce(new Response("not found", { status: 404 }));
+      const client = new ArthurClient("http://host", "k");
+      expect(await client.listPromptVersions("t", "research-plan")).toEqual([]);
+    });
+
+    it("throws on 5xx", async () => {
+      mockFetch.mockResolvedValueOnce(new Response("boom", { status: 500 }));
+      const client = new ArthurClient("http://host", "k");
+      await expect(client.listPromptVersions("t", "x")).rejects.toThrow(/500/);
+    });
+  });
+
+  describe("getPromptVersionBody", () => {
+    it("GETs the by-version endpoint and returns messages[0].content", async () => {
+      mockFetch.mockResolvedValueOnce(jsonResponse({
+        name: "research-plan",
+        version: 3,
+        messages: [{ role: "user", content: "v3 body" }],
+      }));
+      const client = new ArthurClient("http://host", "k");
+      const body = await client.getPromptVersionBody("task-uuid", "research-plan", 3);
+      expect(body).toBe("v3 body");
+      const [url, init] = mockFetch.mock.calls[0];
+      expect(url).toBe("http://host/api/v1/tasks/task-uuid/prompts/research-plan/versions/3");
+      expect(init.method).toBe("GET");
+    });
+
+    it("accepts a string version specifier (latest/tag/datetime)", async () => {
+      mockFetch.mockResolvedValueOnce(jsonResponse({
+        name: "implement",
+        messages: [{ role: "user", content: "latest body" }],
+      }));
+      const client = new ArthurClient("http://host", "k");
+      const body = await client.getPromptVersionBody("t", "implement", "latest");
+      expect(body).toBe("latest body");
+      const [url] = mockFetch.mock.calls[0];
+      expect(url).toBe("http://host/api/v1/tasks/t/prompts/implement/versions/latest");
+    });
+
+    it("returns null on 404", async () => {
+      mockFetch.mockResolvedValueOnce(new Response("not found", { status: 404 }));
+      const client = new ArthurClient("http://host", "k");
+      expect(await client.getPromptVersionBody("t", "x", 1)).toBeNull();
+    });
+
+    it("throws on 5xx", async () => {
+      mockFetch.mockResolvedValueOnce(new Response("boom", { status: 500 }));
+      const client = new ArthurClient("http://host", "k");
+      await expect(client.getPromptVersionBody("t", "x", 1)).rejects.toThrow(/500/);
+    });
+  });
 });
diff --git a/apps/worker/src/sandbox/arthur-client.ts b/apps/worker/src/sandbox/arthur-client.ts
index 8afc77c..66d8b8f 100644
--- a/apps/worker/src/sandbox/arthur-client.ts
+++ b/apps/worker/src/sandbox/arthur-client.ts
@@ -23,6 +23,68 @@ interface SearchResponse {
   tasks: ArthurTask[];
 }
 
+/**
+ * Per-task aggregate over a window from `POST /api/v1/traces/overview`.
+ * Token/cost fields come from Arthur's `TokenCountCostSchema`; `trace_token_cost`
+ * may be null when cost is unavailable. Typed per the documented shape — these
+ * read endpoints are UNVERIFIED against a live instance, so parsing stays
+ * defensive (callers treat null cost as 0).
+ */
+export interface TraceOverview {
+  task_id: string;
+  trace_count: number;
+  trace_token_count: number;
+  trace_token_cost: number | null;
+  eval_count: number;
+  continuous_eval_success_rate: number;
+  last_active?: string;
+}
+
+export interface TraceOverviewListResponse {
+  count: number;
+  overviews: TraceOverview[];
+}
+
+/** One bucket from `POST /api/v1/traces/overview/timeseries` (single task). */
+export interface TraceTimeseriesPoint {
+  timestamp: string;
+  trace_count: number;
+  trace_token_count: number;
+  trace_token_cost: number | null;
+  continuous_eval_success_rate?: number;
+}
+
+/** Token/cost-by-model aggregation result (one row per Arthur `model_name`). */
+export interface ModelTokenCost {
+  model: string;
+  tokens: number;
+  cost: number;
+}
+
+/** A span row from the `/api/v1/traces/spans` read (this client issues it as a POST) carrying model + token/cost fields. */
+interface SpanTokenCost {
+  model_name: string | null;
+  total_token_count: number | null;
+  total_token_cost: number | null;
+}
+
+/** One Arthur prompt version's metadata (no message body). */
+export interface ArthurPromptVersion {
+  version: number;
+  created_at: string;
+  deleted_at: string | null;
+  model_provider: string;
+  model_name: string;
+  tags: string[];
+  num_messages: number;
+  num_tools: number;
+}
+
+interface AgenticPromptVersionListResponse {
+  count: number;
+  versions: ArthurPromptVersion[];
+}
+
 export class ArthurClient {
   constructor(
     private readonly baseUrl: string,
@@ -56,6 +118,20 @@ export class ArthurClient {
     return (await res.json()) as T;
   }
 
+  /**
+   * GET helper for the prompt read paths: a 404 resolves to null ("absent")
+   * instead of throwing; any other non-2xx status throws with a body excerpt.
+   */
+  private async getAllowing404<T>(path: string): Promise<T | null> {
+    const headers = { Authorization: `Bearer ${this.apiKey}`, "ngrok-skip-browser-warning": "true" };
+    const response = await fetch(`${this.baseUrl}${path}`, { method: "GET", headers });
+    if (response.status === 404) return null;
+    if (response.ok) return (await response.json()) as T;
+    // Best-effort read of the error body; an unreadable body becomes "".
+    const detail = await response.text().catch(() => "");
+    throw new Error(`Arthur GET ${path} → ${response.status}: ${detail.slice(0, 300)}`);
+  }
+
   /**
    * Return tasks whose name equals `prefix` or matches `^prefix\.\d+$`.
    * Arthur's `task_name` search is substring-based, so we post-filter to
@@ -124,21 +200,8 @@ export class ArthurClient {
   /** Fetch a tagged prompt version. Returns the first message's content, or null if 404. */
   async getPromptByTag(taskId: string, name: string, tag: string): Promise<string | null> {
     const path = `/api/v1/tasks/${encodeURIComponent(taskId)}/prompts/${encodeURIComponent(name)}/versions/tags/${encodeURIComponent(tag)}`;
-    const res = await fetch(`${this.baseUrl}${path}`, {
-      method: "GET",
-      headers: {
-        "Authorization": `Bearer ${this.apiKey}`,
-        "ngrok-skip-browser-warning": "true",
-      },
-    });
-    if (res.status === 404) return null;
-    if (!res.ok) {
-      const body = await res.text().catch(() => "");
-      throw new Error(`Arthur GET ${path} → ${res.status}: ${body.slice(0, 300)}`);
-    }
-    const prompt = (await res.json()) as AgenticPrompt;
-    const first = prompt.messages?.[0];
-    return first?.content ?? null;
+    const prompt = await this.getAllowing404<AgenticPrompt>(path);
+    return prompt?.messages?.[0]?.content ?? null;
   }
 
   /** Create a new version of a named prompt on a task. Content is sent as a single user message. */
@@ -171,4 +234,110 @@ export class ArthurClient {
       },
     );
   }
+
+  /**
+   * Windowed eval/cost aggregate covering several tasks in one request.
+   * Callers sum across `overviews` for fleet totals. `taskIds` may be empty
+   * (whether empty means "all org tasks" is an open question in the specs).
+   * Shared by /evals + /cost.
+   *
+   * Throws whatever `this.request` throws.
+   */
+  async getTracesOverview(
+    taskIds: string[],
+    startTime: string,
+    endTime: string,
+  ): Promise<TraceOverviewListResponse> {
+    const payload = { task_ids: taskIds, start_time: startTime, end_time: endTime };
+    return this.request<TraceOverviewListResponse>("/api/v1/traces/overview", {
+      method: "POST",
+      body: JSON.stringify(payload),
+    });
+  }
+
+  /**
+   * Per-bucket timeseries for one task (the caller fans out per task and merges
+   * by timestamp). The response envelope is unverified, so both a bare array
+   * and a `{ points }` wrapper are accepted; a wrapper without `points` yields [].
+   */
+  async getTracesTimeseries(
+    taskId: string,
+    startTime: string,
+    endTime: string,
+    bucketSize: string,
+  ): Promise<TraceTimeseriesPoint[]> {
+    type Envelope = { points?: TraceTimeseriesPoint[] } | TraceTimeseriesPoint[];
+    const payload = {
+      task_id: taskId,
+      start_time: startTime,
+      end_time: endTime,
+      bucket_size: bucketSize,
+    };
+    const parsed = await this.request<Envelope>("/api/v1/traces/overview/timeseries", {
+      method: "POST",
+      body: JSON.stringify(payload),
+    });
+    if (Array.isArray(parsed)) return parsed;
+    return parsed.points ?? [];
+  }
+
+  /**
+   * Token/cost totals grouped by `model_name`. Arthur has no per-model
+   * overview, so span rows are fetched and summed here. Spans with a null or
+   * empty `model_name` are skipped; null token/cost values count as 0.
+   */
+  async aggregateSpanTokensByModel(
+    taskIds: string[],
+    startTime: string,
+    endTime: string,
+  ): Promise<ModelTokenCost[]> {
+    // TODO(arthur-verify): pagination — first page only, bounded to N spans. The
+    // read endpoints are unverified, so we send a bounded `limit` rather than
+    // looping pages; this makes the ceiling explicit instead of pulling an
+    // unbounded result set and summing it silently in memory.
+    type Envelope = { spans?: SpanTokenCost[] } | SpanTokenCost[];
+    const payload = {
+      task_ids: taskIds,
+      start_time: startTime,
+      end_time: endTime,
+      limit: 1000,
+    };
+    const parsed = await this.request<Envelope>("/api/v1/traces/spans", {
+      method: "POST",
+      body: JSON.stringify(payload),
+    });
+    const rows = Array.isArray(parsed) ? parsed : (parsed.spans ?? []);
+
+    // Map preserves insertion order, so output rows follow first appearance.
+    const totals = new Map<string, ModelTokenCost>();
+    for (const { model_name: model, total_token_count, total_token_cost } of rows) {
+      if (!model) continue;
+      let entry = totals.get(model);
+      if (entry === undefined) {
+        entry = { model, tokens: 0, cost: 0 };
+        totals.set(model, entry);
+      }
+      entry.tokens += total_token_count ?? 0;
+      entry.cost += total_token_cost ?? 0;
+    }
+    return Array.from(totals.values());
+  }
+
+  /** Version metadata for a named prompt, highest `version` first. Single request (no paging); a 404 yields []. */
+  async listPromptVersions(taskId: string, name: string): Promise<ArthurPromptVersion[]> {
+    const base = `/api/v1/tasks/${encodeURIComponent(taskId)}/prompts/${encodeURIComponent(name)}`;
+    const listing = await this.getAllowing404<AgenticPromptVersionListResponse>(`${base}/versions`);
+    return (listing?.versions ?? []).slice().sort((left, right) => right.version - left.version);
+  }
+
+  /**
+   * Body of one prompt version. `version` may be an integer, `"latest"`, an ISO
+   * datetime, or a tag; it is stringified and URL-encoded into the path.
+   * Resolves to the first message's content, or null on 404 / missing content.
+   */
+  async getPromptVersionBody(taskId: string, name: string, version: number | string): Promise<string | null> {
+    const base = `/api/v1/tasks/${encodeURIComponent(taskId)}/prompts/${encodeURIComponent(name)}`;
+    const found = await this.getAllowing404<AgenticPrompt>(`${base}/versions/${encodeURIComponent(String(version))}`);
+    return found?.messages?.[0]?.content ?? null;
+  }
 }
diff --git a/apps/worker/src/workflows/prompts-step.test.ts b/apps/worker/src/workflows/prompts-step.test.ts
index ab09934..4350061 100644
--- a/apps/worker/src/workflows/prompts-step.test.ts
+++ b/apps/worker/src/workflows/prompts-step.test.ts
@@ -3,9 +3,13 @@ import { describe, it, expect, vi, beforeEach } from "vitest";
 vi.mock("../../env.js", () => ({ env: {} }));
 
 const mockGetPromptByTag = vi.fn();
+const mockListPromptVersions = vi.fn();
 vi.mock("../sandbox/arthur-client.js", () => ({
   ArthurClient: {
-    fromTraceEndpoint: vi.fn(() => ({ getPromptByTag: mockGetPromptByTag })),
+    fromTraceEndpoint: vi.fn(() => ({
+      getPromptByTag: mockGetPromptByTag,
+      listPromptVersions: mockListPromptVersions,
+    })),
   },
 }));
 
@@ -20,6 +24,8 @@ async function setEnv(partial: Record<string, string | undefined>) {
 describe("loadPrompts", () => {
   beforeEach(async () => {
     mockGetPromptByTag.mockReset();
+    mockListPromptVersions.mockReset();
+    mockListPromptVersions.mockResolvedValue([]);
     await setEnv({
       GENAI_ENGINE_API_KEY: undefined,
       GENAI_ENGINE_TRACE_ENDPOINT: undefined,
@@ -65,6 +71,9 @@ describe("loadPrompts", () => {
     expect(mockGetPromptByTag).toHaveBeenCalledTimes(3);
     const names = mockGetPromptByTag.mock.calls.map((c) => c[1]);
     expect(names).toEqual(["research-plan", "implement", "review"]);
+    // The step throws version metadata away, so it must not pay for the
+    // dashboard-only listPromptVersions fan-out.
+    expect(mockListPromptVersions).not.toHaveBeenCalled();
   });
 
   it("falls back per-prompt when Arthur returns null or throws", async () => {
diff --git a/apps/worker/src/workflows/prompts-step.ts b/apps/worker/src/workflows/prompts-step.ts
index 9baae40..bc4a44f 100644
--- a/apps/worker/src/workflows/prompts-step.ts
+++ b/apps/worker/src/workflows/prompts-step.ts
@@ -6,53 +6,18 @@ export interface LoadedPrompts {
 
 export async function loadPrompts(): Promise<LoadedPrompts> {
   "use step";
-  const { env } = await import("../../env.js");
-  const { logger } = await import("../lib/logger.js");
-  const { PROMPT_FALLBACKS } = await import("../lib/prompts.js");
-  type PromptName = keyof typeof PROMPT_FALLBACKS;
-
-  const arthurEnabled =
-    !!env.GENAI_ENGINE_API_KEY &&
-    !!env.GENAI_ENGINE_TRACE_ENDPOINT &&
-    !!env.GENAI_ENGINE_PROMPT_TASK_ID;
-
-  if (!arthurEnabled) {
-    logger.info({ source: "fallback", reason: "arthur_prompts_disabled" }, "prompts_loaded");
-    return {
-      research: PROMPT_FALLBACKS["research-plan"],
-      implement: PROMPT_FALLBACKS["implement"],
-      review: PROMPT_FALLBACKS["review"],
-    };
-  }
-
-  const { ArthurClient } = await import("../sandbox/arthur-client.js");
-  const client = ArthurClient.fromTraceEndpoint(
-    env.GENAI_ENGINE_TRACE_ENDPOINT!,
-    env.GENAI_ENGINE_API_KEY!,
-  );
-  const taskId = env.GENAI_ENGINE_PROMPT_TASK_ID!;
-  const TAG = "production";
-
-  async function one(name: PromptName): Promise<string> {
-    try {
-      const body = await client.getPromptByTag(taskId, name, TAG);
-      if (body === null) {
-        logger.info({ name, source: "fallback", reason: "arthur_prompt_missing" }, "prompts_loaded");
-        return PROMPT_FALLBACKS[name];
-      }
-      logger.info({ name, source: "arthur" }, "prompts_loaded");
-      return body;
-    } catch (err) {
-      logger.warn({ name, source: "fallback", err: (err as Error).message }, "prompts_loaded");
-      return PROMPT_FALLBACKS[name];
-    }
-  }
-
-  const [research, implement, review] = await Promise.all([
-    one("research-plan"),
-    one("implement"),
-    one("review"),
-  ]);
-  return { research, implement, review };
+  // Delegate to the shared resolver so the durable step and the
+  // GET /api/v1/prompts route share one source of truth. Version history is
+  // dashboard-only, so skip the listPromptVersions fan-out here — the step
+  // only consumes prompt bodies.
+  const { resolvePrompts } = await import("../lib/overview/collect-prompts.js");
+  const { PROMPT_FALLBACKS } = await import("../lib/prompts.js");
+  const { prompts } = await resolvePrompts({ withVersions: false });
+  const byName = new Map<string, string | null | undefined>(prompts.map((p) => [p.name, p.body]));
+  // LoadedPrompts promises three strings: if the resolver ever omits a name,
+  // use the bundled fallback instead of handing `undefined` to the workflow.
+  const pick = (name: keyof typeof PROMPT_FALLBACKS): string =>
+    byName.get(name) ?? PROMPT_FALLBACKS[name];
+  return { research: pick("research-plan"), implement: pick("implement"), review: pick("review") };
 }
 loadPrompts.maxRetries = 0;
diff --git a/docs/superpowers/plans/2026-06-08-cost-real-data.md b/docs/superpowers/plans/2026-06-08-cost-real-data.md
new file mode 100644
index 0000000..091c029
--- /dev/null
+++ b/docs/superpowers/plans/2026-06-08-cost-real-data.md
@@ -0,0 +1,329 @@
+# `/cost` Real-Data Conversion Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Convert the `/cost` (Cost & Usage) dashboard page from mock data to live worker data, mirroring the overview/runs server-component fetch pattern. Cost + token usage come from **Arthur** (the GenAI Engine), which already aggregates token/cost from the OpenInference traces the workflow ships in. **Single PR** — no persistence, no capture.
+
+**Architecture:** New Arthur read methods (`getTracesOverview`, `getTracesTimeseries`, `aggregateSpanTokensByModel`) on the existing `ArthurClient`. A worker collector `collect-cost.ts` calls them and shapes a `CostResponse` (totals, by-task breakdown, by-model breakdown, merged daily series). A new route `GET /api/v1/cost` exposes it, degrading to empty when Arthur is unconfigured/unreachable. The dashboard fetches it server-side via `getJSON`, falls back to an empty `CostResponse`, and passes `data` to the `CostScreen` client presenter. Thin `page.tsx` wraps `cost-data.tsx` in `<Suspense>`. Identical read-path shape to `overview-data.tsx` / `runs-data.tsx`.
+
+**Tech Stack:** Next.js App Router, React, TypeScript, `@shared/contracts`, h3 worker routes, existing `ArthurClient` (fetch + Bearer). Worker has Vitest (`*.test.ts`); dashboard has none — dashboard verification is `npx tsc --noEmit`, `next lint`, and a manual browser check.
+
+**Spec:** `docs/superpowers/specs/2026-06-08-cost-real-data-design.md`
+
+**Note on commits:** This repo's owner stages commits manually. Do NOT commit unless the user explicitly asks. The final task lists the commit command for when they do.
+
+**Live open questions (resolve with the user; the plan assumes the spec's defaults):** `bucket_size` allowed values for the timeseries; whether empty `task_ids` means org-wide (else enumerate tasks); by-model client aggregation acceptable; task→workflow mapping (breakdown stays per-task); window = calendar MTD. See the spec's "Open questions".
+
+---
+
+### Task 1: Add Arthur read methods + types
+
+**Files:**
+- Modify: `apps/worker/src/sandbox/arthur-client.ts`
+- Modify: `apps/worker/src/sandbox/arthur-client.test.ts`
+
+- [ ] **Step 1: Add response types**
+
+Add interfaces mirroring Arthur's shapes:
+
+```ts
+export interface TraceOverviewEntry {
+  task_id: string;
+  trace_count: number;
+  trace_token_count: number;
+  trace_token_cost: number | null;
+  last_active?: string;
+}
+export interface TraceTimeseriesPoint {
+  timestamp: string;
+  trace_count: number;
+  trace_token_count: number;
+  trace_token_cost: number | null;
+}
+export interface SpanTokenCost {
+  model_name: string | null;
+  total_token_count: number | null;
+  total_token_cost: number | null;
+}
+```
+
+- [ ] **Step 2: Add `getTracesOverview`**
+
+```ts
+async getTracesOverview(taskIds: string[], startTime: string, endTime: string): Promise<TraceOverviewEntry[]> {
+  const { overviews } = await this.request<{ count: number; overviews: TraceOverviewEntry[] }>(
+    "/api/v1/traces/overview",
+    { method: "POST", body: JSON.stringify({ task_ids: taskIds, start_time: startTime, end_time: endTime }) },
+  );
+  return overviews;
+}
+```
+
+- [ ] **Step 3: Add `getTracesTimeseries`** (single task per call; caller fans out + merges)
+
+```ts
+async getTracesTimeseries(taskId: string, startTime: string, endTime: string, bucketSize: string): Promise<TraceTimeseriesPoint[]> {
+  const res = await this.request<{ points?: TraceTimeseriesPoint[] } | TraceTimeseriesPoint[]>(
+    "/api/v1/traces/overview/timeseries",
+    { method: "POST", body: JSON.stringify({ task_id: taskId, start_time: startTime, end_time: endTime, bucket_size: bucketSize }) },
+  );
+  return Array.isArray(res) ? res : (res.points ?? []);
+}
+```
+
+> The response envelope key is unconfirmed — handle both array and `{ points }`. Confirm against a live call.
+
+- [ ] **Step 4: Add `aggregateSpanTokensByModel`** (the one client-side aggregation)
+
+Fetch span rows for the window via `GET /api/v1/traces/spans` (paginate if the API requires it), then sum `total_token_count`/`total_token_cost` grouped by `model_name`. Return `Array<{ model: string; tokens: number; cost: number }>`. Skip rows with null `model_name`.
+
+- [ ] **Step 5: Test**
+
+First add tests with a stubbed `fetch` asserting that each method sends the right request (HTTP method, path, body) and parses the response (mirror the existing client tests). Then run: `cd apps/worker && pnpm vitest run src/sandbox/arthur-client.test.ts`
+Expected: PASS.
+
+---
+
+### Task 2: Add the `CostResponse` contract
+
+**Files:**
+- Modify: `apps/shared/contracts/api.ts`
+
+- [ ] **Step 1: Add the interfaces**
+
+Add `CostByModelEntry`, `CostByWorkflowEntry`, and `CostResponse` exactly as specified in the spec ("Proposed contract").
+
+- [ ] **Step 2: Typecheck shared**
+
+Run: `cd apps/shared && npx tsc --noEmit` (or root `pnpm -w typecheck` if defined)
+Expected: PASS.
+
+---
+
+### Task 3: Add the `collectCost` aggregator + worker route
+
+**Files:**
+- Create: `apps/worker/src/lib/overview/collect-cost.ts`
+- Create: `apps/worker/src/lib/overview/collect-cost.test.ts`
+- Create: `apps/worker/src/routes/api/v1/cost.get.ts`
+
+- [ ] **Step 1: Write `collectCost`**
+
+Signature: `collectCost(client: ArthurClient, opts: { now: Date; bucketSize: string }): Promise<Omit<CostResponse, "generatedAt" | "available">>` — the route (Step 2) owns `generatedAt` and `available`.
+
+Logic:
+1. Resolve the window: `start = startOfMonth(now)`, `end = now` (both ISO strings), and return it as `window: { start, end }`. (Assumption: calendar MTD — see open Q5.)
+2. Resolve `taskIds`: enumerate the org's tasks (assumption from open Q2 — pass ids explicitly). Reuse/extend the client's task listing (`/api/v2/tasks/search`); if a true org-wide overview via empty `task_ids` is confirmed, pass `[]` instead.
+3. `overviews = await client.getTracesOverview(taskIds, start, end)`.
+   - `totals`: sum `trace_token_cost` (→ `totalTokenCost`), `trace_token_count` (→ `totalTokens`), `trace_count` (→ `traceCount`); `costPerRun = totalTokenCost / max(1, traceCount)`. Treat null `trace_token_cost` as 0.
+   - `byWorkflow`: one entry per overview → `{ taskId, name, runs, tokens, cost, costPerRun }`. `name` from the task listing (task name = ticket-run id).
+4. `byModel = await client.aggregateSpanTokensByModel(...)` → map to `{ model, cost, tokens }`.
+5. `daily`: fan out `getTracesTimeseries(taskId, start, end, bucketSize)` per task; **merge points by `timestamp`** summing cost/tokens; sort oldest→newest → `{ date, cost, tokens }[]`.
+
+Keep I/O behind the injected `client` so the aggregation is unit-testable with a fake client (mirror how `collect-runs.ts` takes a `RunsLister`).
+
+- [ ] **Step 2: Write the route**
+
+Mirror `workflows.get.ts`:
+```ts
+setResponseHeader(event, "Cache-Control", "private, max-age=15, stale-while-revalidate=60");
+const generatedAt = new Date().toISOString();
+if (!env.GENAI_ENGINE_API_KEY || !env.GENAI_ENGINE_TRACE_ENDPOINT) {
+  return { generatedAt, available: false, ...EMPTY };
+}
+try {
+  const client = ArthurClient.fromTraceEndpoint(env.GENAI_ENGINE_TRACE_ENDPOINT, env.GENAI_ENGINE_API_KEY);
+  const data = await collectCost(client, { now: new Date(), bucketSize: "day" });
+  return { generatedAt, available: true, ...data };
+} catch (err) {
+  logger.warn({ err: (err as Error).message }, "cost_collect_failed");
+  return { generatedAt, available: false, ...EMPTY };
+}
+```
+`EMPTY` = the empty totals/arrays/window matching `costFallback`.
+
+- [ ] **Step 3: Test the aggregator**
+
+Run: `cd apps/worker && pnpm vitest run src/lib/overview/collect-cost.test.ts`
+Expected: with a fake client returning fixtures (2 tasks, 2 models, multi-day timeseries), assert totals, `byWorkflow` rows + `costPerRun`, `byModel` grouping, and merged-by-timestamp `daily`. Empty/null inputs → zeros/empty arrays. PASS.
+
+- [ ] **Step 4: Worker typecheck**
+
+Run: `cd apps/worker && npx tsc --noEmit`
+Expected: PASS.
+
+---
+
+### Task 4: Add the dashboard fallback
+
+**Files:**
+- Modify: `apps/dashboard/lib/api/fallbacks.ts`
+
+- [ ] **Step 1: Add `costFallback`**
+
+```ts
+export function costFallback(now: string): CostResponse {
+  return {
+    generatedAt: now,
+    available: false,
+    window: { start: now, end: now },
+    totals: { totalTokenCost: 0, totalTokens: 0, traceCount: 0, costPerRun: 0 },
+    byModel: [],
+    byWorkflow: [],
+    daily: [],
+  };
+}
+```
+
+Add `CostResponse` to the existing `@shared/contracts` import.
+
+- [ ] **Step 2: Typecheck**
+
+Run: `cd apps/dashboard && npx tsc --noEmit`
+Expected: PASS (no consumers yet).
+
+---
+
+### Task 5: Add the skeleton + server data component, and convert `CostScreen`
+
+**Files:**
+- Create: `apps/dashboard/app/cost-skeleton.tsx`
+- Create: `apps/dashboard/app/cost-data.tsx`
+- Modify: `apps/dashboard/components/cockpit/screens/cost.tsx`
+
+- [ ] **Step 1: Create the skeleton**
+
+Mirror `overview-skeleton.tsx`, shaped to the cost layout (after embellishments are stripped: 3 KPI blocks, a chart+donut row, two table blocks):
+
+```tsx
+// apps/dashboard/app/cost-skeleton.tsx
+function Block({ className = "" }: { className?: string }) {
+  return <div className={`bg-neutral-200/60 rounded-sm animate-pulse ${className}`} />;
+}
+export function CostSkeleton() {
+  return (
+    <div className="px-6 pt-5 pb-8 flex flex-col gap-4">
+      <div className="grid grid-cols-3 gap-3">
+        {Array.from({ length: 3 }, (_, i) => <Block key={i} className="h-[100px]" />)}
+      </div>
+      <div className="grid lg:grid-cols-[1.5fr_1fr] gap-3">
+        <Block className="h-[260px]" /><Block className="h-[260px]" />
+      </div>
+      <Block className="h-[300px]" />
+      <Block className="h-[300px]" />
+    </div>
+  );
+}
+```
+
+- [ ] **Step 2: Create the server data component**
+
+```tsx
+// apps/dashboard/app/cost-data.tsx
+import { getJSON } from "@/lib/api/server";
+import { CostScreen } from "@/components/cockpit/screens/cost";
+import type { CostResponse } from "@shared/contracts";
+import { costFallback } from "@/lib/api/fallbacks";
+
+export async function CostData() {
+  const now = new Date().toISOString();
+  const data = await getJSON<CostResponse>("/api/v1/cost").catch(() =>
+    costFallback(now),
+  );
+  return <CostScreen data={data} />;
+}
+```
+
+> Will not typecheck until Step 3 changes `CostScreen`'s signature. The full gate is in Task 6.
+
+- [ ] **Step 3: Convert `CostScreen` to consume `data` and strip embellishments**
+
+In `components/cockpit/screens/cost.tsx`:
+- Remove `import { AIWF_DATA } from "@/lib/data/mock"`, `import { sparkSeries } from "@/lib/rng"`, the `Spark` import (no longer used), and `const D = AIWF_DATA`.
+- Add `import type { CostResponse } from "@shared/contracts";`.
+- Signature → `export function CostScreen({ data }: { data: CostResponse })`.
+- KPIs: `total = data.totals.totalTokenCost`; tokens = `data.totals.totalTokens`; "Cost / run avg" = `$${data.totals.costPerRun.toFixed(2)}`. **Remove** the "Projection · EoM" KPI tile, the `of $1,200 budget` sub, and all `delta`/`deltaTone` props (no source).
+- Header: **remove** the `<CkTabs ... By model/workflow/actor>` and the `Export CSV` button.
+- Area chart: feed `data.daily.map(d => d.cost)` and labels `data.daily.map(d => d.date)` (format the ISO date to a short label in-screen); **remove** the inner Cost/Tokens `CkTabs` action.
+- Donut: shares computed in-screen from `byModel` — `const totalCost = data.byModel.reduce((a,m)=>a+m.cost,0); shares = data.byModel.map(m => totalCost ? m.cost/totalCost : 0)`; center = `"$" + Math.round(total)`.
+- Per-model table: map `data.byModel` → columns `{ m.model, m.tokens, m.cost, share }`. **Remove** the `Vendor` column (not in contract) and the `Trend`/`Spark` column.
+- Per-workflow table: map `data.byWorkflow` (already aggregated) → `{ w.name, w.taskId, w.runs, w.tokens, w.cost, w.costPerRun }`. **Remove** the in-component `tokens = runs24h*2400`/`perRun` derivations, the `primary` chip / `gateway` line (not in contract), and the `Trend`/`Spark` column. Header label can stay "Per-workflow breakdown" (rows are per task — see spec mapping note).
+
+- [ ] **Step 4: Verify no mock/embellishment refs remain**
+
+Run: `grep -nE "\bD\.|AIWF_DATA|sparkSeries|Spark|COST_BY_MODEL|HOURS24|Export CSV|deltaTone|By actor" apps/dashboard/components/cockpit/screens/cost.tsx`
+Expected: no matches.
+
+---
+
+### Task 6: Rewrite the route + full verification
+
+**Files:**
+- Modify: `apps/dashboard/app/(cockpit)/cost/page.tsx`
+
+- [ ] **Step 1: Replace the page with the Suspense + server-component pattern**
+
+```tsx
+// apps/dashboard/app/(cockpit)/cost/page.tsx — Cost & usage ("/cost")
+import { Suspense } from "react";
+import { CostData } from "@/app/cost-data";
+import { CostSkeleton } from "@/app/cost-skeleton";
+
+export default function CostPage() {
+  return (
+    <Suspense fallback={<CostSkeleton />}>
+      <CostData />
+    </Suspense>
+  );
+}
+```
+
+- [ ] **Step 2: Typecheck both apps**
+
+Run: `cd apps/worker && npx tsc --noEmit && cd ../dashboard && npx tsc --noEmit`
+Expected: PASS, no errors.
+
+- [ ] **Step 3: Lint the changed dashboard files**
+
+Run: `cd apps/dashboard && npx next lint --file app/cost-data.tsx --file app/cost-skeleton.tsx --file "app/(cockpit)/cost/page.tsx" --file components/cockpit/screens/cost.tsx`
+Expected: no errors.
+
+- [ ] **Step 4: Visual check**
+
+Run: `cd apps/dashboard && pnpm dev`, open `http://localhost:3001/cost`.
+Expected:
+- With Arthur configured + traces present: real spend, token totals, per-model donut/table, per-task table, and per-day spend chart render.
+- With Arthur unconfigured (env unset) or unreachable: zero/empty state — KPIs `$0.00`/`0`, empty tables, empty chart — no crash.
+
+- [ ] **Step 5: Commit (ONLY if the user asks)**
+
+```bash
+git add apps/shared/contracts/api.ts \
+  apps/worker/src/sandbox/arthur-client.ts apps/worker/src/sandbox/arthur-client.test.ts \
+  apps/worker/src/lib/overview/collect-cost.ts apps/worker/src/lib/overview/collect-cost.test.ts apps/worker/src/routes/api/v1/cost.get.ts \
+  apps/dashboard/lib/api/fallbacks.ts \
+  apps/dashboard/app/cost-data.tsx apps/dashboard/app/cost-skeleton.tsx \
+  "apps/dashboard/app/(cockpit)/cost/page.tsx" \
+  apps/dashboard/components/cockpit/screens/cost.tsx
+git commit -m "feat: wire /cost to real Arthur usage data"
+```
+
+---
+
+## Self-Review
+
+**Spec coverage:**
+- Arthur read methods (`getTracesOverview`, `getTracesTimeseries`, `aggregateSpanTokensByModel`) → Task 1. ✓
+- `CostResponse` contract with field-level types → Task 2 (from spec). ✓
+- `collectCost` aggregator (totals / byWorkflow=per-task / byModel / merged daily) + `/api/v1/cost` route with Arthur-unconfigured degrade → Task 3. ✓
+- `costFallback` empty state → Task 4. ✓
+- `cost-data.tsx` + `cost-skeleton.tsx` + `CostScreen` swap with embellishments **removed** (budget, deltas, EoM projection, tabs, CSV, sparklines, vendor/primary/gateway) → Task 5. ✓
+- Thin Suspense page → Task 6. ✓
+- Arthur-down / unconfigured empty state → fallback (Task 4), route degrade (Task 3), verified (Task 6 Step 4). ✓
+- Single PR, no Redis/persistence/capture → no such tasks. ✓
+
+**Reuse check:** Read methods extend the existing `ArthurClient` (same `request<T>` + Bearer auth + `fromTraceEndpoint`). Cost comes straight from Arthur's `*_token_cost` — no client-side pricing, the `pricing.ts`/`usage.ts` Slack path is untouched. Read path reuses `getJSON`/fallback/Suspense. Only new infra is one collector + one route — consistent with runs/overview. ✓
+
+**Placeholder scan:** No TBD/TODO; the only deferred items are the spec's flagged open questions (`bucket_size`, empty `task_ids`, by-model aggregation, task→workflow, window) and the explicitly-removed embellishments. ✓
+
+**Type consistency:** `CostResponse` imported from `@shared/contracts` in `cost-data.tsx` (Task 5), `fallbacks.ts` (Task 4), and the route (Task 3). `CostScreen` accepts `{ data: CostResponse }` (Task 5) matching the call site (Task 5 Step 2). Arthur response types (Task 1) feed `collectCost` (Task 3). ✓
diff --git a/docs/superpowers/plans/2026-06-08-evals-real-data.md b/docs/superpowers/plans/2026-06-08-evals-real-data.md
new file mode 100644
index 0000000..cff77e1
--- /dev/null
+++ b/docs/superpowers/plans/2026-06-08-evals-real-data.md
@@ -0,0 +1,421 @@
+# `/evals` Real-Data Conversion Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Convert the `/evals` dashboard page from mock data to live worker data, mirroring the overview/runs server-component fetch pattern. Because no evals list endpoint or eval-read path exists yet, this plan also builds the worker contract, route, and Arthur read path as a prerequisite.
+
+**Architecture:** New worker route `GET /api/v1/evals` → `EvalsResponse` (discriminated union, same `available` pattern as `EvalHealthResponse`). A new collector `collect-evals.ts` calls the **confirmed** Arthur read endpoint `POST /api/v1/traces/overview` via a new `getTracesOverview()` method on `ArthurClient`, sums the per-task overviews into a fleet `score`/`spansGraded`/`traceCount`, and degrades to `available: false` when Arthur is unconfigured, unreachable, or nothing is graded. On the dashboard, a thin server route (`page.tsx`) wraps a server component (`evals-data.tsx`) in `<Suspense>`; that component fetches via `getJSON`, falls back to `evalsFallback`, and passes `data` to the client presenter `EvalsScreen`. Identical in shape to `runs-data.tsx` / `RunsScreen`.
+
+**Scope note (read first):** Arthur's read API is confirmed (auth = same `Bearer GENAI_ENGINE_API_KEY`, org-scoped). Our trace path (`POST /api/v1/traces`) only produces `continuous_eval_success_rate`, `eval_count`, `trace_count`, and the three relevance/tool metric types — **and only if continuous evals are configured on the task.** The mock's rule families (hallucination/PII/toxicity/prompt-injection) come from Arthur's `/validate_*` write path, which **we do not call** — they are **out of scope** and dropped from this page. The first increment ships the **fleet aggregate** (score + graded count + window); the per-metric relevance/tool breakdown and trend/sparkline are optional follow-ons (Tasks 3b/3c).
+
+**Tech Stack:** Worker = h3 + Nitro routes, `@shared/contracts` types, Vitest. Dashboard = Next.js App Router, React 19, TypeScript. Dashboard has no test framework — verification is `npx tsc --noEmit`, `next lint`, and a manual browser check.
+
+**Spec:** `docs/superpowers/specs/2026-06-08-evals-real-data-design.md`
+
+**Required env vars (worker):** `GENAI_ENGINE_API_KEY`, `GENAI_ENGINE_TRACE_ENDPOINT` (both already declared optional in `apps/worker/env.ts`; the base read URL is derived from the trace endpoint via `ArthurClient.fromTraceEndpoint`). Reads need the `INFERENCE_READ` permission on the key. No new dashboard env vars — `/evals` reuses `WORKER_BASE_URL` / `WORKER_API_TOKEN` via `getJSON`.
+
+**Remaining open items (non-blocking — see spec Open Questions):** (1) `bucket_size` values for the optional timeseries call; (2) whether empty `task_ids` on `/traces/overview` means "all org tasks" (else enumerate via `/api/v2/tasks/search`); (3) whether continuous evals are actually configured on our live tasks (if not, the page legitimately shows "No graded evals"). None block the aggregate-only increment.
+
+**Note on commits:** This repo's owner stages commits manually. Do NOT commit unless the user explicitly asks. The final task lists the commit command for when they do.
+
+---
+
+### Task 1: Add the `EvalsResponse` contract
+
+**Files:**
+- Modify: `apps/shared/contracts/api.ts`
+
+- [ ] **Step 1: Add `EvalMetricRow` and `EvalsResponse`**
+
+Append after the existing `EvalHealthResponse` union:
+
+```ts
+export interface EvalMetricRow {
+  metric: string;
+  metricType: "QueryRelevance" | "ResponseRelevance" | "ToolSelection";
+  value: number;
+  status: "pass" | "warn" | "fail";
+  axis: "quality";
+  trend?: number | null;   // only if timeseries wired (Task 3c)
+  spark?: number[];        // only if timeseries wired (Task 3c)
+}
+
+export type EvalsResponse =
+  | {
+      available: true;
+      generatedAt: string;
+      windowHours: number;
+      score: number;        // continuous_eval_success_rate × 100, fleet-wide
+      spansGraded: number;  // Σ eval_count
+      traceCount: number;   // Σ trace_count
+      rows: EvalMetricRow[]; // [] in the aggregate-only first cut
+    }
+  | { available: false; generatedAt: string; reason: string };
+```
+
+- [ ] **Step 2: Typecheck shared**
+
+Run: `cd apps/shared && npx tsc --noEmit`
+Expected: PASS.
+
+---
+
+### Task 2: Add the dashboard fallback
+
+**Files:**
+- Modify: `apps/dashboard/lib/api/fallbacks.ts`
+
+- [ ] **Step 1: Import the type and add the fallback**
+
+Add `EvalsResponse` to the existing `@shared/contracts` import block, then add:
+
+```ts
+export function evalsFallback(now: string): EvalsResponse {
+  return { available: false, generatedAt: now, reason: "Worker unavailable." };
+}
+```
+
+- [ ] **Step 2: Typecheck dashboard**
+
+Run: `cd apps/dashboard && npx tsc --noEmit`
+Expected: PASS (the new export is unused so far, but valid).
+
+---
+
+### Task 3: Build the Arthur read path + collector (fleet aggregate)
+
+This is the first, shippable increment: fleet `score` / `spansGraded` / `traceCount`, `rows: []`. The per-metric breakdown (3b) and trend/sparkline (3c) are optional follow-ons below.
+
+**Files:**
+- Modify: `apps/worker/src/sandbox/arthur-client.ts` (add a read method)
+- Create: `apps/worker/src/lib/overview/collect-evals.ts`
+- Create: `apps/worker/src/lib/overview/collect-evals.test.ts`
+
+- [ ] **Step 1: Add `getTracesOverview()` to `ArthurClient`**
+
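+Add a method reusing the existing private `request<T>` helper and bearer auth. (Note: the `/cost` plan, `2026-06-08-cost-real-data.md`, also adds a `getTracesOverview` to `ArthurClient` with a positional signature and a bare-array return; whichever plan lands second must reuse or extend the existing method rather than redeclare it.)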
+
+```ts
+interface TraceOverview {
+  task_id: string;
+  trace_count: number;
+  trace_token_count: number;
+  trace_token_cost: number;
+  eval_count: number;
+  continuous_eval_success_rate: number;
+  last_active: string;
+}
+interface TraceOverviewListResponse { count: number; overviews: TraceOverview[]; }
+
+async getTracesOverview(opts: {
+  taskIds: string[];          // may be empty — see Open Q2
+  startTime: string;          // ISO
+  endTime: string;            // ISO
+}): Promise<TraceOverviewListResponse> {
+  return this.request<TraceOverviewListResponse>("/api/v1/traces/overview", {
+    method: "POST",
+    body: JSON.stringify({
+      task_ids: opts.taskIds,
+      start_time: opts.startTime,
+      end_time: opts.endTime,
+    }),
+  });
+}
+```
+
+Keep the raw Arthur types in the client module — export `TraceOverview` so `collect-evals.ts` (Step 2) can import it — but do not leak them into `@shared/contracts`.
+
+> **Task-id enumeration (Open Q2):** if `task_ids: []` is confirmed to mean "all org tasks", pass `[]`. Otherwise enumerate the org's tasks first. The client already searches tasks via `POST /api/v2/tasks/search` (`findTicketTasks`); add a thin `listAllTasks()` if a full enumeration is needed, or have the collector accept a pre-resolved `taskIds`. Default the collector to receive `taskIds` so the route owns the enumeration policy.
+
+- [ ] **Step 2: Write `collect-evals.ts`**
+
+Mirror `collect-runs.ts`/`collect-kpis.ts` — accept an injected fetcher and resolve to the `available: true` fields minus `generatedAt`:
+
+```ts
+export interface CollectEvalsOptions {
+  fetchOverview: (o: { taskIds: string[]; startTime: string; endTime: string })
+    => Promise<{ overviews: TraceOverview[] }>;
+  taskIds: string[];
+  windowHours: number;
+  now: Date;
+}
+
+// Always returns { windowHours, score, spansGraded, traceCount, rows }. When
+// nothing is graded, spansGraded and score are 0 and the route emits available:false.
+export async function collectEvals(opts: CollectEvalsOptions) {
+  const endTime = opts.now.toISOString();
+  const startTime = new Date(opts.now.getTime() - opts.windowHours * 3_600_000).toISOString();
+  const { overviews } = await opts.fetchOverview({ taskIds: opts.taskIds, startTime, endTime });
+
+  const spansGraded = sum(overviews, o => o.eval_count);
+  const traceCount  = sum(overviews, o => o.trace_count);
+  // weight success rate by eval_count; 0 graded → caller emits unavailable
+  const score = spansGraded === 0
+    ? 0
+    : (sum(overviews, o => o.continuous_eval_success_rate * o.eval_count) / spansGraded) * 100;
+
+  return { windowHours: opts.windowHours, score, spansGraded, traceCount, rows: [] };
+}
+```
+
+The injected-fetcher boundary keeps the Arthur shape isolated and unit-testable.
+
+- [ ] **Step 3: Unit test the collector**
+
+In `collect-evals.test.ts`, feed stubbed `overviews` and assert: `spansGraded`/`traceCount` are summed, `score` is the eval-count-weighted success rate × 100, and `spansGraded === 0` yields `score === 0` (route turns this into `available:false`). Mirror the style of the existing `collect-*` tests.
+
+Run: `cd apps/worker && npx vitest run src/lib/overview/collect-evals.test.ts`
+Expected: PASS.
+
+- [ ] **Step 3b (optional follow-on): per-metric relevance/tool breakdown**
+
+Only the three Arthur metric types exist on our path. To populate `rows`: list spans (`GET /api/v1/traces/spans`), fetch each span's `metric_results` (`GET /api/v1/traces/spans/{span_id}` → `SpanWithMetricsResponse.metric_results`), parse the opaque `details` JSON string per `metric_type` (e.g. relevance → `llm_relevance_score`), aggregate per metric type, and apply a worker-owned pass/warn/fail threshold. Map each to `EvalMetricRow { metric, metricType, value, status, axis: "quality" }`. Add this behind the same collector with extra fetchers; keep `rows: []` until implemented.
+
+- [ ] **Step 3c (optional follow-on): trend/sparkline**
+
+Wire `POST /api/v1/traces/overview/timeseries` (single task per call) to populate `EvalMetricRow.trend`/`spark` from `continuous_eval_success_rate` buckets. **Confirm `bucket_size` allowed values first (Open Q1).** Until wired, omit `trend`/`spark` entirely — no synthetic series.
+
+---
+
+### Task 4: Add the worker route `GET /api/v1/evals`
+
+**Files:**
+- Create: `apps/worker/src/routes/api/v1/evals.get.ts`
+
+- [ ] **Step 1: Create the route**
+
+Mirror `apps/worker/src/routes/api/v1/runs.get.ts`:
+
+```ts
+import { defineEventHandler, setResponseHeader } from "h3";
+import type { EvalsResponse } from "@shared/contracts";
+import { env } from "../../../../env.js";
+import { ArthurClient } from "../../../sandbox/arthur-client.js";
+import { collectEvals } from "../../../lib/overview/collect-evals.js";
+import { logger } from "../../../lib/logger.js";
+
+const WINDOW_HOURS = 24;
+
+export default defineEventHandler(async (event): Promise<EvalsResponse> => {
+  setResponseHeader(
+    event,
+    "Cache-Control",
+    "private, max-age=15, stale-while-revalidate=60",
+  );
+  const generatedAt = new Date().toISOString();
+
+  if (!env.GENAI_ENGINE_API_KEY || !env.GENAI_ENGINE_TRACE_ENDPOINT) {
+    return { available: false, generatedAt, reason: "Arthur GenAI Engine not configured." };
+  }
+
+  try {
+    const client = ArthurClient.fromTraceEndpoint(
+      env.GENAI_ENGINE_TRACE_ENDPOINT,
+      env.GENAI_ENGINE_API_KEY,
+    );
+    // Open Q2: pass [] if empty === all org tasks; else enumerate via tasks/search.
+    const taskIds: string[] = [];
+    const { windowHours, score, spansGraded, traceCount, rows } = await collectEvals({
+      fetchOverview: (o) => client.getTracesOverview(o),
+      taskIds,
+      windowHours: WINDOW_HOURS,
+      now: new Date(),
+    });
+    if (spansGraded === 0) {
+      return { available: false, generatedAt, reason: "No graded evals in the last 24h." };
+    }
+    return { available: true, generatedAt, windowHours, score, spansGraded, traceCount, rows };
+  } catch (err) {
+    logger.warn({ err: (err as Error).message }, "evals_list_failed");
+    return { available: false, generatedAt, reason: "Arthur GenAI Engine unreachable." };
+  }
+});
+```
+
+- [ ] **Step 2: Typecheck worker**
+
+Run: `cd apps/worker && npx tsc --noEmit`
+Expected: PASS.
+
+- [ ] **Step 3: Hit the route**
+
+Run the worker locally and `curl -H "Authorization: Bearer $WORKER_API_TOKEN" localhost:<port>/api/v1/evals`.
+Expected:
+- Arthur unconfigured → `{ available: false, ..., reason: "Arthur GenAI Engine not configured." }`.
+- Configured but nothing graded → `{ available: false, ..., reason: "No graded evals in the last 24h." }`.
+- Configured + graded → `available: true` with `score` / `spansGraded` / `traceCount` (and `rows` once 3b is built).
+
+---
+
+### Task 5: Add the loading skeleton
+
+**Files:**
+- Create: `apps/dashboard/app/evals-skeleton.tsx`
+
+- [ ] **Step 1: Create the skeleton**
+
+Mirror `apps/dashboard/app/overview-skeleton.tsx` — header + one card-shaped block (the Quality group):
+
+```tsx
+// apps/dashboard/app/evals-skeleton.tsx
+function Block({ className = "" }: { className?: string }) {
+  return <div className={`bg-neutral-200/60 rounded-sm animate-pulse ${className}`} />;
+}
+
+export function EvalsSkeleton() {
+  return (
+    <div className="px-4 lg:px-6 pt-5 pb-8 flex flex-col gap-4">
+      <div className="flex items-center justify-between">
+        <Block className="h-10 w-72" />
+        <Block className="h-8 w-64" />
+      </div>
+      <Block className="h-[200px]" />
+    </div>
+  );
+}
+```
+
+- [ ] **Step 2: Typecheck**
+
+Run: `cd apps/dashboard && npx tsc --noEmit`
+Expected: PASS.
+
+---
+
+### Task 6: Add the server data component
+
+**Files:**
+- Create: `apps/dashboard/app/evals-data.tsx`
+
+- [ ] **Step 1: Create the server component**
+
+Mirror `apps/dashboard/app/runs-data.tsx`:
+
+```tsx
+import { getJSON } from "@/lib/api/server";
+import { EvalsScreen } from "@/components/cockpit/screens/evals";
+import type { EvalsResponse } from "@shared/contracts";
+import { evalsFallback } from "@/lib/api/fallbacks";
+
+export async function EvalsData() {
+  const now = new Date().toISOString();
+  const data = await getJSON<EvalsResponse>("/api/v1/evals").catch(() =>
+    evalsFallback(now),
+  );
+  return <EvalsScreen data={data} />;
+}
+```
+
+> This will not typecheck until Task 7 changes `EvalsScreen`'s signature. Expected; full typecheck gate is Task 8.
+
+---
+
+### Task 7: Convert `EvalsScreen` to consume real data
+
+**Files:**
+- Modify: `apps/dashboard/components/cockpit/screens/evals.tsx`
+
+- [ ] **Step 1: Replace imports and signature**
+
+- Remove `import { AIWF_DATA } from "@/lib/data/mock"` and `const D = AIWF_DATA`.
+- Add `import type { EvalsResponse, EvalMetricRow } from "@shared/contracts"`.
+- Change `export function EvalsScreen()` → `export function EvalsScreen({ data }: { data: EvalsResponse })`.
+
+Also remove `import { jitterSeries } from "@/lib/rng"` (synthetic sparklines are dropped) and the `groups`/`accents`/`titles` axis-map scaffolding — only the single Quality group remains.
+
+- [ ] **Step 2: Handle the unavailable branch**
+
+When `data.available === false`, render the existing header block (eyebrow + title) but replace the chip with a neutral one and the metric cards with a single panel showing `data.reason`. Mirror the reason path in `EvalHealthKPI` (`overview.tsx`). This covers unconfigured, "no graded evals", and worker-down.
+
+- [ ] **Step 3: Drive the available branch**
+
+- Drive the live chip from `data.spansGraded.toLocaleString("en-US")` + `data.windowHours` instead of the hardcoded `12,408 spans · 24h`; surface `data.score` (e.g. as the headline number).
+- Render a single **Quality** `CkCard` over `data.rows` (all `axis: "quality"`). If `data.rows` is empty (aggregate-only first cut), render just the score + graded-count header, no per-metric grid.
+- Per row: show `metric`, formatted `value`, and the pass/warn/fail `CkChip`.
+- Trend/sparkline: render `e.trend` / `<Spark data={e.spark} ... />` **only when present**; otherwise render neither. No `jitterSeries`.
+
+- [ ] **Step 4: Verify no mock/jitter references remain**
+
+Run: `grep -nE "AIWF_DATA|\bD\.|jitterSeries" apps/dashboard/components/cockpit/screens/evals.tsx`
+Expected: no matches.
+
+---
+
+### Task 8: Rewrite the route to the server pattern + verify
+
+**Files:**
+- Modify: `apps/dashboard/app/(cockpit)/evals/page.tsx`
+
+- [ ] **Step 1: Replace the page with the Suspense + server-component pattern**
+
+```tsx
+// apps/dashboard/app/(cockpit)/evals/page.tsx — Arthur evals ("/evals")
+import { Suspense } from "react";
+
+import { EvalsData } from "@/app/evals-data";
+import { EvalsSkeleton } from "@/app/evals-skeleton";
+
+export default function EvalsPage() {
+  return (
+    <Suspense fallback={<EvalsSkeleton />}>
+      <EvalsData />
+    </Suspense>
+  );
+}
+```
+
+- [ ] **Step 2: Typecheck the whole app**
+
+Run: `cd apps/dashboard && npx tsc --noEmit` and `cd apps/worker && npx tsc --noEmit`
+Expected: PASS, no errors.
+
+- [ ] **Step 3: Lint the changed dashboard files**
+
+Run: `cd apps/dashboard && npx next lint --file app/evals-data.tsx --file app/evals-skeleton.tsx --file "app/(cockpit)/evals/page.tsx" --file components/cockpit/screens/evals.tsx`
+Expected: no errors.
+
+- [ ] **Step 4: Visual check**
+
+Run: `cd apps/dashboard && pnpm dev` (port 3001), open `http://localhost:3001/evals`.
+Expected:
+- With the worker unreachable or Arthur unconfigured: header chrome renders + a single reason panel ("Worker unavailable." / "Arthur GenAI Engine not configured."), no crash.
+- With Arthur configured but nothing graded (`eval_count = 0`): the "No graded evals in the last 24h." panel.
+- With Arthur configured + graded: the real fleet `score` + spans-graded count over the 24h window render; the Quality breakdown appears once Task 3b is built (else just the aggregate header). No sparklines unless Task 3c is wired.
+
+- [ ] **Step 5: Commit (ONLY if the user asks)**
+
+```bash
+git add apps/shared/contracts/api.ts \
+  apps/worker/src/sandbox/arthur-client.ts \
+  apps/worker/src/lib/overview/collect-evals.ts \
+  apps/worker/src/lib/overview/collect-evals.test.ts \
+  apps/worker/src/routes/api/v1/evals.get.ts \
+  apps/dashboard/lib/api/fallbacks.ts \
+  apps/dashboard/app/evals-data.tsx \
+  apps/dashboard/app/evals-skeleton.tsx \
+  "apps/dashboard/app/(cockpit)/evals/page.tsx" \
+  apps/dashboard/components/cockpit/screens/evals.tsx
+git commit -m "feat: wire /evals to real Arthur eval data"
+```
+
+---
+
+## Self-Review
+
+**Spec coverage:**
+- `EvalsResponse` / `EvalMetricRow` contract (mapped to `TraceOverviewResponse`; rule families dropped) → Task 1. ✓
+- Worker Arthur read path `getTracesOverview()` + `collect-evals.ts` (+ test) → Task 3; optional breakdown/timeseries → 3b/3c. ✓
+- Worker route `GET /api/v1/evals` with config-check, `eval_count=0` degrade, error degrade → Task 4. ✓
+- `evalsFallback` → Task 2. ✓
+- `evals-data.tsx` server component → Task 6. ✓
+- `evals-skeleton.tsx` (single Quality block) → Task 5. ✓
+- `EvalsScreen` swap (signature, single Quality group, score + spansGraded chip, optional rows/trend/spark, drop `jitterSeries`) → Task 7. ✓
+- `page.tsx` server route → Task 8. ✓
+- Unavailable / no-graded / worker-down states → Tasks 2, 4, 7; verified in Task 8 Step 4. ✓
+- Out-of-scope (New eval button, overview tile, per-span drill-down, synthetic sparklines, `/validate_*` rule families) → not in any task. ✓
+
+**Confirmed dependency:** Arthur read API is ground-truthed (`POST /api/v1/traces/overview`, bearer auth, org-scoped). First increment ships fleet aggregate; per-metric breakdown (3b) and trend (3c) are optional follow-ons. Non-blocking open items (bucket_size, empty-task_ids semantics, whether continuous evals are configured live) noted at top and at their tasks. ✓
+
+**Placeholder scan:** No TBD/TODO; remaining unknowns are the three non-blocking open items, explicitly flagged. ✓
+
+**Type consistency:** `EvalsResponse` imported from `@shared/contracts` in Tasks 2, 4, 6, 7. `EvalsScreen` accepts `{ data: EvalsResponse }` (Task 7) — matches the call site in Task 6. `collectEvals` returns the `available: true` fields (`windowHours`/`score`/`spansGraded`/`traceCount`/`rows`) the route spreads in Task 4. `EvalsSkeleton` (Task 5) matches the import in Task 8. ✓
diff --git a/docs/superpowers/plans/2026-06-08-prompts-real-data.md b/docs/superpowers/plans/2026-06-08-prompts-real-data.md
new file mode 100644
index 0000000..267b1aa
--- /dev/null
+++ b/docs/superpowers/plans/2026-06-08-prompts-real-data.md
@@ -0,0 +1,690 @@
+# `/prompts` Real-Data Conversion Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Convert the `/prompts` dashboard page from mock data to live worker data, mirroring the `/runs` server-component fetch pattern. Read-only display of the three workflow prompts the worker actually resolves at runtime, **including real Arthur version history**.
+
+**Architecture:** New worker route `GET /api/v1/prompts` returns a typed `PromptsResponse` built from the same resolution logic the durable `loadPrompts()` step uses (Arthur `production` tags with in-code fallbacks), plus each prompt's real Arthur version-history metadata. A second route `GET /api/v1/prompts/[name]/versions/[version]` returns a single historical version's body on demand. Thin server route (`page.tsx`) wraps a server component (`prompts-data.tsx`) in `<Suspense>`; it fetches the list via `getJSON`, falls back to an empty `PromptsResponse`, and passes `data` to the client presenter `PromptsScreen`. The client fetches historical version bodies lazily through a same-origin Next route handler that proxies the worker (keeps the bearer token server-side). Shape mirrors `runs.get.ts` / `runs-data.tsx` / `RunsScreen`.
+
+**Tech stack:** h3 worker (`@apps/worker`), Next.js App Router dashboard (`@apps/dashboard`), shared `@shared/contracts`. Worker has vitest tests; dashboard has none — dashboard verification is `npx tsc --noEmit`, `next lint`, and a manual browser check.
+
+**Spec:** `docs/superpowers/specs/2026-06-08-prompts-real-data-design.md`
+
+**Scope decisions baked in (confirmed by user + Arthur API ground-truthing):**
+- Read-only display. No write/edit endpoints. Action buttons left inert.
+- **Real Arthur version history is in scope** (version-list metadata + on-demand bodies). Arthur's version list is metadata only, so per-version eval/halluc/p95/cost metrics and the A/B text diff have **no source** — that markup is **removed**, not stubbed with placeholders.
+- Tags are real (`AgenticPromptVersionResponse.tags`); the `production` badge and tag filter stay, backed by data.
+- Worker route reuses a shared extracted `resolvePrompts()` helper (option A) called by both `loadPrompts()` and the route. Confirmed OK to touch `prompts-step.ts`.
+- Body fetch: production body eager (already resolved); historical bodies lazy via the on-demand route.
+
+**Note on commits:** This repo's owner stages commits manually. Do NOT commit unless the user explicitly asks. The final task lists the command for when they do.
+
+---
+
+### Task 1: Add the shared `PromptVersion` + `PromptDef` entities + response contracts
+
+**Files:**
+- Modify: `apps/shared/contracts/domain.ts`
+- Modify: `apps/shared/contracts/api.ts`
+
+- [ ] **Step 1: Add `PromptVersion` + `PromptDef` to `domain.ts`**
+
+```ts
+/** One Arthur version of a named prompt (metadata; body fetched on demand). */
+export interface PromptVersion {
+  /** Arthur integer version number. */
+  version: number;
+  /** ISO timestamp the version was created. */
+  createdAt: string;
+  /** Real Arthur tags on this version, e.g. ["production"]. */
+  tags: string[];
+  modelProvider: string;
+  modelName: string;
+  numMessages: number;
+  numTools: number;
+  /** Body text. Present only for the production version (eager); other
+   *  versions are fetched on demand. */
+  body?: string;
+}
+
+/** A workflow phase prompt as resolved by the worker at runtime. */
+export interface PromptDef {
+  /** Stable Arthur/fallback key: "research-plan" | "implement" | "review". */
+  name: string;
+  /** Human label for the workflow phase, e.g. "Research & Plan". */
+  phase: string;
+  /** Resolved production prompt body (Arthur production tag, or in-code fallback). */
+  body: string;
+  /** Where the resolved `body` came from. */
+  source: "arthur" | "fallback";
+  /** Model the agent runs this prompt with (env-derived). */
+  model: string;
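+  /** Real Arthur version history, newest first. Empty when Arthur is disabled
+   *  or unreachable; can be non-empty with source "fallback" when Arthur has
+   *  versions but none is tagged "production". */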
+  versions: PromptVersion[];
+}
+```
+
+- [ ] **Step 2: Add `PromptsResponse` + `PromptVersionBodyResponse` to `api.ts`**
+
+Add `PromptDef` to the existing `import type { ... } from "./domain.js"` line (note: `PromptVersion` is only referenced transitively through `PromptDef`, so it need not be imported in `api.ts`), then append:
+
+```ts
+export interface PromptsResponse {
+  generatedAt: string;
+  /** `false` when the worker can't resolve prompts (degrades to empty list). */
+  available: boolean;
+  /** Whether Arthur is configured (key + endpoint + task id all set). When
+   *  false, every prompt's `source` is "fallback" and `versions` is empty. */
+  arthurEnabled: boolean;
+  rows: PromptDef[];
+  total: number;
+}
+
+/** On-demand body for a single historical Arthur version. */
+export interface PromptVersionBodyResponse {
+  generatedAt: string;
+  available: boolean;
+  body: string | null;
+}
+```
+
+- [ ] **Step 3: Typecheck shared**
+
+Run: `pnpm -F @apps/shared exec tsc --noEmit` (or repo-root `pnpm typecheck` if that's the established command — match how the runs plan was verified).
+Expected: PASS.
+
+---
+
+### Task 2: Add Arthur version-list + by-version read methods to `ArthurClient`
+
+**Files:**
+- Modify: `apps/worker/src/sandbox/arthur-client.ts`
+- Modify: `apps/worker/src/sandbox/arthur-client.test.ts` (add coverage for the new methods, matching the file's existing fetch-mock style)
+
+**Context:** `ArthurClient` already has `getPromptByTag` (fetches a tagged version's body). Add two read methods, ground-truthed against `arthur-ai/arthur-engine` `main`. Both reuse the existing `this.baseUrl` + bearer header convention.
+
+- [ ] **Step 1: Add types + `listPromptVersions`**
+
+```ts
+export interface ArthurPromptVersion {
+  version: number;
+  created_at: string;
+  deleted_at: string | null;
+  model_provider: string;
+  model_name: string;
+  tags: string[];
+  num_messages: number;
+  num_tools: number;
+}
+interface AgenticPromptVersionListResponse {
+  count: number;
+  versions: ArthurPromptVersion[];
+}
+
+/** List version metadata for a named prompt (newest first). First page only. */
+async listPromptVersions(taskId: string, name: string): Promise<ArthurPromptVersion[]> {
+  const path = `/api/v1/tasks/${encodeURIComponent(taskId)}/prompts/${encodeURIComponent(name)}/versions`;
+  const res = await fetch(`${this.baseUrl}${path}`, {
+    method: "GET",
+    headers: { Authorization: `Bearer ${this.apiKey}`, "ngrok-skip-browser-warning": "true" },
+  });
+  if (res.status === 404) return [];
+  if (!res.ok) {
+    const body = await res.text().catch(() => "");
+    throw new Error(`Arthur GET ${path} → ${res.status}: ${body.slice(0, 300)}`);
+  }
+  const data = (await res.json()) as AgenticPromptVersionListResponse;
+  return [...data.versions].sort((a, b) => b.version - a.version);
+}
+```
+
+> Assumption (open Q in spec): first page only — sufficient for the timeline. If deep history is required later, add pagination params here.
+
+- [ ] **Step 2: Add `getPromptVersionBody`**
+
+`getPromptByTag` already parses the by-version endpoint's `AgenticPrompt.messages[0].content` shape (passing a tag as `{prompt_version}`). Add a sibling method that generalizes this to any version specifier (integer / `latest` / ISO datetime / tag), leaving `getPromptByTag` itself unchanged:
+
+```ts
+/** Fetch the body of a specific version (int | "latest" | ISO datetime | tag). Null on 404. */
+async getPromptVersionBody(taskId: string, name: string, version: number | string): Promise<string | null> {
+  const path = `/api/v1/tasks/${encodeURIComponent(taskId)}/prompts/${encodeURIComponent(name)}/versions/${encodeURIComponent(String(version))}`;
+  const res = await fetch(`${this.baseUrl}${path}`, {
+    method: "GET",
+    headers: { Authorization: `Bearer ${this.apiKey}`, "ngrok-skip-browser-warning": "true" },
+  });
+  if (res.status === 404) return null;
+  if (!res.ok) {
+    const body = await res.text().catch(() => "");
+    throw new Error(`Arthur GET ${path} → ${res.status}: ${body.slice(0, 300)}`);
+  }
+  const prompt = (await res.json()) as AgenticPrompt;
+  return prompt.messages?.[0]?.content ?? null;
+}
+```
+
+> `getPromptByTag` can optionally be refactored to delegate to `getPromptVersionBody(taskId, name, tag)` to remove duplication — low risk, but keep it a separate optional cleanup so the existing `loadPrompts` path is untouched if you skip it.
+
+- [ ] **Step 3: Typecheck + test the worker**
+
+Run: `pnpm -F @apps/worker exec tsc --noEmit` then `pnpm -F @apps/worker exec vitest run src/sandbox/arthur-client.test.ts`
+Expected: PASS, including the new method tests.
+
+---
+
+### Task 3: Extract a reusable `resolvePrompts()` helper in the worker
+
+**Files:**
+- Create: `apps/worker/src/lib/prompts/resolve.ts` (or `apps/worker/src/lib/resolve-prompts.ts` — match existing `lib/` layout)
+- Modify: `apps/worker/src/workflows/prompts-step.ts`
+
+**Context:** `loadPrompts()` (`workflows/prompts-step.ts`) is a `"use step"` durable step returning `{ research, implement, review }`. The Arthur-vs-fallback resolution inside it is what we want to share. Extract the *pure* logic (no `"use step"`) so a plain h3 route can call it too, and have it also collect real version history. `loadPrompts()` then maps the helper's result back to its `{ research, implement, review }` shape so the workflow contract is unchanged.
+
+- [ ] **Step 1: Create the helper (resolves production body + version history per prompt)**
+
+```ts
+// apps/worker/src/lib/prompts/resolve.ts
+import type { PromptVersion } from "@shared/contracts";
+import { env } from "../../../env.js";
+import { logger } from "../logger.js";
+import { PROMPT_FALLBACKS, PROMPT_NAMES, type PromptName } from "../prompts.js";
+
+const PHASE_LABEL: Record<PromptName, string> = {
+  "research-plan": "Research & Plan",
+  "implement": "Implement",
+  "review": "Review",
+};
+
+export interface ResolvedPrompt {
+  name: PromptName;
+  phase: string;
+  body: string;
+  source: "arthur" | "fallback";
+  model: string;
+  versions: PromptVersion[];
+}
+
+export interface ResolvePromptsResult {
+  arthurEnabled: boolean;
+  prompts: ResolvedPrompt[];
+}
+
+export async function resolvePrompts(): Promise<ResolvePromptsResult> {
+  const model = env.AGENT_KIND === "codex" ? env.CODEX_MODEL : env.CLAUDE_MODEL;
+  const arthurEnabled =
+    !!env.GENAI_ENGINE_API_KEY &&
+    !!env.GENAI_ENGINE_TRACE_ENDPOINT &&
+    !!env.GENAI_ENGINE_PROMPT_TASK_ID;
+
+  const base = (
+    name: PromptName, body: string, source: "arthur" | "fallback", versions: PromptVersion[] = [],
+  ): ResolvedPrompt => ({ name, phase: PHASE_LABEL[name], body, source, model, versions });
+
+  if (!arthurEnabled) {
+    logger.info({ source: "fallback", reason: "arthur_prompts_disabled" }, "prompts_resolved");
+    return {
+      arthurEnabled,
+      prompts: PROMPT_NAMES.map((n) => base(n, PROMPT_FALLBACKS[n], "fallback")),
+    };
+  }
+
+  const { ArthurClient } = await import("../../sandbox/arthur-client.js");
+  const client = ArthurClient.fromTraceEndpoint(
+    env.GENAI_ENGINE_TRACE_ENDPOINT!,
+    env.GENAI_ENGINE_API_KEY!,
+  );
+  const taskId = env.GENAI_ENGINE_PROMPT_TASK_ID!;
+
+  async function one(name: PromptName): Promise<ResolvedPrompt> {
+    try {
+      const [body, rawVersions] = await Promise.all([
+        client.getPromptByTag(taskId, name, "production"),
+        client.listPromptVersions(taskId, name).catch(() => []),
+      ]);
+      const versions: PromptVersion[] = rawVersions.map((v) => ({
+        version: v.version,
+        createdAt: v.created_at,
+        tags: v.tags,
+        modelProvider: v.model_provider,
+        modelName: v.model_name,
+        numMessages: v.num_messages,
+        numTools: v.num_tools,
+      }));
+      // Attach the eager production body to its matching version entry.
+      const prodVersion = versions.find((v) => v.tags.includes("production"));
+      if (prodVersion && body !== null) prodVersion.body = body;
+
+      if (body === null) {
+        logger.info({ name, source: "fallback", reason: "arthur_prompt_missing" }, "prompts_resolved");
+        return base(name, PROMPT_FALLBACKS[name], "fallback", versions);
+      }
+      logger.info({ name, source: "arthur", versions: versions.length }, "prompts_resolved");
+      return base(name, body, "arthur", versions);
+    } catch (err) {
+      logger.warn({ name, source: "fallback", err: (err as Error).message }, "prompts_resolved");
+      return base(name, PROMPT_FALLBACKS[name], "fallback");
+    }
+  }
+
+  const prompts = await Promise.all(PROMPT_NAMES.map(one));
+  return { arthurEnabled, prompts };
+}
+```
+
+> Verify the import depth (`../../../env.js`, `../logger.js`, `../prompts.js`, `../../sandbox/arthur-client.js`) against the file's actual location before finalizing — adjust to wherever you place it. The originals in `prompts-step.ts` import `../../env.js`, `../lib/logger.js`, `../lib/prompts.js` from `workflows/`. `@shared/contracts` is the same alias the routes use.
+
+- [ ] **Step 2: Rewrite `loadPrompts()` to delegate to the helper**
+
+Keep the `"use step"` directive, `maxRetries = 0`, and the `{ research, implement, review }` return shape. Replace the body with a call to `resolvePrompts()` and a map by name:
+
+```ts
+export async function loadPrompts(): Promise<LoadedPrompts> {
+  "use step";
+  const { resolvePrompts } = await import("../lib/prompts/resolve.js");
+  const { prompts } = await resolvePrompts();
+  const byName = Object.fromEntries(prompts.map((p) => [p.name, p.body]));
+  return {
+    research: byName["research-plan"],
+    implement: byName["implement"],
+    review: byName["review"],
+  };
+}
+loadPrompts.maxRetries = 0;
+```
+
+- [ ] **Step 3: Run the existing prompts-step tests**
+
+Run: `pnpm -F @apps/worker exec vitest run src/workflows/prompts-step.test.ts`
+Expected: PASS. The test mocks `../sandbox/arthur-client.js` and `../../env.js`; if the helper's import paths differ, update the test's mock paths to match (the behavior — fallbacks when disabled, Arthur when enabled — is unchanged).
+
+---
+
+### Task 4: Add the worker routes (`GET /api/v1/prompts` + on-demand version body)
+
+**Files:**
+- Create: `apps/worker/src/routes/api/v1/prompts.get.ts`
+- Create: `apps/worker/src/routes/api/v1/prompts/[name]/versions/[version].get.ts`
+
+- [ ] **Step 1: Create the list route (mirror `runs.get.ts`)**
+
+```ts
+import { defineEventHandler, setResponseHeader } from "h3";
+import type { PromptsResponse } from "@shared/contracts";
+import { resolvePrompts } from "../../../lib/prompts/resolve.js";
+import { logger } from "../../../lib/logger.js";
+
+export default defineEventHandler(async (event): Promise<PromptsResponse> => {
+  setResponseHeader(
+    event,
+    "Cache-Control",
+    "private, max-age=15, stale-while-revalidate=60",
+  );
+
+  const generatedAt = new Date().toISOString();
+  try {
+    const { arthurEnabled, prompts } = await resolvePrompts();
+    return {
+      generatedAt,
+      available: true,
+      arthurEnabled,
+      rows: prompts,
+      total: prompts.length,
+    };
+  } catch (err) {
+    logger.warn({ err: (err as Error).message }, "prompts_resolve_failed");
+    return { generatedAt, available: false, arthurEnabled: false, rows: [], total: 0 };
+  }
+});
+```
+
+> `ResolvedPrompt` is structurally assignable to `PromptDef` (same fields incl. `versions`). If TS complains about the `PromptName` vs `string` `name` field, widen via `rows: prompts as PromptDef[]`. Confirm the auth gate that protects `/api/v1/*` (`lib/api-auth.ts`) is applied route-table-wide (not per-file) — no extra wiring needed.
+
+- [ ] **Step 2: Create the on-demand version-body route (mirror `runs/[runId].get.ts`)**
+
+```ts
+// apps/worker/src/routes/api/v1/prompts/[name]/versions/[version].get.ts
+import { defineEventHandler, setResponseHeader, getRouterParam } from "h3";
+import type { PromptVersionBodyResponse } from "@shared/contracts";
+import { env } from "../../../../../../env.js";
+import { PROMPT_NAMES, type PromptName } from "../../../../../lib/prompts.js";
+import { logger } from "../../../../../lib/logger.js";
+
+export default defineEventHandler(async (event): Promise<PromptVersionBodyResponse> => {
+  setResponseHeader(event, "Cache-Control", "private, max-age=15, stale-while-revalidate=60");
+  const generatedAt = new Date().toISOString();
+
+  const name = getRouterParam(event, "name") ?? "";
+  const version = getRouterParam(event, "version") ?? "";
+  const arthurEnabled =
+    !!env.GENAI_ENGINE_API_KEY && !!env.GENAI_ENGINE_TRACE_ENDPOINT && !!env.GENAI_ENGINE_PROMPT_TASK_ID;
+
+  if (!arthurEnabled || !PROMPT_NAMES.includes(name as PromptName) || !version) {
+    return { generatedAt, available: false, body: null };
+  }
+  try {
+    const { ArthurClient } = await import("../../../../../sandbox/arthur-client.js");
+    const client = ArthurClient.fromTraceEndpoint(env.GENAI_ENGINE_TRACE_ENDPOINT!, env.GENAI_ENGINE_API_KEY!);
+    const body = await client.getPromptVersionBody(env.GENAI_ENGINE_PROMPT_TASK_ID!, name, version);
+    return { generatedAt, available: body !== null, body };
+  } catch (err) {
+    logger.warn({ name, version, err: (err as Error).message }, "prompt_version_body_failed");
+    return { generatedAt, available: false, body: null };
+  }
+});
+```
+
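+> Verify the relative import depth for this nested route path against the repo's actual `tsconfig`/route layout — count segments from `src/routes/api/v1/prompts/[name]/versions/` back to `apps/worker/{env.ts,src/lib,src/sandbox}`. With the layout stated in this plan, `src/` is six `../` up and `apps/worker/` is seven (the `env.ts` lives at `apps/worker/env.ts`, not under `src/`), so the expected specifiers are `../../../../../../../env.js`, `../../../../../../lib/{prompts,logger}.js`, and `../../../../../../sandbox/arthur-client.js` — each one `../` deeper than written in the snippet above; correct them when creating the file. Confirm h3's file-based dynamic-segment convention uses `[name]`/`[version]` here the same way `runs/[runId].get.ts` does.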
+
+- [ ] **Step 3: Typecheck the worker**
+
+Run: `pnpm -F @apps/worker exec tsc --noEmit`
+Expected: PASS.
+
+- [ ] **Step 4: Smoke the endpoints locally (optional but recommended)**
+
+Start the worker, then:
+`curl -s -H "Authorization: Bearer $WORKER_API_TOKEN" http://localhost:<port>/api/v1/prompts | jq`
+Expected: `{ available: true, arthurEnabled: <env>, total: 3, rows: [3 prompts; each has body, source, model, versions[]] }`. With Arthur on, `versions` is non-empty and one entry carries `body`.
+`curl -s -H "Authorization: Bearer $WORKER_API_TOKEN" http://localhost:<port>/api/v1/prompts/research-plan/versions/1 | jq`
+Expected (Arthur on): `{ available: true, body: "..." }`; (Arthur off / missing): `{ available: false, body: null }`.
+
+---
+
+### Task 5: Add the dashboard fallback
+
+**Files:**
+- Modify: `apps/dashboard/lib/api/fallbacks.ts`
+
+- [ ] **Step 1: Add `promptsFallback`**
+
+Add `PromptsResponse` to the existing `import type { ... } from "@shared/contracts"`, then append:
+
+```ts
+export function promptsFallback(now: string): PromptsResponse {
+  return { generatedAt: now, available: false, arthurEnabled: false, rows: [], total: 0 };
+}
+```
+
+- [ ] **Step 2: Typecheck dashboard**
+
+Run: `cd apps/dashboard && npx tsc --noEmit`
+Expected: PASS (no new errors from this file).
+
+---
+
+### Task 6: Add the loading skeleton
+
+**Files:**
+- Create: `apps/dashboard/app/prompts-skeleton.tsx`
+
+- [ ] **Step 1: Create the skeleton (mirror `overview-skeleton.tsx`)**
+
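+Header + 4-up KPI row + two-column (rail + detail) block matching the current `/prompts` layout. Note that Task 8 reduces the real page to two KPI tiles (`lg:grid-cols-2`); to avoid a layout shift when the data resolves, change the KPI row below to `lg:grid-cols-2` with `length: 2` once Task 8 lands: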
+
+```tsx
+// apps/dashboard/app/prompts-skeleton.tsx
+function Block({ className = "" }: { className?: string }) {
+  return <div className={`bg-neutral-200/60 rounded-sm animate-pulse ${className}`} />;
+}
+
+export function PromptsSkeleton() {
+  return (
+    <div className="px-4 lg:px-6 pt-5 pb-8 flex flex-col gap-4">
+      <div className="flex items-end justify-between">
+        <Block className="h-10 w-56" />
+        <Block className="h-9 w-64" />
+      </div>
+      <div className="grid grid-cols-1 lg:grid-cols-4 gap-3">
+        {Array.from({ length: 4 }, (_, i) => (
+          <Block key={i} className="h-[96px]" />
+        ))}
+      </div>
+      <div className="flex flex-col lg:grid lg:grid-cols-[340px_1fr] gap-3 lg:min-h-[720px]">
+        <Block className="lg:h-full h-[300px]" />
+        <Block className="lg:h-full h-[400px]" />
+      </div>
+    </div>
+  );
+}
+```
+
+- [ ] **Step 2: Typecheck**
+
+Run: `cd apps/dashboard && npx tsc --noEmit`
+Expected: PASS.
+
+---
+
+### Task 7: Add the server data component + the client-side version-body proxy route
+
+**Files:**
+- Create: `apps/dashboard/app/prompts-data.tsx`
+- Create: `apps/dashboard/app/api/prompts/[name]/versions/[version]/route.ts`
+
+- [ ] **Step 1: Create the server component (mirror `runs-data.tsx`)**
+
+```tsx
+// apps/dashboard/app/prompts-data.tsx
+import { getJSON } from "@/lib/api/server";
+import { PromptsScreen } from "@/components/cockpit/screens/prompts";
+import type { PromptsResponse } from "@shared/contracts";
+import { promptsFallback } from "@/lib/api/fallbacks";
+
+export async function PromptsData() {
+  const now = new Date().toISOString();
+  const data = await getJSON<PromptsResponse>("/api/v1/prompts").catch(() =>
+    promptsFallback(now),
+  );
+  return <PromptsScreen data={data} />;
+}
+```
+
+> This won't typecheck until Task 8 changes `PromptsScreen`'s signature. Expected; the full gate is in Task 9.
+
+- [ ] **Step 2: Create the same-origin proxy route for lazy version bodies**
+
+`PromptsScreen` is a client component; the bearer-gated worker API can't be hit from the browser (the token is server-only). Add a Next route handler that proxies the worker server-side:
+
+```ts
+// apps/dashboard/app/api/prompts/[name]/versions/[version]/route.ts
+import { NextResponse } from "next/server";
+import { getJSON } from "@/lib/api/server";
+import type { PromptVersionBodyResponse } from "@shared/contracts";
+
+export async function GET(
+  _req: Request,
+  { params }: { params: Promise<{ name: string; version: string }> },
+) {
+  const { name, version } = await params;
+  const now = new Date().toISOString();
+  const data = await getJSON<PromptVersionBodyResponse>(
+    `/api/v1/prompts/${encodeURIComponent(name)}/versions/${encodeURIComponent(version)}`,
+  ).catch(() => ({ generatedAt: now, available: false, body: null }));
+  return NextResponse.json(data);
+}
+```
+
+> `params` is a Promise in Next 15 route handlers — confirm against the repo's Next version and existing route-handler conventions (check whether other `app/api/**/route.ts` files already exist to mirror their `params` typing). If none exist, this is the first; that's fine.
+
+- [ ] **Step 3: Typecheck dashboard**
+
+Run: `cd apps/dashboard && npx tsc --noEmit`
+Expected: PASS for the route handler (the `prompts-data.tsx` line still fails until Task 8; full gate in Task 9).
+
+---
+
+### Task 8: Convert `PromptsScreen` to consume real data (with real version history)
+
+**Files:**
+- Modify: `apps/dashboard/components/cockpit/screens/prompts.tsx`
+
+Keep the read-only registry + version-timeline shape, now backed by real data. **Remove** the per-version metrics grid and the two-column A/B diff (no Arthur source). Reuse existing `CkCard`, `CkKPI`, `Stat`, the chip styling, and the single-column mono body markup lifted from the old `PromptDiff`.
+
+- [ ] **Step 1: Replace imports and remove mock dependency**
+
+```tsx
+"use client";
+
+import React, { useState, useEffect } from "react";
+import { CkCard, CkKPI } from "@/components/ui";
+import type { PromptsResponse, PromptDef, PromptVersion } from "@shared/contracts";
+```
+
+Remove: `AIWF_DATA`, and the mock `Prompt`/`PromptVersion`/`PromptTag` imports from `@/lib/types` (the `PromptVersion` now comes from `@shared/contracts`). Remove `const D = AIWF_DATA`. Keep `useEffect` (used to reset/lazy-load the selected version body when the active prompt changes). `CkChip` stays if still used.
+
+- [ ] **Step 2: Repurpose `PromptStatusChip` for real tags + source**
+
+`PromptStatusChip` keys off a status string. Real statuses now are: the production tag (`production`) on a version, and the resolution `source` (`arthur`/`fallback`). Add `arthur`/`fallback` keys to `PROMPT_STATUS_COLOR` and keep the existing `production`/`staging`/`draft`/`archived`/`locked` keys (real Arthur `tags` may include any string — unknown tags fall through to the default style already coded).
+
+- [ ] **Step 3: Rewrite `PromptList` to consume `PromptDef[]`**
+
+- Signature: `function PromptList({ rows, active, onSelect, arthurEnabled }: { rows: PromptDef[]; active: string; onSelect: (name: string) => void; arthurEnabled: boolean })`.
+- Tag filter pills: derive the option set from the tags that actually occur across `rows[].versions[].tags` (e.g. `["all", ...uniqueTags]`); filter rows by whether any of their versions carries the selected tag. (If no versions/tags exist — Arthur off — render just `all` or hide the pill row.)
+- Each row keyed by `p.name`; show `p.name`, `p.phase`, `p.model`, the production-tag chip (from the version tagged `production`), and a `source` chip. Remove the eval score/delta figure.
+- `eyebrow`: `` `${arthurEnabled ? "Arthur" : "In-code"} · ${rows.length} prompts` `` — thread `arthurEnabled` through as a prop.
+
+- [ ] **Step 4: Rewrite `PromptDetail` — body panel + real version timeline**
+
+- Signature: `function PromptDetail({ prompt }: { prompt: PromptDef | undefined })`.
+- Keep the "Select a prompt to inspect." empty state when `prompt` is undefined.
+- Header eyebrow: `{prompt.source === "arthur" ? "Arthur" : "In-code"} · {prompt.phase}`. Title: `prompt.name`. Action chips: the `source` chip. Leave the `+ New version` / `Deploy` buttons inert (read-only).
+- Replace the four mock `Stat`s with real ones: `Phase` = `prompt.phase`, `Source` = `prompt.source`, `Model` = `prompt.model`, `Versions` = `prompt.versions.length`.
+- **Version timeline (real):** map `prompt.versions` (newest first). Each card shows: `v{version}`, `createdAt` (format as-is or relative), tag chips (`v.tags`), `modelName`, and `numMessages`/`numTools` counts. **Delete** the mock per-card eval/halluc/p95/cost rows and the `traffic` bar. Clicking a version selects it for the body panel.
+- **Body panel (single column, read-only):** lift the inner mono `<div>` markup from the old `PromptDiff` (drop the two-column diff). Default shows `prompt.body` (the production version). When the user selects a non-production version, fetch its body once via the proxy route and render it:
+  ```tsx
+  const [selectedVersion, setSelectedVersion] = useState<number | null>(null);
+  const [bodyCache, setBodyCache] = useState<Record<number, string>>({});
+  const [loading, setLoading] = useState(false);
+  // reset selection + cached bodies when the prompt changes (version numbers repeat across prompts)
+  useEffect(() => { setSelectedVersion(null); setBodyCache({}); }, [prompt?.name]);
+  async function showVersion(v: PromptVersion) {
+    setSelectedVersion(v.version);
+    if (v.body) { setBodyCache((c) => ({ ...c, [v.version]: v.body! })); return; }
+    if (bodyCache[v.version] !== undefined) return;
+    setLoading(true);
+    try {
+      const res = await fetch(`/api/prompts/${prompt!.name}/versions/${v.version}`);
+      const json = (await res.json()) as { body: string | null };
+      setBodyCache((c) => ({ ...c, [v.version]: json.body ?? "(version body unavailable)" }));
+    } finally { setLoading(false); }
+  }
+  const shownBody = selectedVersion != null ? (bodyCache[selectedVersion] ?? (loading ? "Loading…" : "")) : prompt!.body;
+  ```
+- Delete the now-unused `PromptDiff` and `PromptMetrics` functions.
+
+- [ ] **Step 5: Rewrite the top-level `PromptsScreen`**
+
+```tsx
+export function PromptsScreen({ data }: { data: PromptsResponse }) {
+  const [active, setActive] = useState(data.rows[0]?.name ?? "");
+  const selected = data.rows.find((p) => p.name === active);
+  const inProd = data.rows.filter((p) => p.versions.some((v) => v.tags.includes("production"))).length;
+  return (
+    <div className="px-4 lg:px-6 pt-5 pb-8 flex flex-col gap-4">
+      {/* header — keep the title; leave the inert Import/New buttons */}
+      <div className="grid grid-cols-1 lg:grid-cols-2 gap-3">
+        <CkKPI label="Prompts" value={data.total.toString()} sub="workflow phases" />
+        <CkKPI label="In production" value={inProd.toString()} sub={data.arthurEnabled ? "tagged in Arthur" : "in-code defaults"} />
+        {/* A/B + avg-Δ tiles removed — no real source */}
+      </div>
+      <div className="flex flex-col lg:grid lg:grid-cols-[340px_1fr] gap-3 lg:min-h-[720px]">
+        <PromptList rows={data.rows} active={active} onSelect={setActive} arthurEnabled={data.arthurEnabled} />
+        <PromptDetail prompt={selected} />
+      </div>
+    </div>
+  );
+}
+```
+
+> Reduced from 4 KPI tiles to 2 because the A/B-test and avg-eval-Δ tiles have no real source (removed, not stubbed). Adjust the grid (`lg:grid-cols-2`) accordingly.
+
+- [ ] **Step 6: Verify no mock references remain**
+
+Run: `grep -nE "AIWF_DATA|\\bD\\.|PROMPT_BODIES|PromptTag|from \"@/lib/types\"" apps/dashboard/components/cockpit/screens/prompts.tsx`
+Expected: no matches (note `PromptVersion` now legitimately appears via `@shared/contracts`, so it's excluded from this grep).
+
+---
+
+### Task 9: Rewrite the route to the server pattern + verify
+
+**Files:**
+- Modify: `apps/dashboard/app/(cockpit)/prompts/page.tsx`
+
+- [ ] **Step 1: Replace the page with the Suspense + server-component pattern**
+
+```tsx
+// apps/dashboard/app/(cockpit)/prompts/page.tsx — Prompts ("/prompts")
+import { Suspense } from "react";
+
+import { PromptsData } from "@/app/prompts-data";
+import { PromptsSkeleton } from "@/app/prompts-skeleton";
+
+export default function PromptsPage() {
+  return (
+    <Suspense fallback={<PromptsSkeleton />}>
+      <PromptsData />
+    </Suspense>
+  );
+}
+```
+
+- [ ] **Step 2: Typecheck the whole dashboard**
+
+Run: `cd apps/dashboard && npx tsc --noEmit`
+Expected: PASS, no errors.
+
+- [ ] **Step 3: Lint the changed files**
+
+Run: `cd apps/dashboard && npx next lint --file app/prompts-data.tsx --file app/prompts-skeleton.tsx --file "app/api/prompts/[name]/versions/[version]/route.ts" --file "app/(cockpit)/prompts/page.tsx" --file components/cockpit/screens/prompts.tsx`
+Expected: no errors.
+
+- [ ] **Step 4: Visual check**
+
+Run: `cd apps/dashboard && pnpm dev`, open `/prompts`.
+Expected:
+- Three prompts listed (`research-plan`, `implement`, `review`) by phase + model.
+- Selecting one shows its production body. With Arthur enabled, the version timeline lists real Arthur versions (version number, created-at, tags, model); clicking a historical version fetches and shows that version's body via `/api/prompts/{name}/versions/{version}`.
+- With Arthur disabled, `source` chip reads `fallback`, the timeline is empty, and bodies match `apps/worker/src/lib/prompts.ts`.
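+- With the worker unreachable (`WORKER_BASE_URL` unset), the page shows the empty state (`0 prompts`), no crash. A failed version-body fetch shows an inline "version body unavailable" note, no crash. Note: the Task 8 `showVersion` snippet only writes that note when the proxy responds with `body: null`; if `fetch` or `res.json()` throws, its `try/finally` has no `catch`, so add one (or make the proxy route always answer with JSON) for this expectation to hold.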
+
+- [ ] **Step 5: Commit (ONLY if the user asks)**
+
+```bash
+git add apps/shared/contracts/api.ts apps/shared/contracts/domain.ts \
+  apps/worker/src/sandbox/arthur-client.ts apps/worker/src/sandbox/arthur-client.test.ts \
+  apps/worker/src/lib/prompts/resolve.ts apps/worker/src/workflows/prompts-step.ts \
+  apps/worker/src/routes/api/v1/prompts.get.ts \
+  "apps/worker/src/routes/api/v1/prompts/[name]/versions/[version].get.ts" \
+  apps/dashboard/lib/api/fallbacks.ts apps/dashboard/app/prompts-data.tsx \
+  "apps/dashboard/app/api/prompts/[name]/versions/[version]/route.ts" \
+  apps/dashboard/app/prompts-skeleton.tsx "apps/dashboard/app/(cockpit)/prompts/page.tsx" \
+  apps/dashboard/components/cockpit/screens/prompts.tsx
+git commit -m "feat: wire /prompts to real worker data with Arthur version history"
+```
+
+---
+
+## Self-Review
+
+**Spec coverage:**
+- `PromptVersion` + `PromptDef` + `PromptsResponse` + `PromptVersionBodyResponse` contracts → Task 1. ✓
+- Arthur read methods (`listPromptVersions`, `getPromptVersionBody`) → Task 2. ✓
+- Real data source (Arthur production tags + in-code fallbacks) + version history via shared `resolvePrompts()` → Task 3. ✓
+- Worker list route `GET /api/v1/prompts` + on-demand body route `GET /api/v1/prompts/[name]/versions/[version]` → Task 4. ✓
+- Dashboard `promptsFallback` → Task 5. ✓
+- `prompts-skeleton.tsx` → Task 6. ✓
+- `prompts-data.tsx` server component + client-side version-body proxy route → Task 7. ✓
+- `PromptsScreen` swap to read-only real-data view with real version timeline; per-version metrics + A/B diff markup removed → Task 8. ✓
+- Page route → server pattern → Task 9. ✓
+- Worker-down empty state → `promptsFallback` (Task 5) + route catch (Task 4), verified in Task 9 Step 4. ✓
+- Embellishment removal (per-version eval/halluc/p95/cost, traffic split, eval Δ, A/B test KPI) — markup deleted, not stubbed (Task 8). ✓
+
+**Decisions resolved (no longer open):** read-only confirmed; real version history in scope (metadata + on-demand bodies); tags are real; `resolvePrompts()` extraction confirmed OK; production-body eager / historical lazy.
+
+**Still-open items (flagged in spec, do not block execution):**
+1. Lazy vs eager historical body fetch — resolved above as eager-production / lazy-history, which is what the plan implements; listed here only because the spec still flags it. Switch if the user prefers otherwise.
+2. Version-list pagination depth — plan fetches first page only; add pagination if deep history is required.
+
+**Type consistency:** `PromptsResponse`/`PromptDef`/`PromptVersion`/`PromptVersionBodyResponse` imported from `@shared/contracts` across Tasks 3, 4, 5, 7, 8. `PromptsScreen` accepts `{ data: PromptsResponse }` (Task 8) — matches the call site (Task 7). `ResolvedPrompt` (worker) is structurally assignable to `PromptDef` (incl. `versions: PromptVersion[]`); widen the `name` field if TS narrows on the literal union. `ArthurPromptVersion` (snake_case Arthur shape) is mapped to the camelCase `PromptVersion` inside `resolvePrompts()`. `PromptsSkeleton` (Task 6) matches the import in Task 9. ✓
+
+**Placeholder scan:** No TBD/TODO. Verify, when executing: worker route import depths (esp. the nested `prompts/[name]/versions/[version].get.ts` path), the Next route-handler `params` Promise convention against the repo's Next version, and the worker dev-run command — all flagged inline. ✓
+</content>
diff --git a/docs/superpowers/specs/2026-06-08-cost-real-data-design.md b/docs/superpowers/specs/2026-06-08-cost-real-data-design.md
new file mode 100644
index 0000000..657bfa2
--- /dev/null
+++ b/docs/superpowers/specs/2026-06-08-cost-real-data-design.md
@@ -0,0 +1,166 @@
+# `/cost` Real-Data Conversion — Design
+
+**Date:** 2026-06-08
+**Status:** Draft — has open questions (see end)
+**Scope:** Convert the `/cost` (Cost & Usage) dashboard page from mock data to live worker data, mirroring the overview/runs server-component fetch pattern. Cost + token usage come from **Arthur** (the GenAI Engine), which already aggregates token counts and USD cost from the OpenInference traces the workflow ships in.
+
+## Problem
+
+The `/cost` page (`apps/dashboard/app/(cockpit)/cost/page.tsx`) renders a complete UI — spend / token KPIs, a daily-spend area chart, a per-model donut + breakdown table, and a per-workflow/task breakdown table — entirely from mock data (`AIWF_DATA.COST_BY_MODEL`, `AIWF_DATA.HOURS24`, `AIWF_DATA.WORKFLOWS`). The overview and `/runs` pages already fetch real data from the worker; `/cost` should do the same.
+
+The overview's `cost24h` (`KpisResponse`), `Run.cost`, `Run.tokens`, and `WorkflowRow.costToday` are all hardcoded `null` (`collect-kpis.ts:69`, `collect-runs.ts:171-172`, `collect-workflows.ts:81`, `derive-kpis.ts:49`) because the Vercel Workflow run store carries no usage. But the workflow already ships OpenInference traces to Arthur (per-ticket task, `apps/worker/src/sandbox/arthur-tracer.ts` + `arthur-client.ts`), and **Arthur aggregates token + cost data first-class** on those traces. So the real source already exists and is queryable — no new capture or persistence is needed.
+
+## Current state
+
+### What the screen needs (exact data shape)
+
+Read from `apps/dashboard/components/cockpit/screens/cost.tsx`:
+
+| UI element | Mock source | Real source after this change |
+| --- | --- | --- |
+| KPI: spend | `sum(COST_BY_MODEL.cost)` | `totals.totalTokenCost` (USD) |
+| KPI: Tokens | `sum(COST_BY_MODEL.tokens)` | `totals.totalTokens` |
+| KPI: Cost/run avg | hardcoded `$0.41` | `totals.costPerRun` |
+| KPI: Projection EoM | hardcoded `$1,184` | **removed** (no source) |
+| Area chart "Daily spend" | `HOURS24.map(h => h.cost*24)` | `daily[].cost` + `daily[].date` (Arthur timeseries) |
+| Donut "Model mix" | `COST_BY_MODEL[].share` + center | `byModel[].cost` → shares computed in-screen; center = `totalTokenCost` |
+| Table "Per-model breakdown" | `COST_BY_MODEL[]` | `byModel[] { model, cost, tokens }` (span-level aggregation) |
+| Table "Per-workflow breakdown" | `WORKFLOWS[]` sorted by `costToday` | `byWorkflow[]` (= per-Arthur-task; see mapping note) |
+| Header tabs "By model / workflow / actor" | inert | **removed** |
+| "Export CSV" button | inert | **removed** |
+| Sparklines (`Spark`, random `sparkSeries`) | mock RNG | **removed** |
+| Budget `$1,200`, MoM/WoW deltas | hardcoded | **removed** |
+
+Mock shapes (replaced): `CostByModel { model, vendor, cost, tokens, share }` (`apps/dashboard/lib/types.ts:36`); `HourPoint` (`apps/shared/contracts/domain.ts:129`).
+
+### How real data flows (the template — overview/runs)
+
+1. Worker route `apps/worker/src/routes/api/v1/...` returns a typed `@shared/contracts` response; wraps the collector in try/catch and degrades to an empty payload on failure (see `runs.get.ts`, `workflows.get.ts`). Sends `Cache-Control: private, max-age=15, stale-while-revalidate=60`.
+2. Response interface declared in `apps/shared/contracts/api.ts`.
+3. Dashboard fetches server-side via `getJSON<T>(path)` (`apps/dashboard/lib/api/server.ts`) — bearer `WORKER_API_TOKEN`, `cache: "no-store"`.
+4. A `*-data.tsx` server component calls `getJSON`, `.catch()`s to a fallback in `apps/dashboard/lib/api/fallbacks.ts`, passes a `data` prop to the client screen.
+5. The page is a thin `<Suspense fallback={<Skeleton/>}><Data/></Suspense>` route.
+
+This is a **single-PR conversion** — no persistence layer, no two-step rollout.
+
+## The real data source — Arthur GenAI Engine
+
+The worker already holds an Arthur client. `ArthurClient.fromTraceEndpoint(env.GENAI_ENGINE_TRACE_ENDPOINT, env.GENAI_ENGINE_API_KEY)` (`arthur-client.ts:37`) builds a client whose `request<T>` helper sends `Authorization: Bearer <GENAI_ENGINE_API_KEY>`. Both env vars are optional (`apps/worker/env.ts:83-84`) → when unset, the route falls back to the empty state. Reads require the `INFERENCE_READ` permission on the key. Arthur is org-scoped (the single deployment sees its own org) — consistent with this project's single-tenant deployment model.
+
+### Token + cost are first-class on Arthur traces
+
+Traces/spans extend `TokenCountCostSchema`:
+`{ prompt_token_count, completion_token_count, total_token_count, prompt_token_cost, completion_token_cost, total_token_cost }` (cost in USD floats, `null` if unavailable). Responses also carry `display_currency` (defaults USD).
+
+### Endpoints used
+
+1. **Totals + per-task breakdown (one call):** `POST /api/v1/traces/overview`
+   body `{ task_ids, start_time, end_time }` →
+   `{ count, overviews: [{ task_id, trace_count, trace_token_count, trace_token_cost, eval_count, continuous_eval_success_rate, last_active }] }`.
+   Multi-task in one call gives fleet totals (sum across `overviews`) **and** the per-task breakdown over a window.
+
+2. **Daily-spend chart:** `POST /api/v1/traces/overview/timeseries`
+   body `{ task_id, start_time, end_time, bucket_size }` (**single task per call**) →
+   points `{ timestamp, trace_count, trace_token_count, trace_token_cost, continuous_eval_success_rate }`.
+   For a fleet daily-spend chart, fan out one call per task and **merge points by bucket timestamp**, summing `trace_token_cost`/`trace_token_count`. (`bucket_size` allowed values are unconfirmed — see open questions.)
+
+3. **By-model breakdown (the one manual aggregation):** rows returned by `GET /api/v1/traces/spans` (and/or `GET /api/v1/traces`) extend `TokenCountCostSchema`, and spans carry `model_name`. The overview endpoint is per-**task**, not per-model, so a by-model table requires fetching span rows for the window and **summing token/cost client-side (i.e. in the worker, acting as Arthur's API client — not in the browser) grouped by `model_name`**. This is the only client-side aggregation; flagged below.
+
+### How usage→cost is computed
+
+No client-side pricing. Arthur returns USD cost directly (`*_token_cost`), already derived from the traces. The worker just sums Arthur's pre-aggregated numbers (for totals/timeseries) or groups span rows by `model_name` (for the by-model table). The pricing table (`apps/worker/src/sandbox/agents/pricing.ts`) and the Slack `usageReport` path are untouched and not on this read path.
+
+### Reconciliation with the overview KPI (out of scope, noted)
+
+The overview's `cost24h` / `WorkflowRow.costToday` / `Run.cost` are hardcoded `null` today. The same Arthur source could backfill those so cost is computed in exactly one place going forward (e.g. `collectKpis`/`collectWorkflows` querying `/traces/overview` for the matching task/window). Out of scope for this PR, but called out so the `null` placeholders aren't reinvented elsewhere.
+
+## Proposed contract (`apps/shared/contracts/api.ts`)
+
+```ts
+export interface CostByModelEntry {
+  model: string;   // Arthur span model_name
+  cost: number;    // USD, summed total_token_cost over the window
+  tokens: number;  // summed total_token_count over the window
+}
+
+export interface CostByWorkflowEntry {
+  /** Arthur task_id (per ticket-run, e.g. "AWT-42" / "AWT-42.1"). */
+  taskId: string;
+  /** Arthur task name (= the ticket-run identifier). */
+  name: string;
+  runs: number;       // trace_count for the task
+  tokens: number;     // trace_token_count
+  cost: number;       // trace_token_cost (USD)
+  costPerRun: number; // cost / max(1, runs)
+}
+
+export interface CostResponse {
+  generatedAt: string;
+  /** false when Arthur is unconfigured/unreachable or returns nothing. The
+   *  screen renders its empty/N-A state. */
+  available: boolean;
+  /** Window the figures cover (the request's start_time/end_time). */
+  window: { start: string; end: string }; // ISO
+  totals: {
+    totalTokenCost: number; // USD, Σ overviews[].trace_token_cost
+    totalTokens: number;    // Σ overviews[].trace_token_count
+    traceCount: number;     // Σ overviews[].trace_count
+    costPerRun: number;     // totalTokenCost / max(1, traceCount)
+  };
+  byModel: CostByModelEntry[];
+  /** Per-task (= per ticket-run) breakdown from /traces/overview. */
+  byWorkflow: CostByWorkflowEntry[];
+  /** Per-day spend, oldest→newest, merged across tasks from the timeseries. */
+  daily: { date: string; cost: number; tokens: number }[]; // date = bucket ISO timestamp
+}
+```
+
+Notes:
+- `byWorkflow` is named to match the screen's "Per-workflow breakdown" section, but its entries are **per Arthur task** (per ticket-run), since that's the natural grain of `/traces/overview`. See the mapping open question.
+- Stripped from the contract/screen (no real source, per user decision): budget, MoM/WoW deltas, EoM projection, "By actor" tab, decorative sparklines, "Export CSV".
+
+## Fallback / unavailable state
+
+Add `costFallback(now)` to `apps/dashboard/lib/api/fallbacks.ts`:
+
+```ts
+export function costFallback(now: string): CostResponse {
+  return {
+    generatedAt: now,
+    available: false,
+    window: { start: now, end: now },
+    totals: { totalTokenCost: 0, totalTokens: 0, traceCount: 0, costPerRun: 0 },
+    byModel: [],
+    byWorkflow: [],
+    daily: [],
+  };
+}
+```
+
+The worker route degrades to the same empty payload (`available:false`) when `GENAI_ENGINE_API_KEY`/`GENAI_ENGINE_TRACE_ENDPOINT` are unset or any Arthur call throws — matching `runs.get.ts`/`workflows.get.ts`. The screen renders `$0.00` / `0` / empty tables — never crashes.
+
+## Behavior
+
+- **Happy path:** `/cost` shows real spend, token totals, per-model and per-task breakdowns, and a per-day spend chart, all from Arthur over the chosen window.
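+- **Arthur unconfigured / unreachable / 401:** the worker degrades to `available:false` (and if the worker itself is unreachable, the dashboard's `getJSON(...).catch()` substitutes `costFallback`, which is also `available:false`) → empty/zero state. No crash.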
+
+## Out of scope
+
+- Wiring tabs / "Export CSV" (removed).
+- Backfilling the overview's `cost24h`/`costToday`/`Run.cost` from Arthur (mentioned above).
+- A task→workflow mapping for a true by-workflow rollup (breakdown stays per-task).
+
+## Open questions / assumptions
+
+1. **`bucket_size` values.** `/traces/overview/timeseries` takes a `bucket_size`, but the allowed values (e.g. `"day"` vs a duration vs an enum) are unconfirmed. **Assumption:** a day-granularity bucket exists for the daily chart; confirm the exact value.
+2. **Empty `task_ids`.** Does `/traces/overview` with an empty/omitted `task_ids` return org-wide totals, or is `task_ids` required? If required, the worker must first list the org's tasks (the client already lists tasks via `/api/v2/tasks/search`) and pass their ids. **Assumption:** we enumerate tasks and pass ids explicitly.
+3. **By-model client aggregation.** Per-model totals require fetching span rows and summing by `model_name` client-side (Arthur has no per-model overview). Acceptable, given span volume per window? Or drop the by-model table for v1?
+4. **Task→workflow mapping.** Arthur tasks are per ticket-run (`AWT-42`, `AWT-42.1`). The "by workflow" section therefore shows **per-task** rows unless we add a task→workflow mapping. Stated, not blocking; per-task is the natural breakdown.
+5. **Window.** Which window do the KPIs cover — calendar MTD, rolling 30d, or 24h? Drives `start_time`/`end_time`. **Assumption:** calendar month-to-date (matches the original "MTD" framing); confirm.
+
+## Verification
+
+1. Worker + dashboard typecheck pass.
+2. `GET /api/v1/cost` returns non-empty `totals`/`byWorkflow` for a window with real Arthur traces.
+3. `/cost` renders those figures (spend, tokens, breakdowns, daily chart).
+4. With Arthur unconfigured (env unset) or unreachable, `/cost` shows the zero/empty state — no crash.
diff --git a/docs/superpowers/specs/2026-06-08-evals-real-data-design.md b/docs/superpowers/specs/2026-06-08-evals-real-data-design.md
new file mode 100644
index 0000000..95d683a
--- /dev/null
+++ b/docs/superpowers/specs/2026-06-08-evals-real-data-design.md
@@ -0,0 +1,227 @@
+# `/evals` Real-Data Conversion — Design
+
+**Date:** 2026-06-08
+**Status:** Draft (has open questions — see end)
+**Scope:** Convert the `/evals` page from mock data to live data, mirroring the overview/runs server-component fetch pattern. Unlike `/runs`, the worker does **not** yet expose an evals list endpoint and the underlying eval results are **not yet read from anywhere** — so this design also covers the prerequisite of producing/reading eval results, with the data-source decision flagged explicitly.
+
+## Problem
+
+The `/evals` dashboard page (`apps/dashboard/app/(cockpit)/evals/page.tsx`) is a 4-line stub that renders `EvalsScreen` with no data fetch. `EvalsScreen` (`apps/dashboard/components/cockpit/screens/evals.tsx`) is a `"use client"` component that reads the hardcoded `AIWF_DATA.EVALS` mock slice and draws synthetic sparklines via `jitterSeries`. Nothing on this page is real.
+
+We want `/evals` to fetch real data from the worker through the same three-layer pattern the overview and runs pages use:
+1. thin server route (`page.tsx`) → `<Suspense>` + server data component;
+2. `evals-data.tsx` server component calling `getJSON<T>` with a `.catch()` fallback;
+3. client presenter `EvalsScreen` receiving a typed `data` prop.
+
+## Current state
+
+### Mock (what the screen renders today)
+
+`apps/dashboard/lib/data/mock.ts` exports `EVALS: EvalMetric[]` (the "Arthur evals" slice, lines ~82–93). The shape is `EvalMetric` from `apps/dashboard/lib/types.ts`:
+
+```ts
+export interface EvalMetric {
+  metric: string;                          // "Hallucination", "PII Detection", …
+  value: number;                           // numeric reading
+  target: string;                          // human string, e.g. "< 0.05", "= 0", "flags"
+  status: "pass" | "warn" | "fail";
+  trend: number;                           // signed delta vs prior window
+  axis: "safety" | "quality" | "ops";      // grouping bucket
+  family: string;                          // "output" | "agent" | "input" | "rag" | "runtime"
+  unit?: string;                           // optional, e.g. "flags/24h"
+}
+```
+
+`EvalsScreen` renders, per `axis` group ("safety", "quality", "ops"):
+- a `CkCard` with eyebrow=axis, title from a fixed map, a left-border accent color, and an action label `{list.length} evaluators`;
+- one cell per metric containing: `metric` name, a `pass`/`warn`/`fail` `CkChip`, the formatted `value` (`<1` → `toFixed(3)`, else as-is), optional `unit`, a `trend` indicator (`↗`/`↘`/`→` + `Math.abs(trend).toFixed(3)`; **negative trend renders green, positive red** — i.e. "down is good" by current convention), a `Spark` sparkline, and `target {e.target}`.
+
+Header chrome is decorative/hardcoded: the eyebrow "Arthur engine · continuous evaluation", the title "Evaluations & guardrails", a `CkChip` "Live · 12,408 spans · 24h", and a `+ New eval` button.
+
+**The sparkline is fake:** `Spark data={jitterSeries(...)}`. There is no per-metric time series in the mock or anywhere else.
+
+### Existing eval scaffold
+
+`apps/shared/contracts/api.ts` already declares a discriminated union:
+
+```ts
+export type EvalHealthResponse =
+  | { available: true; score: number; pass: number; warn: number; fail: number;
+      spansGraded: number; windowHours: number }
+  | { available: false; reason: string };
+```
+
+The worker route `apps/worker/src/routes/api/v1/overview/eval-health.get.ts` is a hardcoded stub returning `{ available: false, reason: "Eval grading not wired up yet." }`. The overview page already consumes it: `overview-data.tsx` fetches `/api/v1/overview/eval-health` (falls back to `evalHealthFallback()` → `{ available: false, reason: "Worker unavailable." }`), and `EvalHealthKPI` in `overview.tsx` renders a donut of pass/warn/fail + score + `spansGraded`/`windowHours` when `available`, else the `reason` string. This is a **summary** KPI tile, not the per-metric breakdown the `/evals` page needs.
+
+### Where eval results actually originate (the real data source — CONFIRMED)
+
+Arthur is integrated **write-only** today:
+- `apps/worker/src/sandbox/arthur-client.ts` — a client for the Arthur GenAI Engine **tasks/prompts** API (`/api/v2/tasks*`, `/api/v1/tasks/{id}/prompts*`). It creates one task per ticket run and hosts/tags prompt versions. It has **no** read method yet.
+- `apps/worker/src/sandbox/arthur-tracer.ts` — a bundled Python OpenInference tracer that **ships traces/spans into** Arthur Engine from inside each sandbox via `POST /api/v1/traces`. Data flows out of the worker; nothing reads it back.
+- Wiring lives in `apps/worker/src/workflows/agent.ts` (`ensureArthurTaskForTicket`, gated on `env.GENAI_ENGINE_API_KEY` + `env.GENAI_ENGINE_TRACE_ENDPOINT`).
+
+**The Arthur GenAI Engine DOES expose a read API** (ground-truthed from `arthur-ai/arthur-engine` + `arthur-common` on `main`). Auth is the **same** `Authorization: Bearer GENAI_ENGINE_API_KEY` used for writes; reads require the `INFERENCE_READ` permission. All reads are **org-scoped** — a deployment's key sees its whole org, which matches our single-tenant-per-deployment model. The relevant endpoints:
+
+- **Fleet aggregate (primary source for this page) — one call, multi-task:**
+  `POST /api/v1/traces/overview` body `TraceOverviewRequest { task_ids, start_time, end_time }` → `TraceOverviewListResponse { count, overviews: TraceOverviewResponse[] }`. Each `TraceOverviewResponse` = `{ task_id, trace_count, trace_token_count, trace_token_cost, eval_count, continuous_eval_success_rate, last_active }`. This yields fleet-wide eval health (success rate + trace/eval counts) over a 24h window with no per-task fan-out at the result-shaping layer.
+- **Per-metric breakdown (optional):** `GET /api/v1/traces/spans` (list, metadata only) → `GET /api/v1/traces/spans/{span_id}` → `SpanWithMetricsResponse.metric_results: MetricResultResponse[]` where each = `{ id, metric_type, details, prompt_tokens, completion_tokens, latency_ms, span_id, metric_id, created_at }`. `metric_type` is an enum of **only** `QueryRelevance | ResponseRelevance | ToolSelection`. `details` is an opaque JSON string (e.g. relevance → `{ bert_f_score, reranker_relevance_score, llm_relevance_score, reason }`). **There is no flat numeric score or pass/fail on a metric result** — we parse `details` and apply our own threshold.
+- **Trend/timeseries (optional):** `POST /api/v1/traces/overview/timeseries` body `{ task_id, start_time, end_time, bucket_size }` (**single task per call**) → points `{ timestamp, trace_count, trace_token_count, trace_token_cost, continuous_eval_success_rate }`.
+
+#### CRITICAL CAVEAT — what our trace path actually yields
+
+The rich rule-based evals the mock screen implies — **hallucination, PII, toxicity, prompt-injection** Pass/Fail — live in Arthur's **legacy inference/rule model**, populated **only** by the `/validate_prompt` + `/validate_response` write path. **We never call that path; we only ship OpenInference traces (`POST /api/v1/traces`).** Therefore `GET /api/v2/inferences/query` and those rule families are **empty for us**.
+
+What our trace path actually produces:
+- `continuous_eval_success_rate`, `eval_count` (spans graded), `trace_count` — from `/traces/overview`;
+- the three relevance/tool metric types — and **only if continuous evals are configured on the task**; otherwise `eval_count = 0`.
+
+So the realistic `/evals` page = an overall **eval-health score** (`continuous_eval_success_rate × 100`), the **graded count + window**, and a **relevance / tool-selection breakdown**. The hallucination/toxicity/PII/prompt-injection families the mock shows are **dropped** from this page. Adopting Arthur's `validate_*` API to populate them is a **separate future prerequisite, explicitly out of scope** here.
+
+**Conclusion:** evals are now reachable via a confirmed read API, so this is no longer blocked. Conversion's prerequisite is to add a worker-side read path (`getTracesOverview()` on `ArthurClient` + a `collect-evals.ts` collector). When Arthur is unconfigured, or when `eval_count = 0` (no continuous evals configured / no graded spans in window), the page degrades to the documented unavailable state — exactly like `eval-health` does today.
+
+## Proposed data contract
+
+Add to `apps/shared/contracts/api.ts`. The shape now maps directly to `TraceOverviewResponse` (the fleet aggregate) plus the relevance/tool-selection breakdown. We reuse the **same discriminated-union shape** as `EvalHealthResponse` so the page handles "not wired up" / "nothing graded" identically to overview. Fields with no real source on our trace-only path are **dropped** (no synthetic sparklines, no rule families).
+
+```ts
+/** One evaluator's aggregate reading over the window. Limited to the metric
+ *  types Arthur computes from our OpenInference trace path:
+ *  ResponseRelevance / QueryRelevance / ToolSelection. */
+export interface EvalMetricRow {
+  metric: string;                          // display name, e.g. "Response Relevance"
+  metricType:                              // Arthur metric_type enum
+    | "QueryRelevance"
+    | "ResponseRelevance"
+    | "ToolSelection";
+  value: number;                           // aggregate score parsed from metric_results.details
+  status: "pass" | "warn" | "fail";        // computed against our own threshold
+  axis: "quality";                         // all three are quality-axis on our path
+  // Only present when /traces/overview/timeseries is wired (see Open Q1).
+  trend?: number | null;                   // signed delta vs window start; omitted if not wired
+  spark?: number[];                        // success-rate buckets; omitted if not wired
+}
+
+export type EvalsResponse =
+  | {
+      available: true;
+      generatedAt: string;
+      windowHours: number;
+      /** continuous_eval_success_rate × 100, fleet-wide. */
+      score: number;
+      /** Σ eval_count across tasks — "spans graded" in the window. */
+      spansGraded: number;
+      /** Σ trace_count across tasks. */
+      traceCount: number;
+      /** Per-metric-type breakdown; empty if no continuous evals configured. */
+      rows: EvalMetricRow[];
+    }
+  | { available: false; generatedAt: string; reason: string };
+```
+
+Notes:
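+- `spansGraded` and `traceCount` are sums of `eval_count` and `trace_count` across the returned overviews. `score` is a rate, so it must not be summed: combine the per-task `continuous_eval_success_rate` values into one fleet-wide rate (e.g. weighted by `eval_count`) and multiply by 100. `windowHours` is the length of the requested `start_time`–`end_time` window, not an Arthur field.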
+- `EvalMetricRow.value`/`status` require the **optional** per-span breakdown (Open Q below). If we ship the aggregate-only first cut, `rows` is `[]` and the page renders the score + graded count without the per-metric grid. This keeps the first increment small.
+- `target`/`family`/`unit` from the old draft are **removed** — they were presentation metadata for rule families we cannot populate. `axis` collapses to the single `"quality"` literal because only relevance/tool metrics exist on our path.
+- `trend`/`spark` are present **only** if `/traces/overview/timeseries` is wired (Open Q1); otherwise omitted entirely (no static placeholders).
+
+**Assumption:** the `/evals` page consumes only this trace-derived data; the existing `EvalHealthResponse` summary tile on overview is left untouched. We do **not** consolidate the two endpoints in this change (though `EvalsResponse.score`/`spansGraded` could later feed it).
+
+## Real data source & how it's obtained (worker side)
+
+New worker route `GET /api/v1/evals` → `EvalsResponse`, structured like `runs.get.ts`:
+- sets `Cache-Control: private, max-age=15, stale-while-revalidate=60`;
+- if `env.GENAI_ENGINE_API_KEY` / `env.GENAI_ENGINE_TRACE_ENDPOINT` are unset, returns `{ available: false, generatedAt, reason: "Arthur GenAI Engine not configured." }` (no throw);
+- otherwise builds an `ArthurClient` (via the existing `ArthurClient.fromTraceEndpoint`) and calls a new read method `getTracesOverview({ taskIds, startTime, endTime })` → `POST /api/v1/traces/overview`. The new `apps/worker/src/lib/overview/collect-evals.ts` collector aggregates the returned `overviews` into `score`/`spansGraded`/`traceCount`, and (optionally) shapes `rows` from the per-span metric breakdown. Returns `available: true`;
+- if `eval_count` sums to `0` (no continuous evals configured on our tasks, or nothing graded in window), return `{ available: false, generatedAt, reason: "No graded evals in the last 24h." }` — there is genuinely nothing to show;
+- on any error, logs `evals_list_failed` and returns `{ available: false, generatedAt, reason: "Eval grading not wired up yet." }` — same degrade behavior as the other routes.
+
+**Task-id enumeration:** `/traces/overview` takes `task_ids`. It is **unconfirmed** whether an empty/omitted `task_ids` means "all org tasks" (Open Q2). If it does, we pass none. If it does not, we first enumerate the org's tasks via the existing `/api/v2/tasks/search` path (the `ArthurClient` already does substring search there) and pass their ids. The collector boundary (`collect-evals.ts` taking an injected fetcher) keeps this isolated and testable, matching `collect-runs.ts`/`collect-kpis.ts`.
+
+## Dashboard changes
+
+### 1. `app/(cockpit)/evals/page.tsx` (rewrite)
+Thin server route, drops the direct screen import:
+```tsx
+import { Suspense } from "react";
+import { EvalsData } from "@/app/evals-data";
+import { EvalsSkeleton } from "@/app/evals-skeleton";
+
+export default function EvalsPage() {
+  return (
+    <Suspense fallback={<EvalsSkeleton />}>
+      <EvalsData />
+    </Suspense>
+  );
+}
+```
+
+### 2. `app/evals-data.tsx` (new server component)
+Mirrors `runs-data.tsx`:
+```tsx
+import { getJSON } from "@/lib/api/server";
+import { EvalsScreen } from "@/components/cockpit/screens/evals";
+import type { EvalsResponse } from "@shared/contracts";
+import { evalsFallback } from "@/lib/api/fallbacks";
+
+export async function EvalsData() {
+  const now = new Date().toISOString();
+  const data = await getJSON<EvalsResponse>("/api/v1/evals").catch(() =>
+    evalsFallback(now),
+  );
+  return <EvalsScreen data={data} />;
+}
+```
+
+### 3. `lib/api/fallbacks.ts` (add)
+```ts
+export function evalsFallback(now: string): EvalsResponse {
+  return { available: false, generatedAt: now, reason: "Worker unavailable." };
+}
+```
+
+### 4. `components/cockpit/screens/evals.tsx` (modify)
+- Signature `EvalsScreen()` → `EvalsScreen({ data }: { data: EvalsResponse })`.
+- Remove `import { AIWF_DATA } from "@/lib/data/mock"`, `const D = AIWF_DATA`, and `import { jitterSeries } from "@/lib/rng"` (synthetic sparklines are dropped — no static placeholders).
+- Import `EvalsResponse`/`EvalMetricRow` from `@shared/contracts` (drop the mock `EvalMetric` reliance).
+- When `data.available === false`, render the existing header chrome but replace the metric cards with a single empty/unavailable panel showing `data.reason` (mirroring `EvalHealthKPI`'s reason path). This is also the state when nothing is graded.
+- When `available`:
+  - Drive the "Live · N spans · 24h" chip from `data.spansGraded` / `data.windowHours` instead of the hardcoded "12,408 spans · 24h"; optionally show `data.score`.
+  - The mock's three axis groups (safety/quality/ops) collapse to a single **Quality** group, since only relevance/tool metrics exist on our path. Render `data.rows` (all `axis: "quality"`) in one card.
+  - Each row shows `metric`, the formatted `value`, and the pass/warn/fail `CkChip`.
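+  - Sparkline / trend: render each row's `spark` / `trend` **only when present** (timeseries wired); otherwise render neither. `jitterSeries` is removed unconditionally (see the import removal above); the `Spark` component is rendered only when a row carries real `spark` data.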
+  - If `rows` is empty (aggregate-only first cut), render just the score + graded-count header — no per-metric grid.
+
+### 5. `app/evals-skeleton.tsx` (new)
+Loading fallback styled like `overview-skeleton.tsx` — header placeholder + one card-shaped block (the Quality group).
+
+## Behavior
+
+- **Happy path (Arthur configured, continuous evals graded):** `/evals` renders the fleet eval-health score + spans-graded count over the real 24h window, and (if the per-span breakdown is wired) a Quality card of relevance/tool-selection metrics. Trend/sparkline appear only when the timeseries call is wired.
+- **Arthur not configured:** worker returns `available: false`, reason "Arthur GenAI Engine not configured." Page shows header chrome + reason panel. No crash.
+- **Nothing graded (`eval_count = 0`):** worker returns `available: false`, reason "No graded evals in the last 24h." Same panel.
+- **Worker down / 401:** `getJSON` throws → `evalsFallback` → `available: false`, reason "Worker unavailable." Same silent-degrade as overview/runs.
+
+## Out of scope
+
+- Wiring up the `+ New eval` button.
+- The `EvalHealthResponse` overview tile (left as-is; could later be derived from `EvalsResponse` but not in this change).
+- **Adopting Arthur's `/validate_prompt` + `/validate_response` write path** to populate the legacy rule families (hallucination, PII, toxicity, prompt-injection). This is the prerequisite for those metrics and is a **separate future effort** — those families are simply absent from this page.
+- Per-span drill-down / individual inference detail views.
+- Synthetic sparklines — removed entirely (no static placeholders).
+
+## Open questions / assumptions (need user decision)
+
+The Arthur read API is now **confirmed** (see "Where eval results actually originate"). Remaining genuinely-open items:
+
+1. **`/traces/overview/timeseries` `bucket_size` values.** The allowed `bucket_size` values are unconfirmed. Needed only if we wire trend/sparkline; the aggregate-only first cut does not require it. **Assumption:** trend/sparkline are deferred to a second increment.
+2. **Empty `task_ids` semantics.** Does `POST /api/v1/traces/overview` treat an empty/omitted `task_ids` as "all org tasks"? If yes, one call with no ids suffices. If no, the collector must first enumerate tasks via `/api/v2/tasks/search`. **Assumption:** unconfirmed → plan covers both paths; default to enumerating tasks if empty-means-all is not verified.
+3. **Are continuous evals actually configured on our tasks in the live instance?** If continuous evals are not enabled on the per-ticket tasks, `eval_count = 0` and the page legitimately shows the "No graded evals" state. Confirming this is what determines whether the happy path ever fires today.
+
+Resolved (no longer open): read-API existence/shape, auth, org-scope/single-tenant aggregation, and the metric-family set (only relevance/tool on our path; rule families dropped).
+
+## Verification
+
+1. Shared + worker + dashboard typecheck pass (`npx tsc --noEmit`) with `EvalsResponse` imported in the route, `evals-data.tsx`, and `evals.tsx`.
+2. With the worker unreachable (or Arthur unconfigured), `/evals` renders header chrome + the reason panel, no crash.
+3. With Arthur configured and continuous evals graded, `/evals` renders the real fleet score + spans-graded count over the 24h window (and the Quality breakdown if wired).
+4. With Arthur configured but `eval_count = 0`, `/evals` shows the "No graded evals in the last 24h." panel.
diff --git a/docs/superpowers/specs/2026-06-08-prompts-real-data-design.md b/docs/superpowers/specs/2026-06-08-prompts-real-data-design.md
new file mode 100644
index 0000000..49be94c
--- /dev/null
+++ b/docs/superpowers/specs/2026-06-08-prompts-real-data-design.md
@@ -0,0 +1,223 @@
+# `/prompts` Real-Data Conversion — Design
+
+**Date:** 2026-06-08
+**Status:** Approved
+**Scope:** Swap the existing `/prompts` page from mock data to live worker data, mirroring the `/runs` and overview pattern. **Read-only display, including real Arthur version history.** No write/edit endpoints. Embellishment fields with no real backing are removed (markup deleted, not stubbed with placeholders).
+
+## Problem
+
+The `/prompts` dashboard page (`apps/dashboard/app/(cockpit)/prompts/page.tsx`) renders a full prompt-registry UI but is wired entirely to mock data (`AIWF_DATA.PROMPTS`, `PROMPT_VERSIONS`, `PROMPT_BODIES` from `@/lib/data/mock`). The overview and `/runs` pages already fetch real data from the worker through a server-component pattern. We want `/prompts` to show the prompts the worker actually drives the AI workflow with.
+
+## Real data source (the important finding)
+
+In this project, "prompts" are the three system prompts that drive each workflow phase. They live in the worker, not in a CMS:
+
+- **Static fallbacks (source of truth in code):** `apps/worker/src/lib/prompts.ts` defines three constant strings — `researchPlanPrompt`, `implementPrompt`, `reviewPrompt` — exported as `PROMPT_FALLBACKS: Record<PromptName, string>` keyed by `PROMPT_NAMES = ["research-plan", "implement", "review"]`.
+- **Optional runtime override (Arthur GenAI Engine):** `apps/worker/src/workflows/prompts-step.ts`'s `loadPrompts()` step checks whether `GENAI_ENGINE_API_KEY`, `GENAI_ENGINE_TRACE_ENDPOINT`, and `GENAI_ENGINE_PROMPT_TASK_ID` are all set. If so, it fetches the `production`-tagged version of each prompt from Arthur via `ArthurClient.getPromptByTag(taskId, name, "production")` (`apps/worker/src/sandbox/arthur-client.ts`). On 404 / error / Arthur disabled it falls back to the in-code `PROMPT_FALLBACKS` string for that name.
+- **Seeding:** `apps/worker/scripts/setup-arthur-prompts.ts` is a one-shot script that pushes the three fallback strings into a single Arthur task named `ai-workflow-prompts` and tags each `production`. This is the only writer; nothing in the request/runtime path writes prompts.
+
+**Arthur read API (ground-truthed against `arthur-ai/arthur-engine` `main`).** Auth is the same `Authorization: Bearer GENAI_ENGINE_API_KEY`; prompt reads require the `TASK_READ` scope. Three endpoints are relevant:
+
+- **List versions (metadata only):** `GET /api/v1/tasks/{task_id}/prompts/{prompt_name}/versions` → `AgenticPromptVersionListResponse { count, versions: AgenticPromptVersionResponse[] }`. Each `AgenticPromptVersionResponse`: `{ version (int), created_at, deleted_at (nullable), model_provider, model_name, tags: string[], num_messages, num_tools }`. **No message body and no per-version eval metrics.**
+- **Fetch a version body:** `GET /api/v1/tasks/{task_id}/prompts/{prompt_name}/versions/{prompt_version}` where `{prompt_version}` accepts `latest` | an integer | an ISO datetime | a tag → `AgenticPrompt { messages }`. This is the endpoint the existing `ArthurClient.getPromptByTag` already uses (it passes a tag). We use it to fetch the body of any specific version (the `production`-tagged one eagerly; an arbitrary version on demand).
+- **List all prompts on a task:** `GET /api/v1/tasks/{task_id}/prompts` → `LLMGetAllMetadataListResponse { count, llm_metadata: [{ name, versions, tags, created_at, latest_version_created_at, deleted_versions }] }`. Not strictly needed — our three phase-prompt names are fixed — so we don't use it.
+
+**Conclusion:** there is no editable prompt *registry* in this app, and the worker never persists prompt metadata locally — but Arthur **does** expose real version history (version number, created-at, tags, model) per named prompt, plus on-demand bodies. So the real, available data per phase prompt is: a stable name, the human phase label, the resolved **production body**, the resolved `source` (`arthur` | `fallback`), the model, and a list of **real Arthur versions** (`{ version, createdAt, tags, modelProvider, modelName, numMessages, numTools }`).
+
+This makes the conversion a faithful read-only swap **with real version history**. The mock-only fields that have **no Arthur source** — per-version eval/halluc/p95/cost metrics, traffic split, KPI deltas, `lastEditedBy`, the two-version A/B text diff — are **removed** (markup deleted, not replaced with static placeholders). Tags are real (`AgenticPromptVersionResponse.tags`), so a `production` badge and a tag filter are backed by data and kept.
+
+## Current state (mock)
+
+`apps/dashboard/components/cockpit/screens/prompts.tsx` (`PromptsScreen`) consumes three mock slices via `const D = AIWF_DATA`:
+
+1. `D.PROMPTS: Prompt[]` — 7 entries. Per the mock `Prompt` type (`apps/dashboard/lib/types.ts:64`):
+   `id`, `name`, `workflow`, `workflowName`, `span`, `versionCount`, `current`, `trafficSplit: Record<string, number>`, `evalScore`, `evalDelta`, `lastEditedBy`, `lastEditedAtMin`, `tags: PromptTag[]`, `model`.
+2. `D.PROMPT_VERSIONS: Record<string, PromptVersion[]>` — only `p_plan_changes` has history. Per `PromptVersion` (`types.ts:81`):
+   `v`, `deployedAt`, `by`, `status: PromptTag`, `traffic`, `evalScore`, `runs`, `costAvg`, `p95`, `halluc`, `change`.
+3. `D.PROMPT_BODIES: Record<string, string>` — body text keyed by version label (`v12`, `v11`).
+
+`PromptTag = "production" | "staging" | "draft" | "archived" | "locked" | "ab-test"`.
+
+What the screen renders from these:
+- **Header KPIs** (`CkKPI`): total prompts, count in `production`, count of `ab-test`, and a hardcoded `"+0.4%"` avg eval delta.
+- **Left rail `PromptList`:** tag filter pills (`all/production/staging/draft/locked`), per-prompt row showing `name`, `current` version, `workflowName`, tag chips, and an `evalScore`/`evalDelta` figure.
+- **Right pane `PromptDetail`:** header eyebrow `Arthur · {workflowName} → {span}`, `+ New version` / `Deploy` buttons, four `Stat`s (current version, version count, eval score, traffic split), a **version timeline** of `PromptVersion[]`, a two-column **text diff** between two selected versions (`PromptDiff`, reads `PROMPT_BODIES`), and a **side-by-side metrics** table (`PromptMetrics`: evalScore/halluc/p95/costAvg/runs). It already has graceful empty states: "Select a prompt to inspect." and "Detailed version history not yet captured for this prompt." (rendered when `versions.length === 0`).
+
+The page (`app/(cockpit)/prompts/page.tsx`) is a 4-line stub that renders `<PromptsScreen />` with no data fetch.
+
+## Existing pattern (template)
+
+Real data flows through three layers (see `app/overview-data.tsx`, `app/runs-data.tsx`):
+
+1. `app/(cockpit)/<view>/page.tsx` — thin server route: `<Suspense fallback={<Skeleton/>}><Data/></Suspense>`.
+2. `app/<view>-data.tsx` — **server component**: calls `getJSON<T>(path)` (`lib/api/server.ts`, server-only fetch with `Bearer WORKER_API_TOKEN`, `cache: "no-store"`, 10s timeout), `.catch()`es to a fallback in `lib/api/fallbacks.ts`, passes a `data` prop to the client screen.
+3. `components/cockpit/screens/<view>.tsx` — **client presenter**: receives `data`, renders. Untracked metrics arrive `null`/empty and render as `—` or an empty state.
+
+Worker routes live under `apps/worker/src/routes/api/v1/*.get.ts` as h3 `defineEventHandler`s returning a typed `@shared/contracts` response, gated by the shared bearer token (`apps/worker/src/lib/api-auth.ts`). Response interfaces are declared in `apps/shared/contracts/api.ts`; row/entity types in `apps/shared/contracts/domain.ts`.
+
+## Proposed data contract
+
+Add to `apps/shared/contracts/api.ts`. Entity type goes in `domain.ts` (currently has no prompt type).
+
+### `apps/shared/contracts/domain.ts` (new entities)
+
+```ts
+/** One Arthur version of a named prompt (metadata; body fetched on demand). */
+export interface PromptVersion {
+  /** Arthur integer version number. */
+  version: number;
+  /** ISO timestamp the version was created. */
+  createdAt: string;
+  /** Real Arthur tags on this version, e.g. ["production"]. */
+  tags: string[];
+  modelProvider: string;
+  modelName: string;
+  numMessages: number;
+  numTools: number;
+  /** Body text. Present only for the production version (eager); other
+   *  versions are fetched on demand via the by-version endpoint. */
+  body?: string;
+}
+
+/** A workflow phase prompt as resolved by the worker at runtime. */
+export interface PromptDef {
+  /** Stable Arthur/fallback key: "research-plan" | "implement" | "review". */
+  name: string;
+  /** Human label for the workflow phase, e.g. "Research & Plan". */
+  phase: string;
+  /** Resolved production prompt body (Arthur production tag, or in-code fallback). */
+  body: string;
+  /** Where the resolved `body` came from. */
+  source: "arthur" | "fallback";
+  /** Model the agent runs this prompt with (env-derived). */
+  model: string;
+  /** Real Arthur version history, newest first. Empty when source is "fallback". */
+  versions: PromptVersion[];
+}
+```
+
+### `apps/shared/contracts/api.ts` (new response)
+
+```ts
+export interface PromptsResponse {
+  generatedAt: string;
+  /** `false` when the worker can't resolve prompts (degrades to empty list). */
+  available: boolean;
+  /** Whether Arthur is configured (key + endpoint + task id all set). When
+   *  false, every prompt's `source` is "fallback" and `versions` is empty. */
+  arthurEnabled: boolean;
+  rows: PromptDef[];
+  total: number;
+}
+
+/** On-demand body for a single historical Arthur version. */
+export interface PromptVersionBodyResponse {
+  generatedAt: string;
+  available: boolean;
+  body: string | null;
+}
+```
+
+**Body fetch strategy — decided: eager for the production version, lazy for the rest.** The list response carries every phase prompt with its full `versions` metadata array and the **production body eagerly** on `PromptDef.body` (we already fetch it to resolve what the workflow uses, so it's free). Non-production version bodies are NOT shipped in this response — `PromptVersion.body` is `undefined` for them. When the user expands a historical version, the screen fetches that single body on demand through a second worker route (see "Worker routes"). This keeps the list response small (3 bodies, not N) and avoids fanning out an unbounded number of Arthur body calls per page load.
+
+Notes:
+- `available` follows the `RunsResponse`/`RunDetailResponse` convention: `true` on a successful resolve, `false` in the fallback object.
+- `arthurEnabled` lets the screen honestly say "showing in-code defaults" vs "showing production prompts from Arthur".
+- Per-version eval/halluc/p95/cost metrics, traffic split, and `lastEditedBy` are **not** in the contract — Arthur's version list is metadata only and has no such source. The screen markup that rendered them is removed.
+
+## Worker routes
+
+### `GET /api/v1/prompts` — list (new file `apps/worker/src/routes/api/v1/prompts.get.ts`, mirrors `runs.get.ts`)
+
+- `defineEventHandler` returning `PromptsResponse`, same `Cache-Control: private, max-age=15, stale-while-revalidate=60` header.
+- Resolve all three phase prompts via a shared helper `resolvePrompts()`. The exact production-body resolution already lives in `loadPrompts()` (`workflows/prompts-step.ts`), which is a `"use step"` durable step returning `{ research, implement, review }` — not callable from a plain h3 route. **Decision (option A, confirmed OK to touch the step):** extract the pure resolution into `apps/worker/src/lib/prompts/resolve.ts`, returning `PromptDef[]` + `arthurEnabled`, and have **both** `loadPrompts()` and the route call it. Single source of truth, no drift.
+- Per prompt, `resolvePrompts()` does:
+  - `model` = `env.AGENT_KIND === "codex" ? env.CODEX_MODEL : env.CLAUDE_MODEL` (same expression as `runs.get.ts`).
+  - `phase` from a static label map: `research-plan → "Research & Plan"`, `implement → "Implement"`, `review → "Review"`.
+  - When Arthur is enabled: fetch the `production`-tagged body via the existing `ArthurClient.getPromptByTag(taskId, name, "production")` (→ `body`, `source: "arthur"`), AND fetch the version list via a new `ArthurClient.listPromptVersions(taskId, name)` (→ `versions: PromptVersion[]`, newest first). Any single failure degrades that prompt to its in-code fallback body, `source: "fallback"`, `versions: []` — same per-prompt try/catch the current step already has.
+  - When Arthur is disabled: `body` = `PROMPT_FALLBACKS[name]`, `source: "fallback"`, `versions: []`.
+- `available: true` on success; the `catch` returns the empty `available: false` object (matching `runs.get.ts`). Because each prompt falls back independently, resolution as a whole rarely throws, and every successful response carries exactly three rows.
+
+### `GET /api/v1/prompts/[name]/versions/[version]` — on-demand body (new file)
+
+Backs lazy body fetching for historical versions the user expands. New file `apps/worker/src/routes/api/v1/prompts/[name]/versions/[version].get.ts` (h3 dynamic-segment pattern, same as the existing `runs/[runId].get.ts`):
+
+- Reads route params `name` and `version`, validates `name` against `PROMPT_NAMES` (404/empty otherwise), calls a new `ArthurClient.getPromptVersionBody(taskId, name, version)` which hits `GET /api/v1/tasks/{task_id}/prompts/{name}/versions/{version}` and returns the first message content (the existing `getPromptByTag` already parses this `AgenticPrompt.messages[0].content` shape — generalize it to accept any `{prompt_version}`).
+- Returns a small typed response `PromptVersionBodyResponse { generatedAt; available: boolean; body: string | null }` (add to `api.ts`). When Arthur is disabled or the version is missing → `available:false, body:null`.
+- Same `Cache-Control` header and bearer gate as the other v1 routes.
+
+## Dashboard wiring
+
+1. **`lib/api/fallbacks.ts`** — add `promptsFallback(now)`:
+   ```ts
+   export function promptsFallback(now: string): PromptsResponse {
+     return { generatedAt: now, available: false, arthurEnabled: false, rows: [], total: 0 };
+   }
+   ```
+2. **`app/prompts-data.tsx`** (new server component), single fetch like `runs-data.tsx`:
+   ```tsx
+   const data = await getJSON<PromptsResponse>("/api/v1/prompts").catch(() => promptsFallback(now));
+   return <PromptsScreen data={data} />;
+   ```
+3. **`app/prompts-skeleton.tsx`** (new) — header + KPI row + two-column (rail + detail) block, styled like `overview-skeleton.tsx`.
+4. **`app/(cockpit)/prompts/page.tsx`** — rewrite to the `<Suspense fallback={<PromptsSkeleton/>}><PromptsData/></Suspense>` shape.
+5. **`components/cockpit/screens/prompts.tsx`** — change `PromptsScreen()` to `PromptsScreen({ data }: { data: PromptsResponse })`. Map the real `PromptDef[]` onto the existing UI. Keep the tag filter and version timeline (now real), but **delete** the per-version metrics grid and the two-column A/B diff (no backing data). Historical-version body expansion fetches lazily from the on-demand route.
+6. **On-demand version-body fetch (client).** `PromptsScreen` is a `"use client"` presenter, so expanding a historical version does a client-side `fetch`. The bearer-gated worker API is not directly reachable from the browser (the `WORKER_API_TOKEN` is server-only — see `lib/api/server.ts`). So add a thin Next route handler `app/api/prompts/[name]/versions/[version]/route.ts` that re-uses `getJSON<PromptVersionBodyResponse>("/api/v1/prompts/<name>/versions/<version>")` server-side and returns it to the client. The screen fetches `/api/prompts/{name}/versions/{version}` (same-origin, no token exposure). Cache the resolved body in component state so re-expanding doesn't refetch.
+
+### Screen mapping (mock field → real field / behavior)
+
+| Mock usage | Real replacement |
+|---|---|
+| `D.PROMPTS` list | `data.rows` (3 `PromptDef`) |
+| `p.id` (row key, selection) | `p.name` (stable key) |
+| `p.name` | `p.name` |
+| `p.workflowName` / `p.span` (eyebrow) | `p.phase` (eyebrow `{data.arthurEnabled ? "Arthur" : "In-code"} · {p.phase}`) |
+| `p.current` version badge | real: highest `p.versions[].version`, or the production-tagged version number; show `source` chip alongside |
+| `p.tags` chips + tag filter pills | **kept, real** — derive the row's tags from its production version's `tags` (`p.versions.find(v => v.tags.includes("production"))?.tags`), and per-version `tags` in the timeline. Filter pills reduced to tags that actually occur (e.g. `all` + `production`). |
+| `p.evalScore` / `p.evalDelta` | **removed** (no Arthur source — markup deleted) |
+| `D.PROMPT_VERSIONS[id]` timeline | **kept, real** — `p.versions` (`{version, createdAt, tags, modelName, numMessages, numTools}`), newest first. Each entry shows version number, `createdAt`, tag chips, `modelName`, message/tool counts. The mock's eval/halluc/p95/cost rows in each timeline card are **removed**. |
+| `D.PROMPT_BODIES[v]` two-column diff (`PromptDiff`) | **removed** — replaced by a single read-only body panel. Shows `p.body` (production) by default; clicking a timeline version fetches that version's body via the on-demand route and renders it in the same panel. |
+| `PromptMetrics` side-by-side table | **removed** (no per-version metrics) |
+| Header KPIs (total / production / ab-test / avg Δ) | total = `data.total`; "In production" = count of rows whose versions include a `production` tag; ab-test and avg-Δ tiles **removed** (no source) |
+| `+ New version` / `Deploy` / `Import from prod` / `+ New prompt` buttons | left inert (read-only), matching how `/runs` left its `+ Filter` / `Export` buttons |
+
+Faithful render: left rail lists the 3 prompts by `name` + `phase` + `model` + production tag chip; right pane shows a read-only body panel (production body by default, swappable to a selected historical version fetched on demand) plus the real version timeline. Reuses `CkCard`/`CkKPI`/`Stat`, the chip styling (repurposed for real `tags`), and the single-column body markup lifted from the old `PromptDiff`.
+
+## Behavior
+
+- **Happy path (Arthur disabled — current production reality):** `/prompts` lists the 3 workflow prompts with their in-code fallback bodies, `source: "fallback"`, `arthurEnabled: false`, `versions: []`. Eyebrow reflects "In-code". The version timeline section is empty (no markup, since there are no versions). Bodies are exactly what the agent runs.
+- **Happy path (Arthur enabled):** each prompt's production body and full real version history come from Arthur (`source: "arthur"`). The timeline lists every Arthur version with its real `version`, `createdAt`, `tags`, and `modelName`. Expanding a historical version fetches its body on demand via `GET /api/v1/prompts/[name]/versions/[version]`. A prompt that fails to resolve from Arthur degrades to its fallback body with `versions: []`.
+- **Worker down / 401:** `getJSON` throws → `promptsFallback` → empty list, `available:false`. The screen shows its "Select a prompt to inspect." empty state with `0 prompts`. No crash. Same silent-fallback as `/runs`. An on-demand body fetch that fails renders an inline "version body unavailable" note, not a page crash.
+
+## Out of scope
+
+- Editing, creating, deploying, or version-bumping prompts (the `+ New version` / `Deploy` / `Import from prod` / `+ New prompt` buttons stay inert).
+- Per-version eval/halluc/p95/cost metrics and the two-version A/B text diff — no Arthur source; markup removed.
+- Traffic split, `lastEditedBy`, eval deltas — no source; markup removed.
+- Wiring the `/editor` view (separate `workflow-editor` screen).
+
+## Open questions / assumptions
+
+Resolved by user decisions and Arthur API ground-truthing:
+
+- **Read-only — confirmed.** No write endpoints; action buttons stay inert.
+- **Version history — confirmed in scope.** Real Arthur version history (metadata + on-demand bodies) is included. Per-version eval metrics are NOT available from Arthur's version-list endpoint (metadata only: `{version, created_at, tags, model_name, num_messages, num_tools}`), so the mock's per-version metrics are dropped — confirmed acceptable.
+- **Tags are real.** The `production` badge and the tag filter are backed by `AgenticPromptVersionResponse.tags`; kept.
+- **Resolution-helper extraction — confirmed.** Shared `resolvePrompts()` used by both `loadPrompts()` and the route; OK to touch `prompts-step.ts`.
+- **Embellishment fields — removed, not stubbed.** Per the user decision, fields with no real backing have their markup deleted rather than rendered as static placeholders.
+
+Still open:
+
+1. **Lazy vs eager body fetch — proposed eager-for-production, lazy-for-history.** Stated above; flagged here in case you'd rather ship all version bodies eagerly (simpler client, larger/slower response) or fetch even the production body lazily (smaller list response, extra round-trip on first view).
+2. **Version pagination depth.** Arthur's `…/versions` endpoint is paginated. Assumption: fetch the first page only (newest N, e.g. default page size) and not the full history — sufficient for the timeline. Confirm whether deep history (all pages) is required.
+
+## Verification
+
+1. `apps/shared` + `apps/worker` typechecks pass (`pnpm -F @apps/worker typecheck` or `npx tsc --noEmit`).
+2. Worker `GET /api/v1/prompts` returns 3 rows with non-empty `body`, correct `source`, `arthurEnabled` reflecting env, and (Arthur on) a non-empty `versions[]` with real `version`/`createdAt`/`tags`. Existing `prompts-step` tests still pass.
+3. Worker `GET /api/v1/prompts/research-plan/versions/<n>` returns that version's `body` (Arthur on) or `available:false` (Arthur off / missing).
+4. Dashboard typecheck passes.
+5. `/prompts` renders the 3 real prompts; selecting one shows its production body; the timeline lists real Arthur versions; expanding one fetches and shows that version's body. With Arthur disabled, `source` is `fallback`, the timeline is empty, and bodies match `apps/worker/src/lib/prompts.ts`.
+6. With the worker unreachable, `/prompts` shows the empty state (`0 prompts`), not an error.
+
+