Blazity · kasin-it · Jun 9, 2026 · Jun 9, 2026 · Jun 9, 2026 · Jun 9, 2026
diff --git a/.claude/learnings.md b/.claude/learnings.md
@@ -48,3 +48,18 @@ In flow-editor.tsx (FlowCanvas), touch drag/pan on iOS jumped ~5px then stopped,
 
 ## 2026-06-05 — CORRECTION: real root cause was onPointerLeave, not page-scroll
 On-device HUD instrumentation proved the actual cause of the iOS canvas drag bug: the canvas had `onPointerLeave={onPointerUp}`, and iOS Safari spuriously fires `pointerleave` mid-gesture (finger still down and inside the element, even with pointer capture set). That ended the drag one move in — node/canvas jumped ~5px then froze while the finger kept moving. Fix: gate it to non-touch only `onPointerLeave={(e)=>{ if(e.pointerType!=="touch") onPointerUp(e); }}` — pointer capture guarantees a real pointerup/cancel, so pointerleave is only needed as the desktop mouse-left-window fallback. The non-passive touchmove preventDefault was kept (prevents page-scroll hijack / pointercancel) but was NOT the primary fix. Lesson: when reasoning about WebKit event quirks fails twice, add an on-screen HUD logging the raw pointer event stream instead of guessing.
+
+## Arthur GenAI Engine — eval read path (for /evals page) — SUPERSEDED: see the 2026-06-09 correction below
+- Arthur is integrated **write-only** in the worker today: `arthur-tracer.ts` ships OpenInference traces via `POST /api/v1/traces`; `arthur-client.ts` only does tasks/prompts (`/api/v2/tasks*`, `/api/v1/tasks/{id}/prompts*`). Nothing reads evals back.
+- Arthur DOES expose a read API (same `Bearer GENAI_ENGINE_API_KEY`, needs `INFERENCE_READ`, org-scoped → matches single-tenant). Primary for fleet eval health: `POST /api/v1/traces/overview` { task_ids, start_time, end_time } → overviews of { trace_count, eval_count, continuous_eval_success_rate, ... }. Per-metric detail: span `metric_results` (metric_type enum is ONLY QueryRelevance|ResponseRelevance|ToolSelection; `details` is opaque JSON, no flat score/pass-fail — parse + threshold yourself). Timeseries: `POST /api/v1/traces/overview/timeseries` (single task per call).
+- CRITICAL: the rich rule families the mock shows (hallucination/PII/toxicity/prompt-injection) come ONLY from the legacy `/validate_prompt` + `/validate_response` write path, which we never call. `GET /api/v2/inferences/query` is empty for us. Our trace path yields only success-rate + eval/trace counts + the 3 relevance/tool metric types (and only if continuous evals are configured on the task; else eval_count=0).
+
+## 2026-06-09 — CORRECTION: the Arthur read API above was WRONG; verified against live instance
+The /cost + /evals collectors were first built against assumed endpoints that DO NOT EXIST. Verified the real API against the deployed Arthur (`accomplished-beauty-production-8c60.up.railway.app`, FastAPI — fetch `/openapi.json` for ground truth). Hard-won facts:
+- **No aggregate/overview endpoint exists.** `POST /api/v1/traces/overview`, `.../overview/timeseries`, and `POST /api/v1/traces/spans` all 404/405. The real shape is row-query + **client-side aggregation**: `GET /api/v1/traces?task_ids=…&start_time=…&end_time=…&page=…&page_size=…` returns per-trace `{ task_id, total_token_count, total_token_cost (USD, may be null), start_time, … }`. The spans endpoint is `GET /api/v1/traces/spans` (not POST) and its rows carry NO `model_name` (model is only in `span_name` for LLM-kind spans, e.g. `"claude/claude-opus-4-6"`) — so "cost by model" was dropped.
+- **`task_ids` is REQUIRED** on every trace read (empty → 400 "Field required"). The "empty = all org" assumption was false. Must enumerate tasks first.
+- **Pages are 0-INDEXED.** `page=1` skips the first page → empty results (this silently returned $0). Start at `page=0`; loop until `collected >= count` or an empty page.
+- **`POST /api/v2/tasks/search` pagination is BROKEN** (page param effectively ignored; unique tasks returned DECREASE as page_size grows: 162→147→122→72; and it omits tasks that actually have traces, e.g. AWP-3.5). Use **`GET /api/v2/tasks?page_size=N`** instead — returns a bare JSON array of all tasks reliably in one oversized page. `ArthurClient.listAllTasks` does this.
+- Multi-`task_ids` queries work and accumulate correctly; batch the ids ~50/request to keep the GET URL well under server limits (`ArthurClient.listTraces`/`countTraces` chunk + paginate + merge).
+- **Evals: there is no success-rate field.** Compute pass-rate from `countTraces(..., { continuous_eval_run_status })` (enum: `pending|passed|running|failed|skipped|error`): `score = passed / (passed+failed)`. On this instance continuous evals are NOT configured (`GET /api/v2/tasks/{id}/metrics` → 404, trace `metrics` carry only token/cost, `annotations: null`, spans have no `metric_results`), so passed=failed=0 and /evals correctly degrades to `available:false`. The logic lights up once evals are enabled.
+- Verified live (MTD): /cost = $26.33 over 94 traces / 23M tokens, real per-workflow + daily breakdown. The dashboard reads the **deployed** worker — these fixes need a worker redeploy to show.
diff --git a/apps/dashboard/app/(cockpit)/cost/page.tsx b/apps/dashboard/app/(cockpit)/cost/page.tsx
@@ -1,5 +1,12 @@
 // apps/dashboard/app/(cockpit)/cost/page.tsx — Cost & usage ("/cost")
-import { CostScreen } from "@/components/cockpit/screens/cost";
+import { Suspense } from "react";
+import { CostData } from "@/app/cost-data";
+import { CostSkeleton } from "@/app/cost-skeleton";
+
 export default function CostPage() {
-  return <CostScreen />;
+  return ( // CostData is rendered inside Suspense; CostSkeleton is the fallback element
+    <Suspense fallback={<CostSkeleton />}>
+      <CostData />
+    </Suspense>
+  );
 }
diff --git a/apps/dashboard/app/(cockpit)/evals/page.tsx b/apps/dashboard/app/(cockpit)/evals/page.tsx
@@ -1,5 +1,13 @@
 // apps/dashboard/app/(cockpit)/evals/page.tsx — Arthur evals ("/evals")
-import { EvalsScreen } from "@/components/cockpit/screens/evals";
+import { Suspense } from "react";
+
+import { EvalsData } from "@/app/evals-data";
+import { EvalsSkeleton } from "@/app/evals-skeleton";
+
 export default function EvalsPage() {
-  return <EvalsScreen />;
+  return ( // EvalsData is rendered inside Suspense; EvalsSkeleton is the fallback element
+    <Suspense fallback={<EvalsSkeleton />}>
+      <EvalsData />
+    </Suspense>
+  );
 }
diff --git a/apps/dashboard/app/(cockpit)/prompts/page.tsx b/apps/dashboard/app/(cockpit)/prompts/page.tsx
@@ -1,5 +1,13 @@
 // apps/dashboard/app/(cockpit)/prompts/page.tsx — Prompts ("/prompts")
-import { PromptsScreen } from "@/components/cockpit/screens/prompts";
+import { Suspense } from "react";
+
+import { PromptsData } from "@/app/prompts-data";
+import { PromptsSkeleton } from "@/app/prompts-skeleton";
+
 export default function PromptsPage() {
-  return <PromptsScreen />;
+  return ( // PromptsData is rendered inside Suspense; PromptsSkeleton is the fallback element
+    <Suspense fallback={<PromptsSkeleton />}>
+      <PromptsData />
+    </Suspense>
+  );
 }
diff --git a/apps/dashboard/app/api/prompts/[name]/versions/[version]/route.ts b/apps/dashboard/app/api/prompts/[name]/versions/[version]/route.ts
@@ -0,0 +1,18 @@
+// apps/dashboard/app/api/prompts/[name]/versions/[version]/route.ts
+// Same-origin proxy so the client can lazily fetch a historical prompt-version
+// body without the server-only WORKER_API_TOKEN ever reaching the browser.
+import { NextResponse } from "next/server";
+import { getJSON } from "@/lib/api/server";
+import type { PromptVersionBodyResponse } from "@shared/contracts";
+
+export async function GET( // GET handler: forwards one prompt-version body lookup to the worker API through getJSON
+  _req: Request, // not read by this handler
+  { params }: { params: Promise<{ name: string; version: string }> },
+) {
+  const { name, version } = await params; // params is typed as a Promise, so it is awaited before destructuring
+  const now = new Date().toISOString(); // captured before the request; only used as generatedAt in the fallback below
+  const data = await getJSON<PromptVersionBodyResponse>(
+    `/api/v1/prompts/${encodeURIComponent(name)}/versions/${encodeURIComponent(version)}`, // both route segments are URI-encoded before being placed in the path
+  ).catch(() => ({ generatedAt: now, available: false, body: null })); // any getJSON rejection is replaced by an "unavailable" payload; the error itself is discarded
+  return NextResponse.json(data); // no status override is passed, so the fallback payload is returned the same way as a real one
+}
diff --git a/apps/dashboard/app/cost-data.tsx b/apps/dashboard/app/cost-data.tsx
@@ -0,0 +1,13 @@
+// apps/dashboard/app/cost-data.tsx
+import { getJSON } from "@/lib/api/server";
+import { CostScreen } from "@/components/cockpit/screens/cost";
+import type { CostResponse } from "@shared/contracts";
+import { costFallback } from "@/lib/api/fallbacks";
+
+export async function CostData() { // async component: awaits the cost payload, then renders CostScreen with it
+  const now = new Date().toISOString(); // timestamp handed to costFallback if the request fails
+  const data = await getJSON<CostResponse>("/api/v1/cost").catch(() =>
+    costFallback(now), // any getJSON rejection is swallowed here and replaced by the fallback value
+  );
+  return <CostScreen data={data} />;
+}
diff --git a/apps/dashboard/app/cost-skeleton.tsx b/apps/dashboard/app/cost-skeleton.tsx
@@ -0,0 +1,18 @@
+// apps/dashboard/app/cost-skeleton.tsx
+import { Block } from "./skeleton-block";
+
+export function CostSkeleton() { // static placeholder layout built only from Block elements; takes no props
+  return ( // rows, top to bottom: three 100px tiles, a pair of 260px blocks, then two 300px blocks
+    <div className="px-6 pt-5 pb-8 flex flex-col gap-4">
+      <div className="grid grid-cols-3 gap-3">
+        {Array.from({ length: 3 }, (_, i) => <Block key={i} className="h-[100px]" />)}
+      </div>
+      <div className="grid lg:grid-cols-[1.5fr_1fr] gap-3">
+        <Block className="h-[260px]" />
+        <Block className="h-[260px]" />
+      </div>
+      <Block className="h-[300px]" />
+      <Block className="h-[300px]" />
+    </div>
+  );
+}
diff --git a/apps/dashboard/app/evals-data.tsx b/apps/dashboard/app/evals-data.tsx
@@ -0,0 +1,13 @@
+// apps/dashboard/app/evals-data.tsx
+import { getJSON } from "@/lib/api/server";
+import { EvalsScreen } from "@/components/cockpit/screens/evals";
+import type { EvalsResponse } from "@shared/contracts";
+import { evalsFallback } from "@/lib/api/fallbacks";
+
+export async function EvalsData() { // async component: awaits the evals payload, then renders EvalsScreen with it
+  const now = new Date().toISOString(); // timestamp handed to evalsFallback if the request fails
+  const data = await getJSON<EvalsResponse>("/api/v1/evals").catch(() =>
+    evalsFallback(now), // any getJSON rejection is swallowed here and replaced by the fallback value
+  );
+  return <EvalsScreen data={data} />;
+}
diff --git a/apps/dashboard/app/evals-skeleton.tsx b/apps/dashboard/app/evals-skeleton.tsx
@@ -0,0 +1,16 @@
+// apps/dashboard/app/evals-skeleton.tsx
+import { Block } from "./skeleton-block";
+
+export function EvalsSkeleton() { // static placeholder layout built only from Block elements; takes no props
+  return ( // a header row with two blocks, followed by one 200px block
+    <div className="px-4 lg:px-6 pt-5 pb-8 flex flex-col gap-4">
+      {/* Header (eyebrow + title, chip) */}
+      <div className="flex items-center justify-between">
+        <Block className="h-10 w-72" />
+        <Block className="h-8 w-64" />
+      </div>
+      {/* Quality group card */}
+      <Block className="h-[200px]" />
+    </div>
+  );
+}
diff --git a/apps/dashboard/app/prompts-data.tsx b/apps/dashboard/app/prompts-data.tsx
@@ -0,0 +1,13 @@
+// apps/dashboard/app/prompts-data.tsx
+import { getJSON } from "@/lib/api/server";
+import { PromptsScreen } from "@/components/cockpit/screens/prompts";
+import type { PromptsResponse } from "@shared/contracts";
+import { promptsFallback } from "@/lib/api/fallbacks";
+
+export async function PromptsData() { // async component: awaits the prompts payload, then renders PromptsScreen with it
+  const now = new Date().toISOString(); // timestamp handed to promptsFallback if the request fails
+  const data = await getJSON<PromptsResponse>("/api/v1/prompts").catch(() =>
+    promptsFallback(now), // any getJSON rejection is swallowed here and replaced by the fallback value
+  );
+  return <PromptsScreen data={data} />;
+}
diff --git a/apps/dashboard/app/prompts-skeleton.tsx b/apps/dashboard/app/prompts-skeleton.tsx
@@ -0,0 +1,22 @@
+// apps/dashboard/app/prompts-skeleton.tsx
+import { Block } from "./skeleton-block";
+
+export function PromptsSkeleton() { // static placeholder layout built only from Block elements; takes no props
+  return ( // a header row with two blocks, a row of two 124px blocks, then a final pair of blocks
+    <div className="px-4 lg:px-6 pt-5 pb-8 flex flex-col gap-4">
+      <div className="flex items-end justify-between">
+        <Block className="h-10 w-56" />
+        <Block className="h-9 w-64" />
+      </div>
+      <div className="grid grid-cols-1 lg:grid-cols-2 gap-3">
+        {Array.from({ length: 2 }, (_, i) => (
+          <Block key={i} className="h-[124px]" />
+        ))}
+      </div>
+      <div className="flex flex-col lg:grid lg:grid-cols-[340px_1fr] gap-3 lg:min-h-[720px]">
+        <Block className="lg:h-full h-[300px]" />
+        <Block className="lg:h-full h-[400px]" />
+      </div>
+    </div>
+  );
+}
diff --git a/apps/dashboard/app/skeleton-block.tsx b/apps/dashboard/app/skeleton-block.tsx
@@ -0,0 +1,4 @@
+// apps/dashboard/app/skeleton-block.tsx
+export function Block({ className = "" }: { className?: string }) { // one skeleton rectangle; className is optional and defaults to an empty string
+  return <div className={`bg-neutral-200/60 rounded-sm animate-pulse ${className}`} />; // caller classes are appended after the shared background/rounding/pulse classes
+}