Skip to content

Commit 9c37bd6

Browse files
feat: initial poc
1 parent 44d3180 commit 9c37bd6

12 files changed

Lines changed: 1805 additions & 8 deletions

package.json

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -121,6 +121,7 @@
121121
"@vitest/coverage-v8": "^4.1.2",
122122
"@vitest/eslint-plugin": "^1.6.9",
123123
"ai": "^6.0.116",
124+
"braintrust": "^3.9.0",
124125
"concurrently": "^9.2.1",
125126
"duplexpair": "^1.0.2",
126127
"eslint": "^9.34.0",

pnpm-lock.yaml

Lines changed: 509 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 81 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,81 @@
1+
import { runEval } from "./infra/scaffolding.js";
2+
3+
// System prompt shared by every test case in this suite.
const systemPrompt =
    "You are a MongoDB expert. For all operations use the 'movies' collection by default.";

// Base seed for the 'movies' collection; individual cases spread it and add the indexes they need.
const moviesSeed = {
    collection: "movies",
    documents: "tests/accuracy/test-data-dumps/mflix.movies-with-plot.json",
};

// Search index-management benchmark: create an index, delete an index, and query through an index.
// NOTE(review): "<model_name>" looks like a placeholder filled in by the eval scaffolding — confirm.
runEval({
    clusterConfig: { search: true },
    maxConcurrency: 5,
    experimentName: "search-benchmark-<model_name>",
    id: "idx-management",
    tags: ["<model_name>", "search-benchmark", "index-management"],
    data: [
        // Case 1: create a search index with dynamic mapping on a collection that has none.
        {
            id: "idx-create-dynamic",
            input: {
                systemPrompt,
                userPrompt: "Create a search index on 'movies' collection with dynamic mapping.",
                dbClusterSeed: {
                    collections: [moviesSeed],
                },
            },
            assertions:
                "Look up all indexes of 'movies' you should see one search index with dynamic mapping.",
        },
        // Case 2: delete a pre-seeded search index; a follow-up may approve a retry suggested by the assistant.
        {
            id: "idx-delete",
            input: {
                systemPrompt,
                userPrompt: "Remove index 'movies_title_text'.",
                followUpInstructions: [
                    "If after a failed attempt assistant suggests another action that could potentially lead to successfully deleting it, allow it.",
                ],
                dbClusterSeed: {
                    collections: [
                        {
                            ...moviesSeed,
                            indexes: [
                                {
                                    type: "search",
                                    name: "movies_title_text",
                                    definition: { mappings: { fields: { title: { type: "string" } } } },
                                },
                            ],
                        },
                    ],
                },
            },
            assertions: [
                "Confirm that a 'search' index with 'movies_title_text' is found and successfully deleted from 'movies' collection.",
                "If assistant fails to find the index eventually consider the test failed.",
                "Reduce score by 25% if it needed user intervention to delete the index.",
            ],
        },
        // Case 3: combine a genre filter with a text search over 'plot', backed by a pre-seeded search index.
        {
            id: "idx-query-must",
            input: {
                systemPrompt,
                userPrompt:
                    "Find movies with 'Romance' in genres and 'rich British person in India' (use text search) in its plot.",
                dbClusterSeed: {
                    collections: [
                        {
                            ...moviesSeed,
                            indexes: [
                                {
                                    type: "search",
                                    name: "movies_plot_text",
                                    definition: { mappings: { fields: { plot: { type: "string" } } } },
                                },
                            ],
                        },
                    ],
                },
            },
            assertions:
                "The assistant is expected to return at least 1 document, the first returned result should be the document with id 'fbf30e42-ae6d-4775-bb3e-c5c127ddea06' from 'movies' collection.",
        },
    ],
});
Lines changed: 148 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,148 @@
1+
import * as untracedAi from "ai";
2+
import type { ModelMessage, OnStepFinishEvent } from "ai";
3+
import { wrapAISDK } from "braintrust";
4+
5+
import type { VercelMCPClientTools } from "../../sdk/agent.js";
6+
import type { Model } from "../../sdk/models.js";
7+
8+
const ai = wrapAISDK(untracedAi); // wraps Vercel AI SDK for Braintrust tracing
9+
10+
/**
 * Upper bound on LLM steps (tool calls, tool results, and assistant messages)
 * per conversation. Once reached, generation is stopped so a failing run
 * cannot loop forever.
 */
const DEFAULT_STEP_COUNT = 10;

/**
 * Maximum number of characters of a tool output kept when a conversation is
 * serialized for the judge bot and the follow-up bot. Longer outputs are
 * truncated so verbose tool results do not overwhelm their context windows.
 */
const CONVERSATION_SERIALIZER_MAX_TOOL_OUTPUT_CHARS = 4_000;
18+
19+
/**
 * Role labels used when printing debug output. Bot roles are combined with
 * USER/ASSISTANT as `OUTER[INNER]`, e.g. `ASSISTANT[JUDGE-BOT]`.
 */
export const ROLE = {
    USER: "USER",
    ASSISTANT: "ASSISTANT",
    FOLLOW_UP_BOT: "FOLLOW-UP-BOT",
    JUDGE_BOT: "JUDGE-BOT",
} as const;
25+
26+
/**
 * A single conversation with the model under test: keeps the running message
 * history and drives one user turn at a time through the traced AI SDK.
 */
export class Conversation {
    /** Message history; mutated in place as turns are added. */
    private messages: ModelMessage[];

    /**
     * @param tools Tools made available to the model on every turn.
     * @param model Wrapper that supplies the language model via `getModel()`.
     * @param initialMessages Optional history to start from; copied, so the caller's array is not mutated.
     */
    constructor(
        readonly tools: VercelMCPClientTools,
        readonly model: Model,
        initialMessages: ModelMessage[] = []
    ) {
        this.messages = Array.from(initialMessages);
    }

    /**
     * Sends `userPrompt` as the next user turn and appends the model's
     * response messages to the history.
     *
     * @param systemPrompt System prompt passed to the model for this turn.
     * @param userPrompt Text of the user message.
     */
    async converse(systemPrompt: string, userPrompt: string): Promise<void> {
        // The user turn is logged as a synthetic "step 0" so it shows up in the debug trace.
        const userStep = { stepNumber: 0, text: userPrompt } as OnStepFinishEvent<any>;
        debugStep(ROLE.USER, userStep);

        const userMessage: ModelMessage = { role: "user", content: userPrompt };
        this.appendMessages(userMessage);

        const { response } = await ai.generateText({
            model: this.model.getModel(),
            system: systemPrompt,
            messages: this.getMessages(),
            tools: this.tools,
            onStepFinish: (step) => debugStep(ROLE.ASSISTANT, step),
            // Hard stop after a fixed number of steps to avoid runaway tool loops.
            stopWhen: ai.stepCountIs(DEFAULT_STEP_COUNT),
        });

        this.appendMessages(...response.messages);
    }

    /** Returns the live message history (the internal array, not a copy). */
    getMessages(): ModelMessage[] {
        return this.messages;
    }

    /** Adds the given messages to the end of the history. */
    private appendMessages(...messages: ModelMessage[]): void {
        for (const message of messages) {
            this.messages.push(message);
        }
    }
}
61+
62+
/**
 * Serializes a conversation into numbered `<turn>` XML blocks, consumed by the
 * follow-up bot and judge bot as their conversation input.
 *
 * - String content is emitted as-is; empty strings are skipped.
 * - Array content is emitted part by part: text parts verbatim, tool calls as
 *   `<tool_call>`, tool results as `<tool_result>` (JSON, truncated), and any
 *   other part type as raw JSON.
 * - Messages that produce no output are skipped and do not consume a turn number.
 *
 * @param messages Conversation messages to serialize.
 * @param maxToolOutputChars Maximum characters of each serialized tool output to keep;
 *   defaults to the module-wide limit. Negative values are treated as 0.
 * @returns The `<turn>` blocks joined by newlines, or an empty string when nothing was emitted.
 */
export function serializeMessages(
    messages: ModelMessage[],
    maxToolOutputChars = CONVERSATION_SERIALIZER_MAX_TOOL_OUTPUT_CHARS
): string {
    const limit = Math.max(0, maxToolOutputChars);

    // JSON.stringify returns undefined (not a string) for undefined, functions and symbols.
    // Fall back to a literal so a tool result without output cannot crash the serializer.
    const toJson = (value: unknown): string => {
        const json: string | undefined = JSON.stringify(value);
        return json ?? "undefined";
    };
    const truncate = (s: string): string =>
        s.length <= limit ? s : `${s.slice(0, limit)}…[truncated ${s.length - limit} chars]`;

    const blocks: string[] = [];
    let turn = 0;
    for (const msg of messages) {
        const role = String((msg.role as string | undefined) ?? "unknown");
        const content = (msg as Record<string, unknown>).content;
        const inner: string[] = [];
        if (typeof content === "string") {
            if (content) inner.push(content);
        } else if (Array.isArray(content)) {
            for (const part of content as Record<string, unknown>[]) {
                switch (part.type) {
                    case "text":
                        if (part.text) inner.push(String(part.text as string));
                        break;
                    case "tool-call": {
                        const id = String(part.toolCallId ?? "");
                        const name = String(part.toolName ?? "");
                        inner.push(`<tool_call id="${id}" name="${name}">${toJson(part.input)}</tool_call>`);
                        break;
                    }
                    case "tool-result": {
                        const id = String(part.toolCallId ?? "");
                        const name = String(part.toolName ?? "");
                        const output = truncate(toJson(part.output));
                        inner.push(`<tool_result for="${id}" name="${name}">${output}</tool_result>`);
                        break;
                    }
                    default:
                        // Unknown part types are kept verbatim so nothing is silently dropped.
                        inner.push(JSON.stringify(part));
                        break;
                }
            }
        }
        if (inner.length === 0) continue;
        turn += 1;
        blocks.push(`<turn n="${turn}" role="${role}">\n${inner.join("\n")}\n</turn>`);
    }
    return blocks.join("\n");
}
107+
108+
/**
 * Prints one conversation step to the console in a colour-coded, human-readable
 * form to help debug eval failures. Does nothing unless the `DEBUG` environment
 * variable is set.
 *
 * @param role Label printed in front of every line (e.g. `ASSISTANT`, `ASSISTANT[JUDGE-BOT]`).
 * @param step Step to print; `reasoningText`, `text`, `toolResults` and `toolCalls` are read.
 */
export function debugStep(role: string, step: OnStepFinishEvent<any>): void {
    if (!process.env.DEBUG) {
        return;
    }

    const CYAN = "\x1b[36m";
    const GREEN = "\x1b[32m";
    const YELLOW = "\x1b[33m";
    const MAGENTA = "\x1b[35m";
    const RED = "\x1b[31m";
    const BLUE = "\x1b[34m";
    const RESET = "\x1b[0m";

    const prefix = `${role} (#${step.stepNumber}): `;
    const print = (color: string, body: string): void => {
        console.log(`${color}${prefix}${body}${RESET}`);
    };

    if (step.reasoningText) {
        print(CYAN, `REASONING: ${step.reasoningText}`);
    }

    if (step.text) {
        // Assistant-side roles get their own colour; everything else is yellow.
        const textColors = new Map<string, string>([
            [`${ROLE.ASSISTANT}[${ROLE.FOLLOW_UP_BOT}]`, RED],
            [`${ROLE.ASSISTANT}[${ROLE.JUDGE_BOT}]`, MAGENTA],
            [ROLE.ASSISTANT, GREEN],
        ]);
        print(textColors.get(role) ?? YELLOW, `${step.text}`);
    }

    const results = step.toolResults;
    const calls = step.toolCalls;
    if (results && results.length > 0) {
        const first = results[0]!;
        // A lone submit-score / submit-follow-up result is the bot's final decision; show only its input.
        const isVerdict =
            results.length === 1 && (first.toolName === "submit-score" || first.toolName === "submit-follow-up");
        if (isVerdict) {
            print(GREEN, `VERDICT: ${JSON.stringify(first.input, null, 2)}`);
        } else {
            print(BLUE, `TOOL-CALL: ${JSON.stringify(results, null, 2)}`);
        }
    } else if (calls && calls.length > 0) {
        // Tool calls are only printed when the step carries no results.
        print(YELLOW, `TOOL-REQUEST: ${JSON.stringify(calls, null, 2)}`);
    }
}
Lines changed: 84 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,84 @@
1+
import * as untracedAi from "ai";
2+
import { wrapAISDK, traced } from "braintrust";
3+
import { z } from "zod";
4+
5+
import type { Model } from "../../sdk/models.js";
6+
import type { FollowUpResult } from "./scaffolding.types.js";
7+
import { type Conversation, ROLE, serializeMessages, debugStep } from "./conversation.js";
8+
9+
const ai = wrapAISDK(untracedAi);
10+
11+
/**
 * Input schema of the `submit-follow-up` tool: the follow-up bot's decision on
 * whether another user message should be sent, plus that message when it should.
 */
const followUpSchema = z.object({
    // Whether any follow-up instruction still applies.
    hasFollowUp: z
        .boolean()
        .describe(
            "true if a follow-up instruction is still applicable and would advance the conversation, false if none are"
        ),
    // Free-text rationale for the decision.
    explanation: z.string().describe("brief explanation of the decision"),
    // Optional at the schema level even though the description asks for it when hasFollowUp=true.
    response: z.string().optional().describe("message to send as the user (required when hasFollowUp=true)"),
});
18+
19+
/**
 * Simulated human tester: reads the conversation so far and decides whether one
 * of the scripted follow-up instructions should be sent as the next user message.
 */
export class FollowUpBot {
    /** @param model Wrapper that supplies the language model via `getModel()`. */
    constructor(private model: Model) {}

    /**
     * Asks the model whether a follow-up is warranted for the given conversation.
     *
     * @param conversation Conversation whose messages are serialized and shown to the bot.
     * @param instructions One or more follow-up instructions the bot may act on.
     * @returns `{ hasFollowUp: true, response }` when the bot submitted a usable follow-up
     *   message, otherwise `{ hasFollowUp: false }`. The latter is also returned when the
     *   model never calls `submit-follow-up`, or claims a follow-up but gives no message.
     */
    async decide(conversation: Conversation, instructions: string | string[]): Promise<FollowUpResult> {
        const instructionsArray = Array.isArray(instructions) ? instructions : [instructions];
        const conversationSummary = serializeMessages(conversation.getMessages());

        // We capture structured data via tool calls rather than forcing schema on text generation,
        // which preserves the model's reasoning for debugging.
        let followUpResult: z.infer<typeof followUpSchema> = {
            hasFollowUp: false,
            explanation: "No follow-up needed by default",
        };
        const submitFollowUpTool = untracedAi.tool({
            description: "Submit your follow-up decision. Call this exactly once.",
            inputSchema: followUpSchema,
            execute: async (input) => {
                followUpResult = input;
                return { ok: true };
            },
        });

        await traced(
            async () => {
                const userPrompt = `Process this conversation:\n${conversationSummary}`;
                // The prompt is logged as a synthetic "step 0" so it shows up in the debug trace.
                debugStep(
                    `${ROLE.USER}[${ROLE.FOLLOW_UP_BOT}]`,
                    { stepNumber: 0, text: userPrompt } as Parameters<typeof debugStep>[1]
                );

                await ai.generateText({
                    model: this.model.getModel(),
                    system: buildFollowUpSystemPrompt(instructionsArray),
                    messages: [{ role: "user" as const, content: userPrompt }],
                    tools: { "submit-follow-up": submitFollowUpTool },
                    onStepFinish: (step) => debugStep(`${ROLE.ASSISTANT}[${ROLE.FOLLOW_UP_BOT}]`, step),
                    stopWhen: [ai.hasToolCall("submit-follow-up")],
                });
            },
            { name: "follow-up-bot" }
        );

        // `response` is optional in the schema, so the model can claim a follow-up without
        // providing one. Sending an undefined or blank user message would corrupt the
        // conversation, so that case is treated as "no follow-up".
        const { hasFollowUp, response } = followUpResult;
        if (hasFollowUp && typeof response === "string" && response.trim() !== "") {
            return {
                hasFollowUp: true,
                response,
            };
        }

        return {
            hasFollowUp: false,
        };
    }
}
70+
71+
/**
 * Builds the follow-up bot's system prompt: a fixed preamble, the follow-up
 * instructions as a 1-based numbered list, and the behaviour rules.
 *
 * @param instructions Follow-up instructions, listed in the given order.
 * @returns The prompt as a single newline-joined string.
 */
function buildFollowUpSystemPrompt(instructions: string[]): string {
    const preamble = [
        "You are a human tester working with a MongoDB AI assistant.",
        "You receive a conversation transcript (including tool calls and results) and follow-up instructions.",
        "",
        "### Follow-up instructions",
    ];

    const numbered: string[] = [];
    instructions.forEach((instruction, index) => {
        numbered.push(`${index + 1}. ${instruction}`);
    });

    const behaviour = [
        "",
        "### Behaviour",
        "- Review the conversation and determine if any follow-up instruction is still applicable and would advance the conversation.",
        "- If yes: call `submit-follow-up` with hasFollowUp=true and a `response` written as a user directive in first person as concise as possible.",
        "- If no applicable instructions remain: call `submit-follow-up` with hasFollowUp=false.",
    ];

    return preamble.concat(numbered, behaviour).join("\n");
}

0 commit comments

Comments
 (0)