mongodb-js
diff --git a/‎package.json‎
Lines changed: 1 addition & 0 deletions b/‎package.json‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎pnpm-lock.yaml‎
Lines changed: 509 additions & 4 deletions b/‎pnpm-lock.yaml‎
Lines changed: 509 additions & 4 deletions
diff --git a/‎tests/accuracy/eval/indexManagementTextSearchIndexCreation.eval.ts‎
Lines changed: 81 additions & 0 deletions b/‎tests/accuracy/eval/indexManagementTextSearchIndexCreation.eval.ts‎
Lines changed: 81 additions & 0 deletions
diff --git a/‎tests/accuracy/eval/infra/conversation.ts‎
Lines changed: 148 additions & 0 deletions b/‎tests/accuracy/eval/infra/conversation.ts‎
Lines changed: 148 additions & 0 deletions
diff --git a/‎tests/accuracy/eval/infra/followUpBot.ts‎
Lines changed: 84 additions & 0 deletions b/‎tests/accuracy/eval/infra/followUpBot.ts‎
Lines changed: 84 additions & 0 deletions
@@ -121,6 +121,7 @@
     "@vitest/coverage-v8": "^4.1.2",
     "@vitest/eslint-plugin": "^1.6.9",
     "ai": "^6.0.116",
+    "braintrust": "^3.9.0",
     "concurrently": "^9.2.1",
     "duplexpair": "^1.0.2",
     "eslint": "^9.34.0",
 
@@ -0,0 +1,81 @@
+import { runEval } from "./infra/scaffolding.js";
+
+// System prompt shared by every case in this eval file.
+const context =
+    "You are a MongoDB expert. For all operations use the 'movies' collection by default.";
+
+// Base seed entry for the 'movies' collection. Cases that need a pre-existing
+// search index spread this object and add an `indexes` array on top of it.
+const mflixMovies = {
+    collection: "movies",
+    documents: "tests/accuracy/test-data-dumps/mflix.movies-with-plot.json",
+};
+
+// Registers the search index-management cases with the eval scaffolding.
+// NOTE(review): "<model_name>" in experimentName/tags looks like a placeholder that
+// the scaffolding substitutes per model — confirm in ./infra/scaffolding.
+runEval({
+    // NOTE(review): presumably requests a cluster with search support — confirm in scaffolding.
+    clusterConfig: { search: true },
+    maxConcurrency: 5,
+    experimentName: "search-benchmark-<model_name>",
+    id: "idx-management",
+    tags: ["<model_name>", "search-benchmark", "index-management"],
+    data: [
+        // Case: no index is seeded; the prompt asks for a search index with dynamic mapping.
+        {
+            id: "idx-create-dynamic",
+            input: {
+                systemPrompt: context,
+                userPrompt: "Create a search index on 'movies' collection with dynamic mapping.",
+                dbClusterSeed: {
+                    collections: [mflixMovies],
+                },
+            },
+            assertions:
+                "Look up all indexes of 'movies' you should see one search index with dynamic mapping.",
+        },
+        // Case: a search index named 'movies_title_text' (string mapping on `title`) is seeded
+        // and the prompt asks to remove it. The assertions penalise needing user intervention.
+        {
+            id: "idx-delete",
+            input: {
+                systemPrompt: context,
+                userPrompt: "Remove index 'movies_title_text'.",
+                // NOTE(review): presumably consumed by the follow-up bot to decide whether to
+                // answer the assistant on the user's behalf — confirm in scaffolding.
+                followUpInstructions: [
+                    "If after a failed attempt assistant suggests another action that could potentially lead to successfully deleting it, allow it.",
+                ],
+                dbClusterSeed: {
+                    collections: [
+                        {
+                            ...mflixMovies,
+                            indexes: [
+                                {
+                                    type: "search",
+                                    name: "movies_title_text",
+                                    definition: { mappings: { fields: { title: { type: "string" } } } },
+                                },
+                            ],
+                        },
+                    ],
+                },
+            },
+            assertions: [
+                "Confirm that a 'search' index with 'movies_title_text' is found and successfully deleted from 'movies' collection.",
+                "If assistant fails to find the index eventually consider the test failed.",
+                "Reduce score by 25% if it needed user intervention to delete the index.",
+            ],
+        },
+        // Case: a search index named 'movies_plot_text' (string mapping on `plot`) is seeded and
+        // the prompt combines a genre condition with a text search over the plot.
+        {
+            id: "idx-query-must",
+            input: {
+                systemPrompt: context,
+                userPrompt: "Find movies with 'Romance' in genres and 'rich British person in India' (use text search) in its plot.",
+                dbClusterSeed: {
+                    collections: [{
+                        ...mflixMovies,
+                        indexes: [
+                            {
+                                type: "search",
+                                name: "movies_plot_text",
+                                definition: { mappings: { fields: { plot: { type: "string" } } } },
+                            },
+                        ],
+                    }]
+                },
+            },
+            assertions:
+                "The assistant is expected to return at least 1 document, the first returned result should be the document with id 'fbf30e42-ae6d-4775-bb3e-c5c127ddea06' from 'movies' collection.",
+        }
+    ],
+});
@@ -0,0 +1,148 @@
+import * as untracedAi from "ai";
+import type { ModelMessage, OnStepFinishEvent } from "ai";
+import { wrapAISDK } from "braintrust";
+
+import type { VercelMCPClientTools } from "../../sdk/agent.js";
+import type { Model } from "../../sdk/models.js";
+
+const ai = wrapAISDK(untracedAi); // wraps Vercel AI SDK for Braintrust tracing
+
+// Upper bound on LLM steps for a single `Conversation.converse()` call. It is passed to
+// `stepCountIs` as the stop condition of each `generateText` invocation, so the cap applies
+// per call rather than across the whole message history. It exists to forcefully stop
+// runs that would otherwise loop indefinitely in failure cases.
+const DEFAULT_STEP_COUNT = 10;
+
+// Maximum number of characters of a JSON-serialized tool result kept by `serializeMessages`.
+// The serialized conversation is fed to the judge bot and follow-up bot, and verbose tool
+// results would otherwise overwhelm their context windows.
+const CONVERSATION_SERIALIZER_MAX_TOOL_OUTPUT_CHARS = 4000;
+
+// Role labels used when logging conversation steps. `debugStep` picks its text colour by
+// comparing the label against `ASSISTANT` and against the composite forms
+// `ASSISTANT[FOLLOW-UP-BOT]` / `ASSISTANT[JUDGE-BOT]` built from these values.
+// `as const` keeps each value as a string literal type.
+export const ROLE = {
+    USER: "USER",
+    ASSISTANT: "ASSISTANT",
+    FOLLOW_UP_BOT: "FOLLOW-UP-BOT",
+    JUDGE_BOT: "JUDGE-BOT",
+} as const;
+
+// Holds the running message history of one assistant conversation together with the
+// tools and model used to advance it.
+export class Conversation {
+    // Accumulated history; grows with every user prompt and every model response.
+    private messages: ModelMessage[];
+
+    // `initialMessages` is copied, so later appends never touch the caller's array.
+    constructor(
+        readonly tools: VercelMCPClientTools,
+        readonly model: Model,
+        initialMessages: ModelMessage[] = []
+    ) {
+        this.messages = initialMessages.slice();
+    }
+
+    // Records `userPrompt` as a user message, runs the model (with tools) over the full
+    // history until it stops or the step cap is reached, then appends the messages the
+    // model produced. Each finished step is forwarded to `debugStep`.
+    async converse(systemPrompt: string, userPrompt: string): Promise<void> {
+        const userTurn: ModelMessage = { role: "user", content: userPrompt };
+        // Step 0 is a synthetic entry so the user's prompt shows up in the debug log.
+        debugStep(ROLE.USER, { stepNumber: 0, text: userPrompt } as OnStepFinishEvent<any>);
+        this.appendMessages(userTurn);
+
+        const { response } = await ai.generateText({
+            model: this.model.getModel(),
+            system: systemPrompt,
+            messages: this.getMessages(),
+            tools: this.tools,
+            onStepFinish: (step) => {
+                debugStep(ROLE.ASSISTANT, step);
+            },
+            stopWhen: ai.stepCountIs(DEFAULT_STEP_COUNT),
+        });
+
+        this.appendMessages(...response.messages);
+    }
+
+    // Returns the live history array (not a copy).
+    getMessages(): ModelMessage[] {
+        return this.messages;
+    }
+
+    // Appends the given messages to the history in order.
+    private appendMessages(...messages: ModelMessage[]): void {
+        for (const message of messages) {
+            this.messages.push(message);
+        }
+    }
+}
+
+// Produces numbered <turn> XML blocks consumed by the follow-up bot and judge bot as their conversation input.
+//
+// Each message with non-empty content becomes one
+// `<turn n="…" role="…">…</turn>` block; messages that yield no content are skipped and do
+// not consume a turn number. String content is emitted as-is. For array content, text parts
+// are emitted verbatim, tool calls and tool results are wrapped in `<tool_call>` /
+// `<tool_result>` elements carrying the JSON of their input / output, and any other part
+// type is emitted as its raw JSON. Only tool results are truncated.
+//
+// @param messages conversation history to serialize
+// @returns the turn blocks joined by newlines ("" when nothing was serializable)
+export function serializeMessages(messages: ModelMessage[]): string {
+    // JSON.stringify returns `undefined` (not a string) for `undefined`, functions and
+    // symbols. Falling back to the text "undefined" keeps the helper total, so a tool
+    // result without an output can no longer crash `truncate` on `s.length`, while tool
+    // calls without an input keep rendering exactly as before.
+    const toJson = (value: unknown): string => {
+        const json: string | undefined = JSON.stringify(value);
+        return json ?? "undefined";
+    };
+    const truncate = (s: string, max: number): string =>
+        s.length <= max ? s : `${s.slice(0, max)}…[truncated ${s.length - max} chars]`;
+    const blocks: string[] = [];
+    let turn = 0;
+    for (const msg of messages) {
+        const role = String(msg.role ?? "unknown");
+        const content = (msg as Record<string, unknown>).content;
+        const inner: string[] = [];
+        if (typeof content === "string") {
+            if (content) inner.push(content);
+        } else if (Array.isArray(content)) {
+            for (const part of content as Record<string, unknown>[]) {
+                switch (part.type) {
+                    case "text":
+                        if (part.text) inner.push(String(part.text));
+                        break;
+                    case "tool-call": {
+                        const id = String(part.toolCallId ?? "");
+                        const name = String(part.toolName ?? "");
+                        inner.push(`<tool_call id="${id}" name="${name}">${toJson(part.input)}</tool_call>`);
+                        break;
+                    }
+                    case "tool-result": {
+                        const id = String(part.toolCallId ?? "");
+                        const name = String(part.toolName ?? "");
+                        // Verbose tool output is capped so it cannot flood the consumer's context window.
+                        const output = truncate(
+                            toJson(part.output),
+                            CONVERSATION_SERIALIZER_MAX_TOOL_OUTPUT_CHARS
+                        );
+                        inner.push(`<tool_result for="${id}" name="${name}">${output}</tool_result>`);
+                        break;
+                    }
+                    default:
+                        // Unknown part types are kept verbatim rather than silently dropped.
+                        inner.push(toJson(part));
+                }
+            }
+        }
+        if (inner.length === 0) continue;
+        turn += 1;
+        blocks.push(`<turn n="${turn}" role="${role}">\n${inner.join("\n")}\n</turn>`);
+    }
+    return blocks.join("\n");
+}
+
+// Prints conversation progress in a human-friendly, colour-coded format to make eval
+// failures easier to debug. Does nothing unless the DEBUG environment variable is set.
+// Per step it logs, in order: the reasoning text (if any), the text (if any), and then
+// either the tool results or, when there are none, the requested tool calls.
+export function debugStep(role: string, step: OnStepFinishEvent<any>): void {
+    if (!process.env.DEBUG) return;
+
+    const cyan = "\x1b[36m";
+    const green = "\x1b[32m";
+    const yellow = "\x1b[33m";
+    const magenta = "\x1b[35m";
+    const red = "\x1b[31m";
+    const blue = "\x1b[34m";
+    const reset = "\x1b[0m";
+
+    // Every log line shares the same "<role> (#<step>): " prefix and colour reset suffix.
+    const emit = (color: string, body: string): void => {
+        console.log(`${color}${role} (#${step.stepNumber}): ${body}${reset}`);
+    };
+
+    // Text colour per speaker; any role not listed here is printed in yellow.
+    const textColors = new Map<string, string>([
+        [`${ROLE.ASSISTANT}[${ROLE.FOLLOW_UP_BOT}]`, red],
+        [`${ROLE.ASSISTANT}[${ROLE.JUDGE_BOT}]`, magenta],
+        [ROLE.ASSISTANT, green],
+    ]);
+
+    if (step.reasoningText) {
+        emit(cyan, `REASONING: ${step.reasoningText}`);
+    }
+    if (step.text) {
+        emit(textColors.get(role) ?? yellow, `${step.text}`);
+    }
+
+    const hasResults = Boolean(step.toolResults) && step.toolResults.length > 0;
+    if (!hasResults) {
+        // No results yet: show what the model asked for, if anything.
+        if (step.toolCalls && step.toolCalls.length > 0) {
+            emit(yellow, `TOOL-REQUEST: ${JSON.stringify(step.toolCalls, null, 2)}`);
+        }
+        return;
+    }
+
+    const first = step.toolResults[0]!;
+    // A lone submit-score / submit-follow-up result is the bot's verdict, so only its
+    // input is printed instead of the whole tool-result payload.
+    const isVerdict =
+        step.toolResults.length === 1 &&
+        (first.toolName === "submit-score" || first.toolName === "submit-follow-up");
+    if (isVerdict) {
+        emit(green, `VERDICT: ${JSON.stringify(first.input, null, 2)}`);
+    } else {
+        emit(blue, `TOOL-CALL: ${JSON.stringify(step.toolResults, null, 2)}`);
+    }
+}
@@ -0,0 +1,84 @@
+import * as untracedAi from "ai";
+import { wrapAISDK, traced } from "braintrust";
+import { z } from "zod";
+
+import type { Model } from "../../sdk/models.js";
+import type { FollowUpResult } from "./scaffolding.types.js";
+import { type Conversation, ROLE, serializeMessages, debugStep } from "./conversation.js";
+
+// Vercel AI SDK wrapped with Braintrust's `wrapAISDK`.
+const ai = wrapAISDK(untracedAi);
+
+// Input schema of the `submit-follow-up` tool, i.e. the structured decision the follow-up
+// bot hands back. The `.describe()` texts are part of the schema given to the model.
+// Note that `response` is optional at the schema level: the "required when
+// hasFollowUp=true" rule is only stated in its description and is not enforced here.
+const followUpSchema = z.object({
+    hasFollowUp: z
+        .boolean()
+        .describe("true if a follow-up instruction is still applicable and would advance the conversation, false if none are"),
+    explanation: z.string().describe("brief explanation of the decision"),
+    response: z.string().optional().describe("message to send as the user (required when hasFollowUp=true)"),
+});
+
+// Simulated user that decides, from a conversation transcript and a list of follow-up
+// instructions, whether another user message should be sent to the assistant.
+export class FollowUpBot {
+    constructor(private readonly model: Model) { }
+
+    // Asks the model whether any of `instructions` still applies to `conversation`.
+    //
+    // @param conversation conversation whose messages are serialized into the prompt
+    // @param instructions one instruction or a list of them; a single string is wrapped
+    // @returns `{ hasFollowUp: true, response }` when the model submitted a follow-up with
+    //          a non-blank message, otherwise `{ hasFollowUp: false }` (including when
+    //          the model never calls the tool)
+    async decide(conversation: Conversation, instructions: string | string[]): Promise<FollowUpResult> {
+        const instructionsArray = Array.isArray(instructions) ? instructions : [instructions];
+        const conversationSummary = serializeMessages(conversation.getMessages());
+
+        // We capture structured data via tool calls rather than forcing schema on text generation,
+        // which preserves the model's reasoning for debugging.
+        // This default stays in effect if the model never calls the tool.
+        let followUpResult: z.infer<typeof followUpSchema> = {
+            hasFollowUp: false,
+            explanation: "No follow-up needed by default",
+        };
+        const submitFollowUpTool = untracedAi.tool({
+            description: "Submit your follow-up decision. Call this exactly once.",
+            inputSchema: followUpSchema,
+            execute: async (input) => {
+                followUpResult = input;
+                return { ok: true };
+            },
+        });
+
+        await traced(
+            async () => {
+                const userPrompt = `Process this conversation:\n${conversationSummary}`;
+                // Step 0 is a synthetic entry so the prompt sent to the bot shows up in the debug log.
+                debugStep(
+                    `${ROLE.USER}[${ROLE.FOLLOW_UP_BOT}]`,
+                    { stepNumber: 0, text: userPrompt } as Parameters<typeof debugStep>[1]
+                );
+
+                await ai.generateText({
+                    model: this.model.getModel(),
+                    system: buildFollowUpSystemPrompt(instructionsArray),
+                    messages: [{ role: "user" as const, content: userPrompt }],
+                    tools: { "submit-follow-up": submitFollowUpTool },
+                    onStepFinish: (step) => debugStep(`${ROLE.ASSISTANT}[${ROLE.FOLLOW_UP_BOT}]`, step),
+                    // Stop as soon as the decision has been submitted.
+                    stopWhen: [ai.hasToolCall("submit-follow-up")],
+                });
+            },
+            { name: "follow-up-bot" }
+        );
+
+        // The schema leaves `response` optional, so the model can claim a follow-up without
+        // supplying any text. Sending an undefined or blank user message would be
+        // meaningless, so that case is reported as "no follow-up" instead of being hidden
+        // behind a non-null assertion.
+        const { hasFollowUp, response } = followUpResult;
+        if (hasFollowUp && response !== undefined && response.trim() !== "") {
+            return {
+                hasFollowUp: true,
+                response,
+            };
+        }
+        return {
+            hasFollowUp: false,
+        };
+    }
+}
+
+// Assembles the follow-up bot's system prompt: a fixed preamble, the instructions as a
+// 1-based numbered list, and the fixed behaviour rules, joined with newlines.
+function buildFollowUpSystemPrompt(instructions: string[]): string {
+    const preamble: string[] = [
+        "You are a human tester working with a MongoDB AI assistant.",
+        "You receive a conversation transcript (including tool calls and results) and follow-up instructions.",
+        "",
+        "### Follow-up instructions",
+    ];
+    const numbered = instructions.map((text, position) => `${position + 1}. ${text}`);
+    const behaviour: string[] = [
+        "",
+        "### Behaviour",
+        "- Review the conversation and determine if any follow-up instruction is still applicable and would advance the conversation.",
+        "- If yes: call `submit-follow-up` with hasFollowUp=true and a `response` written as a user directive in first person as concise as possible.",
+        "- If no applicable instructions remain: call `submit-follow-up` with hasFollowUp=false.",
+    ];
+    return preamble.concat(numbered, behaviour).join("\n");
+}