diff --git a/examples/node-agent-live/.gitignore b/examples/node-agent-live/.gitignore
new file mode 100644
index 0000000..d7e3f41
--- /dev/null
+++ b/examples/node-agent-live/.gitignore
@@ -0,0 +1,2 @@
+chatlog.txt
+output-*.wav
\ No newline at end of file
diff --git a/examples/node-agent-live/index.js b/examples/node-agent-live/index.js
new file mode 100644
index 0000000..9a4f98d
--- /dev/null
+++ b/examples/node-agent-live/index.js
@@ -0,0 +1,110 @@
+const { writeFile, appendFile } = require("fs/promises");
+const { createClient, AgentEvents } = require("../../dist/main/index");
+const fetch = require("cross-fetch");
+const { join } = require("path");
+
+const deepgram = createClient(process.env.DEEPGRAM_API_KEY);
+
+const agent = async () => {
+  let audioBuffer = Buffer.alloc(0);
+  let i = 0;
+  const url = "https://dpgr.am/spacewalk.wav";
+  const connection = deepgram.agent();
+
+  connection.on(AgentEvents.Welcome, () => {
+    console.log("Welcome to the Deepgram Voice Agent!");
+
+    connection.configure({
+      audio: {
+        input: {
+          encoding: "linear16",
+          sampleRate: 44100,
+        },
+        output: {
+          encoding: "linear16",
+          sampleRate: 16000,
+          container: "wav",
+        },
+      },
+      agent: {
+        listen: {
+          model: "nova-2",
+        },
+        speak: {
+          model: "aura-asteria-en",
+        },
+        think: {
+          provider: {
+            type: "open_ai",
+          },
+          model: "gpt-4o-mini",
+        },
+      },
+    });
+
+    console.log("Deepgram agent configured!");
+
+    // Keep the socket open while no audio is being transmitted.
+    setInterval(() => {
+      console.log("Keep alive!");
+      connection.keepAlive();
+    }, 5000);
+
+    // Stream the sample WAV file to the agent as the user's audio.
+    fetch(url)
+      .then((r) => r.body)
+      .then((res) => {
+        res.on("readable", () => {
+          console.log("Sending audio chunk");
+          connection.send(res.read());
+        });
+      });
+  });
+
+  connection.on(AgentEvents.Open, () => {
+    console.log("Connection opened");
+  });
+
+  connection.on(AgentEvents.Close, () => {
+    console.log("Connection closed");
+    process.exit(0);
+  });
+
+  connection.on(AgentEvents.ConversationText, async (data) => {
+    await appendFile(join(__dirname, `chatlog.txt`), JSON.stringify(data) + "\n");
+  });
+
+  connection.on(AgentEvents.UserStartedSpeaking, () => {
+    // If the user interrupts the agent, discard any buffered agent audio.
+    if (audioBuffer.length) {
+      console.log("Interrupting agent.");
+      audioBuffer = Buffer.alloc(0);
+    }
+  });
+
+  connection.on(AgentEvents.Audio, (data) => {
+    console.log("Audio chunk received");
+    // Concatenate the audio chunks into a single buffer
+    const buffer = Buffer.from(data);
+    audioBuffer = Buffer.concat([audioBuffer, buffer]);
+  });
+
+  connection.on(AgentEvents.Error, (err) => {
+    console.error("Error!");
+    console.error(JSON.stringify(err, null, 2));
+    console.error(err.message);
+  });
+
+  connection.on(AgentEvents.AgentAudioDone, async () => {
+    console.log("Agent audio done");
+    await writeFile(join(__dirname, `output-${i}.wav`), audioBuffer);
+    audioBuffer = Buffer.alloc(0);
+    i++;
+  });
+
+  connection.on(AgentEvents.Unhandled, (data) => {
+    console.dir(data, { depth: null });
+  });
+};
+
+void agent();
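Since `cross-fetch` resolves to `node-fetch` under Node, `r.body` above is a Node `Readable`, so the same streaming loop can also be written with async iteration inside `agent()`. A minimal sketch of that alternative:

const response = await fetch(url);
// The body is a Node Readable stream in this environment, so it can be
// consumed chunk by chunk with for-await; each chunk is a Buffer.
for await (const chunk of response.body) {
  connection.send(chunk);
}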
diff --git a/src/DeepgramClient.ts b/src/DeepgramClient.ts
index 63a0956..67da0fd 100644
--- a/src/DeepgramClient.ts
+++ b/src/DeepgramClient.ts
@@ -1,6 +1,7 @@
 import { DeepgramVersionError } from "./lib/errors";
 import {
   AbstractClient,
+  AgentLiveClient,
   ListenClient,
   ManageClient,
   ReadClient,
@@ -80,6 +81,16 @@ export default class DeepgramClient extends AbstractClient {
     return new SpeakClient(this.options);
   }
 
+  /**
+   * Returns a new instance of the AgentLiveClient, which provides access to Deepgram's Voice Agent API.
+   *
+   * @returns {AgentLiveClient} A new instance of the AgentLiveClient.
+   * @beta
+   */
+  public agent(endpoint: string = "/agent"): AgentLiveClient {
+    return new AgentLiveClient(this.options, endpoint);
+  }
+
   /**
    * @deprecated
    * @see https://dpgr.am/js-v3
diff --git a/src/lib/constants.ts b/src/lib/constants.ts
index a464e83..b41c091 100644
--- a/src/lib/constants.ts
+++ b/src/lib/constants.ts
@@ -36,6 +36,7 @@ export const DEFAULT_HEADERS = {
 };
 
 export const DEFAULT_URL = "https://api.deepgram.com";
+export const DEFAULT_AGENT_URL = "wss://agent.deepgram.com";
 
 export const DEFAULT_GLOBAL_OPTIONS: Partial<DefaultNamespaceOptions> = {
   fetch: { options: { url: DEFAULT_URL, headers: DEFAULT_HEADERS } },
   websocket: {
     options: { url: DEFAULT_URL, _nodeOnlyHeaders: DEFAULT_HEADERS },
   },
 };
 
@@ -44,8 +45,16 @@ export const DEFAULT_GLOBAL_OPTIONS: Partial<DefaultNamespaceOptions> = {
+export const DEFAULT_AGENT_OPTIONS: Partial<DefaultNamespaceOptions> = {
+  fetch: { options: { url: DEFAULT_URL, headers: DEFAULT_HEADERS } },
+  websocket: {
+    options: { url: DEFAULT_AGENT_URL, _nodeOnlyHeaders: DEFAULT_HEADERS },
+  },
+};
+
 export const DEFAULT_OPTIONS: DefaultClientOptions = {
   global: DEFAULT_GLOBAL_OPTIONS,
+  agent: DEFAULT_AGENT_OPTIONS,
 };
 
 export enum SOCKET_STATES {
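Because `agent` is an ordinary namespace in `DeepgramClientOptions`, the websocket URL can be overridden per client. A minimal sketch, where `wss://agent.example.com` is a placeholder for a proxy or self-hosted endpoint, not a real service:

import { createClient } from "@deepgram/sdk";

// Override the agent websocket URL; when omitted, the client falls back to
// DEFAULT_AGENT_URL ("wss://agent.deepgram.com").
const deepgram = createClient(process.env.DEEPGRAM_API_KEY, {
  agent: {
    websocket: {
      options: { url: "wss://agent.example.com" },
    },
  },
});

const connection = deepgram.agent();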
diff --git a/src/lib/enums/AgentEvents.ts b/src/lib/enums/AgentEvents.ts
new file mode 100644
index 0000000..818cf8f
--- /dev/null
+++ b/src/lib/enums/AgentEvents.ts
@@ -0,0 +1,78 @@
+export enum AgentEvents {
+  /**
+   * Built in socket events.
+   */
+  Open = "Open",
+  Close = "Close",
+  Error = "Error",
+  /**
+   * Emitted whenever a binary (agent audio) message is received.
+   */
+  Audio = "Audio",
+  /**
+   * Confirms the successful connection to the websocket.
+   * { type: "Welcome", session_id: "String" }
+   */
+  Welcome = "Welcome",
+  /**
+   * Confirms that your `configure` request was successful.
+   * { type: "SettingsApplied" }
+   */
+  SettingsApplied = "SettingsApplied",
+  /**
+   * Triggered when the agent "hears" the user say something.
+   * { type: "ConversationText", role: string, content: string }
+   */
+  ConversationText = "ConversationText",
+  /**
+   * Triggered when the agent begins receiving user audio.
+   * { type: "UserStartedSpeaking" }
+   */
+  UserStartedSpeaking = "UserStartedSpeaking",
+  /**
+   * Triggered when the user has stopped speaking and the agent is processing the audio.
+   * { type: "AgentThinking", content: string }
+   */
+  AgentThinking = "AgentThinking",
+  /**
+   * A request to call client-side functions.
+   * { type: "FunctionCallRequest", function_call_id: string, function_name: string, input: Record<string, any> }
+   */
+  FunctionCallRequest = "FunctionCallRequest",
+  /**
+   * Debug message triggered when the agent is calling a function.
+   * { type: "FunctionCalling" }
+   */
+  FunctionCalling = "FunctionCalling",
+  /**
+   * Triggered when the agent begins streaming an audio response.
+   * { type: "AgentStartedSpeaking", total_latency: number, tts_latency: number, ttt_latency: number }
+   */
+  AgentStartedSpeaking = "AgentStartedSpeaking",
+  /**
+   * Triggered when the agent has finished streaming an audio response.
+   * { type: "AgentAudioDone" }
+   */
+  AgentAudioDone = "AgentAudioDone",
+  /**
+   * Emitted when an `InjectAgentMessage` request is refused because it was sent
+   * while the user was speaking or while the server was processing user audio.
+   * { type: "InjectionRefused" }
+   */
+  InjectionRefused = "InjectionRefused",
+  /**
+   * A successful response to the `UpdateInstructions` request.
+   * { type: "InstructionsUpdated" }
+   */
+  InstructionsUpdated = "InstructionsUpdated",
+  /**
+   * A successful response to the `UpdateSpeak` request.
+   * { type: "SpeakUpdated" }
+   */
+  SpeakUpdated = "SpeakUpdated",
+
+  /**
+   * Catch-all for any other message event.
+   */
+  Unhandled = "Unhandled",
+}
diff --git a/src/lib/enums/index.ts b/src/lib/enums/index.ts
index d85c2cd..a4a3192 100644
--- a/src/lib/enums/index.ts
+++ b/src/lib/enums/index.ts
@@ -1,3 +1,4 @@
+export * from "./AgentEvents";
 export * from "./LiveConnectionState";
 export * from "./LiveTranscriptionEvents";
 export * from "./LiveTTSEvents";
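The function-calling flow pairs the `FunctionCallRequest` event with the client's `functionCallResponse()` method (defined in `AgentLiveClient` below). A minimal sketch, where `get_weather` and `lookUpWeather` are hypothetical:

import { createClient, AgentEvents } from "@deepgram/sdk";

const deepgram = createClient(process.env.DEEPGRAM_API_KEY);
const connection = deepgram.agent();

connection.on(AgentEvents.FunctionCallRequest, async (data) => {
  // `get_weather` must have been declared in agent.think.functions; `input`
  // carries the arguments the LLM chose, shaped by that declaration.
  if (data.function_name === "get_weather") {
    const result = await lookUpWeather(data.input.location); // hypothetical helper
    connection.functionCallResponse({
      function_call_id: data.function_call_id, // must echo the ID from the request
      output: JSON.stringify(result),
    });
  }
});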
diff --git a/src/lib/types/AgentLiveSchema.ts b/src/lib/types/AgentLiveSchema.ts
new file mode 100644
index 0000000..e781810
--- /dev/null
+++ b/src/lib/types/AgentLiveSchema.ts
@@ -0,0 +1,184 @@
+type AudioFormat =
+  | {
+      encoding: "linear16";
+      container: "wav" | "none";
+      sampleRate: 8000 | 16000 | 24000 | 32000 | 48000;
+    }
+  | {
+      encoding: "mulaw";
+      container: "wav" | "none";
+      sampleRate: 8000 | 16000;
+    }
+  | {
+      encoding: "alaw";
+      container: "wav" | "none";
+      sampleRate: 8000 | 16000;
+    };
+
+type AudioEncoding =
+  | "linear16"
+  | "flac"
+  | "mulaw"
+  | "amr-nb"
+  | "amr-wb"
+  | "opus"
+  | "speex"
+  | "g729";
+
+type ListenModel =
+  | "nova-2"
+  | "nova-2-meeting"
+  | "nova-2-phonecall"
+  | "nova-2-voicemail"
+  | "nova-2-finance"
+  | "nova-2-conversational"
+  | "nova-2-video"
+  | "nova-2-medical"
+  | "nova-2-drivethru"
+  | "nova-2-automotive"
+  | "nova-2-atc"
+  | "nova"
+  | "nova-phonecall"
+  | "enhanced"
+  | "enhanced-meeting"
+  | "enhanced-phonecall"
+  | "enhanced-finance"
+  | "base"
+  | "base-meeting"
+  | "base-phonecall"
+  | "base-voicemail"
+  | "base-finance"
+  | "base-conversational"
+  | "base-video"
+  | "whisper-tiny"
+  | "whisper"
+  | "whisper-small"
+  | "whisper-medium"
+  | "whisper-large";
+
+type SpeakModel =
+  | "aura-asteria-en"
+  | "aura-luna-en"
+  | "aura-stella-en"
+  | "aura-athena-en"
+  | "aura-hera-en"
+  | "aura-orion-en"
+  | "aura-arcas-en"
+  | "aura-perseus-en"
+  | "aura-angus-en"
+  | "aura-orpheus-en"
+  | "aura-helios-en"
+  | "aura-zeus-en";
+
+interface ThinkModelFunction {
+  name: string;
+  description: string;
+  url: string;
+  headers: [
+    {
+      key: "authorization";
+      value: string;
+    }
+  ];
+  method: "POST";
+  parameters: {
+    type: string;
+    properties: Record<
+      string,
+      {
+        type: string;
+        description: string;
+      }
+    >;
+  };
+}
+
+type ThinkModel =
+  | {
+      provider: {
+        type: "open_ai";
+      };
+      model: "gpt-4o-mini";
+      instructions?: string;
+      functions?: ThinkModelFunction[];
+    }
+  | {
+      provider: {
+        type: "anthropic";
+      };
+      model: "claude-3-haiku-20240307";
+      instructions?: string;
+      functions?: ThinkModelFunction[];
+    }
+  | {
+      provider: {
+        type: "groq";
+      };
+      model: string;
+      instructions?: string;
+      functions?: ThinkModelFunction[];
+    }
+  | {
+      provider: {
+        type: "custom";
+        url: string;
+        key: string;
+      };
+      model: string;
+      instructions?: string;
+      functions?: ThinkModelFunction[];
+    };
+
+/**
+ * @see https://developers.deepgram.com/reference/voicebot-api-phase-preview#settingsconfiguration
+ */
+interface AgentLiveSchema extends Record<string, unknown> {
+  audio: {
+    input?: {
+      /**
+       * @default 1
+       */
+      channels?: number;
+      encoding: AudioEncoding;
+      /**
+       * @default false
+       */
+      multichannel?: boolean;
+      sampleRate: number;
+    };
+    /**
+     * @see https://developers.deepgram.com/docs/tts-media-output-settings#audio-format-combinations
+     */
+    output?: AudioFormat;
+  };
+  agent: {
+    listen: {
+      /**
+       * @see https://developers.deepgram.com/docs/model
+       */
+      model: ListenModel;
+    };
+    speak: {
+      /**
+       * @see https://developers.deepgram.com/docs/tts-models
+       */
+      model: SpeakModel;
+    };
+    /**
+     * @see https://developers.deepgram.com/reference/voicebot-api-phase-preview#supported-llm-providers-and-models
+     */
+    think: ThinkModel;
+  };
+  context?: {
+    /**
+     * LLM message history (e.g. to restore an existing conversation if the websocket disconnects).
+     */
+    messages: { role: string; content: string }[];
+    /**
+     * Whether to replay the last message, if it is an assistant message.
+     */
+    replay: boolean;
+  };
+}
+
+export type { AgentLiveSchema, SpeakModel };
diff --git a/src/lib/types/DeepgramClientOptions.ts b/src/lib/types/DeepgramClientOptions.ts
index 808dc66..c5d4eed 100644
--- a/src/lib/types/DeepgramClientOptions.ts
+++ b/src/lib/types/DeepgramClientOptions.ts
@@ -64,6 +64,7 @@ export interface DeepgramClientOptions {
   onprem?: NamespaceOptions;
   read?: NamespaceOptions;
   speak?: NamespaceOptions;
+  agent?: NamespaceOptions;
 
   /**
    * @deprecated as of 3.4, use a namespace like `global` instead
diff --git a/src/lib/types/FunctionCallResponse.ts b/src/lib/types/FunctionCallResponse.ts
new file mode 100644
index 0000000..7cd82c6
--- /dev/null
+++ b/src/lib/types/FunctionCallResponse.ts
@@ -0,0 +1,13 @@
+/**
+ * Respond with this when you receive a `FunctionCallRequest` payload.
+ */
+export interface FunctionCallResponse {
+  /**
+   * This must be the ID that was received in the request.
+   */
+  function_call_id: string;
+  /**
+   * The result of the function call.
+   */
+  output: string;
+}
diff --git a/src/lib/types/index.ts b/src/lib/types/index.ts
index 9dd0f7b..9af2779 100644
--- a/src/lib/types/index.ts
+++ b/src/lib/types/index.ts
@@ -1,3 +1,4 @@
+export * from "./AgentLiveSchema";
 export * from "./AnalyzeSchema";
 export * from "./AsyncAnalyzeResponse";
 export * from "./AsyncPrerecordedResponse";
@@ -8,6 +9,7 @@ export * from "./DeepgramClientOptions";
 export * from "./DeepgramResponse";
 export * from "./DeepgramSource";
 export * from "./Fetch";
+export * from "./FunctionCallResponse";
 export * from "./GetModelsResponse";
 export * from "./GetModelsSchema";
 export * from "./GetProjectBalancesResponse";
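Typed against `AgentLiveSchema`, a fuller settings object looks like the sketch below; the instructions text and the `get_weather` function are illustrative, not part of the API:

import type { AgentLiveSchema } from "@deepgram/sdk";

const settings: AgentLiveSchema = {
  audio: {
    input: { encoding: "linear16", sampleRate: 16000 },
    output: { encoding: "linear16", container: "wav", sampleRate: 16000 },
  },
  agent: {
    listen: { model: "nova-2" },
    speak: { model: "aura-asteria-en" },
    think: {
      provider: { type: "open_ai" },
      model: "gpt-4o-mini",
      instructions: "You are a terse voice assistant.",
      functions: [
        {
          name: "get_weather",
          description: "Look up the current weather for a location.",
          url: "https://api.example.com/weather", // placeholder endpoint
          headers: [{ key: "authorization", value: "Bearer <token>" }],
          method: "POST",
          parameters: {
            type: "object",
            properties: {
              location: { type: "string", description: "City and country" },
            },
          },
        },
      ],
    },
  },
};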
diff --git a/src/packages/AbstractLiveClient.ts b/src/packages/AbstractLiveClient.ts
index 004710e..2ed4b45 100644
--- a/src/packages/AbstractLiveClient.ts
+++ b/src/packages/AbstractLiveClient.ts
@@ -245,8 +245,8 @@ export abstract class AbstractLiveClient extends AbstractClient {
     }
 
     if (typeof data !== "string") {
-      if (data.byteLength === 0) {
-        this.log("warn", "skipping `send` for zero-byte blob", data);
+      if (!data?.byteLength) {
+        this.log("warn", "skipping `send` for zero-byte payload", data);
         return;
       }
 
diff --git a/src/packages/AgentLiveClient.ts b/src/packages/AgentLiveClient.ts
new file mode 100644
index 0000000..f5b99fd
--- /dev/null
+++ b/src/packages/AgentLiveClient.ts
@@ -0,0 +1,178 @@
+import { DEFAULT_AGENT_URL } from "../lib/constants";
+import { AgentEvents } from "../lib/enums/AgentEvents";
+import type {
+  AgentLiveSchema,
+  SpeakModel,
+  DeepgramClientOptions,
+  FunctionCallResponse,
+} from "../lib/types";
+import { AbstractLiveClient } from "./AbstractLiveClient";
+
+export class AgentLiveClient extends AbstractLiveClient {
+  public namespace: string = "agent";
+
+  constructor(options: DeepgramClientOptions, endpoint: string = "/agent") {
+    super(options);
+    this.baseUrl = options.agent?.websocket?.options?.url ?? DEFAULT_AGENT_URL;
+
+    this.connect({}, endpoint);
+  }
+
+  /**
+   * Sets up the connection event handlers.
+   * This method handles the events that can occur on the WebSocket connection, such as opening, closing, and receiving messages.
+   * - When the connection is opened, it emits the `AgentEvents.Open` event.
+   * - When the connection is closed, it emits the `AgentEvents.Close` event.
+   * - When an error occurs on the connection, it emits the `AgentEvents.Error` event.
+   * - When a message is received, it parses the message and emits the appropriate event based on the message type.
+   */
+  public setupConnection(): void {
+    if (this.conn) {
+      this.conn.onopen = () => {
+        this.emit(AgentEvents.Open, this);
+      };
+
+      this.conn.onclose = (event: any) => {
+        this.emit(AgentEvents.Close, event);
+      };
+
+      this.conn.onerror = (event: ErrorEvent) => {
+        this.emit(AgentEvents.Error, event);
+      };
+
+      this.conn.onmessage = (event: MessageEvent) => {
+        this.handleMessage(event);
+      };
+    }
+  }
+
+  /**
+   * Handles incoming messages from the WebSocket connection.
+   * @param event - The MessageEvent object representing the received message.
+   */
+  protected handleMessage(event: MessageEvent): void {
+    if (typeof event.data === "string") {
+      try {
+        const data = JSON.parse(event.data);
+        this.handleTextMessage(data);
+      } catch (error) {
+        this.emit(AgentEvents.Error, {
+          event,
+          message: "Unable to parse `data` as JSON.",
+          error,
+        });
+      }
+    } else if (event.data instanceof Blob) {
+      event.data.arrayBuffer().then((buffer) => {
+        this.handleBinaryMessage(Buffer.from(buffer));
+      });
+    } else if (event.data instanceof ArrayBuffer) {
+      this.handleBinaryMessage(Buffer.from(event.data));
+    } else if (Buffer.isBuffer(event.data)) {
+      this.handleBinaryMessage(event.data);
+    } else {
+      this.log("warn", "Received unknown data type", event.data);
+      this.emit(AgentEvents.Error, {
+        event,
+        message: "Received unknown data type.",
+      });
+    }
+  }
+
+  /**
+   * Handles binary messages received from the WebSocket connection.
+   * @param data - The binary data.
+   */
+  protected handleBinaryMessage(data: Buffer): void {
+    this.emit(AgentEvents.Audio, data);
+  }
+
+  /**
+   * Handles text messages received from the WebSocket connection.
+   * @param data - The parsed JSON data.
+   */
+  protected handleTextMessage(data: any): void {
+    if (data.type in AgentEvents) {
+      this.emit(data.type, data);
+    } else {
+      this.emit(AgentEvents.Unhandled, data);
+    }
+  }
+
+  /**
+   * To be called with your model configuration BEFORE sending
+   * any audio data.
+   * @param options - The SettingsConfiguration object.
+   * @param options.audio.input.encoding - The encoding for your inbound (user) audio.
+   * @param options.audio.input.sampleRate - The sample rate for your inbound (user) audio.
+   * @param options.audio.output.encoding - The encoding for your outbound (agent) audio.
+   * @param options.audio.output.sampleRate - The sample rate for your outbound (agent) audio.
+   * @param options.audio.output.container - The container for your outbound (agent) audio.
+   * @param options.agent.listen.model - The STT model to use for processing user audio.
+   * @param options.agent.speak.model - The TTS model to use for generating agent audio.
+   * @param options.agent.think.provider.type - The LLM provider to use.
+   * @param options.agent.think.model - The LLM model to use.
+   * @param options.agent.think.instructions - The instructions to provide to the LLM.
+   * @param options.agent.think.functions - The functions to provide to the LLM.
+   * @param options.context.messages - The message history to provide to the LLM (useful if a websocket connection is lost).
+   * @param options.context.replay - Whether to replay the last message if it was an assistant message.
+   */
+  public configure(options: AgentLiveSchema): void {
+    // The server expects snake_case `sample_rate`, so convert the camelCase
+    // `sampleRate` properties before sending.
+    const opts: Record<string, any> = { ...options };
+    if (opts.audio.input) {
+      opts.audio.input["sample_rate"] = options.audio.input?.sampleRate;
+      delete opts.audio.input.sampleRate;
+    }
+    if (opts.audio.output) {
+      opts.audio.output["sample_rate"] = options.audio.output?.sampleRate;
+      delete opts.audio.output.sampleRate;
+    }
+    this.send(JSON.stringify({ type: "SettingsConfiguration", ...opts }));
+  }
+
+  /**
+   * Provide new instructions to the LLM.
+   * @param instructions - The instructions to provide.
+   */
+  public updateInstructions(instructions: string): void {
+    this.send(JSON.stringify({ type: "UpdateInstructions", instructions }));
+  }
+
+  /**
+   * Change the speak model.
+   * @param model - The new model to use.
+   */
+  public updateSpeak(model: SpeakModel): void {
+    this.send(JSON.stringify({ type: "UpdateSpeak", model }));
+  }
+
+  /**
+   * Immediately trigger an agent message. If this message
+   * is sent while the user is speaking, or while the server is in the
+   * middle of sending audio, the request will be ignored and an
+   * `InjectionRefused` event will be emitted.
+   * @example "Hold on while I look that up for you."
+   * @example "Are you still on the line?"
+   * @param message - The message to speak.
+   */
+  public injectAgentMessage(message: string): void {
+    this.send(JSON.stringify({ type: "InjectAgentMessage", message }));
+  }
+
+  /**
+   * Respond to a function call request.
+   * @param response - The response to the function call request.
+   * @param response.function_call_id - The ID that was received in the request (these MUST match).
+   * @param response.output - The result of the function call.
+   */
+  public functionCallResponse(response: FunctionCallResponse): void {
+    this.send(JSON.stringify({ type: "FunctionCallResponse", ...response }));
+  }
+
+  /**
+   * Send a keepalive to avoid closing the websocket while you
+   * are not transmitting audio. This should be sent at least
+   * every 8 seconds.
+   */
+  public keepAlive(): void {
+    this.send(JSON.stringify({ type: "KeepAlive" }));
+  }
+}
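For reference, after the `sampleRate` to `sample_rate` conversion in `configure()`, the message that reaches the wire for the settings sketch above looks roughly like this:

// Approximate JSON payload sent by connection.configure(settings); note the
// snake_case sample_rate keys produced by the conversion.
const payload = {
  type: "SettingsConfiguration",
  audio: {
    input: { encoding: "linear16", sample_rate: 16000 },
    output: { encoding: "linear16", container: "wav", sample_rate: 16000 },
  },
  agent: {
    listen: { model: "nova-2" },
    speak: { model: "aura-asteria-en" },
    think: { provider: { type: "open_ai" }, model: "gpt-4o-mini" },
  },
};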
diff --git a/src/packages/index.ts b/src/packages/index.ts
index cd451c5..7872e2a 100644
--- a/src/packages/index.ts
+++ b/src/packages/index.ts
@@ -1,6 +1,7 @@
 export * from "./AbstractClient";
 export * from "./AbstractLiveClient";
 export * from "./AbstractRestClient";
+export * from "./AgentLiveClient";
 export * from "./ListenClient";
 export * from "./ListenLiveClient";
 export * from "./ListenRestClient";
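Taken together, the control methods let a caller steer a session that is already running; a short sketch, assuming `connection` is an open, configured `AgentLiveClient`:

connection.updateInstructions("From now on, answer in a single sentence.");
connection.updateSpeak("aura-luna-en"); // swap the TTS voice mid-conversation
// May be refused (InjectionRefused) if the user is speaking or audio is in flight.
connection.injectAgentMessage("Are you still on the line?");
// Send at least every 8 seconds while idle to keep the socket open.
connection.keepAlive();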