Commit 832b0fa

Custom model format & multi turn chat implementation (#21)
* cleanup
* custom format
* token fetch
* context split
1 parent ff8c1c8 commit 832b0fa

15 files changed: +620 -149 lines changed

README.md

Lines changed: 38 additions & 5 deletions
@@ -51,7 +51,7 @@ const { RunInference } = require('@duck4i/llama');
 const system_prompt = "The following is a conversation with an AI assistant. The assistant is helpful, creative, clever, and very friendly.";
 const user_prompt = "What is life expectancy of a duck?";
 
-const inference = RunInference("model.gguf", system_prompt, user_prompt);
+const inference = RunInference("model.gguf", user_prompt, system_prompt, /*optional*/ 512);
 
 console.log("Answer", inference);
 
@@ -60,7 +60,7 @@ console.log("Answer", inference);
 It is likely you will want async functions for better memory management with multiple prompts, which is done like this:
 
 ```javascript
-const { LoadModelAsync, RunInferenceAsync, ReleaseModelAsync } = require('@duck4i/llama');
+const { LoadModelAsync, CreateContextAsync, RunInferenceAsync, ReleaseContextAsync, ReleaseModelAsync } = require('@duck4i/llama');
 
 const system_prompt = "The following is a conversation with an AI assistant. The assistant is helpful, creative, clever, and very friendly.";
 const prompts = [
@@ -70,17 +70,50 @@ const prompts = [
 ]
 
 const model = await LoadModelAsync("model.gguf");
-console.log("Model loaded\n", model);
+const ctx = await CreateContextAsync(model, /*optional n_ctx*/ 0, /*optional flash_att*/ true);
+console.log("Model loaded", model);
 
 for (const prompt of prompts) {
-    const inference = await RunInferenceAsync(model, system_prompt, prompt, /*optional max tokens*/ 1024);
-    console.log("Answer:\n", inference);
+    const inference = await RunInferenceAsync(model, ctx, prompt, system_prompt, /*optional max tokens*/ 512);
+    console.log("Answer:", inference);
 }
 
+await ReleaseContextAsync(ctx);
 await ReleaseModelAsync(model);
 
 ```
 
+### Model format
+
+It is likely you will want more control over the model, so you can push the complete, formatted prompt to it using the `!#` prefix, like this:
+
+```javascript
+
+const system = "You are ...";
+const user = "...";
+
+// QWEN example (the !# prefix will be removed before reaching the LLM)
+const prompt = `"!#<|im_start|>system ${system}<|im_end|><|im_start|>user ${user}<|im_end|><|im_start|>assistant"`;
+
+const reply = await RunInferenceAsync(modelHandle, contextHandle, prompt, /*optional max tokens*/ 128);
+
+```
+
+Getting tokens from the model is done with the `GetModelToken` method.
+
+```javascript
+
+const eos = GetModelToken(modelHandle, "EOS");
+const bos = GetModelToken(modelHandle, "BOS");
+const eot = GetModelToken(modelHandle, "EOT");
+const sep = GetModelToken(modelHandle, "SEP");
+const cls = GetModelToken(modelHandle, "CLS");
+const nl = GetModelToken(modelHandle, "NL");
+
+```
+
+### Logging control
+
 You can control log levels coming from llamacpp like this:
 
 ```javascript
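The custom `!#` format and `GetModelToken` can also be combined with the split load/context API. The following is an illustrative sketch rather than part of this commit; it assumes `GetModelToken` returns the token's text (the test suite only asserts it is a non-empty string), and the variable names are hypothetical.

```javascript
// Illustrative sketch, not part of this commit. Assumes GetModelToken("EOS") returns the
// token text (e.g. "<|im_end|>" for Qwen-style chat models).
const { LoadModelAsync, CreateContextAsync, RunInferenceAsync,
    ReleaseContextAsync, ReleaseModelAsync, GetModelToken } = require('@duck4i/llama');

async function askWithCustomFormat(question) {
    const model = await LoadModelAsync("model.gguf");
    const ctx = await CreateContextAsync(model);

    const system = "You are a concise assistant.";
    // Qwen-style template; the leading !# is stripped before the prompt reaches the model.
    const prompt = `!#<|im_start|>system ${system}<|im_end|><|im_start|>user ${question}<|im_end|><|im_start|>assistant`;

    let reply = await RunInferenceAsync(model, ctx, prompt, /*optional max tokens*/ 128);

    // Trim a trailing end-of-sequence marker if the binding leaves one in the output text.
    const eos = GetModelToken(model, "EOS");
    if (eos && reply.endsWith(eos)) {
        reply = reply.slice(0, -eos.length);
    }

    await ReleaseContextAsync(ctx);
    await ReleaseModelAsync(model);
    return reply;
}
```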

__tests__/basic.js

Lines changed: 93 additions & 7 deletions
@@ -1,20 +1,31 @@
 const { execSync } = require('child_process');
 const fs = require('fs');
 
-const { RunInference, LoadModelAsync, RunInferenceAsync, ReleaseModelAsync, SetLogLevel } = require("bindings")("npm-llama");
+const { ChatManager, Role } = require('../chatManager');
+const {
+    RunInference,
+    LoadModelAsync,
+    CreateContextAsync,
+    RunInferenceAsync,
+    ReleaseContextAsync,
+    ReleaseModelAsync,
+    SetLogLevel,
+    GetModelToken,
+} = require("bindings")("npm-llama");
 
 const model = "model.gguf";
 const modelUrl = "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-fp16.gguf?download=true";
 const system_prompt = "The following is a conversation with an AI assistant. The assistant is helpful, creative, clever, and very friendly.";
 
+
 describe('Node LLaMA Test Suite', () => {
 
     beforeAll(() => {
 
         if (!fs.existsSync(model)) {
             execSync(`npx llama-download -p ${model} -u ${modelUrl}`, { stdio: 'inherit' });
         } else {
-            console.log("Model already exists");
+            console.log("Model already downloaded");
         }
     });
 
@@ -24,11 +35,12 @@ describe('Node LLaMA Test Suite', () => {
     });
 
     test('direct inference works', () => {
-        const inference = RunInference(model, system_prompt, "How old can ducks get?");
-        expect(inference).toBeTruthy();
+        const inference = RunInference(model, "How old can ducks get?", system_prompt);
+        console.log("Result", inference);
+        expect(inference.includes('10 years')).toBeTruthy();
     });
 
-    test('inference works with async', async () => {
+    test('async inference works', async () => {
 
         const prompts = [
             "How old can ducks get?",
@@ -37,13 +49,87 @@ describe('Node LLaMA Test Suite', () => {
         ]
 
         const modelHandle = await LoadModelAsync(model);
+        const ctx = await CreateContextAsync(modelHandle);
         console.log("Model loaded", model);
 
         for (const prompt of prompts) {
-            const inference = await RunInferenceAsync(modelHandle, system_prompt, prompt, /*optional*/ 1024);
-            console.log("Inference", inference);
+            const inference = await RunInferenceAsync(modelHandle, ctx, prompt, system_prompt, 64);
+            console.log("Reply:", inference);
+            expect(inference.length > 0).toBeTruthy();
         }
 
+        await ReleaseContextAsync(ctx);
+        await ReleaseModelAsync(modelHandle);
+    });
+
+    test('custom inference works', async () => {
+
+        const user = "How old can ducks get?";
+        const prompt = `"!#<|im_start|>system ${system_prompt}<|im_end|><|im_start|>user ${user}<|im_end|><|im_start|>assistant"`;
+
+        const modelHandle = await LoadModelAsync(model);
+        const context = await CreateContextAsync(modelHandle);
+        const result = await RunInferenceAsync(modelHandle, context, prompt);
+        await ReleaseContextAsync(context);
+        await ReleaseModelAsync(modelHandle);
+
+        console.log("Result", result);
+        expect(result.length > 1).toBeTruthy();
+    });
+
+    test('tokens work', async () => {
+
+        const modelHandle = await LoadModelAsync(model);
+        const ctx = await CreateContextAsync(modelHandle);
+
+        const eos = GetModelToken(modelHandle, "EOS");
+        const bos = GetModelToken(modelHandle, "BOS");
+        const eot = GetModelToken(modelHandle, "EOT");
+        const sep = GetModelToken(modelHandle, "SEP");
+        const cls = GetModelToken(modelHandle, "CLS");
+        const nl = GetModelToken(modelHandle, "NL");
+
+        console.log("EOS", eos);
+        console.log("BOS", bos);
+        console.log("EOT", eot);
+        console.log("SEP", sep);
+        console.log("CLS", cls);
+        console.log("NL", nl);
+
+        await ReleaseContextAsync(ctx);
         await ReleaseModelAsync(modelHandle);
+
+        expect(eos.length > 1).toBeTruthy();
+        expect(bos.length > 1).toBeTruthy();
+    })
+
+    test('chat works', async () => {
+        SetLogLevel(4); // warn
+
+        const modelHandle = await LoadModelAsync(model);
+        const ctx = await CreateContextAsync(modelHandle);
+
+        const chat = new ChatManager(system_prompt);
+
+        let reply = "";
+        let prompt = chat.getNextPrompt("Hello, my name is Duck!");
+
+        reply = await RunInferenceAsync(modelHandle, ctx, prompt, 128);
+        console.log("Reply", reply);
+
+        chat.addMessage(Role.ASSISTANT, reply);
+
+        prompt = chat.getNextPrompt("What was my name?");
+        reply = await RunInferenceAsync(modelHandle, ctx, prompt, 128);
+        console.log("Reply", reply);
+
+        chat.addMessage(Role.ASSISTANT, reply);
+
+        await ReleaseContextAsync(ctx);
+        await ReleaseModelAsync(modelHandle);
+
+        expect(reply.includes("Duck")).toBeTruthy();
     });
+
+
 });

chatManager.js

Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
+const Role = Object.freeze({
+    SYSTEM: 'system',
+    USER: 'user',
+    ASSISTANT: 'assistant'
+});
+
+class ChatManager {
+    constructor(systemPrompt = '') {
+        this.systemPrompt = systemPrompt;
+        this.history = [];
+        this.delimiter = {
+            start: '<|im_start|>',
+            end: '<|im_end|>'
+        };
+    }
+
+    addMessage(role, content) {
+        if (!Object.values(Role).includes(role)) {
+            throw new Error(`Invalid role. Must be one of: ${Object.values(Role).join(', ')}`);
+        }
+        this.history.push({ role, content });
+    }
+
+    clear() {
+        this.history = [];
+    }
+
+    setSystemPrompt(prompt) {
+        this.systemPrompt = prompt;
+    }
+
+    getNextPrompt(userPrompt) {
+        let formatted = `!#${this.delimiter.start}${Role.SYSTEM} ${this.systemPrompt}${this.delimiter.end}`;
+
+        if (userPrompt !== undefined) {
+            this.addMessage(Role.USER, userPrompt);
+        }
+
+        for (const message of this.history) {
+            formatted += `${this.delimiter.start}${message.role} ${message.content}${this.delimiter.end}`;
+        }
+
+        // Add the assistant delimiter for the next response
+        formatted += `${this.delimiter.start}${Role.ASSISTANT}`;
+        return formatted;
+    }
+
+    getHistory() {
+        return this.history;
+    }
+}
+
+// Example usage:
+/*
+const chat = new ChatManager("You are a helpful AI assistant.");
+
+// Add user message
+chat.addMessage(Role.USER, "Hello!");
+
+// Get prompt for LLM
+const prompt = chat.getNextPrompt();
+
+// After getting LLM response, add it to history
+chat.addMessage(Role.ASSISTANT, "Hi there! How can I help you today?");
+
+*/
+
+module.exports = {
+    Role,
+    ChatManager
+}
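For reference, a minimal multi-turn loop built on this class, mirroring the `chat works` test above. This is an illustrative sketch rather than part of the commit, and it assumes `ChatManager` and the native functions are both reachable from the package entry point.

```javascript
// Illustrative sketch (not part of this commit), mirroring the "chat works" test.
// Assumes ChatManager and the native bindings are exported from the package entry point.
const { ChatManager, Role, LoadModelAsync, CreateContextAsync,
    RunInferenceAsync, ReleaseContextAsync, ReleaseModelAsync } = require('@duck4i/llama');

async function main() {
    const model = await LoadModelAsync("model.gguf");
    const ctx = await CreateContextAsync(model);
    const chat = new ChatManager("You are a helpful AI assistant.");

    for (const question of ["Hello, my name is Duck!", "What was my name?"]) {
        // getNextPrompt records the user turn and returns the full !#-formatted history.
        const prompt = chat.getNextPrompt(question);
        const reply = await RunInferenceAsync(model, ctx, prompt, /*max tokens*/ 128);
        chat.addMessage(Role.ASSISTANT, reply);
        console.log(`> ${question}\n${reply}`);
    }

    await ReleaseContextAsync(ctx);
    await ReleaseModelAsync(model);
}

main().catch(console.error);
```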

downloadModel.js

Lines changed: 3 additions & 29 deletions
@@ -1,8 +1,3 @@
-#!/usr/bin/env node
-
-const { Command } = require('commander');
-const packageInfo = require('./package.json');
-
 const axios = require('axios');
 const fs = require('fs');
 const path = require('path');
@@ -65,27 +60,6 @@ async function downloadModel(url, outputPath) {
     }
 }
 
-const program = new Command();
-program
-    .version(packageInfo.version)
-    .requiredOption('-u, --url <url>', 'Download URL')
-    .requiredOption('-p, --path <prompt>', 'Output path');
-
-program.parse(process.argv);
-const options = program.opts();
-
-const url = `${options.url}`;
-const target = `${options.path}`;
-
-console.log(`Downloading from ${url} to ${target}`);
-
-// Run the download
-downloadModel(url, target)
-    .then(() => {
-        console.log('Download completed successfully');
-        process.exit(0);
-    })
-    .catch((error) => {
-        console.error('Download failed:', error.message);
-        process.exit(1);
-    });
+module.exports = {
+    downloadModel
+}
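With the CLI wiring moved out of this file, `downloadModel(url, outputPath)` is exported as a plain promise-returning function. A hedged usage sketch (not part of the commit), reusing the model URL from the test suite:

```javascript
// Illustrative sketch, not part of this commit: programmatic model download.
const { downloadModel } = require('@duck4i/llama');

const url = "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-fp16.gguf?download=true";

downloadModel(url, "model.gguf")
    .then(() => console.log("Model ready"))
    .catch((err) => console.error("Download failed:", err.message));
```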

include/npm-llama.h

Lines changed: 0 additions & 14 deletions
This file was deleted.

index.js

Lines changed: 9 additions & 1 deletion
@@ -1 +1,9 @@
-module.exports = require("bindings")("npm-llama");
+const bindings = require("bindings")("npm-llama");
+const { ChatManager } = require("./chatManager");
+const { downloadModel } = require("./downloadModel");
+
+module.exports = {
+    ...bindings,
+    ChatManager,
+    downloadModel
+}

index.mjs

Lines changed: 10 additions & 1 deletion
@@ -1,9 +1,18 @@
+const { ChatManager } = require('./chatManager');
+const { downloadModel } = require("./downloadModel");
+
 import { createRequire } from 'module';
 const require = createRequire(import.meta.url);
 const nativeModule = require('./build/Release/npm-llama.node');
 
 export const SetLogLevel = nativeModule.SetLogLevel;
 export const RunInference = nativeModule.RunInference;
 export const LoadModelAsync = nativeModule.LoadModelAsync;
+export const CreateContextAsync = nativeModule.CreateContextAsync;
 export const RunInferenceAsync = nativeModule.RunInferenceAsync;
-export const ReleaseModelAsync = nativeModule.ReleaseModelAsync;
+export const ReleaseContextAsync = nativeModule.ReleaseContextAsync;
+export const ReleaseModelAsync = nativeModule.ReleaseModelAsync;
+export const GetModelToken = nativeModule.GetModelToken;
+
+export { ChatManager };
+export { downloadModel };
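The ESM entry now re-exports the context and token helpers alongside `ChatManager`. A minimal consumer sketch (not part of the commit), assuming the package resolves as `@duck4i/llama` with index.mjs as its module entry:

```javascript
// Illustrative ESM consumer sketch, not part of this commit.
import {
    LoadModelAsync, CreateContextAsync, RunInferenceAsync,
    ReleaseContextAsync, ReleaseModelAsync, SetLogLevel, ChatManager
} from '@duck4i/llama';

SetLogLevel(4); // warn level, as used in the test suite

const model = await LoadModelAsync("model.gguf");
const ctx = await CreateContextAsync(model);

const chat = new ChatManager("You are a helpful AI assistant.");
const reply = await RunInferenceAsync(model, ctx, chat.getNextPrompt("Hi there!"), 128);
console.log(reply);

await ReleaseContextAsync(ctx);
await ReleaseModelAsync(model);
```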
