Commit 832b0fa

Custom model format & multi turn chat implementation (#21)
* cleanup
* custom format
* token fetch
* context split
1 parent ff8c1c8 commit 832b0fa

15 files changed: +620 -149 lines changed

README.md

Lines changed: 38 additions & 5 deletions
@@ -51,7 +51,7 @@ const { RunInference } = require('@duck4i/llama');
 const system_prompt = "The following is a conversation with an AI assistant. The assistant is helpful, creative, clever, and very friendly.";
 const user_prompt = "What is life expectancy of a duck?";
 
-const inference = RunInference("model.gguf", system_prompt, user_prompt);
+const inference = RunInference("model.gguf", user_prompt, system_prompt, /*optional*/ 512);
 
 console.log("Answer", inference);
 
@@ -60,7 +60,7 @@ console.log("Answer", inference);
 It is likely you will want async functions for better memory management with multiple prompts, which is done like this:
 
 ```javascript
-const { LoadModelAsync, RunInferenceAsync, ReleaseModelAsync } = require('@duck4i/llama');
+const { LoadModelAsync, CreateContextAsync, RunInferenceAsync, ReleaseContextAsync, ReleaseModelAsync } = require('@duck4i/llama');
 
 const system_prompt = "The following is a conversation with an AI assistant. The assistant is helpful, creative, clever, and very friendly.";
 const prompts = [
@@ -70,17 +70,50 @@ const prompts = [
 ]
 
 const model = await LoadModelAsync("model.gguf");
-console.log("Model loaded\n", model);
+const ctx = await CreateContextAsync(model, /*optional n_ctx*/ 0, /*optional flash_att*/ true);
+console.log("Model loaded", model);
 
 for (const prompt of prompts) {
-    const inference = await RunInferenceAsync(model, system_prompt, prompt, /*optional max tokens*/ 1024);
-    console.log("Answer:\n", inference);
+    const inference = await RunInferenceAsync(model, ctx, prompt, system_prompt, /*optional max tokens*/ 512);
+    console.log("Answer:", inference);
 }
 
+await ReleaseContextAsync(ctx);
 await ReleaseModelAsync(model);
 
 ```
 
+### Model format
+
+It is likely you will want more control over the model, so you can push the complete, formatted prompt to it using the `!#` prefix, like this:
+
+```javascript
+
+const system = "You are ...";
+const user = "...";
+
+// QWEN example (the !# prefix will be removed before reaching the LLM)
+const prompt = `"!#<|im_start|>system ${system}<|im_end|><|im_start|>user ${user}<|im_end|><|im_start|>assistant"`;
+
+const reply = await RunInferenceAsync(modelHandle, contextHandle, prompt, /*optional max tokens*/ 128);
+
+```
+
+Getting tokens from the model is done with the `GetModelToken` method.
+
+```javascript
+
+const eos = GetModelToken(modelHandle, "EOS");
+const bos = GetModelToken(modelHandle, "BOS");
+const eot = GetModelToken(modelHandle, "EOT");
+const sep = GetModelToken(modelHandle, "SEP");
+const cls = GetModelToken(modelHandle, "CLS");
+const nl = GetModelToken(modelHandle, "NL");
+
+```
+
+### Logging control
+
 You can control log levels coming from llamacpp like this:
 
 ```javascript
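The custom `!#` format and `GetModelToken` can also be combined with the split load/context API. The following is an illustrative sketch rather than part of this commit; it assumes `GetModelToken` returns the token's text (the test suite only asserts it is a non-empty string), and the variable names are hypothetical.

```javascript
// Illustrative sketch, not part of this commit. Assumes GetModelToken("EOS") returns the
// token text (e.g. "<|im_end|>" for Qwen-style chat models).
const { LoadModelAsync, CreateContextAsync, RunInferenceAsync,
    ReleaseContextAsync, ReleaseModelAsync, GetModelToken } = require('@duck4i/llama');

async function askWithCustomFormat(question) {
    const model = await LoadModelAsync("model.gguf");
    const ctx = await CreateContextAsync(model);

    const system = "You are a concise assistant.";
    // Qwen-style template; the leading !# is stripped before the prompt reaches the model.
    const prompt = `!#<|im_start|>system ${system}<|im_end|><|im_start|>user ${question}<|im_end|><|im_start|>assistant`;

    let reply = await RunInferenceAsync(model, ctx, prompt, /*optional max tokens*/ 128);

    // Trim a trailing end-of-sequence marker if the binding leaves one in the output text.
    const eos = GetModelToken(model, "EOS");
    if (eos && reply.endsWith(eos)) {
        reply = reply.slice(0, -eos.length);
    }

    await ReleaseContextAsync(ctx);
    await ReleaseModelAsync(model);
    return reply;
}
```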

__tests__/basic.js

Lines changed: 93 additions & 7 deletions
@@ -1,20 +1,31 @@
 const { execSync } = require('child_process');
 const fs = require('fs');
 
-const { RunInference, LoadModelAsync, RunInferenceAsync, ReleaseModelAsync, SetLogLevel } = require("bindings")("npm-llama");
+const { ChatManager, Role } = require('../chatManager');
+const {
+    RunInference,
+    LoadModelAsync,
+    CreateContextAsync,
+    RunInferenceAsync,
+    ReleaseContextAsync,
+    ReleaseModelAsync,
+    SetLogLevel,
+    GetModelToken,
+} = require("bindings")("npm-llama");
 
 const model = "model.gguf";
 const modelUrl = "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-fp16.gguf?download=true";
 const system_prompt = "The following is a conversation with an AI assistant. The assistant is helpful, creative, clever, and very friendly.";
 
+
 describe('Node LLaMA Test Suite', () => {
 
     beforeAll(() => {
 
         if (!fs.existsSync(model)) {
             execSync(`npx llama-download -p ${model} -u ${modelUrl}`, { stdio: 'inherit' });
         } else {
-            console.log("Model already exists");
+            console.log("Model already downloaded");
         }
     });
 
@@ -24,11 +35,12 @@ describe('Node LLaMA Test Suite', () => {
     });
 
     test('direct inference works', () => {
-        const inference = RunInference(model, system_prompt, "How old can ducks get?");
-        expect(inference).toBeTruthy();
+        const inference = RunInference(model, "How old can ducks get?", system_prompt);
+        console.log("Result", inference);
+        expect(inference.includes('10 years')).toBeTruthy();
     });
 
-    test('inference works with async', async () => {
+    test('async inference works', async () => {
 
         const prompts = [
             "How old can ducks get?",
@@ -37,13 +49,87 @@ describe('Node LLaMA Test Suite', () => {
         ]
 
         const modelHandle = await LoadModelAsync(model);
+        const ctx = await CreateContextAsync(modelHandle);
         console.log("Model loaded", model);
 
         for (const prompt of prompts) {
-            const inference = await RunInferenceAsync(modelHandle, system_prompt, prompt, /*optional*/ 1024);
-            console.log("Inference", inference);
+            const inference = await RunInferenceAsync(modelHandle, ctx, prompt, system_prompt, 64);
+            console.log("Reply:", inference);
+            expect(inference.length > 0).toBeTruthy();
         }
 
+        await ReleaseContextAsync(ctx);
+        await ReleaseModelAsync(modelHandle);
+    });
+
+    test('custom inference works', async () => {
+
+        const user = "How old can ducks get?";
+        const prompt = `"!#<|im_start|>system ${system_prompt}<|im_end|><|im_start|>user ${user}<|im_end|><|im_start|>assistant"`;
+
+        const modelHandle = await LoadModelAsync(model);
+        const context = await CreateContextAsync(modelHandle);
+        const result = await RunInferenceAsync(modelHandle, context, prompt);
+        await ReleaseContextAsync(context);
+        await ReleaseModelAsync(modelHandle);
+
+        console.log("Result", result);
+        expect(result.length > 1).toBeTruthy();
+    });
+
+    test('tokens work', async () => {
+
+        const modelHandle = await LoadModelAsync(model);
+        const ctx = await CreateContextAsync(modelHandle);
+
+        const eos = GetModelToken(modelHandle, "EOS");
+        const bos = GetModelToken(modelHandle, "BOS");
+        const eot = GetModelToken(modelHandle, "EOT");
+        const sep = GetModelToken(modelHandle, "SEP");
+        const cls = GetModelToken(modelHandle, "CLS");
+        const nl = GetModelToken(modelHandle, "NL");
+
+        console.log("EOS", eos);
+        console.log("BOS", bos);
+        console.log("EOT", eot);
+        console.log("SEP", sep);
+        console.log("CLS", cls);
+        console.log("NL", nl);
+
+        await ReleaseContextAsync(ctx);
         await ReleaseModelAsync(modelHandle);
+
+        expect(eos.length > 1).toBeTruthy();
+        expect(bos.length > 1).toBeTruthy();
+    })
+
+    test('chat works', async () => {
+        SetLogLevel(4); // warn
+
+        const modelHandle = await LoadModelAsync(model);
+        const ctx = await CreateContextAsync(modelHandle);
+
+        const chat = new ChatManager(system_prompt);
+
+        let reply = "";
+        let prompt = chat.getNextPrompt("Hello, my name is Duck!");
+
+        reply = await RunInferenceAsync(modelHandle, ctx, prompt, 128);
+        console.log("Reply", reply);
+
+        chat.addMessage(Role.ASSISTANT, reply);
+
+        prompt = chat.getNextPrompt("What was my name?");
+        reply = await RunInferenceAsync(modelHandle, ctx, prompt, 128);
+        console.log("Reply", reply);
+
+        chat.addMessage(Role.ASSISTANT, reply);
+
+        await ReleaseContextAsync(ctx);
+        await ReleaseModelAsync(modelHandle);
+
+        expect(reply.includes("Duck")).toBeTruthy();
     });
+
+
 });

chatManager.js

Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
+const Role = Object.freeze({
+    SYSTEM: 'system',
+    USER: 'user',
+    ASSISTANT: 'assistant'
+});
+
+class ChatManager {
+    constructor(systemPrompt = '') {
+        this.systemPrompt = systemPrompt;
+        this.history = [];
+        this.delimiter = {
+            start: '<|im_start|>',
+            end: '<|im_end|>'
+        };
+    }
+
+    addMessage(role, content) {
+        if (!Object.values(Role).includes(role)) {
+            throw new Error(`Invalid role. Must be one of: ${Object.values(Role).join(', ')}`);
+        }
+        this.history.push({ role, content });
+    }
+
+    clear() {
+        this.history = [];
+    }
+
+    setSystemPrompt(prompt) {
+        this.systemPrompt = prompt;
+    }
+
+    getNextPrompt(userPrompt) {
+        let formatted = `!#${this.delimiter.start}${Role.SYSTEM} ${this.systemPrompt}${this.delimiter.end}`;
+
+        if (userPrompt !== undefined) {
+            this.addMessage(Role.USER, userPrompt);
+        }
+
+        for (const message of this.history) {
+            formatted += `${this.delimiter.start}${message.role} ${message.content}${this.delimiter.end}`;
+        }
+
+        // Add the assistant delimiter for the next response
+        formatted += `${this.delimiter.start}${Role.ASSISTANT}`;
+        return formatted;
+    }
+
+    getHistory() {
+        return this.history;
+    }
+}
+
+// Example usage:
+/*
+const chat = new ChatManager("You are a helpful AI assistant.");
+
+// Add user message
+chat.addMessage(Role.USER, "Hello!");
+
+// Get prompt for LLM
+const prompt = chat.getNextPrompt();
+
+// After getting LLM response, add it to history
+chat.addMessage(Role.ASSISTANT, "Hi there! How can I help you today?");
+
+*/
+
+module.exports = {
+    Role,
+    ChatManager
+}
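For reference, a minimal multi-turn loop built on this class, mirroring the `chat works` test above. This is an illustrative sketch rather than part of the commit, and it assumes `ChatManager` and the native functions are both reachable from the package entry point.

```javascript
// Illustrative sketch (not part of this commit), mirroring the "chat works" test.
// Assumes ChatManager and the native bindings are exported from the package entry point.
const { ChatManager, Role, LoadModelAsync, CreateContextAsync,
    RunInferenceAsync, ReleaseContextAsync, ReleaseModelAsync } = require('@duck4i/llama');

async function main() {
    const model = await LoadModelAsync("model.gguf");
    const ctx = await CreateContextAsync(model);
    const chat = new ChatManager("You are a helpful AI assistant.");

    for (const question of ["Hello, my name is Duck!", "What was my name?"]) {
        // getNextPrompt records the user turn and returns the full !#-formatted history.
        const prompt = chat.getNextPrompt(question);
        const reply = await RunInferenceAsync(model, ctx, prompt, /*max tokens*/ 128);
        chat.addMessage(Role.ASSISTANT, reply);
        console.log(`> ${question}\n${reply}`);
    }

    await ReleaseContextAsync(ctx);
    await ReleaseModelAsync(model);
}

main().catch(console.error);
```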

downloadModel.js

Lines changed: 3 additions & 29 deletions
@@ -1,8 +1,3 @@
-#!/usr/bin/env node
-
-const { Command } = require('commander');
-const packageInfo = require('./package.json');
-
 const axios = require('axios');
 const fs = require('fs');
 const path = require('path');
@@ -65,27 +60,6 @@ async function downloadModel(url, outputPath) {
     }
 }
 
-const program = new Command();
-program
-    .version(packageInfo.version)
-    .requiredOption('-u, --url <url>', 'Download URL')
-    .requiredOption('-p, --path <prompt>', 'Output path');
-
-program.parse(process.argv);
-const options = program.opts();
-
-const url = `${options.url}`;
-const target = `${options.path}`;
-
-console.log(`Downloading from ${url} to ${target}`);
-
-// Run the download
-downloadModel(url, target)
-    .then(() => {
-        console.log('Download completed successfully');
-        process.exit(0);
-    })
-    .catch((error) => {
-        console.error('Download failed:', error.message);
-        process.exit(1);
-    });
+module.exports = {
+    downloadModel
+}
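With the CLI wiring moved out of this file, `downloadModel(url, outputPath)` is exported as a plain promise-returning function. A hedged usage sketch (not part of the commit), reusing the model URL from the test suite:

```javascript
// Illustrative sketch, not part of this commit: programmatic model download.
const { downloadModel } = require('@duck4i/llama');

const url = "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-fp16.gguf?download=true";

downloadModel(url, "model.gguf")
    .then(() => console.log("Model ready"))
    .catch((err) => console.error("Download failed:", err.message));
```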

include/npm-llama.h

Lines changed: 0 additions & 14 deletions
This file was deleted.

index.js

Lines changed: 9 additions & 1 deletion
@@ -1 +1,9 @@
-module.exports = require("bindings")("npm-llama");
+const bindings = require("bindings")("npm-llama");
+const { ChatManager } = require("./chatManager");
+const { downloadModel } = require("./downloadModel");
+
+module.exports = {
+    ...bindings,
+    ChatManager,
+    downloadModel
+}

index.mjs

Lines changed: 10 additions & 1 deletion
@@ -1,9 +1,18 @@
+const { ChatManager } = require('./chatManager');
+const { downloadModel } = require("./downloadModel");
+
 import { createRequire } from 'module';
 const require = createRequire(import.meta.url);
 const nativeModule = require('./build/Release/npm-llama.node');
 
 export const SetLogLevel = nativeModule.SetLogLevel;
 export const RunInference = nativeModule.RunInference;
 export const LoadModelAsync = nativeModule.LoadModelAsync;
+export const CreateContextAsync = nativeModule.CreateContextAsync;
 export const RunInferenceAsync = nativeModule.RunInferenceAsync;
-export const ReleaseModelAsync = nativeModule.ReleaseModelAsync;
+export const ReleaseContextAsync = nativeModule.ReleaseContextAsync;
+export const ReleaseModelAsync = nativeModule.ReleaseModelAsync;
+export const GetModelToken = nativeModule.GetModelToken;
+
+export { ChatManager };
+export { downloadModel };
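The ESM entry now re-exports the context and token helpers alongside `ChatManager`. A minimal consumer sketch (not part of the commit), assuming the package resolves as `@duck4i/llama` with index.mjs as its module entry:

```javascript
// Illustrative ESM consumer sketch, not part of this commit.
import {
    LoadModelAsync, CreateContextAsync, RunInferenceAsync,
    ReleaseContextAsync, ReleaseModelAsync, SetLogLevel, ChatManager
} from '@duck4i/llama';

SetLogLevel(4); // warn level, as used in the test suite

const model = await LoadModelAsync("model.gguf");
const ctx = await CreateContextAsync(model);

const chat = new ChatManager("You are a helpful AI assistant.");
const reply = await RunInferenceAsync(model, ctx, chat.getNextPrompt("Hi there!"), 128);
console.log(reply);

await ReleaseContextAsync(ctx);
await ReleaseModelAsync(model);
```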
