diff --git a/docs/docs/modules/indexes/vector_stores/integrations/myscale.md b/docs/docs/modules/indexes/vector_stores/integrations/myscale.md
new file mode 100644
index 000000000000..7cbac11278a3
--- /dev/null
+++ b/docs/docs/modules/indexes/vector_stores/integrations/myscale.md
@@ -0,0 +1,76 @@
+---
+sidebar_class_name: node-only
+---
+
+# MyScale
+
+[MyScale](https://myscale.com/) is an emerging AI database that harmonizes the power of vector search and SQL analytics, providing a managed, efficient, and responsive experience.
+
+:::tip Compatibility
+Only available on Node.js.
+:::
+
+## Setup
+
+1. Launch a cluster through [MyScale's Web Console](https://console.myscale.com/). See [the MyScale documentation](https://docs.myscale.com/en/quickstart/) for more information.
+2. Install the Node.js SDK.
+
+```bash npm2yarn
+npm install -S @clickhouse/client
+```
+
+## Index and query docs
+
+```typescript
+import { MyScaleStore } from "langchain/vectorstores/myscale";
+import { OpenAIEmbeddings } from "langchain/embeddings/openai";
+
+const vectorStore = await MyScaleStore.fromTexts(
+  ["Hello world", "Bye bye", "hello nice world"],
+  [
+    { id: 2, name: "2" },
+    { id: 1, name: "1" },
+    { id: 3, name: "3" },
+  ],
+  new OpenAIEmbeddings(),
+  {
+    host: process.env.MYSCALE_HOST || "https://localhost:8443",
+    username: process.env.MYSCALE_USERNAME || "username",
+    password: process.env.MYSCALE_PASSWORD || "password",
+  }
+);
+
+const results = await vectorStore.similaritySearch("hello world", 1);
+console.log(results);
+
+const filteredResults = await vectorStore.similaritySearch("hello world", 1, {
+  whereStr: "metadata.name = '1'",
+});
+console.log(filteredResults);
+```
+
+## Query docs from an existing collection
+
+```typescript
+import { MyScaleStore } from "langchain/vectorstores/myscale";
+import { OpenAIEmbeddings } from "langchain/embeddings/openai";
+
+const vectorStore = await MyScaleStore.fromExistingIndex(
+  new OpenAIEmbeddings(),
+  {
+    host: process.env.MYSCALE_HOST || "https://localhost:8443",
+    username: process.env.MYSCALE_USERNAME || "username",
+    password: process.env.MYSCALE_PASSWORD || "password",
+    database: "your_database", // defaults to "default"
+    table: "your_table", // defaults to "vector_table"
+  }
+);
+
+const results = await vectorStore.similaritySearch("hello world", 1);
+console.log(results);
+
+const filteredResults = await vectorStore.similaritySearch("hello world", 1, {
+  whereStr: "metadata.name = '1'",
+});
+console.log(filteredResults);
+```
diff --git a/examples/.env.example b/examples/.env.example
index 7e887324d2f2..d2f76608e6c4 100644
--- a/examples/.env.example
+++ b/examples/.env.example
@@ -14,3 +14,6 @@ SUPABASE_URL=ADD_YOURS_HERE # # https://app.supabase.com/project/YOUR_PROJECT_ID
 WEAVIATE_HOST=ADD_YOURS_HERE
 WEAVIATE_SCHEME=ADD_YOURS_HERE
 WEAVIATE_API_KEY=ADD_YOURS_HERE
+MYSCALE_HOST=ADD_YOURS_HERE
+MYSCALE_USERNAME=ADD_YOURS_HERE
+MYSCALE_PASSWORD=ADD_YOURS_HERE
\ No newline at end of file
diff --git a/examples/package.json b/examples/package.json
index cb96e53e43bd..e247e24cdf71 100644
--- a/examples/package.json
+++ b/examples/package.json
@@ -22,6 +22,7 @@
   "author": "LangChain",
   "license": "MIT",
   "dependencies": {
+    "@clickhouse/client": "^0.0.14",
     "@getmetal/metal-sdk": "^2.0.1",
     "@opensearch-project/opensearch": "^2.2.0",
     "@pinecone-database/pinecone": "^0.0.12",
diff --git a/examples/src/indexes/vector_stores/myscale.ts b/examples/src/indexes/vector_stores/myscale.ts
new file mode 100644
index 000000000000..1e15c1d2e673
--- /dev/null
+++ b/examples/src/indexes/vector_stores/myscale.ts
@@ -0,0 +1,28 @@
+import { MyScaleStore } from "langchain/vectorstores/myscale";
+import { OpenAIEmbeddings } from "langchain/embeddings/openai";
+
+export async function run() {
+  // Create a store and fill it with some texts + metadata
+  const vectorStore = await MyScaleStore.fromTexts(
+    ["Hello world", "Bye bye", "hello nice world"],
+    [
+      { id: 2, name: "2" },
+      { id: 1, name: "1"
},
+      { id: 3, name: "3" },
+    ],
+    new OpenAIEmbeddings(),
+    {
+      host: process.env.MYSCALE_HOST || "https://localhost:8443",
+      username: process.env.MYSCALE_USERNAME || "username",
+      password: process.env.MYSCALE_PASSWORD || "password",
+    }
+  );
+
+  const results = await vectorStore.similaritySearch("hello world", 1);
+  console.log(results);
+
+  const filteredResults = await vectorStore.similaritySearch("hello world", 1, {
+    whereStr: "metadata.name = '1'",
+  });
+  console.log(filteredResults);
+}
diff --git a/langchain/.env.example b/langchain/.env.example
index ed3908416a0a..7e0f263f09bb 100644
--- a/langchain/.env.example
+++ b/langchain/.env.example
@@ -18,3 +18,6 @@ MILVUS_URL=ADD_YOURS_HERE
 WEAVIATE_HOST=ADD_YOURS_HERE
 WEAVIATE_SCHEME=ADD_YOURS_HERE
 WEAVIATE_API_KEY=ADD_YOURS_HERE
+MYSCALE_HOST=ADD_YOURS_HERE
+MYSCALE_USERNAME=ADD_YOURS_HERE
+MYSCALE_PASSWORD=ADD_YOURS_HERE
diff --git a/langchain/.gitignore b/langchain/.gitignore
index 8fbe1f8197b1..1f6e276c015b 100644
--- a/langchain/.gitignore
+++ b/langchain/.gitignore
@@ -103,6 +103,9 @@ vectorstores/milvus.d.ts
 vectorstores/prisma.cjs
 vectorstores/prisma.js
 vectorstores/prisma.d.ts
+vectorstores/myscale.cjs
+vectorstores/myscale.js
+vectorstores/myscale.d.ts
 text_splitter.cjs
 text_splitter.js
 text_splitter.d.ts
diff --git a/langchain/package.json b/langchain/package.json
index ffc9436afb92..de3c18a34d0e 100644
--- a/langchain/package.json
+++ b/langchain/package.json
@@ -115,6 +115,9 @@
     "vectorstores/prisma.cjs",
     "vectorstores/prisma.js",
     "vectorstores/prisma.d.ts",
+    "vectorstores/myscale.cjs",
+    "vectorstores/myscale.js",
+    "vectorstores/myscale.d.ts",
     "text_splitter.cjs",
     "text_splitter.js",
     "text_splitter.d.ts",
@@ -278,6 +281,7 @@
   "devDependencies": {
     "@aws-sdk/client-lambda": "^3.310.0",
     "@aws-sdk/client-s3": "^3.310.0",
+    "@clickhouse/client": "^0.0.14",
     "@faker-js/faker": "^7.6.0",
     "@getmetal/metal-sdk": "^2.0.1",
     "@huggingface/inference": "^1.5.1",
@@ -333,6 +337,7 @@
   "peerDependencies": {
     "@aws-sdk/client-lambda": "^3.310.0",
     "@aws-sdk/client-s3": "^3.310.0",
+    "@clickhouse/client": "^0.0.14",
     "@getmetal/metal-sdk": "*",
     "@huggingface/inference": "^1.5.1",
     "@opensearch-project/opensearch": "*",
@@ -365,6 +370,9 @@
     "@aws-sdk/client-s3": {
       "optional": true
     },
+    "@clickhouse/client": {
+      "optional": true
+    },
     "@getmetal/metal-sdk": {
       "optional": true
     },
@@ -660,6 +668,11 @@
       "import": "./vectorstores/prisma.js",
       "require": "./vectorstores/prisma.cjs"
     },
+    "./vectorstores/myscale": {
+      "types": "./vectorstores/myscale.d.ts",
+      "import": "./vectorstores/myscale.js",
+      "require": "./vectorstores/myscale.cjs"
+    },
     "./text_splitter": {
       "types": "./text_splitter.d.ts",
       "import": "./text_splitter.js",
diff --git a/langchain/scripts/create-entrypoints.js b/langchain/scripts/create-entrypoints.js
index b6a646fe6dc5..c7550b4f6925 100644
--- a/langchain/scripts/create-entrypoints.js
+++ b/langchain/scripts/create-entrypoints.js
@@ -50,6 +50,7 @@ const entrypoints = {
   "vectorstores/opensearch": "vectorstores/opensearch",
   "vectorstores/milvus": "vectorstores/milvus",
   "vectorstores/prisma": "vectorstores/prisma",
+  "vectorstores/myscale": "vectorstores/myscale",
   // text_splitter
   text_splitter: "text_splitter",
   // memory
@@ -145,6 +146,7 @@ const requiresOptionalDependency = [
   "vectorstores/supabase",
   "vectorstores/opensearch",
   "vectorstores/milvus",
+  "vectorstores/myscale",
   "document_loaders/web/cheerio",
   "document_loaders/web/puppeteer",
   "document_loaders/web/playwright",
diff --git a/langchain/src/vectorstores/myscale.ts b/langchain/src/vectorstores/myscale.ts
new file mode 100644
index 000000000000..be3ac701e370
--- /dev/null
+++ b/langchain/src/vectorstores/myscale.ts
@@ -0,0 +1,294 @@
+import { v4 as uuid } from "uuid";
+import { ClickHouseClient, createClient } from "@clickhouse/client";
+
+import { Embeddings } from "../embeddings/base.js";
+import { VectorStore } from "./base.js";
+import { Document } from "../document.js";
+
+/**
+ * Connection settings and table layout options for a MyScaleStore.
+ * Every optional field falls back to a default in the MyScaleStore constructor.
+ */
+export interface MyScaleLibArgs {
+  host: string;
+  username: string;
+  password: string;
+  indexType?: string;
+  indexParam?: Record<string, string>;
+  columnMap?: ColumnMap;
+  database?: string;
+  table?: string;
+  metric?: metric;
+}
+
+/** Names of the table columns that store each document field. */
+export interface ColumnMap {
+  id: string;
+  text: string;
+  vector: string;
+  metadata: string;
+}
+
+/** Value passed as `metric_type` when the vector index is created. */
+export type metric = "ip" | "cosine" | "l2";
+
+/** Search filter; `whereStr` is inserted verbatim after PREWHERE. */
+export interface MyScaleFilter {
+  whereStr: string;
+}
+
+/**
+ * Vector store that keeps documents and their embeddings in a MyScale table,
+ * talking to the server through the ClickHouse client.
+ *
+ * Database, table and column names, index parameters and `whereStr` filters
+ * are interpolated into SQL as-is, so they must come from trusted input.
+ */
+export class MyScaleStore extends VectorStore {
+  private client: ClickHouseClient;
+
+  private indexType: string;
+
+  private indexParam: Record<string, string>;
+
+  private columnMap: ColumnMap;
+
+  private database: string;
+
+  private table: string;
+
+  private metric: metric;
+
+  private isInitialized = false;
+
+  constructor(embeddings: Embeddings, args: MyScaleLibArgs) {
+    super(embeddings, args);
+
+    this.indexType = args.indexType || "IVFFLAT";
+    this.indexParam = args.indexParam || {};
+    this.columnMap = args.columnMap || {
+      id: "id",
+      text: "text",
+      vector: "vector",
+      metadata: "metadata",
+    };
+    this.database = args.database || "default";
+    this.table = args.table || "vector_table";
+    this.metric = args.metric || "cosine";
+
+    this.client = createClient({
+      host: args.host,
+      username: args.username,
+      password: args.password,
+      // NOTE(review): presumably a session is needed so the SET statements
+      // issued in initialize() still apply to later queries - confirm.
+      session_id: uuid(),
+    });
+  }
+
+  /**
+   * Inserts pre-computed vectors with their documents, creating the table on
+   * first use. Does nothing when `vectors` is empty.
+   */
+  async addVectors(vectors: number[][], documents: Document[]): Promise<void> {
+    if (vectors.length === 0) {
+      return;
+    }
+
+    if (!this.isInitialized) {
+      await this.initialize();
+    }
+
+    const queryStr = this.buildInsertQuery(vectors, documents);
+    await this.client.exec({ query: queryStr });
+  }
+
+  /** Embeds each document's `pageContent` and inserts the results. */
+  async addDocuments(documents: Document[]): Promise<void> {
+    return this.addVectors(
+      await this.embeddings.embedDocuments(documents.map((d) => d.pageContent)),
+      documents
+    );
+  }
+
+  /**
+   * Runs a vector search and returns up to `k` `[document, dist]` pairs, where
+   * `dist` is the value of the `dist` column returned by the search query.
+   */
+  async similaritySearchVectorWithScore(
+    query: number[],
+    k: number,
+    filter?: MyScaleFilter
+  ): Promise<[Document, number][]> {
+    const queryStr = this.buildSearchQuery(query, k, filter);
+
+    const queryResultSet = await this.client.query({ query: queryStr });
+    const queryResult: {
+      data: { text: string; metadata: object; dist: number }[];
+    } = await queryResultSet.json();
+
+    const result: [Document, number][] = queryResult.data.map((item) => [
+      new Document({ pageContent: item.text, metadata: item.metadata }),
+      item.dist,
+    ]);
+
+    return result;
+  }
+
+  /** Same as the base-class search, with the filter typed as MyScaleFilter. */
+  async similaritySearch(
+    query: string,
+    k: number,
+    filter?: MyScaleFilter
+  ): Promise<Document[]> {
+    return super.similaritySearch(query, k, filter);
+  }
+
+  /** Same as the base-class search, with the filter typed as MyScaleFilter. */
+  async similaritySearchWithScore(
+    query: string,
+    k: number,
+    filter?: MyScaleFilter
+  ): Promise<[Document, number][]> {
+    return super.similaritySearchWithScore(query, k, filter);
+  }
+
+  /**
+   * Builds a store from raw texts. `metadatas` is either one object shared by
+   * every text or an array indexed in parallel with `texts`.
+   */
+  static async fromTexts(
+    texts: string[],
+    metadatas: object | object[],
+    embeddings: Embeddings,
+    args: MyScaleLibArgs
+  ): Promise<MyScaleStore> {
+    const docs: Document[] = [];
+    for (let i = 0; i < texts.length; i += 1) {
+      const metadata = Array.isArray(metadatas) ? metadatas[i] : metadatas;
+      const newDoc = new Document({
+        pageContent: texts[i],
+        metadata,
+      });
+      docs.push(newDoc);
+    }
+    return MyScaleStore.fromDocuments(docs, embeddings, args);
+  }
+
+  /** Creates a store and inserts `docs` into it. */
+  static async fromDocuments(
+    docs: Document[],
+    embeddings: Embeddings,
+    args: MyScaleLibArgs
+  ): Promise<MyScaleStore> {
+    const instance = new this(embeddings, args);
+    await instance.addDocuments(docs);
+    return instance;
+  }
+
+  /** Creates a store for a table that may already exist; inserts nothing. */
+  static async fromExistingIndex(
+    embeddings: Embeddings,
+    args: MyScaleLibArgs
+  ): Promise<MyScaleStore> {
+    const instance = new this(embeddings, args);
+
+    await instance.initialize();
+    return instance;
+  }
+
+  /**
+   * Applies the session settings and creates the table (if missing) with a
+   * vector index. The vector length is taken from a probe embedding.
+   */
+  private async initialize(): Promise<void> {
+    const dim = (await this.embeddings.embedQuery("try this out")).length;
+
+    let indexParamStr = "";
+    for (const [key, value] of Object.entries(this.indexParam)) {
+      indexParamStr += `, '${key}=${value}'`;
+    }
+
+    const query = `
+      CREATE TABLE IF NOT EXISTS ${this.database}.${this.table}(
+        ${this.columnMap.id} String,
+        ${this.columnMap.text} String,
+        ${this.columnMap.vector} Array(Float32),
+        ${this.columnMap.metadata} JSON,
+        CONSTRAINT cons_vec_len CHECK length(${this.columnMap.vector}) = ${dim},
+        VECTOR INDEX vidx ${this.columnMap.vector} TYPE ${this.indexType}('metric_type=${this.metric}'${indexParamStr})
+      ) ENGINE = MergeTree ORDER BY ${this.columnMap.id}
+    `;
+
+    await this.client.exec({ query: "SET allow_experimental_object_type=1" });
+    await this.client.exec({
+      query: "SET output_format_json_named_tuples_as_objects = 1",
+    });
+    await this.client.exec({ query });
+
+    // Recorded here so that every caller, including fromExistingIndex(),
+    // spares later addVectors() calls a second probe embedding and CREATE.
+    this.isInitialized = true;
+  }
+
+  /** Builds one multi-row INSERT statement for the given vectors/documents. */
+  private buildInsertQuery(vectors: number[][], documents: Document[]): string {
+    // Columns are listed explicitly so they always line up with the values
+    // below, whatever key order a caller-supplied columnMap uses.
+    const columnsStr = [
+      this.columnMap.id,
+      this.columnMap.text,
+      this.columnMap.vector,
+      this.columnMap.metadata,
+    ].join(", ");
+
+    const data: string[] = [];
+    for (let i = 0; i < vectors.length; i += 1) {
+      const vector = vectors[i];
+      const document = documents[i];
+      const item = [
+        `'${uuid()}'`,
+        `'${this.escapeString(document.pageContent)}'`,
+        `[${vector}]`,
+        // Metadata is escaped like the text: a quote or backslash inside the
+        // JSON would otherwise terminate or corrupt the SQL string literal.
+        `'${this.escapeString(JSON.stringify(document.metadata))}'`,
+      ].join(", ");
+      data.push(`(${item})`);
+    }
+    const dataStr = data.join(", ");
+
+    return `
+      INSERT INTO TABLE
+        ${this.database}.${this.table}(${columnsStr})
+      VALUES
+        ${dataStr}
+    `;
+  }
+
+  /** Escapes backslashes and single quotes for a single-quoted SQL string. */
+  private escapeString(str: string): string {
+    return str.replace(/\\/g, "\\\\").replace(/'/g, "\\'");
+  }
+
+  /**
+   * Builds the search statement. Results are ordered descending for the "ip"
+   * metric and ascending otherwise; `filter.whereStr` is inserted verbatim.
+   */
+  private buildSearchQuery(
+    query: number[],
+    k: number,
+    filter?: MyScaleFilter
+  ): string {
+    const order = this.metric === "ip" ? "DESC" : "ASC";
+
+    const whereStr = filter ? `PREWHERE ${filter.whereStr}` : "";
+    return `
+      SELECT ${this.columnMap.text} AS text, ${this.columnMap.metadata} AS metadata, dist
+      FROM ${this.database}.${this.table}
+      ${whereStr}
+      ORDER BY distance(${this.columnMap.vector}, [${query}]) AS dist ${order}
+      LIMIT ${k}
+    `;
+  }
+}
diff --git a/langchain/src/vectorstores/tests/myscale.int.test.ts b/langchain/src/vectorstores/tests/myscale.int.test.ts
new file mode 100644
index 000000000000..56f0a67aa14f
--- /dev/null
+++ b/langchain/src/vectorstores/tests/myscale.int.test.ts
@@ -0,0 +1,71 @@
+/* eslint-disable no-process-env */
+import { test, expect } from "@jest/globals";
+
+import { MyScaleStore } from "../myscale.js";
+import { OpenAIEmbeddings } from "../../embeddings/openai.js";
+import { Document } from "../../document.js";
+
+test.skip("MyScaleStore.fromText", async () => {
+  const vectorStore = await MyScaleStore.fromTexts(
+    ["Hello world", "Bye bye", "hello nice world"],
+    [
+      { id: 2, name: "2" },
+      { id: 1, name: "1" },
+      { id: 3, name: "3" },
+    ],
+    new OpenAIEmbeddings(),
+    {
+      host: process.env.MYSCALE_HOST || "https://localhost:8443",
+      username: process.env.MYSCALE_USERNAME || "username",
+      password: process.env.MYSCALE_PASSWORD || "password",
+    }
+  );
+
+  const results = await vectorStore.similaritySearch("hello world", 1);
+  expect(results).toEqual([
+    new Document({
+      pageContent: "Hello world",
+      metadata: { id: 2, name: "2" },
+    }),
+  ]);
+
+  const filteredResults = await vectorStore.similaritySearch("hello world", 1, {
+    whereStr: "metadata.name = '1'",
+  });
+  expect(filteredResults).toEqual([
+    new Document({
+      pageContent: "Bye bye",
+      metadata: { id: 1, name: "1" },
+    }),
+  ]);
+});
+
+test.skip("MyScaleStore.fromExistingIndex", async () => {
+  const vectorStore = await MyScaleStore.fromExistingIndex(
+    new OpenAIEmbeddings(),
+    {
+      host: process.env.MYSCALE_HOST || "https://localhost:8443",
+      username: process.env.MYSCALE_USERNAME || "username",
+      password: process.env.MYSCALE_PASSWORD || "password",
+      table: "test_table",
+    }
+  );
+
+  const results = await vectorStore.similaritySearch("hello world", 1);
+  expect(results).toEqual([
+    new Document({
+      pageContent: "Hello world",
+      metadata: { id: 2, name: "2" },
+    }),
+  ]);
+
+  const filteredResults = await vectorStore.similaritySearch("hello world", 1, {
+    whereStr: "metadata.name = '1'",
+  });
+  expect(filteredResults).toEqual([
+    new Document({
+      pageContent: "Bye bye",
+      metadata: { id: 1, name: "1" },
+    }),
+  ]);
+});
diff --git a/langchain/tsconfig.json b/langchain/tsconfig.json
index 9dfb65bc1e28..3b21adf7d1cd 100644
--- a/langchain/tsconfig.json
+++ b/langchain/tsconfig.json
@@ -66,6 +66,7 @@
       "src/vectorstores/opensearch.ts",
       "src/vectorstores/milvus.ts",
       "src/vectorstores/prisma.ts",
+      "src/vectorstores/myscale.ts",
       "src/text_splitter.ts",
       "src/memory/index.ts",
       "src/document.ts",
diff --git a/yarn.lock b/yarn.lock
index 4c1a6ed745ac..855c67ce5e23 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -3230,6 +3230,16 @@ __metadata:
   languageName: node
   linkType: hard
 
+"@clickhouse/client@npm:^0.0.14":
+  version: 0.0.14
+  resolution: "@clickhouse/client@npm:0.0.14"
+  dependencies:
+    node-abort-controller: ^3.0.1
+    uuid: ^9.0.0
+  checksum: dc4ce8091856497ffaa50a68194c7ff555357f2a6b0355cfa35cb335d0629000a10ef60c15a2cc33d1cd368730056c2bdd164c80a520f0550ab57c5bbfb7ad2c
+  languageName: node
+  linkType: hard
+
 "@cloudflare/kv-asset-handler@npm:^0.2.0":
   version: 0.2.0
   resolution: "@cloudflare/kv-asset-handler@npm:0.2.0"
@@ -13004,6 +13014,7 @@ __metadata:
   version: 0.0.0-use.local
   resolution: "examples@workspace:examples"
   dependencies:
+    "@clickhouse/client": ^0.0.14
     "@getmetal/metal-sdk": ^2.0.1
     "@opensearch-project/opensearch": ^2.2.0
     "@pinecone-database/pinecone": ^0.0.12
@@ -17137,6 +17148,7 @@ __metadata:
     "@anthropic-ai/sdk": ^0.4.3
     "@aws-sdk/client-lambda": ^3.310.0
     "@aws-sdk/client-s3": ^3.310.0
+    "@clickhouse/client": ^0.0.14
     "@dqbd/tiktoken": ^1.0.4
     "@faker-js/faker": ^7.6.0
     "@getmetal/metal-sdk": ^2.0.1
@@ -17206,6 +17218,7 @@ __metadata:
   peerDependencies:
     "@aws-sdk/client-lambda": ^3.310.0
     "@aws-sdk/client-s3": ^3.310.0
+    "@clickhouse/client": ^0.0.14
     "@getmetal/metal-sdk": "*"
     "@huggingface/inference": ^1.5.1
     "@opensearch-project/opensearch": "*"
@@ -17235,6 +17248,8 @@ __metadata:
       optional: true
     "@aws-sdk/client-s3":
       optional: true
+    "@clickhouse/client":
+      optional: true
     "@getmetal/metal-sdk":
       optional: true
     "@huggingface/inference":
@@ -18658,6 +18673,13 @@ __metadata:
   languageName: node
   linkType: hard
 
+"node-abort-controller@npm:^3.0.1":
+  version: 3.1.1
+  resolution: "node-abort-controller@npm:3.1.1"
+  checksum: 2c340916af9710328b11c0828223fc65ba320e0d082214a211311bf64c2891028e42ef276b9799188c4ada9e6e1c54cf7a0b7c05dd9d59fcdc8cd633304c8047
+  languageName: node
+  linkType: hard
+
 "node-addon-api@npm:^4.2.0":
   version: 4.3.0
   resolution: "node-addon-api@npm:4.3.0"