Skip to content

Commit 31da4b7

Browse files
authored
Merge pull request #46 from codefuse-ai/modelcache_dev_mm
Update README; update multimodal (mm) cache storage logic
2 parents fedf74d + 021a88b commit 31da4b7

File tree

5 files changed

+122
-81
lines changed

5 files changed

+122
-81
lines changed

README.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
<div align="center">
22
<h1>
3-
Codefuse-ModelCache
3+
ModelCache
44
</h1>
55
</div>
66

@@ -262,7 +262,7 @@ In ModelCache, we adopted the main idea of GPTCache, includes core modules: ada
262262
- [ ] Support ElasticSearch
263263
### Vector Storage
264264
- [ ] Adapts Faiss storage in multimodal scenarios.
265-
### Rank能力
265+
### Ranking
266266
- [ ] Add ranking model to refine the order of data after embedding recall.
267267
### Service
268268
- [ ] Supports FastAPI.

README_CN.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
<div align="center">
22
<h1>
3-
Codefuse-ModelCache
3+
ModelCache
44
</h1>
55
</div>
66

modelcache_mm/manager/scalar_data/sql_storage_sqlite.py

+38-51
Original file line numberDiff line numberDiff line change
@@ -16,47 +16,34 @@ def __init__(
1616
self.create()
1717

1818
def create(self):
19-
# answer_table_sql = """CREATE TABLE IF NOT EXISTS `modelcache_llm_answer` (
20-
# `id` bigint(20) NOT NULL AUTO_INCREMENT comment '主键',
21-
# `gmt_create` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP comment '创建时间',
22-
# `gmt_modified` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP comment '修改时间',
23-
# `question` text NOT NULL comment 'question',
24-
# `answer` text NOT NULL comment 'answer',
25-
# `answer_type` int(11) NOT NULL comment 'answer_type',
26-
# `hit_count` int(11) NOT NULL DEFAULT '0' comment 'hit_count',
27-
# `model` varchar(1000) NOT NULL comment 'model',
28-
# `embedding_data` blob NOT NULL comment 'embedding_data',
29-
# PRIMARY KEY(`id`)
30-
# ) AUTO_INCREMENT = 1 DEFAULT CHARSET = utf8mb4 COMMENT = 'modelcache_llm_answer';
31-
# """
32-
answer_table_sql = """CREATE TABLE IF NOT EXISTS modelcache_llm_answer (
33-
id INTEGER PRIMARY KEY AUTOINCREMENT,
34-
gmt_create TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
35-
gmt_modified TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
36-
question TEXT NOT NULL,
37-
answer TEXT NOT NULL,
38-
answer_type INTEGER NOT NULL,
39-
hit_count INTEGER NOT NULL DEFAULT 0,
40-
model VARCHAR(1000) NOT NULL,
41-
embedding_data BLOB NOT NULL
42-
);
43-
"""
19+
# answer_table_sql = """CREATE TABLE IF NOT EXISTS modelcache_llm_answer (
20+
# id INTEGER PRIMARY KEY AUTOINCREMENT,
21+
# gmt_create TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
22+
# gmt_modified TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
23+
# question TEXT NOT NULL,
24+
# answer TEXT NOT NULL,
25+
# answer_type INTEGER NOT NULL,
26+
# hit_count INTEGER NOT NULL DEFAULT 0,
27+
# model VARCHAR(1000) NOT NULL,
28+
# embedding_data BLOB NOT NULL
29+
# );
30+
# """
31+
32+
answer_table_sql = """CREATE TABLE IF NOT EXISTS `open_cache_mm_answer` (
33+
`id` INTEGER PRIMARY KEY AUTOINCREMENT,
34+
`gmt_create` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
35+
`gmt_modified` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
36+
`question_text` TEXT NOT NULL,
37+
`image_url` VARCHAR(2048) NOT NULL,
38+
`answer` TEXT NOT NULL,
39+
`answer_type` INTEGER NOT NULL,
40+
`hit_count` INTEGER NOT NULL DEFAULT 0,
41+
`model` VARCHAR(1000) NOT NULL,
42+
`image_raw` BLOB DEFAULT NULL,
43+
`image_id` VARCHAR(1000) DEFAULT NULL
44+
);
45+
"""
4446

45-
# log_table_sql = """CREATE TABLE IF NOT EXISTS `modelcache_query_log` (
46-
# `id` bigint(20) NOT NULL AUTO_INCREMENT comment '主键',
47-
# `gmt_create` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP comment '创建时间',
48-
# `gmt_modified` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP comment '修改时间',
49-
# `error_code` int(11) NOT NULL comment 'errorCode',
50-
# `error_desc` varchar(1000) NOT NULL comment 'errorDesc',
51-
# `cache_hit` varchar(100) NOT NULL comment 'cacheHit',
52-
# `delta_time` float NOT NULL comment 'delta_time',
53-
# `model` varchar(1000) NOT NULL comment 'model',
54-
# `query` text NOT NULL comment 'query',
55-
# `hit_query` text NOT NULL comment 'hitQuery',
56-
# `answer` text NOT NULL comment 'answer',
57-
# PRIMARY KEY(`id`)
58-
# ) AUTO_INCREMENT = 1 DEFAULT CHARSET = utf8mb4 COMMENT = 'modelcache_query_log';
59-
# """
6047
log_table_sql = """CREATE TABLE IF NOT EXISTS modelcache_query_log (
6148
id INTEGER PRIMARY KEY AUTOINCREMENT,
6249
gmt_create TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
@@ -85,19 +72,19 @@ def create(self):
8572

8673
def _insert(self, data: List):
8774
answer = data[0]
88-
question = data[1]
89-
embedding_data = data[2]
90-
model = data[3]
75+
text = data[1]
76+
image_url = data[2]
77+
image_id = data[3]
78+
model = data[4]
9179
answer_type = 0
92-
embedding_data = embedding_data.tobytes()
9380

94-
table_name = "modelcache_llm_answer"
95-
insert_sql = "INSERT INTO {} (question, answer, answer_type, model, embedding_data) VALUES (?, ?, ?, ?, ?)".format(table_name)
81+
table_name = "open_cache_mm_answer"
82+
insert_sql = "INSERT INTO {} (question_text, image_url, image_id, answer, answer_type, model) VALUES (?, ?, ?, ?, ?, ?)".format(table_name)
9683

9784
conn = sqlite3.connect(self._url)
9885
try:
9986
cursor = conn.cursor()
100-
values = (question, answer, answer_type, model, embedding_data)
87+
values = (text, image_url, image_id, answer, answer_type, model)
10188
cursor.execute(insert_sql, values)
10289
conn.commit()
10390
id = cursor.lastrowid
@@ -141,7 +128,7 @@ def insert_query_resp(self, query_resp, **kwargs):
141128
conn.close()
142129

143130
def get_data_by_id(self, key: int):
144-
table_name = "modelcache_llm_answer"
131+
table_name = "open_cache_mm_answer"
145132
query_sql = "select question, answer, embedding_data, model from {} where id={}".format(table_name, key)
146133
conn = sqlite3.connect(self._url)
147134
try:
@@ -160,7 +147,7 @@ def get_data_by_id(self, key: int):
160147
return None
161148

162149
def update_hit_count_by_id(self, primary_id: int):
163-
table_name = "modelcache_llm_answer"
150+
table_name = "open_cache_mm_answer"
164151
update_sql = "UPDATE {} SET hit_count = hit_count+1 WHERE id={}".format(table_name, primary_id)
165152

166153
conn = sqlite3.connect(self._url)
@@ -178,7 +165,7 @@ def get_ids(self, deleted=True):
178165
pass
179166

180167
def mark_deleted(self, keys):
181-
table_name = "modelcache_llm_answer"
168+
table_name = "open_cache_mm_answer"
182169
delete_sql = "Delete from {} WHERE id in ({})".format(table_name, ",".join([str(i) for i in keys]))
183170
conn = sqlite3.connect(self._url)
184171
try:
@@ -193,7 +180,7 @@ def mark_deleted(self, keys):
193180
return delete_count
194181

195182
def model_deleted(self, model_name):
196-
table_name = "modelcache_llm_answer"
183+
table_name = "open_cache_mm_answer"
197184
delete_sql = "Delete from {} WHERE model='{}'".format(table_name, model_name)
198185
conn = sqlite3.connect(self._url)
199186
try:
+75
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
# -*- coding: utf-8 -*-
2+
import os
3+
from typing import List
4+
import numpy as np
5+
from modelcache_mm.manager.vector_data.base import VectorBase, VectorData
6+
from modelcache_mm.utils import import_faiss
7+
import_faiss()
8+
import faiss # pylint: disable=C0413
9+
10+
11+
class Faiss(VectorBase):
    """Vector store backed by a local Faiss index file.

    Uses an ``IDMap,Flat`` index: exact (brute-force) L2 search over raw
    float32 vectors, addressable by caller-supplied integer ids.  The index
    is persisted to ``index_file_path`` by :meth:`flush` / :meth:`close`.
    """

    def __init__(self,
                 index_file_path,
                 dimension: int = 0,
                 top_k: int = 1
                 ):
        """
        Args:
            index_file_path: on-disk index path; an existing file is loaded.
            dimension: vector dimensionality (must be > 0 for a fresh index;
                the default of 0 only works when the index is loaded from disk).
            top_k: default neighbour count used by :meth:`search` for top_k=-1.
        """
        self._dimension = dimension
        self._index_file_path = index_file_path
        # "IDMap,Flat" = exact L2 search with external integer ids.
        self._index = faiss.index_factory(self._dimension, "IDMap,Flat", faiss.METRIC_L2)
        self._top_k = top_k
        # A previously persisted index takes precedence over the fresh one.
        if os.path.isfile(index_file_path):
            self._index = faiss.read_index(index_file_path)

    def add(self, datas: List[VectorData], model=None, mm_type=None):
        """Insert vectors with their ids (``model``/``mm_type`` are unused here)."""
        if not datas:
            # zip(*()) would raise ValueError on the unpack below; an empty
            # batch is simply a no-op.
            return
        data_array, id_array = map(list, zip(*((data.data, data.id) for data in datas)))
        np_data = np.array(data_array).astype("float32")
        ids = np.array(id_array)
        self._index.add_with_ids(np_data, ids)

    def search(self, data: np.ndarray, top_k: int, model, mm_type='mm'):
        """Return ``[(distance, id), ...]`` for the nearest vectors.

        Returns None when the index is empty.  ``top_k == -1`` requests the
        default ``top_k`` given at construction time.
        """
        if self._index.ntotal == 0:
            return None
        if top_k == -1:
            top_k = self._top_k
        np_data = np.array(data).astype("float32").reshape(1, -1)
        dist, ids = self._index.search(np_data, top_k)
        ids = [int(i) for i in ids[0]]
        return list(zip(dist[0], ids))

    def rebuild_col(self, ids=None):
        """Drop all vectors from the index.

        NOTE(review): returns an error *string* on failure and None on
        success — callers must not test the result for truthiness.
        """
        try:
            self._index.reset()
        except Exception as e:
            return f"An error occurred during index rebuild: {e}"

    def rebuild(self, ids=None):
        # Flat indexes need no retraining; always report success.
        return True

    def delete(self, ids):
        """Remove the given integer ids from the index."""
        ids_to_remove = np.array(ids)
        self._index.remove_ids(faiss.IDSelectorBatch(ids_to_remove.size, faiss.swig_ptr(ids_to_remove)))

    def create(self, model=None, mm_type=None):
        # No-op: a flat index needs no per-model collection setup.
        pass

    def flush(self):
        """Persist the current index to ``index_file_path``."""
        faiss.write_index(self._index, self._index_file_path)

    def close(self):
        # Closing just persists; faiss indexes hold no OS resources to release.
        self.flush()

    def rebuild_idx(self, model):
        # Interface stub — per-model rebuild is not applicable to this backend.
        pass

    def count(self):
        """Number of vectors currently stored."""
        return self._index.ntotal

modelcache_mm/manager/vector_data/manager.py

+6-27
Original file line numberDiff line numberDiff line change
@@ -98,36 +98,15 @@ def get(name, **kwargs):
9898
t_dimension=t_dimension,
9999
)
100100
elif name == "faiss":
101-
from modelcache.manager.vector_data.faiss import Faiss
102-
101+
from modelcache_mm.manager.vector_data.faiss import Faiss
103102
dimension = kwargs.get("dimension", DIMENSION)
104-
index_path = kwargs.pop("index_path", FAISS_INDEX_PATH)
105103
VectorBase.check_dimension(dimension)
106-
vector_base = Faiss(
107-
index_file_path=index_path, dimension=dimension, top_k=top_k
108-
)
109-
elif name == "chromadb":
110-
from modelcache.manager.vector_data.chroma import Chromadb
111-
112-
client_settings = kwargs.get("client_settings", None)
113-
persist_directory = kwargs.get("persist_directory", None)
114-
collection_name = kwargs.get("collection_name", COLLECTION_NAME)
115-
vector_base = Chromadb(
116-
client_settings=client_settings,
117-
persist_directory=persist_directory,
118-
collection_name=collection_name,
119-
top_k=top_k,
120-
)
121-
elif name == "hnswlib":
122-
from modelcache.manager.vector_data.hnswlib_store import Hnswlib
123104

124-
dimension = kwargs.get("dimension", DIMENSION)
125-
index_path = kwargs.pop("index_path", "./hnswlib_index.bin")
126-
max_elements = kwargs.pop("max_elements", 100000)
127-
VectorBase.check_dimension(dimension)
128-
vector_base = Hnswlib(
129-
index_file_path=index_path, dimension=dimension,
130-
top_k=top_k, max_elements=max_elements
105+
index_path = kwargs.pop("index_path", FAISS_INDEX_PATH)
106+
vector_base = Faiss(
107+
index_file_path=index_path,
108+
dimension=dimension,
109+
top_k=top_k
131110
)
132111
else:
133112
raise NotFoundError("vector store", name)

0 commit comments

Comments
 (0)