Python: Feature new memory stores and collections (#7614)
### Motivation and Context

This PR adds new vector store and vector store record collections as
well as implementations for:

- Azure AI Search
- Redis
- Qdrant
- Volatile (in-memory)

It also adds samples, integration tests, and unit tests for these.

It further adds the vector store record fields, definition, and decorator,
with tests and samples.

All of these are marked experimental. The existing Redis, Azure AI Search,
Qdrant, and Volatile connectors will be deprecated once the new
collections are feature complete.

### Description


### Contribution Checklist


- [x] The code builds clean without any errors or warnings
- [x] The PR follows the [SK Contribution
Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md)
and the [pre-submission formatting
script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts)
raises no violations
- [x] All unit tests pass, and I have added new tests where possible
- [x] I didn't break anyone 😄
eavanvalkenburg authored Aug 6, 2024
1 parent 07f94f2 commit 1356d5f
Showing 63 changed files with 6,359 additions and 282 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/python-integration-tests.yml
@@ -131,6 +131,7 @@ jobs:
VERTEX_AI_PROJECT_ID: ${{ vars.VERTEX_AI_PROJECT_ID }}
VERTEX_AI_GEMINI_MODEL_ID: ${{ vars.VERTEX_AI_GEMINI_MODEL_ID }}
VERTEX_AI_EMBEDDING_MODEL_ID: ${{ vars.VERTEX_AI_EMBEDDING_MODEL_ID }}
REDIS_CONNECTION_STRING: ${{ vars.REDIS_CONNECTION_STRING }}
run: |
cd python
poetry run pytest ./tests/integration ./tests/samples -v --junitxml=pytest.xml
@@ -242,6 +243,7 @@ jobs:
VERTEX_AI_PROJECT_ID: ${{ vars.VERTEX_AI_PROJECT_ID }}
VERTEX_AI_GEMINI_MODEL_ID: ${{ vars.VERTEX_AI_GEMINI_MODEL_ID }}
VERTEX_AI_EMBEDDING_MODEL_ID: ${{ vars.VERTEX_AI_EMBEDDING_MODEL_ID }}
REDIS_CONNECTION_STRING: ${{ vars.REDIS_CONNECTION_STRING }}
run: |
if ${{ matrix.os == 'ubuntu-latest' }}; then
docker run -d --name redis-stack-server -p 6379:6379 redis/redis-stack-server:latest
6 changes: 3 additions & 3 deletions python/.coveragerc
@@ -10,8 +10,8 @@ omit =
semantic_kernel/connectors/memory/mongodb_atlas/*
semantic_kernel/connectors/memory/pinecone/*
semantic_kernel/connectors/memory/postgres/*
semantic_kernel/connectors/memory/qdrant/*
semantic_kernel/connectors/memory/redis/*
semantic_kernel/connectors/memory/qdrant/qdrant_memory_store.py
semantic_kernel/connectors/memory/redis/redis_memory_store.py
semantic_kernel/connectors/memory/usearch/*
semantic_kernel/connectors/memory/weaviate/*
semantic_kernel/reliability/*
@@ -33,4 +33,4 @@ exclude_lines =
# TYPE_CHECKING and @overload blocks are never executed during pytest run
if TYPE_CHECKING:
@overload
@abstractmethod
@abstractmethod
6 changes: 5 additions & 1 deletion python/.cspell.json
@@ -47,6 +47,10 @@
"protos",
"endregion",
"vertexai",
"aiplatform"
"aiplatform",
"serde",
"datamodel",
"vectorstoremodel",
"qdrant"
]
}
6 changes: 6 additions & 0 deletions python/mypy.ini
@@ -26,6 +26,8 @@ ignore_errors = true
[mypy-semantic_kernel.connectors.memory.astradb.*]
ignore_errors = true

[mypy-semantic_kernel.connectors.memory.azure_ai_search.*]
ignore_errors = false
[mypy-semantic_kernel.connectors.memory.azure_cognitive_search.*]
ignore_errors = true

@@ -50,9 +52,13 @@ ignore_errors = true
[mypy-semantic_kernel.connectors.memory.postgres.*]
ignore_errors = true

[mypy-semantic_kernel.connectors.memory.qdrant.qdrant_vector_record_store.*]
ignore_errors = true
[mypy-semantic_kernel.connectors.memory.qdrant.*]
ignore_errors = true

[mypy-semantic_kernel.connectors.memory.redis.redis_vector_record_store.*]
ignore_errors = true
[mypy-semantic_kernel.connectors.memory.redis.*]
ignore_errors = true

327 changes: 229 additions & 98 deletions python/poetry.lock

Large diffs are not rendered by default.

37 changes: 24 additions & 13 deletions python/pyproject.toml
@@ -57,8 +57,9 @@ chromadb = { version = ">=0.4.13,<0.6.0", optional = true}
google-cloud-aiplatform = { version = "^1.60.0", optional = true}
google-generativeai = { version = "^0.7.2", optional = true}
# hugging face
transformers = { version = "^4.28.1", extras=["torch"], optional = true}
transformers = { version = "^4.28.1", extras=['torch'], optional = true}
sentence-transformers = { version = "^2.2.2", optional = true}
torch = {version = "2.2.2", optional = true}
# mongo
motor = { version = "^3.3.2", optional = true }
# notebooks
@@ -73,20 +74,20 @@ ollama = { version = "^0.2.1", optional = true}
# pinecone
pinecone-client = { version = ">=3.0.0", optional = true}
# postgres
psycopg = { version="^3.1.9", extras=["binary","pool"], optional = true}
psycopg = { version="^3.2.1", extras=["binary","pool"], optional = true}
# qdrant
qdrant-client = { version = '^1.9', optional = true}
# redis
redis = { version = "^4.6.0", optional = true}
redis = { version = "^5.0.7", extras=['hiredis'], optional = true}
types-redis = { version="^4.6.0.20240425", optional = true }
# usearch
usearch = { version = "^2.9", optional = true}
pyarrow = { version = ">=12.0.1,<18.0.0", optional = true}
weaviate-client = { version = ">=3.18,<5.0", optional = true}
ruff = "0.5.2"
pandas = {version = "^2.2.2", optional = true}

[tool.poetry.group.dev.dependencies]
pre-commit = ">=3.7.1"
ruff = ">=0.5"
ipykernel = "^6.29.4"
nbconvert = "^7.16.4"
pytest = "^8.2.1"
@@ -96,6 +97,7 @@ pytest-asyncio = "^0.23.7"
snoop = "^0.4.3"
mypy = ">=1.10.0"
types-PyYAML = "^6.0.12.20240311"
ruff = "^0.5.2"

[tool.poetry.group.unit-tests]
optional = true
@@ -109,8 +111,14 @@ mistralai = "^0.4.1"
ollama = "^0.2.1"
google-cloud-aiplatform = "^1.60.0"
google-generativeai = "^0.7.2"
transformers = { version = "^4.28.1", extras=["torch"]}
sentence-transformers = "^2.2.2"
transformers = { version = "^4.28.1", extras=['torch']}
sentence-transformers = { version = "^2.2.2"}
torch = {version = "2.2.2"}
# qdrant
qdrant-client = '^1.9'
# redis
redis = { version = "^5.0.7", extras=['hiredis']}
pandas = {version = "^2.2.2"}

[tool.poetry.group.tests]
optional = true
@@ -129,8 +137,9 @@ chromadb = ">=0.4.13,<0.6.0"
google-cloud-aiplatform = "^1.60.0"
google-generativeai = "^0.7.2"
# hugging face
transformers = { version = "^4.28.1", extras=["torch"]}
sentence-transformers = "^2.2.2"
transformers = { version = "^4.28.1", extras=['torch']}
sentence-transformers = { version = "^2.2.2"}
torch = {version = "2.2.2"}
# milvus
pymilvus = ">=2.3,<2.4.4"
milvus = { version = ">=2.3,<2.3.8", markers = 'sys_platform != "win32"'}
@@ -147,21 +156,23 @@ psycopg = { version="^3.1.9", extras=["binary","pool"]}
# qdrant
qdrant-client = '^1.9'
# redis
redis = "^4.6.0"
redis = { version="^5.0.7", extras=['hiredis']}
types-redis = { version="^4.6.0.20240425" }
# usearch
usearch = "^2.9"
pyarrow = ">=12.0.1,<18.0.0"
# weaviate
weaviate-client = ">=3.18,<5.0"
pandas = {version = "^2.2.2"}

# Extras are exposed to pip, this allows a user to easily add the right dependencies to their environment
[tool.poetry.extras]
all = ["transformers", "sentence-transformers", "qdrant-client", "chromadb", "pymilvus", "milvus", "mistralai", "ollama", "google", "weaviate-client", "pinecone-client", "psycopg", "redis", "azure-ai-inference", "azure-search-documents", "azure-core", "azure-identity", "azure-cosmos", "usearch", "pyarrow", "ipykernel", "motor"]
all = ["transformers", "sentence-transformers", "torch", "qdrant-client", "chromadb", "pymilvus", "milvus", "mistralai", "ollama", "google", "weaviate-client", "pinecone-client", "psycopg", "redis", "azure-ai-inference", "azure-search-documents", "azure-core", "azure-identity", "azure-cosmos", "usearch", "pyarrow", "ipykernel", "motor"]

azure = ["azure-ai-inference", "azure-search-documents", "azure-core", "azure-identity", "azure-cosmos", "msgraph-sdk"]
chromadb = ["chromadb"]
google = ["google-cloud-aiplatform", "google-generativeai"]
hugging_face = ["transformers", "sentence-transformers"]
hugging_face = ["transformers", "sentence-transformers", "torch"]
milvus = ["pymilvus", "milvus"]
mistralai = ["mistralai"]
ollama = ["ollama"]
Expand All @@ -170,7 +181,7 @@ notebooks = ["ipykernel"]
pinecone = ["pinecone-client"]
postgres = ["psycopg"]
qdrant = ["qdrant-client"]
redis = ["redis"]
redis = ["redis", "types-redis"]
usearch = ["usearch", "pyarrow"]
weaviate = ["weaviate-client"]

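With the updated extras, a user can pull in a connector's dependencies directly from pip, for example `pip install semantic-kernel[redis]` or `pip install semantic-kernel[qdrant]` for the new collections.
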
160 changes: 160 additions & 0 deletions python/samples/concepts/memory/data_models.py
@@ -0,0 +1,160 @@
# Copyright (c) Microsoft. All rights reserved.

from dataclasses import dataclass, field
from typing import Annotated, Any
from uuid import uuid4

from pandas import DataFrame
from pydantic import Field

from semantic_kernel.data.vector_store_model_decorator import vectorstoremodel
from semantic_kernel.data.vector_store_model_definition import VectorStoreRecordDefinition
from semantic_kernel.data.vector_store_record_fields import (
VectorStoreRecordDataField,
VectorStoreRecordKeyField,
VectorStoreRecordVectorField,
)
from semantic_kernel.kernel_pydantic import KernelBaseModel

# This concept shows the different ways you can create a vector store data model:
# using dataclasses, Pydantic models, and plain Python classes,
# as well as container types like Pandas DataFrames.

# There are a number of universal things about these data models:
# they must specify the type of each field through the annotation (or the definition),
# and there must be at least one field of type VectorStoreRecordKeyField.
# If you set embedding_property_name on a VectorStoreRecordDataField, that field must exist and be a vector field.
# An unannotated field is allowed, but it must have a default value.

# The purpose of these models is to be what you pass to and get back from a vector store.
# There may be limitations to the data types a vector store can handle,
# so not every store can handle exactly the same model.
# For instance, some stores only allow a string as the key field, while others allow both str and int,
# so defining the key as an int might make some stores unusable.

# The decorator takes the class and pulls out the fields and annotations to create a definition
# of type VectorStoreRecordDefinition.
# This definition is used by the vector store to know how to handle the data model.

# You can also create the definition yourself and pass it to the vector store together with a standard type,
# like a dict or list.
# Or you can use the definition in container mode with something like a Pandas DataFrame.


# Data model using built-in Python dataclasses
@vectorstoremodel
@dataclass
class DataModelDataclass:
vector: Annotated[list[float], VectorStoreRecordVectorField]
key: Annotated[str, VectorStoreRecordKeyField()] = field(default_factory=lambda: str(uuid4()))
content: Annotated[str, VectorStoreRecordDataField(has_embedding=True, embedding_property_name="vector")] = (
"content1"
)
other: str | None = None


# Data model using Pydantic BaseModels
@vectorstoremodel
class DataModelPydantic(KernelBaseModel):
vector: Annotated[list[float], VectorStoreRecordVectorField]
key: Annotated[str, VectorStoreRecordKeyField()] = Field(default_factory=lambda: str(uuid4()))
content: Annotated[str, VectorStoreRecordDataField(has_embedding=True, embedding_property_name="vector")] = (
"content1"
)
other: str | None = None


# Data model using Pydantic BaseModels with mixed annotations (from pydantic and SK)
@vectorstoremodel
class DataModelPydanticComplex(KernelBaseModel):
vector: Annotated[list[float], VectorStoreRecordVectorField]
key: Annotated[str, Field(default_factory=lambda: str(uuid4())), VectorStoreRecordKeyField()]
content: Annotated[str, VectorStoreRecordDataField(has_embedding=True, embedding_property_name="vector")] = (
"content1"
)
other: str | None = None


# Data model using Python classes
# This one includes a custom serialize and deserialize method
@vectorstoremodel
class DataModelPython:
def __init__(
self,
vector: Annotated[list[float], VectorStoreRecordVectorField],
key: Annotated[str, VectorStoreRecordKeyField] = None,
content: Annotated[
str, VectorStoreRecordDataField(has_embedding=True, embedding_property_name="vector")
] = "content1",
other: str | None = None,
):
self.vector = vector
self.other = other
self.key = key or str(uuid4())
self.content = content

def __str__(self) -> str:
return f"DataModelPython(vector={self.vector}, key={self.key}, content={self.content}, other={self.other})"

def serialize(self) -> dict[str, Any]:
return {
"vector": self.vector,
"key": self.key,
"content": self.content,
}

@classmethod
def deserialize(cls, obj: dict[str, Any]) -> "DataModelPython":
return cls(
vector=obj["vector"],
key=obj["key"],
content=obj["content"],
)


# Data model definition for use with Pandas.
# Note the container_mode flag, which ensures that returned records are in a container,
# even when requesting a batch of records.
# There are also to_dict and from_dict methods, used to convert the data model to and from a dict;
# these should be specific to the type used. If dict is used as the type, they can be left off.
data_model_definition_pandas = VectorStoreRecordDefinition(
fields={
"vector": VectorStoreRecordVectorField(property_type="list[float]"),
"key": VectorStoreRecordKeyField(property_type="str"),
"content": VectorStoreRecordDataField(
property_type="str", has_embedding=True, embedding_property_name="vector"
),
},
container_mode=True,
to_dict=lambda record, **_: record.to_dict(orient="records"),
from_dict=lambda records, **_: DataFrame(records),
)


if __name__ == "__main__":
data_item1 = DataModelDataclass(content="Hello, world!", vector=[1.0, 2.0, 3.0], other=None)
data_item2 = DataModelPydantic(content="Hello, world!", vector=[1.0, 2.0, 3.0], other=None)
data_item3 = DataModelPydanticComplex(content="Hello, world!", vector=[1.0, 2.0, 3.0], other=None)
data_item4 = DataModelPython(content="Hello, world!", vector=[1.0, 2.0, 3.0], other=None)
print("Example records:")
print(f"DataClass:\n {data_item1}", end="\n\n")
print(f"Pydantic:\n {data_item2}", end="\n\n")
print(f"Pydantic with annotations:\n {data_item3}", end="\n\n")
print(f"Python:\n {data_item4}", end="\n\n")

print("Item definitions:")
print(f"DataClass:\n {data_item1.__kernel_vectorstoremodel_definition__}", end="\n\n")
print(f"Pydantic:\n {data_item2.__kernel_vectorstoremodel_definition__}", end="\n\n")
print(f"Pydantic with annotations:\n {data_item3.__kernel_vectorstoremodel_definition__}", end="\n\n")
print(f"Python:\n {data_item4.__kernel_vectorstoremodel_definition__}", end="\n\n")
print(f"Definition for use with Pandas:\n {data_model_definition_pandas}", end="\n\n")
if (
data_item1.__kernel_vectorstoremodel_definition__.fields
== data_item2.__kernel_vectorstoremodel_definition__.fields
== data_item3.__kernel_vectorstoremodel_definition__.fields
== data_item4.__kernel_vectorstoremodel_definition__.fields
== data_model_definition_pandas.fields
):
print("All data models are the same")
else:
print("Data models are not the same")