@@ -1,199 +1,68 @@
-"""Wrapper around in-memory DocArray store."""
+"""Wrapper around in-memory storage."""
 from __future__ import annotations
 
-from typing import List, Optional, Any, Type
-
-from docarray.typing import NdArray
+from typing import List, Optional, Type
 
 from langchain.embeddings.base import Embeddings
-from langchain.schema import Document
 from langchain.vectorstores.base import VST
-from langchain.vectorstores.utils import maximal_marginal_relevance
-from langchain.vectorstores.vector_store_from_doc_index import _check_docarray_import, VecStoreFromDocIndex
+from langchain.vectorstores.vector_store_from_doc_index import (
+    VecStoreFromDocIndex,
+    _check_docarray_import,
+)
 
 
 class InMemory(VecStoreFromDocIndex):
     """Wrapper around in-memory storage.
 
     To use it, you should have the ``docarray`` package with version >=0.31.0 installed.
+    You can install it with `pip install "langchain[in_memory_store]"`.
     """
+
     def __init__(
         self,
-        texts: List[str],
         embedding: Embeddings,
-        metadatas: Optional[List[dict]] = None,
-        metric: str = 'cosine_sim',
+        metric: str = "cosine_sim",
     ) -> None:
         """Initialize in-memory store.
 
         Args:
-            texts (List[str]): Text data.
             embedding (Embeddings): Embedding function.
-            metadatas (Optional[List[dict]]): Metadata for each text if it exists.
-                Defaults to None.
             metric (str): metric for exact nearest-neighbor search.
-                Can be one of: 'cosine_sim', 'euclidean_dist' and 'sqeuclidean_dist'.
-                Defaults to 'cosine_sim'.
-
+                Can be one of: "cosine_sim", "euclidean_dist" and "sqeuclidean_dist".
+                Defaults to "cosine_sim".
         """
         _check_docarray_import()
-        from docarray.index import InMemoryDocIndex
-
-        doc_cls = self._get_doc_cls(metric)
-        doc_index = InMemoryDocIndex[doc_cls]()
-        super().__init__(doc_index, texts, embedding, metadatas)
-
-    @staticmethod
-    def _get_doc_cls(sim_metric: str):
-        from docarray import BaseDoc
-        from pydantic import Field
-
-        class DocArrayDoc(BaseDoc):
-            text: Optional[str]
-            embedding: Optional[NdArray] = Field(space=sim_metric)
-            metadata: Optional[dict]
+        from docarray.index import InMemoryExactNNIndex
 
-        return DocArrayDoc
+        doc_cls = self._get_doc_cls({"space": metric})
+        doc_index = InMemoryExactNNIndex[doc_cls]()
+        super().__init__(doc_index, embedding)
 
     @classmethod
     def from_texts(
         cls: Type[VST],
         texts: List[str],
        embedding: Embeddings,
         metadatas: Optional[List[dict]] = None,
-        metric: str = 'cosine_sim',
-        **kwargs: Any
+        metric: str = "cosine_sim",
     ) -> InMemory:
-        return cls(
-            texts=texts,
-            embedding=embedding,
-            metadatas=metadatas,
-            metric=metric,
-        )
-    #
-    # def add_texts(
-    #     self,
-    #     texts: Iterable[str],
-    #     metadatas: Optional[List[dict]] = None,
-    #     **kwargs: Any
-    # ) -> List[str]:
-    #     """Run more texts through the embeddings and add to the vectorstore.
-    #
-    #     Args:
-    #         texts: Iterable of strings to add to the vectorstore.
-    #         metadatas: Optional list of metadatas associated with the texts.
-    #
-    #     Returns:
-    #         List of ids from adding the texts into the vectorstore.
-    #     """
-    #     if metadatas is None:
-    #         metadatas = [{} for _ in range(len(list(texts)))]
-    #
-    #     ids = []
-    #     embeddings = self.embedding.embed_documents(texts)
-    #     for t, m, e in zip(texts, metadatas, embeddings):
-    #         doc = self.doc_cls(
-    #             text=t,
-    #             embedding=e,
-    #             metadata=m
-    #         )
-    #         self.docs.append(doc)
-    #         ids.append(doc.id)  # TODO return index of self.docs ?
-    #
-    #     return ids
-    #
-    # def similarity_search_with_score(
-    #     self, query: str, k: int = 4, **kwargs: Any
-    # ) -> List[Tuple[Document, float]]:
-    #     """Return docs most similar to query.
-    #
-    #     Args:
-    #         query: Text to look up documents similar to.
-    #         k: Number of Documents to return. Defaults to 4.
-    #
-    #     Returns:
-    #         List of Documents most similar to the query and score for each.
-    #     """
-    #     from docarray.utils.find import find  # TODO move import
-    #
-    #     query_embedding = self.embedding.embed_query(query)
-    #     query_doc = self.doc_cls(embedding=query_embedding)
-    #     docs, scores = find(index=self.docs, query=query_doc, limit=k, search_field='embedding')
-    #
-    #     result = [(Document(page_content=doc.text), score) for doc, score in zip(docs, scores)]
-    #     return result
-    #
-    # def similarity_search(
-    #     self, query: str, k: int = 4, **kwargs: Any
-    # ) -> List[Document]:
-    #     """Return docs most similar to query.
-    #
-    #     Args:
-    #         query: Text to look up documents similar to.
-    #         k: Number of Documents to return. Defaults to 4.
-    #
-    #     Returns:
-    #         List of Documents most similar to the query.
-    #     """
-    #     results = self.similarity_search_with_score(query, k)
-    #     return list(map(itemgetter(0), results))
-    #
-    # def _similarity_search_with_relevance_scores(
-    #     self,
-    #     query: str,
-    #     k: int = 4,
-    #     **kwargs: Any,
-    # ) -> List[Tuple[Document, float]]:
-    #     """Return docs and relevance scores, normalized on a scale from 0 to 1.
-    #
-    #     0 is dissimilar, 1 is most similar.
-    #     """
-    #     raise NotImplementedError
-    #
-    # def similarity_search_by_vector(self, embedding: List[float], k: int = 4, **kwargs: Any) -> List[Document]:
-    #     """Return docs most similar to embedding vector.
-    #
-    #     Args:
-    #         embedding: Embedding to look up documents similar to.
-    #         k: Number of Documents to return. Defaults to 4.
-    #
-    #     Returns:
-    #         List of Documents most similar to the query vector.
-    #     """
-    #     from docarray.utils.find import find
-    #
-    #     query_doc = self.doc_cls(embedding=embedding)
-    #     result_docs = find(index=self.docs, query=query_doc, limit=k, search_field='embedding').documents
-    #
-    #     result = [Document(page_content=doc.text) for doc in result_docs]
-    #     return result
-
-    def max_marginal_relevance_search(
-        self, query: str, k: int = 4, fetch_k: int = 20, **kwargs: Any
-    ) -> List[Document]:
-        """Return docs selected using the maximal marginal relevance.
-
-        Maximal marginal relevance optimizes for similarity to query AND diversity
-        among selected documents.
+        """Create an in-memory store and insert data.
 
         Args:
-            query: Text to look up documents similar to.
-            k: Number of Documents to return. Defaults to 4.
-            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
+            texts (List[str]): Text data.
+            embedding (Embeddings): Embedding function.
+            metadatas (Optional[List[dict]]): Metadata for each text if it exists.
+                Defaults to None.
+            metric (str): metric for exact nearest-neighbor search.
+                Can be one of: "cosine_sim", "euclidean_dist" and "sqeuclidean_dist".
+                Defaults to "cosine_sim".
 
         Returns:
-            List of Documents selected by maximal marginal relevance.
-        """
-        from docarray.utils.find import find
-
-        query_embedding = self.embedding.embed_query(query)
-        query_doc = self.doc_cls(embedding=query_embedding)
-        find_res = find(self.docs, query_doc, limit=k)
-
-        embeddings = [emb for emb in find_res.documents.emb]
-        mmr_selected = maximal_marginal_relevance(query_embedding, embeddings, k=k)
-        results = []
-        for idx in mmr_selected:
-            results.append(Document(page_content=self.docs[idx].text))
-        return results
-
+            InMemory Vector Store
+        """
+        store = cls(
+            embedding=embedding,
+            metric=metric,
+        )
+        store.add_texts(texts=texts, metadatas=metadatas)
+        return store
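
Note on the schema change: the local `_get_doc_cls(metric)` helper is removed, and `__init__` now calls `self._get_doc_cls({"space": metric})`, so a shared helper that accepts a per-field config dict is presumably provided by `VecStoreFromDocIndex` (that helper is not part of this diff, so its exact signature is an assumption). Based on the removed code, the document schema it needs to produce looks roughly like this sketch:

    from typing import Optional

    from docarray import BaseDoc
    from docarray.typing import NdArray
    from pydantic import Field


    def _get_doc_cls_sketch(embedding_config: dict):
        """Hypothetical stand-in for the shared _get_doc_cls helper."""

        class DocArrayDoc(BaseDoc):
            text: Optional[str]
            # Field(**embedding_config) forwards e.g. {"space": "cosine_sim"},
            # which docarray indexes read to pick the distance metric for this column.
            embedding: Optional[NdArray] = Field(**embedding_config)
            metadata: Optional[dict]

        return DocArrayDoc

Passing the config as a dict rather than a bare metric string is presumably what lets the same helper serve index backends with different per-field options.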
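A minimal usage sketch of the new construction flow: `from_texts` builds an empty store and then calls `add_texts`. The import path, `add_texts`, and `similarity_search` are assumed to come from the surrounding package and the `VecStoreFromDocIndex` base class (neither is shown in this diff), and the stub embeddings are purely illustrative:

    from typing import List

    from langchain.embeddings.base import Embeddings

    # Hypothetical import path; the file's location is not visible in this diff.
    from langchain.vectorstores.in_memory import InMemory


    class StubEmbeddings(Embeddings):
        """Toy deterministic embeddings standing in for a real model."""

        def embed_documents(self, texts: List[str]) -> List[List[float]]:
            return [[float(len(t)), 1.0] for t in texts]

        def embed_query(self, text: str) -> List[float]:
            return [float(len(text)), 1.0]


    store = InMemory.from_texts(
        texts=["foo", "bar baz"],
        embedding=StubEmbeddings(),
        metadatas=[{"source": "a"}, {"source": "b"}],
        metric="cosine_sim",
    )
    # Assumed to be inherited from the base class / standard vector store interface.
    store.add_texts(["another document"])
    docs = store.similarity_search("foo", k=1)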