@@ -1,199 +1,68 @@
-"""Wrapper around in-memory DocArray store."""
+"""Wrapper around in-memory storage."""
 from __future__ import annotations
 
-from typing import List, Optional, Any, Type
-
-from docarray.typing import NdArray
+from typing import List, Optional, Type
 
 from langchain.embeddings.base import Embeddings
-from langchain.schema import Document
 from langchain.vectorstores.base import VST
-from langchain.vectorstores.utils import maximal_marginal_relevance
-from langchain.vectorstores.vector_store_from_doc_index import _check_docarray_import, VecStoreFromDocIndex
+from langchain.vectorstores.vector_store_from_doc_index import (
+    VecStoreFromDocIndex,
+    _check_docarray_import,
+)
 
 
 class InMemory(VecStoreFromDocIndex):
     """Wrapper around in-memory storage.
 
     To use it, you should have the ``docarray`` package with version >=0.31.0 installed.
+    You can install it with `pip install "langchain[in_memory_store]"`.
     """
+
     def __init__(
         self,
-        texts: List[str],
         embedding: Embeddings,
-        metadatas: Optional[List[dict]] = None,
-        metric: str = 'cosine_sim',
+        metric: str = "cosine_sim",
     ) -> None:
         """Initialize in-memory store.
 
         Args:
-            texts (List[str]): Text data.
             embedding (Embeddings): Embedding function.
-            metadatas (Optional[List[dict]]): Metadata for each text if it exists.
-                Defaults to None.
             metric (str): metric for exact nearest-neighbor search.
-                Can be one of: 'cosine_sim', 'euclidean_dist' and 'sqeuclidean_dist'.
-                Defaults to 'cosine_sim'.
-
+                Can be one of: "cosine_sim", "euclidean_dist" and "sqeuclidean_dist".
+                Defaults to "cosine_sim".
         """
         _check_docarray_import()
-        from docarray.index import InMemoryDocIndex
-
-        doc_cls = self._get_doc_cls(metric)
-        doc_index = InMemoryDocIndex[doc_cls]()
-        super().__init__(doc_index, texts, embedding, metadatas)
-
-    @staticmethod
-    def _get_doc_cls(sim_metric: str):
-        from docarray import BaseDoc
-        from pydantic import Field
-
-        class DocArrayDoc(BaseDoc):
-            text: Optional[str]
-            embedding: Optional[NdArray] = Field(space=sim_metric)
-            metadata: Optional[dict]
+        from docarray.index import InMemoryExactNNIndex
 
-        return DocArrayDoc
+        doc_cls = self._get_doc_cls({"space": metric})
+        doc_index = InMemoryExactNNIndex[doc_cls]()
+        super().__init__(doc_index, embedding)
 
     @classmethod
     def from_texts(
         cls: Type[VST],
         texts: List[str],
        embedding: Embeddings,
         metadatas: Optional[List[dict]] = None,
-        metric: str = 'cosine_sim',
-        **kwargs: Any
+        metric: str = "cosine_sim",
     ) -> InMemory:
-        return cls(
-            texts=texts,
-            embedding=embedding,
-            metadatas=metadatas,
-            metric=metric,
-        )
-    #
-    # def add_texts(
-    #     self,
-    #     texts: Iterable[str],
-    #     metadatas: Optional[List[dict]] = None,
-    #     **kwargs: Any
-    # ) -> List[str]:
-    #     """Run more texts through the embeddings and add to the vectorstore.
-    #
-    #     Args:
-    #         texts: Iterable of strings to add to the vectorstore.
-    #         metadatas: Optional list of metadatas associated with the texts.
-    #
-    #     Returns:
-    #         List of ids from adding the texts into the vectorstore.
-    #     """
-    #     if metadatas is None:
-    #         metadatas = [{} for _ in range(len(list(texts)))]
-    #
-    #     ids = []
-    #     embeddings = self.embedding.embed_documents(texts)
-    #     for t, m, e in zip(texts, metadatas, embeddings):
-    #         doc = self.doc_cls(
-    #             text=t,
-    #             embedding=e,
-    #             metadata=m
-    #         )
-    #         self.docs.append(doc)
-    #         ids.append(doc.id)  # TODO return index of self.docs ?
-    #
-    #     return ids
-    #
-    # def similarity_search_with_score(
-    #     self, query: str, k: int = 4, **kwargs: Any
-    # ) -> List[Tuple[Document, float]]:
-    #     """Return docs most similar to query.
-    #
-    #     Args:
-    #         query: Text to look up documents similar to.
-    #         k: Number of Documents to return. Defaults to 4.
-    #
-    #     Returns:
-    #         List of Documents most similar to the query and score for each.
-    #     """
-    #     from docarray.utils.find import find  # TODO move import
-    #
-    #     query_embedding = self.embedding.embed_query(query)
-    #     query_doc = self.doc_cls(embedding=query_embedding)
-    #     docs, scores = find(index=self.docs, query=query_doc, limit=k, search_field='embedding')
-    #
-    #     result = [(Document(page_content=doc.text), score) for doc, score in zip(docs, scores)]
-    #     return result
-    #
-    # def similarity_search(
-    #     self, query: str, k: int = 4, **kwargs: Any
-    # ) -> List[Document]:
-    #     """Return docs most similar to query.
-    #
-    #     Args:
-    #         query: Text to look up documents similar to.
-    #         k: Number of Documents to return. Defaults to 4.
-    #
-    #     Returns:
-    #         List of Documents most similar to the query.
-    #     """
-    #     results = self.similarity_search_with_score(query, k)
-    #     return list(map(itemgetter(0), results))
-    #
-    # def _similarity_search_with_relevance_scores(
-    #     self,
-    #     query: str,
-    #     k: int = 4,
-    #     **kwargs: Any,
-    # ) -> List[Tuple[Document, float]]:
-    #     """Return docs and relevance scores, normalized on a scale from 0 to 1.
-    #
-    #     0 is dissimilar, 1 is most similar.
-    #     """
-    #     raise NotImplementedError
-    #
-    # def similarity_search_by_vector(self, embedding: List[float], k: int = 4, **kwargs: Any) -> List[Document]:
-    #     """Return docs most similar to embedding vector.
-    #
-    #     Args:
-    #         embedding: Embedding to look up documents similar to.
-    #         k: Number of Documents to return. Defaults to 4.
-    #
-    #     Returns:
-    #         List of Documents most similar to the query vector.
-    #     """
-    #     from docarray.utils.find import find
-    #
-    #     query_doc = self.doc_cls(embedding=embedding)
-    #     result_docs = find(index=self.docs, query=query_doc, limit=k, search_field='embedding').documents
-    #
-    #     result = [Document(page_content=doc.text) for doc in result_docs]
-    #     return result
-
-    def max_marginal_relevance_search(
-        self, query: str, k: int = 4, fetch_k: int = 20, **kwargs: Any
-    ) -> List[Document]:
-        """Return docs selected using the maximal marginal relevance.
-
-        Maximal marginal relevance optimizes for similarity to query AND diversity
-        among selected documents.
+        """Create an in-memory store and insert data.
 
         Args:
-            query: Text to look up documents similar to.
-            k: Number of Documents to return. Defaults to 4.
-            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
+            texts (List[str]): Text data.
+            embedding (Embeddings): Embedding function.
+            metadatas (Optional[List[dict]]): Metadata for each text if it exists.
+                Defaults to None.
+            metric (str): metric for exact nearest-neighbor search.
+                Can be one of: "cosine_sim", "euclidean_dist" and "sqeuclidean_dist".
+                Defaults to "cosine_sim".
 
         Returns:
-            List of Documents selected by maximal marginal relevance.
-        """
-        from docarray.utils.find import find
-
-        query_embedding = self.embedding.embed_query(query)
-        query_doc = self.doc_cls(embedding=query_embedding)
-        find_res = find(self.docs, query_doc, limit=k)
-
-        embeddings = [emb for emb in find_res.documents.emb]
-        mmr_selected = maximal_marginal_relevance(query_embedding, embeddings, k=k)
-        results = []
-        for idx in mmr_selected:
-            results.append(Document(page_content=self.docs[idx].text))
-        return results
-
+            InMemory Vector Store
+        """
+        store = cls(
+            embedding=embedding,
+            metric=metric,
+        )
+        store.add_texts(texts=texts, metadatas=metadatas)
+        return store
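
Note on the schema change: the local `_get_doc_cls(metric)` helper is removed, and `__init__` now calls `self._get_doc_cls({"space": metric})`, so a shared helper that accepts a per-field config dict is presumably provided by `VecStoreFromDocIndex` (that helper is not part of this diff, so its exact signature is an assumption). Based on the removed code, the document schema it needs to produce looks roughly like this sketch:

    from typing import Optional

    from docarray import BaseDoc
    from docarray.typing import NdArray
    from pydantic import Field


    def _get_doc_cls_sketch(embedding_config: dict):
        """Hypothetical stand-in for the shared _get_doc_cls helper."""

        class DocArrayDoc(BaseDoc):
            text: Optional[str]
            # Field(**embedding_config) forwards e.g. {"space": "cosine_sim"},
            # which docarray indexes read to pick the distance metric for this column.
            embedding: Optional[NdArray] = Field(**embedding_config)
            metadata: Optional[dict]

        return DocArrayDoc

Passing the config as a dict rather than a bare metric string is presumably what lets the same helper serve index backends with different per-field options.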
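A minimal usage sketch of the new construction flow: `from_texts` builds an empty store and then calls `add_texts`. The import path, `add_texts`, and `similarity_search` are assumed to come from the surrounding package and the `VecStoreFromDocIndex` base class (neither is shown in this diff), and the stub embeddings are purely illustrative:

    from typing import List

    from langchain.embeddings.base import Embeddings

    # Hypothetical import path; the file's location is not visible in this diff.
    from langchain.vectorstores.in_memory import InMemory


    class StubEmbeddings(Embeddings):
        """Toy deterministic embeddings standing in for a real model."""

        def embed_documents(self, texts: List[str]) -> List[List[float]]:
            return [[float(len(t)), 1.0] for t in texts]

        def embed_query(self, text: str) -> List[float]:
            return [float(len(text)), 1.0]


    store = InMemory.from_texts(
        texts=["foo", "bar baz"],
        embedding=StubEmbeddings(),
        metadatas=[{"source": "a"}, {"source": "b"}],
        metric="cosine_sim",
    )
    # Assumed to be inherited from the base class / standard vector store interface.
    store.add_texts(["another document"])
    docs = store.similarity_search("foo", k=1)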