Skip to content

Commit de262f9

Browse files
fix: clean up and add dependencies
Signed-off-by: anna-charlotte <[email protected]>
1 parent b687fd4 commit de262f9

File tree

8 files changed

+186
-279
lines changed

8 files changed

+186
-279
lines changed

langchain/vectorstores/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
from langchain.vectorstores.deeplake import DeepLake
88
from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch
99
from langchain.vectorstores.faiss import FAISS
10+
from langchain.vectorstores.hnsw_lib import HnswLib
11+
from langchain.vectorstores.in_memory import InMemory
1012
from langchain.vectorstores.milvus import Milvus
1113
from langchain.vectorstores.myscale import MyScale, MyScaleSettings
1214
from langchain.vectorstores.opensearch_vector_search import OpenSearchVectorSearch
@@ -34,4 +36,6 @@
3436
"MyScaleSettings",
3537
"SupabaseVectorStore",
3638
"AnalyticDB",
39+
"HnswLib",
40+
"InMemory",
3741
]

langchain/vectorstores/hnsw_lib.py

Lines changed: 36 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,40 +1,38 @@
1-
"""Wrapper around in-memory DocArray store."""
1+
"""Wrapper around HnswLib store."""
22
from __future__ import annotations
33

4-
from typing import List, Optional, Any, Tuple, Iterable, Type, Callable, Sequence, TYPE_CHECKING
5-
from docarray.typing import NdArray
4+
from typing import List, Optional, Type
65

76
from langchain.embeddings.base import Embeddings
87
from langchain.vectorstores.base import VST
9-
from langchain.vectorstores.vector_store_from_doc_index import VecStoreFromDocIndex, _check_docarray_import
8+
from langchain.vectorstores.vector_store_from_doc_index import (
9+
VecStoreFromDocIndex,
10+
_check_docarray_import,
11+
)
1012

1113

1214
class HnswLib(VecStoreFromDocIndex):
1315
"""Wrapper around HnswLib storage.
1416
15-
To use it, you should have the ``docarray`` package with version >=0.31.0 installed.
17+
To use it, you should have the ``docarray[hnswlib]`` package with version >=0.31.0 installed.
18+
You can install it with `pip install "langchain[hnswlib]"`.
1619
"""
20+
1721
def __init__(
1822
self,
19-
texts: List[str],
2023
embedding: Embeddings,
2124
work_dir: str,
2225
n_dim: int,
23-
metadatas: Optional[List[dict]],
24-
dist_metric: str = 'cosine',
25-
**kwargs,
26+
dist_metric: str = "cosine",
2627
) -> None:
2728
"""Initialize HnswLib store.
2829
2930
Args:
30-
texts (List[str]): Text data.
3131
embedding (Embeddings): Embedding function.
32-
metadatas (Optional[List[dict]]): Metadata for each text if it exists.
33-
Defaults to None.
3432
work_dir (str): path to the location where all the data will be stored.
3533
n_dim (int): dimension of an embedding.
36-
dist_metric (str): Distance metric for HnswLib can be one of: 'cosine',
37-
'ip', and 'l2'. Defaults to 'cosine'.
34+
dist_metric (str): Distance metric for HnswLib can be one of: "cosine",
35+
"ip", and "l2". Defaults to "cosine".
3836
"""
3937
_check_docarray_import()
4038
from docarray.index import HnswDocumentIndex
@@ -43,25 +41,13 @@ def __init__(
4341
import google.protobuf
4442
except ImportError:
4543
raise ImportError(
46-
"Could not import protobuf python package. "
47-
"Please install it with `pip install -U protobuf`."
44+
"Could not import all required packages. "
45+
"Please install it with `pip install \"langchain[hnswlib]\"`."
4846
)
4947

50-
doc_cls = self._get_doc_cls(n_dim, dist_metric)
48+
doc_cls = self._get_doc_cls({"dim": n_dim, "space": dist_metric})
5149
doc_index = HnswDocumentIndex[doc_cls](work_dir=work_dir)
52-
super().__init__(doc_index, texts, embedding, metadatas)
53-
54-
@staticmethod
55-
def _get_doc_cls(n_dim: int, sim_metric: str):
56-
from docarray import BaseDoc
57-
from pydantic import Field
58-
59-
class DocArrayDoc(BaseDoc):
60-
text: Optional[str]
61-
embedding: Optional[NdArray] = Field(dim=n_dim, space=sim_metric)
62-
metadata: Optional[dict]
63-
64-
return DocArrayDoc
50+
super().__init__(doc_index, embedding)
6551

6652
@classmethod
6753
def from_texts(
@@ -71,21 +57,33 @@ def from_texts(
7157
metadatas: Optional[List[dict]] = None,
7258
work_dir: str = None,
7359
n_dim: int = None,
74-
dist_metric: str = 'cosine',
75-
**kwargs: Any
60+
dist_metric: str = "cosine",
7661
) -> HnswLib:
62+
"""Create an HnswLib store and insert data.
7763
64+
Args:
65+
texts (List[str]): Text data.
66+
embedding (Embeddings): Embedding function.
67+
metadatas (Optional[List[dict]]): Metadata for each text if it exists.
68+
Defaults to None.
69+
work_dir (str): path to the location where all the data will be stored.
70+
n_dim (int): dimension of an embedding.
71+
dist_metric (str): Distance metric for HnswLib can be one of: "cosine",
72+
"ip", and "l2". Defaults to "cosine".
73+
74+
Returns:
75+
HnswLib Vector Store
76+
"""
7877
if work_dir is None:
79-
raise ValueError('`work_dir` parameter hs not been set.')
78+
raise ValueError("`work_dir` parameter has not been set.")
8079
if n_dim is None:
81-
raise ValueError('`n_dim` parameter has not been set.')
80+
raise ValueError("`n_dim` parameter has not been set.")
8281

83-
return cls(
82+
store = cls(
8483
work_dir=work_dir,
8584
n_dim=n_dim,
86-
texts=texts,
8785
embedding=embedding,
88-
metadatas=metadatas,
8986
dist_metric=dist_metric,
90-
kwargs=kwargs,
9187
)
88+
store.add_texts(texts=texts, metadatas=metadatas)
89+
return store

langchain/vectorstores/in_memory.py

Lines changed: 32 additions & 163 deletions
Original file line numberDiff line numberDiff line change
@@ -1,199 +1,68 @@
1-
"""Wrapper around in-memory DocArray store."""
1+
"""Wrapper around in-memory storage."""
22
from __future__ import annotations
33

4-
from typing import List, Optional, Any, Type
5-
6-
from docarray.typing import NdArray
4+
from typing import List, Optional, Type
75

86
from langchain.embeddings.base import Embeddings
9-
from langchain.schema import Document
107
from langchain.vectorstores.base import VST
11-
from langchain.vectorstores.utils import maximal_marginal_relevance
12-
from langchain.vectorstores.vector_store_from_doc_index import _check_docarray_import, VecStoreFromDocIndex
8+
from langchain.vectorstores.vector_store_from_doc_index import (
9+
VecStoreFromDocIndex,
10+
_check_docarray_import,
11+
)
1312

1413

1514
class InMemory(VecStoreFromDocIndex):
1615
"""Wrapper around in-memory storage.
1716
1817
To use it, you should have the ``docarray`` package with version >=0.31.0 installed.
18+
You can install it with `pip install "langchain[in_memory_store]"`.
1919
"""
20+
2021
def __init__(
2122
self,
22-
texts: List[str],
2323
embedding: Embeddings,
24-
metadatas: Optional[List[dict]] = None,
25-
metric: str = 'cosine_sim',
24+
metric: str = "cosine_sim",
2625
) -> None:
2726
"""Initialize in-memory store.
2827
2928
Args:
30-
texts (List[str]): Text data.
3129
embedding (Embeddings): Embedding function.
32-
metadatas (Optional[List[dict]]): Metadata for each text if it exists.
33-
Defaults to None.
3430
metric (str): metric for exact nearest-neighbor search.
35-
Can be one of: 'cosine_sim', 'euclidean_dist' and 'sqeuclidean_dist'.
36-
Defaults to 'cosine_sim'.
37-
31+
Can be one of: "cosine_sim", "euclidean_dist" and "sqeuclidean_dist".
32+
Defaults to "cosine_sim".
3833
"""
3934
_check_docarray_import()
40-
from docarray.index import InMemoryDocIndex
41-
42-
doc_cls = self._get_doc_cls(metric)
43-
doc_index = InMemoryDocIndex[doc_cls]()
44-
super().__init__(doc_index, texts, embedding, metadatas)
45-
46-
@staticmethod
47-
def _get_doc_cls(sim_metric: str):
48-
from docarray import BaseDoc
49-
from pydantic import Field
50-
51-
class DocArrayDoc(BaseDoc):
52-
text: Optional[str]
53-
embedding: Optional[NdArray] = Field(space=sim_metric)
54-
metadata: Optional[dict]
35+
from docarray.index import InMemoryExactNNIndex
5536

56-
return DocArrayDoc
37+
doc_cls = self._get_doc_cls({"space": metric})
38+
doc_index = InMemoryExactNNIndex[doc_cls]()
39+
super().__init__(doc_index, embedding)
5740

5841
@classmethod
5942
def from_texts(
6043
cls: Type[VST],
6144
texts: List[str],
6245
embedding: Embeddings,
6346
metadatas: Optional[List[dict]] = None,
64-
metric: str = 'cosine_sim',
65-
**kwargs: Any
47+
metric: str = "cosine_sim",
6648
) -> InMemory:
67-
return cls(
68-
texts=texts,
69-
embedding=embedding,
70-
metadatas=metadatas,
71-
metric=metric,
72-
)
73-
#
74-
# def add_texts(
75-
# self,
76-
# texts: Iterable[str],
77-
# metadatas: Optional[List[dict]] = None,
78-
# **kwargs: Any
79-
# ) -> List[str]:
80-
# """Run more texts through the embeddings and add to the vectorstore.
81-
#
82-
# Args:
83-
# texts: Iterable of strings to add to the vectorstore.
84-
# metadatas: Optional list of metadatas associated with the texts.
85-
#
86-
# Returns:
87-
# List of ids from adding the texts into the vectorstore.
88-
# """
89-
# if metadatas is None:
90-
# metadatas = [{} for _ in range(len(list(texts)))]
91-
#
92-
# ids = []
93-
# embeddings = self.embedding.embed_documents(texts)
94-
# for t, m, e in zip(texts, metadatas, embeddings):
95-
# doc = self.doc_cls(
96-
# text=t,
97-
# embedding=e,
98-
# metadata=m
99-
# )
100-
# self.docs.append(doc)
101-
# ids.append(doc.id) # TODO return index of self.docs ?
102-
#
103-
# return ids
104-
#
105-
# def similarity_search_with_score(
106-
# self, query: str, k: int = 4, **kwargs: Any
107-
# ) -> List[Tuple[Document, float]]:
108-
# """Return docs most similar to query.
109-
#
110-
# Args:
111-
# query: Text to look up documents similar to.
112-
# k: Number of Documents to return. Defaults to 4.
113-
#
114-
# Returns:
115-
# List of Documents most similar to the query and score for each.
116-
# """
117-
# from docarray.utils.find import find # TODO move import
118-
#
119-
# query_embedding = self.embedding.embed_query(query)
120-
# query_doc = self.doc_cls(embedding=query_embedding)
121-
# docs, scores = find(index=self.docs, query=query_doc, limit=k, search_field='embedding')
122-
#
123-
# result = [(Document(page_content=doc.text), score) for doc, score in zip(docs, scores)]
124-
# return result
125-
#
126-
# def similarity_search(
127-
# self, query: str, k: int = 4, **kwargs: Any
128-
# ) -> List[Document]:
129-
# """Return docs most similar to query.
130-
#
131-
# Args:
132-
# query: Text to look up documents similar to.
133-
# k: Number of Documents to return. Defaults to 4.
134-
#
135-
# Returns:
136-
# List of Documents most similar to the query.
137-
# """
138-
# results = self.similarity_search_with_score(query, k)
139-
# return list(map(itemgetter(0), results))
140-
#
141-
# def _similarity_search_with_relevance_scores(
142-
# self,
143-
# query: str,
144-
# k: int = 4,
145-
# **kwargs: Any,
146-
# ) -> List[Tuple[Document, float]]:
147-
# """Return docs and relevance scores, normalized on a scale from 0 to 1.
148-
#
149-
# 0 is dissimilar, 1 is most similar.
150-
# """
151-
# raise NotImplementedError
152-
#
153-
# def similarity_search_by_vector(self, embedding: List[float], k: int = 4, **kwargs: Any) -> List[Document]:
154-
# """Return docs most similar to embedding vector.
155-
#
156-
# Args:
157-
# embedding: Embedding to look up documents similar to.
158-
# k: Number of Documents to return. Defaults to 4.
159-
#
160-
# Returns:
161-
# List of Documents most similar to the query vector.
162-
# """
163-
# from docarray.utils.find import find
164-
#
165-
# query_doc = self.doc_cls(embedding=embedding)
166-
# result_docs = find(index=self.docs, query=query_doc, limit=k, search_field='embedding').documents
167-
#
168-
# result = [Document(page_content=doc.text) for doc in result_docs]
169-
# return result
170-
171-
def max_marginal_relevance_search(
172-
self, query: str, k: int = 4, fetch_k: int = 20, **kwargs: Any
173-
) -> List[Document]:
174-
"""Return docs selected using the maximal marginal relevance.
175-
176-
Maximal marginal relevance optimizes for similarity to query AND diversity
177-
among selected documents.
49+
"""Create an in-memory store and insert data.
17850
17951
Args:
180-
query: Text to look up documents similar to.
181-
k: Number of Documents to return. Defaults to 4.
182-
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
52+
texts (List[str]): Text data.
53+
embedding (Embeddings): Embedding function.
54+
metadatas (Optional[List[dict]]): Metadata for each text if it exists.
55+
Defaults to None.
56+
metric (str): metric for exact nearest-neighbor search.
57+
Can be one of: "cosine_sim", "euclidean_dist" and "sqeuclidean_dist".
58+
Defaults to "cosine_sim".
18359
18460
Returns:
185-
List of Documents selected by maximal marginal relevance.
186-
"""
187-
from docarray.utils.find import find
188-
189-
query_embedding = self.embedding.embed_query(query)
190-
query_doc = self.doc_cls(embedding=query_embedding)
191-
find_res = find(self.docs, query_doc, limit=k)
192-
193-
embeddings = [emb for emb in find_res.documents.emb]
194-
mmr_selected = maximal_marginal_relevance(query_embedding, embeddings, k=k)
195-
results = []
196-
for idx in mmr_selected:
197-
results.append(Document(page_content=self.docs[idx].text))
198-
return results
199-
61+
InMemory Vector Store
62+
"""
63+
store = cls(
64+
embedding=embedding,
65+
metric=metric,
66+
)
67+
store.add_texts(texts=texts, metadatas=metadatas)
68+
return store

0 commit comments

Comments
 (0)