
Commit 2c00233

Authored by davidbuniat (Davit Buniatyan)
Deep Lake mini upgrades (langchain-ai#3375)
Improvements
* set default num_workers for ingestion to 0
* upgraded notebooks to avoid dataset-creation ambiguity
* added `force_delete_dataset_by_path`
* bumped deeplake to 3.3.0
* pass the creds arg through to the deeplake object, allowing custom S3 credentials

Notes
* please double-check that poetry is not messed up (thanks!)

Asks
* it would be great to create a shared Slack channel for quick questions

---------

Co-authored-by: Davit Buniatyan <[email protected]>
1 parent 93d53e4 commit 2c00233
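The creds passthrough mentioned in the commit message is wired up in `__init__` below but never demonstrated, so here is a minimal sketch of how it might be used. The bucket path, environment-variable names, and endpoint URL are placeholders, and the creds dict keys are assumed to follow deeplake's usual S3 convention:

import os
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake

embeddings = OpenAIEmbeddings()

# extra kwargs are forwarded to deeplake, so `creds` reaches
# deeplake.exists() / deeplake.load() / deeplake.empty()
db = DeepLake(
    dataset_path="s3://my-bucket/langchain-vectors",  # hypothetical bucket
    embedding_function=embeddings,
    creds={
        "aws_access_key_id": os.environ["AWS_ACCESS_KEY_ID"],
        "aws_secret_access_key": os.environ["AWS_SECRET_ACCESS_KEY"],
        "endpoint_url": "https://s3.example.com",  # custom S3-compatible endpoint
    },
)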

File tree

7 files changed: +475 −164 lines

docs/modules/indexes/vectorstores/examples/deeplake.ipynb

Lines changed: 398 additions & 86 deletions
Large diffs are not rendered by default.

docs/use_cases/code/twitter-the-algorithm-analysis-deeplake.ipynb

Lines changed: 25 additions & 59 deletions
@@ -40,8 +40,24 @@
 "from langchain.vectorstores import DeepLake\n",
 "\n",
 "os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')\n",
-"os.environ['ACTIVELOOP_TOKEN'] = getpass.getpass('Activeloop Token:')\n",
-"embeddings = OpenAIEmbeddings()"
+"os.environ['ACTIVELOOP_TOKEN'] = getpass.getpass('Activeloop Token:')"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"embeddings = OpenAIEmbeddings(disallowed_special=())"
+]
+},
+{
+"attachments": {},
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"disallowed_special=() is required to avoid `Exception: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte` from tiktoken for some repositories"
 ]
 },
 {
@@ -120,7 +136,9 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"db = DeepLake.from_documents(texts, embeddings, dataset_path=\"hub://davitbun/twitter-algorithm\")"
+"username = \"davitbun\" # replace with your username from app.activeloop.ai\n",
+"db = DeepLake(dataset_path=f\"hub://{username}/twitter-algorithm\", embedding_function=embeddings, public=True) #dataset would be publicly available\n",
+"db.add_documents(texts)"
 ]
 },
 {
@@ -133,61 +151,9 @@
 },
 {
 "cell_type": "code",
-"execution_count": 7,
+"execution_count": null,
 "metadata": {},
-"outputs": [
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"-"
-]
-},
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/davitbun/twitter-algorithm\n",
-"\n"
-]
-},
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"-"
-]
-},
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"hub://davitbun/twitter-algorithm loaded successfully.\n",
-"\n"
-]
-},
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"Deep Lake Dataset in hub://davitbun/twitter-algorithm already exists, loading from the storage\n"
-]
-},
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"Dataset(path='hub://davitbun/twitter-algorithm', read_only=True, tensors=['embedding', 'ids', 'metadata', 'text'])\n",
-"\n",
-"   tensor     htype       shape       dtype  compression\n",
-"  -------    -------     -------    -------   ------- \n",
-" embedding  generic  (23152, 1536)  float32     None \n",
-"    ids      text      (23152, 1)     str       None \n",
-" metadata    json      (23152, 1)     str       None \n",
-"   text      text      (23152, 1)     str       None \n"
-]
-}
-],
+"outputs": [],
 "source": [
 "db = DeepLake(dataset_path=\"hub://davitbun/twitter-algorithm\", read_only=True, embedding_function=embeddings)"
 ]
@@ -203,7 +169,7 @@
 "retriever.search_kwargs['distance_metric'] = 'cos'\n",
 "retriever.search_kwargs['fetch_k'] = 100\n",
 "retriever.search_kwargs['maximal_marginal_relevance'] = True\n",
-"retriever.search_kwargs['k'] = 20"
+"retriever.search_kwargs['k'] = 10"
 ]
 },
 {
@@ -241,7 +207,7 @@
 "from langchain.chat_models import ChatOpenAI\n",
 "from langchain.chains import ConversationalRetrievalChain\n",
 "\n",
-"model = ChatOpenAI(model='gpt-4') # 'gpt-3.5-turbo',\n",
+"model = ChatOpenAI(model='gpt-3.5-turbo') # switch to 'gpt-4'\n",
 "qa = ConversationalRetrievalChain.from_llm(model,retriever=retriever)"
 ]
 },
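Taken together, the notebook's updated indexing flow looks roughly like this (a sketch; `texts` is the list of split documents built earlier in the notebook, and the username is the notebook author's):

import os
import getpass
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake

os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')
os.environ['ACTIVELOOP_TOKEN'] = getpass.getpass('Activeloop Token:')

# disallowed_special=() avoids tiktoken's
# "'utf-8' codec can't decode byte 0xff" error on some repositories
embeddings = OpenAIEmbeddings(disallowed_special=())

username = "davitbun"  # replace with your username from app.activeloop.ai
db = DeepLake(dataset_path=f"hub://{username}/twitter-algorithm",
              embedding_function=embeddings, public=True)
db.add_documents(texts)  # `texts` prepared earlier in the notebook

# later sessions load the existing dataset read-only instead of re-creating it
db = DeepLake(dataset_path=f"hub://{username}/twitter-algorithm",
              read_only=True, embedding_function=embeddings)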

docs/use_cases/question_answering/semantic-search-over-chat.ipynb

Lines changed: 1 addition & 1 deletion
@@ -108,7 +108,7 @@
 "\n",
 "dataset_path = 'hub://'+org+'/data'\n",
 "embeddings = OpenAIEmbeddings()\n",
-"db = DeepLake.from_documents(texts, embeddings, dataset_path=dataset_path)"
+"db = DeepLake.from_documents(texts, embeddings, dataset_path=dataset_path, overwrite=True)"
 ]
 },
 {
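The added `overwrite=True` is what makes this cell safely re-runnable: as the `__init__` changes in the next file show, an `overwrite` kwarg skips the `deeplake.exists` branch and recreates the dataset via `deeplake.empty`. A sketch of the two behaviors, assuming `texts`, `embeddings`, and `dataset_path` from the notebook:

# with overwrite=True the dataset is wiped and rebuilt on every run
db = DeepLake.from_documents(texts, embeddings, dataset_path=dataset_path, overwrite=True)

# without it, an existing dataset is loaded and the same documents are
# appended again, silently duplicating them on each re-run
db = DeepLake.from_documents(texts, embeddings, dataset_path=dataset_path)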

langchain/vectorstores/deeplake.py

Lines changed: 34 additions & 7 deletions
@@ -43,6 +43,9 @@ def vector_search(
     returns:
         nearest_indices: List, indices of nearest neighbors
     """
+    if data_vectors.shape[0] == 0:
+        return [], []
+
     # Calculate the distance between the query_vector and all data_vectors
     distances = distance_metric_map[distance_metric](query_embedding, data_vectors)
     nearest_indices = np.argsort(distances)
@@ -87,7 +90,7 @@ class DeepLake(VectorStore):
             vectorstore = DeepLake("langchain_store", embeddings.embed_query)
     """

-    _LANGCHAIN_DEFAULT_DEEPLAKE_PATH = "mem://langchain"
+    _LANGCHAIN_DEFAULT_DEEPLAKE_PATH = "./deeplake/"

    def __init__(
        self,
@@ -96,7 +99,7 @@ def __init__(
        embedding_function: Optional[Embeddings] = None,
        read_only: Optional[bool] = False,
        ingestion_batch_size: int = 1024,
-        num_workers: int = 4,
+        num_workers: int = 0,
        **kwargs: Any,
    ) -> None:
        """Initialize with Deep Lake client."""
@@ -112,8 +115,13 @@ def __init__(
                "Please install it with `pip install deeplake`."
            )
        self._deeplake = deeplake
+        self.dataset_path = dataset_path
+        creds_args = {"creds": kwargs["creds"]} if "creds" in kwargs else {}

-        if deeplake.exists(dataset_path, token=token):
+        if (
+            deeplake.exists(dataset_path, token=token, **creds_args)
+            and "overwrite" not in kwargs
+        ):
            self.ds = deeplake.load(
                dataset_path, token=token, read_only=read_only, **kwargs
            )
@@ -123,6 +131,9 @@ def __init__(
            )
            self.ds.summary()
        else:
+            if "overwrite" in kwargs:
+                del kwargs["overwrite"]
+
            self.ds = deeplake.empty(
                dataset_path, token=token, overwrite=True, **kwargs
            )
@@ -215,14 +226,18 @@ def ingest(sample_in: list, sample_out: list) -> None:
            )

        batch_size = min(self.ingestion_batch_size, len(elements))
+        if batch_size == 0:
+            return []
+
        batched = [
            elements[i : i + batch_size] for i in range(0, len(elements), batch_size)
        ]

        ingest().eval(
            batched,
            self.ds,
-            num_workers=min(self.num_workers, len(batched) // self.num_workers),
+            num_workers=min(self.num_workers, len(batched) // max(self.num_workers, 1)),
+            **kwargs,
        )
        self.ds.commit(allow_empty=True)
        self.ds.summary()
@@ -443,8 +458,8 @@ def from_texts(
    ) -> DeepLake:
        """Create a Deep Lake dataset from a raw documents.

-        If a dataset_path is specified, the dataset will be persisted there.
-        Otherwise, the data will be ephemeral in-memory.
+        If a dataset_path is specified, the dataset will be persisted in that location,
+        otherwise by default at `./deeplake`

        Args:
            path (str, pathlib.Path): - The full path to the dataset. Can be:
@@ -493,7 +508,7 @@ def delete(
            Defaults to None.
        """
        if delete_all:
-            self.ds.delete()
+            self.ds.delete(large_ok=True)
            return True

        view = None
@@ -515,6 +530,18 @@ def delete(

        return True

+    @classmethod
+    def force_delete_by_path(cls, path: str) -> None:
+        """Force delete dataset by path"""
+        try:
+            import deeplake
+        except ImportError:
+            raise ValueError(
+                "Could not import deeplake python package. "
+                "Please install it with `pip install deeplake`."
+            )
+        deeplake.delete(path, large_ok=True, force=True)
+
    def delete_dataset(self) -> None:
        """Delete the collection."""
        self.delete(delete_all=True)
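The new class method gives callers a way to drop a dataset without first constructing a vector store around it; a quick sketch (the hub path is the notebook's example):

from langchain.vectorstores import DeepLake

# large_ok=True allows deleting big datasets; force=True removes them
# even when they cannot be cleanly loaded
DeepLake.force_delete_by_path("hub://davitbun/twitter-algorithm")

Note that with the new default `num_workers=0`, ingestion runs synchronously in the main process; the `max(self.num_workers, 1)` guard above avoids the division by zero that the old default of 4 never hit.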

poetry.lock

Lines changed: 9 additions & 10 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@ arxiv = {version = "^1.4", optional = true}
 pypdf = {version = "^3.4.0", optional = true}
 networkx = {version="^2.6.3", optional = true}
 aleph-alpha-client = {version="^2.15.0", optional = true}
-deeplake = {version = "^3.2.21", optional = true}
+deeplake = {version = "^3.3.0", optional = true}
 pgvector = {version = "^0.1.6", optional = true}
 psycopg2-binary = {version = "^2.9.5", optional = true}
 #boto3 = {version = "^1.26.96", optional = true} # TODO: fix it, commented because the version failed with deeplake

tests/integration_tests/vectorstores/test_deeplake.py

Lines changed: 7 additions & 0 deletions
@@ -164,3 +164,10 @@ def test_delete_dataset_by_filter(deeplake_datastore: DeepLake) -> None:
     assert len(deeplake_datastore.ds) == 2

     deeplake_datastore.delete_dataset()
+
+
+def test_delete_by_path(deeplake_datastore: DeepLake) -> None:
+    """Test delete dataset."""
+    path = deeplake_datastore.dataset_path
+    DeepLake.force_delete_by_path(path)
+    assert not deeplake.exists(path)
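Assuming the Deep Lake integration-test environment is in place (the `deeplake_datastore` fixture, a local `deeplake` install, and `import deeplake` at the top of the module), the new test could be run on its own with something like:

pytest tests/integration_tests/vectorstores/test_deeplake.py -k test_delete_by_path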
