
Commit 9b63427

Anush008 and NirantK authored
chore: pre-commit formatting (#91)
* chore: formatting
* chore: formatting
* chore: remove other hooks
* Update poetry lock

---------

Co-authored-by: Nirant Kasliwal <[email protected]>

1 parent b01f882 · commit 9b63427

19 files changed: +733 -789 lines changed

.github/workflows/ci.yml

Lines changed: 3 additions & 3 deletions
@@ -1,8 +1,8 @@
-name: ci
+name: ci
 on:
   push:
     branches:
-      - master
+      - master
       - main
 permissions:
   contents: write
@@ -14,7 +14,7 @@ jobs:
       - uses: actions/setup-python@v4
         with:
          python-version: 3.x
-      - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
+      - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
       - uses: actions/cache@v3
         with:
          key: mkdocs-material-${{ env.cache_id }}
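
For context on the step being re-indented above: `cache_id` is the UTC ISO week number, so the mkdocs-material cache key rotates weekly. A minimal Python sketch of the same value (the workflow itself uses `date --utc '+%V'`):

```python
from datetime import datetime, timezone

# ISO week number in UTC, zero-padded -- the same value `date --utc '+%V'`
# appends to $GITHUB_ENV, so the mkdocs-material cache key changes once per week.
cache_id = f"{datetime.now(timezone.utc).isocalendar()[1]:02d}"
print(f"mkdocs-material-{cache_id}")  # e.g. "mkdocs-material-03"
```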

.pre-commit-config.yaml

Lines changed: 8 additions & 11 deletions
@@ -1,12 +1,9 @@
 repos:
-  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v3.2.0
-    hooks:
-      - id: trailing-whitespace
-      - id: end-of-file-fixer
-      - id: check-yaml
-      - id: check-added-large-files
-  - repo: https://github.com/psf/black
-    rev: 23.7.0
-    hooks:
-      - id: black
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.1.13
+    hooks:
+      - id: ruff
+        types_or: [ python, pyi, jupyter ]
+        args: [ --fix ]
+      - id: ruff-format
+        types_or: [ python, pyi, jupyter ]

README.md

Lines changed: 4 additions & 4 deletions
@@ -17,7 +17,7 @@ The default embedding supports "query" and "passage" prefixes for the input text
 
 ## 🚀 Installation
 
-To install the FastEmbed library, pip works:
+To install the FastEmbed library, pip works:
 
 ```bash
 pip install fastembed
@@ -36,8 +36,8 @@ documents: List[str] = [
     "passage: This is an example passage.",
     "fastembed is supported by and maintained by Qdrant." # You can leave out the prefix but it's recommended
 ]
-embedding_model = Embedding(model_name="BAAI/bge-base-en", max_length=512)
-embeddings: List[np.ndarray] = list(embedding_model.embed(documents)) # Note the list() call - this is a generator
+embedding_model = Embedding(model_name="BAAI/bge-base-en", max_length=512)
+embeddings: List[np.ndarray] = list(embedding_model.embed(documents)) # Note the list() call - this is a generator
 ```
 
 ## Usage with Qdrant
@@ -48,7 +48,7 @@ Installation with Qdrant Client in Python:
 pip install qdrant-client[fastembed]
 ```
 
-Might have to use ```pip install 'qdrant-client[fastembed]'``` on zsh.
+Might have to use ```pip install 'qdrant-client[fastembed]'``` on zsh.
 
 ```python
 from qdrant_client import QdrantClient
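
For readers skimming the diff, the README snippet being reformatted assembles to roughly the following. This is a sketch, not the full README example: the `FlagEmbedding as Embedding` import is taken from the Usage_With_Qdrant notebook elsewhere in this commit, and the `documents` list is abbreviated to the lines visible above.

```python
from typing import List

import numpy as np
from fastembed.embedding import FlagEmbedding as Embedding  # import path as used elsewhere in this commit

documents: List[str] = [
    "passage: This is an example passage.",
    "fastembed is supported by and maintained by Qdrant.",  # the prefix can be left out, but it's recommended
]

embedding_model = Embedding(model_name="BAAI/bge-base-en", max_length=512)
# embed() returns a generator, hence the list() call
embeddings: List[np.ndarray] = list(embedding_model.embed(documents))
```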

docs/examples/FastEmbed_vs_HF_Comparison.ipynb

Lines changed: 6 additions & 3 deletions
@@ -20,10 +20,8 @@
 "outputs": [],
 "source": [
 "import time\n",
-"from pathlib import Path\n",
-"from typing import Any, Callable, List, Tuple\n",
+"from typing import Callable, List, Tuple\n",
 "\n",
-"import numpy as np\n",
 "import torch.nn.functional as F\n",
 "from fastembed.embedding import DefaultEmbedding\n",
 "import matplotlib.pyplot as plt\n",
@@ -116,6 +114,7 @@
 "        sentence_embeddings = F.normalize(sentence_embeddings)\n",
 "        return sentence_embeddings\n",
 "\n",
+"\n",
 "hf = HF(model_id=\"BAAI/bge-small-en-v1.5\")\n",
 "hf.embed(documents).shape"
 ]
@@ -165,6 +164,8 @@
 ],
 "source": [
 "import types\n",
+"\n",
+"\n",
 "def calculate_time_stats(embed_func: Callable, documents: list, k: int) -> Tuple[float, float, float]:\n",
 "    times = []\n",
 "    for _ in range(k):\n",
@@ -181,6 +182,7 @@
 "    # Returning mean, max, and min time for the call\n",
 "    return (sum(times) / k, max(times), min(times))\n",
 "\n",
+"\n",
 "hf_stats = calculate_time_stats(hf.embed, documents, k=2)\n",
 "print(f\"Huggingface Transformers (Average, Max, Min): {hf_stats}\")\n",
 "fst_stats = calculate_time_stats(embedding_model.embed, documents, k=2)\n",
@@ -289,6 +291,7 @@
 "    \"\"\"\n",
 "    return F.cosine_similarity(embeddings1, embeddings2).mean().item()\n",
 "\n",
+"\n",
 "calculate_cosine_similarity(hf.embed(documents), Tensor(list(embedding_model.embed(documents))))"
 ]
 },

docs/examples/Supported_Models.ipynb

Lines changed: 12 additions & 4 deletions
@@ -1,5 +1,15 @@
 {
 "cells": [
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"%load_ext autoreload\n",
+"%autoreload 2"
+]
+},
 {
 "cell_type": "code",
 "execution_count": 1,
@@ -141,12 +151,10 @@
 }
 ],
 "source": [
-"%load_ext autoreload\n",
-"%autoreload 2\n",
-"\n",
 "from fastembed.embedding import Embedding\n",
 "import pandas as pd\n",
-"pd.set_option('display.max_colwidth', None)\n",
+"\n",
+"pd.set_option(\"display.max_colwidth\", None)\n",
 "pd.DataFrame(Embedding.list_supported_models())"
 ]
 }

docs/examples/Usage_With_Qdrant.ipynb

Lines changed: 2 additions & 12 deletions
@@ -47,8 +47,6 @@
 "outputs": [],
 "source": [
 "from typing import List\n",
-"import numpy as np\n",
-"from fastembed.embedding import FlagEmbedding as Embedding\n",
 "from qdrant_client import QdrantClient"
 ]
 },
@@ -170,12 +168,7 @@
 "ids = [42, 2]\n",
 "\n",
 "# Use the new add method\n",
-"client.add(\n",
-"    collection_name=\"demo_collection\",\n",
-"    documents=docs,\n",
-"    metadata=metadata,\n",
-"    ids=ids\n",
-")"
+"client.add(collection_name=\"demo_collection\", documents=docs, metadata=metadata, ids=ids)"
 ]
 },
 {
@@ -199,10 +192,7 @@
 }
 ],
 "source": [
-"search_result = client.query(\n",
-"    collection_name=\"demo_collection\",\n",
-"    query_text=[\"This is a query document\"]\n",
-")\n",
+"search_result = client.query(collection_name=\"demo_collection\", query_text=[\"This is a query document\"])\n",
 "print(search_result)"
 ]
 },
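
Unescaped from the notebook JSON, the two reformatted cells boil down to the flow below. A sketch only: the in-memory client and the `docs`/`metadata` values are stand-ins, since the notebook's earlier cells are not part of this hunk (only `ids = [42, 2]` is visible above).

```python
from qdrant_client import QdrantClient

client = QdrantClient(":memory:")  # stand-in; the notebook configures its own client earlier

docs = ["Example passage about Qdrant", "Another example passage"]  # hypothetical sample documents
metadata = [{"source": "example"}, {"source": "example"}]            # hypothetical payloads
ids = [42, 2]

# The add() helper embeds the documents with FastEmbed and upserts them
client.add(collection_name="demo_collection", documents=docs, metadata=metadata, ids=ids)

# query() embeds the query text the same way and searches the collection
search_result = client.query(collection_name="demo_collection", query_text=["This is a query document"])
print(search_result)
```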

docs/experimental/Binary Quantization from Scratch.ipynb

Lines changed: 2 additions & 1 deletion
@@ -46,7 +46,6 @@
 "source": [
 "import numpy as np\n",
 "import pandas as pd\n",
-"import matplotlib.pyplot as plt\n",
 "from tqdm import tqdm"
 ]
 },
@@ -305,9 +304,11 @@
 "sampling_rate = [1, 2, 3, 5]\n",
 "results = []\n",
 "\n",
+"\n",
 "def mean_accuracy(number_of_samples, limit, sampling_rate):\n",
 "    return np.mean([accuracy(i, limit=limit, oversampling=sampling_rate) for i in range(number_of_samples)])\n",
 "\n",
+"\n",
 "for i in tqdm(sampling_rate):\n",
 "    for j in tqdm(limits):\n",
 "        result = {\"sampling_rate\": i, \"limit\": j, \"recall\": mean_accuracy(number_of_samples, j, i)}\n",

docs/experimental/Binary Quantization with Qdrant.ipynb

Lines changed: 25 additions & 25 deletions
@@ -34,10 +34,8 @@
 "outputs": [],
 "source": [
 "import pandas as pd\n",
-"import uuid\n",
 "from qdrant_client import QdrantClient\n",
-"from qdrant_client.http import models\n",
-"from qdrant_client.http.models import PointStruct"
+"from qdrant_client.http import models"
 ]
 },
 {
@@ -71,6 +69,7 @@
 ],
 "source": [
 "import datasets\n",
+"\n",
 "dataset = datasets.load_dataset(\"KShivendu/dbpedia-entities-openai-1M\", split=\"train[0:100000]\")"
 ]
 },
@@ -133,10 +132,8 @@
 }
 ],
 "source": [
-"from qdrant_client import QdrantClient\n",
-"\n",
 "# client = QdrantClient(\n",
-"#     url=\"https://2aaa9439-b209-4ba6-8beb-d0b61dbd9388.us-east-1-0.aws.cloud.qdrant.io:6333\", \n",
+"#     url=\"https://2aaa9439-b209-4ba6-8beb-d0b61dbd9388.us-east-1-0.aws.cloud.qdrant.io:6333\",\n",
 "#     api_key=\"FCF8_ADVuSRrtNGeg_rBJvAMJecEDgQhzuXMZGW8F7OzvaC9wYOPeQ\",\n",
 "#     prefer_grpc=True\n",
 "# )\n",
@@ -175,12 +172,10 @@
 "bs = 10000\n",
 "for i in range(0, len(dataset), bs):\n",
 "    client.upload_collection(\n",
-"        collection_name=collection_name, \n",
-"        ids=range(i, i+bs),\n",
-"        vectors=dataset[i:i+bs][\"openai\"],\n",
-"        payload=[\n",
-"            {\"text\": x} for x in dataset[i:i+bs][\"text\"]\n",
-"        ],\n",
+"        collection_name=collection_name,\n",
+"        ids=range(i, i + bs),\n",
+"        vectors=dataset[i : i + bs][\"openai\"],\n",
+"        payload=[{\"text\": x} for x in dataset[i : i + bs][\"text\"]],\n",
 "        parallel=10,\n",
 "    )"
 ]
@@ -203,10 +198,7 @@
 ],
 "source": [
 "client.update_collection(\n",
-"    collection_name=f\"{collection_name}\",\n",
-"    optimizer_config=models.OptimizersConfigDiff(\n",
-"        indexing_threshold=20000\n",
-"    )\n",
+"    collection_name=f\"{collection_name}\", optimizer_config=models.OptimizersConfigDiff(indexing_threshold=20000)\n",
 ")"
 ]
 },
@@ -289,6 +281,7 @@
 "source": [
 "import random\n",
 "from random import randint\n",
+"\n",
 "random.seed(37)\n",
 "\n",
 "query_indices = [randint(0, len(dataset)) for _ in range(100)]\n",
@@ -304,7 +297,10 @@
 "source": [
 "## Add Gaussian noise to any vector\n",
 "import numpy as np\n",
+"\n",
 "np.random.seed(37)\n",
+"\n",
+"\n",
 "def add_noise(vector, noise=0.05):\n",
 "    return vector + noise * np.random.randn(*vector.shape)"
 ]
@@ -959,6 +955,8 @@
 ],
 "source": [
 "import time\n",
+"\n",
+"\n",
 "def correct(results, text):\n",
 "    result_texts = [x.payload[\"text\"] for x in results]\n",
 "    return text in result_texts\n",
@@ -977,7 +975,7 @@
 "                rescore=rescore,\n",
 "                oversampling=oversampling,\n",
 "            )\n",
-"        )\n",
+"        ),\n",
 "    )\n",
 "    correct_results += correct(results, text)\n",
 "    return correct_results\n",
@@ -996,14 +994,16 @@
 "    start = time.time()\n",
 "    correct_results = count_correct(query_dataset, limit=limit, oversampling=oversampling, rescore=rescore)\n",
 "    end = time.time()\n",
-"    results.append({\n",
-"        \"limit\": limit,\n",
-"        \"oversampling\": oversampling,\n",
-"        \"rescore\": rescore,\n",
-"        \"correct\": correct_results,\n",
-"        \"total queries\": len(query_dataset[\"text\"]),\n",
-"        \"time\": end - start,\n",
-"    })\n",
+"    results.append(\n",
+"        {\n",
+"            \"limit\": limit,\n",
+"            \"oversampling\": oversampling,\n",
+"            \"rescore\": rescore,\n",
+"            \"correct\": correct_results,\n",
+"            \"total queries\": len(query_dataset[\"text\"]),\n",
+"            \"time\": end - start,\n",
+"        }\n",
+"    )\n",
 "\n",
 "results_df = pd.DataFrame(results)\n",
 "results_df"

docs/index.md

Lines changed: 5 additions & 5 deletions
@@ -15,7 +15,7 @@ The default embedding supports "query" and "passage" prefixes for the input text
 
 ## 🚀 Installation
 
-To install the FastEmbed library, pip works:
+To install the FastEmbed library, pip works:
 
 ```bash
 pip install fastembed
@@ -32,8 +32,8 @@ documents: List[str] = [
     "passage: This is an example passage.",
    "fastembed is supported by and maintained by Qdrant." # You can leave out the prefix but it's recommended
 ]
-embedding_model = Embedding(model_name="BAAI/bge-base-en", max_length=512)
-embeddings: List[np.ndarray] = embedding_model.embed(documents) # If you use
+embedding_model = Embedding(model_name="BAAI/bge-base-en", max_length=512)
+embeddings: List[np.ndarray] = embedding_model.embed(documents) # If you use
 ```
 
 ## Usage with Qdrant
@@ -44,7 +44,7 @@ Installation with Qdrant Client in Python:
 pip install qdrant-client[fastembed]
 ```
 
-Might have to use ```pip install 'qdrant-client[fastembed]'``` on zsh.
+Might have to use ```pip install 'qdrant-client[fastembed]'``` on zsh.
 
 ```python
 from qdrant_client import QdrantClient
@@ -73,4 +73,4 @@ search_result = client.query(
     query_text="This is a query document"
 )
 print(search_result)
-```
+```

docs/overrides/main.html

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,7 @@
 <a href="{{ page.nb_url }}" title="Download Notebook" class="md-content__button md-icon jp-DownloadNB">
     {% include ".icons/material/download.svg" %}
 </a>
-{% endif %}
+{% endif %}
 
 {{ super() }}
 
@@ -24,4 +24,4 @@
     href="https://cloud.qdrant.io?utm_source=twitter&utm_medium=website&utm_campaign=fastembed">Qdrant Cloud</a> to
     get started with vector search!
 </div>
-{% endblock %}
+{% endblock %}
