|
47 | 47 | },
|
48 | 48 | {
|
49 | 49 | "cell_type": "code",
|
50 |
| - "execution_count": 2, |
| 50 | + "execution_count": null, |
51 | 51 | "metadata": {
|
52 | 52 | "ExecuteTime": {
|
53 | 53 | "end_time": "2024-03-30T00:49:20.516644Z",
|
|
56 | 56 | },
|
57 | 57 | "outputs": [],
|
58 | 58 | "source": [
|
59 |
| - "from fastembed import SparseTextEmbedding, SparseEmbedding\n", |
60 |
| - "from typing import List" |
| 59 | + "from fastembed import SparseTextEmbedding, SparseEmbedding" |
61 | 60 | ]
|
62 | 61 | },
|
63 | 62 | {
|
|
134 | 133 | },
|
135 | 134 | {
|
136 | 135 | "cell_type": "code",
|
137 |
| - "execution_count": 5, |
| 136 | + "execution_count": null, |
138 | 137 | "metadata": {
|
139 | 138 | "ExecuteTime": {
|
140 | 139 | "end_time": "2024-03-30T00:49:28.624109Z",
|
|
143 | 142 | },
|
144 | 143 | "outputs": [],
|
145 | 144 | "source": [
|
146 |
| - "documents: List[str] = [\n", |
| 145 | + "documents: list[str] = [\n", |
147 | 146 | " \"Chandrayaan-3 is India's third lunar mission\",\n",
|
148 | 147 | " \"It aimed to land a rover on the Moon's surface - joining the US, China and Russia\",\n",
|
149 | 148 | " \"The mission is a follow-up to Chandrayaan-2, which had partial success\",\n",
|
|
157 | 156 | " \"Chandrayaan-3 was launched from the Satish Dhawan Space Centre in Sriharikota\",\n",
|
158 | 157 | " \"Chandrayaan-3 was launched earlier in the year 2023\",\n",
|
159 | 158 | "]\n",
|
160 |
| - "sparse_embeddings_list: List[SparseEmbedding] = list(\n", |
| 159 | + "sparse_embeddings_list: list[SparseEmbedding] = list(\n", |
161 | 160 | " model.embed(documents, batch_size=6)\n",
|
162 | 161 | ") # batch_size is optional, notice the generator"
|
163 | 162 | ]
|
|
235 | 234 | "source": [
|
236 | 235 | "# Let's print the first 5 features and their weights for better understanding.\n",
|
237 | 236 | "for i in range(5):\n",
|
238 |
| - " print(f\"Token at index {sparse_embeddings_list[0].indices[i]} has weight {sparse_embeddings_list[0].values[i]}\")" |
| 237 | + " print(\n", |
| 238 | + " f\"Token at index {sparse_embeddings_list[0].indices[i]} has weight {sparse_embeddings_list[0].values[i]}\"\n", |
| 239 | + " )" |
239 | 240 | ]
|
240 | 241 | },
|
241 | 242 | {
|
|
261 | 262 | "import json\n",
|
262 | 263 | "from transformers import AutoTokenizer\n",
|
263 | 264 | "\n",
|
264 |
| - "tokenizer = AutoTokenizer.from_pretrained(SparseTextEmbedding.list_supported_models()[0][\"sources\"][\"hf\"])" |
| 265 | + "tokenizer = AutoTokenizer.from_pretrained(\n", |
| 266 | + " SparseTextEmbedding.list_supported_models()[0][\"sources\"][\"hf\"]\n", |
| 267 | + ")" |
265 | 268 | ]
|
266 | 269 | },
|
267 | 270 | {
|
|
326 | 329 | " token_weight_dict[token] = weight\n",
|
327 | 330 | "\n",
|
328 | 331 | " # Sort the dictionary by weights\n",
|
329 |
| - " token_weight_dict = dict(sorted(token_weight_dict.items(), key=lambda item: item[1], reverse=True))\n", |
| 332 | + " token_weight_dict = dict(\n", |
| 333 | + " sorted(token_weight_dict.items(), key=lambda item: item[1], reverse=True)\n", |
| 334 | + " )\n", |
330 | 335 | " return token_weight_dict\n",
|
331 | 336 | "\n",
|
332 | 337 | "\n",
|
|
0 commit comments