Skip to content

Commit 406f432

Browse files
authored
feat: Support sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 (#129)
* feat: Support sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
* test: Include sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
* docs: supported models update
1 parent defb618 commit 406f432

File tree

5 files changed

+60
-27
lines changed

5 files changed

+60
-27
lines changed

docs/examples/Supported_Models.ipynb

+39-27
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
},
1313
{
1414
"cell_type": "code",
15-
"execution_count": 4,
15+
"execution_count": 2,
1616
"metadata": {},
1717
"outputs": [
1818
{
@@ -110,38 +110,46 @@
110110
" </tr>\n",
111111
" <tr>\n",
112112
" <th>8</th>\n",
113+
" <td>sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2</td>\n",
114+
" <td>384</td>\n",
115+
" <td>Sentence Transformer model, paraphrase-multilingual-MiniLM-L12-v2</td>\n",
116+
" <td>0.46</td>\n",
117+
" <td>{'hf': 'qdrant/paraphrase-multilingual-MiniLM-L12-v2-onnx-Q'}</td>\n",
118+
" </tr>\n",
119+
" <tr>\n",
120+
" <th>9</th>\n",
113121
" <td>nomic-ai/nomic-embed-text-v1</td>\n",
114122
" <td>768</td>\n",
115123
" <td>8192 context length english model</td>\n",
116124
" <td>0.54</td>\n",
117-
" <td>{'hf': 'xenova/nomic-embed-text-v1'}</td>\n",
125+
" <td>{'hf': 'nomic-ai/nomic-embed-text-v1'}</td>\n",
118126
" </tr>\n",
119127
" <tr>\n",
120-
" <th>9</th>\n",
128+
" <th>10</th>\n",
121129
" <td>intfloat/multilingual-e5-large</td>\n",
122130
" <td>1024</td>\n",
123131
" <td>Multilingual model, e5-large. Recommend using this model for non-English languages</td>\n",
124132
" <td>2.24</td>\n",
125133
" <td>{'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-multilingual-e5-large.tar.gz', 'hf': 'qdrant/multilingual-e5-large-onnx'}</td>\n",
126134
" </tr>\n",
127135
" <tr>\n",
128-
" <th>10</th>\n",
136+
" <th>11</th>\n",
129137
" <td>sentence-transformers/paraphrase-multilingual-mpnet-base-v2</td>\n",
130138
" <td>768</td>\n",
131139
" <td>Sentence-transformers model for tasks like clustering or semantic search</td>\n",
132140
" <td>1.11</td>\n",
133141
" <td>{'hf': 'xenova/paraphrase-multilingual-mpnet-base-v2'}</td>\n",
134142
" </tr>\n",
135143
" <tr>\n",
136-
" <th>11</th>\n",
144+
" <th>12</th>\n",
137145
" <td>jinaai/jina-embeddings-v2-base-en</td>\n",
138146
" <td>768</td>\n",
139147
" <td>English embedding model supporting 8192 sequence length</td>\n",
140148
" <td>0.55</td>\n",
141149
" <td>{'hf': 'xenova/jina-embeddings-v2-base-en'}</td>\n",
142150
" </tr>\n",
143151
" <tr>\n",
144-
" <th>12</th>\n",
152+
" <th>13</th>\n",
145153
" <td>jinaai/jina-embeddings-v2-small-en</td>\n",
146154
" <td>512</td>\n",
147155
" <td>English embedding model supporting 8192 sequence length</td>\n",
@@ -162,11 +170,12 @@
162170
"5 BAAI/bge-small-en-v1.5 384 \n",
163171
"6 BAAI/bge-small-zh-v1.5 512 \n",
164172
"7 sentence-transformers/all-MiniLM-L6-v2 384 \n",
165-
"8 nomic-ai/nomic-embed-text-v1 768 \n",
166-
"9 intfloat/multilingual-e5-large 1024 \n",
167-
"10 sentence-transformers/paraphrase-multilingual-mpnet-base-v2 768 \n",
168-
"11 jinaai/jina-embeddings-v2-base-en 768 \n",
169-
"12 jinaai/jina-embeddings-v2-small-en 512 \n",
173+
"8 sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 384 \n",
174+
"9 nomic-ai/nomic-embed-text-v1 768 \n",
175+
"10 intfloat/multilingual-e5-large 1024 \n",
176+
"11 sentence-transformers/paraphrase-multilingual-mpnet-base-v2 768 \n",
177+
"12 jinaai/jina-embeddings-v2-base-en 768 \n",
178+
"13 jinaai/jina-embeddings-v2-small-en 512 \n",
170179
"\n",
171180
" description \\\n",
172181
"0 Base English model \n",
@@ -177,11 +186,12 @@
177186
"5 Fast and Default English model \n",
178187
"6 Fast and recommended Chinese model \n",
179188
"7 Sentence Transformer model, MiniLM-L6-v2 \n",
180-
"8 8192 context length english model \n",
181-
"9 Multilingual model, e5-large. Recommend using this model for non-English languages \n",
182-
"10 Sentence-transformers model for tasks like clustering or semantic search \n",
183-
"11 English embedding model supporting 8192 sequence length \n",
189+
"8 Sentence Transformer model, paraphrase-multilingual-MiniLM-L12-v2 \n",
190+
"9 8192 context length english model \n",
191+
"10 Multilingual model, e5-large. Recommend using this model for non-English languages \n",
192+
"11 Sentence-transformers model for tasks like clustering or semantic search \n",
184193
"12 English embedding model supporting 8192 sequence length \n",
194+
"13 English embedding model supporting 8192 sequence length \n",
185195
"\n",
186196
" size_in_GB \\\n",
187197
"0 0.50 \n",
@@ -192,11 +202,12 @@
192202
"5 0.13 \n",
193203
"6 0.10 \n",
194204
"7 0.09 \n",
195-
"8 0.54 \n",
196-
"9 2.24 \n",
197-
"10 1.11 \n",
198-
"11 0.55 \n",
199-
"12 0.13 \n",
205+
"8 0.46 \n",
206+
"9 0.54 \n",
207+
"10 2.24 \n",
208+
"11 1.11 \n",
209+
"12 0.55 \n",
210+
"13 0.13 \n",
200211
"\n",
201212
" sources \n",
202213
"0 {'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en.tar.gz'} \n",
@@ -207,14 +218,15 @@
207218
"5 {'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-small-en-v1.5.tar.gz', 'hf': 'qdrant/bge-small-en-v1.5-onnx-q'} \n",
208219
"6 {'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-small-zh-v1.5.tar.gz'} \n",
209220
"7 {'url': 'https://storage.googleapis.com/qdrant-fastembed/sentence-transformers-all-MiniLM-L6-v2.tar.gz', 'hf': 'qdrant/all-MiniLM-L6-v2-onnx'} \n",
210-
"8 {'hf': 'xenova/nomic-embed-text-v1'} \n",
211-
"9 {'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-multilingual-e5-large.tar.gz', 'hf': 'qdrant/multilingual-e5-large-onnx'} \n",
212-
"10 {'hf': 'xenova/paraphrase-multilingual-mpnet-base-v2'} \n",
213-
"11 {'hf': 'xenova/jina-embeddings-v2-base-en'} \n",
214-
"12 {'hf': 'xenova/jina-embeddings-v2-small-en'} "
221+
"8 {'hf': 'qdrant/paraphrase-multilingual-MiniLM-L12-v2-onnx-Q'} \n",
222+
"9 {'hf': 'nomic-ai/nomic-embed-text-v1'} \n",
223+
"10 {'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-multilingual-e5-large.tar.gz', 'hf': 'qdrant/multilingual-e5-large-onnx'} \n",
224+
"11 {'hf': 'xenova/paraphrase-multilingual-mpnet-base-v2'} \n",
225+
"12 {'hf': 'xenova/jina-embeddings-v2-base-en'} \n",
226+
"13 {'hf': 'xenova/jina-embeddings-v2-small-en'} "
215227
]
216228
},
217-
"execution_count": 4,
229+
"execution_count": 2,
218230
"metadata": {},
219231
"output_type": "execute_result"
220232
}
@@ -244,7 +256,7 @@
244256
"name": "python",
245257
"nbconvert_exporter": "python",
246258
"pygments_lexer": "ipython3",
247-
"version": "3.11.7"
259+
"version": "3.11.4"
248260
},
249261
"orig_nbformat": 4
250262
},

fastembed/models.json

+10
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,16 @@
110110
"https://storage.googleapis.com/qdrant-fastembed/sentence-transformers-all-MiniLM-L6-v2.tar.gz"
111111
]
112112
},
113+
{
114+
"model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
115+
"dim": 384,
116+
"description": "Sentence Transformer model, paraphrase-multilingual-MiniLM-L12-v2",
117+
"size_in_GB": 0.46,
118+
"hf_sources": [
119+
"qdrant/paraphrase-multilingual-MiniLM-L12-v2-onnx-Q"
120+
],
121+
"compressed_url_sources": []
122+
},
113123
{
114124
"model": "xenova/multilingual-e5-large",
115125
"dim": 1024,

fastembed/text/onnx_embedding.py

+9
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,15 @@
9898
"hf": "qdrant/all-MiniLM-L6-v2-onnx",
9999
},
100100
},
101+
{
102+
"model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
103+
"dim": 384,
104+
"description": "Sentence Transformer model, paraphrase-multilingual-MiniLM-L12-v2",
105+
"size_in_GB": 0.46,
106+
"sources": {
107+
"hf": "qdrant/paraphrase-multilingual-MiniLM-L12-v2-onnx-Q",
108+
},
109+
},
101110
{
102111
"model": "nomic-ai/nomic-embed-text-v1",
103112
"dim": 768,

tests/test_onnx_embeddings.py

+1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
"BAAI/bge-base-en-v1.5": np.array([0.01129394, 0.05493144, 0.02615099, 0.00328772, 0.02996045]),
1414
"BAAI/bge-large-en-v1.5": np.array([0.03434538, 0.03316108, 0.02191251, -0.03713358, -0.01577825]),
1515
"sentence-transformers/all-MiniLM-L6-v2": np.array([0.0259, 0.0058, 0.0114, 0.0380, -0.0233]),
16+
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2": np.array([0.0094, 0.0184, 0.0328, 0.0072, -0.0351]),
1617
"intfloat/multilingual-e5-large": np.array([0.0098, 0.0045, 0.0066, -0.0354, 0.0070]),
1718
"xenova/multilingual-e5-large": np.array([0.00975464, 0.00446568, 0.00655449, -0.0354155, 0.00702112]),
1819
"xenova/paraphrase-multilingual-mpnet-base-v2": np.array(

tests/test_text_onnx_embeddings.py

+1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
"BAAI/bge-large-en-v1.5": np.array([0.03434538, 0.03316108, 0.02191251, -0.03713358, -0.01577825]),
1515
"BAAI/bge-large-en-v1.5-quantized": np.array([0.03434538, 0.03316108, 0.02191251, -0.03713358, -0.01577825]),
1616
"sentence-transformers/all-MiniLM-L6-v2": np.array([0.0259, 0.0058, 0.0114, 0.0380, -0.0233]),
17+
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2": np.array([0.0094, 0.0184, 0.0328, 0.0072, -0.0351]),
1718
"intfloat/multilingual-e5-large": np.array([0.0098, 0.0045, 0.0066, -0.0354, 0.0070]),
1819
"sentence-transformers/paraphrase-multilingual-mpnet-base-v2": np.array(
1920
[-0.01341097, 0.0416553, -0.00480805, 0.02844842, 0.0505299]

0 commit comments

Comments
 (0)