Skip to content

Commit 49762a6

Browse files
mrscoopers and Evgeniya Sukhodolskaya authored
Description of text embedding models, fix for consistency (#317)
* Description of text embedding models, fix for consistency
* fixed misplacing of one description

---------

Co-authored-by: Evgeniya Sukhodolskaya <[email protected]>
1 parent 782273f commit 49762a6

5 files changed

+25 -25 lines changed

fastembed/text/clip_embedding.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010
{
1111
"model": "Qdrant/clip-ViT-B-32-text",
1212
"dim": 512,
13-
"description": "CLIP text encoder",
13+
"description": "Text embeddings, Multimodal (text&image), English, 77 input tokens truncation, Prefixes for queries/documents: not necessary, 2021 year",
1414
"size_in_GB": 0.25,
1515
"sources": {
1616
"hf": "Qdrant/clip-ViT-B-32-text",

fastembed/text/e5_onnx_embedding.py

+2 -2
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
{
1010
"model": "intfloat/multilingual-e5-large",
1111
"dim": 1024,
12-
"description": "Multilingual model, e5-large. Recommend using this model for non-English languages",
12+
"description": "Text embeddings, Unimodal (text), Multilingual (~100 languages), 512 input tokens truncation, Prefixes for queries/documents: necessary, 2024 year",
1313
"size_in_GB": 2.24,
1414
"sources": {
1515
"url": "https://storage.googleapis.com/qdrant-fastembed/fast-multilingual-e5-large.tar.gz",
@@ -21,7 +21,7 @@
2121
{
2222
"model": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
2323
"dim": 768,
24-
"description": "Sentence-transformers model for tasks like clustering or semantic search",
24+
"description": "Text embeddings, Unimodal (text), Multilingual (~50 languages), 384 input tokens truncation, Prefixes for queries/documents: not necessary, 2021 year",
2525
"size_in_GB": 1.00,
2626
"sources": {
2727
"hf": "xenova/paraphrase-multilingual-mpnet-base-v2",

fastembed/text/onnx_embedding.py

+14 -14
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
{
1313
"model": "BAAI/bge-base-en",
1414
"dim": 768,
15-
"description": "Base English model",
15+
"description": "Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: necessary, 2023 year",
1616
"size_in_GB": 0.42,
1717
"sources": {
1818
"url": "https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en.tar.gz",
@@ -22,7 +22,7 @@
2222
{
2323
"model": "BAAI/bge-base-en-v1.5",
2424
"dim": 768,
25-
"description": "Base English model, v1.5",
25+
"description": "Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: not so necessary, 2023 year",
2626
"size_in_GB": 0.21,
2727
"sources": {
2828
"url": "https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en-v1.5.tar.gz",
@@ -33,7 +33,7 @@
3333
{
3434
"model": "BAAI/bge-large-en-v1.5",
3535
"dim": 1024,
36-
"description": "Large English model, v1.5",
36+
"description": "Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: not so necessary, 2023 year",
3737
"size_in_GB": 1.20,
3838
"sources": {
3939
"hf": "qdrant/bge-large-en-v1.5-onnx",
@@ -43,7 +43,7 @@
4343
{
4444
"model": "BAAI/bge-small-en",
4545
"dim": 384,
46-
"description": "Fast English model",
46+
"description": "Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: necessary, 2023 year",
4747
"size_in_GB": 0.13,
4848
"sources": {
4949
"url": "https://storage.googleapis.com/qdrant-fastembed/BAAI-bge-small-en.tar.gz",
@@ -53,7 +53,7 @@
5353
{
5454
"model": "BAAI/bge-small-en-v1.5",
5555
"dim": 384,
56-
"description": "Fast and Default English model",
56+
"description": "Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: not so necessary, 2023 year",
5757
"size_in_GB": 0.067,
5858
"sources": {
5959
"hf": "qdrant/bge-small-en-v1.5-onnx-q",
@@ -63,7 +63,7 @@
6363
{
6464
"model": "BAAI/bge-small-zh-v1.5",
6565
"dim": 512,
66-
"description": "Fast and recommended Chinese model",
66+
"description": "Text embeddings, Unimodal (text), Chinese, 512 input tokens truncation, Prefixes for queries/documents: not so necessary, 2023 year",
6767
"size_in_GB": 0.09,
6868
"sources": {
6969
"url": "https://storage.googleapis.com/qdrant-fastembed/fast-bge-small-zh-v1.5.tar.gz",
@@ -73,7 +73,7 @@
7373
{
7474
"model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
7575
"dim": 384,
76-
"description": "Sentence Transformer model, paraphrase-multilingual-MiniLM-L12-v2",
76+
"description": "Text embeddings, Unimodal (text), Multilingual (~50 languages), 512 input tokens truncation, Prefixes for queries/documents: not necessary, 2019 year",
7777
"size_in_GB": 0.22,
7878
"sources": {
7979
"hf": "qdrant/paraphrase-multilingual-MiniLM-L12-v2-onnx-Q",
@@ -83,7 +83,7 @@
8383
{
8484
"model": "thenlper/gte-large",
8585
"dim": 1024,
86-
"description": "Large general text embeddings model",
86+
"description": "Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: not necessary, 2023 year",
8787
"size_in_GB": 1.20,
8888
"sources": {
8989
"hf": "qdrant/gte-large-onnx",
@@ -93,7 +93,7 @@
9393
{
9494
"model": "mixedbread-ai/mxbai-embed-large-v1",
9595
"dim": 1024,
96-
"description": "MixedBread Base sentence embedding model, does well on MTEB",
96+
"description": "Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: necessary, 2024 year",
9797
"size_in_GB": 0.64,
9898
"sources": {
9999
"hf": "mixedbread-ai/mxbai-embed-large-v1",
@@ -103,7 +103,7 @@
103103
{
104104
"model": "snowflake/snowflake-arctic-embed-xs",
105105
"dim": 384,
106-
"description": "Based on all-MiniLM-L6-v2 model with only 22m parameters, ideal for latency/TCO budgets.",
106+
"description": "Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: necessary, 2024 year",
107107
"size_in_GB": 0.09,
108108
"sources": {
109109
"hf": "snowflake/snowflake-arctic-embed-xs",
@@ -113,7 +113,7 @@
113113
{
114114
"model": "snowflake/snowflake-arctic-embed-s",
115115
"dim": 384,
116-
"description": "Based on infloat/e5-small-unsupervised, does not trade off retrieval accuracy for its small size.",
116+
"description": "Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: necessary, 2024 year",
117117
"size_in_GB": 0.13,
118118
"sources": {
119119
"hf": "snowflake/snowflake-arctic-embed-s",
@@ -123,7 +123,7 @@
123123
{
124124
"model": "snowflake/snowflake-arctic-embed-m",
125125
"dim": 768,
126-
"description": "Based on intfloat/e5-base-unsupervised model, provides the best retrieval without slowing down inference.",
126+
"description": "Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: necessary, 2024 year",
127127
"size_in_GB": 0.43,
128128
"sources": {
129129
"hf": "Snowflake/snowflake-arctic-embed-m",
@@ -133,7 +133,7 @@
133133
{
134134
"model": "snowflake/snowflake-arctic-embed-m-long",
135135
"dim": 768,
136-
"description": "Based on nomic-ai/nomic-embed-text-v1-unsupervised model, 8192 context-length model",
136+
"description": "Text embeddings, Unimodal (text), English, 2048 input tokens truncation, Prefixes for queries/documents: necessary, 2024 year",
137137
"size_in_GB": 0.54,
138138
"sources": {
139139
"hf": "snowflake/snowflake-arctic-embed-m-long",
@@ -143,7 +143,7 @@
143143
{
144144
"model": "snowflake/snowflake-arctic-embed-l",
145145
"dim": 1024,
146-
"description": "Based on intfloat/e5-large-unsupervised, large model for most accurate retrieval.",
146+
"description": "Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: necessary, 2024 year",
147147
"size_in_GB": 1.02,
148148
"sources": {
149149
"hf": "snowflake/snowflake-arctic-embed-l",

fastembed/text/pooled_embedding.py

+3 -3
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
{
1212
"model": "nomic-ai/nomic-embed-text-v1.5",
1313
"dim": 768,
14-
"description": "8192 context length english model",
14+
"description": "Text embeddings, Multimodal (text, image), English, 8192 input tokens truncation, Prefixes for queries/documents: necessary, 2024 year",
1515
"size_in_GB": 0.52,
1616
"sources": {
1717
"hf": "nomic-ai/nomic-embed-text-v1.5",
@@ -21,7 +21,7 @@
2121
{
2222
"model": "nomic-ai/nomic-embed-text-v1.5-Q",
2323
"dim": 768,
24-
"description": "Quantized 8192 context length english model",
24+
"description": "Text embeddings, Multimodal (text, image), English, 8192 input tokens truncation, Prefixes for queries/documents: necessary, 2024 year",
2525
"size_in_GB": 0.13,
2626
"sources": {
2727
"hf": "nomic-ai/nomic-embed-text-v1.5",
@@ -31,7 +31,7 @@
3131
{
3232
"model": "nomic-ai/nomic-embed-text-v1",
3333
"dim": 768,
34-
"description": "8192 context length english model",
34+
"description": "Text embeddings, Multimodal (text, image), English, 8192 input tokens truncation, Prefixes for queries/documents: necessary, 2024 year",
3535
"size_in_GB": 0.52,
3636
"sources": {
3737
"hf": "nomic-ai/nomic-embed-text-v1",

fastembed/text/pooled_normalized_embedding.py

+5 -5
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
{
1313
"model": "sentence-transformers/all-MiniLM-L6-v2",
1414
"dim": 384,
15-
"description": "Sentence Transformer model, MiniLM-L6-v2",
15+
"description": "Text embeddings, Unimodal (text), English, 256 input tokens truncation, Prefixes for queries/documents: not necessary, 2021 year",
1616
"size_in_GB": 0.09,
1717
"sources": {
1818
"url": "https://storage.googleapis.com/qdrant-fastembed/sentence-transformers-all-MiniLM-L6-v2.tar.gz",
@@ -23,31 +23,31 @@
2323
{
2424
"model": "jinaai/jina-embeddings-v2-base-en",
2525
"dim": 768,
26-
"description": "English embedding model supporting 8192 sequence length",
26+
"description": "Text embeddings, Unimodal (text), English, 8192 input tokens truncation, Prefixes for queries/documents: not necessary, 2023 year",
2727
"size_in_GB": 0.52,
2828
"sources": {"hf": "xenova/jina-embeddings-v2-base-en"},
2929
"model_file": "onnx/model.onnx",
3030
},
3131
{
3232
"model": "jinaai/jina-embeddings-v2-small-en",
3333
"dim": 512,
34-
"description": "English embedding model supporting 8192 sequence length",
34+
"description": "Text embeddings, Unimodal (text), English, 8192 input tokens truncation, Prefixes for queries/documents: not necessary, 2023 year",
3535
"size_in_GB": 0.12,
3636
"sources": {"hf": "xenova/jina-embeddings-v2-small-en"},
3737
"model_file": "onnx/model.onnx",
3838
},
3939
{
4040
"model": "jinaai/jina-embeddings-v2-base-de",
4141
"dim": 768,
42-
"description": "German embedding model supporting 8192 sequence length",
42+
"description": "Text embeddings, Unimodal (text), Multilingual (German, English), 8192 input tokens truncation, Prefixes for queries/documents: not necessary, 2024 year",
4343
"size_in_GB": 0.32,
4444
"sources": {"hf": "jinaai/jina-embeddings-v2-base-de"},
4545
"model_file": "onnx/model_fp16.onnx",
4646
},
4747
{
4848
"model": "jinaai/jina-embeddings-v2-base-code",
4949
"dim": 768,
50-
"description": "Source code embedding model supporting 8192 sequence length",
50+
"description": "Text embeddings, Unimodal (text), Multilingual (English, 30 programming languages), 8192 input tokens truncation, Prefixes for queries/documents: not necessary, 2024 year",
5151
"size_in_GB": 0.64,
5252
"sources": {"hf": "jinaai/jina-embeddings-v2-base-code"},
5353
"model_file": "onnx/model.onnx",

0 commit comments

Comments
 (0)