Skip to content

Commit 2d91ea0

Browse files
committed
add base64 vector benchmark as unit test
1 parent 74cbc92 commit 2d91ea0

File tree

5 files changed

+1321
-0
lines changed

5 files changed

+1321
-0
lines changed
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
/*
2+
* Licensed to Elasticsearch B.V. under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch B.V. licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package co.elastic.clients.elasticsearch.experiments.benchmark.base64vectors;
21+
22+
/**
 * One row of benchmark results for a given bulk chunk size.
 *
 * @param dataset_size number of documents indexed per measured run
 * @param chunk_size   number of operations sent per bulk request
 * @param float32      timing recorded for the float-array representation
 * @param base64       timing recorded for the base64 string representation
 */
public record BenchmarkOutput(int dataset_size, int chunk_size, Float32 float32, Base64 base64) {

    /**
     * Timing for documents whose vector is a float array.
     *
     * @param duration recorded duration value
     */
    public record Float32(long duration) {}

    /**
     * Timing for documents whose vector is a base64 string.
     *
     * @param duration recorded duration value
     */
    public record Base64(long duration) {}
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
/*
2+
* Licensed to Elasticsearch B.V. under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch B.V. licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package co.elastic.clients.elasticsearch.experiments.benchmark.base64vectors;
21+
22+
/**
 * Document variant whose embedding is carried as a single string rather than a float array.
 *
 * @param docid document identifier
 * @param title document title
 * @param text  document body text
 * @param emb   embedding, held as a string
 */
public record Elasticsearch64Doc(
        String docid,
        String title,
        String text,
        String emb) {}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
/*
2+
* Licensed to Elasticsearch B.V. under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch B.V. licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package co.elastic.clients.elasticsearch.experiments.benchmark.base64vectors;
21+
22+
/**
 * Document variant whose embedding is carried as a float array.
 *
 * <p>Because {@code emb} is an array, the generated {@code equals}/{@code hashCode} compare it by
 * reference, not by content.
 *
 * @param docid document identifier
 * @param title document title
 * @param text  document body text
 * @param emb   embedding values
 */
public record ElasticsearchDoc(
        String docid,
        String title,
        String text,
        float[] emb) {}
Lines changed: 247 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,247 @@
1+
/*
2+
* Licensed to Elasticsearch B.V. under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch B.V. licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package co.elastic.clients.elasticsearch.experiments.benchmark.base64vectors;
21+
22+
import co.elastic.clients.elasticsearch.ElasticsearchClient;
import co.elastic.clients.elasticsearch.ElasticsearchTestServer;
import co.elastic.clients.elasticsearch.core.BulkRequest;
import co.elastic.clients.elasticsearch.core.bulk.BulkOperation;
import com.fasterxml.jackson.databind.MappingIterator;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectWriter;
import org.junit.Ignore;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.nio.ByteBuffer;
import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Base64;
import java.util.List;
import java.util.function.Function;
42+
43+
/*
44+
This test compares the speed of serializing and sending vectors to the server
45+
between the standard array of floats and the new base64 format introduced in 9.3
46+
*/
47+
@Ignore
48+
public class MainCommonBenchmark64 {
49+
50+
static ElasticsearchClient elasticsearchClient;
51+
52+
@BeforeAll
53+
public static void setup() {
54+
elasticsearchClient = ElasticsearchTestServer.global().client();
55+
}
56+
57+
public static List<ElasticsearchDoc> readMultipleObjects(InputStream file) throws IOException {
58+
ObjectMapper mapper = new ObjectMapper();
59+
60+
try (MappingIterator<ElasticsearchDoc> it =
61+
mapper.readerFor(ElasticsearchDoc.class).readValues(file)) {
62+
List<ElasticsearchDoc> list = new ArrayList<>();
63+
while (it.hasNext()) {
64+
list.add(it.next());
65+
}
66+
return list;
67+
}
68+
}
69+
70+
71+
public static String convertToBase64Bytes(float[] vec) {
72+
ByteBuffer buff = ByteBuffer.allocate(Float.BYTES * vec.length);
73+
for (int i = 0; i < vec.length; i++) {
74+
buff.putFloat(vec[i]);
75+
}
76+
return Base64.getEncoder().encodeToString(buff.array());
77+
}
78+
79+
@Test
80+
public void benchmark20k() throws IOException {
81+
82+
InputStream input = this.getClass()
83+
.getResourceAsStream("open_ai_corpus-initial-indexing-1k.json");
84+
85+
List<ElasticsearchDoc> docs = readMultipleObjects(input);
86+
87+
// warmup round
88+
List<BulkOperation> bulkOperations = new ArrayList<>();
89+
for (ElasticsearchDoc doc : docs) {
90+
91+
BulkOperation op = BulkOperation.of(o -> o
92+
.index(idx -> idx
93+
.index("warmup-index")
94+
.document(doc)
95+
)
96+
);
97+
bulkOperations.add(op);
98+
if (bulkOperations.size() >= 100) {
99+
List<BulkOperation> finalBulkOperations = bulkOperations;
100+
BulkRequest request = BulkRequest.of(b -> b.operations(finalBulkOperations));
101+
elasticsearchClient.bulk(request);
102+
bulkOperations = new ArrayList<>();
103+
}
104+
}
105+
elasticsearchClient.indices().delete(d -> d.index("warmup-index"));
106+
107+
108+
List<BenchmarkOutput> result = new ArrayList<>();
109+
110+
int[] chunks = {100, 250, 500, 1000};
111+
for (int chunk : chunks) {
112+
long[] floatTimes = new long[3]; // 3 runs averaged
113+
long[] base64Times = new long[3]; //same for base64
114+
for (int i = 0; i < 3; i++) {
115+
116+
if (elasticsearchClient.indices().exists(e -> e.index("vec-test")).value()) {
117+
elasticsearchClient.indices().delete(d -> d.index("vec-test"));
118+
elasticsearchClient.indices().create(c -> c.index("vec-test")
119+
.withJson(new StringReader("{\n" +
120+
" \"mappings\": {\n" +
121+
" \"properties\": {\n" +
122+
" \"text\": {\n" +
123+
" \"type\": \"text\",\n" +
124+
" \"fields\": {\n" +
125+
" \"keyword\": {\n" +
126+
" \"type\": \"keyword\",\n" +
127+
" \"ignore_above\": 256\n" +
128+
" }\n" +
129+
" }\n" +
130+
" },\n" +
131+
" \"emb\": {\n" +
132+
" \"type\": \"dense_vector\",\n" +
133+
" \"dims\": 1536,\n" +
134+
" \"index\": true,\n" +
135+
" \"similarity\": \"cosine\",\n" +
136+
" \"index_options\": {\n" +
137+
" \"type\": \"flat\"\n" +
138+
" }\n" +
139+
" }\n" +
140+
" }\n" +
141+
" }\n" +
142+
"}")));
143+
elasticsearchClient.indices().refresh();
144+
}
145+
146+
Instant start = Instant.now();
147+
148+
bulkOperations = new ArrayList<>();
149+
150+
for (int d = 0; d < 20; d++) { // repeating dataset 20 times
151+
for (ElasticsearchDoc doc : docs) {
152+
153+
BulkOperation op = BulkOperation.of(o -> o
154+
.index(idx -> idx
155+
.index("vec-test")
156+
.document(doc)
157+
)
158+
);
159+
bulkOperations.add(op);
160+
if (bulkOperations.size() >= chunk) {
161+
List<BulkOperation> finalBulkOperations = bulkOperations;
162+
BulkRequest request = BulkRequest.of(b -> b.operations(finalBulkOperations));
163+
elasticsearchClient.bulk(request);
164+
bulkOperations = new ArrayList<>();
165+
}
166+
}
167+
}
168+
169+
Instant end = Instant.now();
170+
floatTimes[i] = Duration.between(start, end).toMillis();
171+
172+
if (elasticsearchClient.indices().exists(e -> e.index("vec-test-64")).value()) {
173+
elasticsearchClient.indices().delete(d -> d.index("vec-test-64"));
174+
elasticsearchClient.indices().create(c -> c.index("vec-test-64")
175+
.withJson(new StringReader("{\n" +
176+
" \"mappings\": {\n" +
177+
" \"properties\": {\n" +
178+
" \"text\": {\n" +
179+
" \"type\": \"text\",\n" +
180+
" \"fields\": {\n" +
181+
" \"keyword\": {\n" +
182+
" \"type\": \"keyword\",\n" +
183+
" \"ignore_above\": 256\n" +
184+
" }\n" +
185+
" }\n" +
186+
" },\n" +
187+
" \"emb\": {\n" +
188+
" \"type\": \"dense_vector\",\n" +
189+
" \"dims\": 1536,\n" +
190+
" \"index\": true,\n" +
191+
" \"similarity\": \"cosine\",\n" +
192+
" \"index_options\": {\n" +
193+
" \"type\": \"flat\"\n" +
194+
" }\n" +
195+
" }\n" +
196+
" }\n" +
197+
" }\n" +
198+
"}")));
199+
elasticsearchClient.indices().refresh();
200+
}
201+
202+
start = Instant.now();
203+
204+
bulkOperations = new ArrayList<>();
205+
206+
for (int d = 0; d < 20; d++) { // repeating dataset 20 times
207+
for (ElasticsearchDoc doc : docs) {
208+
Elasticsearch64Doc doc64 = new Elasticsearch64Doc(doc.docid(), doc.title(),
209+
doc.text(),
210+
convertToBase64Bytes(doc.emb()));
211+
212+
BulkOperation op = BulkOperation.of(o -> o
213+
.index(idx -> idx
214+
.index("vec-test-64")
215+
.document(doc64)
216+
)
217+
);
218+
bulkOperations.add(op);
219+
if (bulkOperations.size() >= chunk) {
220+
List<BulkOperation> finalBulkOperations = bulkOperations;
221+
BulkRequest request =
222+
BulkRequest.of(b -> b.operations(finalBulkOperations));
223+
elasticsearchClient.bulk(request);
224+
bulkOperations = new ArrayList<>();
225+
}
226+
}
227+
}
228+
229+
end = Instant.now();
230+
base64Times[i] = Duration.between(start, end).toMillis();
231+
}
232+
// only counting the last 3, the first 3 are for warmup
233+
long totalFloat = (floatTimes[0] + floatTimes[1] + floatTimes[2]) / 3;
234+
long totalBase64 = (base64Times[0] + base64Times[1] + base64Times[2]) / 3;
235+
BenchmarkOutput benchmarkOutput = new BenchmarkOutput(20000, chunk,
236+
new BenchmarkOutput.Float32(totalFloat), new BenchmarkOutput.Base64(totalBase64));
237+
238+
result.add(benchmarkOutput);
239+
}
240+
ObjectWriter ow = new ObjectMapper().writer().withDefaultPrettyPrinter();
241+
String json = ow.writeValueAsString(result);
242+
System.out.println(json);
243+
244+
elasticsearchClient.indices().delete(d -> d.index("vec-test"));
245+
elasticsearchClient.indices().delete(d -> d.index("vec-test-64"));
246+
}
247+
}

java-client/src/test/resources/co/elastic/clients/elasticsearch/experiments/benchmark/base64vectors/open_ai_corpus-initial-indexing-1k.json

Lines changed: 1000 additions & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)