Skip to content

Commit 05f4deb

Browse files
committed
Remove the useless reduced dimensions dataset
1 parent 6c0feab commit 05f4deb

File tree

5 files changed: 20 additions (+20) and 69 deletions (-69).

benchmarks/src/arroy_bench.rs

Lines changed: 3 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ use rand::SeedableRng;
1111
use rayon::iter::{IntoParallelIterator, ParallelIterator};
1212
use roaring::RoaringBitmap;
1313

14-
use crate::{normalize_vector, partial_sort_by, Recall, RECALL_TESTED, RNG_SEED};
14+
use crate::{partial_sort_by, Recall, RECALL_TESTED, RNG_SEED};
1515
const TWENTY_HUNDRED_MIB: usize = 2000 * 1024 * 1024 * 1024;
1616

1717
pub fn measure_arroy_distance<
@@ -21,7 +21,6 @@ pub fn measure_arroy_distance<
2121
const FILTER_SUBSET_PERCENT: usize,
2222
>(
2323
dimensions: usize,
24-
require_normalization: bool,
2524
points: &[(u32, &[f32])],
2625
) {
2726
let dir = tempfile::tempdir().unwrap();
@@ -36,14 +35,7 @@ pub fn measure_arroy_distance<
3635
let database = env
3736
.create_database::<internals::KeyCodec, NodeCodec<ArroyDistance>>(&mut wtxn, None)
3837
.unwrap();
39-
let inserted = load_into_arroy(
40-
&mut arroy_seed,
41-
&mut wtxn,
42-
database,
43-
dimensions,
44-
require_normalization,
45-
points,
46-
);
38+
let inserted = load_into_arroy(&mut arroy_seed, &mut wtxn, database, dimensions, points);
4739
wtxn.commit().unwrap();
4840

4941
let filtered_percentage = FILTER_SUBSET_PERCENT as f32;
@@ -144,18 +136,13 @@ fn load_into_arroy<D: arroy::Distance>(
144136
wtxn: &mut RwTxn,
145137
database: Database<D>,
146138
dimensions: usize,
147-
require_normalization: bool,
148139
points: &[(ItemId, &[f32])],
149140
) -> RoaringBitmap {
150141
let writer = Writer::<D>::new(database, 0, dimensions);
151142
let mut candidates = RoaringBitmap::new();
152143
for (i, vector) in points.iter() {
153144
assert_eq!(vector.len(), dimensions);
154-
if require_normalization {
155-
writer.add_item(wtxn, *i, &normalize_vector(vector)).unwrap();
156-
} else {
157-
writer.add_item(wtxn, *i, vector).unwrap();
158-
}
145+
writer.add_item(wtxn, *i, vector).unwrap();
159146
assert!(candidates.push(*i));
160147
}
161148
writer.build(wtxn, rng, None).unwrap();

benchmarks/src/dataset.rs

Lines changed: 4 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,6 @@ pub struct MatLEView<T> {
1010
name: &'static str,
1111
mmap: Mmap,
1212
dimensions: usize,
13-
reduced_dimensions: usize,
1413
_marker: PhantomData<T>,
1514
}
1615

@@ -20,29 +19,20 @@ impl<T: AnyBitPattern> MatLEView<T> {
2019
let mmap = unsafe { Mmap::map(&file).unwrap() };
2120

2221
assert!((mmap.len() / mem::size_of::<T>()) % dimensions == 0);
23-
MatLEView { name, mmap, dimensions, reduced_dimensions: dimensions, _marker: PhantomData }
24-
}
25-
26-
pub fn reduce_dimensions_to(&mut self, dimensions: usize) {
27-
self.reduced_dimensions = dimensions;
22+
MatLEView { name, mmap, dimensions, _marker: PhantomData }
2823
}
2924

3025
pub fn header(&self) {
3126
println!(
3227
"{} - {} vectors of \x1b[1m{}\x1b[0m dimensions",
3328
self.name,
3429
self.len(),
35-
self.reduced_dimensions
30+
self.dimensions
3631
);
3732
}
3833

39-
#[allow(clippy::misnamed_getters)]
4034
pub fn dimensions(&self) -> usize {
41-
self.reduced_dimensions
42-
}
43-
44-
pub fn reduced_dimensions(&self) -> bool {
45-
self.reduced_dimensions != self.dimensions
35+
self.dimensions
4636
}
4737

4838
pub fn is_empty(&self) -> bool {
@@ -57,7 +47,7 @@ impl<T: AnyBitPattern> MatLEView<T> {
5747
let tsize = mem::size_of::<T>();
5848
if (index * self.dimensions + self.dimensions) * tsize < self.mmap.len() {
5949
let start = index * self.dimensions;
60-
let bytes = &self.mmap[start * tsize..(start + self.reduced_dimensions) * tsize];
50+
let bytes = &self.mmap[start * tsize..(start + self.dimensions) * tsize];
6151
match bytemuck::try_cast_slice::<u8, T>(bytes) {
6252
Ok(slice) => Some(Ok(slice)),
6353
Err(e) => Some(Err(e)),

benchmarks/src/lib.rs

Lines changed: 6 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,5 @@
1+
#![allow(clippy::type_complexity)]
2+
13
mod arroy_bench;
24
mod dataset;
35
mod qdrant;
@@ -17,11 +19,7 @@ use crate::qdrant::measure_qdrant_distance;
1719
pub const RECALL_TESTED: [usize; 6] = [1, 10, 20, 50, 100, 500];
1820
pub const RNG_SEED: u64 = 38;
1921

20-
pub fn bench_over_all_distances(
21-
dimensions: usize,
22-
require_normalization: bool,
23-
vectors: &[(u32, &[f32])],
24-
) {
22+
pub fn bench_over_all_distances(dimensions: usize, vectors: &[(u32, &[f32])]) {
2523
println!("\x1b[1m{}\x1b[0m vectors are used for this measure", vectors.len());
2624
let mut recall_tested = String::new();
2725
RECALL_TESTED.iter().for_each(|recall| write!(&mut recall_tested, "{recall:4}, ").unwrap());
@@ -73,7 +71,7 @@ pub fn bench_over_all_distances(
7371
// bench_qdrant_distance::<DotProduct, false>(),
7472
// bench_arroy_distance::<DotProduct, 1>(),
7573
] {
76-
(func)(dimensions, require_normalization, vectors);
74+
(func)(dimensions, vectors);
7775
}
7876
}
7977

@@ -122,20 +120,15 @@ fn bench_arroy_distance<
122120
D: Distance,
123121
const OVERSAMPLING: usize,
124122
const FILTER_SUBSET_PERCENT: usize,
125-
>() -> fn(usize, bool, &[(u32, &[f32])]) {
123+
>() -> fn(usize, &[(u32, &[f32])]) {
126124
measure_arroy_distance::<D, D::RealDistance, OVERSAMPLING, FILTER_SUBSET_PERCENT>
127125
}
128126

129127
fn bench_qdrant_distance<D: Distance, const EXACT: bool, const FILTER_SUBSET_PERCENT: usize>(
130-
) -> fn(usize, bool, &[(u32, &[f32])]) {
128+
) -> fn(usize, &[(u32, &[f32])]) {
131129
measure_qdrant_distance::<D, EXACT, FILTER_SUBSET_PERCENT>
132130
}
133131

134-
fn normalize_vector(input: &[f32]) -> Vec<f32> {
135-
let norm: f32 = input.iter().map(|&x| x * x).sum::<f32>().sqrt();
136-
input.iter().map(|&x| x / norm).collect()
137-
}
138-
139132
fn partial_sort_by<'a, D: arroy::Distance>(
140133
mut vectors: impl Iterator<Item = (ItemId, &'a [f32])>,
141134
sort_by: &[f32],

benchmarks/src/main.rs

Lines changed: 5 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -16,16 +16,6 @@ fn db_pedia_3_large() -> MatLEView<f32> {
1616
)
1717
}
1818

19-
fn db_pedia_3_large_reduced_to_1536() -> MatLEView<f32> {
20-
let mut mat = MatLEView::new(
21-
"db pedia OpenAI text-embedding 3 large 1536",
22-
"assets/db-pedia-OpenAI-text-embedding-3-large.mat",
23-
3072,
24-
);
25-
mat.reduce_dimensions_to(1536);
26-
mat
27-
}
28-
2919
fn db_pedia_ada_002_large() -> MatLEView<f32> {
3020
MatLEView::new(
3121
"db pedia OpenAI text-embedding ada 002",
@@ -45,22 +35,17 @@ fn wikipedia_768() -> MatLEView<f32> {
4535
fn main() {
4636
let take = 100_000;
4737
for dataset in [
48-
// &hn_posts(),
49-
// &hn_top_post(),
50-
// &db_pedia_3_large_reduced_to_1536(),
51-
// &db_pedia_3_large(),
52-
// &db_pedia_ada_002_large(),
38+
&hn_posts(),
39+
&hn_top_post(),
40+
&db_pedia_3_large(),
41+
&db_pedia_ada_002_large(),
5342
&wikipedia_768(),
5443
] {
5544
let vectors: Vec<(u32, &[f32])> =
5645
dataset.iter().enumerate().map(|(i, v)| (i as u32, v)).take(take).collect();
5746

5847
dataset.header();
59-
bench_over_all_distances(
60-
dataset.dimensions(),
61-
dataset.reduced_dimensions(),
62-
vectors.as_slice(),
63-
);
48+
bench_over_all_distances(dataset.dimensions(), vectors.as_slice());
6449
println!();
6550
}
6651
}

benchmarks/src/qdrant.rs

Lines changed: 2 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -13,15 +13,14 @@ use rand::prelude::SliceRandom;
1313
use rand::rngs::StdRng;
1414
use rand::SeedableRng;
1515

16-
use crate::{normalize_vector, partial_sort_by, Distance, Recall, RECALL_TESTED, RNG_SEED};
16+
use crate::{partial_sort_by, Distance, Recall, RECALL_TESTED, RNG_SEED};
1717

1818
pub fn measure_qdrant_distance<
1919
D: Distance,
2020
const EXACT: bool,
2121
const FILTER_SUBSET_PERCENT: usize,
2222
>(
2323
dimensions: usize,
24-
require_normalization: bool,
2524
points: &[(u32, &[f32])],
2625
) {
2726
let filtered_percentage = FILTER_SUBSET_PERCENT as f32;
@@ -35,12 +34,9 @@ pub fn measure_qdrant_distance<
3534
let points: Vec<_> = points
3635
.iter()
3736
.map(|(id, vector)| {
38-
let vector =
39-
if require_normalization { normalize_vector(vector) } else { vector.to_vec() };
40-
4137
PointStruct::new(
4238
*id as u64,
43-
vector,
39+
vector.to_vec(),
4440
Payload::try_from(serde_json::json!({ "id": *id })).unwrap(),
4541
)
4642
})

0 commit comments

Comments
 (0)