Remove the useless reduced dimensions dataset
Kerollmops committed Sep 21, 2024
1 parent 6c0feab commit 05f4deb
Showing 5 changed files with 20 additions and 69 deletions.
19 changes: 3 additions & 16 deletions benchmarks/src/arroy_bench.rs
@@ -11,7 +11,7 @@ use rand::SeedableRng;
 use rayon::iter::{IntoParallelIterator, ParallelIterator};
 use roaring::RoaringBitmap;
 
-use crate::{normalize_vector, partial_sort_by, Recall, RECALL_TESTED, RNG_SEED};
+use crate::{partial_sort_by, Recall, RECALL_TESTED, RNG_SEED};
 const TWENTY_HUNDRED_MIB: usize = 2000 * 1024 * 1024 * 1024;
 
 pub fn measure_arroy_distance<
@@ -21,7 +21,6 @@ pub fn measure_arroy_distance<
     const FILTER_SUBSET_PERCENT: usize,
 >(
     dimensions: usize,
-    require_normalization: bool,
     points: &[(u32, &[f32])],
 ) {
     let dir = tempfile::tempdir().unwrap();
@@ -36,14 +35,7 @@
     let database = env
         .create_database::<internals::KeyCodec, NodeCodec<ArroyDistance>>(&mut wtxn, None)
         .unwrap();
-    let inserted = load_into_arroy(
-        &mut arroy_seed,
-        &mut wtxn,
-        database,
-        dimensions,
-        require_normalization,
-        points,
-    );
+    let inserted = load_into_arroy(&mut arroy_seed, &mut wtxn, database, dimensions, points);
     wtxn.commit().unwrap();
 
     let filtered_percentage = FILTER_SUBSET_PERCENT as f32;
@@ -144,18 +136,13 @@ fn load_into_arroy<D: arroy::Distance>(
     wtxn: &mut RwTxn,
     database: Database<D>,
     dimensions: usize,
-    require_normalization: bool,
    points: &[(ItemId, &[f32])],
 ) -> RoaringBitmap {
     let writer = Writer::<D>::new(database, 0, dimensions);
     let mut candidates = RoaringBitmap::new();
     for (i, vector) in points.iter() {
         assert_eq!(vector.len(), dimensions);
-        if require_normalization {
-            writer.add_item(wtxn, *i, &normalize_vector(vector)).unwrap();
-        } else {
-            writer.add_item(wtxn, *i, vector).unwrap();
-        }
+        writer.add_item(wtxn, *i, vector).unwrap();
         assert!(candidates.push(*i));
     }
     writer.build(wtxn, rng, None).unwrap();
18 changes: 4 additions & 14 deletions benchmarks/src/dataset.rs
@@ -10,7 +10,6 @@ pub struct MatLEView<T> {
     name: &'static str,
     mmap: Mmap,
     dimensions: usize,
-    reduced_dimensions: usize,
     _marker: PhantomData<T>,
 }
 
@@ -20,29 +19,20 @@ impl<T: AnyBitPattern> MatLEView<T> {
         let mmap = unsafe { Mmap::map(&file).unwrap() };
 
         assert!((mmap.len() / mem::size_of::<T>()) % dimensions == 0);
-        MatLEView { name, mmap, dimensions, reduced_dimensions: dimensions, _marker: PhantomData }
-    }
-
-    pub fn reduce_dimensions_to(&mut self, dimensions: usize) {
-        self.reduced_dimensions = dimensions;
+        MatLEView { name, mmap, dimensions, _marker: PhantomData }
     }
 
     pub fn header(&self) {
         println!(
             "{} - {} vectors of \x1b[1m{}\x1b[0m dimensions",
             self.name,
             self.len(),
-            self.reduced_dimensions
+            self.dimensions
         );
     }
 
-    #[allow(clippy::misnamed_getters)]
     pub fn dimensions(&self) -> usize {
-        self.reduced_dimensions
-    }
-
-    pub fn reduced_dimensions(&self) -> bool {
-        self.reduced_dimensions != self.dimensions
+        self.dimensions
     }
 
     pub fn is_empty(&self) -> bool {
@@ -57,7 +47,7 @@ impl<T: AnyBitPattern> MatLEView<T> {
         let tsize = mem::size_of::<T>();
         if (index * self.dimensions + self.dimensions) * tsize < self.mmap.len() {
             let start = index * self.dimensions;
-            let bytes = &self.mmap[start * tsize..(start + self.reduced_dimensions) * tsize];
+            let bytes = &self.mmap[start * tsize..(start + self.dimensions) * tsize];
             match bytemuck::try_cast_slice::<u8, T>(bytes) {
                 Ok(slice) => Some(Ok(slice)),
                 Err(e) => Some(Err(e)),
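
Note on the slicing in get above: with reduced_dimensions gone, row boundaries are computed from the real dimension count alone. A worked example of that offset arithmetic, as a small sketch under assumed values (not code from this commit):

// One row of an f32 matrix (tsize = 4 bytes) with 3072 dimensions, index 2.
let (tsize, dimensions, index) = (4usize, 3072usize, 2usize);
let start = index * dimensions;                      // 6144 elements into the mmap
let bytes = start * tsize..(start + dimensions) * tsize;
assert_eq!(bytes, 24576..36864);                     // exactly one 3072-float row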
19 changes: 6 additions & 13 deletions benchmarks/src/lib.rs
@@ -1,3 +1,5 @@
+#![allow(clippy::type_complexity)]
+
 mod arroy_bench;
 mod dataset;
 mod qdrant;
@@ -17,11 +19,7 @@ use crate::qdrant::measure_qdrant_distance;
 pub const RECALL_TESTED: [usize; 6] = [1, 10, 20, 50, 100, 500];
 pub const RNG_SEED: u64 = 38;
 
-pub fn bench_over_all_distances(
-    dimensions: usize,
-    require_normalization: bool,
-    vectors: &[(u32, &[f32])],
-) {
+pub fn bench_over_all_distances(dimensions: usize, vectors: &[(u32, &[f32])]) {
     println!("\x1b[1m{}\x1b[0m vectors are used for this measure", vectors.len());
     let mut recall_tested = String::new();
     RECALL_TESTED.iter().for_each(|recall| write!(&mut recall_tested, "{recall:4}, ").unwrap());
@@ -73,7 +71,7 @@ pub fn bench_over_all_distances(
         // bench_qdrant_distance::<DotProduct, false>(),
         // bench_arroy_distance::<DotProduct, 1>(),
     ] {
-        (func)(dimensions, require_normalization, vectors);
+        (func)(dimensions, vectors);
     }
 }
 
@@ -122,20 +120,15 @@ fn bench_arroy_distance<
     D: Distance,
     const OVERSAMPLING: usize,
     const FILTER_SUBSET_PERCENT: usize,
->() -> fn(usize, bool, &[(u32, &[f32])]) {
+>() -> fn(usize, &[(u32, &[f32])]) {
     measure_arroy_distance::<D, D::RealDistance, OVERSAMPLING, FILTER_SUBSET_PERCENT>
 }
 
 fn bench_qdrant_distance<D: Distance, const EXACT: bool, const FILTER_SUBSET_PERCENT: usize>(
-) -> fn(usize, bool, &[(u32, &[f32])]) {
+) -> fn(usize, &[(u32, &[f32])]) {
     measure_qdrant_distance::<D, EXACT, FILTER_SUBSET_PERCENT>
 }
 
-fn normalize_vector(input: &[f32]) -> Vec<f32> {
-    let norm: f32 = input.iter().map(|&x| x * x).sum::<f32>().sqrt();
-    input.iter().map(|&x| x / norm).collect()
-}
-
 fn partial_sort_by<'a, D: arroy::Distance>(
     mut vectors: impl Iterator<Item = (ItemId, &'a [f32])>,
     sort_by: &[f32],
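
The deleted normalize_vector helper was a plain L2 normalization: every component is divided by the vector's Euclidean norm, so the result has unit length. A standalone sketch of what a caller must now do itself if it needs normalized input (this function no longer exists in the crate):

// L2 normalization, the same computation as the deleted helper.
// Worked example: norm = sqrt(3^2 + 4^2) = 5, so [3.0, 4.0] becomes [0.6, 0.8].
fn normalize(input: &[f32]) -> Vec<f32> {
    let norm: f32 = input.iter().map(|&x| x * x).sum::<f32>().sqrt();
    input.iter().map(|&x| x / norm).collect()
}

fn main() {
    assert_eq!(normalize(&[3.0, 4.0]), vec![0.6, 0.8]);
}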
25 changes: 5 additions & 20 deletions benchmarks/src/main.rs
@@ -16,16 +16,6 @@ fn db_pedia_3_large() -> MatLEView<f32> {
     )
 }
 
-fn db_pedia_3_large_reduced_to_1536() -> MatLEView<f32> {
-    let mut mat = MatLEView::new(
-        "db pedia OpenAI text-embedding 3 large 1536",
-        "assets/db-pedia-OpenAI-text-embedding-3-large.mat",
-        3072,
-    );
-    mat.reduce_dimensions_to(1536);
-    mat
-}
-
 fn db_pedia_ada_002_large() -> MatLEView<f32> {
     MatLEView::new(
         "db pedia OpenAI text-embedding ada 002",
@@ -45,22 +35,17 @@ fn wikipedia_768() -> MatLEView<f32> {
 fn main() {
     let take = 100_000;
     for dataset in [
-        // &hn_posts(),
-        // &hn_top_post(),
-        // &db_pedia_3_large_reduced_to_1536(),
-        // &db_pedia_3_large(),
-        // &db_pedia_ada_002_large(),
+        &hn_posts(),
+        &hn_top_post(),
+        &db_pedia_3_large(),
+        &db_pedia_ada_002_large(),
         &wikipedia_768(),
     ] {
         let vectors: Vec<(u32, &[f32])> =
             dataset.iter().enumerate().map(|(i, v)| (i as u32, v)).take(take).collect();
 
         dataset.header();
-        bench_over_all_distances(
-            dataset.dimensions(),
-            dataset.reduced_dimensions(),
-            vectors.as_slice(),
-        );
+        bench_over_all_distances(dataset.dimensions(), vectors.as_slice());
         println!();
     }
 }
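
For context: the deleted db_pedia_3_large_reduced_to_1536 dataset reused the same 3072-dimension .mat file but exposed only the first 1536 values of each row, via the reduced_dimensions slicing removed from dataset.rs above. The benchmark now always sees full rows. A sketch of the truncation it amounted to (illustrative values, not repository code):

// Keeping the first 1536 components of a 3072-dimension embedding,
// which is what reduce_dimensions_to(1536) effectively did per row.
let full: Vec<f32> = vec![0.0; 3072]; // stand-in for one stored vector
let reduced: &[f32] = &full[..1536];
assert_eq!(reduced.len(), 1536);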
8 changes: 2 additions & 6 deletions benchmarks/src/qdrant.rs
@@ -13,15 +13,14 @@ use rand::prelude::SliceRandom;
 use rand::rngs::StdRng;
 use rand::SeedableRng;
 
-use crate::{normalize_vector, partial_sort_by, Distance, Recall, RECALL_TESTED, RNG_SEED};
+use crate::{partial_sort_by, Distance, Recall, RECALL_TESTED, RNG_SEED};
 
 pub fn measure_qdrant_distance<
     D: Distance,
     const EXACT: bool,
     const FILTER_SUBSET_PERCENT: usize,
 >(
     dimensions: usize,
-    require_normalization: bool,
     points: &[(u32, &[f32])],
 ) {
     let filtered_percentage = FILTER_SUBSET_PERCENT as f32;
@@ -35,12 +34,9 @@
     let points: Vec<_> = points
         .iter()
         .map(|(id, vector)| {
-            let vector =
-                if require_normalization { normalize_vector(vector) } else { vector.to_vec() };
-
             PointStruct::new(
                 *id as u64,
-                vector,
+                vector.to_vec(),
                 Payload::try_from(serde_json::json!({ "id": *id })).unwrap(),
             )
         })
