Remove the useless reduced dimensions dataset
Kerollmops committed Sep 21, 2024
1 parent 6c0feab commit 05f4deb
Showing 5 changed files with 20 additions and 69 deletions.
19 changes: 3 additions & 16 deletions benchmarks/src/arroy_bench.rs
@@ -11,7 +11,7 @@ use rand::SeedableRng;
 use rayon::iter::{IntoParallelIterator, ParallelIterator};
 use roaring::RoaringBitmap;
 
-use crate::{normalize_vector, partial_sort_by, Recall, RECALL_TESTED, RNG_SEED};
+use crate::{partial_sort_by, Recall, RECALL_TESTED, RNG_SEED};
 const TWENTY_HUNDRED_MIB: usize = 2000 * 1024 * 1024 * 1024;
 
 pub fn measure_arroy_distance<
@@ -21,7 +21,6 @@ pub fn measure_arroy_distance<
     const FILTER_SUBSET_PERCENT: usize,
 >(
     dimensions: usize,
-    require_normalization: bool,
     points: &[(u32, &[f32])],
 ) {
     let dir = tempfile::tempdir().unwrap();
@@ -36,14 +35,7 @@
     let database = env
         .create_database::<internals::KeyCodec, NodeCodec<ArroyDistance>>(&mut wtxn, None)
         .unwrap();
-    let inserted = load_into_arroy(
-        &mut arroy_seed,
-        &mut wtxn,
-        database,
-        dimensions,
-        require_normalization,
-        points,
-    );
+    let inserted = load_into_arroy(&mut arroy_seed, &mut wtxn, database, dimensions, points);
     wtxn.commit().unwrap();
 
     let filtered_percentage = FILTER_SUBSET_PERCENT as f32;
@@ -144,18 +136,13 @@ fn load_into_arroy<D: arroy::Distance>(
     wtxn: &mut RwTxn,
     database: Database<D>,
     dimensions: usize,
-    require_normalization: bool,
    points: &[(ItemId, &[f32])],
 ) -> RoaringBitmap {
     let writer = Writer::<D>::new(database, 0, dimensions);
     let mut candidates = RoaringBitmap::new();
     for (i, vector) in points.iter() {
         assert_eq!(vector.len(), dimensions);
-        if require_normalization {
-            writer.add_item(wtxn, *i, &normalize_vector(vector)).unwrap();
-        } else {
-            writer.add_item(wtxn, *i, vector).unwrap();
-        }
+        writer.add_item(wtxn, *i, vector).unwrap();
         assert!(candidates.push(*i));
     }
     writer.build(wtxn, rng, None).unwrap();
18 changes: 4 additions & 14 deletions benchmarks/src/dataset.rs
@@ -10,7 +10,6 @@ pub struct MatLEView<T> {
     name: &'static str,
     mmap: Mmap,
     dimensions: usize,
-    reduced_dimensions: usize,
     _marker: PhantomData<T>,
 }
 
@@ -20,29 +19,20 @@ impl<T: AnyBitPattern> MatLEView<T> {
         let mmap = unsafe { Mmap::map(&file).unwrap() };
 
         assert!((mmap.len() / mem::size_of::<T>()) % dimensions == 0);
-        MatLEView { name, mmap, dimensions, reduced_dimensions: dimensions, _marker: PhantomData }
-    }
-
-    pub fn reduce_dimensions_to(&mut self, dimensions: usize) {
-        self.reduced_dimensions = dimensions;
+        MatLEView { name, mmap, dimensions, _marker: PhantomData }
     }
 
     pub fn header(&self) {
         println!(
             "{} - {} vectors of \x1b[1m{}\x1b[0m dimensions",
             self.name,
             self.len(),
-            self.reduced_dimensions
+            self.dimensions
         );
     }
 
-    #[allow(clippy::misnamed_getters)]
     pub fn dimensions(&self) -> usize {
-        self.reduced_dimensions
-    }
-
-    pub fn reduced_dimensions(&self) -> bool {
-        self.reduced_dimensions != self.dimensions
+        self.dimensions
     }
 
     pub fn is_empty(&self) -> bool {
@@ -57,7 +47,7 @@ impl<T: AnyBitPattern> MatLEView<T> {
         let tsize = mem::size_of::<T>();
         if (index * self.dimensions + self.dimensions) * tsize < self.mmap.len() {
             let start = index * self.dimensions;
-            let bytes = &self.mmap[start * tsize..(start + self.reduced_dimensions) * tsize];
+            let bytes = &self.mmap[start * tsize..(start + self.dimensions) * tsize];
             match bytemuck::try_cast_slice::<u8, T>(bytes) {
                 Ok(slice) => Some(Ok(slice)),
                 Err(e) => Some(Err(e)),
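
Note on the slicing in get above: with reduced_dimensions gone, row boundaries are computed from the real dimension count alone. A worked example of that offset arithmetic, as a small sketch under assumed values (not code from this commit):

// One row of an f32 matrix (tsize = 4 bytes) with 3072 dimensions, index 2.
let (tsize, dimensions, index) = (4usize, 3072usize, 2usize);
let start = index * dimensions;                      // 6144 elements into the mmap
let bytes = start * tsize..(start + dimensions) * tsize;
assert_eq!(bytes, 24576..36864);                     // exactly one 3072-float row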
19 changes: 6 additions & 13 deletions benchmarks/src/lib.rs
@@ -1,3 +1,5 @@
+#![allow(clippy::type_complexity)]
+
 mod arroy_bench;
 mod dataset;
 mod qdrant;
@@ -17,11 +19,7 @@ use crate::qdrant::measure_qdrant_distance;
 pub const RECALL_TESTED: [usize; 6] = [1, 10, 20, 50, 100, 500];
 pub const RNG_SEED: u64 = 38;
 
-pub fn bench_over_all_distances(
-    dimensions: usize,
-    require_normalization: bool,
-    vectors: &[(u32, &[f32])],
-) {
+pub fn bench_over_all_distances(dimensions: usize, vectors: &[(u32, &[f32])]) {
     println!("\x1b[1m{}\x1b[0m vectors are used for this measure", vectors.len());
     let mut recall_tested = String::new();
     RECALL_TESTED.iter().for_each(|recall| write!(&mut recall_tested, "{recall:4}, ").unwrap());
@@ -73,7 +71,7 @@ pub fn bench_over_all_distances(
         // bench_qdrant_distance::<DotProduct, false>(),
         // bench_arroy_distance::<DotProduct, 1>(),
     ] {
-        (func)(dimensions, require_normalization, vectors);
+        (func)(dimensions, vectors);
     }
 }
 
@@ -122,20 +120,15 @@ fn bench_arroy_distance<
     D: Distance,
     const OVERSAMPLING: usize,
     const FILTER_SUBSET_PERCENT: usize,
->() -> fn(usize, bool, &[(u32, &[f32])]) {
+>() -> fn(usize, &[(u32, &[f32])]) {
     measure_arroy_distance::<D, D::RealDistance, OVERSAMPLING, FILTER_SUBSET_PERCENT>
 }
 
 fn bench_qdrant_distance<D: Distance, const EXACT: bool, const FILTER_SUBSET_PERCENT: usize>(
-) -> fn(usize, bool, &[(u32, &[f32])]) {
+) -> fn(usize, &[(u32, &[f32])]) {
     measure_qdrant_distance::<D, EXACT, FILTER_SUBSET_PERCENT>
 }
 
-fn normalize_vector(input: &[f32]) -> Vec<f32> {
-    let norm: f32 = input.iter().map(|&x| x * x).sum::<f32>().sqrt();
-    input.iter().map(|&x| x / norm).collect()
-}
-
 fn partial_sort_by<'a, D: arroy::Distance>(
     mut vectors: impl Iterator<Item = (ItemId, &'a [f32])>,
     sort_by: &[f32],
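
The deleted normalize_vector helper was a plain L2 normalization: every component is divided by the vector's Euclidean norm, so the result has unit length. A standalone sketch of what a caller must now do itself if it needs normalized input (this function no longer exists in the crate):

// L2 normalization, the same computation as the deleted helper.
// Worked example: norm = sqrt(3^2 + 4^2) = 5, so [3.0, 4.0] becomes [0.6, 0.8].
fn normalize(input: &[f32]) -> Vec<f32> {
    let norm: f32 = input.iter().map(|&x| x * x).sum::<f32>().sqrt();
    input.iter().map(|&x| x / norm).collect()
}

fn main() {
    assert_eq!(normalize(&[3.0, 4.0]), vec![0.6, 0.8]);
}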
25 changes: 5 additions & 20 deletions benchmarks/src/main.rs
@@ -16,16 +16,6 @@ fn db_pedia_3_large() -> MatLEView<f32> {
     )
 }
 
-fn db_pedia_3_large_reduced_to_1536() -> MatLEView<f32> {
-    let mut mat = MatLEView::new(
-        "db pedia OpenAI text-embedding 3 large 1536",
-        "assets/db-pedia-OpenAI-text-embedding-3-large.mat",
-        3072,
-    );
-    mat.reduce_dimensions_to(1536);
-    mat
-}
-
 fn db_pedia_ada_002_large() -> MatLEView<f32> {
     MatLEView::new(
         "db pedia OpenAI text-embedding ada 002",
@@ -45,22 +35,17 @@ fn wikipedia_768() -> MatLEView<f32> {
 fn main() {
     let take = 100_000;
     for dataset in [
-        // &hn_posts(),
-        // &hn_top_post(),
-        // &db_pedia_3_large_reduced_to_1536(),
-        // &db_pedia_3_large(),
-        // &db_pedia_ada_002_large(),
+        &hn_posts(),
+        &hn_top_post(),
+        &db_pedia_3_large(),
+        &db_pedia_ada_002_large(),
         &wikipedia_768(),
     ] {
         let vectors: Vec<(u32, &[f32])> =
             dataset.iter().enumerate().map(|(i, v)| (i as u32, v)).take(take).collect();
 
         dataset.header();
-        bench_over_all_distances(
-            dataset.dimensions(),
-            dataset.reduced_dimensions(),
-            vectors.as_slice(),
-        );
+        bench_over_all_distances(dataset.dimensions(), vectors.as_slice());
         println!();
     }
 }
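
For context: the deleted db_pedia_3_large_reduced_to_1536 dataset reused the same 3072-dimension .mat file but exposed only the first 1536 values of each row, via the reduced_dimensions slicing removed from dataset.rs above. The benchmark now always sees full rows. A sketch of the truncation it amounted to (illustrative values, not repository code):

// Keeping the first 1536 components of a 3072-dimension embedding,
// which is what reduce_dimensions_to(1536) effectively did per row.
let full: Vec<f32> = vec![0.0; 3072]; // stand-in for one stored vector
let reduced: &[f32] = &full[..1536];
assert_eq!(reduced.len(), 1536);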
8 changes: 2 additions & 6 deletions benchmarks/src/qdrant.rs
@@ -13,15 +13,14 @@ use rand::prelude::SliceRandom;
 use rand::rngs::StdRng;
 use rand::SeedableRng;
 
-use crate::{normalize_vector, partial_sort_by, Distance, Recall, RECALL_TESTED, RNG_SEED};
+use crate::{partial_sort_by, Distance, Recall, RECALL_TESTED, RNG_SEED};
 
 pub fn measure_qdrant_distance<
     D: Distance,
     const EXACT: bool,
     const FILTER_SUBSET_PERCENT: usize,
 >(
     dimensions: usize,
-    require_normalization: bool,
     points: &[(u32, &[f32])],
 ) {
     let filtered_percentage = FILTER_SUBSET_PERCENT as f32;
@@ -35,12 +34,9 @@
     let points: Vec<_> = points
         .iter()
         .map(|(id, vector)| {
-            let vector =
-                if require_normalization { normalize_vector(vector) } else { vector.to_vec() };
-
             PointStruct::new(
                 *id as u64,
-                vector,
+                vector.to_vec(),
                 Payload::try_from(serde_json::json!({ "id": *id })).unwrap(),
             )
         })
