Skip to content

Commit

Permalink
Expose a better arg interface to run the benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
Kerollmops committed Sep 21, 2024
1 parent 05f4deb commit 5443682
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 41 deletions.
34 changes: 28 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions benchmarks/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ anyhow = "1.0.86"
arroy = { git = "https://github.com/meilisearch/arroy", rev = "2386594" }
byte-unit = "5.1.4"
bytemuck = "1.16.1"
clap = { version = "4.5.18", features = ["derive"] }
enum-iterator = "2.1.0"
futures-util = "0.3.30"
heed = "0.20.3"
memmap2 = "0.9.4"
Expand Down
92 changes: 57 additions & 35 deletions benchmarks/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,49 +1,71 @@
use benchmarks::{bench_over_all_distances, MatLEView};
use clap::{Parser, ValueEnum};
use enum_iterator::Sequence;

fn hn_top_post() -> MatLEView<f32> {
MatLEView::new("Hackernews top posts", "assets/hn-top-posts.mat", 1024)
// Benchmark datasets that can be selected by name on the command line.
//
// `ValueEnum` lets clap parse the variants as values of the positional
// `datasets` argument, and `Sequence` lets `enum_iterator::all` enumerate every
// variant when no dataset is given. The number in parentheses in each variant
// description equals the third argument passed to `MatLEView::new` for that
// dataset in the `From<Dataset>` conversion (presumably the vector dimension
// count -- confirm against `MatLEView::new`).
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum, Sequence)]
enum Dataset {
/// Hackernews posts (512)
HnPosts,
/// Wikipedia (768)
Wikipedia,
/// Hackernews top posts (1024)
HnTopPost,
/// db pedia OpenAI text-embedding ada 002 (1536)
DbPediaAda002,
/// db pedia OpenAI text-embedding 3 large (3072)
DbPedia3Large,
}

fn hn_posts() -> MatLEView<f32> {
MatLEView::new("Hackernews posts", "assets/hn-posts.mat", 512)
// Turns a `Dataset` selection into the `MatLEView` used by the benchmark loop.
impl From<Dataset> for MatLEView<f32> {
    /// Picks the display name, asset path and numeric argument associated with
    /// `dataset`, then builds the view with a single `MatLEView::new` call.
    fn from(dataset: Dataset) -> Self {
        // One entry per variant. The match has no catch-all arm on purpose, so
        // adding a `Dataset` variant fails to compile until it is listed here.
        let (name, path, dimensions) = match dataset {
            Dataset::HnPosts => ("Hackernews posts", "assets/hn-posts.mat", 512),
            Dataset::Wikipedia => (
                "wikipedia 22 12 simple embeddings",
                "assets/wikipedia-22-12-simple-embeddings.mat",
                768,
            ),
            Dataset::HnTopPost => ("Hackernews top posts", "assets/hn-top-posts.mat", 1024),
            Dataset::DbPediaAda002 => (
                "db pedia OpenAI text-embedding ada 002",
                "assets/db-pedia-OpenAI-text-embedding-ada-002.mat",
                1536,
            ),
            Dataset::DbPedia3Large => (
                "db pedia OpenAI text-embedding 3 large",
                "assets/db-pedia-OpenAI-text-embedding-3-large.mat",
                3072,
            ),
        };

        MatLEView::new(name, path, dimensions)
    }
}

/// Builds the `MatLEView` for the "db pedia OpenAI text-embedding 3 large"
/// dataset from its `.mat` asset path, passing 3072 as the numeric argument.
fn db_pedia_3_large() -> MatLEView<f32> {
    let name = "db pedia OpenAI text-embedding 3 large";
    let path = "assets/db-pedia-OpenAI-text-embedding-3-large.mat";
    MatLEView::new(name, path, 3072)
}
#[derive(Parser, Debug)]
#[command(version, about, long_about = None)]
struct Args {
/// The datasets to run; all of them are run if none is given.
#[arg(value_enum)]
datasets: Vec<Dataset>,

/// Builds the `MatLEView` for the "db pedia OpenAI text-embedding ada 002"
/// dataset from its `.mat` asset path, passing 1536 as the numeric argument.
fn db_pedia_ada_002_large() -> MatLEView<f32> {
    let name = "db pedia OpenAI text-embedding ada 002";
    let path = "assets/db-pedia-OpenAI-text-embedding-ada-002.mat";
    MatLEView::new(name, path, 1536)
}

fn wikipedia_768() -> MatLEView<f32> {
MatLEView::new(
"wikipedia 22 12 simple embeddings",
"assets/wikipedia-22-12-simple-embeddings.mat",
768,
)
/// Number of vectors to evaluate from the datasets.
#[arg(long, default_value_t = 100_000)]
count: usize,
}

fn main() {
let take = 100_000;
for dataset in [
&hn_posts(),
&hn_top_post(),
&db_pedia_3_large(),
&db_pedia_ada_002_large(),
&wikipedia_768(),
] {
let vectors: Vec<(u32, &[f32])> =
dataset.iter().enumerate().map(|(i, v)| (i as u32, v)).take(take).collect();
let Args { datasets, count } = Args::parse();

let datasets: Vec<MatLEView<_>> = if datasets.is_empty() {
enum_iterator::all::<Dataset>().map(Into::into).collect()
} else {
datasets.into_iter().map(Into::into).collect()
};

for dataset in datasets {
let vectors: Vec<_> =
dataset.iter().enumerate().map(|(i, v)| (i as u32, v)).take(count).collect();
dataset.header();
bench_over_all_distances(dataset.dimensions(), vectors.as_slice());
println!();
Expand Down

0 comments on commit 5443682

Please sign in to comment.