Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: allow benchmarking against remote files #2297

Merged
merged 37 commits into from
Feb 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
184f038
fix: allocate aligned buffers when reading through ObjectStore
a10y Nov 22, 2024
292069d
save
a10y Nov 22, 2024
1848072
save
a10y Nov 25, 2024
fe04e4c
run queries
a10y Nov 25, 2024
cbb42cb
skip q15 for real
a10y Nov 25, 2024
4585171
fix
danking Feb 8, 2025
40483f9
remove unused dotenv dependency
danking Feb 10, 2025
455f737
clippy
danking Feb 10, 2025
469bd14
add --s3-data-dir and --formats
danking Feb 10, 2025
07c134a
add --scale-factor
danking Feb 10, 2025
0a32e4c
fixes
danking Feb 10, 2025
2252a5a
fixes
danking Feb 10, 2025
e6eb19e
add --do-not-use-object-store
danking Feb 10, 2025
cad4eb6
revert unnecessary changes
danking Feb 10, 2025
4e8e497
revert unnecessary changes
danking Feb 10, 2025
d211816
clippy
danking Feb 10, 2025
c0d351f
revert unnecessary changes
danking Feb 10, 2025
2415b96
use Iterator::last
danking Feb 10, 2025
22da380
no debugging on named lock
danking Feb 10, 2025
e1cbc37
set up tracing
danking Feb 10, 2025
28943b9
try to avoid object store?
danking Feb 10, 2025
20b8d44
remove --do-not-use-object-store
danking Feb 10, 2025
cb49b17
remove do --do-not-use-object-store
danking Feb 10, 2025
cc99cae
trace the object location as well
danking Feb 10, 2025
e08e8ab
cargo fmt
danking Feb 10, 2025
c8e407b
clippy
danking Feb 10, 2025
2c45176
Merge remote-tracking branch 'origin/develop' into dk/tpch-objectstore2
danking Feb 13, 2025
ce7470e
remove commented writer code
danking Feb 13, 2025
6ba9adb
Merge remote-tracking branch 'origin/develop' into dk/tpch-objectstore2
danking Feb 13, 2025
6f57071
slim diff
danking Feb 13, 2025
3bfa8f8
clippy
danking Feb 13, 2025
de329bf
admonitions on stderr
danking Feb 13, 2025
c965136
information on stderr
danking Feb 13, 2025
538f594
fix
danking Feb 13, 2025
1da36f5
maybe more understandable path manipulation
danking Feb 13, 2025
816b16b
cleanup
danking Feb 13, 2025
a907ffc
cleanup
danking Feb 13, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion bench-vortex/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ indicatif = { workspace = true }
itertools = { workspace = true }
log = { workspace = true, features = ["max_level_debug"] }
mimalloc = { workspace = true }
object_store = { workspace = true, features = ["aws"] }
object_store = { workspace = true, features = ["aws", "gcp"] }
parquet = { workspace = true, features = ["async"] }
rand = { workspace = true }
rand_distr = { workspace = true }
Expand All @@ -66,6 +66,7 @@ tracing-subscriber = { workspace = true, features = [
"env-filter",
"tracing-log",
] }
url = { workspace = true }
uuid = { workspace = true, features = ["v4"] }
vortex = { workspace = true, features = ["object_store", "parquet"] }
vortex-datafusion = { workspace = true }
Expand Down
79 changes: 70 additions & 9 deletions bench-vortex/src/bin/tpch_benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ use futures::future::try_join_all;
use indicatif::ProgressBar;
use itertools::Itertools;
use tokio::runtime::Builder;
use url::Url;
use vortex::aliases::hash_map::HashMap;
use vortex::error::VortexExpect as _;

feature_flagged_allocator!();

Expand All @@ -26,13 +28,19 @@ struct Args {
exclude_queries: Option<Vec<usize>>,
#[arg(short, long)]
threads: Option<usize>,
#[arg(long)]
use_remote_data_dir: Option<String>,
#[arg(short, long, default_value_t = true, default_missing_value = "true", action = ArgAction::Set)]
warmup: bool,
#[arg(short, long, default_value = "5")]
iterations: usize,
#[arg(long, value_delimiter = ',')]
formats: Option<Vec<String>>,
#[arg(long, default_value_t = 1)]
scale_factor: u8,
#[arg(long)]
only_vortex: bool,
#[arg(short, long)]
#[arg(short)]
verbose: bool,
#[arg(short, long, default_value_t, value_enum)]
display_format: DisplayFormat,
Expand All @@ -57,45 +65,98 @@ fn main() -> ExitCode {
}
.expect("Failed building the Runtime");

let url = match args.use_remote_data_dir {
None => {
let db_gen_options = DBGenOptions::default().with_scale_factor(args.scale_factor);
let data_dir = DBGen::new(db_gen_options).generate().unwrap();
eprintln!(
"Using existing or generating new files located at {}.",
data_dir.display()
);
Url::parse(
("file:".to_owned() + data_dir.to_str().vortex_expect("path should be utf8") + "/")
.as_ref(),
)
.unwrap()
}
Some(tpch_benchmark_remote_data_dir) => {
// e.g. "s3://vortex-bench-dev/parquet/"
//
// The trailing slash is significant!
//
// The folder must already be populated with data!
if !tpch_benchmark_remote_data_dir.ends_with("/") {
eprintln!("Supply a --use-remote-data-dir argument which ends in a slash e.g. s3://vortex-bench-dev/parquet/");
}
eprintln!(
concat!(
"Assuming data already exists at this remote (e.g. S3, GCS) URL: {}.\n",
"If it does not, you should kill this command, locally generate the files (by running without\n",
"--use-remote-data-dir) and upload data/tpch/1/ to some remote location.",
),
tpch_benchmark_remote_data_dir,
);
Url::parse(&tpch_benchmark_remote_data_dir).unwrap()
}
};

if args.only_vortex {
panic!("use `--formats vortex,arrow` instead of `--only-vortex`");
}

runtime.block_on(bench_main(
args.queries,
args.exclude_queries,
args.iterations,
args.warmup,
args.only_vortex,
args.formats,
args.display_format,
args.emulate_object_store,
url,
))
}

#[allow(clippy::too_many_arguments)]
async fn bench_main(
queries: Option<Vec<usize>>,
exclude_queries: Option<Vec<usize>>,
iterations: usize,
warmup: bool,
only_vortex: bool,
formats: Option<Vec<String>>,
display_format: DisplayFormat,
emulate_object_store: bool,
url: Url,
) -> ExitCode {
// uncomment the below to enable trace logging of datafusion execution
// let filter = default_env_filter(true);
// setup_logger(filter);

// Run TPC-H data gen.
let data_dir = DBGen::new(DBGenOptions::default()).generate().unwrap();

// The formats to run against (vs the baseline)
let formats = if only_vortex {
vec![Format::Arrow, Format::OnDiskVortex]
} else {
vec![Format::Arrow, Format::Parquet, Format::OnDiskVortex]
let formats = match formats {
None => vec![Format::Arrow, Format::Parquet, Format::OnDiskVortex],
Some(formats) => formats
.into_iter()
.map(|format| match format.as_ref() {
"arrow" => Format::Arrow,
"parquet" => Format::Parquet,
"vortex" => Format::OnDiskVortex,
_ => panic!("unrecognized format: {}", format),
})
.collect::<Vec<_>>(),
};

eprintln!(
"Benchmarking against these formats: {}.",
formats.iter().join(", ")
);

// Load datasets
let ctxs = try_join_all(
formats
.iter()
.map(|format| load_datasets(&data_dir, *format, emulate_object_store)),
.map(|format| load_datasets(&url, *format, emulate_object_store)),
)
.await
.unwrap();
Expand Down
8 changes: 8 additions & 0 deletions bench-vortex/src/tpch/dbgen.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,14 @@ impl DBGenOptions {
cache_dir: self.cache_dir,
}
}

pub fn with_scale_factor(self, scale_factor: u8) -> Self {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should make an issue to followup by

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Self {
base_dir: self.base_dir,
scale_factor,
cache_dir: self.cache_dir,
}
}
}

impl DBGen {
Expand Down
Loading
Loading