Skip to content
This repository has been archived by the owner on Jan 7, 2025. It is now read-only.

Commit

Permalink
feat: load indexes for TPC-H, 13x speedup on SF 0.1 (#131)
Browse files Browse the repository at this point in the history
**Summary**: Now loading constraints + indexes (though constraints are
temporarily broken) when loading data for TPC-H.

**Demo**:
Without indexes, TPC-H SF0.1 takes ~13s. With indexes, it takes
~1s.
![Screenshot 2024-03-22 at 17 30 15](https://github.com/cmu-db/optd/assets/20631215/39f8fd3e-84cc-47fc-8882-5002ef33d63d)
![Screenshot 2024-03-22 at 17 31 19](https://github.com/cmu-db/optd/assets/20631215/8759f6b7-a611-44de-83b3-d8eb12aa5952)

**Details**:
* The constraints file is from dbgym (a private repo, so I can't link it).
* Note that constraints are currently broken, so I commented them out. Not
adding constraints doesn't cause noticeable issues, so I'm punting this
to later.
* Made `get_optd_root()` check that it's actually in the optd repo (and
not the tpch-kit repo).
* Added an option to run commands in a given directory instead of manually
changing the working directory beforehand (which could leave us `cd`'d into
tpch-kit and break `get_optd_root()`).
  • Loading branch information
wangpatrick57 authored Mar 23, 2024
1 parent b2a4a77 commit 0efc874
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 25 deletions.
12 changes: 12 additions & 0 deletions optd-perftest/src/postgres_dbms.rs
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,13 @@ impl PostgresDBMS {
Self::copy_from_stdin(client, tbl_fpath).await?;
}

// load the constraints and indexes
// TODO: constraints are currently broken
// let sql = fs::read_to_string(tpch_kit.constraints_fpath.to_str().unwrap())?;
// client.batch_execute(&sql).await?;
let sql = fs::read_to_string(tpch_kit.indexes_fpath.to_str().unwrap())?;
client.batch_execute(&sql).await?;

// create stats
// you need to do VACUUM FULL ANALYZE and not just ANALYZE to make sure the stats are created in a deterministic way
// this is standard practice for postgres benchmarking
Expand Down Expand Up @@ -240,6 +247,8 @@ impl PostgresDBMS {
dbname: &str, // used by truecard_cache
truecard_cache: &mut TruecardCache,
) -> anyhow::Result<Vec<usize>> {
let start = Instant::now();

let tpch_kit = TpchKit::build(&self.workspace_dpath)?;
tpch_kit.gen_queries(tpch_config)?;

Expand All @@ -257,6 +266,9 @@ impl PostgresDBMS {
truecards.push(truecard);
}

let duration = start.elapsed();
println!("postgres eval_tpch_truecards duration: {:?}", duration);

Ok(truecards)
}

Expand Down
34 changes: 28 additions & 6 deletions optd-perftest/src/shell.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,26 @@ use std::{fs, io};

/// Runs a command without overriding its working directory, exiting the program immediately
/// if the command fails
pub fn run_command_with_status_check(cmd_str: &str) -> io::Result<Output> {
    // a bare `None` leaves the generic path type of the `_in_dir` variant unresolved, so we
    // pin it to an arbitrary type that implements AsRef<Path>; &Path is as good as any
    let no_dir: Option<&Path> = None;
    run_command_with_status_check_in_dir(cmd_str, no_dir)
}

/// Runs a command in a directory, exiting the program immediately if the command fails
pub fn run_command_with_status_check_in_dir<P: AsRef<Path>>(
cmd_str: &str,
in_path: Option<P>,
) -> io::Result<Output> {
// use shlex::split() instead of split_whitespace() to handle cases like quotes and escape chars
let mut cmd_components: Vec<String> = shlex::split(cmd_str).unwrap();
let cmd = cmd_components.remove(0);
let cmd_name = cmd_components.remove(0);
let args = cmd_components;
let output = Command::new(cmd).args(args).output()?;
let mut cmd = Command::new(cmd_name);
cmd.args(args);
if let Some(in_path) = in_path {
cmd.current_dir(in_path);
}
let output = cmd.output()?;

if output.status.success() {
Ok(output)
} else {
Expand Down Expand Up @@ -41,10 +56,17 @@ where

/// Get the path of the root "optd" repo directory
pub fn get_optd_root() -> io::Result<PathBuf> {
let output = run_command_with_status_check("git rev-parse --show-toplevel")?;
let path = str::from_utf8(&output.stdout).unwrap().trim();
let path = PathBuf::from(path);
Ok(path)
let url_output = run_command_with_status_check("git config --get remote.origin.url")?;
let url_string = str::from_utf8(&url_output.stdout).unwrap().trim();
assert!(
url_string.contains("cmu-db/optd"),
"You are in the repo with url_string={}. This was not recognized as the optd repo.",
url_string
);
let toplevel_output = run_command_with_status_check("git rev-parse --show-toplevel")?;
let toplevel_str = str::from_utf8(&toplevel_output.stdout).unwrap().trim();
let toplevel_dpath = PathBuf::from(toplevel_str);
Ok(toplevel_dpath)
}

/// Can be an absolute path or a relative path. Regardless of where this CLI is run, relative paths are evaluated relative to the optd repo root.
Expand Down
43 changes: 24 additions & 19 deletions optd-perftest/src/tpch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ pub struct TpchKit {
genned_tables_dpath: PathBuf,
genned_queries_dpath: PathBuf,
pub schema_fpath: PathBuf,
pub constraints_fpath: PathBuf,
pub indexes_fpath: PathBuf,
}

/// I keep the same conventions for these methods as I do for PostgresDBMS
Expand All @@ -74,6 +76,8 @@ impl TpchKit {
fs::create_dir(&genned_queries_dpath)?;
}
let schema_fpath = dbgen_dpath.join("dss.ddl");
let constraints_fpath = dbgen_dpath.join("constraints.sql");
let indexes_fpath = dbgen_dpath.join("indexes.sql");

// create Self
let kit = TpchKit {
Expand All @@ -85,6 +89,8 @@ impl TpchKit {
genned_tables_dpath,
genned_queries_dpath,
schema_fpath,
constraints_fpath,
indexes_fpath,
};

// set envvars (DSS_PATH can change so we don't set it now)
Expand All @@ -110,24 +116,22 @@ impl TpchKit {
} else {
log::debug!("[skip] cloning tpch-kit repo");
}
env::set_current_dir(&self.tpch_kit_repo_dpath)?;
log::debug!("[start] pulling latest tpch-kit repo");
shell::run_command_with_status_check("git pull")?;
shell::run_command_with_status_check_in_dir("git pull", Some(&self.tpch_kit_repo_dpath))?;
log::debug!("[end] pulling latest tpch-kit repo");
// make sure to do this so that get_optd_root() doesn't break
Ok(())
}

pub fn make(&self, dbms: &str) -> io::Result<()> {
env::set_current_dir(&self.dbgen_dpath)?;
log::debug!("[start] building dbgen");
// we need to call "make clean" because we might have called make earlier with
// a different dbms
shell::run_command_with_status_check("make clean")?;
shell::run_command_with_status_check(&format!(
"make MACHINE={} DATABASE={}",
TpchKit::get_machine(),
dbms
))?;
shell::run_command_with_status_check_in_dir("make clean", Some(&self.dbgen_dpath))?;
shell::run_command_with_status_check_in_dir(
&format!("make MACHINE={} DATABASE={}", TpchKit::get_machine(), dbms),
Some(&self.dbgen_dpath),
)?;
log::debug!("[end] building dbgen");
Ok(())
}
Expand All @@ -148,13 +152,12 @@ impl TpchKit {
if !done_fpath.exists() {
self.make(&tpch_config.dbms)?;
shell::make_into_empty_dir(&this_genned_tables_dpath)?;
env::set_current_dir(&self.dbgen_dpath)?;
env::set_var("DSS_PATH", this_genned_tables_dpath.to_str().unwrap());
log::debug!("[start] generating tables for {}", tpch_config);
shell::run_command_with_status_check(&format!(
"./dbgen -s{}",
tpch_config.scale_factor
))?;
shell::run_command_with_status_check_in_dir(
&format!("./dbgen -s{}", tpch_config.scale_factor),
Some(&self.dbgen_dpath),
)?;
File::create(done_fpath)?;
log::debug!("[end] generating tables for {}", tpch_config);
} else {
Expand All @@ -170,14 +173,16 @@ impl TpchKit {
if !done_fpath.exists() {
self.make(&tpch_config.dbms)?;
shell::make_into_empty_dir(&this_genned_queries_dpath)?;
env::set_current_dir(&self.dbgen_dpath)?;
log::debug!("[start] generating queries for {}", tpch_config);
// we don't use -d in qgen because -r controls the substitution values we use
for query_i in 1..=NUM_TPCH_QUERIES {
let output = shell::run_command_with_status_check(&format!(
"./qgen -s{} -r{} {}",
tpch_config.scale_factor, tpch_config.seed, query_i
))?;
let output = shell::run_command_with_status_check_in_dir(
&format!(
"./qgen -s{} -r{} {}",
tpch_config.scale_factor, tpch_config.seed, query_i
),
Some(&self.dbgen_dpath),
)?;
let this_genned_queries_fpath =
this_genned_queries_dpath.join(format!("{}.sql", query_i));
fs::write(&this_genned_queries_fpath, output.stdout)?;
Expand Down

0 comments on commit 0efc874

Please sign in to comment.