Skip to content
This repository was archived by the owner on Jan 7, 2025. It is now read-only.

Commit 0efc874

Browse files
feat: load indexes for TPC-H, 13x speedup on SF 0.1 (#131)
**Summary**: We now load constraints + indexes when loading data for TPC-H (though constraints are temporarily disabled because they are broken). **Demo**: Without indexes, TPC-H SF0.1 takes ~13s. With indexes, it takes ~1s. ![Screenshot 2024-03-22 at 17 30 15](https://github.com/cmu-db/optd/assets/20631215/39f8fd3e-84cc-47fc-8882-5002ef33d63d) ![Screenshot 2024-03-22 at 17 31 19](https://github.com/cmu-db/optd/assets/20631215/8759f6b7-a611-44de-83b3-d8eb12aa5952) **Details**: * The constraints file is from dbgym (it's a private repo, so I can't link it). * Note that constraints are currently broken, so I commented out the code that loads them. Not adding constraints doesn't cause noticeable issues, so I'm punting this to later. * Made `get_optd_root()` check that it's actually in the optd repo (and not the tpch-kit repo). * Added an option to run commands in a given directory instead of manually changing the working directory beforehand (which might cause us to `cd` into tpch-kit and mess up `get_optd_root()`).
1 parent b2a4a77 commit 0efc874

File tree

3 files changed

+64
-25
lines changed

3 files changed

+64
-25
lines changed

optd-perftest/src/postgres_dbms.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,13 @@ impl PostgresDBMS {
156156
Self::copy_from_stdin(client, tbl_fpath).await?;
157157
}
158158

159+
// load the constraints and indexes
160+
// TODO: constraints are currently broken
161+
// let sql = fs::read_to_string(tpch_kit.constraints_fpath.to_str().unwrap())?;
162+
// client.batch_execute(&sql).await?;
163+
let sql = fs::read_to_string(tpch_kit.indexes_fpath.to_str().unwrap())?;
164+
client.batch_execute(&sql).await?;
165+
159166
// create stats
160167
// you need to do VACUUM FULL ANALYZE and not just ANALYZE to make sure the stats are created in a deterministic way
161168
// this is standard practice for postgres benchmarking
@@ -240,6 +247,8 @@ impl PostgresDBMS {
240247
dbname: &str, // used by truecard_cache
241248
truecard_cache: &mut TruecardCache,
242249
) -> anyhow::Result<Vec<usize>> {
250+
let start = Instant::now();
251+
243252
let tpch_kit = TpchKit::build(&self.workspace_dpath)?;
244253
tpch_kit.gen_queries(tpch_config)?;
245254

@@ -257,6 +266,9 @@ impl PostgresDBMS {
257266
truecards.push(truecard);
258267
}
259268

269+
let duration = start.elapsed();
270+
println!("postgres eval_tpch_truecards duration: {:?}", duration);
271+
260272
Ok(truecards)
261273
}
262274

optd-perftest/src/shell.rs

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,26 @@ use std::{fs, io};
55

66
/// Runs a command, exiting the program immediately if the command fails
77
pub fn run_command_with_status_check(cmd_str: &str) -> io::Result<Output> {
8+
// in_path is None, so the compiler can't infer P; bind it explicitly to an arbitrary type that implements AsRef<Path> (here, &Path)
9+
run_command_with_status_check_in_dir::<&Path>(cmd_str, None)
10+
}
11+
12+
/// Runs a command in a directory, exiting the program immediately if the command fails
13+
pub fn run_command_with_status_check_in_dir<P: AsRef<Path>>(
14+
cmd_str: &str,
15+
in_path: Option<P>,
16+
) -> io::Result<Output> {
817
// use shlex::split() instead of split_whitespace() to handle cases like quotes and escape chars
918
let mut cmd_components: Vec<String> = shlex::split(cmd_str).unwrap();
10-
let cmd = cmd_components.remove(0);
19+
let cmd_name = cmd_components.remove(0);
1120
let args = cmd_components;
12-
let output = Command::new(cmd).args(args).output()?;
21+
let mut cmd = Command::new(cmd_name);
22+
cmd.args(args);
23+
if let Some(in_path) = in_path {
24+
cmd.current_dir(in_path);
25+
}
26+
let output = cmd.output()?;
27+
1328
if output.status.success() {
1429
Ok(output)
1530
} else {
@@ -41,10 +56,17 @@ where
4156

4257
/// Get the path of the root "optd" repo directory
4358
pub fn get_optd_root() -> io::Result<PathBuf> {
44-
let output = run_command_with_status_check("git rev-parse --show-toplevel")?;
45-
let path = str::from_utf8(&output.stdout).unwrap().trim();
46-
let path = PathBuf::from(path);
47-
Ok(path)
59+
let url_output = run_command_with_status_check("git config --get remote.origin.url")?;
60+
let url_string = str::from_utf8(&url_output.stdout).unwrap().trim();
61+
assert!(
62+
url_string.contains("cmu-db/optd"),
63+
"You are in the repo with url_string={}. This was not recognized as the optd repo.",
64+
url_string
65+
);
66+
let toplevel_output = run_command_with_status_check("git rev-parse --show-toplevel")?;
67+
let toplevel_str = str::from_utf8(&toplevel_output.stdout).unwrap().trim();
68+
let toplevel_dpath = PathBuf::from(toplevel_str);
69+
Ok(toplevel_dpath)
4870
}
4971

5072
/// Can be an absolute path or a relative path. Regardless of where this CLI is run, relative paths are evaluated relative to the optd repo root.

optd-perftest/src/tpch.rs

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ pub struct TpchKit {
4949
genned_tables_dpath: PathBuf,
5050
genned_queries_dpath: PathBuf,
5151
pub schema_fpath: PathBuf,
52+
pub constraints_fpath: PathBuf,
53+
pub indexes_fpath: PathBuf,
5254
}
5355

5456
/// I keep the same conventions for these methods as I do for PostgresDBMS
@@ -74,6 +76,8 @@ impl TpchKit {
7476
fs::create_dir(&genned_queries_dpath)?;
7577
}
7678
let schema_fpath = dbgen_dpath.join("dss.ddl");
79+
let constraints_fpath = dbgen_dpath.join("constraints.sql");
80+
let indexes_fpath = dbgen_dpath.join("indexes.sql");
7781

7882
// create Self
7983
let kit = TpchKit {
@@ -85,6 +89,8 @@ impl TpchKit {
8589
genned_tables_dpath,
8690
genned_queries_dpath,
8791
schema_fpath,
92+
constraints_fpath,
93+
indexes_fpath,
8894
};
8995

9096
// set envvars (DSS_PATH can change so we don't set it now)
@@ -110,24 +116,22 @@ impl TpchKit {
110116
} else {
111117
log::debug!("[skip] cloning tpch-kit repo");
112118
}
113-
env::set_current_dir(&self.tpch_kit_repo_dpath)?;
114119
log::debug!("[start] pulling latest tpch-kit repo");
115-
shell::run_command_with_status_check("git pull")?;
120+
shell::run_command_with_status_check_in_dir("git pull", Some(&self.tpch_kit_repo_dpath))?;
116121
log::debug!("[end] pulling latest tpch-kit repo");
122+
// we run git pull in the tpch-kit dir without changing the process's working directory so that get_optd_root() doesn't break
117123
Ok(())
118124
}
119125

120126
pub fn make(&self, dbms: &str) -> io::Result<()> {
121-
env::set_current_dir(&self.dbgen_dpath)?;
122127
log::debug!("[start] building dbgen");
123128
// we need to call "make clean" because we might have called make earlier with
124129
// a different dbms
125-
shell::run_command_with_status_check("make clean")?;
126-
shell::run_command_with_status_check(&format!(
127-
"make MACHINE={} DATABASE={}",
128-
TpchKit::get_machine(),
129-
dbms
130-
))?;
130+
shell::run_command_with_status_check_in_dir("make clean", Some(&self.dbgen_dpath))?;
131+
shell::run_command_with_status_check_in_dir(
132+
&format!("make MACHINE={} DATABASE={}", TpchKit::get_machine(), dbms),
133+
Some(&self.dbgen_dpath),
134+
)?;
131135
log::debug!("[end] building dbgen");
132136
Ok(())
133137
}
@@ -148,13 +152,12 @@ impl TpchKit {
148152
if !done_fpath.exists() {
149153
self.make(&tpch_config.dbms)?;
150154
shell::make_into_empty_dir(&this_genned_tables_dpath)?;
151-
env::set_current_dir(&self.dbgen_dpath)?;
152155
env::set_var("DSS_PATH", this_genned_tables_dpath.to_str().unwrap());
153156
log::debug!("[start] generating tables for {}", tpch_config);
154-
shell::run_command_with_status_check(&format!(
155-
"./dbgen -s{}",
156-
tpch_config.scale_factor
157-
))?;
157+
shell::run_command_with_status_check_in_dir(
158+
&format!("./dbgen -s{}", tpch_config.scale_factor),
159+
Some(&self.dbgen_dpath),
160+
)?;
158161
File::create(done_fpath)?;
159162
log::debug!("[end] generating tables for {}", tpch_config);
160163
} else {
@@ -170,14 +173,16 @@ impl TpchKit {
170173
if !done_fpath.exists() {
171174
self.make(&tpch_config.dbms)?;
172175
shell::make_into_empty_dir(&this_genned_queries_dpath)?;
173-
env::set_current_dir(&self.dbgen_dpath)?;
174176
log::debug!("[start] generating queries for {}", tpch_config);
175177
// we don't use -d in qgen because -r controls the substitution values we use
176178
for query_i in 1..=NUM_TPCH_QUERIES {
177-
let output = shell::run_command_with_status_check(&format!(
178-
"./qgen -s{} -r{} {}",
179-
tpch_config.scale_factor, tpch_config.seed, query_i
180-
))?;
179+
let output = shell::run_command_with_status_check_in_dir(
180+
&format!(
181+
"./qgen -s{} -r{} {}",
182+
tpch_config.scale_factor, tpch_config.seed, query_i
183+
),
184+
Some(&self.dbgen_dpath),
185+
)?;
181186
let this_genned_queries_fpath =
182187
this_genned_queries_dpath.join(format!("{}.sql", query_i));
183188
fs::write(&this_genned_queries_fpath, output.stdout)?;

0 commit comments

Comments
 (0)