Skip to content
This repository was archived by the owner on Jan 7, 2025. It is now read-only.

Commit 9dfbb47

Browse files
feat: caching true cardinalities, 22x speedup for TPC-H SF1 (#124)
**Summary**: Instead of executing queries every time we run cardtest (which takes multiple minutes even for scale factor 1.0), we cache the true cardinalities from the first execution.

**Demo**: 22x speedup (29m55s -> 1m21s) on TPC-H scale factor 1

![Screenshot 2024-03-21 at 21 06 10](https://github.com/cmu-db/optd/assets/20631215/ccd4d117-42c9-4420-a3dc-b4137a18a2ea)
![Screenshot 2024-03-21 at 21 05 32](https://github.com/cmu-db/optd/assets/20631215/a694a566-3b80-4ed3-8394-d27d54e5b7f2)

JSON cache file

![Screenshot 2024-03-21 at 19 00 09](https://github.com/cmu-db/optd/assets/20631215/374a7a1c-d6b7-495f-86bb-233c3629abe9)

**Details**:

* We now call `VACUUM FULL ANALYZE` in Postgres right after loading data to create stats, which we weren't doing before.
* Because Postgres is faster and less buggy than optd+DataFusion, we only get truecards from Postgres.
* Fixed a bug where `dbname` and `tables/` were parameterized by the seed, which would make us miss caching opportunities.

---------

Co-authored-by: Zhidong Guo <[email protected]>
1 parent 368bcd6 commit 9dfbb47

14 files changed

+169
-64
lines changed

.gitignore

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@
33
/.DS_Store
44
/.idea
55
.history
6-
**/*_workspace/**
6+
**/*_workspace/

Cargo.lock

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

optd-perftest/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ tokio-util = "0.7"
4141
futures-util = "0.3"
4242
statistical = "1.0"
4343
prettytable-rs = "0.10"
44+
serde_json = "1.0"
4445

4546
[dev_dependencies]
4647
assert_cmd = "2.0"

optd-perftest/src/benchmark.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ impl Benchmark {
2626
let dbname = match self {
2727
Self::Test => String::from("test"),
2828
Self::Tpch(tpch_config) => {
29-
format!("tpch_sf{}_sd{}", tpch_config.scale_factor, tpch_config.seed)
29+
format!("tpch_sf{}", tpch_config.scale_factor)
3030
}
3131
};
3232
// since Postgres names cannot contain periods

optd-perftest/src/cardtest.rs

+16-9
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
use std::collections::HashMap;
22
use std::path::Path;
33

4-
use crate::postgres_db::PostgresDb;
5-
use crate::{benchmark::Benchmark, datafusion_db::DatafusionDb, tpch::TpchConfig};
4+
use crate::postgres_dbms::{PostgresDBMS, POSTGRES_DBMS_NAME};
5+
use crate::{benchmark::Benchmark, datafusion_dbms::DatafusionDBMS, tpch::TpchConfig};
66

77
use anyhow::{self};
88
use async_trait::async_trait;
@@ -32,14 +32,21 @@ impl CardtestRunner {
3232
) -> anyhow::Result<HashMap<String, Vec<f64>>> {
3333
let mut qerrors_alldbs = HashMap::new();
3434

35+
// postgres runs faster and is less buggy so we use their true cardinalities
36+
// in the future, it's probably a good idea to get the truecards of datafusion to ensure that they match
37+
let pg_dbms = self
38+
.dbmss
39+
.iter_mut()
40+
.find(|dbms| dbms.get_name() == POSTGRES_DBMS_NAME)
41+
.unwrap();
42+
let pg_truecards = pg_dbms.eval_benchmark_truecards(benchmark).await?;
43+
3544
for dbms in &mut self.dbmss {
3645
let estcards = dbms.eval_benchmark_estcards(benchmark).await?;
37-
let truecards = dbms.eval_benchmark_truecards(benchmark).await?;
38-
assert!(truecards.len() == estcards.len());
3946
let qerrors = estcards
4047
.into_iter()
41-
.zip(truecards.into_iter())
42-
.map(|(estcard, truecard)| CardtestRunner::calc_qerror(estcard, truecard))
48+
.zip(pg_truecards.iter())
49+
.map(|(estcard, truecard)| CardtestRunner::calc_qerror(estcard, *truecard))
4350
.collect();
4451
qerrors_alldbs.insert(String::from(dbms.get_name()), qerrors);
4552
}
@@ -95,9 +102,9 @@ pub async fn cardtest<P: AsRef<Path> + Clone>(
95102
pgpassword: &str,
96103
tpch_config: TpchConfig,
97104
) -> anyhow::Result<HashMap<String, Vec<f64>>> {
98-
let pg_db = PostgresDb::new(workspace_dpath.clone(), pguser, pgpassword);
99-
let df_db = DatafusionDb::new(workspace_dpath).await?;
100-
let dbmss: Vec<Box<dyn CardtestRunnerDBMSHelper>> = vec![Box::new(pg_db), Box::new(df_db)];
105+
let pg_dbms = PostgresDBMS::build(workspace_dpath.clone(), pguser, pgpassword)?;
106+
let df_dbms = DatafusionDBMS::new(workspace_dpath).await?;
107+
let dbmss: Vec<Box<dyn CardtestRunnerDBMSHelper>> = vec![Box::new(pg_dbms), Box::new(df_dbms)];
101108

102109
let tpch_benchmark = Benchmark::Tpch(tpch_config.clone());
103110
let mut cardtest_runner = CardtestRunner::new(dbmss).await?;

optd-perftest/src/datafusion_db.rs renamed to optd-perftest/src/datafusion_dbms.rs

+5-5
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,13 @@ use optd_datafusion_bridge::{DatafusionCatalog, OptdQueryPlanner};
2929
use optd_datafusion_repr::{cost::BaseTableStats, cost::PerTableStats, DatafusionOptimizer};
3030
use regex::Regex;
3131

32-
pub struct DatafusionDb {
32+
pub struct DatafusionDBMS {
3333
workspace_dpath: PathBuf,
3434
ctx: SessionContext,
3535
}
3636

3737
#[async_trait]
38-
impl CardtestRunnerDBMSHelper for DatafusionDb {
38+
impl CardtestRunnerDBMSHelper for DatafusionDBMS {
3939
fn get_name(&self) -> &str {
4040
"DataFusion"
4141
}
@@ -63,9 +63,9 @@ impl CardtestRunnerDBMSHelper for DatafusionDb {
6363
}
6464
}
6565

66-
impl DatafusionDb {
66+
impl DatafusionDBMS {
6767
pub async fn new<P: AsRef<Path>>(workspace_dpath: P) -> anyhow::Result<Self> {
68-
Ok(DatafusionDb {
68+
Ok(DatafusionDBMS {
6969
workspace_dpath: workspace_dpath.as_ref().to_path_buf(),
7070
ctx: Self::new_session_ctx(None).await?,
7171
})
@@ -319,4 +319,4 @@ impl DatafusionDb {
319319
}
320320
}
321321

322-
unsafe impl Send for DatafusionDb {}
322+
unsafe impl Send for DatafusionDBMS {}

optd-perftest/src/lib.rs

+3-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
mod benchmark;
22
pub mod cardtest;
3-
mod datafusion_db;
4-
mod postgres_db;
3+
mod datafusion_dbms;
4+
mod postgres_dbms;
55
pub mod shell;
66
pub mod tpch;
7+
mod truecard_cache;

optd-perftest/src/main.rs

+15-13
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,11 @@ enum Commands {
4747
},
4848
}
4949

50+
// q-errors are always >= 1.0, so two decimal places are enough
51+
fn fmt_qerror(qerror: f64) -> String {
52+
format!("{:.2}", qerror)
53+
}
54+
5055
#[tokio::main]
5156
async fn main() -> anyhow::Result<()> {
5257
env_logger::init();
@@ -73,15 +78,11 @@ async fn main() -> anyhow::Result<()> {
7378
};
7479
let qerrors_alldbs =
7580
cardtest::cardtest(&workspace_dpath, &pguser, &pgpassword, tpch_config).await?;
81+
println!();
7682
println!(" Aggregate Q-Error Comparison");
7783
let mut agg_qerror_table = Table::new();
7884
agg_qerror_table.set_titles(prettytable::row![
79-
"DBMS",
80-
"Median",
81-
"# Infinite",
82-
"Mean",
83-
"Min",
84-
"Max"
85+
"DBMS", "Median", "# Inf", "Mean", "Min", "Max"
8586
]);
8687
for (dbms, qerrors) in &qerrors_alldbs {
8788
if !qerrors.is_empty() {
@@ -93,22 +94,22 @@ async fn main() -> anyhow::Result<()> {
9394
let ninf_qerrors = qerrors.len() - finite_qerrors.len();
9495
let mean_qerror =
9596
finite_qerrors.iter().sum::<f64>() / finite_qerrors.len() as f64;
96-
let min_qerror = finite_qerrors
97+
let min_qerror = qerrors
9798
.iter()
9899
.min_by(|a, b| a.partial_cmp(b).unwrap())
99100
.unwrap();
100101
let median_qerror = statistical::median(qerrors);
101-
let max_qerror = finite_qerrors
102+
let max_qerror = qerrors
102103
.iter()
103104
.max_by(|a, b| a.partial_cmp(b).unwrap())
104105
.unwrap();
105106
agg_qerror_table.add_row(prettytable::row![
106107
dbms,
107-
median_qerror,
108+
fmt_qerror(median_qerror),
108109
ninf_qerrors,
109-
mean_qerror,
110-
min_qerror,
111-
max_qerror
110+
fmt_qerror(mean_qerror),
111+
fmt_qerror(*min_qerror),
112+
fmt_qerror(*max_qerror),
112113
]);
113114
} else {
114115
agg_qerror_table
@@ -119,6 +120,7 @@ async fn main() -> anyhow::Result<()> {
119120
agg_qerror_table.printstd();
120121

121122
let mut per_query_qerror_table = Table::new();
123+
println!();
122124
println!(" Per-Query Q-Error Comparison");
123125
let title_cells = iter::once(Cell::new("Query #"))
124126
.chain(qerrors_alldbs.keys().map(|dbms| Cell::new(dbms)))
@@ -129,7 +131,7 @@ async fn main() -> anyhow::Result<()> {
129131
row_cells.push(prettytable::cell!(query_id));
130132
for qerrors in qerrors_alldbs.values() {
131133
let qerror = qerrors.get(i).unwrap();
132-
row_cells.push(prettytable::cell!(qerror));
134+
row_cells.push(prettytable::cell!(fmt_qerror(*qerror)));
133135
}
134136
per_query_qerror_table.add_row(Row::new(row_cells));
135137
}

optd-perftest/src/postgres_db.rs renamed to optd-perftest/src/postgres_dbms.rs

+43-14
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ use crate::{
22
benchmark::Benchmark,
33
cardtest::CardtestRunnerDBMSHelper,
44
tpch::{TpchConfig, TpchKit},
5+
truecard_cache::DBMSTruecardCache,
56
};
67
use async_trait::async_trait;
78
use futures::Sink;
@@ -17,27 +18,39 @@ use tokio::fs::File;
1718
use tokio::io::AsyncReadExt;
1819
use tokio_postgres::{Client, NoTls, Row};
1920

21+
/// The name of the Postgres DBMS (as opposed to the DataFusion DBMS for instance)
22+
pub const POSTGRES_DBMS_NAME: &str = "Postgres";
23+
2024
/// This dbname is assumed to always exist
2125
const DEFAULT_DBNAME: &str = "postgres";
2226

23-
pub struct PostgresDb {
27+
pub struct PostgresDBMS {
2428
workspace_dpath: PathBuf,
2529
pguser: String,
2630
pgpassword: String,
31+
truecard_cache: DBMSTruecardCache,
2732
}
2833

2934
/// Conventions I keep for methods of this class:
3035
/// - Functions should be idempotent. For instance, start_postgres() should not fail if Postgres is already running
3136
/// - For instance, this is why "createdb" is _not_ a function
3237
/// - Stop and start functions should be separate
3338
/// - Setup should be done in build() unless it requires more information (like benchmark)
34-
impl PostgresDb {
35-
pub fn new<P: AsRef<Path>>(workspace_dpath: P, pguser: &str, pgpassword: &str) -> Self {
36-
Self {
37-
workspace_dpath: PathBuf::from(workspace_dpath.as_ref()),
39+
impl PostgresDBMS {
40+
pub fn build<P: AsRef<Path>>(
41+
workspace_dpath: P,
42+
pguser: &str,
43+
pgpassword: &str,
44+
) -> anyhow::Result<Self> {
45+
let workspace_dpath = PathBuf::from(workspace_dpath.as_ref());
46+
let truecard_cache = DBMSTruecardCache::build(&workspace_dpath, POSTGRES_DBMS_NAME)?;
47+
let pg_dbms = Self {
48+
workspace_dpath,
3849
pguser: String::from(pguser),
3950
pgpassword: String::from(pgpassword),
40-
}
51+
truecard_cache,
52+
};
53+
Ok(pg_dbms)
4154
}
4255

4356
/// Create a connection to a Postgres database
@@ -145,6 +158,11 @@ impl PostgresDb {
145158
Self::copy_from_stdin(client, tbl_fpath).await?;
146159
}
147160

161+
// create stats
162+
// you need to do VACUUM FULL ANALYZE and not just ANALYZE to make sure the stats are created in a deterministic way
163+
// this is standard practice for postgres benchmarking
164+
client.query("VACUUM FULL ANALYZE", &[]).await?;
165+
148166
Ok(())
149167
}
150168

@@ -178,9 +196,9 @@ impl PostgresDb {
178196
}
179197

180198
#[async_trait]
181-
impl CardtestRunnerDBMSHelper for PostgresDb {
199+
impl CardtestRunnerDBMSHelper for PostgresDBMS {
182200
fn get_name(&self) -> &str {
183-
"Postgres"
201+
POSTGRES_DBMS_NAME
184202
}
185203

186204
async fn eval_benchmark_estcards(
@@ -205,13 +223,16 @@ impl CardtestRunnerDBMSHelper for PostgresDb {
205223
let client = self.connect_to_db(&dbname).await?;
206224
match benchmark {
207225
Benchmark::Test => unimplemented!(),
208-
Benchmark::Tpch(tpch_config) => self.eval_tpch_truecards(&client, tpch_config).await,
226+
Benchmark::Tpch(tpch_config) => {
227+
self.eval_tpch_truecards(&client, tpch_config, &dbname)
228+
.await
229+
}
209230
}
210231
}
211232
}
212233

213-
/// This impl has helpers for ```impl CardtestRunnerDBMSHelper for PostgresDb```
214-
impl PostgresDb {
234+
/// This impl has helpers for ```impl CardtestRunnerDBMSHelper for PostgresDBMS```
235+
impl PostgresDBMS {
215236
async fn eval_tpch_estcards(
216237
&self,
217238
client: &Client,
@@ -231,17 +252,25 @@ impl PostgresDb {
231252
}
232253

233254
async fn eval_tpch_truecards(
234-
&self,
255+
&mut self,
235256
client: &Client,
236257
tpch_config: &TpchConfig,
258+
dbname: &str, // used by truecard_cache
237259
) -> anyhow::Result<Vec<usize>> {
238260
let tpch_kit = TpchKit::build(&self.workspace_dpath)?;
239261
tpch_kit.gen_queries(tpch_config)?;
240262

241263
let mut truecards = vec![];
242264
for sql_fpath in tpch_kit.get_sql_fpath_ordered_iter(tpch_config)? {
243265
let sql = fs::read_to_string(sql_fpath)?;
244-
let truecard = self.eval_query_truecard(client, &sql).await?;
266+
let truecard = match self.truecard_cache.get_truecard(dbname, &sql) {
267+
Some(truecard) => truecard,
268+
None => {
269+
let truecard = self.eval_query_truecard(client, &sql).await?;
270+
self.truecard_cache.insert_truecard(dbname, &sql, truecard);
271+
truecard
272+
}
273+
};
245274
truecards.push(truecard);
246275
}
247276

@@ -259,7 +288,7 @@ impl PostgresDb {
259288
self.log_explain(&explain_rows);
260289
// the first line contains the explain of the root node
261290
let first_explain_line: &str = explain_rows.first().unwrap().get(0);
262-
let estcard = PostgresDb::extract_row_count(first_explain_line).unwrap();
291+
let estcard = PostgresDBMS::extract_row_count(first_explain_line).unwrap();
263292
Ok(estcard)
264293
}
265294

optd-perftest/src/tpch.rs

+2-5
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ pub struct TpchKit {
4949
pub schema_fpath: PathBuf,
5050
}
5151

52-
/// I keep the same conventions for these methods as I do for PostgresDb
52+
/// I keep the same conventions for these methods as I do for PostgresDBMS
5353
impl TpchKit {
5454
pub fn build<P: AsRef<Path>>(workspace_dpath: P) -> io::Result<Self> {
5555
log::debug!("[start] building TpchKit");
@@ -193,10 +193,7 @@ impl TpchKit {
193193
/// If two TpchConfig instances would *not always* generate the same data, then their
194194
/// directory names must be different.
195195
fn get_this_genned_tables_dpath(&self, tpch_config: &TpchConfig) -> PathBuf {
196-
let dname = format!(
197-
"db{}_sf{}_sd{}",
198-
tpch_config.dbms, tpch_config.scale_factor, tpch_config.seed
199-
);
196+
let dname = format!("db{}_sf{}", tpch_config.dbms, tpch_config.scale_factor,);
200197
self.genned_tables_dpath.join(dname)
201198
}
202199

0 commit comments

Comments
 (0)