Skip to content
This repository was archived by the owner on Jan 7, 2025. It is now read-only.

Commit b2a4a77

Browse files
feat: new TruecardGetter interface (#130)
**Summary**: Made a new `TruecardGetter` interface that allows individual DBMS cardtest helpers to _not_ implement query execution. **Details**: * This allows us to decouple stats creation from table loading in DataFusion, causing TPC-H SF1's cardtest to run 1.7x faster. * More importantly, this sets up a future PR where we _cache_ the statistics object in DataFusion, which would unlock a ~10x speedup for TPC-H SF1's cardtest. * Q6 also now works with these changes so I added it to the default args.
1 parent 9dfbb47 commit b2a4a77

11 files changed

+327
-280
lines changed

Cargo.lock

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

optd-perftest/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ tokio-util = "0.7"
4141
futures-util = "0.3"
4242
statistical = "1.0"
4343
prettytable-rs = "0.10"
44+
serde = "1.0"
4445
serde_json = "1.0"
4546

4647
[dev_dependencies]

optd-perftest/src/benchmark.rs

+9
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
use crate::tpch::TpchConfig;
2+
use serde::{Deserialize, Serialize};
23

4+
#[derive(Deserialize, Serialize)]
35
pub enum Benchmark {
46
#[allow(dead_code)]
57
Test,
@@ -35,6 +37,13 @@ impl Benchmark {
3537
dbname.to_lowercase()
3638
}
3739

40+
/// An ID is just a unique string identifying the benchmark
41+
/// It's not always used in the same situations as get_dbname(), so it's a separate function
42+
pub fn get_id(&self) -> String {
43+
// the fact that it happens to return dbname is an implementation detail
44+
self.get_dbname()
45+
}
46+
3847
pub fn is_readonly(&self) -> bool {
3948
match self {
4049
Self::Test => true,

optd-perftest/src/cardtest.rs

+59-53
Original file line numberDiff line numberDiff line change
@@ -1,57 +1,70 @@
11
use std::collections::HashMap;
22
use std::path::Path;
33

4-
use crate::postgres_dbms::{PostgresDBMS, POSTGRES_DBMS_NAME};
4+
use crate::postgres_dbms::PostgresDBMS;
5+
use crate::truecard::TruecardGetter;
56
use crate::{benchmark::Benchmark, datafusion_dbms::DatafusionDBMS, tpch::TpchConfig};
67

78
use anyhow::{self};
89
use async_trait::async_trait;
910

10-
/// This struct performs cardinality testing across one or more dbmss.
11+
/// This struct performs cardinality testing across one or more DBMSs.
1112
/// Another design would be for the CardtestRunnerDBMSHelper trait to expose a function
1213
/// to evaluate the Q-error. However, I chose not to do this design for reasons
1314
/// described in the comments of the CardtestRunnerDBMSHelper trait. This is why
14-
/// you would use CardtestRunner even for computing the Q-error of a single dbms.
15+
/// you would use CardtestRunner even for computing the Q-error of a single DBMS.
1516
pub struct CardtestRunner {
1617
pub dbmss: Vec<Box<dyn CardtestRunnerDBMSHelper>>,
18+
truecard_getter: Box<dyn TruecardGetter>,
19+
}
20+
21+
pub struct Cardinfo {
22+
pub qerror: f64,
23+
pub estcard: usize,
24+
pub truecard: usize,
1725
}
1826

1927
impl CardtestRunner {
20-
pub async fn new(dbmss: Vec<Box<dyn CardtestRunnerDBMSHelper>>) -> anyhow::Result<Self> {
21-
Ok(CardtestRunner { dbmss })
28+
pub async fn new(
29+
dbmss: Vec<Box<dyn CardtestRunnerDBMSHelper>>,
30+
truecard_getter: Box<dyn TruecardGetter>,
31+
) -> anyhow::Result<Self> {
32+
Ok(CardtestRunner {
33+
dbmss,
34+
truecard_getter,
35+
})
2236
}
2337

24-
/// Get the Q-error of a query using the cost models of all dbmss being tested
38+
/// Get the Q-error of a query using the cost models of all DBMSs being tested
2539
/// Q-error is defined in [Leis 2015](https://15721.courses.cs.cmu.edu/spring2024/papers/16-costmodels/p204-leis.pdf)
2640
/// One detail not specified in the paper is that Q-error is based on the ratio of true and estimated cardinality
2741
/// of the entire query, not of a subtree of the query. This detail is specified in Section 7.1 of
2842
/// [Yang 2020](https://arxiv.org/pdf/2006.08109.pdf)
29-
pub async fn eval_benchmark_qerrors_alldbs(
43+
pub async fn eval_benchmark_cardinfos_alldbs(
3044
&mut self,
3145
benchmark: &Benchmark,
32-
) -> anyhow::Result<HashMap<String, Vec<f64>>> {
33-
let mut qerrors_alldbs = HashMap::new();
34-
35-
// postgres runs faster and is less buggy so we use their true cardinalities
36-
// in the future, it's probably a good idea to get the truecards of datafusion to ensure that they match
37-
let pg_dbms = self
38-
.dbmss
39-
.iter_mut()
40-
.find(|dbms| dbms.get_name() == POSTGRES_DBMS_NAME)
41-
.unwrap();
42-
let pg_truecards = pg_dbms.eval_benchmark_truecards(benchmark).await?;
46+
) -> anyhow::Result<HashMap<String, Vec<Cardinfo>>> {
47+
let mut cardinfos_alldbs = HashMap::new();
48+
let truecards = self
49+
.truecard_getter
50+
.get_benchmark_truecards(benchmark)
51+
.await?;
4352

4453
for dbms in &mut self.dbmss {
4554
let estcards = dbms.eval_benchmark_estcards(benchmark).await?;
46-
let qerrors = estcards
55+
let cardinfos = estcards
4756
.into_iter()
48-
.zip(pg_truecards.iter())
49-
.map(|(estcard, truecard)| CardtestRunner::calc_qerror(estcard, *truecard))
57+
.zip(truecards.iter())
58+
.map(|(estcard, &truecard)| Cardinfo {
59+
qerror: CardtestRunner::calc_qerror(estcard, truecard),
60+
estcard,
61+
truecard,
62+
})
5063
.collect();
51-
qerrors_alldbs.insert(String::from(dbms.get_name()), qerrors);
64+
cardinfos_alldbs.insert(String::from(dbms.get_name()), cardinfos);
5265
}
5366

54-
Ok(qerrors_alldbs)
67+
Ok(cardinfos_alldbs)
5568
}
5669

5770
fn calc_qerror(estcard: usize, truecard: usize) -> f64 {
@@ -62,54 +75,47 @@ impl CardtestRunner {
6275
}
6376
}
6477

65-
/// This trait defines helper functions to enable cardinality testing on a dbms
66-
/// The reason a "get qerror" function is not exposed is to allow for greater
67-
/// flexibility. If we exposed "get qerror" for each dbms, we would need to
68-
/// get the true and estimated cardinalities for _each_ dbms. However, we
69-
/// can now choose to only get the true cardinalities of _one_ dbms to
70-
/// improve performance or even cache the true cardinalities. Additionally, if
71-
/// we do want to get the true cardinalities of all dbmss, we can compare
72-
/// them against each other to ensure they're all equal. All these options are
73-
/// possible when exposing "get true card" and "get est card" instead of a
74-
/// single "get qerror". If you want to compute the Q-error of a single
75-
/// dbms, just create a CardtestRunner with a single dbms as input.
76-
/// When exposing a "get true card" and "get est card" interface, you could
77-
/// ostensibly do it on the granularity of a single SQL string or on the
78-
/// granularity of an entire benchmark. I chose the latter for a simple reason:
79-
/// different dbmss might have different SQL strings for the same conceptual
80-
/// query (see how qgen in tpch-kit takes in dbms as an input).
78+
/// This trait defines helper functions to enable cardinality testing on a DBMS
79+
/// The reason "get true card" is not a function here is because we don't need to call
80+
/// "get true card" for all DBMSs we are testing, since they'll all return the same
81+
/// answer. We also cache true cardinalities instead of executing queries every time
82+
/// since executing OLAP queries could take minutes to hours. Due to both of these
83+
/// factors, we conceptually view getting the true cardinality as a completely separate
84+
/// problem from getting the estimated cardinalities of each DBMS.
85+
/// When exposing a "get est card" interface, you could do it on the granularity of
86+
/// a single SQL string or on the granularity of an entire benchmark. I chose the
87+
/// latter for a simple reason: different DBMSs might have different SQL strings
88+
/// for the same conceptual query (see how qgen in tpch-kit takes in DBMS as an input).
8189
/// When more performance tests are implemented, you would probably want to extract
8290
/// get_name() into a generic "DBMS" trait.
8391
#[async_trait]
8492
pub trait CardtestRunnerDBMSHelper {
8593
// get_name() has &self so that we're able to do Box<dyn CardtestRunnerDBMSHelper>
8694
fn get_name(&self) -> &str;
8795

88-
// The order of queries has to be the same between these two functions.
96+
// The order of queries in the returned vector has to be the same between all databases,
97+
// and it has to be the same as the order returned by TruecardGetter.
8998
async fn eval_benchmark_estcards(
9099
&mut self,
91100
benchmark: &Benchmark,
92101
) -> anyhow::Result<Vec<usize>>;
93-
async fn eval_benchmark_truecards(
94-
&mut self,
95-
benchmark: &Benchmark,
96-
) -> anyhow::Result<Vec<usize>>;
97102
}
98103

99-
pub async fn cardtest<P: AsRef<Path> + Clone>(
104+
pub async fn cardtest<P: AsRef<Path>>(
100105
workspace_dpath: P,
101106
pguser: &str,
102107
pgpassword: &str,
103108
tpch_config: TpchConfig,
104-
) -> anyhow::Result<HashMap<String, Vec<f64>>> {
105-
let pg_dbms = PostgresDBMS::build(workspace_dpath.clone(), pguser, pgpassword)?;
106-
let df_dbms = DatafusionDBMS::new(workspace_dpath).await?;
107-
let dbmss: Vec<Box<dyn CardtestRunnerDBMSHelper>> = vec![Box::new(pg_dbms), Box::new(df_dbms)];
109+
) -> anyhow::Result<HashMap<String, Vec<Cardinfo>>> {
110+
let pg_dbms = Box::new(PostgresDBMS::build(&workspace_dpath, pguser, pgpassword)?);
111+
let truecard_getter = pg_dbms.clone();
112+
let df_dbms = Box::new(DatafusionDBMS::new(&workspace_dpath).await?);
113+
let dbmss: Vec<Box<dyn CardtestRunnerDBMSHelper>> = vec![pg_dbms, df_dbms];
108114

109115
let tpch_benchmark = Benchmark::Tpch(tpch_config.clone());
110-
let mut cardtest_runner = CardtestRunner::new(dbmss).await?;
111-
let qerrors_alldbs = cardtest_runner
112-
.eval_benchmark_qerrors_alldbs(&tpch_benchmark)
116+
let mut cardtest_runner = CardtestRunner::new(dbmss, truecard_getter).await?;
117+
let cardinfos_alldbs = cardtest_runner
118+
.eval_benchmark_cardinfos_alldbs(&tpch_benchmark)
113119
.await?;
114-
Ok(qerrors_alldbs)
120+
Ok(cardinfos_alldbs)
115121
}

optd-perftest/src/datafusion_dbms.rs

+55-49
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ use std::{
22
fs,
33
path::{Path, PathBuf},
44
sync::Arc,
5+
time::Instant,
56
};
67

78
use crate::{
@@ -44,23 +45,16 @@ impl CardtestRunnerDBMSHelper for DatafusionDBMS {
4445
&mut self,
4546
benchmark: &Benchmark,
4647
) -> anyhow::Result<Vec<usize>> {
47-
self.load_benchmark_data(benchmark).await?;
48+
let base_table_stats = self.get_benchmark_stats(benchmark).await?;
49+
self.clear_state(Some(base_table_stats)).await?;
50+
// Create the tables. This must be done after clear_state because that clears everything
51+
let tpch_kit = TpchKit::build(&self.workspace_dpath)?;
52+
self.create_tpch_tables(&tpch_kit).await?;
4853
match benchmark {
4954
Benchmark::Test => unimplemented!(),
5055
Benchmark::Tpch(tpch_config) => self.eval_tpch_estcards(tpch_config).await,
5156
}
5257
}
53-
54-
async fn eval_benchmark_truecards(
55-
&mut self,
56-
benchmark: &Benchmark,
57-
) -> anyhow::Result<Vec<usize>> {
58-
self.load_benchmark_data(benchmark).await?;
59-
match benchmark {
60-
Benchmark::Test => unimplemented!(),
61-
Benchmark::Tpch(tpch_config) => self.eval_tpch_truecards(tpch_config).await,
62-
}
63-
}
6458
}
6559

6660
impl DatafusionDBMS {
@@ -137,37 +131,22 @@ impl DatafusionDBMS {
137131
}
138132

139133
async fn eval_tpch_estcards(&self, tpch_config: &TpchConfig) -> anyhow::Result<Vec<usize>> {
134+
let start = Instant::now();
135+
140136
let tpch_kit = TpchKit::build(&self.workspace_dpath)?;
141137
tpch_kit.gen_queries(tpch_config)?;
142138

143139
let mut estcards = vec![];
144-
for sql_fpath in tpch_kit.get_sql_fpath_ordered_iter(tpch_config)? {
140+
for (_, sql_fpath) in tpch_kit.get_sql_fpath_ordered_iter(tpch_config)? {
145141
let sql = fs::read_to_string(sql_fpath)?;
146142
let estcard = self.eval_query_estcard(&sql).await?;
147143
estcards.push(estcard);
148144
}
149145

150-
Ok(estcards)
151-
}
146+
let duration = start.elapsed();
147+
println!("datafusion eval_tpch_estcards duration: {:?}", duration);
152148

153-
async fn eval_tpch_truecards(&self, tpch_config: &TpchConfig) -> anyhow::Result<Vec<usize>> {
154-
let tpch_kit = TpchKit::build(&self.workspace_dpath)?;
155-
tpch_kit.gen_queries(tpch_config)?;
156-
157-
let mut truecards = vec![];
158-
for sql_fpath in tpch_kit.get_sql_fpath_ordered_iter(tpch_config)? {
159-
let sql = fs::read_to_string(sql_fpath)?;
160-
let estcard = self.eval_query_truecard(&sql).await?;
161-
truecards.push(estcard);
162-
}
163-
164-
Ok(truecards)
165-
}
166-
167-
async fn eval_query_truecard(&self, sql: &str) -> anyhow::Result<usize> {
168-
let rows = Self::execute(&self.ctx, sql).await?;
169-
let num_rows = rows.len();
170-
Ok(num_rows)
149+
Ok(estcards)
171150
}
172151

173152
fn log_explain(&self, explains: &[Vec<String>]) {
@@ -204,23 +183,30 @@ impl DatafusionDBMS {
204183
Ok(row_cnt)
205184
}
206185

207-
async fn load_benchmark_data(&mut self, benchmark: &Benchmark) -> anyhow::Result<()> {
186+
/// Load the data into DataFusion without building the stats used by optd.
187+
/// Unlike Postgres, where both data and stats are used by the same program, for this class the
188+
/// data is used by DataFusion while the stats are used by optd. That is why there are two
189+
/// separate functions to load them.
190+
#[allow(dead_code)]
191+
async fn load_benchmark_data_no_stats(&mut self, benchmark: &Benchmark) -> anyhow::Result<()> {
208192
match benchmark {
209-
Benchmark::Tpch(tpch_config) => self.load_tpch_data(tpch_config).await,
193+
Benchmark::Tpch(tpch_config) => self.load_tpch_data_no_stats(tpch_config).await,
210194
_ => unimplemented!(),
211195
}
212196
}
213197

214-
async fn load_tpch_data(&mut self, tpch_config: &TpchConfig) -> anyhow::Result<()> {
215-
// Geenrate the tables.
216-
let tpch_kit = TpchKit::build(&self.workspace_dpath)?;
217-
tpch_kit.gen_tables(tpch_config)?;
218-
219-
// Generate the stats.
220-
let stats = self.load_tpch_stats(&tpch_kit, tpch_config).await?;
221-
self.clear_state(Some(stats)).await?;
198+
/// Build the stats that optd's cost model uses.
199+
async fn get_benchmark_stats(
200+
&mut self,
201+
benchmark: &Benchmark,
202+
) -> anyhow::Result<BaseTableStats> {
203+
match benchmark {
204+
Benchmark::Tpch(tpch_config) => self.get_tpch_stats(tpch_config).await,
205+
_ => unimplemented!(),
206+
}
207+
}
222208

223-
// Create the tables.
209+
async fn create_tpch_tables(&mut self, tpch_kit: &TpchKit) -> anyhow::Result<()> {
224210
let ddls = fs::read_to_string(&tpch_kit.schema_fpath)?;
225211
let ddls = ddls
226212
.split(';')
@@ -230,6 +216,19 @@ impl DatafusionDBMS {
230216
for ddl in ddls {
231217
Self::execute(&self.ctx, ddl).await?;
232218
}
219+
Ok(())
220+
}
221+
222+
#[allow(dead_code)]
223+
async fn load_tpch_data_no_stats(&mut self, tpch_config: &TpchConfig) -> anyhow::Result<()> {
224+
let start = Instant::now();
225+
226+
// Generate the tables.
227+
let tpch_kit = TpchKit::build(&self.workspace_dpath)?;
228+
tpch_kit.gen_tables(tpch_config)?;
229+
230+
// Create the tables.
231+
self.create_tpch_tables(&tpch_kit).await?;
233232

234233
// Load the data by creating an external table first and copying the data to real tables.
235234
let tbl_fpath_iter = tpch_kit.get_tbl_fpath_iter(tpch_config).unwrap();
@@ -270,14 +269,19 @@ impl DatafusionDBMS {
270269
.await?;
271270
}
272271

272+
let duration = start.elapsed();
273+
println!("datafusion load_tpch_data duration: {:?}", duration);
274+
273275
Ok(())
274276
}
275277

276-
async fn load_tpch_stats(
277-
&self,
278-
tpch_kit: &TpchKit,
279-
tpch_config: &TpchConfig,
280-
) -> anyhow::Result<BaseTableStats> {
278+
async fn get_tpch_stats(&mut self, tpch_config: &TpchConfig) -> anyhow::Result<BaseTableStats> {
279+
let start = Instant::now();
280+
281+
// Generate the tables
282+
let tpch_kit = TpchKit::build(&self.workspace_dpath)?;
283+
tpch_kit.gen_tables(tpch_config)?;
284+
281285
// To get the schema of each table.
282286
let ctx = Self::new_session_ctx(None).await?;
283287
let ddls = fs::read_to_string(&tpch_kit.schema_fpath)?;
@@ -315,6 +319,8 @@ impl DatafusionDBMS {
315319
);
316320
log::debug!("statistics generated for table: {}", tbl_name);
317321
}
322+
let duration = start.elapsed();
323+
println!("datafusion load_tpch_stats duration: {:?}", duration);
318324
Ok(base_table_stats)
319325
}
320326
}

optd-perftest/src/lib.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,4 @@ mod datafusion_dbms;
44
mod postgres_dbms;
55
pub mod shell;
66
pub mod tpch;
7-
mod truecard_cache;
7+
mod truecard;

0 commit comments

Comments
 (0)