cmu-db
diff --git a/‎Cargo.lock
Lines changed: 1 addition & 0 deletions b/‎Cargo.lock
Lines changed: 1 addition & 0 deletions
diff --git a/‎optd-perftest/Cargo.toml
Lines changed: 1 addition & 0 deletions b/‎optd-perftest/Cargo.toml
Lines changed: 1 addition & 0 deletions
diff --git a/‎optd-perftest/src/benchmark.rs
Lines changed: 9 additions & 0 deletions b/‎optd-perftest/src/benchmark.rs
Lines changed: 9 additions & 0 deletions
diff --git a/‎optd-perftest/src/cardtest.rs
Lines changed: 59 additions & 53 deletions b/‎optd-perftest/src/cardtest.rs
Lines changed: 59 additions & 53 deletions
diff --git a/‎optd-perftest/src/datafusion_dbms.rs
Lines changed: 55 additions & 49 deletions b/‎optd-perftest/src/datafusion_dbms.rs
Lines changed: 55 additions & 49 deletions
diff --git a/‎optd-perftest/src/lib.rs
Lines changed: 1 addition & 1 deletion b/‎optd-perftest/src/lib.rs
Lines changed: 1 addition & 1 deletion
@@ -41,6 +41,7 @@ tokio-util = "0.7"
 futures-util = "0.3"
 statistical = "1.0"
 prettytable-rs = "0.10"
+serde = "1.0"
 serde_json = "1.0"
 
 [dev_dependencies]
 
@@ -1,5 +1,7 @@
 use crate::tpch::TpchConfig;
+use serde::{Deserialize, Serialize};
 
+#[derive(Deserialize, Serialize)]
 pub enum Benchmark {
     #[allow(dead_code)]
     Test,
@@ -35,6 +37,13 @@ impl Benchmark {
         dbname.to_lowercase()
     }
 
+    /// An ID is just a unique string identifying the benchmark.
+    /// It's not always used in the same situations as get_dbname(), so it's a separate function.
+    pub fn get_id(&self) -> String {
+        // the fact that it happens to return dbname is an implementation detail
+        self.get_dbname()
+    }
+
     pub fn is_readonly(&self) -> bool {
         match self {
             Self::Test => true,
 
@@ -1,57 +1,70 @@
 use std::collections::HashMap;
 use std::path::Path;
 
-use crate::postgres_dbms::{PostgresDBMS, POSTGRES_DBMS_NAME};
+use crate::postgres_dbms::PostgresDBMS;
+use crate::truecard::TruecardGetter;
 use crate::{benchmark::Benchmark, datafusion_dbms::DatafusionDBMS, tpch::TpchConfig};
 
 use anyhow::{self};
 use async_trait::async_trait;
 
-/// This struct performs cardinality testing across one or more dbmss.
+/// This struct performs cardinality testing across one or more DBMSs.
 /// Another design would be for the CardtestRunnerDBMSHelper trait to expose a function
 ///   to evaluate the Q-error. However, I chose not to do this design for reasons
 ///   described in the comments of the CardtestRunnerDBMSHelper trait. This is why
-///   you would use CardtestRunner even for computing the Q-error of a single dbms.
+///   you would use CardtestRunner even for computing the Q-error of a single DBMS.
 pub struct CardtestRunner {
     pub dbmss: Vec<Box<dyn CardtestRunnerDBMSHelper>>,
+    truecard_getter: Box<dyn TruecardGetter>,
+}
+
+pub struct Cardinfo {
+    pub qerror: f64,
+    pub estcard: usize,
+    pub truecard: usize,
 }
 
 impl CardtestRunner {
-    pub async fn new(dbmss: Vec<Box<dyn CardtestRunnerDBMSHelper>>) -> anyhow::Result<Self> {
-        Ok(CardtestRunner { dbmss })
+    pub async fn new(
+        dbmss: Vec<Box<dyn CardtestRunnerDBMSHelper>>,
+        truecard_getter: Box<dyn TruecardGetter>,
+    ) -> anyhow::Result<Self> {
+        Ok(CardtestRunner {
+            dbmss,
+            truecard_getter,
+        })
     }
 
-    /// Get the Q-error of a query using the cost models of all dbmss being tested
+    /// Get the Q-error, estimated cardinality, and true cardinality of each benchmark query using the cost models of all DBMSs being tested.
     /// Q-error is defined in [Leis 2015](https://15721.courses.cs.cmu.edu/spring2024/papers/16-costmodels/p204-leis.pdf)
     /// One detail not specified in the paper is that Q-error is based on the ratio of true and estimated cardinality
     ///   of the entire query, not of a subtree of the query. This detail is specified in Section 7.1 of
     ///   [Yang 2020](https://arxiv.org/pdf/2006.08109.pdf)
-    pub async fn eval_benchmark_qerrors_alldbs(
+    pub async fn eval_benchmark_cardinfos_alldbs(
         &mut self,
         benchmark: &Benchmark,
-    ) -> anyhow::Result<HashMap<String, Vec<f64>>> {
-        let mut qerrors_alldbs = HashMap::new();
-
-        // postgres runs faster and is less buggy so we use their true cardinalities
-        // in the future, it's probably a good idea to get the truecards of datafusion to ensure that they match
-        let pg_dbms = self
-            .dbmss
-            .iter_mut()
-            .find(|dbms| dbms.get_name() == POSTGRES_DBMS_NAME)
-            .unwrap();
-        let pg_truecards = pg_dbms.eval_benchmark_truecards(benchmark).await?;
+    ) -> anyhow::Result<HashMap<String, Vec<Cardinfo>>> {
+        let mut cardinfos_alldbs = HashMap::new();
+        let truecards = self
+            .truecard_getter
+            .get_benchmark_truecards(benchmark)
+            .await?;
 
         for dbms in &mut self.dbmss {
             let estcards = dbms.eval_benchmark_estcards(benchmark).await?;
-            let qerrors = estcards
+            let cardinfos = estcards
                 .into_iter()
-                .zip(pg_truecards.iter())
-                .map(|(estcard, truecard)| CardtestRunner::calc_qerror(estcard, *truecard))
+                .zip(truecards.iter())
+                .map(|(estcard, &truecard)| Cardinfo {
+                    qerror: CardtestRunner::calc_qerror(estcard, truecard),
+                    estcard,
+                    truecard,
+                })
                 .collect();
-            qerrors_alldbs.insert(String::from(dbms.get_name()), qerrors);
+            cardinfos_alldbs.insert(String::from(dbms.get_name()), cardinfos);
         }
 
-        Ok(qerrors_alldbs)
+        Ok(cardinfos_alldbs)
     }
 
     fn calc_qerror(estcard: usize, truecard: usize) -> f64 {
@@ -62,54 +75,47 @@ impl CardtestRunner {
     }
 }
 
-/// This trait defines helper functions to enable cardinality testing on a dbms
-/// The reason a "get qerror" function is not exposed is to allow for greater
-///   flexibility. If we exposed "get qerror" for each dbms, we would need to
-///   get the true and estimated cardinalities for _each_ dbms. However, we
-///   can now choose to only get the true cardinalities of _one_ dbms to
-///   improve performance or even cache the true cardinalities. Additionally, if
-///   we do want to get the true cardinalities of all dbmss, we can compare
-///   them against each other to ensure they're all equal. All these options are
-///   possible when exposing "get true card" and "get est card" instead of a
-///   single "get qerror". If you want to compute the Q-error of a single
-///   dbms, just create a CardtestRunner with a single dbms as input.
-/// When exposing a "get true card" and "get est card" interface, you could
-///   ostensibly do it on the granularity of a single SQL string or on the
-///   granularity of an entire benchmark. I chose the latter for a simple reason:
-///   different dbmss might have different SQL strings for the same conceptual
-///   query (see how qgen in tpch-kit takes in dbms as an input).
+/// This trait defines helper functions to enable cardinality testing on a DBMS.
+/// The reason "get true card" is not a function here is that we don't need to call
+///   "get true card" for all DBMSs we are testing, since they'll all return the same
+///   answer. We also cache true cardinalities instead of executing queries every time
+///   since executing OLAP queries could take minutes to hours. Due to both of these
+///   factors, we conceptually view getting the true cardinality as a completely separate
+///   problem from getting the estimated cardinalities of each DBMS.
+/// When exposing a "get est card" interface, you could do it on the granularity of
+///   a single SQL string or on the granularity of an entire benchmark. I chose the
+///   latter for a simple reason: different DBMSs might have different SQL strings
+///   for the same conceptual query (see how qgen in tpch-kit takes in DBMS as an input).
 /// When more performance tests are implemented, you would probably want to extract
 ///   get_name() into a generic "DBMS" trait.
 #[async_trait]
 pub trait CardtestRunnerDBMSHelper {
     // get_name() has &self so that we're able to do Box<dyn CardtestRunnerDBMSHelper>
     fn get_name(&self) -> &str;
 
-    // The order of queries has to be the same between these two functions.
+    // The order of queries in the returned vector has to be the same between all databases,
+    //   and it has to be the same as the order returned by TruecardGetter.
     async fn eval_benchmark_estcards(
         &mut self,
         benchmark: &Benchmark,
     ) -> anyhow::Result<Vec<usize>>;
-    async fn eval_benchmark_truecards(
-        &mut self,
-        benchmark: &Benchmark,
-    ) -> anyhow::Result<Vec<usize>>;
 }
 
-pub async fn cardtest<P: AsRef<Path> + Clone>(
+pub async fn cardtest<P: AsRef<Path>>(
     workspace_dpath: P,
     pguser: &str,
     pgpassword: &str,
     tpch_config: TpchConfig,
-) -> anyhow::Result<HashMap<String, Vec<f64>>> {
-    let pg_dbms = PostgresDBMS::build(workspace_dpath.clone(), pguser, pgpassword)?;
-    let df_dbms = DatafusionDBMS::new(workspace_dpath).await?;
-    let dbmss: Vec<Box<dyn CardtestRunnerDBMSHelper>> = vec![Box::new(pg_dbms), Box::new(df_dbms)];
+) -> anyhow::Result<HashMap<String, Vec<Cardinfo>>> {
+    let pg_dbms = Box::new(PostgresDBMS::build(&workspace_dpath, pguser, pgpassword)?);
+    let truecard_getter = pg_dbms.clone();
+    let df_dbms = Box::new(DatafusionDBMS::new(&workspace_dpath).await?);
+    let dbmss: Vec<Box<dyn CardtestRunnerDBMSHelper>> = vec![pg_dbms, df_dbms];
 
     let tpch_benchmark = Benchmark::Tpch(tpch_config.clone());
-    let mut cardtest_runner = CardtestRunner::new(dbmss).await?;
-    let qerrors_alldbs = cardtest_runner
-        .eval_benchmark_qerrors_alldbs(&tpch_benchmark)
+    let mut cardtest_runner = CardtestRunner::new(dbmss, truecard_getter).await?;
+    let cardinfos_alldbs = cardtest_runner
+        .eval_benchmark_cardinfos_alldbs(&tpch_benchmark)
         .await?;
-    Ok(qerrors_alldbs)
+    Ok(cardinfos_alldbs)
 }
@@ -2,6 +2,7 @@ use std::{
     fs,
     path::{Path, PathBuf},
     sync::Arc,
+    time::Instant,
 };
 
 use crate::{
@@ -44,23 +45,16 @@ impl CardtestRunnerDBMSHelper for DatafusionDBMS {
         &mut self,
         benchmark: &Benchmark,
     ) -> anyhow::Result<Vec<usize>> {
-        self.load_benchmark_data(benchmark).await?;
+        let base_table_stats = self.get_benchmark_stats(benchmark).await?;
+        self.clear_state(Some(base_table_stats)).await?;
+        // Create the tables. This must be done after clear_state because that clears everything
+        let tpch_kit = TpchKit::build(&self.workspace_dpath)?;
+        self.create_tpch_tables(&tpch_kit).await?;
         match benchmark {
             Benchmark::Test => unimplemented!(),
             Benchmark::Tpch(tpch_config) => self.eval_tpch_estcards(tpch_config).await,
         }
     }
-
-    async fn eval_benchmark_truecards(
-        &mut self,
-        benchmark: &Benchmark,
-    ) -> anyhow::Result<Vec<usize>> {
-        self.load_benchmark_data(benchmark).await?;
-        match benchmark {
-            Benchmark::Test => unimplemented!(),
-            Benchmark::Tpch(tpch_config) => self.eval_tpch_truecards(tpch_config).await,
-        }
-    }
 }
 
 impl DatafusionDBMS {
@@ -137,37 +131,22 @@ impl DatafusionDBMS {
     }
 
     async fn eval_tpch_estcards(&self, tpch_config: &TpchConfig) -> anyhow::Result<Vec<usize>> {
+        let start = Instant::now();
+
         let tpch_kit = TpchKit::build(&self.workspace_dpath)?;
         tpch_kit.gen_queries(tpch_config)?;
 
         let mut estcards = vec![];
-        for sql_fpath in tpch_kit.get_sql_fpath_ordered_iter(tpch_config)? {
+        for (_, sql_fpath) in tpch_kit.get_sql_fpath_ordered_iter(tpch_config)? {
             let sql = fs::read_to_string(sql_fpath)?;
             let estcard = self.eval_query_estcard(&sql).await?;
             estcards.push(estcard);
         }
 
-        Ok(estcards)
-    }
+        let duration = start.elapsed();
+        println!("datafusion eval_tpch_estcards duration: {:?}", duration);
 
-    async fn eval_tpch_truecards(&self, tpch_config: &TpchConfig) -> anyhow::Result<Vec<usize>> {
-        let tpch_kit = TpchKit::build(&self.workspace_dpath)?;
-        tpch_kit.gen_queries(tpch_config)?;
-
-        let mut truecards = vec![];
-        for sql_fpath in tpch_kit.get_sql_fpath_ordered_iter(tpch_config)? {
-            let sql = fs::read_to_string(sql_fpath)?;
-            let estcard = self.eval_query_truecard(&sql).await?;
-            truecards.push(estcard);
-        }
-
-        Ok(truecards)
-    }
-
-    async fn eval_query_truecard(&self, sql: &str) -> anyhow::Result<usize> {
-        let rows = Self::execute(&self.ctx, sql).await?;
-        let num_rows = rows.len();
-        Ok(num_rows)
+        Ok(estcards)
     }
 
     fn log_explain(&self, explains: &[Vec<String>]) {
@@ -204,23 +183,30 @@ impl DatafusionDBMS {
         Ok(row_cnt)
     }
 
-    async fn load_benchmark_data(&mut self, benchmark: &Benchmark) -> anyhow::Result<()> {
+    /// Load the data into DataFusion without building the stats used by optd.
+    /// Unlike Postgres, where both data and stats are used by the same program, for this class the
+    ///   data is used by DataFusion while the stats are used by optd. That is why there are two
+    ///   separate functions to load them.
+    #[allow(dead_code)]
+    async fn load_benchmark_data_no_stats(&mut self, benchmark: &Benchmark) -> anyhow::Result<()> {
         match benchmark {
-            Benchmark::Tpch(tpch_config) => self.load_tpch_data(tpch_config).await,
+            Benchmark::Tpch(tpch_config) => self.load_tpch_data_no_stats(tpch_config).await,
             _ => unimplemented!(),
         }
     }
 
-    async fn load_tpch_data(&mut self, tpch_config: &TpchConfig) -> anyhow::Result<()> {
-        // Geenrate the tables.
-        let tpch_kit = TpchKit::build(&self.workspace_dpath)?;
-        tpch_kit.gen_tables(tpch_config)?;
-
-        // Generate the stats.
-        let stats = self.load_tpch_stats(&tpch_kit, tpch_config).await?;
-        self.clear_state(Some(stats)).await?;
+    /// Build the stats that optd's cost model uses.
+    async fn get_benchmark_stats(
+        &mut self,
+        benchmark: &Benchmark,
+    ) -> anyhow::Result<BaseTableStats> {
+        match benchmark {
+            Benchmark::Tpch(tpch_config) => self.get_tpch_stats(tpch_config).await,
+            _ => unimplemented!(),
+        }
+    }
 
-        // Create the tables.
+    async fn create_tpch_tables(&mut self, tpch_kit: &TpchKit) -> anyhow::Result<()> {
         let ddls = fs::read_to_string(&tpch_kit.schema_fpath)?;
         let ddls = ddls
             .split(';')
@@ -230,6 +216,19 @@ impl DatafusionDBMS {
         for ddl in ddls {
             Self::execute(&self.ctx, ddl).await?;
         }
+        Ok(())
+    }
+
+    #[allow(dead_code)]
+    async fn load_tpch_data_no_stats(&mut self, tpch_config: &TpchConfig) -> anyhow::Result<()> {
+        let start = Instant::now();
+
+        // Generate the tables.
+        let tpch_kit = TpchKit::build(&self.workspace_dpath)?;
+        tpch_kit.gen_tables(tpch_config)?;
+
+        // Create the tables.
+        self.create_tpch_tables(&tpch_kit).await?;
 
         // Load the data by creating an external table first and copying the data to real tables.
         let tbl_fpath_iter = tpch_kit.get_tbl_fpath_iter(tpch_config).unwrap();
@@ -270,14 +269,19 @@ impl DatafusionDBMS {
             .await?;
         }
 
+        let duration = start.elapsed();
+        println!("datafusion load_tpch_data duration: {:?}", duration);
+
         Ok(())
     }
 
-    async fn load_tpch_stats(
-        &self,
-        tpch_kit: &TpchKit,
-        tpch_config: &TpchConfig,
-    ) -> anyhow::Result<BaseTableStats> {
+    async fn get_tpch_stats(&mut self, tpch_config: &TpchConfig) -> anyhow::Result<BaseTableStats> {
+        let start = Instant::now();
+
+        // Generate the tables
+        let tpch_kit = TpchKit::build(&self.workspace_dpath)?;
+        tpch_kit.gen_tables(tpch_config)?;
+
         // To get the schema of each table.
         let ctx = Self::new_session_ctx(None).await?;
         let ddls = fs::read_to_string(&tpch_kit.schema_fpath)?;
@@ -315,6 +319,8 @@ impl DatafusionDBMS {
             );
             log::debug!("statistics generated for table: {}", tbl_name);
         }
+        let duration = start.elapsed();
+        println!("datafusion load_tpch_stats duration: {:?}", duration);
         Ok(base_table_stats)
     }
 }
 
@@ -4,4 +4,4 @@ mod datafusion_dbms;
 mod postgres_dbms;
 pub mod shell;
 pub mod tpch;
-mod truecard_cache;
+mod truecard;