cmu-db
diff --git a/Diff for: ‎.gitignore
+3-1 b/Diff for: ‎.gitignore
+3-1
diff --git a/Diff for: ‎Cargo.lock
+7 b/Diff for: ‎Cargo.lock
+7
diff --git a/Diff for: ‎optd-perftest/Cargo.toml
+1 b/Diff for: ‎optd-perftest/Cargo.toml
+1
diff --git a/Diff for: ‎optd-perftest/src/benchmark.rs
+22 b/Diff for: ‎optd-perftest/src/benchmark.rs
+22
diff --git a/Diff for: ‎optd-perftest/src/cardtest.rs
+2-4 b/Diff for: ‎optd-perftest/src/cardtest.rs
+2-4
diff --git a/Diff for: ‎optd-perftest/src/datafusion_db_cardtest.rs
+2-1 b/Diff for: ‎optd-perftest/src/datafusion_db_cardtest.rs
+2-1
diff --git a/Diff for: ‎optd-perftest/src/main.rs
+20-11 b/Diff for: ‎optd-perftest/src/main.rs
+20-11
diff --git a/Diff for: ‎optd-perftest/src/postgres_db.rs
+221-5 b/Diff for: ‎optd-perftest/src/postgres_db.rs
+221-5
@@ -5,4 +5,6 @@
 .history
 optd-perftest/**/genned_tables
 optd-perftest/**/genned_queries
-optd-perftest/**/tpch-kit
+optd-perftest/**/tpch-kit
+optd-perftest/**/pgdata
+optd-perftest/**/postgres_log
@@ -17,3 +17,4 @@ tokio = { version = "1.24", features = [
     "sync",
     "parking_lot",
 ] }
+shlex = "1.3"
@@ -0,0 +1,22 @@
+use crate::tpch::TpchConfig;
+
+/// The benchmarks that a database helper can be asked to load.
+pub enum Benchmark {
+    /// A benchmark without parameters; its string id is "test".
+    Test,
+    /// The TPC-H benchmark, parameterized by a `TpchConfig`.
+    Tpch(TpchConfig),
+}
+
+impl Benchmark {
+    /// Returns a string identifier for this benchmark. For parameterized benchmarks the
+    ///   identifier embeds the identifier of the configuration.
+    pub fn get_strid(&self) -> String {
+        match self {
+            Self::Test => String::from("test"),
+            Self::Tpch(cfg) => {
+                let cfg_strid = cfg.get_strid();
+                format!("tpch_{}", cfg_strid)
+            }
+        }
+    }
+
+    /// Returns whether the benchmark only reads the data it was loaded with.
+    /// Every current variant is read-only; the match stays exhaustive so that adding a
+    ///   variant forces a decision here.
+    pub fn is_readonly(&self) -> bool {
+        match self {
+            Self::Test | Self::Tpch(_) => true,
+        }
+    }
+}
@@ -1,6 +1,8 @@
 use anyhow::{self};
 use async_trait::async_trait;
 
+use crate::benchmark::Benchmark;
+
 /// This struct performs cardinality testing across one or more databases.
 /// Another design would be for the CardtestRunnerDBHelper trait to expose a function
 ///   to evaluate the Q-error. However, I chose not to do this design for reasons
@@ -10,10 +12,6 @@ pub struct CardtestRunner {
     pub databases: Vec<Box<dyn CardtestRunnerDBHelper>>,
 }
 
-pub enum Benchmark {
-    Test,
-}
-
 impl CardtestRunner {
     pub async fn new(databases: Vec<Box<dyn CardtestRunnerDBHelper>>) -> anyhow::Result<Self> {
         Ok(CardtestRunner { databases })
 
@@ -1,4 +1,4 @@
-use crate::cardtest::{Benchmark, CardtestRunnerDBHelper};
+use crate::{benchmark::Benchmark, cardtest::CardtestRunnerDBHelper};
 use async_trait::async_trait;
 use optd_sqlplannertest::DatafusionDb;
 
@@ -14,6 +14,7 @@ impl CardtestRunnerDBHelper for DatafusionDb {
                 self.execute("CREATE TABLE t1 (c1 INT);", true).await?;
                 self.execute("INSERT INTO t1 VALUES (0);", true).await?;
             }
+            _ => unimplemented!(),
         };
         Ok(())
     }
 
@@ -4,28 +4,37 @@ use optd_sqlplannertest::DatafusionDb;
 use postgres_db::PostgresDb;
 
 use crate::{
-    cardtest::Benchmark,
-    tpch_kit::{TpchKit, TPCH_KIT_POSTGRES},
+    benchmark::Benchmark,
+    tpch::{TpchConfig, TpchKit, TPCH_KIT_POSTGRES},
 };
 
+mod benchmark;
 mod cardtest;
-mod cmd;
 mod datafusion_db_cardtest;
 mod postgres_db;
-mod tpch_kit;
+mod shell;
+mod tpch;
 
 #[tokio::main]
+/// Entry point. Builds the Postgres helper, loads the TPC-H benchmark into it and then
+///   returns early; the cardinality test and query generation below the `if true` block
+///   are currently never reached.
 async fn main() -> Result<()> {
-    let databases: Vec<Box<dyn CardtestRunnerDBHelper>> = vec![
-        Box::new(PostgresDb::new().await?),
-        Box::new(DatafusionDb::new().await?),
-    ];
+    // build(true) runs in verbose mode
+    let pg_db = PostgresDb::build(true).await?;
+    let tpch_cfg = TpchConfig {
+        database: String::from(TPCH_KIT_POSTGRES),
+        scale_factor: 1,
+        seed: 15721,
+    };
+    // the config is cloned because tpch_cfg is used again for gen_queries() below
+    let tpch_benchmark = Benchmark::Tpch(tpch_cfg.clone());
+    pg_db.load_database(&tpch_benchmark).await?;
+    // NOTE(review): this unconditional early return turns everything below into dead code
+    if true {
+        return Ok(());
+    }
+    let df_db = DatafusionDb::new().await?;
+    let databases: Vec<Box<dyn CardtestRunnerDBHelper>> = vec![Box::new(pg_db), Box::new(df_db)];
     let cardtest_runner = CardtestRunner::new(databases).await?;
     cardtest_runner.load_databases(Benchmark::Test).await?;
     let qerrors = cardtest_runner.eval_qerrors("SELECT * FROM t1;").await?;
     println!("qerrors: {:?}", qerrors);
-    let kit = TpchKit::build(true).unwrap();
-    kit.gen_tables(TPCH_KIT_POSTGRES, 1)?;
-    kit.gen_queries(TPCH_KIT_POSTGRES, 1, 15721)?;
+    let tpch_kit = TpchKit::build(true)?;
+    tpch_kit.gen_queries(&tpch_cfg)?;
     Ok(())
 }
@@ -1,12 +1,205 @@
-use crate::cardtest::{Benchmark, CardtestRunnerDBHelper};
+use crate::{
+    benchmark::Benchmark,
+    cardtest::CardtestRunnerDBHelper,
+    shell,
+    tpch::{TpchConfig, TpchKit},
+};
 use anyhow::Result;
 use async_trait::async_trait;
+use std::{
+    env::{self, consts::OS},
+    fs::{self, File},
+    path::{Path, PathBuf},
+    process::Command,
+};
 
-pub struct PostgresDb {}
+const OPTD_DB_NAME: &str = "optd";
 
+/// Helper that installs, initializes, starts and stops a local Postgres server and loads
+///   benchmark data into it.
+pub struct PostgresDb {
+    // when true, progress messages are printed to stdout
+    verbose: bool,
+
+    // cache these paths so we don't have to build them multiple times
+    // the "postgres_db" directory that contains pgdata and the log file
+    _postgres_db_dpath: PathBuf,
+    // the data directory handed to initdb and to pg_ctl -D
+    pgdata_dpath: PathBuf,
+    // the log file handed to pg_ctl -l
+    log_fpath: PathBuf,
+}
+
+/// Conventions kept for the methods of this impl:
+///   - Functions should be idempotent. For instance, start_postgres() should not fail if Postgres is already running
+///   - Stop and start functions should be separate
+///   - Setup should be done in build() unless it requires more information (like benchmark)
 impl PostgresDb {
-    pub async fn new() -> Result<Self> {
-        Ok(PostgresDb {})
+    /// Creates a `PostgresDb` and makes sure Postgres is installed, initialized and running.
+    ///
+    /// The working directory is `<cwd>/<dir of this source file>/postgres_db`; it is created
+    ///   if missing. `verbose` enables progress messages on stdout.
+    /// NOTE(review): the path is built from file!() joined onto the current directory, so it
+    ///   presumably only resolves as intended when run from the workspace root. Confirm.
+    pub async fn build(verbose: bool) -> Result<Self> {
+        // resolve the postgres_db directory as an absolute path
+        let cwd = env::current_dir()?;
+        let src_dpath = Path::new(file!()).parent().unwrap();
+        let postgres_db_dpath = cwd.join(src_dpath.join("postgres_db"));
+        if !postgres_db_dpath.exists() {
+            fs::create_dir(&postgres_db_dpath)?;
+        }
+
+        // derive the remaining paths before the directory path is moved into Self
+        let db = PostgresDb {
+            verbose,
+            pgdata_dpath: postgres_db_dpath.join("pgdata"),
+            log_fpath: postgres_db_dpath.join("postgres_log"),
+            _postgres_db_dpath: postgres_db_dpath,
+        };
+
+        // each step is a no-op if it was already done, so this also works as a restart
+        db.install_postgres().await?;
+        db.init_pgdata().await?;
+        db.start_postgres().await?;
+
+        Ok(db)
+    }
+
+    /// Installs Postgres through the OS's package manager after updating and upgrading the
+    ///   package manager's packages.
+    /// Only macOS (Homebrew) is implemented; any other OS hits unimplemented!().
+    async fn install_postgres(&self) -> Result<()> {
+        if OS != "macos" {
+            unimplemented!();
+        }
+
+        let verbose = self.verbose;
+        if verbose {
+            println!("updating and upgrading brew...");
+        }
+        shell::run_command_with_status_check("brew update")?;
+        shell::run_command_with_status_check("brew upgrade")?;
+
+        if verbose {
+            println!("installing postgresql...");
+        }
+        shell::run_command_with_status_check("brew install postgresql")?;
+        Ok(())
+    }
+
+    /// Empties the pgdata dir, first stopping Postgres if pg_isready reports a running server.
+    /// Stopping first matters: wiping pgdata underneath a running Postgres process corrupts
+    ///   it and leads to lots of weird behavior.
+    async fn remove_pgdata(&self) -> Result<()> {
+        let is_running = Self::get_is_postgres_running()?;
+        if is_running {
+            self.stop_postgres().await?;
+        }
+        shell::make_into_empty_dir(&self.pgdata_dpath)?;
+        Ok(())
+    }
+
+    /// Runs initdb on pgdata_dpath unless a previous run already completed.
+    /// Completion is tracked with an "initdb_done" marker file inside pgdata_dpath, written
+    ///   only after initdb succeeded.
+    async fn init_pgdata(&self) -> Result<()> {
+        let done_fpath = self.pgdata_dpath.join("initdb_done");
+        if done_fpath.exists() {
+            if self.verbose {
+                println!("skipped running initdb");
+            }
+            return Ok(());
+        }
+
+        if self.verbose {
+            println!("running initdb...");
+        }
+        // start from an empty dir so a half-finished earlier attempt can't interfere
+        shell::make_into_empty_dir(&self.pgdata_dpath)?;
+        let initdb_cmd = format!("initdb {}", self.pgdata_dpath.to_str().unwrap());
+        shell::run_command_with_status_check(&initdb_cmd)?;
+        File::create(done_fpath)?;
+        Ok(())
+    }
+
+    /// Starts Postgres on pgdata_dpath, logging to log_fpath, unless a server is already running
+    /// The pg_ctl binary found on the PATH is used (the one installed by the package manager)
+    /// No port is passed to pg_ctl, so the server listens on its default port (5432)
+    async fn start_postgres(&self) -> Result<()> {
+        if Self::get_is_postgres_running()? {
+            if self.verbose {
+                println!("skipped starting postgres");
+            }
+            return Ok(());
+        }
+
+        if self.verbose {
+            println!("starting postgres...");
+        }
+        let pgdata_str = self.pgdata_dpath.to_str().unwrap();
+        let log_str = self.log_fpath.to_str().unwrap();
+        let start_cmd = format!("pg_ctl -D{} -l{} start", pgdata_str, log_str);
+        shell::run_command_with_status_check(&start_cmd)?;
+        Ok(())
+    }
+
+    /// Stops the Postgres server running on pgdata_dpath; does nothing if no server is running
+    async fn stop_postgres(&self) -> Result<()> {
+        if !Self::get_is_postgres_running()? {
+            if self.verbose {
+                println!("skipped stopping postgres");
+            }
+            return Ok(());
+        }
+
+        if self.verbose {
+            println!("stopping postgres...");
+        }
+        let stop_cmd = format!("pg_ctl -D{} stop", self.pgdata_dpath.to_str().unwrap());
+        shell::run_command_with_status_check(&stop_cmd)?;
+        Ok(())
+    }
+
+    /// Reports whether a Postgres server is accepting connections, i.e. whether `pg_isready`
+    ///   exits successfully. Fails if `pg_isready` itself cannot be launched.
+    /// NOTE(review): pg_isready is run without arguments, so it probes its default
+    ///   host/port rather than specifically the server on pgdata_dpath.
+    fn get_is_postgres_running() -> Result<bool> {
+        let output = Command::new("pg_isready").output()?;
+        Ok(output.status.success())
+    }
+
+    /// Load the benchmark data unconditionally, without consulting or writing any
+    ///   "already loaded" marker (callers handle caching)
+    /// Only TPC-H is implemented; `Benchmark::Test` hits unimplemented!()
+    async fn load_benchmark_data_raw(&self, benchmark: &Benchmark) -> Result<()> {
+        // every variant is listed explicitly (no `_` arm) so that adding a new Benchmark
+        //   variant is a compile error here instead of a runtime panic
+        match benchmark {
+            Benchmark::Tpch(tpch_cfg) => self.load_tpch_data(tpch_cfg).await,
+            Benchmark::Test => unimplemented!(),
+        }
+    }
+
+    /// Recreates pgdata from scratch and loads the TPC-H tables described by `tpch_cfg` into
+    ///   a freshly created database named OPTD_DB_NAME.
+    /// Panics if a path is not valid UTF-8 or if the table file iterator can't be obtained.
+    async fn load_tpch_data(&self, tpch_cfg: &TpchConfig) -> Result<()> {
+        // clean slate: emptying pgdata stops postgres, so re-init and start it again
+        self.remove_pgdata().await?;
+        self.init_pgdata().await?;
+        self.start_postgres().await?;
+
+        // createdb should not fail since pgdata was just made fresh
+        let createdb_cmd = format!("createdb {}", OPTD_DB_NAME);
+        shell::run_command_with_status_check(&createdb_cmd)?;
+
+        // generate the table files, then load the schema
+        let tpch_kit = TpchKit::build(self.verbose)?;
+        tpch_kit.gen_tables(tpch_cfg)?;
+        let load_schema_cmd = format!(
+            "psql {} -f {}",
+            OPTD_DB_NAME,
+            tpch_kit.schema_fpath.to_str().unwrap()
+        );
+        shell::run_command_with_status_check(&load_schema_cmd)?;
+
+        // copy every table file into the table named after the file's stem
+        for tbl_fpath in tpch_kit.get_tbl_fpath_iter(tpch_cfg).unwrap() {
+            let tbl_name = tbl_fpath.file_stem().unwrap().to_str().unwrap();
+            let tbl_fpath_str = tbl_fpath.to_str().unwrap();
+            let copy_table_cmd = format!(
+                "\\copy {} from {} csv delimiter '|'",
+                tbl_name, tbl_fpath_str
+            );
+            let psql_cmd = format!("psql {} -c \"{}\"", OPTD_DB_NAME, copy_table_cmd);
+            shell::run_command_with_status_check(&psql_cmd)?;
+        }
+        Ok(())
     }
 }
 
@@ -16,7 +209,30 @@ impl CardtestRunnerDBHelper for PostgresDb {
         "Postgres"
     }
 
-    async fn load_database(&self, _benchmark: &Benchmark) -> anyhow::Result<()> {
+    /// Load the data of a benchmark with parameters
+    /// For read-only benchmarks the load is cached: a "<strid>_done" marker file in
+    ///   pgdata_dpath records that this exact benchmark and parameters are already loaded,
+    ///   in which case nothing is done. The marker is only written after a successful load
+    /// Benchmarks that are not read-only are always loaded again
+    async fn load_database(&self, benchmark: &Benchmark) -> anyhow::Result<()> {
+        if !benchmark.is_readonly() {
+            return self.load_benchmark_data_raw(benchmark).await;
+        }
+
+        let benchmark_strid = benchmark.get_strid();
+        let done_fpath = self
+            .pgdata_dpath
+            .join(format!("{}_done", benchmark_strid));
+        if done_fpath.exists() {
+            if self.verbose {
+                println!("skipped loading data for {}", benchmark_strid);
+            }
+            return Ok(());
+        }
+
+        if self.verbose {
+            println!("loading data for {}...", benchmark_strid);
+        }
+        self.load_benchmark_data_raw(benchmark).await?;
+        File::create(done_fpath)?;
         Ok(())
     }
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-use crate::cardtest::{Benchmark, CardtestRunnerDBHelper};`
	`1`	`+use crate::{benchmark::Benchmark, cardtest::CardtestRunnerDBHelper};`
`2`	`2`	`use async_trait::async_trait;`
`3`	`3`	`use optd_sqlplannertest::DatafusionDb;`
`4`	`4`
`@@ -14,6 +14,7 @@ impl CardtestRunnerDBHelper for DatafusionDb {`
`14`	`14`	`self.execute("CREATE TABLE t1 (c1 INT);", true).await?;`
`15`	`15`	`self.execute("INSERT INTO t1 VALUES (0);", true).await?;`
`16`	`16`	`}`
	`17`	`+ _ => unimplemented!(),`
`17`	`18`	`};`
`18`	`19`	`Ok(())`
`19`	`20`	`}`