//! A wrapper around the job-kit repo for running the Join Order Benchmark (JOB).

use serde::{Deserialize, Serialize};

use crate::shell;
use std::fmt::{self, Display, Formatter};
use std::fs::{self, File};
use std::io;
use std::path::{Path, PathBuf};

const JOB_KIT_REPO_URL: &str = "https://github.com/wangpatrick57/job-kit.git";
const JOB_TABLES_URL: &str = "https://homepages.cwi.nl/~boncz/job/imdb.tgz";

#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct JobConfig {
    pub query_ids: Vec<u32>,
}

impl Display for JobConfig {
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        // Use write! to format the config into `f`
        write!(f, "JobConfig(query_ids={:?})", self.query_ids)
    }
}

/// Provides helper functions for running a JOB workload.
/// It does not actually execute the queries, since it is meant to be DBMS-agnostic.
/// It is essentially a wrapper around the job-kit repo.
/// Since it's conceptually a wrapper around the repo, I chose _not_ to make
/// JobConfig an initialization parameter.
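///
/// # Examples
///
/// A minimal usage sketch (marked `ignore`: it clones the job-kit repo and
/// downloads the JOB tables, and the workspace path and query ids below are
/// purely illustrative):
///
/// ```ignore
/// fn run() -> std::io::Result<()> {
///     let kit = JobKit::build("/tmp/workspace")?;
///     let cfg = JobConfig {
///         query_ids: vec![1, 2, 3],
///     };
///     kit.download_tables(&cfg)?;
///     for tbl_fpath in kit.get_tbl_fpath_iter()? {
///         println!("downloaded table file: {}", tbl_fpath.display());
///     }
///     Ok(())
/// }
/// ```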
pub struct JobKit {
    _workspace_dpath: PathBuf,

    // cache these paths so we don't have to build them multiple times
    job_dpath: PathBuf,
    job_kit_repo_dpath: PathBuf,
    downloaded_tables_dpath: PathBuf,
    queries_dpath: PathBuf,
    pub schema_fpath: PathBuf,
    pub indexes_fpath: PathBuf,
}

impl JobKit {
    pub fn build<P: AsRef<Path>>(workspace_dpath: P) -> io::Result<Self> {
        log::debug!("[start] building JobKit");

        // build paths, creating directories if they don't exist
        let workspace_dpath = workspace_dpath.as_ref().to_path_buf();
        let job_dpath = workspace_dpath.join("job");
        if !job_dpath.exists() {
            fs::create_dir(&job_dpath)?;
        }
        let job_kit_repo_dpath = job_dpath.join("job-kit");
        let queries_dpath = job_kit_repo_dpath.join("queries");
        let downloaded_tables_dpath = job_dpath.join("downloaded_tables");
        if !downloaded_tables_dpath.exists() {
            fs::create_dir(&downloaded_tables_dpath)?;
        }
        let schema_fpath = job_kit_repo_dpath.join("schema.sql");
        let indexes_fpath = job_kit_repo_dpath.join("fkindexes.sql");

        // create Self
        let kit = JobKit {
            _workspace_dpath: workspace_dpath,
            job_dpath,
            job_kit_repo_dpath,
            queries_dpath,
            downloaded_tables_dpath,
            schema_fpath,
            indexes_fpath,
        };

        // setup: clone (or update) the job-kit repo
        shell::clonepull_repo(JOB_KIT_REPO_URL, &kit.job_kit_repo_dpath)?;

        log::debug!("[end] building JobKit");
        Ok(kit)
    }

    /// Download the .csv files for all tables of JOB
    pub fn download_tables(&self, job_config: &JobConfig) -> io::Result<()> {
        // a marker file records that the download already happened, so repeated calls are no-ops
        let done_fpath = self.downloaded_tables_dpath.join("download_tables_done");
        if !done_fpath.exists() {
            log::debug!("[start] downloading tables for {}", job_config);
            // Instructions are from https://cedardb.com/docs/guides/example_datasets/job/, not from the job-kit repo.
            shell::run_command_with_status_check_in_dir(
                &format!("curl -O {JOB_TABLES_URL}"),
                &self.job_dpath,
            )?;
            shell::make_into_empty_dir(&self.downloaded_tables_dpath)?;
            shell::run_command_with_status_check_in_dir(
                "tar -zxvf ../imdb.tgz",
                &self.downloaded_tables_dpath,
            )?;
            shell::run_command_with_status_check_in_dir("rm imdb.tgz", &self.job_dpath)?;
            File::create(done_fpath)?;
            log::debug!("[end] downloading tables for {}", job_config);
        } else {
            log::debug!("[skip] downloading tables for {}", job_config);
        }
        Ok(())
    }

    /// Convert a table file path (e.g. `title.csv`) into the table name (`title`).
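    ///
    /// A hypothetical doctest sketch (marked `ignore` since the full crate path to
    /// `JobKit` is assumed):
    ///
    /// ```ignore
    /// assert_eq!(
    ///     JobKit::get_tbl_name_from_tbl_fpath("downloaded_tables/title.csv"),
    ///     "title"
    /// );
    /// ```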
    pub fn get_tbl_name_from_tbl_fpath<P: AsRef<Path>>(tbl_fpath: P) -> String {
        tbl_fpath
            .as_ref()
            .file_stem()
            .unwrap()
            .to_str()
            .unwrap()
            .to_string()
    }

    /// Get an iterator over all downloaded .csv table files.
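    ///
    /// A usage sketch (not a real doctest; `kit` is assumed to come from
    /// `JobKit::build`, and the fragment assumes a function returning `io::Result`):
    ///
    /// ```ignore
    /// for tbl_fpath in kit.get_tbl_fpath_iter()? {
    ///     let tbl_name = JobKit::get_tbl_name_from_tbl_fpath(&tbl_fpath);
    ///     println!("{} -> {}", tbl_fpath.display(), tbl_name);
    /// }
    /// ```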
    pub fn get_tbl_fpath_iter(&self) -> io::Result<impl Iterator<Item = PathBuf>> {
        let dirent_iter = fs::read_dir(&self.downloaded_tables_dpath)?;
        // all Results/Options here are fine to unwrap, except path.extension(),
        // which can return None (e.g. for paths without an extension)
        let path_iter = dirent_iter.map(|dirent| dirent.unwrap().path());
        let tbl_fpath_iter = path_iter
            .filter(|path| path.extension().map(|ext| ext.to_str().unwrap()) == Some("csv"));
        Ok(tbl_fpath_iter)
    }

    /// Get an iterator over the .sql query files of a given config, _in order_.
    /// It's important to iterate _in order_ due to the interface of CardtestRunnerDBMSHelper.
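    ///
    /// A usage sketch (`kit` is assumed to come from `JobKit::build`, the query ids
    /// are illustrative, and the fragment assumes a function returning `io::Result`):
    ///
    /// ```ignore
    /// let cfg = JobConfig { query_ids: vec![1, 2, 3] };
    /// for (query_id, sql_fpath) in kit.get_sql_fpath_ordered_iter(&cfg)? {
    ///     println!("query {} lives at {}", query_id, sql_fpath.display());
    /// }
    /// ```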
    pub fn get_sql_fpath_ordered_iter(
        &self,
        job_config: &JobConfig,
    ) -> io::Result<impl Iterator<Item = (u32, PathBuf)>> {
        let queries_dpath = self.queries_dpath.clone();
        let sql_fpath_ordered_iter = job_config
            .query_ids
            .clone()
            .into_iter()
            .map(move |query_id| (query_id, queries_dpath.join(format!("{}.sql", query_id))));
        Ok(sql_fpath_ordered_iter)
    }
}