Skip to content
This repository was archived by the owner on Jan 7, 2025. It is now read-only.

Commit f42a3cd

Browse files
feat: using proper magic numbers in various edge cases (#143)
**Summary**: Using magic numbers from Postgres in various selectivity edge cases. **Demo**: Different (unfortunately worse) q-error on TPC-H SF1. See #127 for per-query details on how this PR affects q-error. ![Screenshot 2024-03-30 at 11 27 24](https://github.com/cmu-db/optd/assets/20631215/b0cce5d4-6ac8-4cd5-b0cf-48f86db14d26) **Details**: * Fixed the cardinality of Q10! * `INVALID_SEL` is **no longer used** at all during cardtest. It is still used during plannertest as some plannertests use the optd optimizer instead of the datafusion logical optimizer. This can be checked by replacing all instances of `INVALID_SEL` with a `panic!()` and seeing that cardtest still runs. * Using magic number from Postgres for `LIKE`. * Using magic number from Postgres for equality with various complex expressions. * Using magic number from Postgres for range comparison with various complex expressions. * Replaced `INVALID_SEL` with `panic!()` and `unreachable!()` statements in places where it makes sense.
1 parent bd40a10 commit f42a3cd

File tree

6 files changed

+167
-107
lines changed

6 files changed

+167
-107
lines changed

optd-datafusion-repr/src/cost/base_cost.rs

+145-90
Original file line numberDiff line numberDiff line change
@@ -315,9 +315,16 @@ pub trait Distribution: 'static + Send + Sync {
315315
pub const ROW_COUNT: usize = 1;
316316
pub const COMPUTE_COST: usize = 2;
317317
pub const IO_COST: usize = 3;
318-
// used to indicate a combination of unimplemented!(), unreachable!(), or panic!()
319-
// TODO: a future PR will remove this and get the code working for all of TPC-H
320-
const INVALID_SELECTIVITY: f64 = 0.001;
318+
319+
// Default statistics. All are from selfuncs.h in Postgres unless specified otherwise
320+
// Default selectivity estimate for equalities such as "A = b"
321+
const DEFAULT_EQ_SEL: f64 = 0.005;
322+
// Default selectivity estimate for inequalities such as "A < b"
323+
const DEFAULT_INEQ_SEL: f64 = 0.3333333333333333;
324+
// Default selectivity estimate for pattern-match operators such as LIKE
325+
const DEFAULT_MATCH_SEL: f64 = 0.005;
326+
327+
const INVALID_SEL: f64 = 0.01;
321328

322329
impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
323330
pub fn row_cnt(Cost(cost): &Cost) -> f64 {
@@ -421,10 +428,10 @@ impl<M: MostCommonValues, D: Distribution> CostModel<OptRelNodeTyp> for OptCostM
421428
row_cnt.min(fetch as f64)
422429
}
423430
} else {
424-
(row_cnt * INVALID_SELECTIVITY).max(1.0)
431+
panic!("compute_cost() should not be called if optimizer is None")
425432
}
426433
} else {
427-
(row_cnt * INVALID_SELECTIVITY).max(1.0)
434+
panic!("compute_cost() should not be called if context is None")
428435
};
429436
Self::cost(row_cnt, compute_cost, 0.0)
430437
}
@@ -446,13 +453,13 @@ impl<M: MostCommonValues, D: Distribution> CostModel<OptRelNodeTyp> for OptCostM
446453
if let Some(expr_tree) = expr_trees.first() {
447454
self.get_filter_selectivity(Arc::clone(expr_tree), &column_refs)
448455
} else {
449-
INVALID_SELECTIVITY
456+
panic!("encountered a PhysicalFilter without an expression")
450457
}
451458
} else {
452-
INVALID_SELECTIVITY
459+
panic!("compute_cost() should not be called if optimizer is None")
453460
}
454461
}
455-
None => INVALID_SELECTIVITY,
462+
None => panic!("compute_cost() should not be called if context is None"),
456463
};
457464

458465
Self::cost(
@@ -552,53 +559,73 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
552559
column_refs: &GroupColumnRefs,
553560
) -> f64 {
554561
assert!(expr_tree.typ.is_expression());
555-
match expr_tree.typ {
562+
match &expr_tree.typ {
563+
OptRelNodeTyp::Constant(_) => todo!("check bool type or else panic"),
564+
OptRelNodeTyp::ColumnRef => todo!("check bool type or else panic"),
565+
OptRelNodeTyp::UnOp(un_op_typ) => {
566+
assert!(expr_tree.children.len() == 1);
567+
let child = expr_tree.child(0);
568+
match un_op_typ {
569+
// not doesn't care about nulls so there's no complex logic. it just reverses the selectivity
570+
// for instance, != _will not_ include nulls but "NOT ==" _will_ include nulls
571+
UnOpType::Not => 1.0 - self.get_filter_selectivity(child, column_refs),
572+
UnOpType::Neg => panic!(
573+
"the selectivity of operations that return numerical values is undefined"
574+
),
575+
}
576+
}
556577
OptRelNodeTyp::BinOp(bin_op_typ) => {
557578
assert!(expr_tree.children.len() == 2);
558579
let left_child = expr_tree.child(0);
559580
let right_child = expr_tree.child(1);
560581

561582
if bin_op_typ.is_comparison() {
562583
self.get_comparison_op_selectivity(
563-
bin_op_typ,
584+
*bin_op_typ,
564585
left_child,
565586
right_child,
566587
column_refs,
567588
)
568589
} else if bin_op_typ.is_numerical() {
569-
INVALID_SELECTIVITY
590+
panic!(
591+
"the selectivity of operations that return numerical values is undefined"
592+
)
570593
} else {
571594
unreachable!("all BinOpTypes should be true for at least one is_*() function")
572595
}
573596
}
574597
OptRelNodeTyp::LogOp(log_op_typ) => {
575-
self.get_log_op_selectivity(log_op_typ, &expr_tree.children, column_refs)
598+
self.get_log_op_selectivity(*log_op_typ, &expr_tree.children, column_refs)
576599
}
577-
OptRelNodeTyp::UnOp(un_op_typ) => {
578-
assert!(expr_tree.children.len() == 1);
579-
let child = expr_tree.child(0);
580-
match un_op_typ {
581-
// not doesn't care about nulls so there's no complex logic. it just reverses the selectivity
582-
// for instance, != _will not_ include nulls but "NOT ==" _will_ include nulls
583-
UnOpType::Not => 1.0 - self.get_filter_selectivity(child, column_refs),
584-
_ => INVALID_SELECTIVITY,
585-
}
600+
OptRelNodeTyp::Func(_) => todo!("check bool type or else panic"),
601+
OptRelNodeTyp::SortOrder(_) => {
602+
panic!("the selectivity of sort order expressions is undefined")
603+
}
604+
OptRelNodeTyp::Between => INVALID_SEL,
605+
OptRelNodeTyp::Cast => todo!("check bool type or else panic"),
606+
OptRelNodeTyp::Like => DEFAULT_MATCH_SEL,
607+
OptRelNodeTyp::DataType(_) => {
608+
panic!("the selectivity of a data type is not defined")
586609
}
587-
_ => INVALID_SELECTIVITY,
610+
OptRelNodeTyp::InList => INVALID_SEL,
611+
_ => unreachable!(
612+
"all expression OptRelNodeTyp were enumerated. this should be unreachable"
613+
),
588614
}
589615
}
590616

591617
/// Comparison operators are the base case for recursion in get_filter_selectivity()
592618
fn get_comparison_op_selectivity(
593619
&self,
594-
bin_op_typ: BinOpType,
620+
comp_bin_op_typ: BinOpType,
595621
left: OptRelNodeRef,
596622
right: OptRelNodeRef,
597623
column_refs: &GroupColumnRefs,
598624
) -> f64 {
599-
assert!(bin_op_typ.is_comparison());
625+
assert!(comp_bin_op_typ.is_comparison());
600626

601-
// the # of column refs determines how we handle the logic
627+
// it's more convenient to refer to the children based on whether they're column nodes or not
628+
// rather than by left/right
602629
let mut col_ref_nodes = vec![];
603630
let mut non_col_ref_nodes = vec![];
604631
let is_left_col_ref;
@@ -623,8 +650,9 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
623650
non_col_ref_nodes.push(right);
624651
}
625652

653+
// handle the different cases of column nodes
626654
if col_ref_nodes.is_empty() {
627-
INVALID_SELECTIVITY
655+
INVALID_SEL
628656
} else if col_ref_nodes.len() == 1 {
629657
let col_ref_node = col_ref_nodes
630658
.pop()
@@ -636,79 +664,98 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
636664
.pop()
637665
.expect("non_col_ref_nodes should have a value since col_ref_nodes.len() == 1");
638666

639-
if let OptRelNodeTyp::Constant(_) = non_col_ref_node.as_ref().typ {
640-
let value = non_col_ref_node
641-
.as_ref()
642-
.data
643-
.as_ref()
644-
.expect("constants should have data");
645-
match match bin_op_typ {
646-
BinOpType::Eq => {
647-
self.get_column_equality_selectivity(table, *col_idx, value, true)
648-
}
649-
BinOpType::Neq => {
650-
self.get_column_equality_selectivity(table, *col_idx, value, false)
667+
match non_col_ref_node.as_ref().typ {
668+
OptRelNodeTyp::Constant(_) => {
669+
let value = non_col_ref_node
670+
.as_ref()
671+
.data
672+
.as_ref()
673+
.expect("constants should have data");
674+
match comp_bin_op_typ {
675+
BinOpType::Eq => {
676+
self.get_column_equality_selectivity(table, *col_idx, value, true)
677+
}
678+
BinOpType::Neq => {
679+
self.get_column_equality_selectivity(table, *col_idx, value, false)
680+
}
681+
BinOpType::Lt => self.get_column_range_selectivity(
682+
table,
683+
*col_idx,
684+
value,
685+
is_left_col_ref,
686+
false,
687+
),
688+
BinOpType::Leq => self.get_column_range_selectivity(
689+
table,
690+
*col_idx,
691+
value,
692+
is_left_col_ref,
693+
true,
694+
),
695+
BinOpType::Gt => self.get_column_range_selectivity(
696+
table,
697+
*col_idx,
698+
value,
699+
!is_left_col_ref,
700+
false,
701+
),
702+
BinOpType::Geq => self.get_column_range_selectivity(
703+
table,
704+
*col_idx,
705+
value,
706+
!is_left_col_ref,
707+
true,
708+
),
709+
_ => unreachable!("all comparison BinOpTypes were enumerated. this should be unreachable"),
651710
}
652-
BinOpType::Lt => self.get_column_range_selectivity(
653-
table,
654-
*col_idx,
655-
value,
656-
is_left_col_ref,
657-
false,
658-
),
659-
BinOpType::Leq => self.get_column_range_selectivity(
660-
table,
661-
*col_idx,
662-
value,
663-
is_left_col_ref,
664-
true,
665-
),
666-
BinOpType::Gt => self.get_column_range_selectivity(
667-
table,
668-
*col_idx,
669-
value,
670-
!is_left_col_ref,
671-
false,
672-
),
673-
BinOpType::Geq => self.get_column_range_selectivity(
674-
table,
675-
*col_idx,
676-
value,
677-
!is_left_col_ref,
678-
true,
679-
),
680-
_ => None,
681-
} {
682-
Some(sel) => sel,
683-
None => INVALID_SELECTIVITY,
684711
}
685-
} else {
686-
INVALID_SELECTIVITY
712+
OptRelNodeTyp::BinOp(_) => {
713+
Self::get_default_comparison_op_selectivity(comp_bin_op_typ)
714+
}
715+
OptRelNodeTyp::Cast => INVALID_SEL,
716+
_ => unimplemented!(
717+
"unhandled case of comparing a column ref node to {}",
718+
non_col_ref_node.as_ref().typ
719+
),
687720
}
688721
} else {
689-
INVALID_SELECTIVITY
722+
unimplemented!("non base table column refs need to be implemented")
690723
}
691724
} else if col_ref_nodes.len() == 2 {
692-
INVALID_SELECTIVITY
725+
Self::get_default_comparison_op_selectivity(comp_bin_op_typ)
693726
} else {
694-
unreachable!("We could have at most pushed left and right into col_ref_nodes")
727+
unreachable!("we could have at most pushed left and right into col_ref_nodes")
728+
}
729+
}
730+
731+
/// The default selectivity of a comparison expression
732+
/// Used when one side of the comparison is a column while the other side is something too
733+
/// complex/impossible to evaluate (subquery, UDF, another column, we have no stats, etc.)
734+
fn get_default_comparison_op_selectivity(comp_bin_op_typ: BinOpType) -> f64 {
735+
assert!(comp_bin_op_typ.is_comparison());
736+
match comp_bin_op_typ {
737+
BinOpType::Eq => DEFAULT_EQ_SEL,
738+
BinOpType::Neq => 1.0 - DEFAULT_EQ_SEL,
739+
BinOpType::Lt | BinOpType::Leq | BinOpType::Gt | BinOpType::Geq => DEFAULT_INEQ_SEL,
740+
_ => unreachable!(
741+
"all comparison BinOpTypes were enumerated. this should be unreachable"
742+
),
695743
}
696744
}
697745

698746
/// Get the selectivity of an expression of the form "column equals value" (or "value equals column")
699-
/// Computes selectivity based off of statistics
747+
/// Will handle the case of statistics missing
700748
/// Equality predicates are handled entirely differently from range predicates so this is its own function
701749
/// Also, get_column_equality_selectivity is a subroutine when computing range selectivity, which is another
702750
/// reason for separating these into two functions
703-
/// If it is unable to find the statistics, it returns None
704751
/// is_eq means whether it's == or !=
705752
fn get_column_equality_selectivity(
706753
&self,
707754
table: &str,
708755
col_idx: usize,
709756
value: &Value,
710757
is_eq: bool,
711-
) -> Option<f64> {
758+
) -> f64 {
712759
if let Some(per_table_stats) = self.per_table_stats_map.get(table) {
713760
if let Some(Some(per_column_stats)) = per_table_stats.per_column_stats_vec.get(col_idx)
714761
{
@@ -722,16 +769,26 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
722769
// note that nulls are not included in ndistinct so we don't need to do non_mcv_cnt - 1 if null_frac > 0
723770
(non_mcv_freq - per_column_stats.null_frac) / (non_mcv_cnt as f64)
724771
};
725-
Some(if is_eq {
772+
if is_eq {
726773
eq_freq
727774
} else {
728775
1.0 - eq_freq - per_column_stats.null_frac
729-
})
776+
}
730777
} else {
731-
None
778+
#[allow(clippy::collapsible_else_if)]
779+
if is_eq {
780+
DEFAULT_EQ_SEL
781+
} else {
782+
1.0 - DEFAULT_EQ_SEL
783+
}
732784
}
733785
} else {
734-
None
786+
#[allow(clippy::collapsible_else_if)]
787+
if is_eq {
788+
DEFAULT_EQ_SEL
789+
} else {
790+
1.0 - DEFAULT_EQ_SEL
791+
}
735792
}
736793
}
737794

@@ -748,7 +805,7 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
748805
value: &Value,
749806
is_col_lt_val: bool,
750807
is_col_eq_val: bool,
751-
) -> Option<f64> {
808+
) -> f64 {
752809
if let Some(per_table_stats) = self.per_table_stats_map.get(table) {
753810
if let Some(Some(per_column_stats)) = per_table_stats.per_column_stats_vec.get(col_idx)
754811
{
@@ -764,12 +821,10 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
764821
// depending on whether value is in mcvs or not, we use different logic to turn total_leq_cdf into total_lt_cdf
765822
// this logic just so happens to be the exact same logic as get_column_equality_selectivity implements
766823
let total_lt_freq = total_leq_freq
767-
- self
768-
.get_column_equality_selectivity(table, col_idx, value, true)
769-
.expect("we already know that table and col_idx exist");
824+
- self.get_column_equality_selectivity(table, col_idx, value, true);
770825

771826
// use either total_leq_freq or total_lt_freq to get the selectivity
772-
Some(if is_col_lt_val {
827+
if is_col_lt_val {
773828
if is_col_eq_val {
774829
// this branch means <=
775830
total_leq_freq
@@ -788,12 +843,12 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
788843
// this branch means >. same logic as above
789844
1.0 - total_leq_freq - per_column_stats.null_frac
790845
}
791-
})
846+
}
792847
} else {
793-
None
848+
DEFAULT_INEQ_SEL
794849
}
795850
} else {
796-
None
851+
DEFAULT_INEQ_SEL
797852
}
798853
}
799854

optd-perftest/src/cardtest.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -103,14 +103,14 @@ pub trait CardtestRunnerDBMSHelper {
103103

104104
pub async fn cardtest<P: AsRef<Path>>(
105105
workspace_dpath: P,
106-
use_cached_optd_stats: bool,
106+
no_cached_optd_stats: bool,
107107
pguser: &str,
108108
pgpassword: &str,
109109
tpch_config: TpchConfig,
110110
) -> anyhow::Result<HashMap<String, Vec<Cardinfo>>> {
111111
let pg_dbms = Box::new(PostgresDBMS::build(&workspace_dpath, pguser, pgpassword)?);
112112
let truecard_getter = pg_dbms.clone();
113-
let df_dbms = Box::new(DatafusionDBMS::new(&workspace_dpath, use_cached_optd_stats).await?);
113+
let df_dbms = Box::new(DatafusionDBMS::new(&workspace_dpath, no_cached_optd_stats).await?);
114114
let dbmss: Vec<Box<dyn CardtestRunnerDBMSHelper>> = vec![pg_dbms, df_dbms];
115115

116116
let tpch_benchmark = Benchmark::Tpch(tpch_config.clone());

0 commit comments

Comments
 (0)