@@ -315,9 +315,16 @@ pub trait Distribution: 'static + Send + Sync {
315
315
pub const ROW_COUNT : usize = 1 ;
316
316
pub const COMPUTE_COST : usize = 2 ;
317
317
pub const IO_COST : usize = 3 ;
318
- // used to indicate a combination of unimplemented!(), unreachable!(), or panic!()
319
- // TODO: a future PR will remove this and get the code working for all of TPC-H
320
- const INVALID_SELECTIVITY : f64 = 0.001 ;
318
+
319
+ // Default statistics. All are from selfuncs.h in Postgres unless specified otherwise
320
+ // Default selectivity estimate for equalities such as "A = b"
321
+ const DEFAULT_EQ_SEL : f64 = 0.005 ;
322
+ // Default selectivity estimate for inequalities such as "A < b"
323
+ const DEFAULT_INEQ_SEL : f64 = 0.3333333333333333 ;
324
+ // Default selectivity estimate for pattern-match operators such as LIKE
325
+ const DEFAULT_MATCH_SEL : f64 = 0.005 ;
326
+
327
+ const INVALID_SEL : f64 = 0.01 ;
321
328
322
329
impl < M : MostCommonValues , D : Distribution > OptCostModel < M , D > {
323
330
pub fn row_cnt ( Cost ( cost) : & Cost ) -> f64 {
@@ -421,10 +428,10 @@ impl<M: MostCommonValues, D: Distribution> CostModel<OptRelNodeTyp> for OptCostM
421
428
row_cnt. min ( fetch as f64 )
422
429
}
423
430
} else {
424
- ( row_cnt * INVALID_SELECTIVITY ) . max ( 1.0 )
431
+ panic ! ( "compute_cost() should not be called if optimizer is None" )
425
432
}
426
433
} else {
427
- ( row_cnt * INVALID_SELECTIVITY ) . max ( 1.0 )
434
+ panic ! ( "compute_cost() should not be called if context is None" )
428
435
} ;
429
436
Self :: cost ( row_cnt, compute_cost, 0.0 )
430
437
}
@@ -446,13 +453,13 @@ impl<M: MostCommonValues, D: Distribution> CostModel<OptRelNodeTyp> for OptCostM
446
453
if let Some ( expr_tree) = expr_trees. first ( ) {
447
454
self . get_filter_selectivity ( Arc :: clone ( expr_tree) , & column_refs)
448
455
} else {
449
- INVALID_SELECTIVITY
456
+ panic ! ( "encountered a PhysicalFilter without an expression" )
450
457
}
451
458
} else {
452
- INVALID_SELECTIVITY
459
+ panic ! ( "compute_cost() should not be called if optimizer is None" )
453
460
}
454
461
}
455
- None => INVALID_SELECTIVITY ,
462
+ None => panic ! ( "compute_cost() should not be called if context is None" ) ,
456
463
} ;
457
464
458
465
Self :: cost (
@@ -552,53 +559,73 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
552
559
column_refs : & GroupColumnRefs ,
553
560
) -> f64 {
554
561
assert ! ( expr_tree. typ. is_expression( ) ) ;
555
- match expr_tree. typ {
562
+ match & expr_tree. typ {
563
+ OptRelNodeTyp :: Constant ( _) => todo ! ( "check bool type or else panic" ) ,
564
+ OptRelNodeTyp :: ColumnRef => todo ! ( "check bool type or else panic" ) ,
565
+ OptRelNodeTyp :: UnOp ( un_op_typ) => {
566
+ assert ! ( expr_tree. children. len( ) == 1 ) ;
567
+ let child = expr_tree. child ( 0 ) ;
568
+ match un_op_typ {
569
+ // not doesn't care about nulls so there's no complex logic. it just reverses the selectivity
570
+ // for instance, != _will not_ include nulls but "NOT ==" _will_ include nulls
571
+ UnOpType :: Not => 1.0 - self . get_filter_selectivity ( child, column_refs) ,
572
+ UnOpType :: Neg => panic ! (
573
+ "the selectivity of operations that return numerical values is undefined"
574
+ ) ,
575
+ }
576
+ }
556
577
OptRelNodeTyp :: BinOp ( bin_op_typ) => {
557
578
assert ! ( expr_tree. children. len( ) == 2 ) ;
558
579
let left_child = expr_tree. child ( 0 ) ;
559
580
let right_child = expr_tree. child ( 1 ) ;
560
581
561
582
if bin_op_typ. is_comparison ( ) {
562
583
self . get_comparison_op_selectivity (
563
- bin_op_typ,
584
+ * bin_op_typ,
564
585
left_child,
565
586
right_child,
566
587
column_refs,
567
588
)
568
589
} else if bin_op_typ. is_numerical ( ) {
569
- INVALID_SELECTIVITY
590
+ panic ! (
591
+ "the selectivity of operations that return numerical values is undefined"
592
+ )
570
593
} else {
571
594
unreachable ! ( "all BinOpTypes should be true for at least one is_*() function" )
572
595
}
573
596
}
574
597
OptRelNodeTyp :: LogOp ( log_op_typ) => {
575
- self . get_log_op_selectivity ( log_op_typ, & expr_tree. children , column_refs)
598
+ self . get_log_op_selectivity ( * log_op_typ, & expr_tree. children , column_refs)
576
599
}
577
- OptRelNodeTyp :: UnOp ( un_op_typ ) => {
578
- assert ! ( expr_tree . children . len ( ) == 1 ) ;
579
- let child = expr_tree . child ( 0 ) ;
580
- match un_op_typ {
581
- // not doesn't care about nulls so there's no complex logic. it just reverses the selectivity
582
- // for instance, != _will not_ include nulls but "NOT ==" _will_ include nulls
583
- UnOpType :: Not => 1.0 - self . get_filter_selectivity ( child , column_refs ) ,
584
- _ => INVALID_SELECTIVITY ,
585
- }
600
+ OptRelNodeTyp :: Func ( _ ) => todo ! ( "check bool type or else panic" ) ,
601
+ OptRelNodeTyp :: SortOrder ( _ ) => {
602
+ panic ! ( "the selectivity of sort order expressions is undefined" )
603
+ }
604
+ OptRelNodeTyp :: Between => INVALID_SEL ,
605
+ OptRelNodeTyp :: Cast => todo ! ( "check bool type or else panic" ) ,
606
+ OptRelNodeTyp :: Like => DEFAULT_MATCH_SEL ,
607
+ OptRelNodeTyp :: DataType ( _ ) => {
608
+ panic ! ( "the selectivity of a data type is not defined" )
586
609
}
587
- _ => INVALID_SELECTIVITY ,
610
+ OptRelNodeTyp :: InList => INVALID_SEL ,
611
+ _ => unreachable ! (
612
+ "all expression OptRelNodeTyp were enumerated. this should be unreachable"
613
+ ) ,
588
614
}
589
615
}
590
616
591
617
/// Comparison operators are the base case for recursion in get_filter_selectivity()
592
618
fn get_comparison_op_selectivity (
593
619
& self ,
594
- bin_op_typ : BinOpType ,
620
+ comp_bin_op_typ : BinOpType ,
595
621
left : OptRelNodeRef ,
596
622
right : OptRelNodeRef ,
597
623
column_refs : & GroupColumnRefs ,
598
624
) -> f64 {
599
- assert ! ( bin_op_typ . is_comparison( ) ) ;
625
+ assert ! ( comp_bin_op_typ . is_comparison( ) ) ;
600
626
601
- // the # of column refs determines how we handle the logic
627
+ // it's more convenient to refer to the children based on whether they're column nodes or not
628
+ // rather than by left/right
602
629
let mut col_ref_nodes = vec ! [ ] ;
603
630
let mut non_col_ref_nodes = vec ! [ ] ;
604
631
let is_left_col_ref;
@@ -623,8 +650,9 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
623
650
non_col_ref_nodes. push ( right) ;
624
651
}
625
652
653
+ // handle the different cases of column nodes
626
654
if col_ref_nodes. is_empty ( ) {
627
- INVALID_SELECTIVITY
655
+ INVALID_SEL
628
656
} else if col_ref_nodes. len ( ) == 1 {
629
657
let col_ref_node = col_ref_nodes
630
658
. pop ( )
@@ -636,79 +664,98 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
636
664
. pop ( )
637
665
. expect ( "non_col_ref_nodes should have a value since col_ref_nodes.len() == 1" ) ;
638
666
639
- if let OptRelNodeTyp :: Constant ( _) = non_col_ref_node. as_ref ( ) . typ {
640
- let value = non_col_ref_node
641
- . as_ref ( )
642
- . data
643
- . as_ref ( )
644
- . expect ( "constants should have data" ) ;
645
- match match bin_op_typ {
646
- BinOpType :: Eq => {
647
- self . get_column_equality_selectivity ( table, * col_idx, value, true )
648
- }
649
- BinOpType :: Neq => {
650
- self . get_column_equality_selectivity ( table, * col_idx, value, false )
667
+ match non_col_ref_node. as_ref ( ) . typ {
668
+ OptRelNodeTyp :: Constant ( _) => {
669
+ let value = non_col_ref_node
670
+ . as_ref ( )
671
+ . data
672
+ . as_ref ( )
673
+ . expect ( "constants should have data" ) ;
674
+ match comp_bin_op_typ {
675
+ BinOpType :: Eq => {
676
+ self . get_column_equality_selectivity ( table, * col_idx, value, true )
677
+ }
678
+ BinOpType :: Neq => {
679
+ self . get_column_equality_selectivity ( table, * col_idx, value, false )
680
+ }
681
+ BinOpType :: Lt => self . get_column_range_selectivity (
682
+ table,
683
+ * col_idx,
684
+ value,
685
+ is_left_col_ref,
686
+ false ,
687
+ ) ,
688
+ BinOpType :: Leq => self . get_column_range_selectivity (
689
+ table,
690
+ * col_idx,
691
+ value,
692
+ is_left_col_ref,
693
+ true ,
694
+ ) ,
695
+ BinOpType :: Gt => self . get_column_range_selectivity (
696
+ table,
697
+ * col_idx,
698
+ value,
699
+ !is_left_col_ref,
700
+ false ,
701
+ ) ,
702
+ BinOpType :: Geq => self . get_column_range_selectivity (
703
+ table,
704
+ * col_idx,
705
+ value,
706
+ !is_left_col_ref,
707
+ true ,
708
+ ) ,
709
+ _ => unreachable ! ( "all comparison BinOpTypes were enumerated. this should be unreachable" ) ,
651
710
}
652
- BinOpType :: Lt => self . get_column_range_selectivity (
653
- table,
654
- * col_idx,
655
- value,
656
- is_left_col_ref,
657
- false ,
658
- ) ,
659
- BinOpType :: Leq => self . get_column_range_selectivity (
660
- table,
661
- * col_idx,
662
- value,
663
- is_left_col_ref,
664
- true ,
665
- ) ,
666
- BinOpType :: Gt => self . get_column_range_selectivity (
667
- table,
668
- * col_idx,
669
- value,
670
- !is_left_col_ref,
671
- false ,
672
- ) ,
673
- BinOpType :: Geq => self . get_column_range_selectivity (
674
- table,
675
- * col_idx,
676
- value,
677
- !is_left_col_ref,
678
- true ,
679
- ) ,
680
- _ => None ,
681
- } {
682
- Some ( sel) => sel,
683
- None => INVALID_SELECTIVITY ,
684
711
}
685
- } else {
686
- INVALID_SELECTIVITY
712
+ OptRelNodeTyp :: BinOp ( _) => {
713
+ Self :: get_default_comparison_op_selectivity ( comp_bin_op_typ)
714
+ }
715
+ OptRelNodeTyp :: Cast => INVALID_SEL ,
716
+ _ => unimplemented ! (
717
+ "unhandled case of comparing a column ref node to {}" ,
718
+ non_col_ref_node. as_ref( ) . typ
719
+ ) ,
687
720
}
688
721
} else {
689
- INVALID_SELECTIVITY
722
+ unimplemented ! ( "non base table column refs need to be implemented" )
690
723
}
691
724
} else if col_ref_nodes. len ( ) == 2 {
692
- INVALID_SELECTIVITY
725
+ Self :: get_default_comparison_op_selectivity ( comp_bin_op_typ )
693
726
} else {
694
- unreachable ! ( "We could have at most pushed left and right into col_ref_nodes" )
727
+ unreachable ! ( "we could have at most pushed left and right into col_ref_nodes" )
728
+ }
729
+ }
730
+
731
+ /// The default selectivity of a comparison expression
732
+ /// Used when one side of the comparison is a column while the other side is something too
733
+ /// complex/impossible to evaluate (subquery, UDF, another column, we have no stats, etc.)
734
+ fn get_default_comparison_op_selectivity ( comp_bin_op_typ : BinOpType ) -> f64 {
735
+ assert ! ( comp_bin_op_typ. is_comparison( ) ) ;
736
+ match comp_bin_op_typ {
737
+ BinOpType :: Eq => DEFAULT_EQ_SEL ,
738
+ BinOpType :: Neq => 1.0 - DEFAULT_EQ_SEL ,
739
+ BinOpType :: Lt | BinOpType :: Leq | BinOpType :: Gt | BinOpType :: Geq => DEFAULT_INEQ_SEL ,
740
+ _ => unreachable ! (
741
+ "all comparison BinOpTypes were enumerated. this should be unreachable"
742
+ ) ,
695
743
}
696
744
}
697
745
698
746
/// Get the selectivity of an expression of the form "column equals value" (or "value equals column")
699
- /// Computes selectivity based off of statistics
747
+ /// Will handle the case of statistics missing
700
748
/// Equality predicates are handled entirely differently from range predicates so this is its own function
701
749
/// Also, get_column_equality_selectivity is a subroutine when computing range selectivity, which is another
702
750
/// reason for separating these into two functions
703
- /// If it is unable to find the statistics, it returns None
704
751
/// is_eq means whether it's == or !=
705
752
fn get_column_equality_selectivity (
706
753
& self ,
707
754
table : & str ,
708
755
col_idx : usize ,
709
756
value : & Value ,
710
757
is_eq : bool ,
711
- ) -> Option < f64 > {
758
+ ) -> f64 {
712
759
if let Some ( per_table_stats) = self . per_table_stats_map . get ( table) {
713
760
if let Some ( Some ( per_column_stats) ) = per_table_stats. per_column_stats_vec . get ( col_idx)
714
761
{
@@ -722,16 +769,26 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
722
769
// note that nulls are not included in ndistinct so we don't need to do non_mcv_cnt - 1 if null_frac > 0
723
770
( non_mcv_freq - per_column_stats. null_frac ) / ( non_mcv_cnt as f64 )
724
771
} ;
725
- Some ( if is_eq {
772
+ if is_eq {
726
773
eq_freq
727
774
} else {
728
775
1.0 - eq_freq - per_column_stats. null_frac
729
- } )
776
+ }
730
777
} else {
731
- None
778
+ #[ allow( clippy:: collapsible_else_if) ]
779
+ if is_eq {
780
+ DEFAULT_EQ_SEL
781
+ } else {
782
+ 1.0 - DEFAULT_EQ_SEL
783
+ }
732
784
}
733
785
} else {
734
- None
786
+ #[ allow( clippy:: collapsible_else_if) ]
787
+ if is_eq {
788
+ DEFAULT_EQ_SEL
789
+ } else {
790
+ 1.0 - DEFAULT_EQ_SEL
791
+ }
735
792
}
736
793
}
737
794
@@ -748,7 +805,7 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
748
805
value : & Value ,
749
806
is_col_lt_val : bool ,
750
807
is_col_eq_val : bool ,
751
- ) -> Option < f64 > {
808
+ ) -> f64 {
752
809
if let Some ( per_table_stats) = self . per_table_stats_map . get ( table) {
753
810
if let Some ( Some ( per_column_stats) ) = per_table_stats. per_column_stats_vec . get ( col_idx)
754
811
{
@@ -764,12 +821,10 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
764
821
// depending on whether value is in mcvs or not, we use different logic to turn total_leq_cdf into total_lt_cdf
765
822
// this logic just so happens to be the exact same logic as get_column_equality_selectivity implements
766
823
let total_lt_freq = total_leq_freq
767
- - self
768
- . get_column_equality_selectivity ( table, col_idx, value, true )
769
- . expect ( "we already know that table and col_idx exist" ) ;
824
+ - self . get_column_equality_selectivity ( table, col_idx, value, true ) ;
770
825
771
826
// use either total_leq_freq or total_lt_freq to get the selectivity
772
- Some ( if is_col_lt_val {
827
+ if is_col_lt_val {
773
828
if is_col_eq_val {
774
829
// this branch means <=
775
830
total_leq_freq
@@ -788,12 +843,12 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
788
843
// this branch means >. same logic as above
789
844
1.0 - total_leq_freq - per_column_stats. null_frac
790
845
}
791
- } )
846
+ }
792
847
} else {
793
- None
848
+ DEFAULT_INEQ_SEL
794
849
}
795
850
} else {
796
- None
851
+ DEFAULT_INEQ_SEL
797
852
}
798
853
}
799
854
0 commit comments