1
+ use std:: ops:: Bound ;
1
2
use std:: { collections:: HashMap , sync:: Arc } ;
2
3
3
4
use crate :: plan_nodes:: {
@@ -985,34 +986,28 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
985
986
BinOpType :: Neq => {
986
987
self . get_column_equality_selectivity ( table, * col_idx, value, false )
987
988
}
988
- BinOpType :: Lt => self . get_column_range_selectivity (
989
- table,
990
- * col_idx,
991
- value,
992
- is_left_col_ref,
993
- false ,
994
- ) ,
995
- BinOpType :: Leq => self . get_column_range_selectivity (
996
- table,
997
- * col_idx,
998
- value,
999
- is_left_col_ref,
1000
- true ,
1001
- ) ,
1002
- BinOpType :: Gt => self . get_column_range_selectivity (
1003
- table,
1004
- * col_idx,
1005
- value,
1006
- !is_left_col_ref,
1007
- false ,
1008
- ) ,
1009
- BinOpType :: Geq => self . get_column_range_selectivity (
1010
- table,
1011
- * col_idx,
1012
- value,
1013
- !is_left_col_ref,
1014
- true ,
1015
- ) ,
989
+ BinOpType :: Lt | BinOpType :: Leq | BinOpType :: Gt | BinOpType :: Geq => {
990
+ let start = match ( comp_bin_op_typ, is_left_col_ref) {
991
+ ( BinOpType :: Lt , true ) | ( BinOpType :: Geq , false ) => Bound :: Unbounded ,
992
+ ( BinOpType :: Leq , true ) | ( BinOpType :: Gt , false ) => Bound :: Unbounded ,
993
+ ( BinOpType :: Gt , true ) | ( BinOpType :: Leq , false ) => Bound :: Excluded ( value) ,
994
+ ( BinOpType :: Geq , true ) | ( BinOpType :: Lt , false ) => Bound :: Included ( value) ,
995
+ _ => unreachable ! ( "all comparison BinOpTypes were enumerated. this should be unreachable" ) ,
996
+ } ;
997
+ let end = match ( comp_bin_op_typ, is_left_col_ref) {
998
+ ( BinOpType :: Lt , true ) | ( BinOpType :: Geq , false ) => Bound :: Excluded ( value) ,
999
+ ( BinOpType :: Leq , true ) | ( BinOpType :: Gt , false ) => Bound :: Included ( value) ,
1000
+ ( BinOpType :: Gt , true ) | ( BinOpType :: Leq , false ) => Bound :: Unbounded ,
1001
+ ( BinOpType :: Geq , true ) | ( BinOpType :: Lt , false ) => Bound :: Unbounded ,
1002
+ _ => unreachable ! ( "all comparison BinOpTypes were enumerated. this should be unreachable" ) ,
1003
+ } ;
1004
+ self . get_column_range_selectivity (
1005
+ table,
1006
+ * col_idx,
1007
+ start,
1008
+ end,
1009
+ )
1010
+ } ,
1016
1011
_ => unreachable ! ( "all comparison BinOpTypes were enumerated. this should be unreachable" ) ,
1017
1012
}
1018
1013
}
@@ -1148,56 +1143,61 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
1148
1143
}
1149
1144
}
1150
1145
1151
- /// Get the selectivity of an expression of the form "column </<=/>=/> value" (or "value </<=/>=/> column")
1152
- /// Computes selectivity based off of statistics
1153
- /// Range predicates are handled entirely differently from equality predicates so this is its own function
1154
- /// If it is unable to find the statistics, it returns None
1155
- /// Like in the Postgres source code, we decompose the four operators "</<=/>=/>" into "is_lt" and "is_eq"
1156
- /// The "is_lt" and "is_eq" values are set as if column is on the left hand side
1157
- fn get_column_range_selectivity (
1146
+ /// Compute the frequency of values in a column less than or equal to the given value.
1147
+ fn get_column_leq_value_freq ( per_column_stats : & PerColumnStats < M , D > , value : & Value ) -> f64 {
1148
+ // because distr does not include the values in MCVs, we need to compute the CDFs there as well
1149
+ // because nulls return false in any comparison, they are never included when computing range selectivity
1150
+ let distr_leq_freq = per_column_stats. distr . cdf ( value) ;
1151
+ let value = value. clone ( ) ;
1152
+ let pred = Box :: new ( move |val : & Value | val <= & value) ;
1153
+ let mcvs_leq_freq = per_column_stats. mcvs . freq_over_pred ( pred) ;
1154
+ distr_leq_freq + mcvs_leq_freq
1155
+ }
1156
+
1157
+ /// Compute the frequency of values in a column less than the given value.
1158
+ fn get_column_lt_value_freq (
1158
1159
& self ,
1160
+ per_column_stats : & PerColumnStats < M , D > ,
1159
1161
table : & str ,
1160
1162
col_idx : usize ,
1161
1163
value : & Value ,
1162
- is_col_lt_val : bool ,
1163
- is_col_eq_val : bool ,
1164
+ ) -> f64 {
1165
+ // depending on whether value is in mcvs or not, we use different logic to turn total_lt_cdf into total_leq_cdf
1166
+ // this logic just so happens to be the exact same logic as get_column_equality_selectivity implements
1167
+ Self :: get_column_leq_value_freq ( per_column_stats, value)
1168
+ - self . get_column_equality_selectivity ( table, col_idx, value, true )
1169
+ }
1170
+
1171
+ /// Get the selectivity of an expression of the form "column </<=/>=/> value" (or "value </<=/>=/> column").
1172
+ /// Computes selectivity based off of statistics.
1173
+ /// Range predicates are handled entirely differently from equality predicates so this is its own function.
1174
+ /// If it is unable to find the statistics, it returns DEFAULT_INEQ_SEL.
1175
+ /// The selectivity is computed as quantile of the right bound minus quantile of the left bound.
1176
+ fn get_column_range_selectivity (
1177
+ & self ,
1178
+ table : & str ,
1179
+ col_idx : usize ,
1180
+ start : Bound < & Value > ,
1181
+ end : Bound < & Value > ,
1164
1182
) -> f64 {
1165
1183
if let Some ( per_column_stats) = self . get_per_column_stats ( table, col_idx) {
1166
- // because distr does not include the values in MCVs, we need to compute the CDFs there as well
1167
- // because nulls return false in any comparison, they are never included when computing range selectivity
1168
- let distr_leq_freq = per_column_stats. distr . cdf ( value) ;
1169
- let value_clone = value. clone ( ) ; // clone the value so that we can move it into the closure to avoid lifetime issues
1170
- // TODO: in a future PR, figure out how to make Values comparable. rn I just hardcoded as_i32() to work around this
1171
- let pred = Box :: new ( move |val : & Value | val. as_i32 ( ) <= value_clone. as_i32 ( ) ) ;
1172
- let mcvs_leq_freq = per_column_stats. mcvs . freq_over_pred ( pred) ;
1173
- let total_leq_freq = distr_leq_freq + mcvs_leq_freq;
1174
-
1175
- // depending on whether value is in mcvs or not, we use different logic to turn total_leq_cdf into total_lt_cdf
1176
- // this logic just so happens to be the exact same logic as get_column_equality_selectivity implements
1177
- let total_lt_freq =
1178
- total_leq_freq - self . get_column_equality_selectivity ( table, col_idx, value, true ) ;
1179
-
1180
- // use either total_leq_freq or total_lt_freq to get the selectivity
1181
- if is_col_lt_val {
1182
- if is_col_eq_val {
1183
- // this branch means <=
1184
- total_leq_freq
1185
- } else {
1186
- // this branch means <
1187
- total_lt_freq
1184
+ let left_quantile = match start {
1185
+ Bound :: Unbounded => 0.0 ,
1186
+ Bound :: Included ( value) => {
1187
+ self . get_column_lt_value_freq ( per_column_stats, table, col_idx, value)
1188
1188
}
1189
- } else {
1190
- // clippy wants me to collapse this into an else if, but keeping two nested if else statements is clearer
1191
- #[ allow( clippy:: collapsible_else_if) ]
1192
- if is_col_eq_val {
1193
- // this branch means >=, which is 1 - < - null_frac
1194
- // we need to subtract null_frac since that isn't included in >= either
1195
- 1.0 - total_lt_freq - per_column_stats. null_frac
1196
- } else {
1197
- // this branch means >. same logic as above
1198
- 1.0 - total_leq_freq - per_column_stats. null_frac
1189
+ Bound :: Excluded ( value) => Self :: get_column_leq_value_freq ( per_column_stats, value) ,
1190
+ } ;
1191
+ let right_quantile = match end {
1192
+ Bound :: Unbounded => 1.0 ,
1193
+ Bound :: Included ( value) => Self :: get_column_leq_value_freq ( per_column_stats, value) ,
1194
+ Bound :: Excluded ( value) => {
1195
+ self . get_column_lt_value_freq ( per_column_stats, table, col_idx, value)
1199
1196
}
1200
- }
1197
+ } ;
1198
+ assert ! ( left_quantile <= right_quantile) ;
1199
+ // `Distribution` does not account for NULL values, so the selectivity is smaller than frequency.
1200
+ ( right_quantile - left_quantile) * ( 1.0 - per_column_stats. null_frac )
1201
1201
} else {
1202
1202
DEFAULT_INEQ_SEL
1203
1203
}
@@ -1541,7 +1541,7 @@ mod tests {
1541
1541
TestDistribution :: new ( vec ! [ ( Value :: Int32 ( 15 ) , 0.7 ) ] ) ,
1542
1542
) ) ;
1543
1543
let expr_tree = bin_op ( BinOpType :: Leq , col_ref ( 0 ) , cnst ( Value :: Int32 ( 15 ) ) ) ;
1544
- let expr_tree_rev = bin_op ( BinOpType :: Geq , cnst ( Value :: Int32 ( 15 ) ) , col_ref ( 0 ) ) ;
1544
+ let expr_tree_rev = bin_op ( BinOpType :: Gt , cnst ( Value :: Int32 ( 15 ) ) , col_ref ( 0 ) ) ;
1545
1545
let column_refs = vec ! [ ColumnRef :: BaseTableColumnRef {
1546
1546
table: String :: from( TABLE1_NAME ) ,
1547
1547
col_idx: 0 ,
@@ -1565,18 +1565,18 @@ mod tests {
1565
1565
TestDistribution :: new ( vec ! [ ( Value :: Int32 ( 15 ) , 0.7 ) ] ) ,
1566
1566
) ) ;
1567
1567
let expr_tree = bin_op ( BinOpType :: Leq , col_ref ( 0 ) , cnst ( Value :: Int32 ( 15 ) ) ) ;
1568
- let expr_tree_rev = bin_op ( BinOpType :: Geq , cnst ( Value :: Int32 ( 15 ) ) , col_ref ( 0 ) ) ;
1568
+ let expr_tree_rev = bin_op ( BinOpType :: Gt , cnst ( Value :: Int32 ( 15 ) ) , col_ref ( 0 ) ) ;
1569
1569
let column_refs = vec ! [ ColumnRef :: BaseTableColumnRef {
1570
1570
table: String :: from( TABLE1_NAME ) ,
1571
1571
col_idx: 0 ,
1572
1572
} ] ;
1573
1573
assert_approx_eq:: assert_approx_eq!(
1574
1574
cost_model. get_filter_selectivity( expr_tree, & column_refs) ,
1575
- 0.7
1575
+ 0.7 * 0.9
1576
1576
) ;
1577
1577
assert_approx_eq:: assert_approx_eq!(
1578
1578
cost_model. get_filter_selectivity( expr_tree_rev, & column_refs) ,
1579
- 0.7
1579
+ 0.7 * 0.9
1580
1580
) ;
1581
1581
}
1582
1582
@@ -1598,7 +1598,7 @@ mod tests {
1598
1598
TestDistribution :: new ( vec ! [ ( Value :: Int32 ( 15 ) , 0.7 ) ] ) ,
1599
1599
) ) ;
1600
1600
let expr_tree = bin_op ( BinOpType :: Leq , col_ref ( 0 ) , cnst ( Value :: Int32 ( 15 ) ) ) ;
1601
- let expr_tree_rev = bin_op ( BinOpType :: Geq , cnst ( Value :: Int32 ( 15 ) ) , col_ref ( 0 ) ) ;
1601
+ let expr_tree_rev = bin_op ( BinOpType :: Gt , cnst ( Value :: Int32 ( 15 ) ) , col_ref ( 0 ) ) ;
1602
1602
let column_refs = vec ! [ ColumnRef :: BaseTableColumnRef {
1603
1603
table: String :: from( TABLE1_NAME ) ,
1604
1604
col_idx: 0 ,
@@ -1627,7 +1627,7 @@ mod tests {
1627
1627
TestDistribution :: new ( vec ! [ ( Value :: Int32 ( 15 ) , 0.7 ) ] ) ,
1628
1628
) ) ;
1629
1629
let expr_tree = bin_op ( BinOpType :: Leq , col_ref ( 0 ) , cnst ( Value :: Int32 ( 15 ) ) ) ;
1630
- let expr_tree_rev = bin_op ( BinOpType :: Geq , cnst ( Value :: Int32 ( 15 ) ) , col_ref ( 0 ) ) ;
1630
+ let expr_tree_rev = bin_op ( BinOpType :: Gt , cnst ( Value :: Int32 ( 15 ) ) , col_ref ( 0 ) ) ;
1631
1631
let column_refs = vec ! [ ColumnRef :: BaseTableColumnRef {
1632
1632
table: String :: from( TABLE1_NAME ) ,
1633
1633
col_idx: 0 ,
@@ -1651,7 +1651,7 @@ mod tests {
1651
1651
TestDistribution :: new ( vec ! [ ( Value :: Int32 ( 15 ) , 0.7 ) ] ) ,
1652
1652
) ) ;
1653
1653
let expr_tree = bin_op ( BinOpType :: Lt , col_ref ( 0 ) , cnst ( Value :: Int32 ( 15 ) ) ) ;
1654
- let expr_tree_rev = bin_op ( BinOpType :: Gt , cnst ( Value :: Int32 ( 15 ) ) , col_ref ( 0 ) ) ;
1654
+ let expr_tree_rev = bin_op ( BinOpType :: Geq , cnst ( Value :: Int32 ( 15 ) ) , col_ref ( 0 ) ) ;
1655
1655
let column_refs = vec ! [ ColumnRef :: BaseTableColumnRef {
1656
1656
table: String :: from( TABLE1_NAME ) ,
1657
1657
col_idx: 0 ,
@@ -1675,18 +1675,18 @@ mod tests {
1675
1675
TestDistribution :: new ( vec ! [ ( Value :: Int32 ( 15 ) , 0.7 ) ] ) ,
1676
1676
) ) ;
1677
1677
let expr_tree = bin_op ( BinOpType :: Lt , col_ref ( 0 ) , cnst ( Value :: Int32 ( 15 ) ) ) ;
1678
- let expr_tree_rev = bin_op ( BinOpType :: Gt , cnst ( Value :: Int32 ( 15 ) ) , col_ref ( 0 ) ) ;
1678
+ let expr_tree_rev = bin_op ( BinOpType :: Geq , cnst ( Value :: Int32 ( 15 ) ) , col_ref ( 0 ) ) ;
1679
1679
let column_refs = vec ! [ ColumnRef :: BaseTableColumnRef {
1680
1680
table: String :: from( TABLE1_NAME ) ,
1681
1681
col_idx: 0 ,
1682
1682
} ] ;
1683
1683
assert_approx_eq:: assert_approx_eq!(
1684
1684
cost_model. get_filter_selectivity( expr_tree, & column_refs) ,
1685
- 0.6
1685
+ 0.6 * 0.9
1686
1686
) ;
1687
1687
assert_approx_eq:: assert_approx_eq!(
1688
1688
cost_model. get_filter_selectivity( expr_tree_rev, & column_refs) ,
1689
- 0.6
1689
+ 0.6 * 0.9
1690
1690
) ;
1691
1691
}
1692
1692
@@ -1708,7 +1708,7 @@ mod tests {
1708
1708
TestDistribution :: new ( vec ! [ ( Value :: Int32 ( 15 ) , 0.7 ) ] ) ,
1709
1709
) ) ;
1710
1710
let expr_tree = bin_op ( BinOpType :: Lt , col_ref ( 0 ) , cnst ( Value :: Int32 ( 15 ) ) ) ;
1711
- let expr_tree_rev = bin_op ( BinOpType :: Gt , cnst ( Value :: Int32 ( 15 ) ) , col_ref ( 0 ) ) ;
1711
+ let expr_tree_rev = bin_op ( BinOpType :: Geq , cnst ( Value :: Int32 ( 15 ) ) , col_ref ( 0 ) ) ;
1712
1712
// TODO(phw2): make column_refs a function
1713
1713
let column_refs = vec ! [ ColumnRef :: BaseTableColumnRef {
1714
1714
table: String :: from( TABLE1_NAME ) ,
@@ -1742,7 +1742,7 @@ mod tests {
1742
1742
TestDistribution :: new ( vec ! [ ( Value :: Int32 ( 15 ) , 0.7 ) ] ) ,
1743
1743
) ) ;
1744
1744
let expr_tree = bin_op ( BinOpType :: Lt , col_ref ( 0 ) , cnst ( Value :: Int32 ( 15 ) ) ) ;
1745
- let expr_tree_rev = bin_op ( BinOpType :: Gt , cnst ( Value :: Int32 ( 15 ) ) , col_ref ( 0 ) ) ;
1745
+ let expr_tree_rev = bin_op ( BinOpType :: Geq , cnst ( Value :: Int32 ( 15 ) ) , col_ref ( 0 ) ) ;
1746
1746
let column_refs = vec ! [ ColumnRef :: BaseTableColumnRef {
1747
1747
table: String :: from( TABLE1_NAME ) ,
1748
1748
col_idx: 0 ,
@@ -1768,7 +1768,7 @@ mod tests {
1768
1768
TestDistribution :: new ( vec ! [ ( Value :: Int32 ( 15 ) , 0.7 ) ] ) ,
1769
1769
) ) ;
1770
1770
let expr_tree = bin_op ( BinOpType :: Gt , col_ref ( 0 ) , cnst ( Value :: Int32 ( 15 ) ) ) ;
1771
- let expr_tree_rev = bin_op ( BinOpType :: Lt , cnst ( Value :: Int32 ( 15 ) ) , col_ref ( 0 ) ) ;
1771
+ let expr_tree_rev = bin_op ( BinOpType :: Leq , cnst ( Value :: Int32 ( 15 ) ) , col_ref ( 0 ) ) ;
1772
1772
let column_refs = vec ! [ ColumnRef :: BaseTableColumnRef {
1773
1773
table: String :: from( TABLE1_NAME ) ,
1774
1774
col_idx: 0 ,
@@ -1792,19 +1792,18 @@ mod tests {
1792
1792
TestDistribution :: new ( vec ! [ ( Value :: Int32 ( 15 ) , 0.7 ) ] ) ,
1793
1793
) ) ;
1794
1794
let expr_tree = bin_op ( BinOpType :: Gt , col_ref ( 0 ) , cnst ( Value :: Int32 ( 15 ) ) ) ;
1795
- let expr_tree_rev = bin_op ( BinOpType :: Lt , cnst ( Value :: Int32 ( 15 ) ) , col_ref ( 0 ) ) ;
1795
+ let expr_tree_rev = bin_op ( BinOpType :: Leq , cnst ( Value :: Int32 ( 15 ) ) , col_ref ( 0 ) ) ;
1796
1796
let column_refs = vec ! [ ColumnRef :: BaseTableColumnRef {
1797
1797
table: String :: from( TABLE1_NAME ) ,
1798
1798
col_idx: 0 ,
1799
1799
} ] ;
1800
- // we have to subtract 0.1 since we don't want to include them in GT either
1801
1800
assert_approx_eq:: assert_approx_eq!(
1802
1801
cost_model. get_filter_selectivity( expr_tree, & column_refs) ,
1803
- 1.0 - 0.7 - 0.1
1802
+ ( 1.0 - 0.7 ) * 0.9
1804
1803
) ;
1805
1804
assert_approx_eq:: assert_approx_eq!(
1806
1805
cost_model. get_filter_selectivity( expr_tree_rev, & column_refs) ,
1807
- 1.0 - 0.7 - 0.1
1806
+ ( 1.0 - 0.7 ) * 0.9
1808
1807
) ;
1809
1808
}
1810
1809
@@ -1818,7 +1817,7 @@ mod tests {
1818
1817
TestDistribution :: new ( vec ! [ ( Value :: Int32 ( 15 ) , 0.7 ) ] ) ,
1819
1818
) ) ;
1820
1819
let expr_tree = bin_op ( BinOpType :: Geq , col_ref ( 0 ) , cnst ( Value :: Int32 ( 15 ) ) ) ;
1821
- let expr_tree_rev = bin_op ( BinOpType :: Leq , cnst ( Value :: Int32 ( 15 ) ) , col_ref ( 0 ) ) ;
1820
+ let expr_tree_rev = bin_op ( BinOpType :: Lt , cnst ( Value :: Int32 ( 15 ) ) , col_ref ( 0 ) ) ;
1822
1821
let column_refs = vec ! [ ColumnRef :: BaseTableColumnRef {
1823
1822
table: String :: from( TABLE1_NAME ) ,
1824
1823
col_idx: 0 ,
@@ -1842,19 +1841,19 @@ mod tests {
1842
1841
TestDistribution :: new ( vec ! [ ( Value :: Int32 ( 15 ) , 0.7 ) ] ) ,
1843
1842
) ) ;
1844
1843
let expr_tree = bin_op ( BinOpType :: Geq , col_ref ( 0 ) , cnst ( Value :: Int32 ( 15 ) ) ) ;
1845
- let expr_tree_rev = bin_op ( BinOpType :: Leq , cnst ( Value :: Int32 ( 15 ) ) , col_ref ( 0 ) ) ;
1844
+ let expr_tree_rev = bin_op ( BinOpType :: Lt , cnst ( Value :: Int32 ( 15 ) ) , col_ref ( 0 ) ) ;
1846
1845
let column_refs = vec ! [ ColumnRef :: BaseTableColumnRef {
1847
1846
table: String :: from( TABLE1_NAME ) ,
1848
1847
col_idx: 0 ,
1849
1848
} ] ;
1850
- // we have to subtract 0.1 since we don't want to include them in GT either
1849
+ // we have to add 0.1 since it's Geq
1851
1850
assert_approx_eq:: assert_approx_eq!(
1852
1851
cost_model. get_filter_selectivity( expr_tree, & column_refs) ,
1853
- 1.0 - 0.6 - 0.1
1852
+ ( 1.0 - 0.7 + 0.1 ) * 0.9
1854
1853
) ;
1855
1854
assert_approx_eq:: assert_approx_eq!(
1856
1855
cost_model. get_filter_selectivity( expr_tree_rev, & column_refs) ,
1857
- 1.0 - 0.6 - 0.1
1856
+ ( 1.0 - 0.7 + 0.1 ) * 0.9
1858
1857
) ;
1859
1858
}
1860
1859
0 commit comments