Skip to content
This repository was archived by the owner on Jan 7, 2025. It is now read-only.

Commit ed5eba7

Browse files
Feat: flatten filter and join cond to list of expr, simplify filter rule (#108)
# Major Changes - BinOpExpr in Join cond and filter cond is changed to LogOpExpr - LogOpExpr child is not a single ExprList but multiple exprs as children (which is different from previous logOpExpr) - ColumnRef and Schema properties changed for LogOpExpr # Design Previously, LogOpExpr is designed to be `children: vec![ExprList]` which means the LogOpExpr only has one child of ExprList. And condition expressions in filter and join from datafusion will be converted to BinOpExpr, which means the LogOpExpr actually does not exist in optd. After converting conditions from datafusion to be LogOpExpr, the design of one child LogOpExpr will influence the ProjectPullUpJoinRule and other rules with schema recalculated as these rules simply traverse the expr's children in its relNode format, it will treat LogOpExpr children as only one child (ExprList), and when it further traverses ExprList it has no idea whether it is a LogOp list or a normal list. So current design change the LogOpExpr to include multiple children to represent its real children size in RelNode format. It has further impact on cost model and properties builder logic as previously List is the only type in Optd which contains multiple children. About the pickMany stuff, it won't change the core's logic no matter it has one or many children as we only uses as join cond or filter cond, their cond is only one expr which is LogOpExpr and LogOpExpr is expanded and returned with all of it children. So the LogOpExpr already play the role as a one child wrapper(like ExprList), no need to add a ExprList. #TODO - [x] currently I simply copy properties derive and build rules for List as the rules for LogOpExpr, please check whether it is aligned with cost model's logic - [x] the change break the selectivity infer rules and cost model calculation so it cannot find a best group bindings for several cases. Please update the selectivity calculation for logOpExpr. --------- Signed-off-by: AveryQi115 <[email protected]> Co-authored-by: Patrick Wang <[email protected]>
1 parent 7f87289 commit ed5eba7

File tree

17 files changed

+1124
-794
lines changed

17 files changed

+1124
-794
lines changed

datafusion-optd-cli/tpch-sf0_01/simple_manual_test.sql

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,4 @@ CREATE TABLE NATION (
1010
CREATE EXTERNAL TABLE nation_tbl STORED AS CSV DELIMITER '|' LOCATION 'datafusion-optd-cli/tpch-sf0_01/nation.tbl';
1111
insert into nation select column_1, column_2, column_3, column_4 from nation_tbl;
1212

13-
SELECT * FROM nation where nation.n_nationkey = 1;
13+
SELECT * FROM nation where nation.n_nationkey = 1 OR nation.n_nationkey = 2 OR nation.n_nationkey = 5;

optd-datafusion-bridge/src/from_optd.rs

+27-10
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,10 @@ use optd_core::rel_node::RelNodeMetaMap;
2525
use optd_datafusion_repr::{
2626
plan_nodes::{
2727
BetweenExpr, BinOpExpr, BinOpType, CastExpr, ColumnRefExpr, ConstantExpr, ConstantType,
28-
Expr, FuncExpr, FuncType, InListExpr, JoinType, LikeExpr, OptRelNode, OptRelNodeRef,
29-
OptRelNodeTyp, PhysicalAgg, PhysicalEmptyRelation, PhysicalFilter, PhysicalHashJoin,
30-
PhysicalLimit, PhysicalNestedLoopJoin, PhysicalProjection, PhysicalScan, PhysicalSort,
31-
PlanNode, SortOrderExpr, SortOrderType,
28+
Expr, ExprList, FuncExpr, FuncType, InListExpr, JoinType, LikeExpr, LogOpExpr, LogOpType,
29+
OptRelNode, OptRelNodeRef, OptRelNodeTyp, PhysicalAgg, PhysicalEmptyRelation,
30+
PhysicalFilter, PhysicalHashJoin, PhysicalLimit, PhysicalNestedLoopJoin,
31+
PhysicalProjection, PhysicalScan, PhysicalSort, PlanNode, SortOrderExpr, SortOrderType,
3232
},
3333
properties::schema::Schema as OptdSchema,
3434
};
@@ -196,6 +196,23 @@ impl OptdPlanContext<'_> {
196196
}
197197
}
198198
OptRelNodeTyp::Sort => unreachable!(),
199+
OptRelNodeTyp::LogOp(typ) => {
200+
let expr = LogOpExpr::from_rel_node(expr.into_rel_node()).unwrap();
201+
let mut children = expr.children().into_iter();
202+
let first_expr = Self::conv_from_optd_expr(children.next().unwrap(), context)?;
203+
let op = match typ {
204+
LogOpType::And => Operator::And,
205+
LogOpType::Or => Operator::Or,
206+
};
207+
children.try_fold(first_expr, |acc, expr| {
208+
let expr = Self::conv_from_optd_expr(expr, context)?;
209+
Ok(
210+
Arc::new(datafusion::physical_plan::expressions::BinaryExpr::new(
211+
acc, op, expr,
212+
)) as Arc<dyn PhysicalExpr>,
213+
)
214+
})
215+
}
199216
OptRelNodeTyp::BinOp(op) => {
200217
let expr = BinOpExpr::from_rel_node(expr.into_rel_node()).unwrap();
201218
let left = Self::conv_from_optd_expr(expr.left_child(), context)?;
@@ -207,8 +224,6 @@ impl OptdPlanContext<'_> {
207224
BinOpType::Lt => Operator::Lt,
208225
BinOpType::Geq => Operator::GtEq,
209226
BinOpType::Gt => Operator::Gt,
210-
BinOpType::And => Operator::And,
211-
BinOpType::Or => Operator::Or,
212227
BinOpType::Add => Operator::Plus,
213228
BinOpType::Sub => Operator::Minus,
214229
BinOpType::Mul => Operator::Multiply,
@@ -225,10 +240,12 @@ impl OptdPlanContext<'_> {
225240
// TODO: should we just convert between to x <= c1 and x >= c2?
226241
let expr = BetweenExpr::from_rel_node(expr.into_rel_node()).unwrap();
227242
Self::conv_from_optd_expr(
228-
BinOpExpr::new(
229-
BinOpExpr::new(expr.child(), expr.lower(), BinOpType::Geq).into_expr(),
230-
BinOpExpr::new(expr.child(), expr.upper(), BinOpType::Leq).into_expr(),
231-
BinOpType::And,
243+
LogOpExpr::new(
244+
LogOpType::And,
245+
ExprList::new(vec![
246+
BinOpExpr::new(expr.child(), expr.lower(), BinOpType::Geq).into_expr(),
247+
BinOpExpr::new(expr.child(), expr.upper(), BinOpType::Leq).into_expr(),
248+
]),
232249
)
233250
.into_expr(),
234251
context,

optd-datafusion-bridge/src/into_optd.rs

+48-18
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,35 @@ use datafusion_expr::Expr as DFExpr;
88
use optd_core::rel_node::RelNode;
99
use optd_datafusion_repr::plan_nodes::{
1010
BetweenExpr, BinOpExpr, BinOpType, CastExpr, ColumnRefExpr, ConstantExpr, Expr, ExprList,
11-
FuncExpr, FuncType, InListExpr, JoinType, LikeExpr, LogicalAgg, LogicalEmptyRelation,
12-
LogicalFilter, LogicalJoin, LogicalLimit, LogicalProjection, LogicalScan, LogicalSort,
13-
OptRelNode, OptRelNodeRef, OptRelNodeTyp, PlanNode, SortOrderExpr, SortOrderType,
11+
FuncExpr, FuncType, InListExpr, JoinType, LikeExpr, LogOpExpr, LogOpType, LogicalAgg,
12+
LogicalEmptyRelation, LogicalFilter, LogicalJoin, LogicalLimit, LogicalProjection, LogicalScan,
13+
LogicalSort, OptRelNode, OptRelNodeRef, OptRelNodeTyp, PlanNode, SortOrderExpr, SortOrderType,
1414
};
1515

1616
use crate::OptdPlanContext;
1717

18+
// flatten_nested_logical is a helper function to flatten nested logical operators with same op type
19+
// eg. (a AND (b AND c)) => ExprList([a, b, c])
20+
// (a OR (b OR c)) => ExprList([a, b, c])
21+
// It assume the children of the input expr_list are already flattened
22+
// and can only be used in bottom up manner
23+
fn flatten_nested_logical(op: LogOpType, expr_list: ExprList) -> ExprList {
24+
// conv_into_optd_expr is building the children bottom up so there is no need to
25+
// call flatten_nested_logical recursively
26+
let mut new_expr_list = Vec::new();
27+
for child in expr_list.to_vec() {
28+
if let OptRelNodeTyp::LogOp(child_op) = child.typ() {
29+
if child_op == op {
30+
let child_log_op_expr = LogOpExpr::from_rel_node(child.into_rel_node()).unwrap();
31+
new_expr_list.extend(child_log_op_expr.children().to_vec());
32+
continue;
33+
}
34+
}
35+
new_expr_list.push(child.clone());
36+
}
37+
ExprList::new(new_expr_list)
38+
}
39+
1840
impl OptdPlanContext<'_> {
1941
fn conv_into_optd_table_scan(&mut self, node: &logical_plan::TableScan) -> Result<PlanNode> {
2042
let table_name = node.table_name.to_string();
@@ -47,15 +69,29 @@ impl OptdPlanContext<'_> {
4769
Expr::BinaryExpr(node) => {
4870
let left = self.conv_into_optd_expr(node.left.as_ref(), context)?;
4971
let right = self.conv_into_optd_expr(node.right.as_ref(), context)?;
72+
match node.op {
73+
Operator::And => {
74+
let op = LogOpType::And;
75+
let expr_list = ExprList::new(vec![left, right]);
76+
let expr_list = flatten_nested_logical(op, expr_list);
77+
return Ok(LogOpExpr::new(op, expr_list).into_expr());
78+
}
79+
Operator::Or => {
80+
let op = LogOpType::Or;
81+
let expr_list = ExprList::new(vec![left, right]);
82+
let expr_list = flatten_nested_logical(op, expr_list);
83+
return Ok(LogOpExpr::new(op, expr_list).into_expr());
84+
}
85+
_ => {}
86+
}
87+
5088
let op = match node.op {
5189
Operator::Eq => BinOpType::Eq,
5290
Operator::NotEq => BinOpType::Neq,
5391
Operator::LtEq => BinOpType::Leq,
5492
Operator::Lt => BinOpType::Lt,
5593
Operator::GtEq => BinOpType::Geq,
5694
Operator::Gt => BinOpType::Gt,
57-
Operator::And => BinOpType::And,
58-
Operator::Or => BinOpType::Or,
5995
Operator::Plus => BinOpType::Add,
6096
Operator::Minus => BinOpType::Sub,
6197
Operator::Multiply => BinOpType::Mul,
@@ -305,19 +341,13 @@ impl OptdPlanContext<'_> {
305341
} else if log_ops.len() == 1 {
306342
Ok(LogicalJoin::new(left, right, log_ops.remove(0), join_type))
307343
} else {
308-
// Build a left-deep tree from log_ops
309-
// I wanted to pop from the left instead of the right to maintain the order, even if it's slower
310-
// you can obv change log_ops to a Deque to avoid this issue but I didn't bother since I don't wanna
311-
// do premature optimization
312-
let left_nonlog_op = log_ops.remove(0);
313-
let right_nonlog_op = log_ops.remove(0);
314-
let mut cond =
315-
BinOpExpr::new(left_nonlog_op, right_nonlog_op, BinOpType::And).into_expr();
316-
while !log_ops.is_empty() {
317-
cond = BinOpExpr::new(cond, log_ops.remove(0), BinOpType::And).into_expr();
318-
}
319-
320-
Ok(LogicalJoin::new(left, right, cond, join_type))
344+
let expr_list = ExprList::new(log_ops);
345+
Ok(LogicalJoin::new(
346+
left,
347+
right,
348+
LogOpExpr::new(LogOpType::And, expr_list).into_expr(),
349+
join_type,
350+
))
321351
}
322352
}
323353

optd-datafusion-repr/src/cost/base_cost.rs

+68-41
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use std::{collections::HashMap, sync::Arc};
22

3-
use crate::plan_nodes::{BinOpType, ColumnRefExpr, OptRelNode, UnOpType};
3+
use crate::plan_nodes::{BinOpType, ColumnRefExpr, LogOpType, OptRelNode, UnOpType};
44
use crate::properties::column_ref::{ColumnRefPropertyBuilder, GroupColumnRefs};
55
use crate::{
66
plan_nodes::{OptRelNodeRef, OptRelNodeTyp},
@@ -298,19 +298,15 @@ impl OptCostModel {
298298
right_child,
299299
column_refs,
300300
)
301-
} else if bin_op_typ.is_logical() {
302-
self.get_logical_bin_op_selectivity(
303-
bin_op_typ,
304-
left_child,
305-
right_child,
306-
column_refs,
307-
)
308301
} else if bin_op_typ.is_numerical() {
309302
INVALID_SELECTIVITY
310303
} else {
311304
unreachable!("all BinOpTypes should be true for at least one is_*() function")
312305
}
313306
}
307+
OptRelNodeTyp::LogOp(log_op_typ) => {
308+
self.get_log_op_selectivity(log_op_typ, &expr_tree.children, column_refs)
309+
}
314310
OptRelNodeTyp::UnOp(un_op_typ) => {
315311
assert!(expr_tree.children.len() == 1);
316312
let child = expr_tree.child(0);
@@ -532,24 +528,20 @@ impl OptCostModel {
532528
}
533529
}
534530

535-
fn get_logical_bin_op_selectivity(
531+
fn get_log_op_selectivity(
536532
&self,
537-
bin_op_typ: BinOpType,
538-
left: OptRelNodeRef,
539-
right: OptRelNodeRef,
533+
log_op_typ: LogOpType,
534+
children: &[OptRelNodeRef],
540535
column_refs: &GroupColumnRefs,
541536
) -> f64 {
542-
assert!(bin_op_typ.is_logical());
543-
544-
let left_sel = self.get_filter_selectivity(left, column_refs);
545-
let right_sel = self.get_filter_selectivity(right, column_refs);
546-
547-
match bin_op_typ {
548-
// note that there's no need to account for nulls here
549-
// it's also impossible to even account for nulls because we don't know which columns the left and right selectivities are
550-
BinOpType::And => left_sel * right_sel,
551-
BinOpType::Or => left_sel + right_sel - left_sel * right_sel,
552-
_ => unreachable!("we covered all bin_op_typ.is_logical() cases"),
537+
let children_sel = children
538+
.iter()
539+
.map(|expr| self.get_filter_selectivity(expr.clone(), column_refs));
540+
541+
match log_op_typ {
542+
LogOpType::And => children_sel.product(),
543+
// the formula is 1.0 - the probability of _none_ of the events happening
544+
LogOpType::Or => 1.0 - children_sel.fold(1.0, |acc, sel| acc * (1.0 - sel)),
553545
}
554546
}
555547

@@ -595,8 +587,8 @@ mod tests {
595587

596588
use crate::{
597589
plan_nodes::{
598-
BinOpExpr, BinOpType, ColumnRefExpr, ConstantExpr, Expr, OptRelNode, OptRelNodeRef,
599-
UnOpExpr, UnOpType,
590+
BinOpExpr, BinOpType, ColumnRefExpr, ConstantExpr, Expr, ExprList, LogOpExpr,
591+
LogOpType, OptRelNode, OptRelNodeRef, UnOpExpr, UnOpType,
600592
},
601593
properties::column_ref::ColumnRef,
602594
};
@@ -696,6 +688,21 @@ mod tests {
696688
.into_rel_node()
697689
}
698690

691+
fn log_op(op_type: LogOpType, children: Vec<OptRelNodeRef>) -> OptRelNodeRef {
692+
LogOpExpr::new(
693+
op_type,
694+
ExprList::new(
695+
children
696+
.into_iter()
697+
.map(|opt_rel_node_ref| {
698+
Expr::from_rel_node(opt_rel_node_ref).expect("all children should be Expr")
699+
})
700+
.collect(),
701+
),
702+
)
703+
.into_rel_node()
704+
}
705+
699706
fn un_op(op_type: UnOpType, child: OptRelNodeRef) -> OptRelNodeRef {
700707
UnOpExpr::new(
701708
Expr::from_rel_node(child).expect("child should be an Expr"),
@@ -1136,59 +1143,79 @@ mod tests {
11361143
fn test_and() {
11371144
let cost_model = create_one_column_cost_model(PerColumnStats::new(
11381145
Box::new(MockMostCommonValues {
1139-
mcvs: vec![(Value::Int32(1), 0.3), (Value::Int32(5), 0.5)]
1140-
.into_iter()
1141-
.collect(),
1146+
mcvs: vec![
1147+
(Value::Int32(1), 0.3),
1148+
(Value::Int32(5), 0.5),
1149+
(Value::Int32(8), 0.2),
1150+
]
1151+
.into_iter()
1152+
.collect(),
11421153
}),
11431154
0,
11441155
0.0,
11451156
Box::new(MockDistribution::empty()),
11461157
));
11471158
let eq1 = bin_op(BinOpType::Eq, col_ref(0), cnst(Value::Int32(1)));
11481159
let eq5 = bin_op(BinOpType::Eq, col_ref(0), cnst(Value::Int32(5)));
1149-
let expr_tree = bin_op(BinOpType::And, eq1.clone(), eq5.clone());
1150-
let expr_tree_rev = bin_op(BinOpType::And, eq5.clone(), eq1.clone());
1160+
let eq8 = bin_op(BinOpType::Eq, col_ref(0), cnst(Value::Int32(8)));
1161+
let expr_tree = log_op(LogOpType::And, vec![eq1.clone(), eq5.clone(), eq8.clone()]);
1162+
let expr_tree_shift1 = log_op(LogOpType::And, vec![eq5.clone(), eq8.clone(), eq1.clone()]);
1163+
let expr_tree_shift2 = log_op(LogOpType::And, vec![eq8.clone(), eq1.clone(), eq5.clone()]);
11511164
let column_refs = vec![ColumnRef::BaseTableColumnRef {
11521165
table: String::from(TABLE1_NAME),
11531166
col_idx: 0,
11541167
}];
11551168
assert_approx_eq::assert_approx_eq!(
11561169
cost_model.get_filter_selectivity(expr_tree, &column_refs),
1157-
0.15
1170+
0.03
11581171
);
11591172
assert_approx_eq::assert_approx_eq!(
1160-
cost_model.get_filter_selectivity(expr_tree_rev, &column_refs),
1161-
0.15
1173+
cost_model.get_filter_selectivity(expr_tree_shift1, &column_refs),
1174+
0.03
1175+
);
1176+
assert_approx_eq::assert_approx_eq!(
1177+
cost_model.get_filter_selectivity(expr_tree_shift2, &column_refs),
1178+
0.03
11621179
);
11631180
}
11641181

11651182
#[test]
11661183
fn test_or() {
11671184
let cost_model = create_one_column_cost_model(PerColumnStats::new(
11681185
Box::new(MockMostCommonValues {
1169-
mcvs: vec![(Value::Int32(1), 0.3), (Value::Int32(5), 0.5)]
1170-
.into_iter()
1171-
.collect(),
1186+
mcvs: vec![
1187+
(Value::Int32(1), 0.3),
1188+
(Value::Int32(5), 0.5),
1189+
(Value::Int32(8), 0.2),
1190+
]
1191+
.into_iter()
1192+
.collect(),
11721193
}),
11731194
0,
11741195
0.0,
11751196
Box::new(MockDistribution::empty()),
11761197
));
11771198
let eq1 = bin_op(BinOpType::Eq, col_ref(0), cnst(Value::Int32(1)));
11781199
let eq5 = bin_op(BinOpType::Eq, col_ref(0), cnst(Value::Int32(5)));
1179-
let expr_tree = bin_op(BinOpType::Or, eq1.clone(), eq5.clone());
1180-
let expr_tree_rev = bin_op(BinOpType::Or, eq5.clone(), eq1.clone());
1200+
let eq8 = bin_op(BinOpType::Eq, col_ref(0), cnst(Value::Int32(8)));
1201+
let expr_tree = log_op(LogOpType::Or, vec![eq1.clone(), eq5.clone(), eq8.clone()]);
1202+
let expr_tree_shift1 = log_op(LogOpType::Or, vec![eq5.clone(), eq8.clone(), eq1.clone()]);
1203+
let expr_tree_shift2 = log_op(LogOpType::Or, vec![eq8.clone(), eq1.clone(), eq5.clone()]);
11811204
let column_refs = vec![ColumnRef::BaseTableColumnRef {
11821205
table: String::from(TABLE1_NAME),
11831206
col_idx: 0,
11841207
}];
11851208
assert_approx_eq::assert_approx_eq!(
11861209
cost_model.get_filter_selectivity(expr_tree, &column_refs),
1187-
0.65
1210+
0.72
11881211
);
11891212
assert_approx_eq::assert_approx_eq!(
1190-
cost_model.get_filter_selectivity(expr_tree_rev, &column_refs),
1191-
0.65
1213+
cost_model.get_filter_selectivity(expr_tree_shift1, &column_refs),
1214+
0.72
1215+
);
1216+
assert_approx_eq::assert_approx_eq!(
1217+
cost_model.get_filter_selectivity(expr_tree_shift2, &column_refs),
1218+
0.72
11921219
);
11931220
}
11941221

0 commit comments

Comments
 (0)