Skip to content
This repository was archived by the owner on Jan 7, 2025. It is now read-only.

Commit d34c22c

Browse files
authored
feat: Eliminate Duplicated Expr Rule (#67)
# Major Changes Add an eliminate duplicate expression rule which: 1. Removes duplicate sort expressions 2. Removes duplicate aggregate group bys Also, this PR adds derive traits for Hash, PartialEq, and Eq for RelNode. Examples: `select * from t1 order by id, name, id desc, id asc, name desc` becomes `select * from t1 order by id, name` `select * from t1 group by id, name, id` becomes `select * from t1 group by id, name` ## Rule Type Heuristics (always apply), Transformation Rule (logical to logical)
1 parent e846c58 commit d34c22c

File tree

8 files changed

+272
-3
lines changed

8 files changed

+272
-3
lines changed

optd-core/src/rel_node.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ impl Value {
144144
}
145145

146146
/// A RelNode is consisted of a plan node type and some children.
147-
#[derive(Clone, Debug)]
147+
#[derive(Clone, Debug, Hash, PartialEq, Eq)]
148148
pub struct RelNode<T: RelNodeTyp> {
149149
pub typ: T,
150150
pub children: Vec<RelNodeRef<T>>,

optd-datafusion-repr/src/lib.rs

+5-2
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,9 @@ use properties::{
1111
schema::{Catalog, SchemaPropertyBuilder},
1212
};
1313
use rules::{
14-
EliminateFilterRule, EliminateJoinRule, EliminateLimitRule, HashJoinRule, JoinAssocRule,
15-
JoinCommuteRule, PhysicalConversionRule, ProjectionPullUpJoin,
14+
EliminateDuplicatedAggExprRule, EliminateDuplicatedSortExprRule, EliminateFilterRule,
15+
EliminateJoinRule, EliminateLimitRule, HashJoinRule, JoinAssocRule, JoinCommuteRule,
16+
PhysicalConversionRule, ProjectionPullUpJoin,
1617
};
1718

1819
pub use adaptive::PhysicalCollector;
@@ -53,6 +54,8 @@ impl DatafusionOptimizer {
5354
rules.push(Arc::new(EliminateJoinRule::new()));
5455
rules.push(Arc::new(EliminateFilterRule::new()));
5556
rules.push(Arc::new(EliminateLimitRule::new()));
57+
rules.push(Arc::new(EliminateDuplicatedSortExprRule::new()));
58+
rules.push(Arc::new(EliminateDuplicatedAggExprRule::new()));
5659

5760
let cost_model = AdaptiveCostModel::new(50);
5861
Self {

optd-datafusion-repr/src/plan_nodes/expr.rs

+4
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ impl ExprList {
3737
.map(|x| Expr::from_rel_node(x.clone()).unwrap())
3838
.collect_vec()
3939
}
40+
41+
pub fn from_group(rel_node: OptRelNodeRef) -> Self {
42+
Self(rel_node)
43+
}
4044
}
4145

4246
impl OptRelNode for ExprList {

optd-datafusion-repr/src/plan_nodes/sort.rs

+4
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ use super::{OptRelNode, OptRelNodeRef, OptRelNodeTyp, PlanNode};
66
#[derive(Clone, Debug)]
77
pub struct LogicalSort(pub PlanNode);
88

9+
// each expression in ExprList is represented as a SortOrderExpr
10+
// 1. nulls_first is not included from DF
11+
// 2. node type defines sort order per expression
12+
// 3. actual expr is stored as a child of this node
913
define_plan_node!(
1014
LogicalSort : PlanNode,
1115
Sort, [

optd-datafusion-repr/src/rules.rs

+4
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
11
// mod filter_join;
2+
mod eliminate_duplicated_expr;
23
mod eliminate_filter;
34
mod eliminate_limit;
45
mod joins;
56
mod macros;
67
mod physical;
78

89
// pub use filter_join::FilterJoinPullUpRule;
10+
pub use eliminate_duplicated_expr::{
11+
EliminateDuplicatedAggExprRule, EliminateDuplicatedSortExprRule,
12+
};
913
pub use eliminate_filter::EliminateFilterRule;
1014
pub use eliminate_limit::EliminateLimitRule;
1115
pub use joins::{
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
use std::collections::{HashMap, HashSet};
2+
use std::sync::Arc;
3+
4+
use itertools::Itertools;
5+
use optd_core::rules::{Rule, RuleMatcher};
6+
use optd_core::{optimizer::Optimizer, rel_node::RelNode};
7+
8+
use crate::plan_nodes::{
9+
Expr, ExprList, LogicalAgg, LogicalSort, OptRelNode, OptRelNodeTyp, PlanNode, SortOrderExpr,
10+
SortOrderType,
11+
};
12+
13+
use super::macros::define_rule;
14+
15+
define_rule!(
16+
EliminateDuplicatedSortExprRule,
17+
apply_eliminate_duplicated_sort_expr,
18+
(Sort, child, [exprs])
19+
);
20+
21+
/// Removes duplicate sort expressions
22+
/// For exmaple:
23+
/// select *
24+
/// from t1
25+
/// order by id desc, id, name, id asc
26+
/// becomes
27+
/// select *
28+
/// from t1
29+
/// order by id desc, name
30+
fn apply_eliminate_duplicated_sort_expr(
31+
_optimizer: &impl Optimizer<OptRelNodeTyp>,
32+
EliminateDuplicatedSortExprRulePicks { child, exprs }: EliminateDuplicatedSortExprRulePicks,
33+
) -> Vec<RelNode<OptRelNodeTyp>> {
34+
let sort_keys: Vec<Expr> = exprs
35+
.children
36+
.iter()
37+
.map(|x| Expr::from_rel_node(x.clone()).unwrap())
38+
.collect_vec();
39+
40+
let normalized_sort_keys: Vec<Arc<RelNode<OptRelNodeTyp>>> = exprs
41+
.children
42+
.iter()
43+
.map(|x| match x.typ {
44+
OptRelNodeTyp::SortOrder(_) => SortOrderExpr::new(
45+
SortOrderType::Asc,
46+
SortOrderExpr::from_rel_node(x.clone()).unwrap().child(),
47+
)
48+
.into_rel_node(),
49+
_ => x.clone(),
50+
})
51+
.collect_vec();
52+
53+
let mut dedup_expr: Vec<Expr> = Vec::new();
54+
let mut dedup_set: HashSet<Arc<RelNode<OptRelNodeTyp>>> = HashSet::new();
55+
56+
sort_keys
57+
.iter()
58+
.zip(normalized_sort_keys.iter())
59+
.for_each(|(expr, normalized_expr)| {
60+
if !dedup_set.contains(normalized_expr) {
61+
dedup_expr.push(expr.clone());
62+
dedup_set.insert(normalized_expr.to_owned());
63+
}
64+
});
65+
66+
if dedup_expr.len() != sort_keys.len() {
67+
let node = LogicalSort::new(
68+
PlanNode::from_group(child.into()),
69+
ExprList::new(dedup_expr),
70+
);
71+
return vec![node.into_rel_node().as_ref().clone()];
72+
}
73+
vec![]
74+
}
75+
76+
define_rule!(
77+
EliminateDuplicatedAggExprRule,
78+
apply_eliminate_duplicated_agg_expr,
79+
(Agg, child, exprs, [groups])
80+
);
81+
82+
/// Removes duplicate group by expressions
83+
/// For exmaple:
84+
/// select *
85+
/// from t1
86+
/// group by id, name, id, id
87+
/// becomes
88+
/// select *
89+
/// from t1
90+
/// group by id, name
91+
fn apply_eliminate_duplicated_agg_expr(
92+
_optimizer: &impl Optimizer<OptRelNodeTyp>,
93+
EliminateDuplicatedAggExprRulePicks {
94+
child,
95+
exprs,
96+
groups,
97+
}: EliminateDuplicatedAggExprRulePicks,
98+
) -> Vec<RelNode<OptRelNodeTyp>> {
99+
let mut dedup_expr: Vec<Expr> = Vec::new();
100+
let mut dedup_set: HashSet<Arc<RelNode<OptRelNodeTyp>>> = HashSet::new();
101+
groups.children.iter().for_each(|expr| {
102+
if !dedup_set.contains(expr) {
103+
dedup_expr.push(Expr::from_rel_node(expr.clone()).unwrap());
104+
dedup_set.insert(expr.clone());
105+
}
106+
});
107+
108+
if dedup_expr.len() != groups.children.len() {
109+
let node = LogicalAgg::new(
110+
PlanNode::from_group(child.into()),
111+
ExprList::from_group(exprs.into()),
112+
ExprList::new(dedup_expr),
113+
);
114+
return vec![node.into_rel_node().as_ref().clone()];
115+
}
116+
vec![]
117+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
-- (no id or description)
2+
create table t1(v1 int, v2 int);
3+
insert into t1 values (0, 0), (1, 1), (5, 2), (2, 4), (0, 2);
4+
5+
/*
6+
5
7+
*/
8+
9+
-- Test without sorts/aggs.
10+
select * from t1;
11+
12+
/*
13+
LogicalProjection { exprs: [ #0, #1 ] }
14+
└── LogicalScan { table: t1 }
15+
PhysicalProjection { exprs: [ #0, #1 ] }
16+
└── PhysicalScan { table: t1 }
17+
0 0
18+
1 1
19+
5 2
20+
2 4
21+
0 2
22+
*/
23+
24+
-- Test whether the optimizer handles duplicate sort expressions correctly.
25+
select * from t1 order by v1, v2, v1 desc, v2 desc, v1 asc;
26+
27+
/*
28+
LogicalSort
29+
├── exprs:
30+
│ ┌── SortOrder { order: Asc }
31+
│ │ └── #0
32+
│ ├── SortOrder { order: Asc }
33+
│ │ └── #1
34+
│ ├── SortOrder { order: Desc }
35+
│ │ └── #0
36+
│ ├── SortOrder { order: Desc }
37+
│ │ └── #1
38+
│ └── SortOrder { order: Asc }
39+
│ └── #0
40+
└── LogicalProjection { exprs: [ #0, #1 ] }
41+
└── LogicalScan { table: t1 }
42+
PhysicalSort
43+
├── exprs:
44+
│ ┌── SortOrder { order: Asc }
45+
│ │ └── #0
46+
│ └── SortOrder { order: Asc }
47+
│ └── #1
48+
└── PhysicalProjection { exprs: [ #0, #1 ] }
49+
└── PhysicalScan { table: t1 }
50+
0 0
51+
0 2
52+
1 1
53+
2 4
54+
5 2
55+
*/
56+
57+
-- Test whether the optimizer handles duplicate agg expressions correctly.
58+
select * from t1 group by v1, v2, v1;
59+
60+
/*
61+
LogicalProjection { exprs: [ #0, #1 ] }
62+
└── LogicalAgg { exprs: [], groups: [ #0, #1, #0 ] }
63+
└── LogicalScan { table: t1 }
64+
PhysicalProjection { exprs: [ #0, #1 ] }
65+
└── PhysicalAgg { aggrs: [], groups: [ #0, #1 ] }
66+
└── PhysicalScan { table: t1 }
67+
0 0
68+
1 1
69+
5 2
70+
2 4
71+
0 2
72+
*/
73+
74+
-- Test whether the optimizer handles duplicate sort and agg expressions correctly.
75+
select * from t1 group by v1, v2, v1, v2, v2 order by v1, v2, v1 desc, v2 desc, v1 asc;
76+
77+
/*
78+
LogicalSort
79+
├── exprs:
80+
│ ┌── SortOrder { order: Asc }
81+
│ │ └── #0
82+
│ ├── SortOrder { order: Asc }
83+
│ │ └── #1
84+
│ ├── SortOrder { order: Desc }
85+
│ │ └── #0
86+
│ ├── SortOrder { order: Desc }
87+
│ │ └── #1
88+
│ └── SortOrder { order: Asc }
89+
│ └── #0
90+
└── LogicalProjection { exprs: [ #0, #1 ] }
91+
└── LogicalAgg { exprs: [], groups: [ #0, #1, #0, #1, #1 ] }
92+
└── LogicalScan { table: t1 }
93+
PhysicalSort
94+
├── exprs:
95+
│ ┌── SortOrder { order: Asc }
96+
│ │ └── #0
97+
│ └── SortOrder { order: Asc }
98+
│ └── #1
99+
└── PhysicalProjection { exprs: [ #0, #1 ] }
100+
└── PhysicalAgg { aggrs: [], groups: [ #0, #1 ] }
101+
└── PhysicalScan { table: t1 }
102+
0 0
103+
0 2
104+
1 1
105+
2 4
106+
5 2
107+
*/
108+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
- sql: |
2+
create table t1(v1 int, v2 int);
3+
insert into t1 values (0, 0), (1, 1), (5, 2), (2, 4), (0, 2);
4+
tasks:
5+
- execute
6+
- sql: |
7+
select * from t1;
8+
desc: Test without sorts/aggs.
9+
tasks:
10+
- explain:logical_optd,physical_optd
11+
- execute
12+
- sql: |
13+
select * from t1 order by v1, v2, v1 desc, v2 desc, v1 asc;
14+
desc: Test whether the optimizer handles duplicate sort expressions correctly.
15+
tasks:
16+
- explain:logical_optd,physical_optd
17+
- execute
18+
- sql: |
19+
select * from t1 group by v1, v2, v1;
20+
desc: Test whether the optimizer handles duplicate agg expressions correctly.
21+
tasks:
22+
- explain:logical_optd,physical_optd
23+
- execute
24+
- sql: |
25+
select * from t1 group by v1, v2, v1, v2, v2 order by v1, v2, v1 desc, v2 desc, v1 asc;
26+
desc: Test whether the optimizer handles duplicate sort and agg expressions correctly.
27+
tasks:
28+
- explain:logical_optd,physical_optd
29+
- execute

0 commit comments

Comments
 (0)