This repository was archived by the owner on Jan 7, 2025. It is now read-only.

Commit f81649c

refactor(core): add predicate into cost model (#220)
Signed-off-by: Alex Chi <[email protected]>
1 parent 7045f09 commit f81649c

File tree

8 files changed (+81 lines, -44 lines)

optd-core/src/cascades/optimizer.rs

Lines changed: 1 addition & 1 deletion
@@ -339,7 +339,7 @@ impl<T: RelNodeTyp, M: Memo<T>> CascadesOptimizer<T, M> {
         self.memo.get_predicate_binding(group_id)
     }
 
-    pub fn get_predicate(&self, pred_id: PredId) -> ArcPredNode<T> {
+    pub fn get_pred(&self, pred_id: PredId) -> ArcPredNode<T> {
         self.memo.get_pred(pred_id)
     }

optd-core/src/cascades/tasks/apply_rule.rs

Lines changed: 1 addition & 1 deletion
@@ -119,7 +119,7 @@ fn match_node<T: RelNodeTyp, M: Memo<T>>(
             predicates: node
                 .predicates
                 .iter()
-                .map(|x| optimizer.get_predicate(*x))
+                .map(|x| optimizer.get_pred(*x))
                 .collect(),
         },
     );

optd-core/src/cascades/tasks/optimize_inputs.rs

Lines changed: 13 additions & 0 deletions
@@ -1,4 +1,5 @@
 use anyhow::Result;
+use itertools::Itertools;
 use tracing::trace;
 
 use crate::{
@@ -97,9 +98,15 @@ impl OptimizeInputsTask {
         }
         if update_cost {
             let expr = optimizer.get_expr_memoed(self.expr_id);
+            let preds = expr
+                .predicates
+                .iter()
+                .map(|pred_id| optimizer.get_pred(*pred_id))
+                .collect_vec();
             let statistics = cost.derive_statistics(
                 &expr.typ,
                 &expr.data,
+                &preds,
                 &input_statistics
                     .iter()
                     .map(|x| x.expect("child winner should always have statistics?"))
@@ -181,9 +188,15 @@ impl<T: RelNodeTyp, M: Memo<T>> Task<T, M> for OptimizeInputsTask {
                     .unwrap_or_else(|| cost.zero())
                 })
                 .collect::<Vec<_>>();
+            let preds = expr
+                .predicates
+                .iter()
+                .map(|pred_id| optimizer.get_pred(*pred_id))
+                .collect_vec();
             let operation_cost = cost.compute_operation_cost(
                 &expr.typ,
                 &expr.data,
+                &preds,
                 &input_statistics_ref,
                 &input_cost,
                 Some(context.clone()),
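
The PredId-to-predicate resolution above now appears twice in this file. Purely as an illustration of the pattern (not part of this commit), it could be captured by a small helper; the function name `resolve_preds` and the exact import paths are assumptions, while the bounds and the `get_pred` call follow the diff above.

use optd_core::{
    cascades::{CascadesOptimizer, Memo, PredId},
    rel_node::{ArcPredNode, RelNodeTyp},
};

// Hypothetical helper, not in this commit: resolve the predicate ids stored on
// a memoized expression into full predicate nodes before calling the cost
// model, e.g. `resolve_preds(optimizer, &expr.predicates)` at both sites above.
// Import paths are assumed.
fn resolve_preds<T: RelNodeTyp, M: Memo<T>>(
    optimizer: &CascadesOptimizer<T, M>,
    pred_ids: &[PredId],
) -> Vec<ArcPredNode<T>> {
    pred_ids
        .iter()
        .map(|pred_id| optimizer.get_pred(*pred_id))
        .collect()
}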

optd-core/src/cost.rs

Lines changed: 7 additions & 4 deletions
@@ -1,6 +1,6 @@
 use crate::{
     cascades::{CascadesOptimizer, Memo, RelNodeContext},
-    rel_node::{RelNodeTyp, Value},
+    rel_node::{ArcPredNode, RelNodeTyp, Value},
 };
 
 /// The statistics of a group.
@@ -15,12 +15,14 @@ pub struct Cost(pub Vec<f64>);
 
 pub trait CostModel<T: RelNodeTyp, M: Memo<T>>: 'static + Send + Sync {
     /// Compute the cost of a single operation
+    #[allow(clippy::too_many_arguments)]
     fn compute_operation_cost(
         &self,
         node: &T,
         data: &Option<Value>,
-        children: &[Option<&Statistics>],
-        children_cost: &[Cost],
+        predicates: &[ArcPredNode<T>],
+        children_stats: &[Option<&Statistics>],
+        children_costs: &[Cost],
         context: Option<RelNodeContext>,
         optimizer: Option<&CascadesOptimizer<T, M>>,
     ) -> Cost;
@@ -30,7 +32,8 @@ pub trait CostModel<T: RelNodeTyp, M: Memo<T>>: 'static + Send + Sync {
         &self,
         node: &T,
         data: &Option<Value>,
-        children: &[&Statistics],
+        predicates: &[ArcPredNode<T>],
+        children_stats: &[&Statistics],
         context: Option<RelNodeContext>,
         optimizer: Option<&CascadesOptimizer<T, M>>,
     ) -> Statistics;
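
For call sites, the trait change amounts to one extra `predicates` slice threaded between `data` and the children arguments. A minimal caller-side sketch of the new argument order follows; the helper name, the empty placeholder slices, and the `None` context/optimizer arguments are illustrative assumptions, while the parameter order matches the signatures above.

use optd_core::{
    cascades::Memo,
    cost::{Cost, CostModel, Statistics},
    rel_node::{ArcPredNode, RelNodeTyp, Value},
};

// Sketch: invoke both updated methods, passing the operator's predicates
// explicitly. Children statistics/costs and the optional context/optimizer
// handles are left empty here for brevity.
fn exercise_cost_model<T: RelNodeTyp, M: Memo<T>, C: CostModel<T, M>>(
    cost_model: &C,
    node: &T,
    data: &Option<Value>,
    predicates: &[ArcPredNode<T>],
) -> (Statistics, Cost) {
    let stats = cost_model.derive_statistics(node, data, predicates, &[], None, None);
    let op_cost = cost_model.compute_operation_cost(node, data, predicates, &[], &[], None, None);
    (stats, op_cost)
}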

optd-datafusion-repr-adv-cost/src/lib.rs

Lines changed: 18 additions & 10 deletions
@@ -19,7 +19,7 @@ use std::collections::HashMap;
 use optd_core::{
     cascades::{CascadesOptimizer, NaiveMemo, RelNodeContext},
     cost::{Cost, CostModel, Statistics},
-    rel_node::Value,
+    rel_node::{ArcPredNode, Value},
 };
 
 pub struct AdvancedCostModel {
@@ -60,16 +60,18 @@ impl CostModel<OptRelNodeTyp, NaiveMemo<OptRelNodeTyp>> for AdvancedCostModel {
         &self,
         node: &OptRelNodeTyp,
         data: &Option<Value>,
-        children: &[Option<&Statistics>],
-        children_cost: &[Cost],
+        predicates: &[ArcPredNode<OptRelNodeTyp>],
+        children_stats: &[Option<&Statistics>],
+        children_costs: &[Cost],
         context: Option<RelNodeContext>,
         optimizer: Option<&CascadesOptimizer<OptRelNodeTyp>>,
     ) -> Cost {
         self.base_model.compute_operation_cost(
             node,
             data,
-            children,
-            children_cost,
+            predicates,
+            children_stats,
+            children_costs,
             context,
             optimizer,
         )
@@ -79,11 +81,12 @@ impl CostModel<OptRelNodeTyp, NaiveMemo<OptRelNodeTyp>> for AdvancedCostModel {
         &self,
         node: &OptRelNodeTyp,
         data: &Option<Value>,
-        children: &[&Statistics],
+        predicates: &[ArcPredNode<OptRelNodeTyp>],
+        children_stats: &[&Statistics],
         context: Option<RelNodeContext>,
         optimizer: Option<&CascadesOptimizer<OptRelNodeTyp>>,
     ) -> Statistics {
-        let row_cnts = children
+        let row_cnts = children_stats
             .iter()
             .map(|child| OptCostModel::row_cnt(child))
             .collect::<Vec<f64>>();
@@ -134,9 +137,14 @@ impl CostModel<OptRelNodeTyp, NaiveMemo<OptRelNodeTyp>> for AdvancedCostModel {
                 let row_cnt = self.stats.get_agg_row_cnt(context, optimizer, row_cnts[0]);
                 OptCostModel::stat(row_cnt)
             }
-            _ => self
-                .base_model
-                .derive_statistics(node, data, children, context, optimizer),
+            _ => self.base_model.derive_statistics(
+                node,
+                data,
+                predicates,
+                children_stats,
+                context,
+                optimizer,
+            ),
         }
     }
 }

optd-datafusion-repr/src/cost/adaptive_cost.rs

Lines changed: 17 additions & 8 deletions
@@ -7,7 +7,7 @@ use crate::{cost::OptCostModel, plan_nodes::OptRelNodeTyp};
 use optd_core::{
     cascades::{CascadesOptimizer, GroupId, NaiveMemo, RelNodeContext},
     cost::{Cost, CostModel, Statistics},
-    rel_node::Value,
+    rel_node::{ArcPredNode, Value},
 };
 
 use super::base_cost::DEFAULT_TABLE_ROW_CNT;
@@ -65,8 +65,9 @@ impl CostModel<OptRelNodeTyp, NaiveMemo<OptRelNodeTyp>> for AdaptiveCostModel {
         &self,
         node: &OptRelNodeTyp,
         data: &Option<Value>,
-        children: &[Option<&Statistics>],
-        children_cost: &[Cost],
+        predicates: &[ArcPredNode<OptRelNodeTyp>],
+        children_stats: &[Option<&Statistics>],
+        children_costs: &[Cost],
         context: Option<RelNodeContext>,
         optimizer: Option<&CascadesOptimizer<OptRelNodeTyp>>,
     ) -> Cost {
@@ -77,8 +78,9 @@ impl CostModel<OptRelNodeTyp, NaiveMemo<OptRelNodeTyp>> for AdaptiveCostModel {
         self.base_model.compute_operation_cost(
             node,
             data,
-            children,
-            children_cost,
+            predicates,
+            children_stats,
+            children_costs,
             context,
             optimizer,
         )
@@ -88,16 +90,23 @@ impl CostModel<OptRelNodeTyp, NaiveMemo<OptRelNodeTyp>> for AdaptiveCostModel {
         &self,
         node: &OptRelNodeTyp,
         data: &Option<Value>,
-        children: &[&Statistics],
+        predicates: &[ArcPredNode<OptRelNodeTyp>],
+        children_stats: &[&Statistics],
         context: Option<RelNodeContext>,
         optimizer: Option<&CascadesOptimizer<OptRelNodeTyp>>,
     ) -> Statistics {
         if let OptRelNodeTyp::PhysicalScan = node {
             let row_cnt = self.get_row_cnt(data, &context);
             return OptCostModel::stat(row_cnt);
         }
-        self.base_model
-            .derive_statistics(node, data, children, context, optimizer)
+        self.base_model.derive_statistics(
+            node,
+            data,
+            predicates,
+            children_stats,
+            context,
+            optimizer,
+        )
     }
 }

optd-datafusion-repr/src/cost/base_cost.rs

Lines changed: 21 additions & 19 deletions
@@ -5,7 +5,7 @@ use itertools::Itertools;
 use optd_core::{
     cascades::{CascadesOptimizer, NaiveMemo, RelNodeContext},
     cost::{Cost, CostModel, Statistics},
-    rel_node::Value,
+    rel_node::{ArcPredNode, Value},
 };
 use value_bag::ValueBag;
 
@@ -78,7 +78,8 @@ impl CostModel<OptRelNodeTyp, NaiveMemo<OptRelNodeTyp>> for OptCostModel {
         &self,
         node: &OptRelNodeTyp,
         data: &Option<Value>,
-        children: &[&Statistics],
+        _predicates: &[ArcPredNode<OptRelNodeTyp>],
+        children_stats: &[&Statistics],
         _context: Option<RelNodeContext>,
         _optimizer: Option<&CascadesOptimizer<OptRelNodeTyp>>,
     ) -> Statistics {
@@ -88,31 +89,31 @@ impl CostModel<OptRelNodeTyp, NaiveMemo<OptRelNodeTyp>> for OptCostModel {
                 Self::stat(row_cnt)
             }
             OptRelNodeTyp::PhysicalLimit => {
-                let row_cnt = Self::row_cnt(children[0]);
+                let row_cnt = Self::row_cnt(children_stats[0]);
                 let selectivity = 0.001;
                 Self::stat((row_cnt * selectivity).max(1.0))
             }
             OptRelNodeTyp::PhysicalEmptyRelation => Self::stat(0.01),
             OptRelNodeTyp::PhysicalFilter => {
-                let row_cnt = Self::row_cnt(children[0]);
+                let row_cnt = Self::row_cnt(children_stats[0]);
                 let selectivity = 0.001;
                 Self::stat((row_cnt * selectivity).max(1.0))
             }
             OptRelNodeTyp::PhysicalNestedLoopJoin(_) => {
-                let row_cnt_1 = Self::row_cnt(children[0]);
-                let row_cnt_2 = Self::row_cnt(children[1]);
+                let row_cnt_1 = Self::row_cnt(children_stats[0]);
+                let row_cnt_2 = Self::row_cnt(children_stats[1]);
                 let selectivity = 0.01;
                 Self::stat((row_cnt_1 * row_cnt_2 * selectivity).max(1.0))
             }
             OptRelNodeTyp::PhysicalHashJoin(_) => {
-                let row_cnt_1 = Self::row_cnt(children[0]);
-                let row_cnt_2 = Self::row_cnt(children[1]);
+                let row_cnt_1 = Self::row_cnt(children_stats[0]);
+                let row_cnt_2 = Self::row_cnt(children_stats[1]);
                 Self::stat(row_cnt_1.min(row_cnt_2).max(1.0))
             }
             OptRelNodeTyp::PhysicalSort
             | OptRelNodeTyp::PhysicalAgg
             | OptRelNodeTyp::PhysicalProjection => {
-                let row_cnt = Self::row_cnt(children[0]);
+                let row_cnt = Self::row_cnt(children_stats[0]);
                 Self::stat(row_cnt)
             }
             OptRelNodeTyp::List => Self::stat(1.0),
@@ -125,12 +126,13 @@ impl CostModel<OptRelNodeTyp, NaiveMemo<OptRelNodeTyp>> for OptCostModel {
         &self,
         node: &OptRelNodeTyp,
         data: &Option<Value>,
-        children: &[Option<&Statistics>],
-        children_cost: &[Cost],
+        _predicates: &[ArcPredNode<OptRelNodeTyp>],
+        children_stats: &[Option<&Statistics>],
+        children_costs: &[Cost],
         _context: Option<RelNodeContext>,
         _optimizer: Option<&CascadesOptimizer<OptRelNodeTyp>>,
     ) -> Cost {
-        let row_cnts = children
+        let row_cnts = children_stats
            .iter()
            .map(|child| child.map(Self::row_cnt).unwrap_or(0 as f64))
            .collect_vec();
@@ -146,18 +148,18 @@ impl CostModel<OptRelNodeTyp, NaiveMemo<OptRelNodeTyp>> for OptCostModel {
             OptRelNodeTyp::PhysicalEmptyRelation => Self::cost(0.01, 0.0),
             OptRelNodeTyp::PhysicalFilter => {
                 let row_cnt = row_cnts[0];
-                let (compute_cost, _) = Self::cost_tuple(&children_cost[1]);
+                let (compute_cost, _) = Self::cost_tuple(&children_costs[1]);
                 Self::cost(row_cnt * compute_cost, 0.0)
             }
             OptRelNodeTyp::PhysicalNestedLoopJoin(_) => {
                 let row_cnt_1 = row_cnts[0];
                 let row_cnt_2 = row_cnts[1];
-                let (compute_cost, _) = Self::cost_tuple(&children_cost[2]);
+                let (compute_cost, _) = Self::cost_tuple(&children_costs[2]);
                 Self::cost(row_cnt_1 * row_cnt_2 * compute_cost + row_cnt_1, 0.0)
             }
             OptRelNodeTyp::PhysicalProjection => {
                 let row_cnt = row_cnts[0];
-                let (compute_cost, _) = Self::cost_tuple(&children_cost[1]);
+                let (compute_cost, _) = Self::cost_tuple(&children_costs[1]);
                 Self::cost(row_cnt * compute_cost, 0.0)
             }
             OptRelNodeTyp::PhysicalHashJoin(_) => {
@@ -171,13 +173,13 @@ impl CostModel<OptRelNodeTyp, NaiveMemo<OptRelNodeTyp>> for OptCostModel {
             }
             OptRelNodeTyp::PhysicalAgg => {
                 let row_cnt = row_cnts[0];
-                let (compute_cost_1, _) = Self::cost_tuple(&children_cost[1]);
-                let (compute_cost_2, _) = Self::cost_tuple(&children_cost[2]);
+                let (compute_cost_1, _) = Self::cost_tuple(&children_costs[1]);
+                let (compute_cost_2, _) = Self::cost_tuple(&children_costs[2]);
                 Self::cost(row_cnt * (compute_cost_1 + compute_cost_2), 0.0)
             }
             // List and expressions are computed in the same way -- but list has much fewer cost
             OptRelNodeTyp::List => {
-                let compute_cost = children_cost
+                let compute_cost = children_costs
                     .iter()
                     .map(|child| {
                         let (compute_cost, _) = Self::cost_tuple(child);
@@ -187,7 +189,7 @@ impl CostModel<OptRelNodeTyp, NaiveMemo<OptRelNodeTyp>> for OptCostModel {
                 Self::cost(compute_cost + 0.01, 0.0)
             }
             _ if node.is_expression() => {
-                let compute_cost = children_cost
+                let compute_cost = children_costs
                     .iter()
                     .map(|child| {
                         let (compute_cost, _) = Self::cost_tuple(child);
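
Note that `OptCostModel` binds the new parameter as `_predicates`, so the hard-coded 0.001 filter selectivity is unchanged by this commit. As a purely illustrative sketch of what the parameter enables (not something this commit implements), a predicate-aware row-count estimate might look roughly like the code below; `predicate_selectivity` is a hypothetical callback and the 0.001 fallback mirrors the existing constant.

use crate::plan_nodes::OptRelNodeTyp;
use optd_core::rel_node::ArcPredNode;

// Illustrative only: derive a filter's output row count from its predicates
// instead of a fixed constant, falling back to the existing 0.001 selectivity
// when no predicates are attached.
fn filter_row_cnt(
    input_row_cnt: f64,
    predicates: &[ArcPredNode<OptRelNodeTyp>],
    predicate_selectivity: impl Fn(&ArcPredNode<OptRelNodeTyp>) -> f64,
) -> f64 {
    let selectivity: f64 = if predicates.is_empty() {
        0.001
    } else {
        predicates.iter().map(predicate_selectivity).product()
    };
    (input_row_cnt * selectivity).max(1.0)
}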

optd-datafusion-repr/src/testing/dummy_cost.rs

Lines changed: 3 additions & 1 deletion
@@ -2,7 +2,7 @@ use crate::plan_nodes::OptRelNodeTyp;
 use optd_core::{
     cascades::{CascadesOptimizer, NaiveMemo, RelNodeContext},
     cost::{Cost, CostModel, Statistics},
-    rel_node::Value,
+    rel_node::{ArcPredNode, Value},
 };
 use value_bag::ValueBag;
 
@@ -16,6 +16,7 @@ impl CostModel<OptRelNodeTyp, NaiveMemo<OptRelNodeTyp>> for DummyCostModel {
         &self,
         _: &OptRelNodeTyp,
         _: &Option<Value>,
+        _: &[ArcPredNode<OptRelNodeTyp>],
         _: &[Option<&Statistics>],
         _: &[Cost],
         _: Option<RelNodeContext>,
@@ -29,6 +30,7 @@ impl CostModel<OptRelNodeTyp, NaiveMemo<OptRelNodeTyp>> for DummyCostModel {
         &self,
         _: &OptRelNodeTyp,
         _: &Option<Value>,
+        _: &[ArcPredNode<OptRelNodeTyp>],
         _: &[&Statistics],
         _: Option<RelNodeContext>,
         _: Option<&CascadesOptimizer<OptRelNodeTyp>>,
