Skip to content
This repository was archived by the owner on Jan 7, 2025. It is now read-only.

Commit 755db92

Browse files
feat: "hello world" selectivity computation (#70)
**Description**: reason this PR just contains the simplest possible selectivity computation is because computing selectivity at all requires lots of refactoring, so I want that refactoring reviewed separately from the "core" selectivity computation logic. **Demo**: passing `test_colref_eq_constint_in_mcv` test ![Screenshot 2024-02-16 at 12 37 56](https://github.com/cmu-db/optd/assets/20631215/66661d41-0b3f-4148-96bf-0db2cabb00c0)
1 parent ad37149 commit 755db92

File tree

18 files changed

+357
-148
lines changed

18 files changed

+357
-148
lines changed

Cargo.lock

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ci.sh

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
#!/bin/bash
2+
# runs the stuff in CI.yaml locally
3+
# unfortunately this needs to be updated manually. just update it if you get annoyed by GHAs failing
4+
5+
set -ex
6+
7+
cargo fmt --all -- --check
8+
cargo clippy --workspace --all-targets --all-features --locked -- -D warnings
9+
cargo test --no-fail-fast --workspace --all-features --locked

optd-core/src/cascades/memo.rs

+2
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,8 @@ impl<T: RelNodeTyp> Memo<T> {
226226
self.groups.remove(&group_id);
227227
}
228228

229+
/// If group_id exists, it adds expr_id to the existing group
230+
/// Otherwise, it creates a new group of that group_id and insert expr_id into the new group
229231
fn add_expr_to_group(
230232
&mut self,
231233
expr_id: ExprId,

optd-core/src/cascades/optimizer.rs

+8-3
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,11 @@ pub struct CascadesOptimizer<T: RelNodeTyp> {
5252

5353
/// `RelNode` only contains the representation of the plan nodes. Sometimes, we need more context, i.e., group id and
5454
/// expr id, during the optimization phase. All these information are collected in this struct.
55-
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug, Default, Hash)]
55+
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Default, Hash)]
5656
pub struct RelNodeContext {
5757
pub group_id: GroupId,
5858
pub expr_id: ExprId,
59+
pub children_group_ids: Vec<GroupId>,
5960
}
6061

6162
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug, Default, Hash)]
@@ -322,9 +323,13 @@ impl<T: RelNodeTyp> CascadesOptimizer<T> {
322323
.get_all_expr_bindings(expr_id, false, false, level)
323324
}
324325

325-
pub fn get_all_group_physical_bindings(&self, group_id: GroupId) -> Vec<RelNodeRef<T>> {
326+
pub fn get_all_group_bindings(
327+
&self,
328+
group_id: GroupId,
329+
physical_only: bool,
330+
) -> Vec<RelNodeRef<T>> {
326331
self.memo
327-
.get_all_group_bindings(group_id, true, true, Some(10))
332+
.get_all_group_bindings(group_id, physical_only, true, Some(10))
328333
}
329334

330335
pub(super) fn is_group_explored(&self, group_id: GroupId) -> bool {

optd-core/src/cascades/tasks/optimize_inputs.rs

+21-7
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ impl<T: RelNodeTyp> Task<T> for OptimizeInputsTask {
137137
trace!(event = "task_begin", task = "optimize_inputs", expr_id = %self.expr_id, continue_from = ?self.continue_from);
138138
let expr = optimizer.get_expr_memoed(self.expr_id);
139139
let group_id = optimizer.get_group_id(self.expr_id);
140-
let children = &expr.children;
140+
let children_group_ids = &expr.children;
141141
let cost = optimizer.cost();
142142

143143
if let Some(ContinueTask {
@@ -149,10 +149,17 @@ impl<T: RelNodeTyp> Task<T> for OptimizeInputsTask {
149149
let context = RelNodeContext {
150150
expr_id: self.expr_id,
151151
group_id,
152+
children_group_ids: children_group_ids.clone(),
152153
};
153154
if self.should_terminate(
154155
cost.sum(
155-
&cost.compute_cost(&expr.typ, &expr.data, &input_cost, Some(context)),
156+
&cost.compute_cost(
157+
&expr.typ,
158+
&expr.data,
159+
&input_cost,
160+
Some(context.clone()),
161+
Some(optimizer),
162+
),
156163
&input_cost,
157164
)
158165
.0[0],
@@ -161,8 +168,8 @@ impl<T: RelNodeTyp> Task<T> for OptimizeInputsTask {
161168
trace!(event = "task_finish", task = "optimize_inputs", expr_id = %self.expr_id);
162169
return Ok(vec![]);
163170
}
164-
if next_group_idx < children.len() {
165-
let group_id = children[next_group_idx];
171+
if next_group_idx < children_group_ids.len() {
172+
let group_id = children_group_ids[next_group_idx];
166173
let group_idx = next_group_idx;
167174
let group_info = optimizer.get_group_info(group_id);
168175
let mut has_full_winner = false;
@@ -176,7 +183,8 @@ impl<T: RelNodeTyp> Task<T> for OptimizeInputsTask {
176183
&expr.typ,
177184
&expr.data,
178185
&input_cost,
179-
Some(context),
186+
Some(context.clone()),
187+
Some(optimizer),
180188
),
181189
&input_cost,
182190
)
@@ -243,7 +251,13 @@ impl<T: RelNodeTyp> Task<T> for OptimizeInputsTask {
243251
} else {
244252
self.update_winner(
245253
&cost.sum(
246-
&cost.compute_cost(&expr.typ, &expr.data, &input_cost, Some(context)),
254+
&cost.compute_cost(
255+
&expr.typ,
256+
&expr.data,
257+
&input_cost,
258+
Some(context.clone()),
259+
Some(optimizer),
260+
),
247261
&input_cost,
248262
),
249263
optimizer,
@@ -252,7 +266,7 @@ impl<T: RelNodeTyp> Task<T> for OptimizeInputsTask {
252266
Ok(vec![])
253267
}
254268
} else {
255-
let input_cost = self.first_invoke(children, optimizer);
269+
let input_cost = self.first_invoke(children_group_ids, optimizer);
256270
trace!(event = "task_yield", task = "optimize_inputs", expr_id = %self.expr_id);
257271
Ok(vec![Box::new(self.continue_from(
258272
ContinueTask {

optd-core/src/cost.rs

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
use crate::{
2-
cascades::RelNodeContext,
2+
cascades::{CascadesOptimizer, RelNodeContext},
33
rel_node::{RelNode, RelNodeTyp, Value},
44
};
55

@@ -13,6 +13,8 @@ pub trait CostModel<T: RelNodeTyp>: 'static + Send + Sync {
1313
data: &Option<Value>,
1414
children: &[Cost],
1515
context: Option<RelNodeContext>,
16+
// one reason we need the optimizer is to traverse children nodes to build up an expression tree
17+
optimizer: Option<&CascadesOptimizer<T>>,
1618
) -> Cost;
1719

1820
fn compute_plan_node_cost(&self, node: &RelNode<T>) -> Cost;

optd-datafusion-bridge/src/from_optd.rs

+8-27
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,10 @@ use datafusion::{
2323
use optd_datafusion_repr::{
2424
plan_nodes::{
2525
BetweenExpr, BinOpExpr, BinOpType, CastExpr, ColumnRefExpr, ConstantExpr, ConstantType,
26-
Expr, ExprList, FuncExpr, FuncType, JoinType, LikeExpr, LogOpExpr, LogOpType, OptRelNode,
27-
OptRelNodeRef, OptRelNodeTyp, PhysicalAgg, PhysicalEmptyRelation, PhysicalFilter,
28-
PhysicalHashJoin, PhysicalLimit, PhysicalNestedLoopJoin, PhysicalProjection, PhysicalScan,
29-
PhysicalSort, PlanNode, SortOrderExpr, SortOrderType,
26+
Expr, FuncExpr, FuncType, JoinType, LikeExpr, OptRelNode, OptRelNodeRef, OptRelNodeTyp,
27+
PhysicalAgg, PhysicalEmptyRelation, PhysicalFilter, PhysicalHashJoin, PhysicalLimit,
28+
PhysicalNestedLoopJoin, PhysicalProjection, PhysicalScan, PhysicalSort, PlanNode,
29+
SortOrderExpr, SortOrderType,
3030
},
3131
properties::schema::Schema as OptdSchema,
3232
PhysicalCollector,
@@ -191,23 +191,6 @@ impl OptdPlanContext<'_> {
191191
}
192192
}
193193
OptRelNodeTyp::Sort => unreachable!(),
194-
OptRelNodeTyp::LogOp(typ) => {
195-
let expr = LogOpExpr::from_rel_node(expr.into_rel_node()).unwrap();
196-
let mut children = expr.children().to_vec().into_iter();
197-
let first_expr = Self::conv_from_optd_expr(children.next().unwrap(), context)?;
198-
let op = match typ {
199-
LogOpType::And => datafusion::logical_expr::Operator::And,
200-
LogOpType::Or => datafusion::logical_expr::Operator::Or,
201-
};
202-
children.try_fold(first_expr, |acc, expr| {
203-
let expr = Self::conv_from_optd_expr(expr, context)?;
204-
Ok(
205-
Arc::new(datafusion::physical_plan::expressions::BinaryExpr::new(
206-
acc, op, expr,
207-
)) as Arc<dyn PhysicalExpr>,
208-
)
209-
})
210-
}
211194
OptRelNodeTyp::BinOp(op) => {
212195
let expr = BinOpExpr::from_rel_node(expr.into_rel_node()).unwrap();
213196
let left = Self::conv_from_optd_expr(expr.left_child(), context)?;
@@ -237,12 +220,10 @@ impl OptdPlanContext<'_> {
237220
// TODO: should we just convert between to x <= c1 and x >= c2?
238221
let expr = BetweenExpr::from_rel_node(expr.into_rel_node()).unwrap();
239222
Self::conv_from_optd_expr(
240-
LogOpExpr::new(
241-
LogOpType::And,
242-
ExprList::new(vec![
243-
BinOpExpr::new(expr.child(), expr.lower(), BinOpType::Geq).into_expr(),
244-
BinOpExpr::new(expr.child(), expr.upper(), BinOpType::Leq).into_expr(),
245-
]),
223+
BinOpExpr::new(
224+
BinOpExpr::new(expr.child(), expr.lower(), BinOpType::Geq).into_expr(),
225+
BinOpExpr::new(expr.child(), expr.upper(), BinOpType::Leq).into_expr(),
226+
BinOpType::And,
246227
)
247228
.into_expr(),
248229
context,

optd-datafusion-bridge/src/into_optd.rs

+16-10
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@ use datafusion_expr::Expr as DFExpr;
88
use optd_core::rel_node::RelNode;
99
use optd_datafusion_repr::plan_nodes::{
1010
BetweenExpr, BinOpExpr, BinOpType, CastExpr, ColumnRefExpr, ConstantExpr, Expr, ExprList,
11-
FuncExpr, FuncType, JoinType, LikeExpr, LogOpExpr, LogOpType, LogicalAgg, LogicalEmptyRelation,
12-
LogicalFilter, LogicalJoin, LogicalLimit, LogicalProjection, LogicalScan, LogicalSort,
13-
OptRelNode, OptRelNodeRef, OptRelNodeTyp, PlanNode, SortOrderExpr, SortOrderType,
11+
FuncExpr, FuncType, JoinType, LikeExpr, LogicalAgg, LogicalEmptyRelation, LogicalFilter,
12+
LogicalJoin, LogicalLimit, LogicalProjection, LogicalScan, LogicalSort, OptRelNode,
13+
OptRelNodeRef, OptRelNodeTyp, PlanNode, SortOrderExpr, SortOrderType,
1414
};
1515

1616
use crate::OptdPlanContext;
@@ -296,13 +296,19 @@ impl OptdPlanContext<'_> {
296296
} else if log_ops.len() == 1 {
297297
Ok(LogicalJoin::new(left, right, log_ops.remove(0), join_type))
298298
} else {
299-
let expr_list = ExprList::new(log_ops);
300-
Ok(LogicalJoin::new(
301-
left,
302-
right,
303-
LogOpExpr::new(LogOpType::And, expr_list).into_expr(),
304-
join_type,
305-
))
299+
// Build a left-deep tree from log_ops
300+
// I wanted to pop from the left instead of the right to maintain the order, even if it's slower
301+
// you can obv change log_ops to a Deque to avoid this issue but I didn't bother since I don't wanna
302+
// do premature optimization
303+
let left_nonlog_op = log_ops.remove(0);
304+
let right_nonlog_op = log_ops.remove(0);
305+
let mut cond =
306+
BinOpExpr::new(left_nonlog_op, right_nonlog_op, BinOpType::And).into_expr();
307+
while !log_ops.is_empty() {
308+
cond = BinOpExpr::new(cond, log_ops.remove(0), BinOpType::And).into_expr();
309+
}
310+
311+
Ok(LogicalJoin::new(left, right, cond, join_type))
306312
}
307313
}
308314

optd-datafusion-bridge/src/lib.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ use std::{
3030
sync::{Arc, Mutex},
3131
};
3232

33-
struct OptdPlanContext<'a> {
33+
pub struct OptdPlanContext<'a> {
3434
tables: HashMap<String, Arc<dyn TableSource>>,
3535
session_state: &'a SessionState,
3636
pub optimizer: Option<&'a DatafusionOptimizer>,
@@ -251,7 +251,7 @@ impl OptdQueryPlanner {
251251
));
252252
let bindings = optimizer
253253
.optd_optimizer()
254-
.get_all_group_physical_bindings(group_id);
254+
.get_all_group_bindings(group_id, true);
255255
let mut join_orders = BTreeSet::new();
256256
let mut logical_join_orders = BTreeSet::new();
257257
for binding in bindings {

optd-datafusion-repr/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,4 @@ optd-core = { path = "../optd-core" }
1919
camelpaste = "0.1"
2020
datafusion-expr = "32.0.0"
2121
async-trait = "0.1"
22+
datafusion = "32.0.0"

optd-datafusion-repr/src/bin/test_optimize.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ use optd_core::{
55
rel_node::Value,
66
};
77
use optd_datafusion_repr::{
8-
cost::OptCostModel,
8+
cost::{OptCostModel, PerTableStats},
99
plan_nodes::{
1010
BinOpExpr, BinOpType, ColumnRefExpr, ConstantExpr, JoinType, LogicalFilter, LogicalJoin,
1111
LogicalScan, OptRelNode, OptRelNodeTyp, PlanNode,
@@ -36,7 +36,7 @@ pub fn main() {
3636
Box::new(OptCostModel::new(
3737
[("t1", 1000), ("t2", 100), ("t3", 10000)]
3838
.into_iter()
39-
.map(|(x, y)| (x.to_string(), y))
39+
.map(|(x, y)| (x.to_string(), PerTableStats::new(y, vec![])))
4040
.collect(),
4141
)),
4242
vec![],

optd-datafusion-repr/src/cost.rs

+3-1
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,6 @@ mod adaptive_cost;
22
mod base_cost;
33

44
pub use adaptive_cost::{AdaptiveCostModel, RuntimeAdaptionStorage};
5-
pub use base_cost::{OptCostModel, COMPUTE_COST, IO_COST, ROW_COUNT};
5+
pub use base_cost::{
6+
OptCostModel, PerColumnStats, PerTableStats, COMPUTE_COST, IO_COST, ROW_COUNT,
7+
};

optd-datafusion-repr/src/cost/adaptive_cost.rs

+7-3
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ use std::{
55

66
use crate::{cost::OptCostModel, plan_nodes::OptRelNodeTyp};
77
use optd_core::{
8-
cascades::{GroupId, RelNodeContext},
8+
cascades::{CascadesOptimizer, GroupId, RelNodeContext},
99
cost::{Cost, CostModel},
1010
rel_node::{RelNode, Value},
1111
};
@@ -43,6 +43,7 @@ impl CostModel<OptRelNodeTyp> for AdaptiveCostModel {
4343
data: &Option<Value>,
4444
children: &[Cost],
4545
context: Option<RelNodeContext>,
46+
optimizer: Option<&CascadesOptimizer<OptRelNodeTyp>>,
4647
) -> Cost {
4748
if let OptRelNodeTyp::PhysicalScan = node {
4849
let guard = self.runtime_row_cnt.lock().unwrap();
@@ -57,8 +58,11 @@ impl CostModel<OptRelNodeTyp> for AdaptiveCostModel {
5758
return OptCostModel::cost(1.0, 0.0, 1.0);
5859
}
5960
}
60-
let (mut row_cnt, compute_cost, io_cost) =
61-
OptCostModel::cost_tuple(&self.base_model.compute_cost(node, data, children, None));
61+
let (mut row_cnt, compute_cost, io_cost) = OptCostModel::cost_tuple(
62+
&self
63+
.base_model
64+
.compute_cost(node, data, children, context.clone(), optimizer),
65+
);
6266
if let Some(context) = context {
6367
let guard = self.runtime_row_cnt.lock().unwrap();
6468
if let Some((runtime_row_cnt, iter)) = guard.history.get(&context.group_id) {

0 commit comments

Comments
 (0)