Skip to content
This repository was archived by the owner on Jan 7, 2025. It is now read-only.

Commit 204758e

Browse files
feat: caching optd stats, 12x speedup on TPC-H SF1 (#132)
**Summary**: Now caching the stat objects used by `OptCostModel`, meaning we don't need to load data into DataFusion after doing it the first time. **Demo**: 12x speedup on TPC-H SF1 compared to not caching stats. Caching everything _except_ optd stats takes 45.6s total. ![Screenshot 2024-03-23 at 16 59 04](https://github.com/cmu-db/optd/assets/20631215/4c199374-e2df-43fb-9eba-f348ea1e275a) Caching everything, _including_ optd stats, takes 3.9s total. ![Screenshot 2024-03-23 at 16 57 45](https://github.com/cmu-db/optd/assets/20631215/4ef01ae9-c5a9-4fcd-bad9-c52d9a73c147) **Details**: * This caching is **disabled by default** to avoid accidentally using stale stats. I added a CLI arg to enable it. * The main challenge of this PR was making `PerTableStats` a serializable object for `serde`. * The serializability refactor will also help down the line when we want to **put statistics in the catalog**, since that is fundamentally a serialization problem too. Having `Box<dyn ...>` would make putting stats in the catalog more difficult. * This required a significant refactor of how the `MostCommonValues` and `Distribution` traits are handled in `OptCostModel`. Instead of having `Box<dyn ...>` values in `PerColumnStats` which store any object that implements these traits, I made `PerColumnStats` a templated object. * The one downside of this refactor is that we can no longer have a database which uses _different_ data structures for `Distribution` (like a t-digest for one column, a histogram for another, etc.). I didn't see this as a big enough reason not to do the refactor, because it seems like a rare thing to do. Additionally, if we really needed to do this, we could just make an enum that holds both types.
1 parent 3477898 commit 204758e

File tree

18 files changed

+281
-199
lines changed

18 files changed

+281
-199
lines changed

Cargo.lock

+2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

optd-core/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,4 @@ ordered-float = "4"
1414
tracing-subscriber = "0.3"
1515
pretty-xmlish = "0.1"
1616
itertools = "0.11"
17+
serde = {version = "1.0", features = ["derive", "rc"]}

optd-core/src/rel_node.rs

+28-3
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ use std::{
99
};
1010

1111
use ordered_float::OrderedFloat;
12+
use serde::{Deserialize, Deserializer, Serialize, Serializer};
1213

1314
use crate::{cascades::GroupId, cost::Cost};
1415

@@ -27,6 +28,30 @@ pub trait RelNodeTyp:
2728
}
2829

2930
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
31+
pub struct SerializableOrderedF64(pub OrderedFloat<f64>);
32+
33+
impl Serialize for SerializableOrderedF64 {
34+
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
35+
where
36+
S: Serializer,
37+
{
38+
// Directly serialize the inner f64 value of the OrderedFloat
39+
self.0 .0.serialize(serializer)
40+
}
41+
}
42+
43+
impl<'de> Deserialize<'de> for SerializableOrderedF64 {
44+
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
45+
where
46+
D: Deserializer<'de>,
47+
{
48+
// Deserialize an f64 and wrap it in an OrderedFloat
49+
let float = f64::deserialize(deserializer)?;
50+
Ok(SerializableOrderedF64(OrderedFloat(float)))
51+
}
52+
}
53+
54+
#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
3055
pub enum Value {
3156
UInt8(u8),
3257
UInt16(u16),
@@ -37,7 +62,7 @@ pub enum Value {
3762
Int32(i32),
3863
Int64(i64),
3964
Int128(i128),
40-
Float(OrderedFloat<f64>),
65+
Float(SerializableOrderedF64),
4166
String(Arc<str>),
4267
Bool(bool),
4368
Date32(i32),
@@ -57,7 +82,7 @@ impl std::fmt::Display for Value {
5782
Self::Int32(x) => write!(f, "{x}"),
5883
Self::Int64(x) => write!(f, "{x}"),
5984
Self::Int128(x) => write!(f, "{x}"),
60-
Self::Float(x) => write!(f, "{x}"),
85+
Self::Float(x) => write!(f, "{}", x.0),
6186
Self::String(x) => write!(f, "\"{x}\""),
6287
Self::Bool(x) => write!(f, "{x}"),
6388
Self::Date32(x) => write!(f, "{x}"),
@@ -133,7 +158,7 @@ impl Value {
133158

134159
pub fn as_f64(&self) -> f64 {
135160
match self {
136-
Value::Float(i) => **i,
161+
Value::Float(i) => *i.0,
137162
_ => panic!("Value is not an f64"),
138163
}
139164
}

optd-datafusion-repr/src/bin/test_optimize.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ use optd_core::{
88
rules::{Rule, RuleWrapper},
99
};
1010
use optd_datafusion_repr::{
11-
cost::{OptCostModel, PerTableStats},
11+
cost::{base_cost::DataFusionPerTableStats, OptCostModel},
1212
plan_nodes::{
1313
BinOpExpr, BinOpType, ColumnRefExpr, ConstantExpr, JoinType, LogicalFilter, LogicalJoin,
1414
LogicalScan, OptRelNode, OptRelNodeTyp, PlanNode,
@@ -45,7 +45,7 @@ pub fn main() {
4545
Box::new(OptCostModel::new(
4646
[("t1", 1000), ("t2", 100), ("t3", 10000)]
4747
.into_iter()
48-
.map(|(x, y)| (x.to_string(), PerTableStats::new(y, vec![])))
48+
.map(|(x, y)| (x.to_string(), DataFusionPerTableStats::new(y, vec![])))
4949
.collect(),
5050
)),
5151
vec![],

optd-datafusion-repr/src/cost.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
mod adaptive_cost;
2-
mod base_cost;
1+
pub mod adaptive_cost;
2+
pub mod base_cost;
33
mod stats;
44

55
pub use adaptive_cost::{AdaptiveCostModel, RuntimeAdaptionStorage, DEFAULT_DECAY};

optd-datafusion-repr/src/cost/adaptive_cost.rs

+14-9
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,14 @@ use optd_core::{
1010
rel_node::{RelNode, Value},
1111
};
1212

13-
use super::base_cost::BaseTableStats;
13+
use super::base_cost::{
14+
BaseTableStats, DataFusionDistribution, DataFusionMostCommonValues, Distribution,
15+
MostCommonValues,
16+
};
1417

1518
pub type RuntimeAdaptionStorage = Arc<Mutex<RuntimeAdaptionStorageInner>>;
19+
pub type DataFusionAdaptiveCostModel =
20+
AdaptiveCostModel<DataFusionMostCommonValues, DataFusionDistribution>;
1621

1722
#[derive(Default, Debug)]
1823
pub struct RuntimeAdaptionStorageInner {
@@ -22,13 +27,13 @@ pub struct RuntimeAdaptionStorageInner {
2227

2328
pub const DEFAULT_DECAY: usize = 50;
2429

25-
pub struct AdaptiveCostModel {
30+
pub struct AdaptiveCostModel<M: MostCommonValues, D: Distribution> {
2631
runtime_row_cnt: RuntimeAdaptionStorage,
27-
base_model: OptCostModel,
32+
base_model: OptCostModel<M, D>,
2833
decay: usize,
2934
}
3035

31-
impl CostModel<OptRelNodeTyp> for AdaptiveCostModel {
36+
impl<M: MostCommonValues, D: Distribution> CostModel<OptRelNodeTyp> for AdaptiveCostModel<M, D> {
3237
fn explain(&self, cost: &Cost) -> String {
3338
self.base_model.explain(cost)
3439
}
@@ -56,11 +61,11 @@ impl CostModel<OptRelNodeTyp> for AdaptiveCostModel {
5661
{
5762
if *iter + self.decay >= guard.iter_cnt {
5863
let runtime_row_cnt = (*runtime_row_cnt).max(1) as f64;
59-
return OptCostModel::cost(runtime_row_cnt, 0.0, runtime_row_cnt);
64+
return OptCostModel::<M, D>::cost(runtime_row_cnt, 0.0, runtime_row_cnt);
6065
}
6166
}
6267
}
63-
let (mut row_cnt, compute_cost, io_cost) = OptCostModel::cost_tuple(
68+
let (mut row_cnt, compute_cost, io_cost) = OptCostModel::<M, D>::cost_tuple(
6469
&self
6570
.base_model
6671
.compute_cost(node, data, children, context.clone(), optimizer),
@@ -74,16 +79,16 @@ impl CostModel<OptRelNodeTyp> for AdaptiveCostModel {
7479
}
7580
}
7681
}
77-
OptCostModel::cost(row_cnt, compute_cost, io_cost)
82+
OptCostModel::<M, D>::cost(row_cnt, compute_cost, io_cost)
7883
}
7984

8085
fn compute_plan_node_cost(&self, node: &RelNode<OptRelNodeTyp>) -> Cost {
8186
self.base_model.compute_plan_node_cost(node)
8287
}
8388
}
8489

85-
impl AdaptiveCostModel {
86-
pub fn new(decay: usize, stats: BaseTableStats) -> Self {
90+
impl<M: MostCommonValues, D: Distribution> AdaptiveCostModel<M, D> {
91+
pub fn new(decay: usize, stats: BaseTableStats<M, D>) -> Self {
8792
Self {
8893
runtime_row_cnt: RuntimeAdaptionStorage::default(),
8994
base_model: OptCostModel::new(stats),

0 commit comments

Comments (0)