Skip to content
This repository was archived by the owner on Jan 7, 2025. It is now read-only.

Commit 1c557a4

Browse files
authored
feat: integrate stats into optd (#117)
Generate the statistics in perftest and put them into `BaseCostModel` in `DatafusionOptimizer`. Below is the comparison before & after stats are added. You can check `PhysicalScan`, where the cost has changed. The final cardinality remains the same because when stats on a column is missing, we use a very small magic number `INVALID_SELECTIVITY` (0.001) that just sets cardinality to 1. ### Todos in Future PRs - Support generating stats on `Utf8`. - Set a better magic number. - Generate MCV. ### Before ``` plan space size budget used, not applying logical rules any more. current plan space: 1094 explain: PhysicalSort ├── exprs:SortOrder { order: Desc } │ └── #1 ├── cost: weighted=185.17,row_cnt=1.00,compute=179.17,io=6.00 └── PhysicalProjection { exprs: [ #0, #1 ], cost: weighted=182.12,row_cnt=1.00,compute=176.12,io=6.00 } └── PhysicalAgg ├── aggrs:Agg(Sum) │ └── Mul │ ├── #0 │ └── Sub │ ├── 1 │ └── #1 ├── groups: [ #2 ] ├── cost: weighted=182.02,row_cnt=1.00,compute=176.02,io=6.00 └── PhysicalProjection { exprs: [ #0, #1, #2 ], cost: weighted=64.90,row_cnt=1.00,compute=58.90,io=6.00 } └── PhysicalProjection { exprs: [ #0, #1, #4, #5, #6 ], cost: weighted=64.76,row_cnt=1.00,compute=58.76,io=6.00 } └── PhysicalProjection { exprs: [ #2, #3, #5, #6, #7, #8, #9 ], cost: weighted=64.54,row_cnt=1.00,compute=58.54,io=6.00 } └── PhysicalProjection { exprs: [ #0, #3, #4, #5, #6, #7, #8, #9, #10, #11 ], cost: weighted=64.24,row_cnt=1.00,compute=58.24,io=6.00 } └── PhysicalProjection { exprs: [ #1, #2, #4, #5, #6, #7, #8, #9, #10, #11, #12, #13 ], cost: weighted=63.82,row_cnt=1.00,compute=57.82,io=6.00 } └── PhysicalProjection { exprs: [ #0, #3, #8, #9, #10, #11, #12, #13, #14, #15, #16, #17, #18, #19 ], cost: weighted=63.32,row_cnt=1.00,compute=57.32,io=6.00 } └── PhysicalNestedLoopJoin ├── join_type: Inner ├── cond:And │ ├── Eq │ │ ├── #11 │ │ └── #14 │ └── Eq │ ├── #3 │ └── #15 ├── cost: weighted=62.74,row_cnt=1.00,compute=56.74,io=6.00 ├── PhysicalHashJoin { join_type: 
Inner, left_keys: [ #0 ], right_keys: [ #1 ], cost: weighted=35.70,row_cnt=1.00,compute=32.70,io=3.00 } │ ├── PhysicalScan { table: customer, cost: weighted=1.00,row_cnt=1.00,compute=0.00,io=1.00 } │ └── PhysicalHashJoin { join_type: Inner, left_keys: [ #0 ], right_keys: [ #0 ], cost: weighted=31.64,row_cnt=1.00,compute=29.64,io=2.00 } │ ├── PhysicalProjection { exprs: [ #0, #1 ], cost: weighted=27.40,row_cnt=1.00,compute=26.40,io=1.00 } │ │ └── PhysicalFilter │ │ ├── cond:And │ │ │ ├── Geq │ │ │ │ ├── #2 │ │ │ │ └── 9131 │ │ │ └── Lt │ │ │ ├── #2 │ │ │ └── 9496 │ │ ├── cost: weighted=27.30,row_cnt=1.00,compute=26.30,io=1.00 │ │ └── PhysicalProjection { exprs: [ #0, #1, #4 ], cost: weighted=1.14,row_cnt=1.00,compute=0.14,io=1.00 } │ │ └── PhysicalScan { table: orders, cost: weighted=1.00,row_cnt=1.00,compute=0.00,io=1.00 } │ └── PhysicalProjection { exprs: [ #0, #2, #5, #6 ], cost: weighted=1.18,row_cnt=1.00,compute=0.18,io=1.00 } │ └── PhysicalScan { table: lineitem, cost: weighted=1.00,row_cnt=1.00,compute=0.00,io=1.00 } └── PhysicalProjection { exprs: [ #0, #3, #7, #8, #9, #10 ], cost: weighted=15.72,row_cnt=1.00,compute=12.72,io=3.00 } └── PhysicalHashJoin { join_type: Inner, left_keys: [ #3 ], right_keys: [ #0 ], cost: weighted=15.46,row_cnt=1.00,compute=12.46,io=3.00 } ├── PhysicalScan { table: supplier, cost: weighted=1.00,row_cnt=1.00,compute=0.00,io=1.00 } └── PhysicalHashJoin { join_type: Inner, left_keys: [ #2 ], right_keys: [ #0 ], cost: weighted=11.40,row_cnt=1.00,compute=9.40,io=2.00 } ├── PhysicalProjection { exprs: [ #0, #1, #2 ], cost: weighted=1.14,row_cnt=1.00,compute=0.14,io=1.00 } │ └── PhysicalScan { table: nation, cost: weighted=1.00,row_cnt=1.00,compute=0.00,io=1.00 } └── PhysicalProjection { exprs: [ #0 ], cost: weighted=7.20,row_cnt=1.00,compute=6.20,io=1.00 } └── PhysicalFilter ├── cond:Eq │ ├── #1 │ └── "AMERICA" ├── cost: weighted=7.14,row_cnt=1.00,compute=6.14,io=1.00 └── PhysicalProjection { exprs: [ #0, #1 ], cost: 
weighted=1.10,row_cnt=1.00,compute=0.10,io=1.00 } └── PhysicalScan { table: region, cost: weighted=1.00,row_cnt=1.00,compute=0.00,io=1.00 } plan space size budget used, not applying logical rules any more. current plan space: 1094 qerrors: {"DataFusion": [5.0]} ``` ### After ``` plan space size budget used, not applying logical rules any more. current plan space: 1094 explain: PhysicalSort ├── exprs:SortOrder { order: Desc } │ └── #1 ├── cost: weighted=336032.32,row_cnt=1.00,compute=259227.32,io=76805.00 └── PhysicalProjection { exprs: [ #0, #1 ], cost: weighted=336029.27,row_cnt=1.00,compute=259224.27,io=76805.00 } └── PhysicalAgg ├── aggrs:Agg(Sum) │ └── Mul │ ├── #0 │ └── Sub │ ├── 1 │ └── #1 ├── groups: [ #2 ] ├── cost: weighted=336029.17,row_cnt=1.00,compute=259224.17,io=76805.00 └── PhysicalProjection { exprs: [ #0, #1, #2 ], cost: weighted=335912.05,row_cnt=1.00,compute=259107.05,io=76805.00 } └── PhysicalProjection { exprs: [ #0, #1, #4, #5, #6 ], cost: weighted=335911.91,row_cnt=1.00,compute=259106.91,io=76805.00 } └── PhysicalProjection { exprs: [ #2, #3, #5, #6, #7, #8, #9 ], cost: weighted=335911.69,row_cnt=1.00,compute=259106.69,io=76805.00 } └── PhysicalProjection { exprs: [ #0, #3, #4, #5, #6, #7, #8, #9, #10, #11 ], cost: weighted=335911.39,row_cnt=1.00,compute=259106.39,io=76805.00 } └── PhysicalProjection { exprs: [ #1, #2, #4, #5, #6, #7, #8, #9, #10, #11, #12, #13 ], cost: weighted=335910.97,row_cnt=1.00,compute=259105.97,io=76805.00 } └── PhysicalProjection { exprs: [ #0, #3, #8, #9, #10, #11, #12, #13, #14, #15, #16, #17, #18, #19 ], cost: weighted=335910.47,row_cnt=1.00,compute=259105.47,io=76805.00 } └── PhysicalNestedLoopJoin ├── join_type: Inner ├── cond:And │ ├── Eq │ │ ├── #11 │ │ └── #14 │ └── Eq │ ├── #3 │ └── #15 ├── cost: weighted=335909.89,row_cnt=1.00,compute=259104.89,io=76805.00 ├── PhysicalProjection { exprs: [ #6, #7, #8, #9, #10, #11, #12, #13, #0, #1, #2, #3, #4, #5 ], cost: 
weighted=335619.21,row_cnt=1.00,compute=258944.21,io=76675.00 } │ └── PhysicalHashJoin { join_type: Inner, left_keys: [ #1 ], right_keys: [ #0 ], cost: weighted=335618.63,row_cnt=1.00,compute=258943.63,io=76675.00 } │ ├── PhysicalProjection { exprs: [ #4, #5, #0, #1, #2, #3 ], cost: weighted=332616.57,row_cnt=1.00,compute=257441.57,io=75175.00 } │ │ └── PhysicalProjection { exprs: [ #0, #2, #5, #6, #16, #17 ], cost: weighted=332616.31,row_cnt=1.00,compute=257441.31,io=75175.00 } │ │ └── PhysicalProjection { exprs: [ #2, #3, #4, #5, #6, #7, #8, #9, #10, #11, #12, #13, #14, #15, #16, #17, #0, #1 ], cost: weighted=332616.05,row_cnt=1.00,compute=257441.05,io=75175.00 } │ │ └── PhysicalHashJoin { join_type: Inner, left_keys: [ #0 ], right_keys: [ #0 ], cost: weighted=332615.31,row_cnt=1.00,compute=257440.31,io=75175.00 } │ │ ├── PhysicalProjection { exprs: [ #0, #1 ], cost: weighted=212263.25,row_cnt=1.00,compute=197263.25,io=15000.00 } │ │ │ └── PhysicalFilter │ │ │ ├── cond:And │ │ │ │ ├── Geq │ │ │ │ │ ├── #2 │ │ │ │ │ └── 9131 │ │ │ │ └── Lt │ │ │ │ ├── #2 │ │ │ │ └── 9496 │ │ │ ├── cost: weighted=212263.15,row_cnt=1.00,compute=197263.15,io=15000.00 │ │ │ └── PhysicalProjection { exprs: [ #0, #1, #4 ], cost: weighted=16050.07,row_cnt=15000.00,compute=1050.07,io=15000.00 } │ │ │ └── PhysicalScan { table: orders, cost: weighted=15000.00,row_cnt=15000.00,compute=0.00,io=15000.00 } │ │ └── PhysicalScan { table: lineitem, cost: weighted=60175.00,row_cnt=60175.00,compute=0.00,io=60175.00 } │ └── PhysicalScan { table: customer, cost: weighted=1500.00,row_cnt=1500.00,compute=0.00,io=1500.00 } └── PhysicalProjection { exprs: [ #0, #3, #7, #8, #9, #10 ], cost: weighted=279.36,row_cnt=1.00,compute=149.36,io=130.00 } └── PhysicalProjection { exprs: [ #4, #5, #6, #7, #8, #9, #10, #0, #1, #2, #3 ], cost: weighted=279.10,row_cnt=1.00,compute=149.10,io=130.00 } └── PhysicalProjection { exprs: [ #1, #2, #3, #0, #4, #5, #6, #7, #8, #9, #10 ], cost: 
weighted=278.64,row_cnt=1.00,compute=148.64,io=130.00 } └── PhysicalHashJoin { join_type: Inner, left_keys: [ #1 ], right_keys: [ #3 ], cost: weighted=278.18,row_cnt=1.00,compute=148.18,io=130.00 } ├── PhysicalProjection { exprs: [ #3, #0, #1, #2 ], cost: weighted=76.12,row_cnt=1.00,compute=46.12,io=30.00 } │ └── PhysicalProjection { exprs: [ #0, #1, #2, #4 ], cost: weighted=75.94,row_cnt=1.00,compute=45.94,io=30.00 } │ └── PhysicalProjection { exprs: [ #1, #2, #3, #4, #0 ], cost: weighted=75.76,row_cnt=1.00,compute=45.76,io=30.00 } │ └── PhysicalHashJoin { join_type: Inner, left_keys: [ #0 ], right_keys: [ #2 ], cost: weighted=75.54,row_cnt=1.00,compute=45.54,io=30.00 } │ ├── PhysicalProjection { exprs: [ #0 ], cost: weighted=23.48,row_cnt=1.00,compute=18.48,io=5.00 } │ │ └── PhysicalFilter │ │ ├── cond:Eq │ │ │ ├── #1 │ │ │ └── "AMERICA" │ │ ├── cost: weighted=23.42,row_cnt=1.00,compute=18.42,io=5.00 │ │ └── PhysicalProjection { exprs: [ #0, #1 ], cost: weighted=5.30,row_cnt=5.00,compute=0.30,io=5.00 } │ │ └── PhysicalScan { table: region, cost: weighted=5.00,row_cnt=5.00,compute=0.00,io=5.00 } │ └── PhysicalScan { table: nation, cost: weighted=25.00,row_cnt=25.00,compute=0.00,io=25.00 } └── PhysicalScan { table: supplier, cost: weighted=100.00,row_cnt=100.00,compute=0.00,io=100.00 } plan space size budget used, not applying logical rules any more. current plan space: 1094 qerrors: {"DataFusion": [5.0]} ```
1 parent 7915fb9 commit 1c557a4

File tree

9 files changed

+324
-40
lines changed

9 files changed

+324
-40
lines changed

Cargo.lock

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

optd-datafusion-repr/src/cost/adaptive_cost.rs

+3-5
Original file line numberDiff line numberDiff line change
@@ -51,15 +51,13 @@ impl CostModel<OptRelNodeTyp> for AdaptiveCostModel {
5151
) -> Cost {
5252
if let OptRelNodeTyp::PhysicalScan = node {
5353
let guard = self.runtime_row_cnt.lock().unwrap();
54-
if let Some((runtime_row_cnt, iter)) = guard.history.get(&context.unwrap().group_id) {
54+
if let Some((runtime_row_cnt, iter)) =
55+
guard.history.get(&context.as_ref().unwrap().group_id)
56+
{
5557
if *iter + self.decay >= guard.iter_cnt {
5658
let runtime_row_cnt = (*runtime_row_cnt).max(1) as f64;
5759
return OptCostModel::cost(runtime_row_cnt, 0.0, runtime_row_cnt);
58-
} else {
59-
return OptCostModel::cost(1.0, 0.0, 1.0);
6060
}
61-
} else {
62-
return OptCostModel::cost(1.0, 0.0, 1.0);
6361
}
6462
}
6563
let (mut row_cnt, compute_cost, io_cost) = OptCostModel::cost_tuple(

optd-datafusion-repr/src/cost/base_cost.rs

+216-8
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,20 @@ use crate::{
66
plan_nodes::{OptRelNodeRef, OptRelNodeTyp},
77
properties::column_ref::ColumnRef,
88
};
9+
use arrow_schema::{ArrowError, DataType};
10+
use datafusion::arrow::array::{
11+
Array, BooleanArray, Date32Array, Decimal128Array, Float32Array, Float64Array, Int16Array,
12+
Int32Array, Int8Array, RecordBatch, RecordBatchIterator, RecordBatchReader, UInt16Array,
13+
UInt32Array, UInt8Array,
14+
};
915
use itertools::Itertools;
1016
use optd_core::{
1117
cascades::{CascadesOptimizer, RelNodeContext},
1218
cost::{Cost, CostModel},
1319
rel_node::{RelNode, RelNodeTyp, Value},
1420
};
21+
use optd_gungnir::stats::hyperloglog::{self, HyperLogLog};
22+
use optd_gungnir::stats::tdigest::{self, TDigest};
1523

1624
fn compute_plan_node_cost<T: RelNodeTyp, C: CostModel<T>>(
1725
model: &C,
@@ -34,9 +42,207 @@ pub struct OptCostModel {
3442
per_table_stats_map: BaseTableStats,
3543
}
3644

45+
struct MockMostCommonValues {
46+
mcvs: HashMap<Value, f64>,
47+
}
48+
49+
impl MockMostCommonValues {
50+
pub fn empty() -> Self {
51+
MockMostCommonValues {
52+
mcvs: HashMap::new(),
53+
}
54+
}
55+
}
56+
57+
impl MostCommonValues for MockMostCommonValues {
58+
fn freq(&self, value: &Value) -> Option<f64> {
59+
self.mcvs.get(value).copied()
60+
}
61+
62+
fn total_freq(&self) -> f64 {
63+
self.mcvs.values().sum()
64+
}
65+
66+
fn freq_over_pred(&self, pred: Box<dyn Fn(&Value) -> bool>) -> f64 {
67+
self.mcvs
68+
.iter()
69+
.filter(|(val, _)| pred(val))
70+
.map(|(_, freq)| freq)
71+
.sum()
72+
}
73+
74+
fn cnt(&self) -> usize {
75+
self.mcvs.len()
76+
}
77+
}
78+
3779
pub struct PerTableStats {
3880
row_cnt: usize,
39-
per_column_stats_vec: Vec<PerColumnStats>,
81+
per_column_stats_vec: Vec<Option<PerColumnStats>>,
82+
}
83+
84+
impl PerTableStats {
85+
pub fn from_record_batches<I: IntoIterator<Item = Result<RecordBatch, ArrowError>>>(
86+
batch_iter: RecordBatchIterator<I>,
87+
) -> anyhow::Result<Self> {
88+
let schema = batch_iter.schema();
89+
let col_types = schema
90+
.fields()
91+
.iter()
92+
.map(|f| f.data_type().clone())
93+
.collect_vec();
94+
let col_cnt = col_types.len();
95+
96+
let mut row_cnt = 0;
97+
let mut mcvs = col_types
98+
.iter()
99+
.map(|col_type| {
100+
if Self::is_type_supported(col_type) {
101+
Some(MockMostCommonValues::empty())
102+
} else {
103+
None
104+
}
105+
})
106+
.collect_vec();
107+
let mut distr = col_types
108+
.iter()
109+
.map(|col_type| {
110+
if Self::is_type_supported(col_type) {
111+
Some(TDigest::new(tdigest::DEFAULT_COMPRESSION))
112+
} else {
113+
None
114+
}
115+
})
116+
.collect_vec();
117+
let mut hlls = vec![HyperLogLog::new(hyperloglog::DEFAULT_PRECISION); col_cnt];
118+
let mut null_cnt = vec![0; col_cnt];
119+
120+
for batch in batch_iter {
121+
let batch = batch?;
122+
row_cnt += batch.num_rows();
123+
124+
// Enumerate the columns.
125+
for (i, col) in batch.columns().iter().enumerate() {
126+
let col_type = &col_types[i];
127+
if Self::is_type_supported(col_type) {
128+
// Update null cnt.
129+
null_cnt[i] += col.null_count();
130+
131+
Self::generate_stats_for_column(col, col_type, &mut distr[i], &mut hlls[i]);
132+
}
133+
}
134+
}
135+
136+
// Assemble the per-column stats.
137+
let mut per_column_stats_vec = Vec::with_capacity(col_cnt);
138+
for i in 0..col_cnt {
139+
per_column_stats_vec.push(if Self::is_type_supported(&col_types[i]) {
140+
Some(PerColumnStats {
141+
mcvs: Box::new(mcvs[i].take().unwrap()) as Box<dyn MostCommonValues>,
142+
ndistinct: hlls[i].n_distinct(),
143+
null_frac: null_cnt[i] as f64 / row_cnt as f64,
144+
distr: Box::new(distr[i].take().unwrap()) as Box<dyn Distribution>,
145+
})
146+
} else {
147+
None
148+
});
149+
}
150+
Ok(Self {
151+
row_cnt,
152+
per_column_stats_vec,
153+
})
154+
}
155+
156+
fn is_type_supported(data_type: &DataType) -> bool {
157+
matches!(
158+
data_type,
159+
DataType::Boolean
160+
| DataType::Int8
161+
| DataType::Int16
162+
| DataType::Int32
163+
| DataType::UInt8
164+
| DataType::UInt16
165+
| DataType::UInt32
166+
| DataType::Float32
167+
| DataType::Float64
168+
)
169+
}
170+
171+
/// Generate statistics for a column.
172+
fn generate_stats_for_column(
173+
col: &Arc<dyn Array>,
174+
col_type: &DataType,
175+
distr: &mut Option<TDigest>,
176+
hll: &mut HyperLogLog,
177+
) {
178+
macro_rules! generate_stats_for_col {
179+
({ $col:expr, $distr:expr, $hll:expr, $array_type:path, $to_f64:ident }) => {{
180+
let array = $col.as_any().downcast_ref::<$array_type>().unwrap();
181+
// Filter out `None` values.
182+
let values = array.iter().filter_map(|x| x).collect::<Vec<_>>();
183+
184+
// Update distribution.
185+
*$distr = {
186+
let mut f64_values = values.iter().map(|x| $to_f64(*x)).collect::<Vec<_>>();
187+
Some($distr.take().unwrap().merge_values(&mut f64_values))
188+
};
189+
190+
// Update hll.
191+
$hll.aggregate(&values);
192+
}};
193+
}
194+
195+
/// Convert a value to f64 with no out of range or precision loss.
196+
fn to_f64_safe<T: Into<f64>>(val: T) -> f64 {
197+
val.into()
198+
}
199+
200+
/// Convert i128 to f64 with possible precision loss.
201+
///
202+
/// Note: optd represents decimal with the significand as f64 (see `ConstantExpr::decimal`).
203+
/// For instance 0.04 of type `Decimal128(15, 2)` is just 4.0, the type information
204+
/// is discarded. Therefore we must use the significand to generate the statistics.
205+
fn i128_to_f64(val: i128) -> f64 {
206+
val as f64
207+
}
208+
209+
match col_type {
210+
DataType::Boolean => {
211+
generate_stats_for_col!({ col, distr, hll, BooleanArray, to_f64_safe })
212+
}
213+
DataType::Int8 => {
214+
generate_stats_for_col!({ col, distr, hll, Int8Array, to_f64_safe })
215+
}
216+
DataType::Int16 => {
217+
generate_stats_for_col!({ col, distr, hll, Int16Array, to_f64_safe })
218+
}
219+
DataType::Int32 => {
220+
generate_stats_for_col!({ col, distr, hll, Int32Array, to_f64_safe })
221+
}
222+
DataType::UInt8 => {
223+
generate_stats_for_col!({ col, distr, hll, UInt8Array, to_f64_safe })
224+
}
225+
DataType::UInt16 => {
226+
generate_stats_for_col!({ col, distr, hll, UInt16Array, to_f64_safe })
227+
}
228+
DataType::UInt32 => {
229+
generate_stats_for_col!({ col, distr, hll, UInt32Array, to_f64_safe })
230+
}
231+
DataType::Float32 => {
232+
generate_stats_for_col!({ col, distr, hll, Float32Array, to_f64_safe })
233+
}
234+
DataType::Float64 => {
235+
generate_stats_for_col!({ col, distr, hll, Float64Array, to_f64_safe })
236+
}
237+
DataType::Date32 => {
238+
generate_stats_for_col!({ col, distr, hll, Date32Array, to_f64_safe })
239+
}
240+
DataType::Decimal128(_, _) => {
241+
generate_stats_for_col!({ col, distr, hll, Decimal128Array, i128_to_f64 })
242+
}
243+
_ => unreachable!(),
244+
}
245+
}
40246
}
41247

42248
pub struct PerColumnStats {
@@ -45,7 +251,7 @@ pub struct PerColumnStats {
45251

46252
// ndistinct _does_ include the values in mcvs
47253
// ndistinct _does not_ include nulls
48-
ndistinct: i32,
254+
ndistinct: u64,
49255

50256
// postgres uses null_frac instead of something like "num_nulls" so we'll follow suit
51257
// my guess for why they use null_frac is because we only ever use the fraction of nulls, not the #
@@ -445,7 +651,8 @@ impl OptCostModel {
445651
is_eq: bool,
446652
) -> Option<f64> {
447653
if let Some(per_table_stats) = self.per_table_stats_map.get(table) {
448-
if let Some(per_column_stats) = per_table_stats.per_column_stats_vec.get(col_idx) {
654+
if let Some(Some(per_column_stats)) = per_table_stats.per_column_stats_vec.get(col_idx)
655+
{
449656
let eq_freq = if let Some(freq) = per_column_stats.mcvs.freq(value) {
450657
freq
451658
} else {
@@ -484,7 +691,8 @@ impl OptCostModel {
484691
is_col_eq_val: bool,
485692
) -> Option<f64> {
486693
if let Some(per_table_stats) = self.per_table_stats_map.get(table) {
487-
if let Some(per_column_stats) = per_table_stats.per_column_stats_vec.get(col_idx) {
694+
if let Some(Some(per_column_stats)) = per_table_stats.per_column_stats_vec.get(col_idx)
695+
{
488696
// because distr does not include the values in MCVs, we need to compute the CDFs there as well
489697
// because nulls return false in any comparison, they are never included when computing range selectivity
490698
let distr_leq_freq = per_column_stats.distr.cdf(value);
@@ -555,7 +763,7 @@ impl OptCostModel {
555763
}
556764

557765
impl PerTableStats {
558-
pub fn new(row_cnt: usize, per_column_stats_vec: Vec<PerColumnStats>) -> Self {
766+
pub fn new(row_cnt: usize, per_column_stats_vec: Vec<Option<PerColumnStats>>) -> Self {
559767
Self {
560768
row_cnt,
561769
per_column_stats_vec,
@@ -566,7 +774,7 @@ impl PerTableStats {
566774
impl PerColumnStats {
567775
pub fn new(
568776
mcvs: Box<dyn MostCommonValues>,
569-
ndistinct: i32,
777+
ndistinct: u64,
570778
null_frac: f64,
571779
distr: Box<dyn Distribution>,
572780
) -> Self {
@@ -612,7 +820,7 @@ mod tests {
612820
}
613821
}
614822

615-
fn empty() -> Self {
823+
pub fn empty() -> Self {
616824
MockMostCommonValues::new(vec![])
617825
}
618826
}
@@ -664,7 +872,7 @@ mod tests {
664872
OptCostModel::new(
665873
vec![(
666874
String::from(TABLE1_NAME),
667-
PerTableStats::new(100, vec![per_column_stats]),
875+
PerTableStats::new(100, vec![Some(per_column_stats)]),
668876
)]
669877
.into_iter()
670878
.collect(),

optd-gungnir/src/stats/hyperloglog.rs

+10
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,16 @@
88
use crate::stats::murmur2::murmur_hash;
99
use std::cmp::max;
1010

11+
pub const DEFAULT_PRECISION: u8 = 12;
12+
1113
/// Trait to transform any object into a stream of bytes.
1214
pub trait ByteSerializable {
1315
fn to_bytes(&self) -> Vec<u8>;
1416
}
1517

1618
/// The HyperLogLog (HLL) structure to provide a statistical estimate of NDistinct.
1719
/// For safety reasons, HLLs can only count elements of the same ByteSerializable type.
20+
#[derive(Clone)]
1821
pub struct HyperLogLog {
1922
registers: Vec<u8>, // The buckets to estimate HLL on (i.e. upper p bits).
2023
precision: u8, // The precision (p) of our HLL; 4 <= p <= 16.
@@ -29,6 +32,13 @@ impl ByteSerializable for String {
2932
}
3033
}
3134

35+
// Serialize common data types for hashing (bool).
36+
impl ByteSerializable for bool {
37+
fn to_bytes(&self) -> Vec<u8> {
38+
(*self as u8).to_bytes()
39+
}
40+
}
41+
3242
// Serialize common data types for hashing (numeric).
3343
macro_rules! impl_byte_serializable_for_numeric {
3444
($($type:ty),*) => {

optd-gungnir/src/stats/tdigest.rs

+3
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,10 @@
66
use itertools::Itertools;
77
use std::f64::consts::PI;
88

9+
pub const DEFAULT_COMPRESSION: f64 = 200.0;
10+
911
/// The TDigest structure for the statistical aggregator to query quantiles.
12+
#[derive(Clone)]
1013
pub struct TDigest {
1114
/// A sorted array of Centroids, according to their mean.
1215
centroids: Vec<Centroid>,

optd-perftest/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ datafusion = { version = "32.0.0", features = [
1616
] }
1717
optd-datafusion-repr = { path = "../optd-datafusion-repr" }
1818
optd-datafusion-bridge = { path = "../optd-datafusion-bridge" }
19+
optd-gungnir = { path = "../optd-gungnir" }
1920
datafusion-optd-cli = { path = "../datafusion-optd-cli" }
2021
futures = "0.3"
2122
anyhow = { version = "1", features = ["backtrace"] }

optd-perftest/src/cardtest.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ use std::collections::HashMap;
22
use std::path::Path;
33

44
use crate::postgres_db::PostgresDb;
5-
use crate::{benchmark::Benchmark, datafusion_db_cardtest::DatafusionDb, tpch::TpchConfig};
5+
use crate::{benchmark::Benchmark, datafusion_db::DatafusionDb, tpch::TpchConfig};
66

77
use anyhow::{self};
88
use async_trait::async_trait;

0 commit comments

Comments
 (0)