From e0cfec8db53260d74ca7110b20a882d8f61bf14a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Pereira?= Date: Wed, 13 Nov 2024 10:46:14 -0500 Subject: [PATCH 1/5] feature: dump and visualize the memo as a graph --- optd-core/src/cascades/optimizer.rs | 70 ++++++++++++++++++++++ optd-core/src/cascades/tasks/apply_rule.rs | 15 ++++- 2 files changed, 84 insertions(+), 1 deletion(-) diff --git a/optd-core/src/cascades/optimizer.rs b/optd-core/src/cascades/optimizer.rs index 2af46e12..aa53853b 100644 --- a/optd-core/src/cascades/optimizer.rs +++ b/optd-core/src/cascades/optimizer.rs @@ -5,6 +5,8 @@ use std::collections::{BTreeSet, HashMap, HashSet, VecDeque}; use std::fmt::Display; +use std::hash::{DefaultHasher, Hasher}; +use std::io::Write; use std::sync::Arc; use anyhow::Result; @@ -170,6 +172,74 @@ impl> CascadesOptimizer { self.disabled_rules.contains(&rule_id) } + pub fn dump_dot(&self, writer: &mut Box) -> Result<(), std::io::Error> { + let memo = self.memo(); + + // filter out predicates and use a btree for predictable iteration order + let mut groups = BTreeSet::new(); + for group_id in memo.get_all_group_ids() { + let group = memo.get_group(group_id); + let mut rel = false; + for expr_id in memo.get_all_exprs_in_group(group_id).iter() { + if memo.get_expr_memoed(*expr_id).typ.is_logical() { + groups.insert(group_id); + break; + } + } + } + + writeln!(writer, "digraph Memo {{")?; + writeln!( + writer, + "compound=true; ranksep=1.0; node [colorscheme=set312];" + )?; + for group_id in groups.iter() { + let group = memo.get_group(*group_id); + writeln!(writer, "subgraph cluster_{} {{", group_id.0)?; + writeln!(writer, "rank=source;")?; + writeln!(writer, "edge [style=invis];")?; + writeln!( + writer, + "g{} [shape=plaintext,label=\"group_id=!{}\"];", + group_id.0, group_id.0 + )?; + for expr_id in memo.get_all_exprs_in_group(*group_id).iter() { + let expr = memo.get_expr_memoed(*expr_id); + let mut s = DefaultHasher::new(); + expr.typ.hash(&mut s); + let color = (s.finish() % 11) + 1; // %11 looks better than %12! :-) + let shape = if expr.typ.is_logical() { "oval" } else { "box" }; + let rules = match self.fired_rules.get(expr_id) { + None => 0, + Some(v) => v.len(), + }; + writeln!( + writer, + "e{} [shape={},label=\"{}: {:?} ({})\",style=filled,color={}]", + expr_id.0, shape, expr_id.0, expr.typ, rules, color + )?; + writeln!(writer, "g{} -> e{};", group_id.0, expr_id.0)?; + } + writeln!(writer, "}}"); + } + for group_id in groups.iter() { + for expr_id in memo.get_all_exprs_in_group(*group_id).iter() { + let expr = memo.get_expr_memoed(*expr_id); + //writeln!(writer, "\"g{}\" -> \"e{}\";", i, e)?; + for child in expr.children.iter() { + if groups.contains(child) { + writeln!( + writer, + "e{} -> g{} [lhead=\"cluster_{}\"];", + expr_id.0, child.0, child.0 + )?; + } + } + } + } + writeln!(writer, "}}") + } + pub fn dump(&self) { for group_id in self.memo.get_all_group_ids() { let winner_str = match &self.memo.get_group_info(group_id).winner { diff --git a/optd-core/src/cascades/tasks/apply_rule.rs b/optd-core/src/cascades/tasks/apply_rule.rs index b6e62efb..148c779b 100644 --- a/optd-core/src/cascades/tasks/apply_rule.rs +++ b/optd-core/src/cascades/tasks/apply_rule.rs @@ -7,7 +7,7 @@ use std::sync::Arc; use anyhow::Result; use itertools::Itertools; -use tracing::trace; +use tracing::{log::log_enabled, log::Level::Trace, trace}; use super::Task; use crate::cascades::memo::ArcMemoPlanNode; @@ -164,6 +164,19 @@ impl> Task for ApplyRuleTask { return Ok(vec![]); } + if log_enabled!(target: "optd-memoviz", Trace) { + let path = format!( + "optd-{:#016x}.dot", + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_nanos() + ); + let mut writer = + Box::new(std::fs::File::create(path).unwrap()) as Box; + optimizer.dump_dot(&mut writer); + } + let rule = optimizer.rules()[self.rule_id].clone(); trace!(event = "task_begin", task = "apply_rule", expr_id = %self.expr_id, rule_id = %self.rule_id, rule = %rule.name()); From 69c8779750f5967c7520d2472bc771e035f17683 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Pereira?= Date: Fri, 15 Nov 2024 11:09:47 -0500 Subject: [PATCH 2/5] make it an option, sequence names, and remove Box --- optd-core/src/cascades/optimizer.rs | 7 ++++++- optd-core/src/cascades/tasks/apply_rule.rs | 18 +++++++----------- optd-datafusion-repr/src/lib.rs | 1 + 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/optd-core/src/cascades/optimizer.rs b/optd-core/src/cascades/optimizer.rs index aa53853b..beaf9163 100644 --- a/optd-core/src/cascades/optimizer.rs +++ b/optd-core/src/cascades/optimizer.rs @@ -7,6 +7,7 @@ use std::collections::{BTreeSet, HashMap, HashSet, VecDeque}; use std::fmt::Display; use std::hash::{DefaultHasher, Hasher}; use std::io::Write; +use std::path::Path; use std::sync::Arc; use anyhow::Result; @@ -42,6 +43,8 @@ pub struct OptimizerProperties { pub partial_explore_space: Option, /// Disable pruning during optimization. pub disable_pruning: bool, + /// Dump the memo as 0000.dot, 0001.dot, ... files to this path + pub dot_file_path: Option, } pub struct CascadesOptimizer = NaiveMemo> { @@ -56,6 +59,7 @@ pub struct CascadesOptimizer = NaiveMemo> { property_builders: Arc<[Box>]>, pub ctx: OptimizerContext, pub prop: OptimizerProperties, + pub next_dot_file: usize, } /// `RelNode` only contains the representation of the plan nodes. Sometimes, we need more context, @@ -125,6 +129,7 @@ impl CascadesOptimizer> { property_builders, prop, disabled_rules: HashSet::new(), + next_dot_file: 0, } } @@ -172,7 +177,7 @@ impl> CascadesOptimizer { self.disabled_rules.contains(&rule_id) } - pub fn dump_dot(&self, writer: &mut Box) -> Result<(), std::io::Error> { + pub fn dump_dot(&self, writer: &mut dyn Write) -> Result<(), std::io::Error> { let memo = self.memo(); // filter out predicates and use a btree for predictable iteration order diff --git a/optd-core/src/cascades/tasks/apply_rule.rs b/optd-core/src/cascades/tasks/apply_rule.rs index 148c779b..d5932a9c 100644 --- a/optd-core/src/cascades/tasks/apply_rule.rs +++ b/optd-core/src/cascades/tasks/apply_rule.rs @@ -3,11 +3,12 @@ // Use of this source code is governed by an MIT-style license that can be found in the LICENSE file or at // https://opensource.org/licenses/MIT. +use std::path::Path; use std::sync::Arc; use anyhow::Result; use itertools::Itertools; -use tracing::{log::log_enabled, log::Level::Trace, trace}; +use tracing::{debug, trace}; use super::Task; use crate::cascades::memo::ArcMemoPlanNode; @@ -164,16 +165,11 @@ impl> Task for ApplyRuleTask { return Ok(vec![]); } - if log_enabled!(target: "optd-memoviz", Trace) { - let path = format!( - "optd-{:#016x}.dot", - std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap() - .as_nanos() - ); - let mut writer = - Box::new(std::fs::File::create(path).unwrap()) as Box; + if let Some(pathname) = &optimizer.prop.dot_file_path { + let path = Path::new(pathname).join(format!("{:#08}.dot", optimizer.next_dot_file)); + debug!("dumping memo to {:?}", path); + optimizer.next_dot_file += 1; + let mut writer = std::fs::File::create(path).unwrap(); optimizer.dump_dot(&mut writer); } diff --git a/optd-datafusion-repr/src/lib.rs b/optd-datafusion-repr/src/lib.rs index 5a491d1d..eb224db5 100644 --- a/optd-datafusion-repr/src/lib.rs +++ b/optd-datafusion-repr/src/lib.rs @@ -152,6 +152,7 @@ impl DatafusionOptimizer { partial_explore_iter: Some(1 << 20), partial_explore_space: Some(1 << 10), disable_pruning: false, + dot_file_path: None, }, ), heuristic_optimizer: HeuristicsOptimizer::new_with_rules( From 9538ebfc101aff9dd007e39eefdd7036447ddd32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Pereira?= Date: Fri, 15 Nov 2024 12:58:35 -0500 Subject: [PATCH 3/5] simplify traversal without predicates --- optd-core/src/cascades/optimizer.rs | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/optd-core/src/cascades/optimizer.rs b/optd-core/src/cascades/optimizer.rs index beaf9163..37f1de2e 100644 --- a/optd-core/src/cascades/optimizer.rs +++ b/optd-core/src/cascades/optimizer.rs @@ -232,13 +232,11 @@ impl> CascadesOptimizer { let expr = memo.get_expr_memoed(*expr_id); //writeln!(writer, "\"g{}\" -> \"e{}\";", i, e)?; for child in expr.children.iter() { - if groups.contains(child) { - writeln!( - writer, - "e{} -> g{} [lhead=\"cluster_{}\"];", - expr_id.0, child.0, child.0 - )?; - } + writeln!( + writer, + "e{} -> g{} [lhead=\"cluster_{}\"];", + expr_id.0, child.0, child.0 + )?; } } } From 7df07eaf7a702d4c3eb5f80ed6ef1fb5c83b1921 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Pereira?= Date: Sun, 17 Nov 2024 06:48:57 -0500 Subject: [PATCH 4/5] simplify group ordering --- optd-core/src/cascades/optimizer.rs | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/optd-core/src/cascades/optimizer.rs b/optd-core/src/cascades/optimizer.rs index 37f1de2e..26a2b864 100644 --- a/optd-core/src/cascades/optimizer.rs +++ b/optd-core/src/cascades/optimizer.rs @@ -11,6 +11,7 @@ use std::path::Path; use std::sync::Arc; use anyhow::Result; +use itertools::Itertools; use tracing::trace; use super::memo::{ArcMemoPlanNode, GroupInfo, Memo}; @@ -180,18 +181,8 @@ impl> CascadesOptimizer { pub fn dump_dot(&self, writer: &mut dyn Write) -> Result<(), std::io::Error> { let memo = self.memo(); - // filter out predicates and use a btree for predictable iteration order - let mut groups = BTreeSet::new(); - for group_id in memo.get_all_group_ids() { - let group = memo.get_group(group_id); - let mut rel = false; - for expr_id in memo.get_all_exprs_in_group(group_id).iter() { - if memo.get_expr_memoed(*expr_id).typ.is_logical() { - groups.insert(group_id); - break; - } - } - } + // Collect all groups in a predictable iteration order + let groups: Vec = memo.get_all_group_ids().iter().sorted().cloned().collect(); writeln!(writer, "digraph Memo {{")?; writeln!( @@ -230,7 +221,6 @@ impl> CascadesOptimizer { for group_id in groups.iter() { for expr_id in memo.get_all_exprs_in_group(*group_id).iter() { let expr = memo.get_expr_memoed(*expr_id); - //writeln!(writer, "\"g{}\" -> \"e{}\";", i, e)?; for child in expr.children.iter() { writeln!( writer, From a702a42d79782e1b7c84318189baeeb07b6186fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Pereira?= Date: Sun, 17 Nov 2024 06:50:04 -0500 Subject: [PATCH 5/5] traverse predicates --- optd-core/src/cascades/optimizer.rs | 34 ++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/optd-core/src/cascades/optimizer.rs b/optd-core/src/cascades/optimizer.rs index 26a2b864..001442b8 100644 --- a/optd-core/src/cascades/optimizer.rs +++ b/optd-core/src/cascades/optimizer.rs @@ -5,7 +5,7 @@ use std::collections::{BTreeSet, HashMap, HashSet, VecDeque}; use std::fmt::Display; -use std::hash::{DefaultHasher, Hasher}; +use std::hash::{DefaultHasher, Hash, Hasher}; use std::io::Write; use std::path::Path; use std::sync::Arc; @@ -228,11 +228,43 @@ impl> CascadesOptimizer { expr_id.0, child.0, child.0 )?; } + let mut next_pred: usize = 0; + for pred_id in expr.predicates.iter() { + let pred = memo.get_pred(*pred_id); + let id = next_pred; + self.dump_dot_pred(writer, &pred, expr_id.0, &mut next_pred); + writeln!(writer, "e{} -> p{}_{};", expr_id.0, expr_id.0, id)?; + } } } writeln!(writer, "}}") } + fn dump_dot_pred( + &self, + writer: &mut dyn Write, + pred: &ArcPredNode, + base: usize, + next_pred: &mut usize, + ) -> Result<(), std::io::Error> { + let mut s = DefaultHasher::new(); + pred.typ.hash(&mut s); + let color = (s.finish() % 11) + 1; + let id = *next_pred; + *next_pred += 1; + writeln!( + writer, + "p{}_{} [shape=diamond,label=\"{:?}\",penwidth=3,color={}]", + base, id, pred.typ, color + )?; + for child in pred.children.iter() { + let child_id = *next_pred; + self.dump_dot_pred(writer, child, base, next_pred)?; + writeln!(writer, "p{}_{} -> p{}_{};", base, id, base, child_id)?; + } + Ok(()) + } + pub fn dump(&self) { for group_id in self.memo.get_all_group_ids() { let winner_str = match &self.memo.get_group_info(group_id).winner {