Skip to content

Commit 61cdf5a

Browse files
authored
fix(query): fold constant subquery to build filter plan instead of join plan (#17448)
* fix(query): fold constant subquery to build filter plan instead of join plan * check max_inlist_to_or * fix * fix
1 parent 67d4a16 commit 61cdf5a

File tree

7 files changed

+423
-7
lines changed

7 files changed

+423
-7
lines changed

src/query/sql/src/planner/binder/bind_mutation/mutation_expression.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,8 @@ impl MutationExpression {
281281
Arc::new(s_expr),
282282
);
283283

284-
let mut rewriter = SubqueryRewriter::new(binder.metadata.clone(), None);
284+
let mut rewriter =
285+
SubqueryRewriter::new(binder.ctx.clone(), binder.metadata.clone(), None);
285286
let s_expr = rewriter.rewrite(&s_expr)?;
286287

287288
Ok(MutationExpressionBindResult {

src/query/sql/src/planner/binder/bind_table_reference/bind_join.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -381,7 +381,8 @@ impl Binder {
381381
let mut is_lateral = false;
382382
if !right_prop.outer_columns.is_empty() {
383383
// If there are outer columns in right child, then the join is a correlated lateral join
384-
let mut decorrelator = SubqueryRewriter::new(self.metadata.clone(), Some(self.clone()));
384+
let mut decorrelator =
385+
SubqueryRewriter::new(self.ctx.clone(), self.metadata.clone(), Some(self.clone()));
385386
right_child = decorrelator.flatten_plan(
386387
&right_child,
387388
&right_prop.outer_columns,

src/query/sql/src/planner/optimizer/decorrelate/decorrelate.rs

Lines changed: 301 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,21 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15+
use std::collections::BTreeSet;
1516
use std::collections::HashSet;
1617
use std::sync::Arc;
1718

1819
use databend_common_ast::Span;
20+
use databend_common_catalog::table_context::TableContext;
21+
use databend_common_exception::ErrorCode;
1922
use databend_common_exception::Result;
23+
use databend_common_expression::type_check::common_super_type;
2024
use databend_common_expression::types::DataType;
25+
use databend_common_expression::types::NumberScalar;
26+
use databend_common_expression::ColumnBuilder;
27+
use databend_common_expression::Scalar;
28+
use databend_common_expression::ScalarRef;
29+
use databend_common_functions::BUILTIN_FUNCTIONS;
2130

2231
use crate::binder::ColumnBindingBuilder;
2332
use crate::binder::JoinPredicate;
@@ -30,12 +39,16 @@ use crate::optimizer::ColumnSet;
3039
use crate::optimizer::RelExpr;
3140
use crate::optimizer::SExpr;
3241
use crate::plans::BoundColumnRef;
42+
use crate::plans::CastExpr;
43+
use crate::plans::ComparisonOp;
44+
use crate::plans::ConstantExpr;
3345
use crate::plans::Filter;
3446
use crate::plans::FunctionCall;
3547
use crate::plans::Join;
3648
use crate::plans::JoinEquiCondition;
3749
use crate::plans::JoinType;
3850
use crate::plans::RelOp;
51+
use crate::plans::RelOperator;
3952
use crate::plans::ScalarExpr;
4053
use crate::plans::SubqueryExpr;
4154
use crate::plans::SubqueryType;
@@ -53,8 +66,12 @@ use crate::MetadataRef;
5366
/// Correlated exists subquery -> Marker join
5467
///
5568
/// More information can be found in the paper: Unnesting Arbitrary Queries
56-
pub fn decorrelate_subquery(metadata: MetadataRef, s_expr: SExpr) -> Result<SExpr> {
57-
let mut rewriter = SubqueryRewriter::new(metadata, None);
69+
pub fn decorrelate_subquery(
70+
ctx: Arc<dyn TableContext>,
71+
metadata: MetadataRef,
72+
s_expr: SExpr,
73+
) -> Result<SExpr> {
74+
let mut rewriter = SubqueryRewriter::new(ctx, metadata, None);
5875
rewriter.rewrite(&s_expr)
5976
}
6077

@@ -517,4 +534,286 @@ impl SubqueryRewriter {
517534
true
518535
}))
519536
}
537+
538+
// Try folding the subquery into a constant value expression,
539+
// which turns the join plan into a filter plan, so that the bloom filter
540+
// can be used to reduce the amount of data that needs to be read.
541+
pub fn try_fold_constant_subquery(
542+
&self,
543+
subquery: &SubqueryExpr,
544+
) -> Result<Option<ScalarExpr>> {
545+
// (1) EvalScalar
546+
// \
547+
// DummyTableScan
548+
//
549+
// (2) EvalScalar
550+
// \
551+
// EvalScalar
552+
// \
553+
// ProjectSet
554+
// \
555+
// DummyTableScan
556+
let matchers = vec![
557+
Matcher::MatchOp {
558+
op_type: RelOp::EvalScalar,
559+
children: vec![Matcher::MatchOp {
560+
op_type: RelOp::DummyTableScan,
561+
children: vec![],
562+
}],
563+
},
564+
Matcher::MatchOp {
565+
op_type: RelOp::EvalScalar,
566+
children: vec![Matcher::MatchOp {
567+
op_type: RelOp::EvalScalar,
568+
children: vec![Matcher::MatchOp {
569+
op_type: RelOp::ProjectSet,
570+
children: vec![Matcher::MatchOp {
571+
op_type: RelOp::DummyTableScan,
572+
children: vec![],
573+
}],
574+
}],
575+
}],
576+
},
577+
];
578+
579+
let mut matched = false;
580+
for matcher in matchers {
581+
if matcher.matches(&subquery.subquery) {
582+
matched = true;
583+
break;
584+
}
585+
}
586+
if !matched {
587+
return Ok(None);
588+
}
589+
590+
let child = subquery.subquery.child(0)?;
591+
if let RelOperator::DummyTableScan(_) = child.plan() {
592+
// subquery is a simple constant value.
593+
// for example: `SELECT * FROM t WHERE id = (select 1);`
594+
if let RelOperator::EvalScalar(eval) = subquery.subquery.plan() {
595+
if eval.items.len() != 1 {
596+
return Ok(None);
597+
}
598+
let Ok(const_scalar) = ConstantExpr::try_from(eval.items[0].scalar.clone()) else {
599+
return Ok(None);
600+
};
601+
match (&subquery.child_expr, subquery.compare_op) {
602+
(Some(child_expr), Some(compare_op)) => {
603+
let func_name = compare_op.to_func_name().to_string();
604+
let func = ScalarExpr::FunctionCall(FunctionCall {
605+
span: subquery.span,
606+
func_name,
607+
params: vec![],
608+
arguments: vec![*child_expr.clone(), const_scalar.into()],
609+
});
610+
return Ok(Some(func));
611+
}
612+
(None, None) => match subquery.typ {
613+
SubqueryType::Scalar => {
614+
return Ok(Some(const_scalar.into()));
615+
}
616+
SubqueryType::Exists => {
617+
return Ok(Some(ScalarExpr::ConstantExpr(ConstantExpr {
618+
span: subquery.span,
619+
value: Scalar::Boolean(true),
620+
})));
621+
}
622+
SubqueryType::NotExists => {
623+
return Ok(Some(ScalarExpr::ConstantExpr(ConstantExpr {
624+
span: subquery.span,
625+
value: Scalar::Boolean(false),
626+
})));
627+
}
628+
_ => {}
629+
},
630+
(_, _) => {}
631+
}
632+
}
633+
} else {
634+
// subquery is a set returning function return constant values.
635+
// for example: `SELECT * FROM t WHERE id IN (SELECT * FROM UNNEST(SPLIT('1,2,3', ',')) AS t1);`
636+
let mut output_column_index = None;
637+
if let RelOperator::EvalScalar(eval) = subquery.subquery.plan() {
638+
if eval.items.len() != 1 {
639+
return Ok(None);
640+
}
641+
if let ScalarExpr::BoundColumnRef(bound_column) = &eval.items[0].scalar {
642+
output_column_index = Some(bound_column.column.index);
643+
}
644+
}
645+
if output_column_index.is_none() {
646+
return Ok(None);
647+
}
648+
let output_column_index = output_column_index.unwrap();
649+
650+
let mut srf_column_index = None;
651+
if let RelOperator::EvalScalar(eval) = child.plan() {
652+
if eval.items.len() != 1 || eval.items[0].index != output_column_index {
653+
return Ok(None);
654+
}
655+
if let ScalarExpr::FunctionCall(get_func) = &eval.items[0].scalar {
656+
if get_func.func_name == "get"
657+
&& get_func.arguments.len() == 1
658+
&& get_func.params.len() == 1
659+
&& get_func.params[0] == Scalar::Number(NumberScalar::Int64(1))
660+
{
661+
if let ScalarExpr::BoundColumnRef(bound_column) = &get_func.arguments[0] {
662+
srf_column_index = Some(bound_column.column.index);
663+
}
664+
}
665+
}
666+
}
667+
if srf_column_index.is_none() {
668+
return Ok(None);
669+
}
670+
let srf_column_index = srf_column_index.unwrap();
671+
672+
let project_set_expr = child.child(0)?;
673+
if let RelOperator::ProjectSet(project_set) = project_set_expr.plan() {
674+
if project_set.srfs.len() != 1
675+
|| project_set.srfs[0].index != srf_column_index
676+
|| subquery.compare_op != Some(ComparisonOp::Equal)
677+
|| subquery.typ != SubqueryType::Any
678+
{
679+
return Ok(None);
680+
}
681+
let Ok(srf) = FunctionCall::try_from(project_set.srfs[0].scalar.clone()) else {
682+
return Ok(None);
683+
};
684+
if srf.arguments.len() != 1 {
685+
return Ok(None);
686+
}
687+
let Ok(const_scalar) = ConstantExpr::try_from(srf.arguments[0].clone()) else {
688+
return Ok(None);
689+
};
690+
let Some(child_expr) = &subquery.child_expr else {
691+
return Ok(None);
692+
};
693+
match &const_scalar.value {
694+
Scalar::EmptyArray => {
695+
return Ok(Some(ScalarExpr::ConstantExpr(ConstantExpr {
696+
span: subquery.span,
697+
value: Scalar::Null,
698+
})));
699+
}
700+
Scalar::Array(array_column) => {
701+
let mut values = BTreeSet::new();
702+
for scalar in array_column.iter() {
703+
// Ignoring NULL values in equivalent filter
704+
if scalar == ScalarRef::Null {
705+
continue;
706+
}
707+
values.insert(scalar.to_owned());
708+
}
709+
// If there are no equivalent values, the filter condition does not match,
710+
// return a NULL value.
711+
if values.is_empty() {
712+
return Ok(Some(ScalarExpr::ConstantExpr(ConstantExpr {
713+
span: subquery.span,
714+
value: Scalar::Null,
715+
})));
716+
}
717+
// If the number of values more than `inlist_to_join_threshold`, need convert to join.
718+
if values.len() >= self.ctx.get_settings().get_inlist_to_join_threshold()? {
719+
return Ok(None);
720+
}
721+
// If the number of values more than `max_inlist_to_or`, use contains function instead of or.
722+
if values.len() > self.ctx.get_settings().get_max_inlist_to_or()? as usize {
723+
let value_type = values.first().unwrap().as_ref().infer_data_type();
724+
let mut builder =
725+
ColumnBuilder::with_capacity(&value_type, values.len());
726+
for value in values.into_iter() {
727+
builder.push(value.as_ref());
728+
}
729+
let array_value = ScalarExpr::ConstantExpr(ConstantExpr {
730+
span: subquery.span,
731+
value: Scalar::Array(builder.build()),
732+
});
733+
734+
let expr_type = child_expr.data_type()?;
735+
let common_type = common_super_type(
736+
value_type.clone(),
737+
expr_type.clone(),
738+
&BUILTIN_FUNCTIONS.default_cast_rules,
739+
)
740+
.ok_or_else(|| {
741+
ErrorCode::IllegalDataType(format!(
742+
"Cannot find common type for inlist subquery value {:?} and expr {:?}",
743+
&array_value, &child_expr
744+
))
745+
})?;
746+
747+
let mut arguments = Vec::with_capacity(2);
748+
if value_type != common_type {
749+
arguments.push(ScalarExpr::CastExpr(CastExpr {
750+
span: subquery.span,
751+
is_try: false,
752+
argument: Box::new(array_value),
753+
target_type: Box::new(DataType::Array(Box::new(
754+
common_type.clone(),
755+
))),
756+
}));
757+
} else {
758+
arguments.push(array_value);
759+
}
760+
if expr_type != common_type {
761+
arguments.push(ScalarExpr::CastExpr(CastExpr {
762+
span: subquery.span,
763+
is_try: false,
764+
argument: Box::new(*child_expr.clone()),
765+
target_type: Box::new(common_type.clone()),
766+
}));
767+
} else {
768+
arguments.push(*child_expr.clone());
769+
}
770+
let func = ScalarExpr::FunctionCall(FunctionCall {
771+
span: subquery.span,
772+
func_name: "contains".to_string(),
773+
params: vec![],
774+
arguments,
775+
});
776+
return Ok(Some(func));
777+
}
778+
779+
let mut funcs = Vec::with_capacity(values.len());
780+
for value in values.into_iter() {
781+
let scalar_value = ScalarExpr::ConstantExpr(ConstantExpr {
782+
span: subquery.span,
783+
value,
784+
});
785+
let func = ScalarExpr::FunctionCall(FunctionCall {
786+
span: subquery.span,
787+
func_name: "eq".to_string(),
788+
params: vec![],
789+
arguments: vec![*child_expr.clone(), scalar_value],
790+
});
791+
funcs.push(func);
792+
}
793+
let or_func = funcs
794+
.into_iter()
795+
.fold(None, |mut acc, func| {
796+
match acc.as_mut() {
797+
None => acc = Some(func),
798+
Some(acc) => {
799+
*acc = ScalarExpr::FunctionCall(FunctionCall {
800+
span: subquery.span,
801+
func_name: "or".to_string(),
802+
params: vec![],
803+
arguments: vec![acc.clone(), func],
804+
});
805+
}
806+
}
807+
acc
808+
})
809+
.unwrap();
810+
return Ok(Some(or_func));
811+
}
812+
_ => {}
813+
}
814+
}
815+
}
816+
817+
Ok(None)
818+
}
520819
}

src/query/sql/src/planner/optimizer/decorrelate/subquery_rewriter.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ use std::collections::HashMap;
1616
use std::sync::Arc;
1717
use std::vec;
1818

19+
use databend_common_catalog::table_context::TableContext;
1920
use databend_common_exception::ErrorCode;
2021
use databend_common_exception::Result;
2122
use databend_common_expression::types::DataType;
@@ -69,14 +70,16 @@ pub struct FlattenInfo {
6970

7071
/// Rewrite subquery into `Apply` operator
7172
pub struct SubqueryRewriter {
73+
/// Query context; `try_fold_constant_subquery` reads session settings through it.
pub(crate) ctx: Arc<dyn TableContext>,
7274
/// Metadata handle passed in by the caller of `new`.
pub(crate) metadata: MetadataRef,
7375
/// Index-to-index mapping; starts out empty in `new`.
// NOTE(review): presumably maps outer column indexes to their derived copies - confirm.
pub(crate) derived_columns: HashMap<IndexType, IndexType>,
7476
/// Optional binder; callers pass `None` or a clone of the current `Binder`.
pub(crate) binder: Option<Binder>,
7577
}
7678

7779
impl SubqueryRewriter {
78-
pub fn new(metadata: MetadataRef, binder: Option<Binder>) -> Self {
80+
pub fn new(ctx: Arc<dyn TableContext>, metadata: MetadataRef, binder: Option<Binder>) -> Self {
7981
Self {
82+
ctx,
8083
metadata,
8184
derived_columns: Default::default(),
8285
binder,
@@ -254,6 +257,10 @@ impl SubqueryRewriter {
254257
let mut subquery = subquery.clone();
255258
subquery.subquery = Box::new(self.rewrite(&subquery.subquery)?);
256259

260+
if let Some(constant_subquery) = self.try_fold_constant_subquery(&subquery)? {
261+
return Ok((constant_subquery, s_expr.clone()));
262+
}
263+
257264
// Check if the subquery is a correlated subquery.
258265
// If it is, we'll try to flatten it and rewrite to join.
259266
// If it is not, we'll just rewrite it to join

0 commit comments

Comments
 (0)