Skip to content

Commit 2ec4dc3

Browse files
UBarneylongxiang.lv
authored and
longxiang.lv
committed
Implement predicate pruning for not like expressions
1 parent 9c12919 commit 2ec4dc3

File tree

2 files changed

+214
-13
lines changed

2 files changed

+214
-13
lines changed

Diff for: datafusion/core/tests/fuzz_cases/pruning.rs

+14
Original file line numberDiff line numberDiff line change
@@ -110,13 +110,27 @@ async fn test_utf8_not_like_prefix() {
110110
.await;
111111
}
112112

113+
#[tokio::test]
114+
async fn test_utf8_not_like_ecsape() {
115+
Utf8Test::new(|value| col("a").not_like(lit(format!("\\%{}%", value))))
116+
.run()
117+
.await;
118+
}
119+
113120
#[tokio::test]
114121
async fn test_utf8_not_like_suffix() {
115122
Utf8Test::new(|value| col("a").not_like(lit(format!("{}%", value))))
116123
.run()
117124
.await;
118125
}
119126

127+
#[tokio::test]
128+
async fn test_utf8_not_like_suffix_one() {
129+
Utf8Test::new(|value| col("a").not_like(lit(format!("{}_", value))))
130+
.run()
131+
.await;
132+
}
133+
120134
/// Fuzz testing for UTF8 predicate pruning
121135
/// The basic idea is that query results should always be the same with or without stats/pruning
122136
/// If we get this right we at least guarantee that there are no incorrect results

Diff for: datafusion/physical-optimizer/src/pruning.rs

+200-13
Original file line numberDiff line numberDiff line change
@@ -1590,6 +1590,7 @@ fn build_statistics_expr(
15901590
)),
15911591
))
15921592
}
1593+
Operator::NotLikeMatch => build_not_like_match(expr_builder)?,
15931594
Operator::LikeMatch => build_like_match(expr_builder).ok_or_else(|| {
15941595
plan_datafusion_err!(
15951596
"LIKE expression with wildcard at the beginning is not supported"
@@ -1638,6 +1639,19 @@ fn build_statistics_expr(
16381639
Ok(statistics_expr)
16391640
}
16401641

1642+
/// returns the string literal of the scalar value if it is a string
1643+
fn unpack_string(s: &ScalarValue) -> Option<&str> {
1644+
s.try_as_str().flatten()
1645+
}
1646+
1647+
fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&str> {
1648+
if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
1649+
let s = unpack_string(lit.value())?;
1650+
return Some(s);
1651+
}
1652+
None
1653+
}
1654+
16411655
/// Convert `column LIKE literal` where P is a constant prefix of the literal
16421656
/// to a range check on the column: `P <= column && column < P'`, where P' is the
16431657
/// lowest string after all P* strings.
@@ -1650,19 +1664,6 @@ fn build_like_match(
16501664
// column LIKE '%foo%' => min <= '' && '' <= max => true
16511665
// column LIKE 'foo' => min <= 'foo' && 'foo' <= max
16521666

1653-
/// returns the string literal of the scalar value if it is a string
1654-
fn unpack_string(s: &ScalarValue) -> Option<&str> {
1655-
s.try_as_str().flatten()
1656-
}
1657-
1658-
fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&str> {
1659-
if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
1660-
let s = unpack_string(lit.value())?;
1661-
return Some(s);
1662-
}
1663-
None
1664-
}
1665-
16661667
// TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
16671668
// this may involve building the physical expressions that call lower() and upper()
16681669
let min_column_expr = expr_builder.min_column_expr().ok()?;
@@ -1710,6 +1711,66 @@ fn build_like_match(
17101711
Some(combined)
17111712
}
17121713

1714+
// For predicate `col NOT LIKE 'foo%'`, we rewrite it as `(col_min NOT LIKE 'foo%' OR col_max NOT LIKE 'foo%')`. If both col_min and col_max have the prefix foo, we skip the entire row group (as we can be certain that all data in this row group has the prefix foo).
1715+
fn build_not_like_match(
1716+
expr_builder: &mut PruningExpressionBuilder<'_>,
1717+
) -> Result<Arc<dyn PhysicalExpr>> {
1718+
// col NOT LIKE 'prefix%' -> !(col_min LIKE 'prefix%' && col_max LIKE 'prefix%') -> (col_min NOT LIKE 'prefix%' || col_max NOT LIKE 'prefix%')
1719+
1720+
let min_column_expr = expr_builder.min_column_expr()?;
1721+
let max_column_expr = expr_builder.max_column_expr()?;
1722+
1723+
let scalar_expr = expr_builder.scalar_expr();
1724+
1725+
let pattern = extract_string_literal(scalar_expr).ok_or_else(|| {
1726+
plan_datafusion_err!("cannot extract literal from NOT LIKE expression")
1727+
})?;
1728+
1729+
let chars: Vec<char> = pattern.chars().collect();
1730+
for i in 0..chars.len() - 1 {
1731+
// Check if current char is a wildcard and is not escaped with backslash
1732+
if (chars[i] == '%' || chars[i] == '_') && (i == 0 || chars[i - 1] != '\\') {
1733+
// Example: For pattern "foo%bar", the row group might include values like
1734+
// ["foobar", "food", "foodbar"], making it unsafe to prune.
1735+
// Even if the min/max values in the group (e.g., "foobar" and "foodbar")
1736+
// match the pattern, intermediate values like "food" may not
1737+
// match the full pattern "foo%bar", making pruning unsafe.
1738+
// (truncate foo%bar to foo% have same problem)
1739+
return Err(plan_datafusion_err!(
1740+
"NOT LIKE expressions with unescaped wildcards ('%' or '_') at the beginning or middle of the pattern are not supported"
1741+
));
1742+
}
1743+
}
1744+
1745+
if chars.last() == Some(&'_') && (chars.len() > 1 && chars[chars.len() - 2] != '\\') {
1746+
// Example: For pattern "foo_", row groups might contain ["fooa", "fooaa", "foob"],
1747+
// which means not every row is guaranteed to match the pattern.
1748+
return Err(plan_datafusion_err!(
1749+
"NOT LIKE expressions with unescaped '_' at the end of the pattern are not supported"
1750+
));
1751+
}
1752+
1753+
let min_col_not_like_epxr = Arc::new(phys_expr::LikeExpr::new(
1754+
true,
1755+
false,
1756+
Arc::clone(&min_column_expr),
1757+
Arc::clone(scalar_expr),
1758+
));
1759+
1760+
let max_col_not_like_expr = Arc::new(phys_expr::LikeExpr::new(
1761+
true,
1762+
false,
1763+
Arc::clone(&max_column_expr),
1764+
Arc::clone(scalar_expr),
1765+
));
1766+
1767+
Ok(Arc::new(phys_expr::BinaryExpr::new(
1768+
min_col_not_like_epxr,
1769+
Operator::Or,
1770+
max_col_not_like_expr,
1771+
)))
1772+
}
1773+
17131774
/// Increment a UTF8 string by one, returning `None` if it can't be incremented.
17141775
/// This makes it so that the returned string will always compare greater than the input string
17151776
/// or any other string with the same prefix.
@@ -4061,6 +4122,132 @@ mod tests {
40614122
prune_with_expr(expr, &schema, &statistics, expected_ret);
40624123
}
40634124

4125+
/// `NOT LIKE` with a trailing unescaped `_`: `build_not_like_match` rejects
/// this pattern shape, and no container is expected to be pruned.
#[test]
fn prune_utf8_not_like_one() {
    let (schema, statistics) = utf8_setup();

    let predicate = col("s1").not_like(lit("A\u{10ffff}_"));
    // One entry per container, `true` meaning "must keep".
    #[rustfmt::skip]
    let keep_container = &[
        true, // s1 ["A", "Z"]: some rows could pass
        true, // s1 ["A", "L"]: some rows could pass
        true, // s1 ["N", "Z"]: some rows could pass
        true, // s1 ["M", "M"]: some rows could pass
        true, // s1 [NULL, NULL]: unknown
        true, // s1 ["A", NULL]: some rows could pass
        true, // s1 ["", "A"]: some rows could pass
        true, // s1 ["", ""]: some rows could pass
        true, // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"]: some rows could pass
        // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"]: both bounds match the
        // pattern, but the statistics may be truncated. The original (min, max) could be
        // ("A\u{10ffff}\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}\u{10ffff}\u{10ffff}"),
        // so the container is still kept.
        true,
    ];
    prune_with_expr(predicate, &schema, &statistics, keep_container);
}
4156+
4157+
/// `NOT LIKE` pruning across four pattern shapes: trailing `%`, `%` in the
/// middle, no wildcard at all, and an escaped `%` followed by a trailing `%`.
/// In every expectation array `true` means "must keep" and `false` means
/// "can be pruned".
#[test]
fn prune_utf8_not_like_many() {
    let (schema, statistics) = utf8_setup();

    // Case 1: constant prefix followed by a trailing `%`.
    let prefix_predicate = col("s1").not_like(lit("A\u{10ffff}%"));
    #[rustfmt::skip]
    let prefix_expected = &[
        true,  // s1 ["A", "Z"]: some rows could pass
        true,  // s1 ["A", "L"]: some rows could pass
        true,  // s1 ["N", "Z"]: some rows could pass
        true,  // s1 ["M", "M"]: some rows could pass
        true,  // s1 [NULL, NULL]: unknown
        true,  // s1 ["A", NULL]: some rows could pass
        true,  // s1 ["", "A"]: some rows could pass
        true,  // s1 ["", ""]: some rows could pass
        true,  // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"]: some rows could pass
        false, // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"]: no row can pass
    ];
    prune_with_expr(prefix_predicate, &schema, &statistics, prefix_expected);

    // Case 2: `%` in the middle of the pattern; nothing is pruned.
    let inner_wildcard_predicate = col("s1").not_like(lit("A\u{10ffff}%\u{10ffff}"));
    #[rustfmt::skip]
    let inner_wildcard_expected = &[
        true, // s1 ["A", "Z"]: some rows could pass
        true, // s1 ["A", "L"]: some rows could pass
        true, // s1 ["N", "Z"]: some rows could pass
        true, // s1 ["M", "M"]: some rows could pass
        true, // s1 [NULL, NULL]: unknown
        true, // s1 ["A", NULL]: some rows could pass
        true, // s1 ["", "A"]: some rows could pass
        true, // s1 ["", ""]: some rows could pass
        true, // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"]: some rows could pass
        true, // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"]: some rows could pass
    ];
    prune_with_expr(
        inner_wildcard_predicate,
        &schema,
        &statistics,
        inner_wildcard_expected,
    );

    // Case 3: pattern without any wildcard.
    let exact_predicate = col("s1").not_like(lit("M"));
    #[rustfmt::skip]
    let exact_expected = &[
        true,  // s1 ["A", "Z"]: some rows could pass
        true,  // s1 ["A", "L"]: some rows could pass
        true,  // s1 ["N", "Z"]: some rows could pass
        false, // s1 ["M", "M"]: no row can pass
        true,  // s1 [NULL, NULL]: unknown
        true,  // s1 ["A", NULL]: some rows could pass
        true,  // s1 ["", "A"]: some rows could pass
        true,  // s1 ["", ""]: some rows could pass
        true,  // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"]: some rows could pass
        true,  // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"]: some rows could pass
    ];
    prune_with_expr(exact_predicate, &schema, &statistics, exact_expected);

    // Case 4: escaped `%` followed by a trailing `%`, checked against
    // dedicated statistics for two containers: ["A%a", "A%c"] and ["A", "A"].
    let escaped_predicate = col("s1").not_like(lit("A\\%%"));
    let escaped_statistics = TestStatistics::new().with(
        "s1",
        ContainerStats::new_utf8(
            vec![Some("A%a"), Some("A")],
            vec![Some("A%c"), Some("A")],
        ),
    );
    let escaped_expected = &[false, true];
    prune_with_expr(
        escaped_predicate,
        &schema,
        &escaped_statistics,
        escaped_expected,
    );
}
4250+
40644251
#[test]
40654252
fn test_rewrite_expr_to_prunable() {
40664253
let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);

0 commit comments

Comments
 (0)