Skip to content

Commit b301f8c

Browse files
committed
Implement predicate pruning for not like expressions
1 parent 9c12919 commit b301f8c

File tree

2 files changed

+186
-13
lines changed

2 files changed

+186
-13
lines changed

Diff for: datafusion/core/tests/fuzz_cases/pruning.rs

+15
Original file line numberDiff line numberDiff line change
@@ -110,13 +110,28 @@ async fn test_utf8_not_like_prefix() {
110110
.await;
111111
}
112112

113+
#[tokio::test]
114+
async fn test_utf8_not_like_ecsape() {
115+
Utf8Test::new(|value| col("a").not_like(lit(format!("\\%{}%", value))))
116+
.run()
117+
.await;
118+
}
119+
120+
113121
#[tokio::test]
114122
async fn test_utf8_not_like_suffix() {
115123
Utf8Test::new(|value| col("a").not_like(lit(format!("{}%", value))))
116124
.run()
117125
.await;
118126
}
119127

128+
#[tokio::test]
129+
async fn test_utf8_not_like_suffix_one() {
130+
Utf8Test::new(|value| col("a").not_like(lit(format!("{}_", value))))
131+
.run()
132+
.await;
133+
}
134+
120135
/// Fuzz testing for UTF8 predicate pruning
121136
/// The basic idea is that query results should always be the same with or without stats/pruning
122137
/// If we get this right we at least guarantee that there are no incorrect results

Diff for: datafusion/physical-optimizer/src/pruning.rs

+171-13
Original file line numberDiff line numberDiff line change
@@ -1590,6 +1590,13 @@ fn build_statistics_expr(
15901590
)),
15911591
))
15921592
}
1593+
Operator::NotLikeMatch => {
1594+
build_not_like_match(expr_builder).ok_or_else(|| {
1595+
plan_datafusion_err!(
1596+
"The NOT LIKE expression with wildcards is only supported at the end of the pattern"
1597+
)
1598+
})?
1599+
}
15931600
Operator::LikeMatch => build_like_match(expr_builder).ok_or_else(|| {
15941601
plan_datafusion_err!(
15951602
"LIKE expression with wildcard at the beginning is not supported"
@@ -1638,6 +1645,19 @@ fn build_statistics_expr(
16381645
Ok(statistics_expr)
16391646
}
16401647

1648+
/// returns the string literal of the scalar value if it is a string
1649+
fn unpack_string(s: &ScalarValue) -> Option<&str> {
1650+
s.try_as_str().flatten()
1651+
}
1652+
1653+
fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&str> {
1654+
if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
1655+
let s = unpack_string(lit.value())?;
1656+
return Some(s);
1657+
}
1658+
None
1659+
}
1660+
16411661
/// Convert `column LIKE literal` where P is a constant prefix of the literal
16421662
/// to a range check on the column: `P <= column && column < P'`, where P' is the
16431663
/// lowest string after all P* strings.
@@ -1650,19 +1670,6 @@ fn build_like_match(
16501670
// column LIKE '%foo%' => min <= '' && '' <= max => true
16511671
// column LIKE 'foo' => min <= 'foo' && 'foo' <= max
16521672

1653-
/// returns the string literal of the scalar value if it is a string
1654-
fn unpack_string(s: &ScalarValue) -> Option<&str> {
1655-
s.try_as_str().flatten()
1656-
}
1657-
1658-
fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&str> {
1659-
if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
1660-
let s = unpack_string(lit.value())?;
1661-
return Some(s);
1662-
}
1663-
None
1664-
}
1665-
16661673
// TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
16671674
// this may involve building the physical expressions that call lower() and upper()
16681675
let min_column_expr = expr_builder.min_column_expr().ok()?;
@@ -1710,6 +1717,57 @@ fn build_like_match(
17101717
Some(combined)
17111718
}
17121719

1720+
// col NOT LIKE 'prefix%' -> !(col_min LIKE 'prefix%' && col_max LIKE 'prefix%') -> (col_min NOT LIKE 'prefix%' || col_max NOT LIKE 'prefix%')
1721+
// If both col_min and col_max match a prefix pattern, we can prune entire row group because **ALL** values in this row group will match the pattern.
1722+
fn build_not_like_match(
1723+
expr_builder: &mut PruningExpressionBuilder<'_>,
1724+
) -> Option<Arc<dyn PhysicalExpr>> {
1725+
let min_column_expr = expr_builder.min_column_expr().ok()?;
1726+
let max_column_expr = expr_builder.max_column_expr().ok()?;
1727+
1728+
let scalar_expr = expr_builder.scalar_expr();
1729+
1730+
let pattern = extract_string_literal(scalar_expr)?;
1731+
1732+
let chars: Vec<char> = pattern.chars().collect();
1733+
for i in 0..chars.len() - 1 {
1734+
// Check if current char is a wildcard and is not escaped with backslash
1735+
if (chars[i] == '%' || chars[i] == '_') && (i == 0 || chars[i - 1] != '\\') {
1736+
// Example: For pattern "foo%bar", the row group might include values like
1737+
// ["foobar", "food", "foodbar"], making it unsafe to prune.
1738+
return None;
1739+
}
1740+
}
1741+
1742+
if chars.last() == Some(&'_') && (chars.len() > 1 && chars[chars.len() - 2] != '\\') {
1743+
// Example: For pattern "foo_", row groups might contain ["fooa", "fooaa", "foob"],
1744+
// which means not every row is guaranteed to match the pattern.
1745+
return None;
1746+
}
1747+
1748+
let min_col_not_like_epxr = Arc::new(phys_expr::LikeExpr::new(
1749+
true,
1750+
false,
1751+
Arc::clone(&min_column_expr),
1752+
scalar_expr.clone(),
1753+
));
1754+
1755+
let max_col_not_like_expr = Arc::new(phys_expr::LikeExpr::new(
1756+
true,
1757+
false,
1758+
Arc::clone(&max_column_expr),
1759+
scalar_expr.clone(),
1760+
));
1761+
1762+
Some(Arc::new(phys_expr::BinaryExpr::new(
1763+
min_col_not_like_epxr,
1764+
Operator::Or,
1765+
max_col_not_like_expr,
1766+
)))
1767+
}
1768+
1769+
1770+
17131771
/// Increment a UTF8 string by one, returning `None` if it can't be incremented.
17141772
/// This makes it so that the returned string will always compare greater than the input string
17151773
/// or any other string with the same prefix.
@@ -4061,6 +4119,106 @@ mod tests {
40614119
prune_with_expr(expr, &schema, &statistics, expected_ret);
40624120
}
40634121

4122+
/// `NOT LIKE` with a trailing `_` wildcard: every container is expected to be
/// kept.
#[test]
fn prune_utf8_not_like_one() {
    let (schema, statistics) = utf8_setup();

    let predicate = col("s1").not_like(lit("A\u{10ffff}_"));
    #[rustfmt::skip]
    let keep_flags = &[
        // s1 ["A", "Z"] ==> some rows could pass (must keep)
        true,
        // s1 ["A", "L"] ==> some rows could pass (must keep)
        true,
        // s1 ["N", "Z"] ==> some rows could pass (must keep)
        true,
        // s1 ["M", "M"] ==> some rows could pass (must keep)
        true,
        // s1 [NULL, NULL] ==> unknown (must keep)
        true,
        // s1 ["A", NULL] ==> some rows could pass (must keep)
        true,
        // s1 ["", "A"] ==> some rows could pass (must keep)
        true,
        // s1 ["", ""] ==> some rows could pass (must keep)
        true,
        // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
        true,
        // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no row matches, but
        // (min, max) may have been truncated: the original (min, max) may be
        // ("A\u{10ffff}\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}\u{10ffff}\u{10ffff}"),
        // so the container must be kept
        true,
    ];
    prune_with_expr(predicate, &schema, &statistics, keep_flags);
}
4153+
4154+
/// `NOT LIKE` with the `%` wildcard, covering three pattern shapes:
/// a trailing `%`, a `%` in the middle of the pattern, and an escaped `%`
/// followed by a trailing `%`.
#[test]
fn prune_utf8_not_like_many() {
    let (schema, statistics) = utf8_setup();

    // Case 1: constant prefix followed by a trailing `%`.
    let trailing_wildcard = col("s1").not_like(lit("A\u{10ffff}%"));
    #[rustfmt::skip]
    let trailing_expected = &[
        // s1 ["A", "Z"] ==> some rows could pass (must keep)
        true,
        // s1 ["A", "L"] ==> some rows could pass (must keep)
        true,
        // s1 ["N", "Z"] ==> some rows could pass (must keep)
        true,
        // s1 ["M", "M"] ==> some rows could pass (must keep)
        true,
        // s1 [NULL, NULL] ==> unknown (must keep)
        true,
        // s1 ["A", NULL] ==> some rows could pass (must keep)
        true,
        // s1 ["", "A"] ==> some rows could pass (must keep)
        true,
        // s1 ["", ""] ==> some rows could pass (must keep)
        true,
        // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
        true,
        // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no row match
        false,
    ];
    prune_with_expr(trailing_wildcard, &schema, &statistics, trailing_expected);

    // Case 2: `%` in the middle of the pattern.
    let inner_wildcard = col("s1").not_like(lit("A\u{10ffff}%\u{10ffff}"));
    #[rustfmt::skip]
    let inner_expected = &[
        // s1 ["A", "Z"] ==> some rows could pass (must keep)
        true,
        // s1 ["A", "L"] ==> some rows could pass (must keep)
        true,
        // s1 ["N", "Z"] ==> some rows could pass (must keep)
        true,
        // s1 ["M", "M"] ==> some rows could pass (must keep)
        true,
        // s1 [NULL, NULL] ==> unknown (must keep)
        true,
        // s1 ["A", NULL] ==> some rows could pass (must keep)
        true,
        // s1 ["", "A"] ==> some rows could pass (must keep)
        true,
        // s1 ["", ""] ==> some rows could pass (must keep)
        true,
        // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
        true,
        // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
        true,
    ];
    prune_with_expr(inner_wildcard, &schema, &statistics, inner_expected);

    // Case 3: escaped `%` followed by a trailing `%`, checked against
    // dedicated statistics for two containers.
    let escaped_wildcard = col("s1").not_like(lit("A\\%%"));
    let escaped_stats = TestStatistics::new().with(
        "s1",
        ContainerStats::new_utf8(
            vec![Some("A%a"), Some("A")],
            vec![Some("A%c"), Some("A")],
        ),
    );
    let escaped_expected = &[false, true];
    prune_with_expr(escaped_wildcard, &schema, &escaped_stats, escaped_expected);
}
4221+
40644222
#[test]
40654223
fn test_rewrite_expr_to_prunable() {
40664224
let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);

0 commit comments

Comments
 (0)