Skip to content

Commit 061ac03

Browse files
UBarneylongxiang.lv
authored and
longxiang.lv
committed
Implement predicate pruning for not like expressions
1 parent 9c12919 commit 061ac03

File tree

2 files changed

+184
-13
lines changed

2 files changed

+184
-13
lines changed

Diff for: datafusion/core/tests/fuzz_cases/pruning.rs

+14
Original file line numberDiff line numberDiff line change
@@ -110,13 +110,27 @@ async fn test_utf8_not_like_prefix() {
110110
.await;
111111
}
112112

113+
#[tokio::test]
114+
async fn test_utf8_not_like_ecsape() {
115+
Utf8Test::new(|value| col("a").not_like(lit(format!("\\%{}%", value))))
116+
.run()
117+
.await;
118+
}
119+
113120
#[tokio::test]
114121
async fn test_utf8_not_like_suffix() {
115122
Utf8Test::new(|value| col("a").not_like(lit(format!("{}%", value))))
116123
.run()
117124
.await;
118125
}
119126

127+
#[tokio::test]
128+
async fn test_utf8_not_like_suffix_one() {
129+
Utf8Test::new(|value| col("a").not_like(lit(format!("{}_", value))))
130+
.run()
131+
.await;
132+
}
133+
120134
/// Fuzz testing for UTF8 predicate pruning
121135
/// The basic idea is that query results should always be the same with or without stats/pruning
122136
/// If we get this right we at least guarantee that there are no incorrect results

Diff for: datafusion/physical-optimizer/src/pruning.rs

+170-13
Original file line numberDiff line numberDiff line change
@@ -1590,6 +1590,13 @@ fn build_statistics_expr(
15901590
)),
15911591
))
15921592
}
1593+
Operator::NotLikeMatch => {
1594+
build_not_like_match(expr_builder).ok_or_else(|| {
1595+
plan_datafusion_err!(
1596+
"The NOT LIKE expression with wildcards is only supported at the end of the pattern"
1597+
)
1598+
})?
1599+
}
15931600
Operator::LikeMatch => build_like_match(expr_builder).ok_or_else(|| {
15941601
plan_datafusion_err!(
15951602
"LIKE expression with wildcard at the beginning is not supported"
@@ -1638,6 +1645,19 @@ fn build_statistics_expr(
16381645
Ok(statistics_expr)
16391646
}
16401647

1648+
/// returns the string literal of the scalar value if it is a string
1649+
fn unpack_string(s: &ScalarValue) -> Option<&str> {
1650+
s.try_as_str().flatten()
1651+
}
1652+
1653+
fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&str> {
1654+
if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
1655+
let s = unpack_string(lit.value())?;
1656+
return Some(s);
1657+
}
1658+
None
1659+
}
1660+
16411661
/// Convert `column LIKE literal` where P is a constant prefix of the literal
16421662
/// to a range check on the column: `P <= column && column < P'`, where P' is the
16431663
/// lowest string after all P* strings.
@@ -1650,19 +1670,6 @@ fn build_like_match(
16501670
// column LIKE '%foo%' => min <= '' && '' <= max => true
16511671
// column LIKE 'foo' => min <= 'foo' && 'foo' <= max
16521672

1653-
/// returns the string literal of the scalar value if it is a string
1654-
fn unpack_string(s: &ScalarValue) -> Option<&str> {
1655-
s.try_as_str().flatten()
1656-
}
1657-
1658-
fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&str> {
1659-
if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
1660-
let s = unpack_string(lit.value())?;
1661-
return Some(s);
1662-
}
1663-
None
1664-
}
1665-
16661673
// TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
16671674
// this may involve building the physical expressions that call lower() and upper()
16681675
let min_column_expr = expr_builder.min_column_expr().ok()?;
@@ -1710,6 +1717,56 @@ fn build_like_match(
17101717
Some(combined)
17111718
}
17121719

1720+
// For predicate `col NOT LIKE 'foo%'`, we rewrite it as `(col_min NOT LIKE 'foo%' OR col_max NOT LIKE 'foo%')`. If both col_min and col_max have the prefix foo, we skip the entire row group (as we can be certain that all data in this row group has the prefix foo).
1721+
fn build_not_like_match(
1722+
expr_builder: &mut PruningExpressionBuilder<'_>,
1723+
) -> Option<Arc<dyn PhysicalExpr>> {
1724+
// col NOT LIKE 'prefix%' -> !(col_min LIKE 'prefix%' && col_max LIKE 'prefix%') -> (col_min NOT LIKE 'prefix%' || col_max NOT LIKE 'prefix%')
1725+
1726+
let min_column_expr = expr_builder.min_column_expr().ok()?;
1727+
let max_column_expr = expr_builder.max_column_expr().ok()?;
1728+
1729+
let scalar_expr = expr_builder.scalar_expr();
1730+
1731+
let pattern = extract_string_literal(scalar_expr)?;
1732+
1733+
let chars: Vec<char> = pattern.chars().collect();
1734+
for i in 0..chars.len() - 1 {
1735+
// Check if current char is a wildcard and is not escaped with backslash
1736+
if (chars[i] == '%' || chars[i] == '_') && (i == 0 || chars[i - 1] != '\\') {
1737+
// Example: For pattern "foo%bar", the row group might include values like
1738+
// ["foobar", "food", "foodbar"], making it unsafe to prune.
1739+
return None;
1740+
}
1741+
}
1742+
1743+
if chars.last() == Some(&'_') && (chars.len() > 1 && chars[chars.len() - 2] != '\\') {
1744+
// Example: For pattern "foo_", row groups might contain ["fooa", "fooaa", "foob"],
1745+
// which means not every row is guaranteed to match the pattern.
1746+
return None;
1747+
}
1748+
1749+
let min_col_not_like_epxr = Arc::new(phys_expr::LikeExpr::new(
1750+
true,
1751+
false,
1752+
Arc::clone(&min_column_expr),
1753+
Arc::clone(scalar_expr),
1754+
));
1755+
1756+
let max_col_not_like_expr = Arc::new(phys_expr::LikeExpr::new(
1757+
true,
1758+
false,
1759+
Arc::clone(&max_column_expr),
1760+
Arc::clone(scalar_expr),
1761+
));
1762+
1763+
Some(Arc::new(phys_expr::BinaryExpr::new(
1764+
min_col_not_like_epxr,
1765+
Operator::Or,
1766+
max_col_not_like_expr,
1767+
)))
1768+
}
1769+
17131770
/// Increment a UTF8 string by one, returning `None` if it can't be incremented.
17141771
/// This makes it so that the returned string will always compare greater than the input string
17151772
/// or any other string with the same prefix.
@@ -4061,6 +4118,106 @@ mod tests {
40614118
prune_with_expr(expr, &schema, &statistics, expected_ret);
40624119
}
40634120

4121+
/// `NOT LIKE` with a trailing `_` wildcard must never prune a container.
#[test]
fn prune_utf8_not_like_one() {
    let (schema, statistics) = utf8_setup();

    let predicate = col("s1").not_like(lit("A\u{10ffff}_"));
    #[rustfmt::skip]
    let keep = [
        // s1 ["A", "Z"] ==> some rows could pass (must keep)
        true,
        // s1 ["A", "L"] ==> some rows could pass (must keep)
        true,
        // s1 ["N", "Z"] ==> some rows could pass (must keep)
        true,
        // s1 ["M", "M"] ==> some rows could pass (must keep)
        true,
        // s1 [NULL, NULL] ==> unknown (must keep)
        true,
        // s1 ["A", NULL] ==> some rows could pass (must keep)
        true,
        // s1 ["", "A"] ==> some rows could pass (must keep)
        true,
        // s1 ["", ""] ==> some rows could pass (must keep)
        true,
        // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
        true,
        // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> both bounds match the
        // pattern, but (min, max) may have been truncated: the original bounds could be e.g.
        // ("A\u{10ffff}\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}\u{10ffff}\u{10ffff}"),
        // which do not match, so the container must be kept
        true,
    ];
    prune_with_expr(predicate, &schema, &statistics, &keep);
}
4152+
4153+
/// `NOT LIKE` pruning with the `%` wildcard: at the end of the pattern, in the
/// middle of the pattern, and preceded by an escaped literal `%`.
#[test]
fn prune_utf8_not_like_many() {
    let (schema, statistics) = utf8_setup();

    // Trailing `%`: a container is pruned only when both bounds match the pattern.
    {
        let predicate = col("s1").not_like(lit("A\u{10ffff}%"));
        #[rustfmt::skip]
        let keep = [
            // s1 ["A", "Z"] ==> some rows could pass (must keep)
            true,
            // s1 ["A", "L"] ==> some rows could pass (must keep)
            true,
            // s1 ["N", "Z"] ==> some rows could pass (must keep)
            true,
            // s1 ["M", "M"] ==> some rows could pass (must keep)
            true,
            // s1 [NULL, NULL] ==> unknown (must keep)
            true,
            // s1 ["A", NULL] ==> some rows could pass (must keep)
            true,
            // s1 ["", "A"] ==> some rows could pass (must keep)
            true,
            // s1 ["", ""] ==> some rows could pass (must keep)
            true,
            // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
            true,
            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no row match
            false,
        ];
        prune_with_expr(predicate, &schema, &statistics, &keep);
    }

    // `%` in the middle of the pattern: nothing may be pruned.
    {
        let predicate = col("s1").not_like(lit("A\u{10ffff}%\u{10ffff}"));
        #[rustfmt::skip]
        let keep = [
            // s1 ["A", "Z"] ==> some rows could pass (must keep)
            true,
            // s1 ["A", "L"] ==> some rows could pass (must keep)
            true,
            // s1 ["N", "Z"] ==> some rows could pass (must keep)
            true,
            // s1 ["M", "M"] ==> some rows could pass (must keep)
            true,
            // s1 [NULL, NULL] ==> unknown (must keep)
            true,
            // s1 ["A", NULL] ==> some rows could pass (must keep)
            true,
            // s1 ["", "A"] ==> some rows could pass (must keep)
            true,
            // s1 ["", ""] ==> some rows could pass (must keep)
            true,
            // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
            true,
            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
            true,
        ];
        prune_with_expr(predicate, &schema, &statistics, &keep);
    }

    // Escaped `%` followed by a trailing wildcard `%`: the first container
    // (bounds "A%a" / "A%c") is pruned, the second (bounds "A" / "A") is kept.
    {
        let predicate = col("s1").not_like(lit("A\\%%"));
        let escaped_stats = TestStatistics::new().with(
            "s1",
            ContainerStats::new_utf8(
                vec![Some("A%a"), Some("A")],
                vec![Some("A%c"), Some("A")],
            ),
        );
        prune_with_expr(predicate, &schema, &escaped_stats, &[false, true]);
    }
}
4220+
40644221
#[test]
40654222
fn test_rewrite_expr_to_prunable() {
40664223
let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);

0 commit comments

Comments
 (0)