@@ -1590,6 +1590,7 @@ fn build_statistics_expr(
1590
1590
) ) ,
1591
1591
) )
1592
1592
}
1593
+ Operator :: NotLikeMatch => build_not_like_match ( expr_builder) ?,
1593
1594
Operator :: LikeMatch => build_like_match ( expr_builder) . ok_or_else ( || {
1594
1595
plan_datafusion_err ! (
1595
1596
"LIKE expression with wildcard at the beginning is not supported"
@@ -1638,6 +1639,19 @@ fn build_statistics_expr(
1638
1639
Ok ( statistics_expr)
1639
1640
}
1640
1641
1642
+ /// returns the string literal of the scalar value if it is a string
1643
+ fn unpack_string ( s : & ScalarValue ) -> Option < & str > {
1644
+ s. try_as_str ( ) . flatten ( )
1645
+ }
1646
+
1647
+ fn extract_string_literal ( expr : & Arc < dyn PhysicalExpr > ) -> Option < & str > {
1648
+ if let Some ( lit) = expr. as_any ( ) . downcast_ref :: < phys_expr:: Literal > ( ) {
1649
+ let s = unpack_string ( lit. value ( ) ) ?;
1650
+ return Some ( s) ;
1651
+ }
1652
+ None
1653
+ }
1654
+
1641
1655
/// Convert `column LIKE literal` where P is a constant prefix of the literal
1642
1656
/// to a range check on the column: `P <= column && column < P'`, where P' is the
1643
1657
/// lowest string after all P* strings.
@@ -1650,19 +1664,6 @@ fn build_like_match(
1650
1664
// column LIKE '%foo%' => min <= '' && '' <= max => true
1651
1665
// column LIKE 'foo' => min <= 'foo' && 'foo' <= max
1652
1666
1653
- /// returns the string literal of the scalar value if it is a string
1654
- fn unpack_string ( s : & ScalarValue ) -> Option < & str > {
1655
- s. try_as_str ( ) . flatten ( )
1656
- }
1657
-
1658
- fn extract_string_literal ( expr : & Arc < dyn PhysicalExpr > ) -> Option < & str > {
1659
- if let Some ( lit) = expr. as_any ( ) . downcast_ref :: < phys_expr:: Literal > ( ) {
1660
- let s = unpack_string ( lit. value ( ) ) ?;
1661
- return Some ( s) ;
1662
- }
1663
- None
1664
- }
1665
-
1666
1667
// TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
1667
1668
// this may involve building the physical expressions that call lower() and upper()
1668
1669
let min_column_expr = expr_builder. min_column_expr ( ) . ok ( ) ?;
@@ -1710,6 +1711,66 @@ fn build_like_match(
1710
1711
Some ( combined)
1711
1712
}
1712
1713
1714
+ // For predicate `col NOT LIKE 'foo%'`, we rewrite it as `(col_min NOT LIKE 'foo%' OR col_max NOT LIKE 'foo%')`. If both col_min and col_max have the prefix foo, we skip the entire row group (as we can be certain that all data in this row group has the prefix foo).
1715
+ fn build_not_like_match (
1716
+ expr_builder : & mut PruningExpressionBuilder < ' _ > ,
1717
+ ) -> Result < Arc < dyn PhysicalExpr > > {
1718
+ // col NOT LIKE 'prefix%' -> !(col_min LIKE 'prefix%' && col_max LIKE 'prefix%') -> (col_min NOT LIKE 'prefix%' || col_max NOT LIKE 'prefix%')
1719
+
1720
+ let min_column_expr = expr_builder. min_column_expr ( ) ?;
1721
+ let max_column_expr = expr_builder. max_column_expr ( ) ?;
1722
+
1723
+ let scalar_expr = expr_builder. scalar_expr ( ) ;
1724
+
1725
+ let pattern = extract_string_literal ( scalar_expr) . ok_or_else ( || {
1726
+ plan_datafusion_err ! ( "cannot extract literal from NOT LIKE expression" )
1727
+ } ) ?;
1728
+
1729
+ let chars: Vec < char > = pattern. chars ( ) . collect ( ) ;
1730
+ for i in 0 ..chars. len ( ) - 1 {
1731
+ // Check if current char is a wildcard and is not escaped with backslash
1732
+ if ( chars[ i] == '%' || chars[ i] == '_' ) && ( i == 0 || chars[ i - 1 ] != '\\' ) {
1733
+ // Example: For pattern "foo%bar", the row group might include values like
1734
+ // ["foobar", "food", "foodbar"], making it unsafe to prune.
1735
+ // Even if the min/max values in the group (e.g., "foobar" and "foodbar")
1736
+ // match the pattern, intermediate values like "food" may not
1737
+ // match the full pattern "foo%bar", making pruning unsafe.
1738
+ // (truncate foo%bar to foo% have same problem)
1739
+ return Err ( plan_datafusion_err ! (
1740
+ "NOT LIKE expressions with unescaped wildcards ('%' or '_') at the beginning or middle of the pattern are not supported"
1741
+ ) ) ;
1742
+ }
1743
+ }
1744
+
1745
+ if chars. last ( ) == Some ( & '_' ) && ( chars. len ( ) > 1 && chars[ chars. len ( ) - 2 ] != '\\' ) {
1746
+ // Example: For pattern "foo_", row groups might contain ["fooa", "fooaa", "foob"],
1747
+ // which means not every row is guaranteed to match the pattern.
1748
+ return Err ( plan_datafusion_err ! (
1749
+ "NOT LIKE expressions with unescaped '_' at the end of the pattern are not supported"
1750
+ ) ) ;
1751
+ }
1752
+
1753
+ let min_col_not_like_epxr = Arc :: new ( phys_expr:: LikeExpr :: new (
1754
+ true ,
1755
+ false ,
1756
+ Arc :: clone ( & min_column_expr) ,
1757
+ Arc :: clone ( scalar_expr) ,
1758
+ ) ) ;
1759
+
1760
+ let max_col_not_like_expr = Arc :: new ( phys_expr:: LikeExpr :: new (
1761
+ true ,
1762
+ false ,
1763
+ Arc :: clone ( & max_column_expr) ,
1764
+ Arc :: clone ( scalar_expr) ,
1765
+ ) ) ;
1766
+
1767
+ Ok ( Arc :: new ( phys_expr:: BinaryExpr :: new (
1768
+ min_col_not_like_epxr,
1769
+ Operator :: Or ,
1770
+ max_col_not_like_expr,
1771
+ ) ) )
1772
+ }
1773
+
1713
1774
/// Increment a UTF8 string by one, returning `None` if it can't be incremented.
1714
1775
/// This makes it so that the returned string will always compare greater than the input string
1715
1776
/// or any other string with the same prefix.
@@ -4061,6 +4122,132 @@ mod tests {
4061
4122
prune_with_expr ( expr, & schema, & statistics, expected_ret) ;
4062
4123
}
4063
4124
4125
/// `NOT LIKE` with a trailing single-character wildcard (`_`) is not
/// supported for pruning, so every container must be kept.
#[test]
fn prune_utf8_not_like_one() {
    let (schema, statistics) = utf8_setup();

    let expr = col("s1").not_like(lit("A\u{10ffff}_"));
    #[rustfmt::skip]
    let expected_ret = &[
        // s1 ["A", "Z"] ==> some rows could pass (must keep)
        true,
        // s1 ["A", "L"] ==> some rows could pass (must keep)
        true,
        // s1 ["N", "Z"] ==> some rows could pass (must keep)
        true,
        // s1 ["M", "M"] ==> some rows could pass (must keep)
        true,
        // s1 [NULL, NULL] ==> unknown (must keep)
        true,
        // s1 ["A", NULL] ==> some rows could pass (must keep)
        true,
        // s1 ["", "A"] ==> some rows could pass (must keep)
        true,
        // s1 ["", ""] ==> some rows could pass (must keep)
        true,
        // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
        true,
        // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> min and max both match the
        // pattern, but the statistics may have been truncated: the original (min, max) could be
        // ("A\u{10ffff}\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}\u{10ffff}\u{10ffff}"),
        // which do not match, so the container must be kept
        true,
    ];
    prune_with_expr(expr, &schema, &statistics, expected_ret);
}
4156
+
4157
/// Exercises `NOT LIKE` pruning for a trailing `%` wildcard, a `%` in the
/// middle of the pattern, a pattern without wildcards, and an escaped `%`
/// followed by a trailing wildcard.
#[test]
fn prune_utf8_not_like_many() {
    let (schema, statistics) = utf8_setup();

    // Constant prefix followed by a trailing `%`: a container is pruned only
    // when both min and max carry the prefix.
    let expr = col("s1").not_like(lit("A\u{10ffff}%"));
    #[rustfmt::skip]
    let expected_ret = &[
        // s1 ["A", "Z"] ==> some rows could pass (must keep)
        true,
        // s1 ["A", "L"] ==> some rows could pass (must keep)
        true,
        // s1 ["N", "Z"] ==> some rows could pass (must keep)
        true,
        // s1 ["M", "M"] ==> some rows could pass (must keep)
        true,
        // s1 [NULL, NULL] ==> unknown (must keep)
        true,
        // s1 ["A", NULL] ==> some rows could pass (must keep)
        true,
        // s1 ["", "A"] ==> some rows could pass (must keep)
        true,
        // s1 ["", ""] ==> some rows could pass (must keep)
        true,
        // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
        true,
        // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no row match
        false,
    ];
    prune_with_expr(expr, &schema, &statistics, expected_ret);

    // `%` in the middle of the pattern is not supported: nothing is pruned.
    let expr = col("s1").not_like(lit("A\u{10ffff}%\u{10ffff}"));
    #[rustfmt::skip]
    let expected_ret = &[
        // s1 ["A", "Z"] ==> some rows could pass (must keep)
        true,
        // s1 ["A", "L"] ==> some rows could pass (must keep)
        true,
        // s1 ["N", "Z"] ==> some rows could pass (must keep)
        true,
        // s1 ["M", "M"] ==> some rows could pass (must keep)
        true,
        // s1 [NULL, NULL] ==> unknown (must keep)
        true,
        // s1 ["A", NULL] ==> some rows could pass (must keep)
        true,
        // s1 ["", "A"] ==> some rows could pass (must keep)
        true,
        // s1 ["", ""] ==> some rows could pass (must keep)
        true,
        // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
        true,
        // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
        true,
    ];
    prune_with_expr(expr, &schema, &statistics, expected_ret);

    // Pattern without wildcards: pruned only when min and max both equal it.
    let expr = col("s1").not_like(lit("M"));
    #[rustfmt::skip]
    let expected_ret = &[
        // s1 ["A", "Z"] ==> some rows could pass (must keep)
        true,
        // s1 ["A", "L"] ==> some rows could pass (must keep)
        true,
        // s1 ["N", "Z"] ==> some rows could pass (must keep)
        true,
        // s1 ["M", "M"] ==> no row match
        false,
        // s1 [NULL, NULL] ==> unknown (must keep)
        true,
        // s1 ["A", NULL] ==> some rows could pass (must keep)
        true,
        // s1 ["", "A"] ==> some rows could pass (must keep)
        true,
        // s1 ["", ""] ==> some rows could pass (must keep)
        true,
        // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
        true,
        // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
        true,
    ];
    prune_with_expr(expr, &schema, &statistics, expected_ret);

    // Escaped `%` (a literal percent sign) followed by a trailing wildcard:
    // the constant prefix is "A%".
    let expr = col("s1").not_like(lit("A\\%%"));
    let statistics = TestStatistics::new().with(
        "s1",
        ContainerStats::new_utf8(
            vec![Some("A%a"), Some("A")],
            vec![Some("A%c"), Some("A")],
        ),
    );
    // s1 ["A%a", "A%c"] ==> no row match
    // s1 ["A", "A"]     ==> some rows could pass (must keep)
    let expected_ret = &[false, true];
    prune_with_expr(expr, &schema, &statistics, expected_ret);
}
4250
+
4064
4251
#[ test]
4065
4252
fn test_rewrite_expr_to_prunable ( ) {
4066
4253
let schema = Schema :: new ( vec ! [ Field :: new( "a" , DataType :: Int32 , true ) ] ) ;
0 commit comments