@@ -1590,6 +1590,13 @@ fn build_statistics_expr(
1590
1590
) ) ,
1591
1591
) )
1592
1592
}
1593
+ Operator :: NotLikeMatch => {
1594
+ build_not_like_match ( expr_builder) . ok_or_else ( || {
1595
+ plan_datafusion_err ! (
1596
+ "The NOT LIKE expression with wildcards is only supported at the end of the pattern"
1597
+ )
1598
+ } ) ?
1599
+ }
1593
1600
Operator :: LikeMatch => build_like_match ( expr_builder) . ok_or_else ( || {
1594
1601
plan_datafusion_err ! (
1595
1602
"LIKE expression with wildcard at the beginning is not supported"
@@ -1638,6 +1645,19 @@ fn build_statistics_expr(
1638
1645
Ok ( statistics_expr)
1639
1646
}
1640
1647
1648
+ /// returns the string literal of the scalar value if it is a string
1649
+ fn unpack_string ( s : & ScalarValue ) -> Option < & str > {
1650
+ s. try_as_str ( ) . flatten ( )
1651
+ }
1652
+
1653
+ fn extract_string_literal ( expr : & Arc < dyn PhysicalExpr > ) -> Option < & str > {
1654
+ if let Some ( lit) = expr. as_any ( ) . downcast_ref :: < phys_expr:: Literal > ( ) {
1655
+ let s = unpack_string ( lit. value ( ) ) ?;
1656
+ return Some ( s) ;
1657
+ }
1658
+ None
1659
+ }
1660
+
1641
1661
/// Convert `column LIKE literal` where P is a constant prefix of the literal
1642
1662
/// to a range check on the column: `P <= column && column < P'`, where P' is the
1643
1663
/// lowest string after all P* strings.
@@ -1650,19 +1670,6 @@ fn build_like_match(
1650
1670
// column LIKE '%foo%' => min <= '' && '' <= max => true
1651
1671
// column LIKE 'foo' => min <= 'foo' && 'foo' <= max
1652
1672
1653
- /// returns the string literal of the scalar value if it is a string
1654
- fn unpack_string ( s : & ScalarValue ) -> Option < & str > {
1655
- s. try_as_str ( ) . flatten ( )
1656
- }
1657
-
1658
- fn extract_string_literal ( expr : & Arc < dyn PhysicalExpr > ) -> Option < & str > {
1659
- if let Some ( lit) = expr. as_any ( ) . downcast_ref :: < phys_expr:: Literal > ( ) {
1660
- let s = unpack_string ( lit. value ( ) ) ?;
1661
- return Some ( s) ;
1662
- }
1663
- None
1664
- }
1665
-
1666
1673
// TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
1667
1674
// this may involve building the physical expressions that call lower() and upper()
1668
1675
let min_column_expr = expr_builder. min_column_expr ( ) . ok ( ) ?;
@@ -1710,6 +1717,56 @@ fn build_like_match(
1710
1717
Some ( combined)
1711
1718
}
1712
1719
1720
+ // For predicate `col NOT LIKE 'foo%'`, we rewrite it as `(col_min NOT LIKE 'foo%' OR col_max NOT LIKE 'foo%')`. If both col_min and col_max have the prefix foo, we skip the entire row group (as we can be certain that all data in this row group has the prefix foo).
1721
+ fn build_not_like_match (
1722
+ expr_builder : & mut PruningExpressionBuilder < ' _ > ,
1723
+ ) -> Option < Arc < dyn PhysicalExpr > > {
1724
+ // col NOT LIKE 'prefix%' -> !(col_min LIKE 'prefix%' && col_max LIKE 'prefix%') -> (col_min NOT LIKE 'prefix%' || col_max NOT LIKE 'prefix%')
1725
+
1726
+ let min_column_expr = expr_builder. min_column_expr ( ) . ok ( ) ?;
1727
+ let max_column_expr = expr_builder. max_column_expr ( ) . ok ( ) ?;
1728
+
1729
+ let scalar_expr = expr_builder. scalar_expr ( ) ;
1730
+
1731
+ let pattern = extract_string_literal ( scalar_expr) ?;
1732
+
1733
+ let chars: Vec < char > = pattern. chars ( ) . collect ( ) ;
1734
+ for i in 0 ..chars. len ( ) - 1 {
1735
+ // Check if current char is a wildcard and is not escaped with backslash
1736
+ if ( chars[ i] == '%' || chars[ i] == '_' ) && ( i == 0 || chars[ i - 1 ] != '\\' ) {
1737
+ // Example: For pattern "foo%bar", the row group might include values like
1738
+ // ["foobar", "food", "foodbar"], making it unsafe to prune.
1739
+ return None ;
1740
+ }
1741
+ }
1742
+
1743
+ if chars. last ( ) == Some ( & '_' ) && ( chars. len ( ) > 1 && chars[ chars. len ( ) - 2 ] != '\\' ) {
1744
+ // Example: For pattern "foo_", row groups might contain ["fooa", "fooaa", "foob"],
1745
+ // which means not every row is guaranteed to match the pattern.
1746
+ return None ;
1747
+ }
1748
+
1749
+ let min_col_not_like_epxr = Arc :: new ( phys_expr:: LikeExpr :: new (
1750
+ true ,
1751
+ false ,
1752
+ Arc :: clone ( & min_column_expr) ,
1753
+ Arc :: clone ( scalar_expr) ,
1754
+ ) ) ;
1755
+
1756
+ let max_col_not_like_expr = Arc :: new ( phys_expr:: LikeExpr :: new (
1757
+ true ,
1758
+ false ,
1759
+ Arc :: clone ( & max_column_expr) ,
1760
+ Arc :: clone ( scalar_expr) ,
1761
+ ) ) ;
1762
+
1763
+ Some ( Arc :: new ( phys_expr:: BinaryExpr :: new (
1764
+ min_col_not_like_epxr,
1765
+ Operator :: Or ,
1766
+ max_col_not_like_expr,
1767
+ ) ) )
1768
+ }
1769
+
1713
1770
/// Increment a UTF8 string by one, returning `None` if it can't be incremented.
1714
1771
/// This makes it so that the returned string will always compare greater than the input string
1715
1772
/// or any other string with the same prefix.
@@ -4061,6 +4118,106 @@ mod tests {
4061
4118
prune_with_expr ( expr, & schema, & statistics, expected_ret) ;
4062
4119
}
4063
4120
4121
/// `NOT LIKE` with a trailing single-character wildcard (`_`) cannot be used
/// for pruning, so every container must be kept.
#[test]
fn prune_utf8_not_like_one() {
    let (schema, statistics) = utf8_setup();

    let expr = col("s1").not_like(lit("A\u{10ffff}_"));
    #[rustfmt::skip]
    let expected_ret = &[
        // s1 ["A", "Z"] ==> some rows could pass (must keep)
        true,
        // s1 ["A", "L"] ==> some rows could pass (must keep)
        true,
        // s1 ["N", "Z"] ==> some rows could pass (must keep)
        true,
        // s1 ["M", "M"] ==> some rows could pass (must keep)
        true,
        // s1 [NULL, NULL] ==> unknown (must keep)
        true,
        // s1 ["A", NULL] ==> some rows could pass (must keep)
        true,
        // s1 ["", "A"] ==> some rows could pass (must keep)
        true,
        // s1 ["", ""] ==> some rows could pass (must keep)
        true,
        // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
        true,
        // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> judging by the bounds no row
        // passes, but (min, max) may have been truncated: the original (min, max) may be
        // ("A\u{10ffff}\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}\u{10ffff}\u{10ffff}") (must keep)
        true,
    ];
    prune_with_expr(expr, &schema, &statistics, expected_ret);
}
4152
+
4153
/// `NOT LIKE` with the multi-character wildcard (`%`): pruning is possible
/// only when the single unescaped `%` is the last character of the pattern.
#[test]
fn prune_utf8_not_like_many() {
    let (schema, statistics) = utf8_setup();

    // Trailing wildcard: a container is pruned when both bounds match the prefix.
    let expr = col("s1").not_like(lit("A\u{10ffff}%"));
    #[rustfmt::skip]
    let expected_ret = &[
        // s1 ["A", "Z"] ==> some rows could pass (must keep)
        true,
        // s1 ["A", "L"] ==> some rows could pass (must keep)
        true,
        // s1 ["N", "Z"] ==> some rows could pass (must keep)
        true,
        // s1 ["M", "M"] ==> some rows could pass (must keep)
        true,
        // s1 [NULL, NULL] ==> unknown (must keep)
        true,
        // s1 ["A", NULL] ==> some rows could pass (must keep)
        true,
        // s1 ["", "A"] ==> some rows could pass (must keep)
        true,
        // s1 ["", ""] ==> some rows could pass (must keep)
        true,
        // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
        true,
        // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no row match
        false,
    ];
    prune_with_expr(expr, &schema, &statistics, expected_ret);

    // Wildcard in the middle of the pattern: not supported, nothing is pruned.
    let expr = col("s1").not_like(lit("A\u{10ffff}%\u{10ffff}"));
    #[rustfmt::skip]
    let expected_ret = &[
        // s1 ["A", "Z"] ==> some rows could pass (must keep)
        true,
        // s1 ["A", "L"] ==> some rows could pass (must keep)
        true,
        // s1 ["N", "Z"] ==> some rows could pass (must keep)
        true,
        // s1 ["M", "M"] ==> some rows could pass (must keep)
        true,
        // s1 [NULL, NULL] ==> unknown (must keep)
        true,
        // s1 ["A", NULL] ==> some rows could pass (must keep)
        true,
        // s1 ["", "A"] ==> some rows could pass (must keep)
        true,
        // s1 ["", ""] ==> some rows could pass (must keep)
        true,
        // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
        true,
        // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
        true,
    ];
    prune_with_expr(expr, &schema, &statistics, expected_ret);

    // Escaped `%` followed by a trailing wildcard: the prefix is the literal "A%".
    let expr = col("s1").not_like(lit("A\\%%"));
    let statistics = TestStatistics::new().with(
        "s1",
        ContainerStats::new_utf8(
            vec![Some("A%a"), Some("A")],
            vec![Some("A%c"), Some("A")],
        ),
    );
    #[rustfmt::skip]
    let expected_ret = &[
        // s1 ["A%a", "A%c"] ==> both bounds start with the literal "A%": no row match
        false,
        // s1 ["A", "A"] ==> bounds do not start with "A%": some rows could pass (must keep)
        true,
    ];
    prune_with_expr(expr, &schema, &statistics, expected_ret);
}
4220
+
4064
4221
#[ test]
4065
4222
fn test_rewrite_expr_to_prunable ( ) {
4066
4223
let schema = Schema :: new ( vec ! [ Field :: new( "a" , DataType :: Int32 , true ) ] ) ;
0 commit comments