@@ -1590,6 +1590,13 @@ fn build_statistics_expr(
1590
1590
) ) ,
1591
1591
) )
1592
1592
}
1593
+ Operator :: NotLikeMatch => {
1594
+ build_not_like_match ( expr_builder) . ok_or_else ( || {
1595
+ plan_datafusion_err ! (
1596
+ "The NOT LIKE expression with wildcards is only supported at the end of the pattern"
1597
+ )
1598
+ } ) ?
1599
+ }
1593
1600
Operator :: LikeMatch => build_like_match ( expr_builder) . ok_or_else ( || {
1594
1601
plan_datafusion_err ! (
1595
1602
"LIKE expression with wildcard at the beginning is not supported"
@@ -1638,6 +1645,19 @@ fn build_statistics_expr(
1638
1645
Ok ( statistics_expr)
1639
1646
}
1640
1647
1648
+ /// returns the string literal of the scalar value if it is a string
1649
+ fn unpack_string ( s : & ScalarValue ) -> Option < & str > {
1650
+ s. try_as_str ( ) . flatten ( )
1651
+ }
1652
+
1653
+ fn extract_string_literal ( expr : & Arc < dyn PhysicalExpr > ) -> Option < & str > {
1654
+ if let Some ( lit) = expr. as_any ( ) . downcast_ref :: < phys_expr:: Literal > ( ) {
1655
+ let s = unpack_string ( lit. value ( ) ) ?;
1656
+ return Some ( s) ;
1657
+ }
1658
+ None
1659
+ }
1660
+
1641
1661
/// Convert `column LIKE literal` where P is a constant prefix of the literal
1642
1662
/// to a range check on the column: `P <= column && column < P'`, where P' is the
1643
1663
/// lowest string after all P* strings.
@@ -1650,19 +1670,6 @@ fn build_like_match(
1650
1670
// column LIKE '%foo%' => min <= '' && '' <= max => true
1651
1671
// column LIKE 'foo' => min <= 'foo' && 'foo' <= max
1652
1672
1653
- /// returns the string literal of the scalar value if it is a string
1654
- fn unpack_string ( s : & ScalarValue ) -> Option < & str > {
1655
- s. try_as_str ( ) . flatten ( )
1656
- }
1657
-
1658
- fn extract_string_literal ( expr : & Arc < dyn PhysicalExpr > ) -> Option < & str > {
1659
- if let Some ( lit) = expr. as_any ( ) . downcast_ref :: < phys_expr:: Literal > ( ) {
1660
- let s = unpack_string ( lit. value ( ) ) ?;
1661
- return Some ( s) ;
1662
- }
1663
- None
1664
- }
1665
-
1666
1673
// TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
1667
1674
// this may involve building the physical expressions that call lower() and upper()
1668
1675
let min_column_expr = expr_builder. min_column_expr ( ) . ok ( ) ?;
@@ -1710,6 +1717,57 @@ fn build_like_match(
1710
1717
Some ( combined)
1711
1718
}
1712
1719
1720
+ // col NOT LIKE 'prefix%' -> !(col_min LIKE 'prefix%' && col_max LIKE 'prefix%') -> (col_min NOT LIKE 'prefix%' || col_max NOT LIKE 'prefix%')
1721
+ // If both col_min and col_max match a prefix pattern, we can prune entire row group because **ALL** values in this row group will match the pattern.
1722
+ fn build_not_like_match (
1723
+ expr_builder : & mut PruningExpressionBuilder < ' _ > ,
1724
+ ) -> Option < Arc < dyn PhysicalExpr > > {
1725
+ let min_column_expr = expr_builder. min_column_expr ( ) . ok ( ) ?;
1726
+ let max_column_expr = expr_builder. max_column_expr ( ) . ok ( ) ?;
1727
+
1728
+ let scalar_expr = expr_builder. scalar_expr ( ) ;
1729
+
1730
+ let pattern = extract_string_literal ( scalar_expr) ?;
1731
+
1732
+ let chars: Vec < char > = pattern. chars ( ) . collect ( ) ;
1733
+ for i in 0 ..chars. len ( ) - 1 {
1734
+ // Check if current char is a wildcard and is not escaped with backslash
1735
+ if ( chars[ i] == '%' || chars[ i] == '_' ) && ( i == 0 || chars[ i - 1 ] != '\\' ) {
1736
+ // Example: For pattern "foo%bar", the row group might include values like
1737
+ // ["foobar", "food", "foodbar"], making it unsafe to prune.
1738
+ return None ;
1739
+ }
1740
+ }
1741
+
1742
+ if chars. last ( ) == Some ( & '_' ) && ( chars. len ( ) > 1 && chars[ chars. len ( ) - 2 ] != '\\' ) {
1743
+ // Example: For pattern "foo_", row groups might contain ["fooa", "fooaa", "foob"],
1744
+ // which means not every row is guaranteed to match the pattern.
1745
+ return None ;
1746
+ }
1747
+
1748
+ let min_col_not_like_epxr = Arc :: new ( phys_expr:: LikeExpr :: new (
1749
+ true ,
1750
+ false ,
1751
+ Arc :: clone ( & min_column_expr) ,
1752
+ scalar_expr. clone ( ) ,
1753
+ ) ) ;
1754
+
1755
+ let max_col_not_like_expr = Arc :: new ( phys_expr:: LikeExpr :: new (
1756
+ true ,
1757
+ false ,
1758
+ Arc :: clone ( & max_column_expr) ,
1759
+ scalar_expr. clone ( ) ,
1760
+ ) ) ;
1761
+
1762
+ Some ( Arc :: new ( phys_expr:: BinaryExpr :: new (
1763
+ min_col_not_like_epxr,
1764
+ Operator :: Or ,
1765
+ max_col_not_like_expr,
1766
+ ) ) )
1767
+ }
1768
+
1769
+
1770
+
1713
1771
/// Increment a UTF8 string by one, returning `None` if it can't be incremented.
1714
1772
/// This makes it so that the returned string will always compare greater than the input string
1715
1773
/// or any other string with the same prefix.
@@ -4061,6 +4119,106 @@ mod tests {
4061
4119
prune_with_expr ( expr, & schema, & statistics, expected_ret) ;
4062
4120
}
4063
4121
4122
// Pruning check for `s1 NOT LIKE <pattern>` where the pattern ends in an
// unescaped `_` (single-character wildcard).
// Every entry of `expected_ret` is `true`: no container may be pruned for this
// pattern, including the last one whose min and max are identical.
+ #[ test]
4123
+ fn prune_utf8_not_like_one ( ) {
4124
+ let ( schema, statistics) = utf8_setup ( ) ;
4125
+
4126
+ let expr = col ( "s1" ) . not_like ( lit ( "A\u{10ffff} _" ) ) ;
4127
+ #[ rustfmt:: skip]
4128
+ let expected_ret = & [
4129
+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
4130
+ true ,
4131
+ // s1 ["A", "L"] ==> some rows could pass (must keep)
4132
+ true ,
4133
+ // s1 ["N", "Z"] ==> some rows could pass (must keep)
4134
+ true ,
4135
+ // s1 ["M", "M"] ==> some rows could pass (must keep)
4136
+ true ,
4137
+ // s1 [NULL, NULL] ==> unknown (must keep)
4138
+ true ,
4139
+ // s1 ["A", NULL] ==> some rows could pass (must keep)
4140
+ true ,
4141
+ // s1 ["", "A"] ==> some rows could pass (must keep)
4142
+ true ,
4143
+ // s1 ["", ""] ==> some rows could pass (must keep)
4144
+ true ,
4145
+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
4146
+ true ,
4147
+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> looks like no row could pass, but (min, max) may be truncated:
4148
+ // the original (min, max) may be ("A\u{10ffff}\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}\u{10ffff}\u{10ffff}"), so the container must be kept
4149
+ true ,
4150
+ ] ;
4151
+ prune_with_expr ( expr, & schema, & statistics, expected_ret) ;
4152
+ }
4153
+
4154
// Pruning checks for `s1 NOT LIKE <pattern>` with the `%` (multi-character)
// wildcard, in three cases that each call `prune_with_expr`.
+ #[ test]
4155
+ fn prune_utf8_not_like_many ( ) {
4156
+ let ( schema, statistics) = utf8_setup ( ) ;
4157
+
4158
// Case 1: the pattern ends in `%`. Only the last container, whose min and
// max are identical, is expected to be pruned (`false`).
+ let expr = col ( "s1" ) . not_like ( lit ( "A\u{10ffff} %" ) ) ;
4159
+ #[ rustfmt:: skip]
4160
+ let expected_ret = & [
4161
+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
4162
+ true ,
4163
+ // s1 ["A", "L"] ==> some rows could pass (must keep)
4164
+ true ,
4165
+ // s1 ["N", "Z"] ==> some rows could pass (must keep)
4166
+ true ,
4167
+ // s1 ["M", "M"] ==> some rows could pass (must keep)
4168
+ true ,
4169
+ // s1 [NULL, NULL] ==> unknown (must keep)
4170
+ true ,
4171
+ // s1 ["A", NULL] ==> some rows could pass (must keep)
4172
+ true ,
4173
+ // s1 ["", "A"] ==> some rows could pass (must keep)
4174
+ true ,
4175
+ // s1 ["", ""] ==> some rows could pass (must keep)
4176
+ true ,
4177
+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
4178
+ true ,
4179
+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no row could pass (can prune)
4180
+ false ,
4181
+ ] ;
4182
+ prune_with_expr ( expr, & schema, & statistics, expected_ret) ;
4183
+
4184
// Case 2: the `%` is followed by further characters, so it is not at the end
// of the pattern. Every container is expected to be kept.
+ let expr = col ( "s1" ) . not_like ( lit ( "A\u{10ffff} %\u{10ffff} " ) ) ;
4185
+ #[ rustfmt:: skip]
4186
+ let expected_ret = & [
4187
+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
4188
+ true ,
4189
+ // s1 ["A", "L"] ==> some rows could pass (must keep)
4190
+ true ,
4191
+ // s1 ["N", "Z"] ==> some rows could pass (must keep)
4192
+ true ,
4193
+ // s1 ["M", "M"] ==> some rows could pass (must keep)
4194
+ true ,
4195
+ // s1 [NULL, NULL] ==> unknown (must keep)
4196
+ true ,
4197
+ // s1 ["A", NULL] ==> some rows could pass (must keep)
4198
+ true ,
4199
+ // s1 ["", "A"] ==> some rows could pass (must keep)
4200
+ true ,
4201
+ // s1 ["", ""] ==> some rows could pass (must keep)
4202
+ true ,
4203
+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
4204
+ true ,
4205
+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
4206
+ true ,
4207
+ ] ;
4208
+ prune_with_expr ( expr, & schema, & statistics, expected_ret) ;
4209
+
4210
// Case 3: custom statistics with two containers, built from the value pairs
// ("A%a", "A%c") and ("A", "A") (presumably min/max; confirm against
// `ContainerStats::new_utf8`). The first is expected to be pruned, the second kept.
// NOTE(review): the pattern is presumably an escaped literal `%` followed by a
// trailing wildcard `%`; confirm the literal.
+ let expr = col ( "s1" ) . not_like ( lit ( "A\\ %%" ) ) ;
4211
+ let statistics = TestStatistics :: new ( ) . with (
4212
+ "s1" ,
4213
+ ContainerStats :: new_utf8 (
4214
+ vec ! [ Some ( "A%a" ) , Some ( "A" ) ] ,
4215
+ vec ! [ Some ( "A%c" ) , Some ( "A" ) ] ,
4216
+ ) ,
4217
+ ) ;
4218
+ let expected_ret = & [ false , true ] ;
4219
+ prune_with_expr ( expr, & schema, & statistics, expected_ret) ;
4220
+ }
4221
+
4064
4222
#[ test]
4065
4223
fn test_rewrite_expr_to_prunable ( ) {
4066
4224
let schema = Schema :: new ( vec ! [ Field :: new( "a" , DataType :: Int32 , true ) ] ) ;
0 commit comments