@@ -859,18 +859,62 @@ idx_t ParquetReader::GetGroupOffset(ParquetReaderScanState &state) {
859
859
860
860
//! Evaluates a table filter against the min/max strings recorded in the Parquet column statistics.
//!
//! \param stats         statistics object handed to TableFilter::CheckStatistics for filter types that
//!                      are not special-cased here
//! \param pq_col_stats  Parquet column statistics; min_value and max_value are read in full
//! \param filter        the filter to evaluate; CONJUNCTION_AND filters are walked recursively
//! \return the pruning verdict for the filter
//!
//! Constant comparisons go through StringStats::CheckZonemap on the complete min/max strings from the
//! Parquet metadata. The call site notes that StringStats only keep a prefix of each string, which is
//! why the Parquet values are used directly here.
static FilterPropagateResult CheckParquetStringFilter(BaseStatistics &stats, const Statistics &pq_col_stats,
                                                      TableFilter &filter) {
    if (filter.filter_type == TableFilterType::CONSTANT_COMPARISON) {
        // compare the constant against the untruncated lower/upper bound of the column
        auto &comparison = filter.Cast<ConstantFilter>();
        auto &lower_bound = pq_col_stats.min_value;
        auto &upper_bound = pq_col_stats.max_value;
        auto lower_ptr = const_data_ptr_cast(lower_bound.c_str());
        auto upper_ptr = const_data_ptr_cast(upper_bound.c_str());
        return StringStats::CheckZonemap(lower_ptr, lower_bound.size(), upper_ptr, upper_bound.size(),
                                         comparison.comparison_type, StringValue::Get(comparison.constant));
    }
    if (filter.filter_type != TableFilterType::CONJUNCTION_AND) {
        // every other filter type is answered by the generic statistics check
        return filter.CheckStatistics(stats);
    }

    // AND: a single always-false child makes the whole conjunction always false;
    // the conjunction keeps its initial always-true verdict only while every child agrees with it
    auto &and_filter = filter.Cast<ConjunctionAndFilter>();
    auto combined = FilterPropagateResult::FILTER_ALWAYS_TRUE;
    for (auto &child : and_filter.child_filters) {
        const auto child_verdict = CheckParquetStringFilter(stats, pq_col_stats, *child);
        if (child_verdict == FilterPropagateResult::FILTER_ALWAYS_FALSE) {
            return child_verdict;
        }
        if (child_verdict != combined) {
            combined = FilterPropagateResult::NO_PRUNING_POSSIBLE;
        }
    }
    return combined;
}
873
889
890
+ static FilterPropagateResult CheckParquetFloatFilter (ColumnReader &reader, const Statistics &pq_col_stats,
891
+ TableFilter &filter) {
892
+ // floating point values can have values in the [min, max] domain AND nan values
893
+ // check both stats against the filter
894
+ auto &type = reader.Type ();
895
+ auto nan_stats = NumericStats::CreateUnknown (type);
896
+ auto nan_value = Value (" nan" ).DefaultCastAs (type);
897
+ NumericStats::SetMin (nan_stats, nan_value);
898
+ NumericStats::SetMax (nan_stats, nan_value);
899
+ auto nan_prune = filter.CheckStatistics (nan_stats);
900
+
901
+ auto min_max_stats = ParquetStatisticsUtils::CreateNumericStats (reader.Type (), reader.Schema (), pq_col_stats);
902
+ auto prune = filter.CheckStatistics (*min_max_stats);
903
+
904
+ // if EITHER of them cannot be pruned - we cannot prune
905
+ if (prune == FilterPropagateResult::NO_PRUNING_POSSIBLE ||
906
+ nan_prune == FilterPropagateResult::NO_PRUNING_POSSIBLE) {
907
+ return FilterPropagateResult::NO_PRUNING_POSSIBLE;
908
+ }
909
+ // if both are the same we can return that value
910
+ if (prune == nan_prune) {
911
+ return prune;
912
+ }
913
+ // if they are different we need to return that we cannot prune
914
+ // e.g. prune = always false, nan_prune = always true -> we don't know
915
+ return FilterPropagateResult::NO_PRUNING_POSSIBLE;
916
+ }
917
+
874
918
void ParquetReader::PrepareRowGroupBuffer (ParquetReaderScanState &state, idx_t i) {
875
919
auto &group = GetGroup (state);
876
920
auto col_idx = MultiFileLocalIndex (i);
@@ -889,43 +933,38 @@ void ParquetReader::PrepareRowGroupBuffer(ParquetReaderScanState &state, idx_t i
889
933
// check the bloom filter if present
890
934
bool is_generated_column = column_reader.ColumnIndex () >= group.columns .size ();
891
935
bool is_expression = column_reader.Schema ().schema_type == ::duckdb::ParquetColumnSchemaType::EXPRESSION;
936
+ bool has_min_max = false ;
937
+ if (!is_generated_column) {
938
+ has_min_max = group.columns [column_reader.ColumnIndex ()].meta_data .statistics .__isset .min_value &&
939
+ group.columns [column_reader.ColumnIndex ()].meta_data .statistics .__isset .max_value ;
940
+ }
892
941
if (is_expression) {
893
942
// no pruning possible for expressions
894
943
prune_result = FilterPropagateResult::NO_PRUNING_POSSIBLE;
895
- } else if (!column_reader.Type ().IsNested () && !is_generated_column &&
896
- ParquetStatisticsUtils::BloomFilterSupported (column_reader.Type ().id ()) &&
897
- ParquetStatisticsUtils::BloomFilterExcludes (filter,
898
- group.columns [column_reader.ColumnIndex ()].meta_data ,
899
- *state.thrift_file_proto , allocator)) {
900
- prune_result = FilterPropagateResult::FILTER_ALWAYS_FALSE;
901
- } else if (column_reader.Type ().id () == LogicalTypeId::VARCHAR && !is_generated_column &&
902
- group.columns [column_reader.ColumnIndex ()].meta_data .statistics .__isset .min_value &&
903
- group.columns [column_reader.ColumnIndex ()].meta_data .statistics .__isset .max_value ) {
904
-
944
+ } else if (!is_generated_column && has_min_max && column_reader.Type ().id () == LogicalTypeId::VARCHAR) {
905
945
// our StringStats only store the first 8 bytes of strings (even if Parquet has longer string stats)
906
946
// however, when reading remote Parquet files, skipping row groups is really important
907
947
// here, we implement a special case to check the full length for string filters
908
- if (filter.filter_type == TableFilterType::CONJUNCTION_AND) {
909
- const auto &and_filter = filter.Cast <ConjunctionAndFilter>();
910
- auto and_result = FilterPropagateResult::FILTER_ALWAYS_TRUE;
911
- for (auto &child_filter : and_filter.child_filters ) {
912
- auto child_prune_result = CheckParquetStringFilter (
913
- *stats, group.columns [column_reader.ColumnIndex ()].meta_data .statistics , *child_filter);
914
- if (child_prune_result == FilterPropagateResult::FILTER_ALWAYS_FALSE) {
915
- and_result = FilterPropagateResult::FILTER_ALWAYS_FALSE;
916
- break ;
917
- } else if (child_prune_result != and_result) {
918
- and_result = FilterPropagateResult::NO_PRUNING_POSSIBLE;
919
- }
920
- }
921
- prune_result = and_result;
922
- } else {
923
- prune_result = CheckParquetStringFilter (
924
- *stats, group.columns [column_reader.ColumnIndex ()].meta_data .statistics , filter);
925
- }
948
+ prune_result = CheckParquetStringFilter (
949
+ *stats, group.columns [column_reader.ColumnIndex ()].meta_data .statistics , filter);
950
+ } else if (!is_generated_column && has_min_max &&
951
+ (column_reader.Type ().id () == LogicalTypeId::FLOAT ||
952
+ column_reader.Type ().id () == LogicalTypeId::DOUBLE)) {
953
+ // floating point columns can have NaN values in addition to the min/max bounds defined in the file
954
+ // in order to do optimal pruning - we prune based on the [min, max] of the file followed by pruning
955
+ // based on nan
956
+ prune_result = CheckParquetFloatFilter (
957
+ column_reader, group.columns [column_reader.ColumnIndex ()].meta_data .statistics , filter);
926
958
} else {
927
959
prune_result = filter.CheckStatistics (*stats);
928
960
}
961
+ if (prune_result == FilterPropagateResult::NO_PRUNING_POSSIBLE && !column_reader.Type ().IsNested () &&
962
+ !is_generated_column && ParquetStatisticsUtils::BloomFilterSupported (column_reader.Type ().id ()) &&
963
+ ParquetStatisticsUtils::BloomFilterExcludes (filter,
964
+ group.columns [column_reader.ColumnIndex ()].meta_data ,
965
+ *state.thrift_file_proto , allocator)) {
966
+ prune_result = FilterPropagateResult::FILTER_ALWAYS_FALSE;
967
+ }
929
968
930
969
if (prune_result == FilterPropagateResult::FILTER_ALWAYS_FALSE) {
931
970
// this effectively will skip this chunk
0 commit comments