
Commit 9507d49

Update vendored DuckDB sources to d392e43

1 parent d392e43

12 files changed (+159 -68 lines)

src/duckdb/extension/parquet/include/parquet_statistics.hpp (+3)

@@ -36,6 +36,9 @@ struct ParquetStatisticsUtils {
 	static bool BloomFilterExcludes(const TableFilter &filter, const duckdb_parquet::ColumnMetaData &column_meta_data,
 	                                duckdb_apache::thrift::protocol::TProtocol &file_proto, Allocator &allocator);
 
+	static unique_ptr<BaseStatistics> CreateNumericStats(const LogicalType &type, const ParquetColumnSchema &schema_ele,
+	                                                     const duckdb_parquet::Statistics &parquet_stats);
+
 private:
 	static Value ConvertValueInternal(const LogicalType &type, const ParquetColumnSchema &schema_ele,
 	                                  const std::string &stats);
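Note: this declaration promotes CreateNumericStats from a file-local static in parquet_statistics.cpp to a public static member of ParquetStatisticsUtils, so that the new floating-point filter check in parquet_reader.cpp (next file) can reuse it.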

src/duckdb/extension/parquet/parquet_reader.cpp (+69 -30)

@@ -859,18 +859,62 @@ idx_t ParquetReader::GetGroupOffset(ParquetReaderScanState &state) {
 
 static FilterPropagateResult CheckParquetStringFilter(BaseStatistics &stats, const Statistics &pq_col_stats,
                                                       TableFilter &filter) {
-	if (filter.filter_type == TableFilterType::CONSTANT_COMPARISON) {
+	switch (filter.filter_type) {
+	case TableFilterType::CONJUNCTION_AND: {
+		auto &conjunction_filter = filter.Cast<ConjunctionAndFilter>();
+		auto and_result = FilterPropagateResult::FILTER_ALWAYS_TRUE;
+		for (auto &child_filter : conjunction_filter.child_filters) {
+			auto child_prune_result = CheckParquetStringFilter(stats, pq_col_stats, *child_filter);
+			if (child_prune_result == FilterPropagateResult::FILTER_ALWAYS_FALSE) {
+				return FilterPropagateResult::FILTER_ALWAYS_FALSE;
+			}
+			if (child_prune_result != and_result) {
+				and_result = FilterPropagateResult::NO_PRUNING_POSSIBLE;
+			}
+		}
+		return and_result;
+	}
+	case TableFilterType::CONSTANT_COMPARISON: {
 		auto &constant_filter = filter.Cast<ConstantFilter>();
 		auto &min_value = pq_col_stats.min_value;
 		auto &max_value = pq_col_stats.max_value;
 		return StringStats::CheckZonemap(const_data_ptr_cast(min_value.c_str()), min_value.size(),
 		                                 const_data_ptr_cast(max_value.c_str()), max_value.size(),
 		                                 constant_filter.comparison_type, StringValue::Get(constant_filter.constant));
-	} else {
+	}
+	default:
 		return filter.CheckStatistics(stats);
 	}
 }
 
+static FilterPropagateResult CheckParquetFloatFilter(ColumnReader &reader, const Statistics &pq_col_stats,
+                                                     TableFilter &filter) {
+	// floating point values can have values in the [min, max] domain AND nan values
+	// check both stats against the filter
+	auto &type = reader.Type();
+	auto nan_stats = NumericStats::CreateUnknown(type);
+	auto nan_value = Value("nan").DefaultCastAs(type);
+	NumericStats::SetMin(nan_stats, nan_value);
+	NumericStats::SetMax(nan_stats, nan_value);
+	auto nan_prune = filter.CheckStatistics(nan_stats);
+
+	auto min_max_stats = ParquetStatisticsUtils::CreateNumericStats(reader.Type(), reader.Schema(), pq_col_stats);
+	auto prune = filter.CheckStatistics(*min_max_stats);
+
+	// if EITHER of them cannot be pruned - we cannot prune
+	if (prune == FilterPropagateResult::NO_PRUNING_POSSIBLE ||
+	    nan_prune == FilterPropagateResult::NO_PRUNING_POSSIBLE) {
+		return FilterPropagateResult::NO_PRUNING_POSSIBLE;
+	}
+	// if both are the same we can return that value
+	if (prune == nan_prune) {
+		return prune;
+	}
+	// if they are different we need to return that we cannot prune
+	// e.g. prune = always false, nan_prune = always true -> we don't know
+	return FilterPropagateResult::NO_PRUNING_POSSIBLE;
+}
+
 void ParquetReader::PrepareRowGroupBuffer(ParquetReaderScanState &state, idx_t i) {
 	auto &group = GetGroup(state);
 	auto col_idx = MultiFileLocalIndex(i);
@@ -889,43 +933,38 @@ void ParquetReader::PrepareRowGroupBuffer(ParquetReaderScanState &state, idx_t i
 	// check the bloom filter if present
 	bool is_generated_column = column_reader.ColumnIndex() >= group.columns.size();
 	bool is_expression = column_reader.Schema().schema_type == ::duckdb::ParquetColumnSchemaType::EXPRESSION;
+	bool has_min_max = false;
+	if (!is_generated_column) {
+		has_min_max = group.columns[column_reader.ColumnIndex()].meta_data.statistics.__isset.min_value &&
+		              group.columns[column_reader.ColumnIndex()].meta_data.statistics.__isset.max_value;
+	}
 	if (is_expression) {
 		// no pruning possible for expressions
 		prune_result = FilterPropagateResult::NO_PRUNING_POSSIBLE;
-	} else if (!column_reader.Type().IsNested() && !is_generated_column &&
-	           ParquetStatisticsUtils::BloomFilterSupported(column_reader.Type().id()) &&
-	           ParquetStatisticsUtils::BloomFilterExcludes(filter,
-	                                                       group.columns[column_reader.ColumnIndex()].meta_data,
-	                                                       *state.thrift_file_proto, allocator)) {
-		prune_result = FilterPropagateResult::FILTER_ALWAYS_FALSE;
-	} else if (column_reader.Type().id() == LogicalTypeId::VARCHAR && !is_generated_column &&
-	           group.columns[column_reader.ColumnIndex()].meta_data.statistics.__isset.min_value &&
-	           group.columns[column_reader.ColumnIndex()].meta_data.statistics.__isset.max_value) {
-
+	} else if (!is_generated_column && has_min_max && column_reader.Type().id() == LogicalTypeId::VARCHAR) {
 		// our StringStats only store the first 8 bytes of strings (even if Parquet has longer string stats)
 		// however, when reading remote Parquet files, skipping row groups is really important
 		// here, we implement a special case to check the full length for string filters
-		if (filter.filter_type == TableFilterType::CONJUNCTION_AND) {
-			const auto &and_filter = filter.Cast<ConjunctionAndFilter>();
-			auto and_result = FilterPropagateResult::FILTER_ALWAYS_TRUE;
-			for (auto &child_filter : and_filter.child_filters) {
-				auto child_prune_result = CheckParquetStringFilter(
-				    *stats, group.columns[column_reader.ColumnIndex()].meta_data.statistics, *child_filter);
-				if (child_prune_result == FilterPropagateResult::FILTER_ALWAYS_FALSE) {
-					and_result = FilterPropagateResult::FILTER_ALWAYS_FALSE;
-					break;
-				} else if (child_prune_result != and_result) {
-					and_result = FilterPropagateResult::NO_PRUNING_POSSIBLE;
-				}
-			}
-			prune_result = and_result;
-		} else {
-			prune_result = CheckParquetStringFilter(
-			    *stats, group.columns[column_reader.ColumnIndex()].meta_data.statistics, filter);
-		}
+		prune_result = CheckParquetStringFilter(
+		    *stats, group.columns[column_reader.ColumnIndex()].meta_data.statistics, filter);
+	} else if (!is_generated_column && has_min_max &&
+	           (column_reader.Type().id() == LogicalTypeId::FLOAT ||
+	            column_reader.Type().id() == LogicalTypeId::DOUBLE)) {
+		// floating point columns can have NaN values in addition to the min/max bounds defined in the file
+		// in order to do optimal pruning - we prune based on the [min, max] of the file followed by pruning
+		// based on nan
+		prune_result = CheckParquetFloatFilter(
+		    column_reader, group.columns[column_reader.ColumnIndex()].meta_data.statistics, filter);
 	} else {
 		prune_result = filter.CheckStatistics(*stats);
 	}
+	if (prune_result == FilterPropagateResult::NO_PRUNING_POSSIBLE && !column_reader.Type().IsNested() &&
+	    !is_generated_column && ParquetStatisticsUtils::BloomFilterSupported(column_reader.Type().id()) &&
+	    ParquetStatisticsUtils::BloomFilterExcludes(filter,
+	                                                group.columns[column_reader.ColumnIndex()].meta_data,
+	                                                *state.thrift_file_proto, allocator)) {
+		prune_result = FilterPropagateResult::FILTER_ALWAYS_FALSE;
+	}
 
 	if (prune_result == FilterPropagateResult::FILTER_ALWAYS_FALSE) {
 		// this effectively will skip this chunk
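CheckParquetFloatFilter only prunes a row group when the verdict from the file's [min, max] statistics and the verdict from a NaN-only domain agree. A minimal standalone sketch of that combine rule (illustrative names, not DuckDB code):

#include <cassert>

// Stand-in for DuckDB's FilterPropagateResult.
enum class Propagate { ALWAYS_TRUE, ALWAYS_FALSE, NO_PRUNING };

// Pruning is only safe when both domains agree; any uncertainty or
// disagreement degrades to NO_PRUNING, mirroring the function above.
static Propagate Combine(Propagate min_max, Propagate nan) {
	if (min_max == Propagate::NO_PRUNING || nan == Propagate::NO_PRUNING) {
		return Propagate::NO_PRUNING;
	}
	return min_max == nan ? min_max : Propagate::NO_PRUNING;
}

int main() {
	// Filter `x > 100` on a row group with file stats [1, 50] that may also
	// hold NaN: min/max says "always false", NaN says "always true", so the
	// row group cannot be skipped.
	assert(Combine(Propagate::ALWAYS_FALSE, Propagate::ALWAYS_TRUE) == Propagate::NO_PRUNING);
	// If both checks exclude the filter, the row group can be skipped safely.
	assert(Combine(Propagate::ALWAYS_FALSE, Propagate::ALWAYS_FALSE) == Propagate::ALWAYS_FALSE);
	return 0;
}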

src/duckdb/extension/parquet/parquet_statistics.cpp (+28 -4)

@@ -23,8 +23,9 @@ namespace duckdb {
 using duckdb_parquet::ConvertedType;
 using duckdb_parquet::Type;
 
-static unique_ptr<BaseStatistics> CreateNumericStats(const LogicalType &type, const ParquetColumnSchema &schema_ele,
-                                                     const duckdb_parquet::Statistics &parquet_stats) {
+unique_ptr<BaseStatistics> ParquetStatisticsUtils::CreateNumericStats(const LogicalType &type,
+                                                                      const ParquetColumnSchema &schema_ele,
+                                                                      const duckdb_parquet::Statistics &parquet_stats) {
 	auto stats = NumericStats::CreateUnknown(type);
 
 	// for reasons unknown to science, Parquet defines *both* `min` and `min_value` as well as `max` and
@@ -50,6 +51,27 @@ static unique_ptr<BaseStatistics> CreateNumericStats(const LogicalType &type, co
 	return stats.ToUnique();
 }
 
+static unique_ptr<BaseStatistics> CreateFloatingPointStats(const LogicalType &type,
+                                                           const ParquetColumnSchema &schema_ele,
+                                                           const duckdb_parquet::Statistics &parquet_stats) {
+	auto stats = NumericStats::CreateUnknown(type);
+
+	// floating point values can always have NaN values - hence we cannot use the max value from the file
+	Value min;
+	Value max;
+	if (parquet_stats.__isset.min_value) {
+		min = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.min_value);
+	} else if (parquet_stats.__isset.min) {
+		min = ParquetStatisticsUtils::ConvertValue(type, schema_ele, parquet_stats.min);
+	} else {
+		min = Value(type);
+	}
+	max = Value("nan").DefaultCastAs(type);
+	NumericStats::SetMin(stats, min);
+	NumericStats::SetMax(stats, max);
+	return stats.ToUnique();
+}
+
 Value ParquetStatisticsUtils::ConvertValue(const LogicalType &type, const ParquetColumnSchema &schema_ele,
                                            const std::string &stats) {
 	Value result;
@@ -328,8 +350,6 @@ unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(con
 	case LogicalTypeId::SMALLINT:
 	case LogicalTypeId::INTEGER:
 	case LogicalTypeId::BIGINT:
-	case LogicalTypeId::FLOAT:
-	case LogicalTypeId::DOUBLE:
 	case LogicalTypeId::DATE:
 	case LogicalTypeId::TIME:
 	case LogicalTypeId::TIME_TZ:
@@ -341,6 +361,10 @@ unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(con
 	case LogicalTypeId::DECIMAL:
 		row_group_stats = CreateNumericStats(type, schema, parquet_stats);
 		break;
+	case LogicalTypeId::FLOAT:
+	case LogicalTypeId::DOUBLE:
+		row_group_stats = CreateFloatingPointStats(type, schema, parquet_stats);
+		break;
 	case LogicalTypeId::VARCHAR: {
 		auto string_stats = StringStats::CreateEmpty(type);
 		if (parquet_stats.__isset.min_value) {
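The reason the max is widened to NaN: Parquet writers generally leave NaN out of min/max statistics, while DuckDB's total order places NaN above every other value, so a filter such as `x > 100` can still match a NaN row even when the file reports a small max. A self-contained illustration (GreaterThan is a hypothetical comparator mimicking that order, not DuckDB's):

#include <cassert>
#include <cmath>
#include <limits>

// Hypothetical comparator for a total order where NaN sorts above all other values.
static bool GreaterThan(double a, double b) {
	if (std::isnan(a)) {
		return !std::isnan(b);
	}
	if (std::isnan(b)) {
		return false;
	}
	return a > b;
}

int main() {
	double nan = std::numeric_limits<double>::quiet_NaN();
	// A row group holding {1.0, 50.0, NaN} is typically written with stats
	// min=1.0, max=50.0. Against the filter `x > 100`:
	assert(!GreaterThan(50.0, 100.0)); // a zonemap of [1, 50] alone would prune
	assert(GreaterThan(nan, 100.0));   // ...yet the NaN row satisfies the filter
	// Widening the max to NaN keeps the zonemap conservative.
	return 0;
}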

src/duckdb/src/common/multi_file/multi_file_column_mapper.cpp (+1 -1)

@@ -583,6 +583,7 @@ static bool EvaluateFilterAgainstConstant(TableFilter &filter, const Value &cons
 		//! No filter_data assigned (does this mean the DynamicFilter is broken??)
 		return true;
 	}
+	lock_guard<mutex> lock(dynamic_filter.filter_data->lock);
 	if (!dynamic_filter.filter_data->initialized) {
 		//! Not initialized
 		return true;
@@ -591,7 +592,6 @@ static bool EvaluateFilterAgainstConstant(TableFilter &filter, const Value &cons
 		//! No filter present
 		return true;
 	}
-	lock_guard<mutex> lock(dynamic_filter.filter_data->lock);
 	return EvaluateFilterAgainstConstant(*dynamic_filter.filter_data->filter, constant);
 }
 default:
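The fix is purely an ordering change: the lock is now taken before the `initialized` and `filter` fields are read, rather than just before the final dereference, so those reads no longer race with writers of the dynamic filter. The resulting flow, condensed from the two hunks above:

// After this change, the dynamic-filter branch reads all shared state under the lock:
lock_guard<mutex> lock(dynamic_filter.filter_data->lock); // acquire first
if (!dynamic_filter.filter_data->initialized) {
	return true; // not initialized yet: conservatively keep everything
}
if (!dynamic_filter.filter_data->filter) {
	return true; // no filter present
}
// only now is it safe to dereference the shared filter
return EvaluateFilterAgainstConstant(*dynamic_filter.filter_data->filter, constant);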

src/duckdb/src/execution/physical_plan_generator.cpp (+1 -1)

@@ -50,7 +50,7 @@ PhysicalOperator &PhysicalPlanGenerator::ResolveAndPlan(unique_ptr<LogicalOperat
 
 unique_ptr<PhysicalPlan> PhysicalPlanGenerator::PlanInternal(LogicalOperator &op) {
 	if (!physical_plan) {
-		physical_plan = make_uniq<PhysicalPlan>();
+		physical_plan = make_uniq<PhysicalPlan>(Allocator::Get(context));
 	}
 	op.estimated_cardinality = op.EstimateCardinality(context);
 	physical_plan->SetRoot(CreatePlan(op));

src/duckdb/src/function/table/version/pragma_version.cpp (+3 -3)

@@ -1,5 +1,5 @@
 #ifndef DUCKDB_PATCH_VERSION
-#define DUCKDB_PATCH_VERSION "0-dev2171"
+#define DUCKDB_PATCH_VERSION "0-dev2195"
 #endif
 #ifndef DUCKDB_MINOR_VERSION
 #define DUCKDB_MINOR_VERSION 3
@@ -8,10 +8,10 @@
 #define DUCKDB_MAJOR_VERSION 1
 #endif
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "v1.3.0-dev2171"
+#define DUCKDB_VERSION "v1.3.0-dev2195"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "422538e013"
+#define DUCKDB_SOURCE_ID "cf02bffeb0"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"

src/duckdb/src/include/duckdb/execution/physical_plan_generator.hpp (+20 -7)

@@ -22,14 +22,25 @@ class ClientContext;
 class ColumnDataCollection;
 
 class PhysicalPlan {
+public:
+	explicit PhysicalPlan(Allocator &allocator) : arena(allocator) {};
+
+	~PhysicalPlan() {
+		// Call the destructor of each physical operator.
+		for (auto &op : ops) {
+			auto &op_ref = op.get();
+			op_ref.~PhysicalOperator();
+		}
+	}
+
 public:
 	template <class T, class... ARGS>
 	PhysicalOperator &Make(ARGS &&... args) {
-		auto op = make_uniq_base<PhysicalOperator, T>(std::forward<ARGS>(args)...);
-		D_ASSERT(op);
-		auto &op_ref = *op;
-		ops.push_back(std::move(op));
-		return op_ref;
+		static_assert(std::is_base_of<PhysicalOperator, T>::value, "T must be a physical operator");
+		auto mem = arena.AllocateAligned(sizeof(T));
+		auto ptr = new (mem) T(std::forward<ARGS>(args)...);
+		ops.push_back(*ptr);
+		return *ptr;
 	}
 
 	PhysicalOperator &Root() {
@@ -41,8 +52,10 @@ class PhysicalPlan {
 	}
 
 private:
-	//! Contains the memory of the physical plan.
-	vector<unique_ptr<PhysicalOperator>> ops;
+	//! The arena allocator storing the physical operator memory.
+	ArenaAllocator arena;
+	//! References to the physical operators.
+	vector<reference<PhysicalOperator>> ops;
 	//! The root of the physical plan.
 	optional_ptr<PhysicalOperator> root;
 };
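Make now placement-constructs each operator in arena memory and keeps a reference; since an arena frees its memory in bulk without running destructors, ~PhysicalPlan calls ~PhysicalOperator() explicitly. A self-contained sketch of the pattern with toy Arena/Op types (not DuckDB's ArenaAllocator API):

#include <cstddef>
#include <new>
#include <type_traits>
#include <utility>
#include <vector>

struct Op {
	virtual ~Op() = default;
};

// Toy arena: hands out aligned memory and releases everything at once.
class Arena {
public:
	~Arena() {
		for (auto &block : blocks) {
			::operator delete(block.first, std::align_val_t {block.second});
		}
	}
	void *Allocate(std::size_t size, std::size_t align) {
		void *mem = ::operator new(size, std::align_val_t {align});
		blocks.emplace_back(mem, align);
		return mem;
	}

private:
	std::vector<std::pair<void *, std::size_t>> blocks;
};

class Plan {
public:
	template <class T, class... ARGS>
	T &Make(ARGS &&... args) {
		static_assert(std::is_base_of<Op, T>::value, "T must be an Op");
		auto ptr = new (arena.Allocate(sizeof(T), alignof(T))) T(std::forward<ARGS>(args)...);
		ops.push_back(ptr);
		return *ptr;
	}
	// Run the destructors by hand; the arena reclaims the raw memory afterwards
	// (members are destroyed in reverse declaration order, so `arena` outlives `ops`).
	~Plan() {
		for (auto *op : ops) {
			op->~Op();
		}
	}

private:
	Arena arena;
	std::vector<Op *> ops;
};

The payoff is one bulk deallocation instead of a heap allocation and unique_ptr per operator; the cost is that destruction becomes the plan's responsibility, which is exactly what the new ~PhysicalPlan takes on.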

src/duckdb/src/include/duckdb/storage/checkpoint/string_checkpoint_state.hpp (+5 -11)

@@ -43,6 +43,8 @@ struct UncompressedStringSegmentState : public CompressedSegmentState {
 	unordered_map<block_id_t, reference<StringBlock>> overflow_blocks;
 	//! Overflow string writer (if any), if not set overflow strings will be written to memory blocks
 	unique_ptr<OverflowStringWriter> overflow_writer;
+	//! The block manager with which to write
+	optional_ptr<BlockManager> block_manager;
 	//! The set of overflow blocks written to disk (if any)
 	vector<block_id_t> on_disk_blocks;
 
@@ -51,18 +53,10 @@
 
 	void RegisterBlock(BlockManager &manager, block_id_t block_id);
 
-	string GetSegmentInfo() const override {
-		if (on_disk_blocks.empty()) {
-			return "";
-		}
-		string result = StringUtil::Join(on_disk_blocks, on_disk_blocks.size(), ", ",
-		                                 [&](block_id_t block) { return to_string(block); });
-		return "Overflow String Block Ids: " + result;
-	}
+	string GetSegmentInfo() const override;
 
-	vector<block_id_t> GetAdditionalBlocks() const override {
-		return on_disk_blocks;
-	}
+	vector<block_id_t> GetAdditionalBlocks() const override;
+	void Cleanup(BlockManager &manager);
 
 private:
 	mutex block_lock;

src/duckdb/src/storage/checkpoint/write_overflow_strings_to_disk.cpp (+25 -2)

@@ -14,29 +14,52 @@ WriteOverflowStringsToDisk::~WriteOverflowStringsToDisk() {
 	D_ASSERT(Exception::UncaughtException() || offset == 0);
 }
 
-shared_ptr<BlockHandle> UncompressedStringSegmentState::GetHandle(BlockManager &manager, block_id_t block_id) {
+shared_ptr<BlockHandle> UncompressedStringSegmentState::GetHandle(BlockManager &manager_p, block_id_t block_id) {
 	lock_guard<mutex> lock(block_lock);
 	auto entry = handles.find(block_id);
 	if (entry != handles.end()) {
 		return entry->second;
 	}
+	auto &manager = block_manager ? *block_manager : manager_p;
 	auto result = manager.RegisterBlock(block_id);
 	handles.insert(make_pair(block_id, result));
 	return result;
 }
 
-void UncompressedStringSegmentState::RegisterBlock(BlockManager &manager, block_id_t block_id) {
+void UncompressedStringSegmentState::RegisterBlock(BlockManager &manager_p, block_id_t block_id) {
 	lock_guard<mutex> lock(block_lock);
 	auto entry = handles.find(block_id);
 	if (entry != handles.end()) {
 		throw InternalException("UncompressedStringSegmentState::RegisterBlock - block id %llu already exists",
 		                        block_id);
 	}
+	auto &manager = block_manager ? *block_manager : manager_p;
 	auto result = manager.RegisterBlock(block_id);
 	handles.insert(make_pair(block_id, std::move(result)));
 	on_disk_blocks.push_back(block_id);
 }
 
+string UncompressedStringSegmentState::GetSegmentInfo() const {
+	if (on_disk_blocks.empty()) {
+		return "";
+	}
+	string result = StringUtil::Join(on_disk_blocks, on_disk_blocks.size(), ", ",
+	                                 [&](block_id_t block) { return to_string(block); });
+	return "Overflow String Block Ids: " + result;
+}
+
+vector<block_id_t> UncompressedStringSegmentState::GetAdditionalBlocks() const {
+	return on_disk_blocks;
+}
+
+void UncompressedStringSegmentState::Cleanup(BlockManager &manager_p) {
+	auto &manager = block_manager ? *block_manager : manager_p;
+	for (auto &block_id : on_disk_blocks) {
+		manager.MarkBlockAsModified(block_id);
+	}
+	on_disk_blocks.clear();
+}
+
 void WriteOverflowStringsToDisk::WriteString(UncompressedStringSegmentState &state, string_t string,
                                              block_id_t &result_block, int32_t &result_offset) {
 	auto &block_manager = partial_block_manager.GetBlockManager();
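GetHandle, RegisterBlock, and the new Cleanup all share one resolution rule: prefer the BlockManager pinned on the segment state (set in fixed_size_uncompressed.cpp below, when the overflow writer is created at checkpoint time), falling back to the caller's manager otherwise. As a hypothetical helper (the commit inlines this expression at each call site), the rule is just:

// Hypothetical helper, not part of the commit; optional_ptr is DuckDB's
// nullable non-owning pointer wrapper.
static BlockManager &ResolveBlockManager(optional_ptr<BlockManager> pinned, BlockManager &fallback) {
	return pinned ? *pinned : fallback;
}

The effect is that overflow-string blocks written through the checkpoint's block manager are also registered, looked up, and marked as modified through that same manager, rather than whichever manager the caller happens to pass.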

src/duckdb/src/storage/compression/fixed_size_uncompressed.cpp (+3 -2)

@@ -72,8 +72,9 @@ void UncompressedCompressState::CreateEmptySegment(idx_t row_start) {
 	    ColumnSegment::CreateTransientSegment(db, function, type, row_start, info.GetBlockSize(), info.GetBlockSize());
 	if (type.InternalType() == PhysicalType::VARCHAR) {
 		auto &state = compressed_segment->GetSegmentState()->Cast<UncompressedStringSegmentState>();
-		state.overflow_writer =
-		    make_uniq<WriteOverflowStringsToDisk>(checkpoint_data.GetCheckpointState().GetPartialBlockManager());
+		auto &partial_block_manager = checkpoint_data.GetCheckpointState().GetPartialBlockManager();
+		state.block_manager = partial_block_manager.GetBlockManager();
+		state.overflow_writer = make_uniq<WriteOverflowStringsToDisk>(partial_block_manager);
 	}
 	current_segment = std::move(compressed_segment);
 	current_segment->InitializeAppend(append_state);

src/duckdb/src/storage/compression/string_uncompressed.cpp (+1 -3)

@@ -260,9 +260,7 @@ unique_ptr<ColumnSegmentState> UncompressedStringStorage::DeserializeState(Deser
 void UncompressedStringStorage::CleanupState(ColumnSegment &segment) {
 	auto &state = segment.GetSegmentState()->Cast<UncompressedStringSegmentState>();
 	auto &block_manager = segment.GetBlockManager();
-	for (auto &block_id : state.on_disk_blocks) {
-		block_manager.MarkBlockAsModified(block_id);
-	}
+	state.Cleanup(block_manager);
 }
 
 //===--------------------------------------------------------------------===//
