
Commit 971fd3f

Update vendored DuckDB sources to 3f01673
1 parent 3f01673 · commit 971fd3f

39 files changed: +533 -388 lines changed

src/duckdb/extension/parquet/parquet_reader.cpp

Lines changed: 2 additions & 1 deletion
@@ -558,7 +558,8 @@ ParquetColumnSchema ParquetReader::ParseSchemaRecursive(idx_t depth, idx_t max_d
     } else { // leaf node
         if (!s_ele.__isset.type) {
             throw InvalidInputException(
-                "Node has neither num_children nor type set - this violates the Parquet spec (corrupted file)");
+                "Node '%s' has neither num_children nor type set - this violates the Parquet spec (corrupted file)",
+                s_ele.name.c_str());
         }
         auto result = ParseColumnSchema(s_ele, max_define, max_repeat, this_idx, next_file_idx++);
         if (s_ele.repetition_type == FieldRepetitionType::REPEATED) {
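
The change here is purely diagnostic: the error now names the offending schema element, so a corrupted file can be traced to a specific column. A minimal standalone sketch of the same idea (hypothetical names, plain std::runtime_error rather than DuckDB's InvalidInputException):

#include <stdexcept>
#include <string>

// Hypothetical helper: include the schema node's name in the spec-violation message
// so the user can locate the offending column in the corrupted Parquet file.
[[noreturn]] void ThrowMissingTypeError(const std::string &node_name) {
    throw std::runtime_error("Node '" + node_name +
                             "' has neither num_children nor type set - "
                             "this violates the Parquet spec (corrupted file)");
}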

src/duckdb/src/common/radix_partitioning.cpp

Lines changed: 42 additions & 25 deletions
@@ -83,16 +83,34 @@ idx_t RadixPartitioning::Select(Vector &hashes, const SelectionVector *sel, cons

 struct ComputePartitionIndicesFunctor {
     template <idx_t radix_bits>
-    static void Operation(Vector &hashes, Vector &partition_indices, const SelectionVector &append_sel,
-                          const idx_t append_count) {
+    static void Operation(Vector &hashes, Vector &partition_indices, const idx_t original_count,
+                          const SelectionVector &append_sel, const idx_t append_count) {
         using CONSTANTS = RadixPartitioningConstants<radix_bits>;
-        if (append_sel.IsSet()) {
-            auto hashes_sliced = Vector(hashes, append_sel, append_count);
-            UnaryExecutor::Execute<hash_t, hash_t>(hashes_sliced, partition_indices, append_count,
-                                                   [&](hash_t hash) { return CONSTANTS::ApplyMask(hash); });
-        } else {
+        if (!append_sel.IsSet() || hashes.GetVectorType() == VectorType::CONSTANT_VECTOR) {
             UnaryExecutor::Execute<hash_t, hash_t>(hashes, partition_indices, append_count,
                                                    [&](hash_t hash) { return CONSTANTS::ApplyMask(hash); });
+        } else {
+            // We could just slice the "hashes" vector and use the UnaryExecutor
+            // But slicing a dictionary vector causes SelectionData to be allocated
+            // Instead, we just directly compute the partition indices using the selection vectors
+            UnifiedVectorFormat format;
+            hashes.ToUnifiedFormat(original_count, format);
+            const auto source_data = UnifiedVectorFormat::GetData<hash_t>(format);
+            const auto &source_sel = *format.sel;
+
+            const auto target = FlatVector::GetData<hash_t>(partition_indices);
+
+            if (source_sel.IsSet()) {
+                for (idx_t i = 0; i < append_count; i++) {
+                    const auto source_idx = source_sel.get_index(append_sel[i]);
+                    target[i] = CONSTANTS::ApplyMask(source_data[source_idx]);
+                }
+            } else {
+                for (idx_t i = 0; i < append_count; i++) {
+                    const auto source_idx = append_sel[i];
+                    target[i] = CONSTANTS::ApplyMask(source_data[source_idx]);
+                }
+            }
         }
     }
 };
@@ -143,24 +161,20 @@ void RadixPartitionedColumnData::ComputePartitionIndices(PartitionedColumnDataAp
     D_ASSERT(partitions.size() == RadixPartitioning::NumberOfPartitions(radix_bits));
     D_ASSERT(state.partition_buffers.size() == RadixPartitioning::NumberOfPartitions(radix_bits));
     RadixBitsSwitch<ComputePartitionIndicesFunctor, void>(radix_bits, input.data[hash_col_idx], state.partition_indices,
-                                                          *FlatVector::IncrementalSelectionVector(), input.size());
+                                                          input.size(), *FlatVector::IncrementalSelectionVector(),
+                                                          input.size());
 }

 //===--------------------------------------------------------------------===//
 // Tuple Data Partitioning
 //===--------------------------------------------------------------------===//
-RadixPartitionedTupleData::RadixPartitionedTupleData(BufferManager &buffer_manager, const TupleDataLayout &layout_p,
-                                                     const idx_t radix_bits_p, const idx_t hash_col_idx_p)
-    : PartitionedTupleData(PartitionedTupleDataType::RADIX, buffer_manager, layout_p.Copy()), radix_bits(radix_bits_p),
+RadixPartitionedTupleData::RadixPartitionedTupleData(BufferManager &buffer_manager,
+                                                     shared_ptr<TupleDataLayout> layout_ptr, const idx_t radix_bits_p,
+                                                     const idx_t hash_col_idx_p)
+    : PartitionedTupleData(PartitionedTupleDataType::RADIX, buffer_manager, layout_ptr), radix_bits(radix_bits_p),
       hash_col_idx(hash_col_idx_p) {
     D_ASSERT(radix_bits <= RadixPartitioning::MAX_RADIX_BITS);
     D_ASSERT(hash_col_idx < layout.GetTypes().size());
-    const auto num_partitions = RadixPartitioning::NumberOfPartitions(radix_bits);
-    allocators->allocators.reserve(num_partitions);
-    for (idx_t i = 0; i < num_partitions; i++) {
-        CreateAllocator();
-    }
-    D_ASSERT(allocators->allocators.size() == num_partitions);
     Initialize();
 }

@@ -186,8 +200,8 @@ void RadixPartitionedTupleData::InitializeAppendStateInternal(PartitionedTupleDa
     const auto num_partitions = RadixPartitioning::NumberOfPartitions(radix_bits);
     state.partition_pin_states.reserve(num_partitions);
     for (idx_t i = 0; i < num_partitions; i++) {
-        state.partition_pin_states.emplace_back(make_unsafe_uniq<TupleDataPinState>());
-        partitions[i]->InitializeAppend(*state.partition_pin_states[i], properties);
+        state.partition_pin_states.emplace_back();
+        partitions[i]->InitializeAppend(state.partition_pin_states[i], properties);
     }

     // Init single chunk state
@@ -207,15 +221,18 @@ void RadixPartitionedTupleData::ComputePartitionIndices(PartitionedTupleDataAppe
                                                         const SelectionVector &append_sel, const idx_t append_count) {
     D_ASSERT(partitions.size() == RadixPartitioning::NumberOfPartitions(radix_bits));
     RadixBitsSwitch<ComputePartitionIndicesFunctor, void>(radix_bits, input.data[hash_col_idx], state.partition_indices,
-                                                          append_sel, append_count);
+                                                          input.size(), append_sel, append_count);
 }

-void RadixPartitionedTupleData::ComputePartitionIndices(Vector &row_locations, idx_t count,
-                                                        Vector &partition_indices) const {
-    Vector intermediate(LogicalType::HASH);
+void RadixPartitionedTupleData::ComputePartitionIndices(Vector &row_locations, idx_t count, Vector &partition_indices,
+                                                        unique_ptr<Vector> &utility_vector) const {
+    if (!utility_vector) {
+        utility_vector = make_uniq<Vector>(LogicalType::HASH);
+    }
+    Vector &intermediate = *utility_vector;
     partitions[0]->Gather(row_locations, *FlatVector::IncrementalSelectionVector(), count, hash_col_idx, intermediate,
                           *FlatVector::IncrementalSelectionVector(), nullptr);
-    RadixBitsSwitch<ComputePartitionIndicesFunctor, void>(radix_bits, intermediate, partition_indices,
+    RadixBitsSwitch<ComputePartitionIndicesFunctor, void>(radix_bits, intermediate, partition_indices, count,
                                                           *FlatVector::IncrementalSelectionVector(), count);
 }

@@ -240,7 +257,7 @@ void RadixPartitionedTupleData::RepartitionFinalizeStates(PartitionedTupleData &
     auto &partitions = new_partitioned_data.GetPartitions();
     for (idx_t partition_index = from_idx; partition_index < to_idx; partition_index++) {
         auto &partition = *partitions[partition_index];
-        auto &partition_pin_state = *state.partition_pin_states[partition_index];
+        auto &partition_pin_state = state.partition_pin_states[partition_index];
         partition.FinalizePinState(partition_pin_state);
     }
 }
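
For context on the functor above (a minimal sketch, not the DuckDB implementation): radix partitioning maps each row to one of NumberOfPartitions(radix_bits) buckets by applying a mask to its hash, and the new branch does that directly through the selection vector instead of slicing a dictionary vector. Hypothetical ApplyMask and plain arrays stand in for RadixPartitioningConstants and Vector:

#include <cstddef>
#include <cstdint>

using hash_t = uint64_t;

// Hypothetical stand-in for RadixPartitioningConstants<radix_bits>::ApplyMask:
// keep radix_bits bits of the hash as the partition index.
template <int RADIX_BITS>
inline hash_t ApplyMask(hash_t hash) {
    return hash & ((hash_t(1) << RADIX_BITS) - 1);
}

// Compute partition indices through a selection vector without materializing a sliced copy,
// mirroring the non-sliced path added in this commit (plain arrays instead of DuckDB vectors).
template <int RADIX_BITS>
void ComputePartitionIndices(const hash_t *hashes, const uint32_t *append_sel, size_t append_count,
                             hash_t *partition_indices) {
    for (size_t i = 0; i < append_count; i++) {
        partition_indices[i] = ApplyMask<RADIX_BITS>(hashes[append_sel[i]]);
    }
}

With RADIX_BITS = 4, for example, every index lands in [0, 16).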

src/duckdb/src/common/row_operations/row_aggregate.cpp

Lines changed: 4 additions & 1 deletion
@@ -102,7 +102,10 @@ void RowOperations::CombineStates(RowOperationsState &state, TupleDataLayout &la
 void RowOperations::FinalizeStates(RowOperationsState &state, TupleDataLayout &layout, Vector &addresses,
                                    DataChunk &result, idx_t aggr_idx) {
     // Copy the addresses
-    Vector addresses_copy(LogicalType::POINTER);
+    if (!state.addresses) {
+        state.addresses = make_uniq<Vector>(LogicalType::POINTER);
+    }
+    auto &addresses_copy = *state.addresses;
     VectorOperations::Copy(addresses, addresses_copy, result.size(), 0, 0);

     // Move to the first aggregate state
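
The point of the change is to stop allocating a fresh POINTER vector on every FinalizeStates call and instead cache one in RowOperationsState. A minimal sketch of that lazy-reuse pattern, with std types and hypothetical names in place of DuckDB's Vector:

#include <cstddef>
#include <cstdint>
#include <memory>
#include <vector>

struct FinalizeState {
    // Scratch buffer created on first use and reused across calls.
    std::unique_ptr<std::vector<uintptr_t>> addresses;
};

void FinalizeSketch(FinalizeState &state, const uintptr_t *addresses_in, size_t count) {
    if (!state.addresses) {
        state.addresses = std::make_unique<std::vector<uintptr_t>>();
    }
    auto &addresses_copy = *state.addresses;
    // Copy so the caller's address list stays untouched while we advance through aggregate states.
    addresses_copy.assign(addresses_in, addresses_in + count);
    // ... offset each entry to the first aggregate state and finalize ...
}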

src/duckdb/src/common/sort/partition_state.cpp

Lines changed: 8 additions & 28 deletions
@@ -1,12 +1,9 @@
 #include "duckdb/common/sort/partition_state.hpp"

-#include "duckdb/common/types/column/column_data_consumer.hpp"
 #include "duckdb/common/row_operations/row_operations.hpp"
 #include "duckdb/main/config.hpp"
 #include "duckdb/parallel/executor_task.hpp"

-#include <numeric>
-
 namespace duckdb {

 PartitionGlobalHashGroup::PartitionGlobalHashGroup(ClientContext &context, const Orders &partitions,
@@ -99,16 +96,17 @@ PartitionGlobalSinkState::PartitionGlobalSinkState(ClientContext &context,
         ++max_bits;
     }

+    grouping_types_ptr = make_shared_ptr<TupleDataLayout>();
     if (!orders.empty()) {
         if (partitions.empty()) {
             // Sort early into a dedicated hash group if we only sort.
-            grouping_types.Initialize(payload_types);
+            grouping_types_ptr->Initialize(payload_types);
             auto new_group = make_uniq<PartitionGlobalHashGroup>(context, partitions, orders, payload_types, external);
             hash_groups.emplace_back(std::move(new_group));
         } else {
             auto types = payload_types;
             types.push_back(LogicalType::HASH);
-            grouping_types.Initialize(types);
+            grouping_types_ptr->Initialize(types);
             ResizeGroupingData(estimated_cardinality);
         }
     }
@@ -132,13 +130,14 @@ void PartitionGlobalSinkState::SyncPartitioning(const PartitionGlobalSinkState &
     const auto old_bits = grouping_data ? grouping_data->GetRadixBits() : 0;
     if (fixed_bits != old_bits) {
         const auto hash_col_idx = payload_types.size();
-        grouping_data = make_uniq<RadixPartitionedTupleData>(buffer_manager, grouping_types, fixed_bits, hash_col_idx);
+        grouping_data =
+            make_uniq<RadixPartitionedTupleData>(buffer_manager, grouping_types_ptr, fixed_bits, hash_col_idx);
     }
 }

 unique_ptr<RadixPartitionedTupleData> PartitionGlobalSinkState::CreatePartition(idx_t new_bits) const {
     const auto hash_col_idx = payload_types.size();
-    return make_uniq<RadixPartitionedTupleData>(buffer_manager, grouping_types, new_bits, hash_col_idx);
+    return make_uniq<RadixPartitionedTupleData>(buffer_manager, grouping_types_ptr, new_bits, hash_col_idx);
 }

 void PartitionGlobalSinkState::ResizeGroupingData(idx_t cardinality) {
@@ -476,7 +475,7 @@ void PartitionLocalMergeState::ExecuteTask() {
 bool PartitionGlobalMergeState::AssignTask(PartitionLocalMergeState &local_state) {
     lock_guard<mutex> guard(lock);

-    if (tasks_assigned >= total_tasks) {
+    if (tasks_assigned >= total_tasks && !TryPrepareNextStage()) {
         return false;
     }

@@ -495,15 +494,13 @@ void PartitionGlobalMergeState::CompleteTask() {
 }

 bool PartitionGlobalMergeState::TryPrepareNextStage() {
-    lock_guard<mutex> guard(lock);
-
     if (tasks_completed < total_tasks) {
         return false;
     }

     tasks_assigned = tasks_completed = 0;

-    switch (stage) {
+    switch (stage.load()) {
     case PartitionSortStage::INIT:
         // If the partitions are unordered, don't scan in parallel
         // because it produces non-deterministic orderings.
@@ -632,23 +629,6 @@ bool PartitionGlobalMergeStates::ExecuteTask(PartitionLocalMergeState &local_sta
             break;
         }

-        // Hash group global state couldn't assign a task to this thread
-        // Try to prepare the next stage
-        if (!global_state->TryPrepareNextStage()) {
-            // This current hash group is not yet done
-            // But we were not able to assign a task for it to this thread
-            // See if the next hash group is better
-            continue;
-        }
-
-        // We were able to prepare the next stage for this hash group!
-        // Try to assign a task once more
-        if (global_state->AssignTask(local_state)) {
-            // We assigned a task to this thread!
-            // Break out of this loop to re-enter the top-level loop and execute the task
-            break;
-        }
-
         // We were able to prepare the next merge round,
         // but we were not able to assign a task for it to this thread
        // The tasks were assigned to other threads while this thread waited for the lock
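
The restructuring above moves stage preparation inside AssignTask: TryPrepareNextStage now assumes the caller already holds the lock, so a thread that finds the current stage exhausted can advance it and take a task in a single critical section, and the separate retry path deleted here is no longer needed. A minimal sketch of the pattern with hypothetical names:

#include <cstddef>
#include <mutex>

struct MergeStateSketch {
    std::mutex lock;
    size_t total_tasks = 0;
    size_t tasks_assigned = 0;
    size_t tasks_completed = 0;

    // Caller must hold 'lock'. Advances to the next stage once the current one is fully completed.
    bool TryPrepareNextStage() {
        if (tasks_completed < total_tasks) {
            return false;
        }
        tasks_assigned = tasks_completed = 0;
        total_tasks = 4; // hypothetical task count for the next stage
        return true;
    }

    bool AssignTask() {
        std::lock_guard<std::mutex> guard(lock);
        // If the current stage has no tasks left, try to prepare the next one under the same lock.
        if (tasks_assigned >= total_tasks && !TryPrepareNextStage()) {
            return false;
        }
        ++tasks_assigned;
        return true;
    }
};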

src/duckdb/src/common/types/hash.cpp

Lines changed: 49 additions & 39 deletions
@@ -4,7 +4,6 @@
 #include "duckdb/common/types/string_type.hpp"
 #include "duckdb/common/types/interval.hpp"
 #include "duckdb/common/types/uhugeint.hpp"
-#include "duckdb/common/fast_mem.hpp"

 #include <functional>
 #include <cmath>
@@ -76,6 +75,43 @@ hash_t Hash(const char *str) {
     return Hash(str, strlen(str));
 }

+template <bool AT_LEAST_8_BYTES = false>
+hash_t HashBytes(const_data_ptr_t ptr, const idx_t len) noexcept {
+    // This seed slightly improves bit distribution, taken from here:
+    // https://github.com/martinus/robin-hood-hashing/blob/3.11.5/LICENSE
+    // MIT License Copyright (c) 2018-2021 Martin Ankerl
+    hash_t h = 0xe17a1465U ^ (len * 0xc6a4a7935bd1e995U);
+
+    // Hash/combine in blocks of 8 bytes
+    const auto remainder = len & 7U;
+    for (const auto end = ptr + len - remainder; ptr != end; ptr += 8U) {
+        h ^= Load<hash_t>(ptr);
+        h *= 0xd6e8feb86659fd93U;
+    }
+
+    if (remainder != 0) {
+        if (AT_LEAST_8_BYTES) {
+            D_ASSERT(len >= 8);
+            // Load remaining (<8) bytes (with a Load instead of a memcpy)
+            const auto inv_rem = 8U - remainder;
+            const auto hr = Load<hash_t>(ptr - inv_rem) >> (inv_rem * 8U);
+
+            h ^= hr;
+            h *= 0xd6e8feb86659fd93U;
+        } else {
+            // Load remaining (<8) bytes (with a memcpy)
+            hash_t hr = 0;
+            memcpy(&hr, ptr, remainder);
+
+            h ^= hr;
+            h *= 0xd6e8feb86659fd93U;
+        }
+    }
+
+    // Finalize
+    return Hash(h);
+}
+
 template <>
 hash_t Hash(string_t val) {
     // If the string is inlined, we can do a branchless hash
@@ -86,64 +122,38 @@ hash_t Hash(string_t val) {
         hash_t h = 0xe17a1465U ^ (val.GetSize() * 0xc6a4a7935bd1e995U);

         // Hash/combine the first 8-byte block
-        const bool not_an_empty_string = !val.Empty();
-        h ^= Load<hash_t>(const_data_ptr_cast(val.GetPrefix()));
-        h *= 0xd6e8feb86659fd93U * not_an_empty_string + (1 - not_an_empty_string);
+        if (!val.Empty()) {
+            h ^= Load<hash_t>(const_data_ptr_cast(val.GetPrefix()));
+            h *= 0xd6e8feb86659fd93U;
+        }

         // Load remaining 4 bytes
-        hash_t hr = 0;
-        memcpy(&hr, const_data_ptr_cast(val.GetPrefix()) + sizeof(hash_t), 4U);
+        if (val.GetSize() > sizeof(hash_t)) {
+            hash_t hr = 0;
+            memcpy(&hr, const_data_ptr_cast(val.GetPrefix()) + sizeof(hash_t), 4U);

-        // Process the remainder the same an 8-byte block
-        // This operation is a NOP if the string is <= 8 bytes
-        const bool not_a_nop = val.GetSize() > sizeof(hash_t);
-        h ^= hr;
-        h *= 0xd6e8feb86659fd93U * not_a_nop + (1 - not_a_nop);
+            h ^= hr;
+            h *= 0xd6e8feb86659fd93U;
+        }

         // Finalize
         h = Hash(h);

         // This is just an optimization. It should not change the result
         // This property is important for verification (e.g., DUCKDB_DEBUG_NO_INLINE)
-        // We achieved this with the NOP trick above (and in HashBytes)
         D_ASSERT(h == Hash(val.GetData(), val.GetSize()));

         return h;
     }
-    return Hash(val.GetData(), val.GetSize());
+    // Required for DUCKDB_DEBUG_NO_INLINE
+    return HashBytes<string_t::INLINE_LENGTH >= sizeof(hash_t)>(const_data_ptr_cast(val.GetData()), val.GetSize());
 }

 template <>
 hash_t Hash(char *val) {
     return Hash<const char *>(val);
 }

-hash_t HashBytes(const_data_ptr_t ptr, const idx_t len) noexcept {
-    // This seed slightly improves bit distribution, taken from here:
-    // https://github.com/martinus/robin-hood-hashing/blob/3.11.5/LICENSE
-    // MIT License Copyright (c) 2018-2021 Martin Ankerl
-    hash_t h = 0xe17a1465U ^ (len * 0xc6a4a7935bd1e995U);
-
-    // Hash/combine in blocks of 8 bytes
-    for (const auto end = ptr + len - (len & 7U); ptr != end; ptr += 8U) {
-        h ^= Load<hash_t>(ptr);
-        h *= 0xd6e8feb86659fd93U;
-    }
-
-    // Load remaining (<8) bytes
-    hash_t hr = 0;
-    memcpy(&hr, ptr, len & 7U);
-
-    // Process the remainder same as an 8-byte block
-    // This operation is a NOP if the number of remaining bytes is 0
-    const bool not_a_nop = len & 7U;
-    h ^= hr;
-    h *= 0xd6e8feb86659fd93U * not_a_nop + (1 - not_a_nop);
-
-    // Finalize
-    return Hash(h);
-}
-
 hash_t Hash(const char *val, size_t size) {
     return HashBytes(const_data_ptr_cast(val), size);
 }
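
For reference, a standalone sketch of the block scheme used by the templated HashBytes above (memcpy tail path only; the Murmur-style finalizer stands in for duckdb's Hash(uint64_t) and is an assumption, not the actual source): bytes are folded in 8-byte xor-multiply steps, the sub-8-byte tail is read with memcpy into a zero-padded word, and the result is mixed once at the end.

#include <cstddef>
#include <cstdint>
#include <cstring>

// Murmur-style 64-bit finalizer, standing in for duckdb's Hash(uint64_t).
inline uint64_t FinalizeHash(uint64_t h) {
    h ^= h >> 33;
    h *= 0xff51afd7ed558ccdULL;
    h ^= h >> 33;
    h *= 0xc4ceb9fe1a85ec53ULL;
    h ^= h >> 33;
    return h;
}

// Sketch of the 8-byte block scheme from HashBytes above (memcpy tail path only).
uint64_t HashBytesSketch(const uint8_t *ptr, size_t len) {
    uint64_t h = 0xe17a1465ULL ^ (len * 0xc6a4a7935bd1e995ULL);

    const size_t remainder = len & 7U;
    for (const uint8_t *end = ptr + len - remainder; ptr != end; ptr += 8) {
        uint64_t block;
        memcpy(&block, ptr, 8); // unaligned-safe 8-byte load
        h ^= block;
        h *= 0xd6e8feb86659fd93ULL;
    }
    if (remainder != 0) {
        uint64_t tail = 0;
        memcpy(&tail, ptr, remainder); // zero-padded tail block
        h ^= tail;
        h *= 0xd6e8feb86659fd93ULL;
    }
    return FinalizeHash(h);
}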
