
Commit a9bf1a6

Update vendored DuckDB sources to 25c7873
1 parent 25c7873 · commit a9bf1a6

46 files changed: +15914 -15288 lines (only a subset of the diff is shown below)

src/duckdb/extension/json/json_scan.cpp

+13 -2

@@ -390,8 +390,19 @@ void JSONScanLocalState::ParseJSON(char *const json_start, const idx_t json_size
 		doc = JSONCommon::ReadDocumentUnsafe(json_start, remaining, JSONCommon::READ_INSITU_FLAG, allocator.GetYYAlc(),
 		                                     &err);
 	}
-	if (!bind_data.ignore_errors && err.code != YYJSON_READ_SUCCESS) {
-		current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, err);
+	if (err.code != YYJSON_READ_SUCCESS) {
+		auto can_ignore_this_error = bind_data.ignore_errors;
+		string extra;
+		if (current_reader->GetFormat() != JSONFormat::NEWLINE_DELIMITED) {
+			can_ignore_this_error = false;
+			extra = bind_data.ignore_errors
+			            ? "Parse errors cannot be ignored for JSON formats other than 'newline_delimited'"
+			            : "";
+		}
+		if (!can_ignore_this_error) {
+			current_reader->ThrowParseError(current_buffer_handle->buffer_index, lines_or_objects_in_buffer, err,
+			                                extra);
+		}
 	}
 
 	// We parse with YYJSON_STOP_WHEN_DONE, so we need to check this by hand
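In effect, ignore_errors now only suppresses parse errors for newline-delimited JSON, the one format where the scanner can drop a broken record and resume at the next newline. A standalone sketch of the gating logic, using plain types rather than DuckDB's internals (HandleParseError is an illustrative name):

#include <stdexcept>
#include <string>

enum class JSONFormat { NEWLINE_DELIMITED, ARRAY, UNSTRUCTURED }; // illustrative subset

// Decide whether a parse error may be skipped; throw otherwise.
void HandleParseError(bool ignore_errors, JSONFormat format, const std::string &parse_error) {
    bool can_ignore_this_error = ignore_errors;
    std::string extra;
    if (format != JSONFormat::NEWLINE_DELIMITED) {
        // A broken record in array/unstructured JSON leaves no boundary to resume from
        can_ignore_this_error = false;
        if (ignore_errors) {
            extra = " Parse errors cannot be ignored for JSON formats other than 'newline_delimited'";
        }
    }
    if (!can_ignore_this_error) {
        throw std::runtime_error(parse_error + extra);
    }
    // else: drop this record and let the scanner continue with the next line
}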

src/duckdb/src/common/radix_partitioning.cpp

+13 -15

@@ -60,27 +60,25 @@ RETURN_TYPE RadixBitsSwitch(const idx_t radix_bits, ARGS &&... args) {
 	} // LCOV_EXCL_STOP
 }
 
-template <idx_t radix_bits>
-struct RadixLessThan {
-	static inline bool Operation(hash_t hash, hash_t cutoff) {
-		using CONSTANTS = RadixPartitioningConstants<radix_bits>;
-		return CONSTANTS::ApplyMask(hash) < cutoff;
-	}
-};
-
 struct SelectFunctor {
 	template <idx_t radix_bits>
-	static idx_t Operation(Vector &hashes, const SelectionVector *sel, const idx_t count, const idx_t cutoff,
-	                       SelectionVector *true_sel, SelectionVector *false_sel) {
-		Vector cutoff_vector(Value::HASH(cutoff));
-		return BinaryExecutor::Select<hash_t, hash_t, RadixLessThan<radix_bits>>(hashes, cutoff_vector, sel, count,
-		                                                                         true_sel, false_sel);
+	static idx_t Operation(Vector &hashes, const SelectionVector *sel, const idx_t count,
+	                       const ValidityMask &partition_mask, SelectionVector *true_sel, SelectionVector *false_sel) {
+		using CONSTANTS = RadixPartitioningConstants<radix_bits>;
+		return UnaryExecutor::Select<hash_t>(
+		    hashes, sel, count,
+		    [&](const hash_t hash) {
+			    const auto partition_idx = CONSTANTS::ApplyMask(hash);
+			    return partition_mask.RowIsValidUnsafe(partition_idx);
+		    },
+		    true_sel, false_sel);
 	}
 };
 
 idx_t RadixPartitioning::Select(Vector &hashes, const SelectionVector *sel, const idx_t count, const idx_t radix_bits,
-                                const idx_t cutoff, SelectionVector *true_sel, SelectionVector *false_sel) {
-	return RadixBitsSwitch<SelectFunctor, idx_t>(radix_bits, hashes, sel, count, cutoff, true_sel, false_sel);
+                                const ValidityMask &partition_mask, SelectionVector *true_sel,
+                                SelectionVector *false_sel) {
+	return RadixBitsSwitch<SelectFunctor, idx_t>(radix_bits, hashes, sel, count, partition_mask, true_sel, false_sel);
 }
 
 struct ComputePartitionIndicesFunctor {
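The selection predicate changes from an ordinal cutoff (partition index < partition_end, which assumes partitions are processed in index order) to a membership test against a validity mask, so any subset of partitions can be active at once. A minimal sketch with plain arrays; taking the low radix_bits of the hash is an illustrative assumption, DuckDB derives the index via RadixPartitioningConstants::ApplyMask:

#include <cstdint>
#include <vector>

using hash_t = uint64_t;

// Return indices of rows whose hash lands in an enabled partition.
std::vector<size_t> SelectByPartitionMask(const std::vector<hash_t> &hashes, uint64_t radix_bits,
                                          const std::vector<bool> &partition_mask) {
    const hash_t mask = (hash_t(1) << radix_bits) - 1; // assumed: low bits pick the partition
    std::vector<size_t> true_sel;
    for (size_t i = 0; i < hashes.size(); i++) {
        const auto partition_idx = hashes[i] & mask;
        if (partition_mask[partition_idx]) { // membership test replaces the old "< cutoff"
            true_sel.push_back(i);
        }
    }
    return true_sel;
}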

src/duckdb/src/common/row_operations/row_radix_scatter.cpp

+2
@@ -251,11 +251,13 @@ void RadixScatterStructVector(Vector &v, UnifiedVectorFormat &vdata, idx_t vcoun
 	for (idx_t i = 0; i < add_count; i++) {
 		auto idx = sel.get_index(i);
 		auto source_idx = vdata.sel->get_index(idx) + offset;
+
 		// write validity and according value
 		if (validity.RowIsValid(source_idx)) {
 			key_locations[i][0] = valid;
 		} else {
 			key_locations[i][0] = invalid;
+			memset(key_locations[i] + 1, '\0', width - 1);
 		}
 		key_locations[i]++;
 	}
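The added memset matters because these fixed-width keys are compared bytewise during sorting: previously a NULL struct entry only got its validity prefix written, leaving the remaining width - 1 payload bytes uninitialized, so NULL keys compared nondeterministically. A minimal sketch of the pattern (illustrative names):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Write the 1-byte validity prefix of a fixed-width sort key. For NULL
// rows, zero the payload too, so all NULL keys are byte-identical under
// memcmp-style comparison.
void WriteStructKeyPrefix(uint8_t *key, size_t width, bool row_is_valid, uint8_t valid, uint8_t invalid) {
    if (row_is_valid) {
        key[0] = valid; // payload bytes key[1..width-1] are filled by the value scatter
    } else {
        key[0] = invalid;
        memset(key + 1, '\0', width - 1);
    }
}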

src/duckdb/src/execution/join_hashtable.cpp

+76 -27

@@ -34,7 +34,7 @@ JoinHashTable::JoinHashTable(ClientContext &context, const vector<JoinCondition>
     : buffer_manager(BufferManager::GetBufferManager(context)), conditions(conditions_p),
       build_types(std::move(btypes)), output_columns(output_columns_p), entry_size(0), tuple_size(0),
       vfound(Value::BOOLEAN(false)), join_type(type_p), finalized(false), has_null(false),
-      radix_bits(INITIAL_RADIX_BITS), partition_start(0), partition_end(0) {
+      radix_bits(INITIAL_RADIX_BITS) {
 	for (idx_t i = 0; i < conditions.size(); ++i) {
 		auto &condition = conditions[i];
 		D_ASSERT(condition.left->return_type == condition.right->return_type);
@@ -108,6 +108,8 @@ JoinHashTable::JoinHashTable(ClientContext &context, const vector<JoinCondition>
 		auto &config = ClientConfig::GetConfig(context);
 		single_join_error_on_multiple_rows = config.scalar_subquery_error_on_multiple_rows;
 	}
+
+	InitializePartitionMasks();
 }
 
 JoinHashTable::~JoinHashTable() {
@@ -1430,7 +1432,10 @@ idx_t JoinHashTable::GetRemainingSize() const {
 
 	idx_t count = 0;
 	idx_t data_size = 0;
-	for (idx_t partition_idx = partition_end; partition_idx < num_partitions; partition_idx++) {
+	for (idx_t partition_idx = 0; partition_idx < num_partitions; partition_idx++) {
+		if (completed_partitions.RowIsValidUnsafe(partition_idx)) {
+			continue;
+		}
 		count += partitions[partition_idx]->Count();
 		data_size += partitions[partition_idx]->SizeInBytes();
 	}
@@ -1464,6 +1469,32 @@ void JoinHashTable::SetRepartitionRadixBits(const idx_t max_ht_size, const idx_t
 	radix_bits += added_bits;
 	sink_collection =
 	    make_uniq<RadixPartitionedTupleData>(buffer_manager, layout, radix_bits, layout.ColumnCount() - 1);
+
+	// Need to initialize again after changing the number of bits
+	InitializePartitionMasks();
+}
+
+void JoinHashTable::InitializePartitionMasks() {
+	const auto num_partitions = RadixPartitioning::NumberOfPartitions(radix_bits);
+
+	current_partitions.Initialize(num_partitions);
+	current_partitions.SetAllInvalid(num_partitions);
+
+	completed_partitions.Initialize(num_partitions);
+	completed_partitions.SetAllInvalid(num_partitions);
+}
+
+idx_t JoinHashTable::CurrentPartitionCount() const {
+	const auto num_partitions = RadixPartitioning::NumberOfPartitions(radix_bits);
+	D_ASSERT(current_partitions.Capacity() == num_partitions);
+	return current_partitions.CountValid(num_partitions);
+}
+
+idx_t JoinHashTable::FinishedPartitionCount() const {
+	const auto num_partitions = RadixPartitioning::NumberOfPartitions(radix_bits);
+	D_ASSERT(completed_partitions.Capacity() == num_partitions);
+	// We already marked the active partitions as done, so we have to subtract them here
+	return completed_partitions.CountValid(num_partitions) - CurrentPartitionCount();
 }
 
 void JoinHashTable::Repartition(JoinHashTable &global_ht) {
@@ -1477,6 +1508,7 @@ void JoinHashTable::Repartition(JoinHashTable &global_ht) {
 void JoinHashTable::Reset() {
 	data_collection->Reset();
 	hash_map.Reset();
+	current_partitions.SetAllInvalid(RadixPartitioning::NumberOfPartitions(radix_bits));
 	finalized = false;
 }
 
@@ -1486,33 +1518,46 @@ bool JoinHashTable::PrepareExternalFinalize(const idx_t max_ht_size) {
 	}
 
 	const auto num_partitions = RadixPartitioning::NumberOfPartitions(radix_bits);
-	if (partition_end == num_partitions) {
-		return false;
+	D_ASSERT(current_partitions.Capacity() == num_partitions);
+	D_ASSERT(completed_partitions.Capacity() == num_partitions);
+	D_ASSERT(current_partitions.CheckAllInvalid(num_partitions));
+
+	if (completed_partitions.CheckAllValid(num_partitions)) {
+		return false; // All partitions are done
 	}
 
-	// Start where we left off
+	// Create vector with unfinished partition indices
 	auto &partitions = sink_collection->GetPartitions();
-	partition_start = partition_end;
+	vector<idx_t> partition_indices;
+	partition_indices.reserve(num_partitions);
+	for (idx_t partition_idx = 0; partition_idx < num_partitions; partition_idx++) {
+		if (!completed_partitions.RowIsValidUnsafe(partition_idx)) {
+			partition_indices.push_back(partition_idx);
+		}
+	}
+	// Sort partitions by size, from small to large
+	std::sort(partition_indices.begin(), partition_indices.end(), [&](const idx_t &lhs, const idx_t &rhs) {
+		const auto lhs_size = partitions[lhs]->SizeInBytes() + PointerTableSize(partitions[lhs]->Count());
+		const auto rhs_size = partitions[rhs]->SizeInBytes() + PointerTableSize(partitions[rhs]->Count());
+		return lhs_size < rhs_size;
+	});
 
-	// Determine how many partitions we can do next (at least one)
+	// Determine which partitions should go next
 	idx_t count = 0;
 	idx_t data_size = 0;
-	idx_t partition_idx;
-	for (partition_idx = partition_start; partition_idx < num_partitions; partition_idx++) {
-		auto incl_count = count + partitions[partition_idx]->Count();
-		auto incl_data_size = data_size + partitions[partition_idx]->SizeInBytes();
-		auto incl_ht_size = incl_data_size + PointerTableSize(incl_count);
+	for (const auto &partition_idx : partition_indices) {
+		D_ASSERT(!completed_partitions.RowIsValidUnsafe(partition_idx));
+		const auto incl_count = count + partitions[partition_idx]->Count();
+		const auto incl_data_size = data_size + partitions[partition_idx]->SizeInBytes();
+		const auto incl_ht_size = incl_data_size + PointerTableSize(incl_count);
 		if (count > 0 && incl_ht_size > max_ht_size) {
-			break;
+			break; // Always add at least one partition
 		}
 		count = incl_count;
 		data_size = incl_data_size;
-	}
-	partition_end = partition_idx;
-
-	// Move the partitions to the main data collection
-	for (partition_idx = partition_start; partition_idx < partition_end; partition_idx++) {
-		data_collection->Combine(*partitions[partition_idx]);
+		current_partitions.SetValidUnsafe(partition_idx);     // Mark as currently active
+		data_collection->Combine(*partitions[partition_idx]); // Move partition to the main data collection
+		completed_partitions.SetValidUnsafe(partition_idx);   // Also already mark as done
 	}
 	D_ASSERT(Count() == count);
 
@@ -1531,7 +1576,7 @@ void JoinHashTable::ProbeAndSpill(ScanStructure &scan_structure, DataChunk &prob
 	SelectionVector false_sel(STANDARD_VECTOR_SIZE);
 	const auto true_count =
 	    RadixPartitioning::Select(hashes, FlatVector::IncrementalSelectionVector(), probe_keys.size(), radix_bits,
-	                              partition_end, &true_sel, &false_sel);
+	                              current_partitions, &true_sel, &false_sel);
 	const auto false_count = probe_keys.size() - true_count;
 
 	// can't probe these values right now, append to spill
@@ -1596,21 +1641,25 @@ void ProbeSpill::Finalize() {
 }
 
 void ProbeSpill::PrepareNextProbe() {
+	global_spill_collection.reset();
 	auto &partitions = global_partitions->GetPartitions();
-	if (partitions.empty() || ht.partition_start == partitions.size()) {
+	if (partitions.empty() || ht.current_partitions.CheckAllInvalid(partitions.size())) {
 		// Can't probe, just make an empty one
 		global_spill_collection =
 		    make_uniq<ColumnDataCollection>(BufferManager::GetBufferManager(context), probe_types);
 	} else {
-		// Move specific partitions to the global spill collection
-		global_spill_collection = std::move(partitions[ht.partition_start]);
-		for (idx_t i = ht.partition_start + 1; i < ht.partition_end; i++) {
-			auto &partition = partitions[i];
-			if (global_spill_collection->Count() == 0) {
+		// Move current partitions to the global spill collection
+		for (idx_t partition_idx = 0; partition_idx < partitions.size(); partition_idx++) {
+			if (!ht.current_partitions.RowIsValidUnsafe(partition_idx)) {
+				continue;
+			}
+			auto &partition = partitions[partition_idx];
+			if (!global_spill_collection) {
 				global_spill_collection = std::move(partition);
-			} else {
+			} else if (partition->Count() != 0) {
 				global_spill_collection->Combine(*partition);
 			}
+			partition.reset();
 		}
 	}
 	consumer = make_uniq<ColumnDataConsumer>(*global_spill_collection, column_ids);
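Replacing the partition_start/partition_end window with the current_partitions/completed_partitions masks lets PrepareExternalFinalize pick unfinished partitions smallest-first under the memory budget instead of strictly in index order, which tends to pack more partitions into each external-join round. A standalone sketch of that greedy policy; PartitionInfo, bytes_per_entry, and PickPartitions are illustrative stand-ins for DuckDB's internals (e.g. PointerTableSize):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

struct PartitionInfo {
    uint64_t count;
    uint64_t size_in_bytes;
};

// Pick the partitions to build next: smallest estimated hash-table size
// first, stopping once the budget would be exceeded (but always >= 1).
std::vector<size_t> PickPartitions(const std::vector<PartitionInfo> &partitions, const std::vector<bool> &completed,
                                   uint64_t max_ht_size, uint64_t bytes_per_entry) {
    std::vector<size_t> indices;
    for (size_t i = 0; i < partitions.size(); i++) {
        if (!completed[i]) {
            indices.push_back(i);
        }
    }
    auto ht_size = [&](size_t i) { return partitions[i].size_in_bytes + partitions[i].count * bytes_per_entry; };
    std::sort(indices.begin(), indices.end(), [&](size_t lhs, size_t rhs) { return ht_size(lhs) < ht_size(rhs); });

    std::vector<size_t> picked;
    uint64_t count = 0;
    uint64_t data_size = 0;
    for (const auto i : indices) {
        const auto incl_count = count + partitions[i].count;
        const auto incl_ht_size = data_size + partitions[i].size_in_bytes + incl_count * bytes_per_entry;
        if (count > 0 && incl_ht_size > max_ht_size) {
            break; // always take at least one partition
        }
        count = incl_count;
        data_size += partitions[i].size_in_bytes;
        picked.push_back(i);
    }
    return picked;
}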

src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp

+11 -7

@@ -1330,17 +1330,21 @@ void StringValueScanner::ProcessOverBufferValue() {
 		value = string_t(over_buffer_string.c_str() + result.quoted_position,
 		                 UnsafeNumericCast<uint32_t>(over_buffer_string.size() - 1 - result.quoted_position));
 		if (result.escaped) {
-			const auto str_ptr = over_buffer_string.c_str() + result.quoted_position;
-			value = RemoveEscape(str_ptr, over_buffer_string.size() - 2,
-			                     state_machine->dialect_options.state_machine_options.escape.GetValue(),
-			                     result.parse_chunk.data[result.chunk_col_id]);
+			if (!result.HandleTooManyColumnsError(over_buffer_string.c_str(), over_buffer_string.size())) {
+				const auto str_ptr = over_buffer_string.c_str() + result.quoted_position;
+				value = RemoveEscape(str_ptr, over_buffer_string.size() - 2,
+				                     state_machine->dialect_options.state_machine_options.escape.GetValue(),
+				                     result.parse_chunk.data[result.chunk_col_id]);
+			}
 		}
 	} else {
 		value = string_t(over_buffer_string.c_str(), UnsafeNumericCast<uint32_t>(over_buffer_string.size()));
 		if (result.escaped) {
-			value = RemoveEscape(over_buffer_string.c_str(), over_buffer_string.size(),
-			                     state_machine->dialect_options.state_machine_options.escape.GetValue(),
-			                     result.parse_chunk.data[result.chunk_col_id]);
+			if (!result.HandleTooManyColumnsError(over_buffer_string.c_str(), over_buffer_string.size())) {
+				value = RemoveEscape(over_buffer_string.c_str(), over_buffer_string.size(),
+				                     state_machine->dialect_options.state_machine_options.escape.GetValue(),
+				                     result.parse_chunk.data[result.chunk_col_id]);
+			}
 		}
 	}
 	if (states.EmptyLine() && state_machine->dialect_options.num_cols == 1) {
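Both branches now check the too-many-columns case before unescaping, so RemoveEscape never writes into a column vector that does not exist for the current row. A minimal sketch of the guard, with std::vector<std::string> standing in for DuckDB's parse chunk and illustrative names throughout:

#include <cstddef>
#include <string>
#include <vector>

// Returns true when the value belongs to a column beyond the schema and
// must be dropped (the error is assumed to be recorded elsewhere).
bool HandleTooManyColumns(size_t chunk_col_id, size_t num_cols) {
    return chunk_col_id >= num_cols;
}

void StoreEscapedValue(std::vector<std::string> &columns, size_t chunk_col_id, const std::string &raw) {
    if (HandleTooManyColumns(chunk_col_id, columns.size())) {
        return; // do not touch a nonexistent column
    }
    columns[chunk_col_id] = raw; // unescaping elided for brevity
}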

src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp

+14
@@ -174,6 +174,7 @@ CSVSniffer::DetectHeaderInternal(ClientContext &context, vector<HeaderValue> &be
                                  CSVReaderOptions &options, CSVErrorHandler &error_handler) {
 	vector<string> detected_names;
 	auto &dialect_options = state_machine.dialect_options;
+	dialect_options.num_cols = best_sql_types_candidates_per_column_idx.size();
 	if (best_header_row.empty()) {
 		dialect_options.header = false;
 		for (idx_t col = 0; col < dialect_options.num_cols; col++) {
@@ -192,6 +193,19 @@ CSVSniffer::DetectHeaderInternal(ClientContext &context, vector<HeaderValue> &be
 	// If null-padding is not allowed and there is a mismatch between our header candidate and the number of columns
 	// We can't detect the dialect/type options properly
 	if (!options.null_padding && best_sql_types_candidates_per_column_idx.size() != best_header_row.size()) {
+		if (options.ignore_errors.GetValue()) {
+			dialect_options.header = false;
+			for (idx_t col = 0; col < dialect_options.num_cols; col++) {
+				detected_names.push_back(GenerateColumnName(dialect_options.num_cols, col));
+			}
+			dialect_options.rows_until_header += 1;
+			if (!options.columns_set) {
+				for (idx_t i = 0; i < MinValue<idx_t>(detected_names.size(), options.name_list.size()); i++) {
+					detected_names[i] = options.name_list[i];
+				}
+			}
+			return detected_names;
+		}
 		auto error =
 		    CSVError::HeaderSniffingError(options, best_header_row, best_sql_types_candidates_per_column_idx.size(),
 		                                  state_machine.dialect_options.state_machine_options.delimiter.GetValue());
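With ignore_errors set, a mismatch between the header candidate and the detected column count now degrades gracefully: the header row is skipped and generic names are synthesized, with any user-supplied name_list applied positionally. A minimal sketch; the plain "columnN" scheme is an assumption standing in for GenerateColumnName:

#include <algorithm>
#include <string>
#include <vector>

// Synthesize names when the header row cannot be trusted; user-supplied
// names (name_list) override the generated ones positionally.
std::vector<std::string> FallbackColumnNames(size_t num_cols, const std::vector<std::string> &name_list) {
    std::vector<std::string> names;
    names.reserve(num_cols);
    for (size_t col = 0; col < num_cols; col++) {
        names.push_back("column" + std::to_string(col)); // assumed naming scheme
    }
    for (size_t i = 0; i < std::min(names.size(), name_list.size()); i++) {
        names[i] = name_list[i];
    }
    return names;
}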

src/duckdb/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp

+15 -5

@@ -89,7 +89,9 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
 		}
 	}
 	// 2) Field Separator State
-	transition_array[quote][static_cast<uint8_t>(CSVState::DELIMITER)] = CSVState::QUOTED;
+	if (quote != '\0') {
+		transition_array[quote][static_cast<uint8_t>(CSVState::DELIMITER)] = CSVState::QUOTED;
+	}
 	if (delimiter_first_byte != ' ') {
 		transition_array[' '][static_cast<uint8_t>(CSVState::DELIMITER)] = CSVState::EMPTY_SPACE;
 	}
@@ -164,7 +166,9 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
 		transition_array[static_cast<uint8_t>('\r')][static_cast<uint8_t>(CSVState::RECORD_SEPARATOR)] =
 		    CSVState::RECORD_SEPARATOR;
 	}
-	transition_array[quote][static_cast<uint8_t>(CSVState::RECORD_SEPARATOR)] = CSVState::QUOTED;
+	if (quote != '\0') {
+		transition_array[quote][static_cast<uint8_t>(CSVState::RECORD_SEPARATOR)] = CSVState::QUOTED;
+	}
 	if (delimiter_first_byte != ' ') {
 		transition_array[' '][static_cast<uint8_t>(CSVState::RECORD_SEPARATOR)] = CSVState::EMPTY_SPACE;
 	}
@@ -180,7 +184,9 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
 		    CSVState::RECORD_SEPARATOR;
 	transition_array[static_cast<uint8_t>('\r')][static_cast<uint8_t>(CSVState::CARRIAGE_RETURN)] =
 	    CSVState::CARRIAGE_RETURN;
-	transition_array[quote][static_cast<uint8_t>(CSVState::CARRIAGE_RETURN)] = CSVState::QUOTED;
+	if (quote != '\0') {
+		transition_array[quote][static_cast<uint8_t>(CSVState::CARRIAGE_RETURN)] = CSVState::QUOTED;
+	}
 	if (delimiter_first_byte != ' ') {
 		transition_array[' '][static_cast<uint8_t>(CSVState::CARRIAGE_RETURN)] = CSVState::EMPTY_SPACE;
 	}
@@ -240,7 +246,9 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
 		transition_array[static_cast<uint8_t>('\r')][static_cast<uint8_t>(CSVState::NOT_SET)] =
 		    CSVState::RECORD_SEPARATOR;
 	}
-	transition_array[quote][static_cast<uint8_t>(CSVState::NOT_SET)] = CSVState::QUOTED;
+	if (quote != '\0') {
+		transition_array[quote][static_cast<uint8_t>(CSVState::NOT_SET)] = CSVState::QUOTED;
+	}
 	if (delimiter_first_byte != ' ') {
 		transition_array[' '][static_cast<uint8_t>(CSVState::NOT_SET)] = CSVState::EMPTY_SPACE;
 	}
@@ -274,7 +282,9 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
 		transition_array[static_cast<uint8_t>('\r')][static_cast<uint8_t>(CSVState::EMPTY_SPACE)] =
 		    CSVState::RECORD_SEPARATOR;
 	}
-	transition_array[quote][static_cast<uint8_t>(CSVState::EMPTY_SPACE)] = CSVState::QUOTED;
+	if (quote != '\0') {
+		transition_array[quote][static_cast<uint8_t>(CSVState::EMPTY_SPACE)] = CSVState::QUOTED;
+	}
 	if (comment != '\0') {
 		transition_array[comment][static_cast<uint8_t>(CSVState::EMPTY_SPACE)] = CSVState::COMMENT;
 	}
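All five hunks apply the same guard: the transition table is indexed by byte value, and '\0' is a real index, so with quoting disabled (quote == '\0') the unconditional writes were redirecting NUL input bytes into the QUOTED state. A minimal sketch of one guarded registration (illustrative enum subset):

#include <cstdint>

enum class CSVState : uint8_t { STANDARD, DELIMITER, QUOTED, NUM_STATES }; // illustrative subset

// Register "seeing the quote char enters QUOTED" for one source state,
// but only when a quote character is actually configured.
void RegisterQuoteTransition(CSVState transition_array[256][static_cast<uint8_t>(CSVState::NUM_STATES)], char quote,
                             CSVState from_state) {
    if (quote != '\0') { // '\0' means quoting disabled; leave the NUL row untouched
        transition_array[static_cast<uint8_t>(quote)][static_cast<uint8_t>(from_state)] = CSVState::QUOTED;
    }
}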

src/duckdb/src/execution/operator/csv_scanner/util/csv_error.cpp

+5 -4

@@ -329,12 +329,13 @@ CSVError CSVError::IncorrectColumnAmountError(const CSVReaderOptions &options, i
 	}
 	// How many columns were expected and how many were found
 	error << "Expected Number of Columns: " << options.dialect_options.num_cols << " Found: " << actual_columns + 1;
+	idx_t byte_pos = byte_position.GetIndex() == 0 ? 0 : byte_position.GetIndex() - 1;
 	if (actual_columns >= options.dialect_options.num_cols) {
-		return CSVError(error.str(), TOO_MANY_COLUMNS, actual_columns, csv_row, error_info, row_byte_position,
-		                byte_position.GetIndex() - 1, options, how_to_fix_it.str(), current_path);
+		return CSVError(error.str(), TOO_MANY_COLUMNS, actual_columns, csv_row, error_info, row_byte_position, byte_pos,
+		                options, how_to_fix_it.str(), current_path);
 	} else {
-		return CSVError(error.str(), TOO_FEW_COLUMNS, actual_columns, csv_row, error_info, row_byte_position,
-		                byte_position.GetIndex() - 1, options, how_to_fix_it.str(), current_path);
+		return CSVError(error.str(), TOO_FEW_COLUMNS, actual_columns, csv_row, error_info, row_byte_position, byte_pos,
+		                options, how_to_fix_it.str(), current_path);
 	}
 }
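byte_position is unsigned, so the old GetIndex() - 1 wrapped around to the maximum idx_t when an error occurred at byte position 0, yielding a nonsensical error location. The clamp, as a one-liner:

#include <cstdint>

using idx_t = uint64_t;

// byte_position - 1 would wrap to UINT64_MAX at position 0; clamp instead.
idx_t ErrorBytePosition(idx_t byte_position) {
    return byte_position == 0 ? 0 : byte_position - 1;
}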
