Skip to content

Commit 887b5f5

Browse files
Update vendored DuckDB sources to 85803a6
1 parent 85803a6 commit 887b5f5

File tree

33 files changed

+16659
-16262
lines changed

33 files changed

+16659
-16262
lines changed

src/duckdb/extension/json/include/json_reader.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ struct JSONFileHandle {
7171

7272
private:
7373
idx_t ReadInternal(char *pointer, const idx_t requested_size);
74-
idx_t ReadFromCache(char *&pointer, idx_t &size, idx_t &position);
74+
idx_t ReadFromCache(char *&pointer, idx_t &size, atomic<idx_t> &position);
7575

7676
private:
7777
//! The JSON file handle
@@ -83,7 +83,7 @@ struct JSONFileHandle {
8383
const idx_t file_size;
8484

8585
//! Read properties
86-
idx_t read_position;
86+
atomic<idx_t> read_position;
8787
atomic<idx_t> requested_reads;
8888
atomic<idx_t> actual_reads;
8989
atomic<bool> last_read_requested;

src/duckdb/extension/json/json_reader.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ idx_t JSONFileHandle::ReadInternal(char *pointer, const idx_t requested_size) {
148148
return total_read_size;
149149
}
150150

151-
idx_t JSONFileHandle::ReadFromCache(char *&pointer, idx_t &size, idx_t &position) {
151+
idx_t JSONFileHandle::ReadFromCache(char *&pointer, idx_t &size, atomic<idx_t> &position) {
152152
idx_t read_size = 0;
153153
idx_t total_offset = 0;
154154

src/duckdb/extension/parquet/parquet_extension.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -542,7 +542,7 @@ shared_ptr<BaseUnionData> ParquetMultiFileInfo::GetUnionData(shared_ptr<BaseFile
542542
result->metadata = std::move(scan.metadata);
543543
}
544544

545-
return result;
545+
return std::move(result);
546546
}
547547

548548
void ParquetMultiFileInfo::FinalizeReader(ClientContext &context, BaseFileReader &reader, GlobalTableFunctionState &) {

src/duckdb/src/catalog/catalog_entry/duck_table_entry.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,8 @@ unique_ptr<CatalogEntry> DuckTableEntry::AlterEntry(ClientContext &context, Alte
225225
}
226226
case AlterTableType::SET_PARTITIONED_BY:
227227
throw NotImplementedException("SET PARTITIONED BY is not supported for DuckDB tables");
228+
case AlterTableType::SET_SORTED_BY:
229+
throw NotImplementedException("SET SORTED BY is not supported for DuckDB tables");
228230
default:
229231
throw InternalException("Unrecognized alter table type!");
230232
}

src/duckdb/src/common/enum_util.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -351,19 +351,20 @@ const StringUtil::EnumStringLiteral *GetAlterTableTypeValues() {
351351
{ static_cast<uint32_t>(AlterTableType::DROP_NOT_NULL), "DROP_NOT_NULL" },
352352
{ static_cast<uint32_t>(AlterTableType::SET_COLUMN_COMMENT), "SET_COLUMN_COMMENT" },
353353
{ static_cast<uint32_t>(AlterTableType::ADD_CONSTRAINT), "ADD_CONSTRAINT" },
354-
{ static_cast<uint32_t>(AlterTableType::SET_PARTITIONED_BY), "SET_PARTITIONED_BY" }
354+
{ static_cast<uint32_t>(AlterTableType::SET_PARTITIONED_BY), "SET_PARTITIONED_BY" },
355+
{ static_cast<uint32_t>(AlterTableType::SET_SORTED_BY), "SET_SORTED_BY" }
355356
};
356357
return values;
357358
}
358359

359360
template<>
360361
const char* EnumUtil::ToChars<AlterTableType>(AlterTableType value) {
361-
return StringUtil::EnumToString(GetAlterTableTypeValues(), 13, "AlterTableType", static_cast<uint32_t>(value));
362+
return StringUtil::EnumToString(GetAlterTableTypeValues(), 14, "AlterTableType", static_cast<uint32_t>(value));
362363
}
363364

364365
template<>
365366
AlterTableType EnumUtil::FromString<AlterTableType>(const char *value) {
366-
return static_cast<AlterTableType>(StringUtil::StringToEnum(GetAlterTableTypeValues(), 13, "AlterTableType", value));
367+
return static_cast<AlterTableType>(StringUtil::StringToEnum(GetAlterTableTypeValues(), 14, "AlterTableType", value));
367368
}
368369

369370
const StringUtil::EnumStringLiteral *GetAlterTypeValues() {

src/duckdb/src/execution/index/art/art.cpp

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ ART::ART(const string &name, const IndexConstraintType index_constraint_type, co
4545
const shared_ptr<array<unsafe_unique_ptr<FixedSizeAllocator>, ALLOCATOR_COUNT>> &allocators_ptr,
4646
const IndexStorageInfo &info)
4747
: BoundIndex(name, ART::TYPE_NAME, index_constraint_type, column_ids, table_io_manager, unbound_expressions, db),
48-
allocators(allocators_ptr), owns_data(false) {
48+
allocators(allocators_ptr), owns_data(false), verify_max_key_len(false) {
4949

5050
// FIXME: Use the new byte representation function to support nested types.
5151
for (idx_t i = 0; i < types.size(); i++) {
@@ -70,6 +70,12 @@ ART::ART(const string &name, const IndexConstraintType index_constraint_type, co
7070
}
7171
}
7272

73+
if (types.size() > 1) {
74+
verify_max_key_len = true;
75+
} else if (types[0] == PhysicalType::VARCHAR) {
76+
verify_max_key_len = true;
77+
}
78+
7379
// Initialize the allocators.
7480
SetPrefixCount(info);
7581
if (!allocators) {
@@ -380,11 +386,25 @@ void GenerateKeysInternal(ArenaAllocator &allocator, DataChunk &input, unsafe_ve
380386
template <>
381387
void ART::GenerateKeys<>(ArenaAllocator &allocator, DataChunk &input, unsafe_vector<ARTKey> &keys) {
382388
GenerateKeysInternal<false>(allocator, input, keys);
389+
if (!verify_max_key_len) {
390+
return;
391+
}
392+
auto max_len = MAX_KEY_LEN * idx_t(prefix_count);
393+
for (idx_t i = 0; i < input.size(); i++) {
394+
keys[i].VerifyKeyLength(max_len);
395+
}
383396
}
384397

385398
template <>
386399
void ART::GenerateKeys<true>(ArenaAllocator &allocator, DataChunk &input, unsafe_vector<ARTKey> &keys) {
387400
GenerateKeysInternal<true>(allocator, input, keys);
401+
if (!verify_max_key_len) {
402+
return;
403+
}
404+
auto max_len = MAX_KEY_LEN * idx_t(prefix_count);
405+
for (idx_t i = 0; i < input.size(); i++) {
406+
keys[i].VerifyKeyLength(max_len);
407+
}
388408
}
389409

390410
void ART::GenerateKeyVectors(ArenaAllocator &allocator, DataChunk &input, Vector &row_ids, unsafe_vector<ARTKey> &keys,
@@ -976,6 +996,8 @@ bool ART::Scan(IndexScanState &state, const idx_t max_count, unsafe_vector<row_t
976996
D_ASSERT(scan_state.values[0].type().InternalType() == types[0]);
977997
ArenaAllocator arena_allocator(Allocator::Get(db));
978998
auto key = ARTKey::CreateKey(arena_allocator, types[0], scan_state.values[0]);
999+
auto max_len = MAX_KEY_LEN * prefix_count;
1000+
key.VerifyKeyLength(max_len);
9791001

9801002
if (scan_state.values[1].IsNull()) {
9811003
// Single predicate.
@@ -1000,6 +1022,8 @@ bool ART::Scan(IndexScanState &state, const idx_t max_count, unsafe_vector<row_t
10001022
lock_guard<mutex> l(lock);
10011023
D_ASSERT(scan_state.values[1].type().InternalType() == types[0]);
10021024
auto upper_bound = ARTKey::CreateKey(arena_allocator, types[0], scan_state.values[1]);
1025+
upper_bound.VerifyKeyLength(max_len);
1026+
10031027
bool left_equal = scan_state.expressions[0] == ExpressionType ::COMPARE_GREATERTHANOREQUALTO;
10041028
bool right_equal = scan_state.expressions[1] == ExpressionType ::COMPARE_LESSTHANOREQUALTO;
10051029
return SearchCloseRange(key, upper_bound, left_equal, right_equal, max_count, row_ids);
@@ -1305,11 +1329,6 @@ void ART::SetPrefixCount(const IndexStorageInfo &info) {
13051329
return;
13061330
}
13071331

1308-
if (!IsUnique()) {
1309-
prefix_count = Prefix::ROW_ID_COUNT;
1310-
return;
1311-
}
1312-
13131332
idx_t compound_size = 0;
13141333
for (const auto &type : types) {
13151334
compound_size += GetTypeIdSize(type);

src/duckdb/src/execution/index/art/art_key.cpp

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,13 @@ ARTKey::ARTKey(ArenaAllocator &allocator, idx_t len) : len(len) {
1616
data = allocator.Allocate(len);
1717
}
1818

19+
void ARTKey::VerifyKeyLength(const idx_t max_len) const {
20+
if (len > max_len) {
21+
throw InvalidInputException("key size of %d bytes exceeds the maximum size of %d bytes for this ART", len,
22+
max_len);
23+
}
24+
}
25+
1926
template <>
2027
ARTKey ARTKey::CreateARTKey(ArenaAllocator &allocator, string_t value) {
2128
auto string_data = const_data_ptr_cast(value.GetData());
@@ -29,22 +36,22 @@ ARTKey ARTKey::CreateARTKey(ArenaAllocator &allocator, string_t value) {
2936
}
3037
}
3138

32-
idx_t len = string_len + escape_count + 1;
33-
auto data = allocator.Allocate(len);
39+
idx_t key_len = string_len + escape_count + 1;
40+
auto key_data = allocator.Allocate(key_len);
3441

3542
// Copy over the data and add escapes.
3643
idx_t pos = 0;
3744
for (idx_t i = 0; i < string_len; i++) {
3845
if (string_data[i] <= 1) {
3946
// Add escape.
40-
data[pos++] = '\01';
47+
key_data[pos++] = '\01';
4148
}
42-
data[pos++] = string_data[i];
49+
key_data[pos++] = string_data[i];
4350
}
4451

4552
// End with a null-terminator.
46-
data[pos] = '\0';
47-
return ARTKey(data, len);
53+
key_data[pos] = '\0';
54+
return ARTKey(key_data, key_len);
4855
}
4956

5057
template <>

src/duckdb/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp

Lines changed: 44 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1344,8 +1344,39 @@ void StringValueScanner::ProcessOverBufferValue() {
13441344
}
13451345
if (over_buffer_string.empty() &&
13461346
state_machine->dialect_options.state_machine_options.new_line == NewLineIdentifier::CARRY_ON) {
1347-
if (buffer_handle_ptr[iterator.pos.buffer_pos] == '\n') {
1348-
iterator.pos.buffer_pos++;
1347+
if (!iterator.IsBoundarySet()) {
1348+
if (buffer_handle_ptr[iterator.pos.buffer_pos] == '\n') {
1349+
iterator.pos.buffer_pos++;
1350+
}
1351+
} else {
1352+
while (iterator.pos.buffer_pos < cur_buffer_handle->actual_size &&
1353+
(buffer_handle_ptr[iterator.pos.buffer_pos] == '\n' ||
1354+
buffer_handle_ptr[iterator.pos.buffer_pos] == '\r')) {
1355+
if (buffer_handle_ptr[iterator.pos.buffer_pos] == '\r') {
1356+
if (result.last_position.buffer_pos <= previous_buffer_handle->actual_size) {
1357+
// we add the value
1358+
result.AddValue(result, previous_buffer_handle->actual_size);
1359+
if (result.IsCommentSet(result)) {
1360+
result.UnsetComment(result, iterator.pos.buffer_pos);
1361+
} else {
1362+
result.AddRow(result, previous_buffer_handle->actual_size);
1363+
}
1364+
state_machine->Transition(states, buffer_handle_ptr[iterator.pos.buffer_pos++]);
1365+
while (iterator.pos.buffer_pos < cur_buffer_handle->actual_size &&
1366+
(buffer_handle_ptr[iterator.pos.buffer_pos] == '\r' ||
1367+
buffer_handle_ptr[iterator.pos.buffer_pos] == '\n')) {
1368+
state_machine->Transition(states, buffer_handle_ptr[iterator.pos.buffer_pos++]);
1369+
}
1370+
return;
1371+
}
1372+
} else {
1373+
if (iterator.pos.buffer_pos + 1 == cur_buffer_handle->actual_size) {
1374+
return;
1375+
}
1376+
}
1377+
state_machine->Transition(states, buffer_handle_ptr[iterator.pos.buffer_pos]);
1378+
iterator.pos.buffer_pos++;
1379+
}
13491380
}
13501381
}
13511382
// second buffer
@@ -1820,18 +1851,18 @@ void StringValueScanner::FinalizeChunkProcess() {
18201851
if (cur_buffer_handle->is_last_buffer && iterator.pos.buffer_pos >= cur_buffer_handle->actual_size) {
18211852
MoveToNextBuffer();
18221853
}
1823-
} else {
1824-
if (result.current_errors.HasErrorType(UNTERMINATED_QUOTES)) {
1825-
found_error = true;
1826-
type = UNTERMINATED_QUOTES;
1827-
} else if (result.current_errors.HasErrorType(INVALID_STATE)) {
1828-
found_error = true;
1829-
type = INVALID_STATE;
1830-
}
1831-
if (result.current_errors.HandleErrors(result)) {
1832-
result.number_of_rows++;
1833-
}
18341854
}
1855+
if (result.current_errors.HasErrorType(UNTERMINATED_QUOTES)) {
1856+
found_error = true;
1857+
type = UNTERMINATED_QUOTES;
1858+
} else if (result.current_errors.HasErrorType(INVALID_STATE)) {
1859+
found_error = true;
1860+
type = INVALID_STATE;
1861+
}
1862+
if (result.current_errors.HandleErrors(result)) {
1863+
result.number_of_rows++;
1864+
}
1865+
18351866
if (states.IsQuotedCurrent() && !found_error &&
18361867
state_machine->dialect_options.state_machine_options.strict_mode.GetValue()) {
18371868
// If we finish the execution of a buffer, and we end in a quoted state, it means we have unterminated

src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,15 @@ void CSVSniffer::RefineCandidates() {
525525
unique_ptr<ColumnCountScanner> cc_best_candidate = std::move(successful_candidates[i]);
526526
if (cc_best_candidate->state_machine->state_machine_options.quote != '\0' &&
527527
cc_best_candidate->ever_quoted) {
528+
// If we have multiple candidates with the same quote, but different escapes
529+
for (idx_t j = i + 1; j < successful_candidates.size(); j++) {
530+
// we give preference if it has the same character between escape and quote
531+
if (successful_candidates[j]->state_machine->state_machine_options.escape ==
532+
successful_candidates[j]->state_machine->state_machine_options.quote) {
533+
cc_best_candidate = std::move(successful_candidates[j]);
534+
break;
535+
}
536+
}
528537
candidates.clear();
529538
candidates.push_back(std::move(cc_best_candidate));
530539
return;

src/duckdb/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp

Lines changed: 62 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,15 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
4242
case CSVState::COMMENT:
4343
InitializeTransitionArray(transition_array, cur_state, CSVState::COMMENT);
4444
break;
45+
case CSVState::CARRIAGE_RETURN:
46+
if (state_machine_options.strict_mode.GetValue()) {
47+
// If we have an unquoted state, following rfc 4180, our base state is invalid
48+
InitializeTransitionArray(transition_array, cur_state, CSVState::INVALID);
49+
} else {
50+
// This will allow us to accept unescaped quotes
51+
InitializeTransitionArray(transition_array, cur_state, CSVState::STANDARD);
52+
}
53+
break;
4554
default:
4655
InitializeTransitionArray(transition_array, cur_state, CSVState::STANDARD);
4756
break;
@@ -80,10 +89,25 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
8089
} else {
8190
transition_array[static_cast<uint8_t>('\n')][state] = CSVState::INVALID;
8291
}
92+
} else if (new_line_id == NewLineIdentifier::SINGLE_N) {
93+
transition_array[static_cast<uint8_t>('\n')][state] = CSVState::RECORD_SEPARATOR;
94+
if (!state_machine_options.strict_mode.GetValue()) {
95+
transition_array[static_cast<uint8_t>('\r')][state] = CSVState::RECORD_SEPARATOR;
96+
} else {
97+
transition_array[static_cast<uint8_t>('\r')][state] = CSVState::INVALID;
98+
}
99+
} else if (new_line_id == NewLineIdentifier::SINGLE_R) {
100+
transition_array[static_cast<uint8_t>('\r')][state] = CSVState::RECORD_SEPARATOR;
101+
if (!state_machine_options.strict_mode.GetValue()) {
102+
transition_array[static_cast<uint8_t>('\n')][state] = CSVState::RECORD_SEPARATOR;
103+
} else {
104+
transition_array[static_cast<uint8_t>('\n')][state] = CSVState::INVALID;
105+
}
83106
} else {
84107
transition_array[static_cast<uint8_t>('\r')][state] = CSVState::RECORD_SEPARATOR;
85108
transition_array[static_cast<uint8_t>('\n')][state] = CSVState::RECORD_SEPARATOR;
86109
}
110+
87111
if (comment != '\0') {
88112
transition_array[comment][state] = CSVState::COMMENT;
89113
}
@@ -181,12 +205,34 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
181205
if (enable_unquoted_escape) {
182206
transition_array[escape][static_cast<uint8_t>(CSVState::RECORD_SEPARATOR)] = CSVState::UNQUOTED_ESCAPE;
183207
}
208+
if (state_machine_options.strict_mode.GetValue()) {
209+
// strict rules to error on the new line delimiter
210+
switch (new_line_id) {
211+
case NewLineIdentifier::CARRY_ON:
212+
case NewLineIdentifier::SINGLE_R:
213+
transition_array[static_cast<uint8_t>('\n')][static_cast<uint8_t>(CSVState::RECORD_SEPARATOR)] =
214+
CSVState::INVALID;
215+
break;
216+
case NewLineIdentifier::SINGLE_N:
217+
transition_array[static_cast<uint8_t>('\r')][static_cast<uint8_t>(CSVState::RECORD_SEPARATOR)] =
218+
CSVState::INVALID;
219+
break;
220+
default:
221+
break;
222+
}
223+
}
184224

185225
// 4) Carriage Return State
186226
transition_array[static_cast<uint8_t>('\n')][static_cast<uint8_t>(CSVState::CARRIAGE_RETURN)] =
187227
CSVState::RECORD_SEPARATOR;
188-
transition_array[static_cast<uint8_t>('\r')][static_cast<uint8_t>(CSVState::CARRIAGE_RETURN)] =
189-
CSVState::CARRIAGE_RETURN;
228+
if (state_machine_options.strict_mode.GetValue()) {
229+
transition_array[static_cast<uint8_t>('\r')][static_cast<uint8_t>(CSVState::CARRIAGE_RETURN)] =
230+
CSVState::INVALID;
231+
} else {
232+
transition_array[static_cast<uint8_t>('\r')][static_cast<uint8_t>(CSVState::CARRIAGE_RETURN)] =
233+
CSVState::CARRIAGE_RETURN;
234+
}
235+
190236
if (quote != '\0') {
191237
transition_array[quote][static_cast<uint8_t>(CSVState::CARRIAGE_RETURN)] = CSVState::QUOTED;
192238
}
@@ -227,7 +273,20 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op
227273
if (state_machine_options.quote == state_machine_options.escape) {
228274
transition_array[quote][static_cast<uint8_t>(CSVState::UNQUOTED)] = CSVState::QUOTED;
229275
}
230-
if (state_machine_options.strict_mode == false) {
276+
if (state_machine_options.strict_mode.GetValue()) {
277+
// strict rules to error on the new line delimiter
278+
switch (new_line_id) {
279+
case NewLineIdentifier::CARRY_ON:
280+
case NewLineIdentifier::SINGLE_R:
281+
transition_array[static_cast<uint8_t>('\n')][static_cast<uint8_t>(CSVState::UNQUOTED)] = CSVState::INVALID;
282+
break;
283+
case NewLineIdentifier::SINGLE_N:
284+
transition_array[static_cast<uint8_t>('\r')][static_cast<uint8_t>(CSVState::UNQUOTED)] = CSVState::INVALID;
285+
break;
286+
default:
287+
break;
288+
}
289+
} else {
231290
if (escape == '\0') {
232291
// If escape is defined, it limits a bit how relaxed quotes can be in a reliable way.
233292
transition_array[quote][static_cast<uint8_t>(CSVState::UNQUOTED)] = CSVState::MAYBE_QUOTED;

0 commit comments

Comments
 (0)