Skip to content

Commit fcd50cd

Browse files
Update vendored DuckDB sources to 887b5f5
1 parent 887b5f5 commit fcd50cd

34 files changed

+1210
-503
lines changed

src/duckdb/extension/core_functions/function_list.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,10 @@ static const StaticFunctionDefinition core_functions[] = {
385385
DUCKDB_SCALAR_FUNCTION(UrlDecodeFun),
386386
DUCKDB_SCALAR_FUNCTION(UrlEncodeFun),
387387
DUCKDB_SCALAR_FUNCTION(UUIDFun),
388+
DUCKDB_SCALAR_FUNCTION_SET(ExtractUuidTimestampFun),
389+
DUCKDB_SCALAR_FUNCTION_SET(ExtractUuidVerisonFun),
390+
DUCKDB_SCALAR_FUNCTION(UUIDv4Fun),
391+
DUCKDB_SCALAR_FUNCTION(UUIDv7Fun),
388392
DUCKDB_AGGREGATE_FUNCTION(VarPopFun),
389393
DUCKDB_AGGREGATE_FUNCTION(VarSampFun),
390394
DUCKDB_AGGREGATE_FUNCTION_ALIAS(VarianceFun),

src/duckdb/extension/core_functions/include/core_functions/scalar/random_functions.hpp

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ struct SetseedFun {
3636
struct UUIDFun {
3737
static constexpr const char *Name = "uuid";
3838
static constexpr const char *Parameters = "";
39-
static constexpr const char *Description = "Returns a random UUID similar to this: eeccb8c5-9943-b2bb-bb5e-222f4e14b687";
39+
static constexpr const char *Description = "Returns a random UUID v4 similar to this: eeccb8c5-9943-b2bb-bb5e-222f4e14b687";
4040
static constexpr const char *Example = "uuid()";
4141

4242
static ScalarFunction GetFunction();
@@ -48,4 +48,40 @@ struct GenRandomUuidFun {
4848
static constexpr const char *Name = "gen_random_uuid";
4949
};
5050

51+
// Descriptor for the SQL scalar function `uuidv4`: static name, parameter list, description and
// example strings, plus the declaration of the factory that builds the ScalarFunction.
// It is listed in core_functions[] through DUCKDB_SCALAR_FUNCTION(UUIDv4Fun).
// NOTE(review): the sample UUID in Description has 'b' as the first digit of its third group,
// which is where the version operators in random.cpp read the version - confirm whether the
// sample is meant to be a real v4 value.
struct UUIDv4Fun {
52+
static constexpr const char *Name = "uuidv4";
53+
static constexpr const char *Parameters = "";
54+
static constexpr const char *Description = "Returns a random UUIDv4 similar to this: eeccb8c5-9943-b2bb-bb5e-222f4e14b687";
55+
static constexpr const char *Example = "uuidv4()";
56+
57+
static ScalarFunction GetFunction();
58+
};
59+
60+
// Descriptor for the SQL scalar function `uuidv7`: static name, parameter list (none), description
// and example strings, plus the declaration of the factory that builds the ScalarFunction.
// It is listed in core_functions[] through DUCKDB_SCALAR_FUNCTION(UUIDv7Fun).
struct UUIDv7Fun {
61+
static constexpr const char *Name = "uuidv7";
62+
static constexpr const char *Parameters = "";
63+
static constexpr const char *Description = "Returns a random UUID v7 similar to this: 019482e4-1441-7aad-8127-eec99573b0a0";
64+
static constexpr const char *Example = "uuidv7()";
65+
66+
static ScalarFunction GetFunction();
67+
};
68+
69+
// Descriptor for the SQL scalar function `uuid_extract_version`, which takes one `uuid` parameter.
// GetFunctions() returns a ScalarFunctionSet, i.e. the function is registered as a set of overloads
// (see DUCKDB_SCALAR_FUNCTION_SET(ExtractUuidVerisonFun) in function_list.cpp).
// NOTE: "Verison" is a misspelling of "Version"; the identifier is referenced under this exact
// spelling by the function list and by random.cpp, so it cannot be renamed here in isolation.
struct ExtractUuidVerisonFun {
70+
static constexpr const char *Name = "uuid_extract_version";
71+
static constexpr const char *Parameters = "uuid";
72+
static constexpr const char *Description = "Extract a version for the given UUID.";
73+
static constexpr const char *Example = "uuid_extract_version('019482e4-1441-7aad-8127-eec99573b0a0')";
74+
75+
static ScalarFunctionSet GetFunctions();
76+
};
77+
78+
// Descriptor for the SQL scalar function `uuid_extract_timestamp`, which takes one `uuid` parameter
// and, per its description, applies to UUID v7 values.
// GetFunctions() returns a ScalarFunctionSet, i.e. the function is registered as a set of overloads
// (see DUCKDB_SCALAR_FUNCTION_SET(ExtractUuidTimestampFun) in function_list.cpp).
struct ExtractUuidTimestampFun {
79+
static constexpr const char *Name = "uuid_extract_timestamp";
80+
static constexpr const char *Parameters = "uuid";
81+
static constexpr const char *Description = "Extract the timestamp for the given UUID v7.";
82+
static constexpr const char *Example = "uuid_extract_timestamp('019482e4-1441-7aad-8127-eec99573b0a0')";
83+
84+
static ScalarFunctionSet GetFunctions();
85+
};
86+
5187
} // namespace duckdb

src/duckdb/extension/core_functions/scalar/random/random.cpp

Lines changed: 121 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,81 @@
55
#include "duckdb/planner/expression/bound_function_expression.hpp"
66
#include "duckdb/common/random_engine.hpp"
77
#include "duckdb/common/types/uuid.hpp"
8+
#include "duckdb/common/types/timestamp.hpp"
89

910
namespace duckdb {
1011

12+
struct ExtractVersionStrOperator {
13+
template <typename INPUT_TYPE, typename RESULT_TYPE>
14+
static RESULT_TYPE Operation(INPUT_TYPE input, Vector &result) {
15+
const idx_t len = input.GetSize();
16+
if (len != 36) {
17+
throw InvalidInputException("Given string '%s' is invalid UUID.", input.GetString());
18+
}
19+
// UUIDv4 and UUIDv7 stores version as the 15-th uint8_t.
20+
return input.GetPointer()[14] - '0';
21+
}
22+
};
23+
24+
struct ExtractVersionUuidOperator {
25+
template <typename INPUT_TYPE, typename RESULT_TYPE>
26+
static RESULT_TYPE Operation(INPUT_TYPE input, Vector &result) {
27+
char uuid[36]; // Intentionally no initialize.
28+
BaseUUID::ToString(input, uuid);
29+
// UUIDv4 and UUIDv7 stores version as the 15-th uint8_t.
30+
return uuid[14] - '0';
31+
}
32+
};
33+
34+
struct ExtractTimestampUuidOperator {
35+
template <typename INPUT_TYPE, typename RESULT_TYPE>
36+
static RESULT_TYPE Operation(INPUT_TYPE input, Vector &result) {
37+
// Validate whether the given UUID is v7.
38+
const uint8_t version = ((uint8_t)((input.upper) >> 8) & 0xf0) >> 4;
39+
if (version != 7) {
40+
throw InvalidInputException("Given UUID is with version %u, not version 7.", version);
41+
}
42+
43+
// UUID v7 begins with a 48 bit big-endian Unix Epoch timestamp with millisecond granularity.
44+
const int64_t upper = input.upper;
45+
int64_t unix_ts_milli = upper;
46+
unix_ts_milli = unix_ts_milli >> 16;
47+
48+
static constexpr uint64_t kMilliToMicro = 1000;
49+
const int64_t unix_ts_ms = unix_ts_milli * kMilliToMicro;
50+
return timestamp_t {unix_ts_ms};
51+
}
52+
};
53+
54+
struct ExtractTimestampStrOperator {
55+
template <typename INPUT_TYPE, typename RESULT_TYPE>
56+
static RESULT_TYPE Operation(INPUT_TYPE input, Vector &result) {
57+
// Validate whether the give input is a valid UUID.
58+
hugeint_t uuid_hugeint;
59+
if (!BaseUUID::FromCString(input.GetData(), input.GetSize(), uuid_hugeint)) {
60+
throw InvalidInputException("Given string '%s' is invalid UUID.", input.GetString());
61+
}
62+
63+
return ExtractTimestampUuidOperator::Operation<hugeint_t, RESULT_TYPE>(uuid_hugeint, result);
64+
}
65+
};
66+
67+
template <typename INPUT, typename OP>
68+
static void ExtractVersionFunction(DataChunk &args, ExpressionState &state, Vector &result) {
69+
D_ASSERT(args.ColumnCount() == 1);
70+
auto &input = args.data[0];
71+
idx_t count = args.size();
72+
UnaryExecutor::ExecuteString<INPUT, uint32_t, OP>(input, result, count);
73+
}
74+
75+
template <typename INPUT, typename OP>
76+
static void ExtractTimestampFunction(DataChunk &args, ExpressionState &state, Vector &result) {
77+
D_ASSERT(args.ColumnCount() == 1);
78+
auto &input = args.data[0];
79+
idx_t count = args.size();
80+
UnaryExecutor::ExecuteString<INPUT, timestamp_t, OP>(input, result, count);
81+
}
82+
1183
struct RandomLocalState : public FunctionLocalState {
1284
explicit RandomLocalState(uint64_t seed) : random_engine(0) {
1385
random_engine.SetSeed(seed);
@@ -41,24 +113,66 @@ ScalarFunction RandomFun::GetFunction() {
41113
return random;
42114
}
43115

44-
static void GenerateUUIDFunction(DataChunk &args, ExpressionState &state, Vector &result) {
116+
static void GenerateUUIDv4Function(DataChunk &args, ExpressionState &state, Vector &result) {
45117
D_ASSERT(args.ColumnCount() == 0);
46118
auto &lstate = ExecuteFunctionState::GetFunctionState(state)->Cast<RandomLocalState>();
47119

48120
result.SetVectorType(VectorType::FLAT_VECTOR);
49121
auto result_data = FlatVector::GetData<hugeint_t>(result);
50122

51123
for (idx_t i = 0; i < args.size(); i++) {
52-
result_data[i] = UUID::GenerateRandomUUID(lstate.random_engine);
124+
result_data[i] = UUIDv4::GenerateRandomUUID(lstate.random_engine);
125+
}
126+
}
127+
128+
// Scalar-function body for uuidv7(): takes no input columns and fills `result` with one freshly
// generated UUID v7 per row, drawing randomness from the function's RandomLocalState.
static void GenerateUUIDv7Function(DataChunk &args, ExpressionState &state, Vector &result) {
	D_ASSERT(args.ColumnCount() == 0);
	auto &local_state = ExecuteFunctionState::GetFunctionState(state)->Cast<RandomLocalState>();

	// Every row gets its own value, so the output is written as a flat vector.
	result.SetVectorType(VectorType::FLAT_VECTOR);
	auto uuids = FlatVector::GetData<hugeint_t>(result);

	const idx_t row_count = args.size();
	for (idx_t row = 0; row < row_count; row++) {
		uuids[row] = UUIDv7::GenerateRandomUUID(local_state.random_engine);
	}
}
55139

56140
ScalarFunction UUIDFun::GetFunction() {
57-
ScalarFunction uuid_function({}, LogicalType::UUID, GenerateUUIDFunction, nullptr, nullptr, nullptr,
58-
RandomInitLocalState);
59-
// generate a random uuid
60-
uuid_function.stability = FunctionStability::VOLATILE;
61-
return uuid_function;
141+
return UUIDv4Fun::GetFunction();
142+
}
143+
144+
// Builds the zero-argument scalar function that returns a random UUID v4.
ScalarFunction UUIDv4Fun::GetFunction() {
	ScalarFunction fun({}, LogicalType::UUID, GenerateUUIDv4Function, nullptr, nullptr, nullptr,
	                   RandomInitLocalState);
	// A new random UUID is generated on each evaluation, so the function is flagged as volatile.
	fun.stability = FunctionStability::VOLATILE;
	return fun;
}
151+
152+
// Builds the zero-argument scalar function that returns a random UUID v7.
ScalarFunction UUIDv7Fun::GetFunction() {
	ScalarFunction fun({}, LogicalType::UUID, GenerateUUIDv7Function, nullptr, nullptr, nullptr,
	                   RandomInitLocalState);
	// A new random UUID is generated on each evaluation, so the function is flagged as volatile.
	fun.stability = FunctionStability::VOLATILE;
	return fun;
}
159+
160+
// Builds the overload set for uuid_extract_version. Both overloads return UINTEGER.
ScalarFunctionSet ExtractUuidVerisonFun::GetFunctions() {
	ScalarFunctionSet overloads;

	// Overload taking the UUID in textual form.
	overloads.AddFunction(
	    ScalarFunction({LogicalType::VARCHAR}, LogicalType::UINTEGER,
	                   ExtractVersionFunction<string_t, ExtractVersionStrOperator>));

	// Overload taking a native UUID value.
	overloads.AddFunction(
	    ScalarFunction({LogicalType::UUID}, LogicalType::UINTEGER,
	                   ExtractVersionFunction<hugeint_t, ExtractVersionUuidOperator>));

	return overloads;
}
168+
169+
// Builds the overload set for uuid_extract_timestamp. Both overloads return TIMESTAMP_TZ.
ScalarFunctionSet ExtractUuidTimestampFun::GetFunctions() {
	ScalarFunctionSet overloads;

	// Overload taking the UUID in textual form.
	overloads.AddFunction(
	    ScalarFunction({LogicalType::VARCHAR}, LogicalType::TIMESTAMP_TZ,
	                   ExtractTimestampFunction<string_t, ExtractTimestampStrOperator>));

	// Overload taking a native UUID value.
	overloads.AddFunction(
	    ScalarFunction({LogicalType::UUID}, LogicalType::TIMESTAMP_TZ,
	                   ExtractTimestampFunction<hugeint_t, ExtractTimestampUuidOperator>));

	return overloads;
}
63177

64178
} // namespace duckdb

src/duckdb/extension/parquet/column_writer.cpp

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,22 @@ string ColumnWriterStatistics::GetMaxValue() {
7676
return string();
7777
}
7878

79+
// Base-class default: reports that this statistics type cannot hold NaN values.
// FloatingPointStatisticsState overrides this to return true.
bool ColumnWriterStatistics::CanHaveNaN() {
80+
return false;
81+
}
82+
83+
// Base-class default: reports that no NaN value has been observed.
// FloatingPointStatisticsState overrides this to return its has_nan flag.
bool ColumnWriterStatistics::HasNaN() {
84+
return false;
85+
}
86+
87+
// Base-class default: reports the recorded minimum as exact.
// Declared virtual in the class; no override is visible in this diff.
bool ColumnWriterStatistics::MinIsExact() {
88+
return true;
89+
}
90+
91+
// Base-class default: reports the recorded maximum as exact.
// Declared virtual in the class; no override is visible in this diff.
bool ColumnWriterStatistics::MaxIsExact() {
92+
return true;
93+
}
94+
7995
//===--------------------------------------------------------------------===//
8096
// ColumnWriter
8197
//===--------------------------------------------------------------------===//
@@ -556,11 +572,11 @@ ColumnWriter::CreateWriterRecursive(ClientContext &context, ParquetWriter &write
556572
return make_uniq<StandardColumnWriter<uint64_t, uint64_t>>(writer, schema, std::move(path_in_schema),
557573
can_have_nulls);
558574
case LogicalTypeId::FLOAT:
559-
return make_uniq<StandardColumnWriter<float_na_equal, float>>(writer, schema, std::move(path_in_schema),
560-
can_have_nulls);
575+
return make_uniq<StandardColumnWriter<float_na_equal, float, FloatingPointOperator>>(
576+
writer, schema, std::move(path_in_schema), can_have_nulls);
561577
case LogicalTypeId::DOUBLE:
562-
return make_uniq<StandardColumnWriter<double_na_equal, double>>(writer, schema, std::move(path_in_schema),
563-
can_have_nulls);
578+
return make_uniq<StandardColumnWriter<double_na_equal, double, FloatingPointOperator>>(
579+
writer, schema, std::move(path_in_schema), can_have_nulls);
564580
case LogicalTypeId::DECIMAL:
565581
switch (type.InternalType()) {
566582
case PhysicalType::INT16:

src/duckdb/extension/parquet/include/parquet_writer.hpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,30 +9,31 @@
99
#pragma once
1010

1111
#include "duckdb.hpp"
12-
#ifndef DUCKDB_AMALGAMATION
1312
#include "duckdb/common/common.hpp"
1413
#include "duckdb/common/encryption_state.hpp"
1514
#include "duckdb/common/exception.hpp"
1615
#include "duckdb/common/mutex.hpp"
1716
#include "duckdb/common/serializer/buffered_file_writer.hpp"
1817
#include "duckdb/common/types/column/column_data_collection.hpp"
1918
#include "duckdb/function/copy_function.hpp"
20-
#endif
2119

2220
#include "parquet_statistics.hpp"
2321
#include "column_writer.hpp"
2422
#include "parquet_types.h"
2523
#include "geo_parquet.hpp"
24+
#include "writer/parquet_write_stats.hpp"
2625
#include "thrift/protocol/TCompactProtocol.h"
2726

2827
namespace duckdb {
2928
class FileSystem;
3029
class FileOpener;
3130
class ParquetEncryptionConfig;
31+
class ParquetStatsAccumulator;
3232

3333
class Serializer;
3434
class Deserializer;
3535

36+
class ColumnWriterStatistics;
3637
struct CopyFunctionFileStatistics;
3738

3839
struct PreparedRowGroup {
@@ -82,6 +83,7 @@ class ParquetWriter {
8283
shared_ptr<ParquetEncryptionConfig> encryption_config, idx_t dictionary_size_limit,
8384
idx_t string_dictionary_page_size_limit, double bloom_filter_false_positive_ratio,
8485
int64_t compression_level, bool debug_use_openssl, ParquetVersion parquet_version);
86+
~ParquetWriter();
8587

8688
public:
8789
void PrepareRowGroup(ColumnDataCollection &buffer, PreparedRowGroup &result);
@@ -144,6 +146,8 @@ class ParquetWriter {
144146

145147
void BufferBloomFilter(idx_t col_idx, unique_ptr<ParquetBloomFilter> bloom_filter);
146148
void SetWrittenStatistics(CopyFunctionFileStatistics &written_stats);
149+
void FlushColumnStats(idx_t col_idx, duckdb_parquet::ColumnChunk &chunk,
150+
optional_ptr<ColumnWriterStatistics> writer_stats);
147151

148152
private:
149153
void GatherWrittenStatistics();
@@ -176,6 +180,7 @@ class ParquetWriter {
176180
vector<ParquetBloomFilterEntry> bloom_filters;
177181

178182
optional_ptr<CopyFunctionFileStatistics> written_stats;
183+
unique_ptr<ParquetStatsAccumulator> stats_accumulator;
179184
};
180185

181186
} // namespace duckdb

src/duckdb/extension/parquet/include/writer/parquet_write_operators.hpp

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ struct ParquetCastOperator : public BaseParquetOperator {
5757

5858
template <class SRC, class TGT>
5959
static void HandleStats(ColumnWriterStatistics *stats, TGT target_value) {
60-
auto &numeric_stats = (NumericStatisticsState<SRC, TGT, BaseParquetOperator> &)*stats;
60+
auto &numeric_stats = stats->Cast<NumericStatisticsState<SRC, TGT, BaseParquetOperator>>();
6161
if (LessThan::Operation(target_value, numeric_stats.min)) {
6262
numeric_stats.min = target_value;
6363
}
@@ -67,6 +67,33 @@ struct ParquetCastOperator : public BaseParquetOperator {
6767
}
6868
};
6969

70+
struct FloatingPointOperator : public BaseParquetOperator {
71+
template <class SRC, class TGT>
72+
static TGT Operation(SRC input) {
73+
return TGT(input);
74+
}
75+
76+
template <class SRC, class TGT>
77+
static unique_ptr<ColumnWriterStatistics> InitializeStats() {
78+
return make_uniq<FloatingPointStatisticsState<SRC, TGT, BaseParquetOperator>>();
79+
}
80+
81+
template <class SRC, class TGT>
82+
static void HandleStats(ColumnWriterStatistics *stats, TGT target_value) {
83+
auto &numeric_stats = stats->Cast<FloatingPointStatisticsState<SRC, TGT, BaseParquetOperator>>();
84+
if (Value::IsNan(target_value)) {
85+
numeric_stats.has_nan = true;
86+
} else {
87+
if (LessThan::Operation(target_value, numeric_stats.min)) {
88+
numeric_stats.min = target_value;
89+
}
90+
if (GreaterThan::Operation(target_value, numeric_stats.max)) {
91+
numeric_stats.max = target_value;
92+
}
93+
}
94+
}
95+
};
96+
7097
struct ParquetTimestampNSOperator : public ParquetCastOperator {
7198
template <class SRC, class TGT>
7299
static TGT Operation(SRC input) {

src/duckdb/extension/parquet/include/writer/parquet_write_stats.hpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@ class ColumnWriterStatistics {
2222
virtual string GetMax();
2323
virtual string GetMinValue();
2424
virtual string GetMaxValue();
25+
virtual bool CanHaveNaN();
26+
virtual bool HasNaN();
27+
virtual bool MinIsExact();
28+
virtual bool MaxIsExact();
2529

2630
public:
2731
template <class TARGET>
@@ -64,6 +68,20 @@ class NumericStatisticsState : public ColumnWriterStatistics {
6468
}
6569
};
6670

71+
// Statistics state for floating-point columns: extends NumericStatisticsState with a flag that
// records whether a NaN value was seen. The flag is set by FloatingPointOperator::HandleStats and
// exposed through the CanHaveNaN()/HasNaN() overrides of ColumnWriterStatistics.
template <class SRC, class T, class OP>
72+
class FloatingPointStatisticsState : public NumericStatisticsState<SRC, T, OP> {
73+
public:
74+
bool has_nan = false;
75+
76+
public:
77+
bool CanHaveNaN() override {
78+
return true;
79+
}
80+
bool HasNaN() override {
81+
return has_nan;
82+
}
83+
};
84+
6785
class StringStatisticsState : public ColumnWriterStatistics {
6886
static constexpr const idx_t MAX_STRING_STATISTICS_SIZE = 10000;
6987

0 commit comments

Comments
 (0)