Skip to content

Commit f5fb6cc

Browse files
authored
Merge branch 'main' into main
2 parents a861326 + db6ceda commit f5fb6cc

File tree

102 files changed

+3464
-624
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

102 files changed

+3464
-624
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

src/duckdb/extension/parquet/column_writer.cpp

Lines changed: 141 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@
33
#include "duckdb.hpp"
44
#include "geo_parquet.hpp"
55
#include "parquet_dbp_encoder.hpp"
6+
#include "parquet_dlba_encoder.hpp"
67
#include "parquet_rle_bp_decoder.hpp"
78
#include "parquet_rle_bp_encoder.hpp"
9+
#include "parquet_bss_encoder.hpp"
810
#include "parquet_statistics.hpp"
911
#include "parquet_writer.hpp"
1012
#ifndef DUCKDB_AMALGAMATION
@@ -1059,34 +1061,42 @@ class StandardColumnWriterState : public BasicColumnWriterState {
10591061
}
10601062
~StandardColumnWriterState() override = default;
10611063

1062-
// analysis state for integer values for DELTA_BINARY_PACKED
1064+
// analysis state: number of values (DELTA_BINARY_PACKED/BYTE_STREAM_SPLIT) and total string bytes (DELTA_LENGTH_BYTE_ARRAY)
10631065
idx_t total_value_count = 0;
1066+
idx_t total_string_size = 0;
10641067

10651068
unordered_map<T, uint32_t> dictionary;
10661069
duckdb_parquet::Encoding::type encoding;
10671070
};
10681071

1069-
template <class T>
1072+
template <class SRC, class TGT>
10701073
class StandardWriterPageState : public ColumnWriterPageState {
10711074
public:
1072-
explicit StandardWriterPageState(const idx_t total_value_count, Encoding::type encoding_p,
1073-
const unordered_map<T, uint32_t> &dictionary_p)
1074-
: encoding(encoding_p), dbp_initialized(false), dbp_encoder(total_value_count), dictionary(dictionary_p),
1075-
dict_written_value(false), dict_bit_width(RleBpDecoder::ComputeBitWidth(dictionary.size())),
1076-
dict_encoder(dict_bit_width) {
1075+
explicit StandardWriterPageState(const idx_t total_value_count, const idx_t total_string_size,
1076+
Encoding::type encoding_p, const unordered_map<SRC, uint32_t> &dictionary_p)
1077+
: encoding(encoding_p), dbp_initialized(false), dbp_encoder(total_value_count), dlba_initialized(false),
1078+
dlba_encoder(total_value_count, total_string_size), bss_encoder(total_value_count, sizeof(TGT)),
1079+
dictionary(dictionary_p), dict_written_value(false),
1080+
dict_bit_width(RleBpDecoder::ComputeBitWidth(dictionary.size())), dict_encoder(dict_bit_width) {
10771081
}
10781082
duckdb_parquet::Encoding::type encoding;
10791083

10801084
bool dbp_initialized;
10811085
DbpEncoder dbp_encoder;
10821086

1083-
const unordered_map<T, uint32_t> &dictionary;
1087+
bool dlba_initialized;
1088+
DlbaEncoder dlba_encoder;
1089+
1090+
BssEncoder bss_encoder;
1091+
1092+
const unordered_map<SRC, uint32_t> &dictionary;
10841093
bool dict_written_value;
10851094
uint32_t dict_bit_width;
10861095
RleBpEncoder dict_encoder;
10871096
};
10881097

10891098
namespace dbp_encoder {
1099+
10901100
template <class T>
10911101
void BeginWrite(DbpEncoder &encoder, WriteStream &writer, const T &first_value) {
10921102
throw InternalException("Can't write type to DELTA_BINARY_PACKED column");
@@ -1139,6 +1149,60 @@ void WriteValue(DbpEncoder &encoder, WriteStream &writer, const uint32_t &value)
11391149

11401150
} // namespace dbp_encoder
11411151

1152+
namespace dlba_encoder {
1153+
1154+
template <class T>
1155+
void BeginWrite(DlbaEncoder &encoder, WriteStream &writer, const T &first_value) {
1156+
throw InternalException("Can't write type to DELTA_LENGTH_BYTE_ARRAY column");
1157+
}
1158+
1159+
template <>
1160+
void BeginWrite(DlbaEncoder &encoder, WriteStream &writer, const string_t &first_value) {
1161+
encoder.BeginWrite(writer, first_value);
1162+
}
1163+
1164+
template <class T>
1165+
void WriteValue(DlbaEncoder &encoder, WriteStream &writer, const T &value) {
1166+
throw InternalException("Can't write type to DELTA_LENGTH_BYTE_ARRAY column");
1167+
}
1168+
1169+
template <>
1170+
void WriteValue(DlbaEncoder &encoder, WriteStream &writer, const string_t &value) {
1171+
encoder.WriteValue(writer, value);
1172+
}
1173+
1174+
// helpers to get size from strings
1175+
template <class SRC>
1176+
static constexpr idx_t GetDlbaStringSize(const SRC &src_value) {
1177+
return 0;
1178+
}
1179+
1180+
template <>
1181+
idx_t GetDlbaStringSize(const string_t &src_value) {
1182+
return src_value.GetSize();
1183+
}
1184+
1185+
} // namespace dlba_encoder
1186+
1187+
namespace bss_encoder {
1188+
1189+
template <class T>
1190+
void WriteValue(BssEncoder &encoder, const T &value) {
1191+
throw InternalException("Can't write type to BYTE_STREAM_SPLIT column");
1192+
}
1193+
1194+
template <>
1195+
void WriteValue(BssEncoder &encoder, const float &value) {
1196+
encoder.WriteValue(value);
1197+
}
1198+
1199+
template <>
1200+
void WriteValue(BssEncoder &encoder, const double &value) {
1201+
encoder.WriteValue(value);
1202+
}
1203+
1204+
} // namespace bss_encoder
1205+
11421206
template <class SRC, class TGT, class OP = ParquetCastOperator>
11431207
class StandardColumnWriter : public BasicColumnWriter {
11441208
public:
@@ -1159,13 +1223,13 @@ class StandardColumnWriter : public BasicColumnWriter {
11591223
unique_ptr<ColumnWriterPageState> InitializePageState(BasicColumnWriterState &state_p) override {
11601224
auto &state = state_p.Cast<StandardColumnWriterState<SRC>>();
11611225

1162-
auto result =
1163-
make_uniq<StandardWriterPageState<SRC>>(state.total_value_count, state.encoding, state.dictionary);
1226+
auto result = make_uniq<StandardWriterPageState<SRC, TGT>>(state.total_value_count, state.total_string_size,
1227+
state.encoding, state.dictionary);
11641228
return std::move(result);
11651229
}
11661230

11671231
void FlushPageState(WriteStream &temp_writer, ColumnWriterPageState *state_p) override {
1168-
auto &page_state = state_p->Cast<StandardWriterPageState<SRC>>();
1232+
auto &page_state = state_p->Cast<StandardWriterPageState<SRC, TGT>>();
11691233
switch (page_state.encoding) {
11701234
case Encoding::DELTA_BINARY_PACKED:
11711235
if (!page_state.dbp_initialized) {
@@ -1182,7 +1246,15 @@ class StandardColumnWriter : public BasicColumnWriter {
11821246
return;
11831247
}
11841248
page_state.dict_encoder.FinishWrite(temp_writer);
1185-
1249+
break;
1250+
case Encoding::DELTA_LENGTH_BYTE_ARRAY:
1251+
if (!page_state.dlba_initialized) {
1252+
dlba_encoder::BeginWrite<string_t>(page_state.dlba_encoder, temp_writer, string_t(""));
1253+
}
1254+
page_state.dlba_encoder.FinishWrite(temp_writer);
1255+
break;
1256+
case Encoding::BYTE_STREAM_SPLIT:
1257+
page_state.bss_encoder.FinishWrite(temp_writer);
11861258
break;
11871259
case Encoding::PLAIN:
11881260
break;
@@ -1220,14 +1292,15 @@ class StandardColumnWriter : public BasicColumnWriter {
12201292
continue;
12211293
}
12221294
if (validity.RowIsValid(vector_index)) {
1295+
const auto &src_value = data_ptr[vector_index];
12231296
if (state.dictionary.size() <= writer.DictionarySizeLimit()) {
1224-
const auto &src_value = data_ptr[vector_index];
12251297
if (state.dictionary.find(src_value) == state.dictionary.end()) {
12261298
state.dictionary[src_value] = new_value_index;
12271299
new_value_index++;
12281300
}
12291301
}
12301302
state.total_value_count++;
1303+
state.total_string_size += dlba_encoder::GetDlbaStringSize(src_value);
12311304
}
12321305
vector_index++;
12331306
}
@@ -1238,9 +1311,22 @@ class StandardColumnWriter : public BasicColumnWriter {
12381311

12391312
auto &state = state_p.Cast<StandardColumnWriterState<SRC>>();
12401313
if (state.dictionary.size() == 0 || state.dictionary.size() > writer.DictionarySizeLimit()) {
1241-
// special handling for int column: dpb, otherwise plain
1242-
state.encoding = (type == Type::type::INT32 || type == Type::type::INT64) ? Encoding::DELTA_BINARY_PACKED
1243-
: Encoding::PLAIN;
1314+
// If we aren't doing dictionary encoding, the following encodings are virtually always better than PLAIN
1315+
switch (type) {
1316+
case Type::type::INT32:
1317+
case Type::type::INT64:
1318+
state.encoding = Encoding::DELTA_BINARY_PACKED;
1319+
break;
1320+
case Type::type::BYTE_ARRAY:
1321+
state.encoding = Encoding::DELTA_LENGTH_BYTE_ARRAY;
1322+
break;
1323+
case Type::type::FLOAT:
1324+
case Type::type::DOUBLE:
1325+
state.encoding = Encoding::BYTE_STREAM_SPLIT;
1326+
break;
1327+
default:
1328+
state.encoding = Encoding::PLAIN;
1329+
}
12441330
state.dictionary.clear();
12451331
}
12461332
}
@@ -1261,7 +1347,7 @@ class StandardColumnWriter : public BasicColumnWriter {
12611347

12621348
void WriteVector(WriteStream &temp_writer, ColumnWriterStatistics *stats, ColumnWriterPageState *page_state_p,
12631349
Vector &input_column, idx_t chunk_start, idx_t chunk_end) override {
1264-
auto &page_state = page_state_p->Cast<StandardWriterPageState<SRC>>();
1350+
auto &page_state = page_state_p->Cast<StandardWriterPageState<SRC, TGT>>();
12651351

12661352
const auto &mask = FlatVector::Validity(input_column);
12671353
const auto *data_ptr = FlatVector::GetData<SRC>(input_column);
@@ -1287,7 +1373,6 @@ class StandardColumnWriter : public BasicColumnWriter {
12871373
}
12881374
break;
12891375
}
1290-
12911376
case Encoding::DELTA_BINARY_PACKED: {
12921377
idx_t r = chunk_start;
12931378
if (!page_state.dbp_initialized) {
@@ -1315,6 +1400,44 @@ class StandardColumnWriter : public BasicColumnWriter {
13151400
}
13161401
break;
13171402
}
1403+
case Encoding::DELTA_LENGTH_BYTE_ARRAY: {
1404+
idx_t r = chunk_start;
1405+
if (!page_state.dlba_initialized) {
1406+
// find first non-null value
1407+
for (; r < chunk_end; r++) {
1408+
if (!mask.RowIsValid(r)) {
1409+
continue;
1410+
}
1411+
const TGT target_value = OP::template Operation<SRC, TGT>(data_ptr[r]);
1412+
OP::template HandleStats<SRC, TGT>(stats, target_value);
1413+
dlba_encoder::BeginWrite(page_state.dlba_encoder, temp_writer, target_value);
1414+
page_state.dlba_initialized = true;
1415+
r++; // skip over
1416+
break;
1417+
}
1418+
}
1419+
1420+
for (; r < chunk_end; r++) {
1421+
if (!mask.RowIsValid(r)) {
1422+
continue;
1423+
}
1424+
const TGT target_value = OP::template Operation<SRC, TGT>(data_ptr[r]);
1425+
OP::template HandleStats<SRC, TGT>(stats, target_value);
1426+
dlba_encoder::WriteValue(page_state.dlba_encoder, temp_writer, target_value);
1427+
}
1428+
break;
1429+
}
1430+
case Encoding::BYTE_STREAM_SPLIT: {
1431+
for (idx_t r = chunk_start; r < chunk_end; r++) {
1432+
if (!mask.RowIsValid(r)) {
1433+
continue;
1434+
}
1435+
const TGT target_value = OP::template Operation<SRC, TGT>(data_ptr[r]);
1436+
OP::template HandleStats<SRC, TGT>(stats, target_value);
1437+
bss_encoder::WriteValue(page_state.bss_encoder, target_value);
1438+
}
1439+
break;
1440+
}
13181441
case Encoding::PLAIN: {
13191442
D_ASSERT(page_state.encoding == Encoding::PLAIN);
13201443
TemplatedWritePlain<SRC, TGT, OP>(input_column, stats, chunk_start, chunk_end, mask, temp_writer);
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
//===----------------------------------------------------------------------===//
2+
// DuckDB
3+
//
4+
// parquet_bss_encoder.hpp
5+
//
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#pragma once
10+
11+
#include "decode_utils.hpp"
12+
13+
namespace duckdb {
14+
15+
class BssEncoder {
16+
public:
17+
explicit BssEncoder(const idx_t total_value_count_p, const idx_t bit_width_p)
18+
: total_value_count(total_value_count_p), bit_width(bit_width_p), count(0),
19+
buffer(Allocator::DefaultAllocator().Allocate(total_value_count * bit_width + 1)) {
20+
}
21+
22+
public:
23+
template <class T>
24+
void WriteValue(const T &value) {
25+
D_ASSERT(sizeof(T) == bit_width);
26+
for (idx_t i = 0; i < sizeof(T); i++) {
27+
buffer.get()[i * total_value_count + count] = reinterpret_cast<const_data_ptr_t>(&value)[i];
28+
}
29+
count++;
30+
}
31+
32+
void FinishWrite(WriteStream &writer) {
33+
D_ASSERT(count == total_value_count);
34+
writer.WriteData(buffer.get(), total_value_count * bit_width);
35+
}
36+
37+
private:
38+
const idx_t total_value_count;
39+
const idx_t bit_width;
40+
41+
idx_t count;
42+
AllocatedData buffer;
43+
};
44+
45+
} // namespace duckdb
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
//===----------------------------------------------------------------------===//
2+
// DuckDB
3+
//
4+
// parquet_dlba_encoder.hpp
5+
//
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#pragma once
10+
11+
#include "parquet_dbp_encoder.hpp"
12+
#include "duckdb/common/serializer/memory_stream.hpp"
13+
14+
namespace duckdb {
15+
16+
class DlbaEncoder {
17+
public:
18+
DlbaEncoder(const idx_t total_value_count_p, const idx_t total_string_size_p)
19+
: dbp_encoder(total_value_count_p), total_string_size(total_string_size_p),
20+
buffer(Allocator::DefaultAllocator().Allocate(total_string_size + 1)),
21+
stream(make_unsafe_uniq<MemoryStream>(buffer.get(), buffer.GetSize())) {
22+
}
23+
24+
public:
25+
void BeginWrite(WriteStream &writer, const string_t &first_value) {
26+
dbp_encoder.BeginWrite(writer, UnsafeNumericCast<int64_t>(first_value.GetSize()));
27+
stream->WriteData(const_data_ptr_cast(first_value.GetData()), first_value.GetSize());
28+
}
29+
30+
void WriteValue(WriteStream &writer, const string_t &value) {
31+
dbp_encoder.WriteValue(writer, UnsafeNumericCast<int64_t>(value.GetSize()));
32+
stream->WriteData(const_data_ptr_cast(value.GetData()), value.GetSize());
33+
}
34+
35+
void FinishWrite(WriteStream &writer) {
36+
D_ASSERT(stream->GetPosition() == total_string_size);
37+
dbp_encoder.FinishWrite(writer);
38+
writer.WriteData(buffer.get(), total_string_size);
39+
}
40+
41+
private:
42+
DbpEncoder dbp_encoder;
43+
const idx_t total_string_size;
44+
AllocatedData buffer;
45+
unsafe_unique_ptr<MemoryStream> stream;
46+
};
47+
48+
} // namespace duckdb

src/duckdb/src/catalog/catalog.cpp

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,10 @@
2626
#include "duckdb/parser/parsed_data/create_type_info.hpp"
2727
#include "duckdb/parser/parsed_data/create_view_info.hpp"
2828
#include "duckdb/parser/parsed_data/drop_info.hpp"
29+
#include "duckdb/parser/statement/create_statement.hpp"
2930
#include "duckdb/planner/parsed_data/bound_create_table_info.hpp"
3031
#include "duckdb/planner/binder.hpp"
32+
#include "duckdb/planner/expression_binder/index_binder.hpp"
3133
#include "duckdb/catalog/default/default_types.hpp"
3234
#include "duckdb/main/extension_entries.hpp"
3335
#include "duckdb/main/extension/generated_extension_loader.hpp"
@@ -304,6 +306,14 @@ optional_ptr<CatalogEntry> Catalog::CreateIndex(ClientContext &context, CreateIn
304306
return CreateIndex(GetCatalogTransaction(context), info);
305307
}
306308

309+
// Binds a CREATE INDEX statement on `table` by delegating to an IndexBinder.
// `plan` is expected to be a LOGICAL_GET operator (checked by assertion). Ownership of `stmt.info`
// is transferred to the index binder, so `stmt.info` is moved-from once this returns.
unique_ptr<LogicalOperator> Catalog::BindCreateIndex(Binder &binder, CreateStatement &stmt, TableCatalogEntry &table,
                                                     unique_ptr<LogicalOperator> plan) {
	D_ASSERT(plan->type == LogicalOperatorType::LOGICAL_GET);

	// the statement carries a generic CreateInfo; the index binder wants the CreateIndexInfo view of it
	auto index_info = unique_ptr_cast<CreateInfo, CreateIndexInfo>(std::move(stmt.info));

	auto &context = binder.context;
	IndexBinder index_binder(binder, context);
	return index_binder.BindCreateIndex(context, std::move(index_info), table, std::move(plan), nullptr);
}
316+
307317
unique_ptr<LogicalOperator> Catalog::BindAlterAddIndex(Binder &binder, TableCatalogEntry &table_entry,
308318
unique_ptr<LogicalOperator> plan,
309319
unique_ptr<CreateIndexInfo> create_info,
@@ -953,9 +963,13 @@ optional_ptr<SchemaCatalogEntry> Catalog::GetSchema(CatalogEntryRetriever &retri
953963
QueryErrorContext error_context) {
954964
auto entries = GetCatalogEntries(retriever, catalog_name, schema_name);
955965
for (idx_t i = 0; i < entries.size(); i++) {
966+
auto catalog = Catalog::GetCatalogEntry(retriever, entries[i].catalog);
967+
if (!catalog) {
968+
// skip if it is not an attached database
969+
continue;
970+
}
956971
auto on_not_found = i + 1 == entries.size() ? if_not_found : OnEntryNotFound::RETURN_NULL;
957-
auto &catalog = Catalog::GetCatalog(retriever, entries[i].catalog);
958-
auto result = catalog.GetSchema(retriever.GetContext(), schema_name, on_not_found, error_context);
972+
auto result = catalog->GetSchema(retriever.GetContext(), schema_name, on_not_found, error_context);
959973
if (result) {
960974
return result;
961975
}

0 commit comments

Comments
 (0)