Skip to content

Commit 3158619

Browse files
authored
Merge pull request #111 from Mytherin/issue101
Fix #101: more robustly handle sparse row ids
2 parents 1b7c244 + 97cd982 commit 3158619

13 files changed

+110
-33
lines changed

data/db/giant_row_id.db

16 KB
Binary file not shown.

src/include/sqlite_db.hpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ class SQLiteDB {
4545
idx_t RunPragma(string pragma_name);
4646
//! Gets the max row id of a table, returns false if the table does not have a
4747
//! rowid column
48-
bool GetMaxRowId(const string &table_name, idx_t &row_id);
48+
bool GetRowIdInfo(const string &table_name, RowIdInfo &info);
4949
bool ColumnExists(const string &table_name, const string &column_name);
5050
vector<IndexInfo> GetIndexInfo(const string &table_name);
5151

src/include/sqlite_scanner.hpp

+3-2
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#pragma once
1010

1111
#include "duckdb.hpp"
12+
#include "sqlite_utils.hpp"
1213

1314
namespace duckdb {
1415
class SQLiteDB;
@@ -20,10 +21,10 @@ struct SqliteBindData : public TableFunctionData {
2021
vector<string> names;
2122
vector<LogicalType> types;
2223

23-
idx_t max_rowid = 0;
24+
RowIdInfo row_id_info;
2425
bool all_varchar = false;
2526

26-
idx_t rows_per_group = 122880;
27+
optional_idx rows_per_group = 122880;
2728
SQLiteDB *global_db;
2829
};
2930

src/include/sqlite_utils.hpp

+5
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,9 @@ class SQLiteUtils {
2424
string ToSQLiteTypeAlias(const LogicalType &input);
2525
};
2626

27+
struct RowIdInfo {
28+
optional_idx min_rowid;
29+
optional_idx max_rowid;
30+
};
31+
2732
} // namespace duckdb

src/include/storage/sqlite_options.hpp

-1
Original file line numberDiff line numberDiff line change
@@ -22,5 +22,4 @@ struct SQLiteOpenOptions {
2222
string journal_mode;
2323
};
2424

25-
2625
} // namespace duckdb

src/sqlite_db.cpp

+13-6
Original file line numberDiff line numberDiff line change
@@ -222,21 +222,28 @@ bool SQLiteDB::ColumnExists(const string &table_name, const string &column_name)
222222
return false;
223223
}
224224

225-
bool SQLiteDB::GetMaxRowId(const string &table_name, idx_t &max_row_id) {
225+
bool SQLiteDB::GetRowIdInfo(const string &table_name, RowIdInfo &row_id_info) {
226226
SQLiteStatement stmt;
227-
if (!TryPrepare(StringUtil::Format("SELECT MAX(ROWID) FROM \"%s\"", SQLiteUtils::SanitizeIdentifier(table_name)),
227+
if (!TryPrepare(StringUtil::Format("SELECT MIN(ROWID), MAX(ROWID) FROM \"%s\"",
228+
SQLiteUtils::SanitizeIdentifier(table_name)),
228229
stmt)) {
229230
return false;
230231
}
231232
if (!stmt.Step()) {
232233
return false;
233234
}
234-
int64_t val = stmt.GetValue<int64_t>(0);
235-
;
236-
if (val <= 0) {
235+
int64_t min_val = stmt.GetValue<int64_t>(0);
236+
int64_t max_val = stmt.GetValue<int64_t>(1);
237+
if (min_val < 0 || max_val <= min_val) {
237238
return false;
238239
}
239-
max_row_id = idx_t(val);
240+
static constexpr int64_t MAX_ROWS = 20000000000000;
241+
if (max_val - min_val >= MAX_ROWS) {
242+
// too many rows - this cannot be dense enough to be accurate
243+
return false;
244+
}
245+
row_id_info.min_rowid = NumericCast<idx_t>(min_val);
246+
row_id_info.max_rowid = NumericCast<idx_t>(max_val);
240247
return true;
241248
}
242249

src/sqlite_scanner.cpp

+48-9
Original file line numberDiff line numberDiff line change
@@ -23,18 +23,21 @@ struct SqliteLocalState : public LocalTableFunctionState {
2323
SQLiteStatement stmt;
2424
bool done = false;
2525
vector<column_t> column_ids;
26+
//! The amount of rows we scanned as part of this row group
27+
idx_t scan_count = 1;
2628

2729
~SqliteLocalState() {
2830
}
2931
};
3032

3133
struct SqliteGlobalState : public GlobalTableFunctionState {
32-
SqliteGlobalState(idx_t max_threads) : max_threads(max_threads) {
34+
explicit SqliteGlobalState(idx_t max_threads) : max_threads(max_threads) {
3335
}
3436

3537
mutex lock;
3638
idx_t position = 0;
3739
idx_t max_threads;
40+
idx_t rows_per_group = 0;
3841

3942
idx_t MaxThreads() const override {
4043
return max_threads;
@@ -72,9 +75,8 @@ static unique_ptr<FunctionData> SqliteBind(ClientContext &context, TableFunction
7275
throw std::runtime_error("no columns for table " + result->table_name);
7376
}
7477

75-
if (!db.GetMaxRowId(result->table_name, result->max_rowid)) {
76-
result->max_rowid = idx_t(-1);
77-
result->rows_per_group = idx_t(-1);
78+
if (!db.GetRowIdInfo(result->table_name, result->row_id_info)) {
79+
result->rows_per_group = optional_idx();
7880
}
7981

8082
result->names = names;
@@ -106,7 +108,7 @@ static void SqliteInitInternal(ClientContext &context, const SqliteBindData &bin
106108

107109
auto sql =
108110
StringUtil::Format("SELECT %s FROM \"%s\"", col_names, SQLiteUtils::SanitizeIdentifier(bind_data.table_name));
109-
if (bind_data.rows_per_group != idx_t(-1)) {
111+
if (bind_data.rows_per_group.IsValid()) {
110112
// we are scanning a subset of the rows - generate a WHERE clause based on
111113
// the rowid
112114
auto where_clause = StringUtil::Format(" WHERE ROWID BETWEEN %d AND %d", rowid_min, rowid_max);
@@ -121,7 +123,11 @@ static void SqliteInitInternal(ClientContext &context, const SqliteBindData &bin
121123
static unique_ptr<NodeStatistics> SqliteCardinality(ClientContext &context, const FunctionData *bind_data_p) {
122124
D_ASSERT(bind_data_p);
123125
auto &bind_data = bind_data_p->Cast<SqliteBindData>();
124-
return make_uniq<NodeStatistics>(bind_data.max_rowid);
126+
if (!bind_data.row_id_info.max_rowid.IsValid()) {
127+
return nullptr;
128+
}
129+
auto row_count = bind_data.row_id_info.max_rowid.GetIndex() - bind_data.row_id_info.min_rowid.GetIndex();
130+
return make_uniq<NodeStatistics>(row_count);
125131
}
126132

127133
static idx_t SqliteMaxThreads(ClientContext &context, const FunctionData *bind_data_p) {
@@ -130,17 +136,41 @@ static idx_t SqliteMaxThreads(ClientContext &context, const FunctionData *bind_d
130136
if (bind_data.global_db) {
131137
return 1;
132138
}
133-
return bind_data.max_rowid / bind_data.rows_per_group;
139+
if (!bind_data.row_id_info.max_rowid.IsValid()) {
140+
return 1;
141+
}
142+
auto row_count = bind_data.row_id_info.max_rowid.GetIndex() - bind_data.row_id_info.min_rowid.GetIndex();
143+
return row_count / bind_data.rows_per_group.GetIndex();
134144
}
135145

136146
static bool SqliteParallelStateNext(ClientContext &context, const SqliteBindData &bind_data, SqliteLocalState &lstate,
137147
SqliteGlobalState &gstate) {
138148
lock_guard<mutex> parallel_lock(gstate.lock);
139-
if (gstate.position < bind_data.max_rowid) {
149+
if (!bind_data.rows_per_group.IsValid()) {
150+
// not doing a parallel scan - scan everything at once
151+
if (gstate.position > 0) {
152+
// already scanned
153+
return false;
154+
}
155+
SqliteInitInternal(context, bind_data, lstate, 0, 0);
156+
gstate.position = static_cast<idx_t>(-1);
157+
lstate.scan_count = 0;
158+
return true;
159+
}
160+
auto max_row_id = bind_data.row_id_info.max_rowid.GetIndex();
161+
if (gstate.position < max_row_id) {
162+
if (lstate.scan_count == 0 && gstate.rows_per_group < max_row_id) {
163+
// we scanned no rows in our previous slice - double the rows per group
164+
gstate.rows_per_group *= 2;
165+
}
166+
if (gstate.rows_per_group == 0) {
167+
throw InternalException("SqliteParallelStateNext - gstate.rows_per_group not set");
168+
}
140169
auto start = gstate.position;
141-
auto end = start + bind_data.rows_per_group - 1;
170+
auto end = MinValue<idx_t>(max_row_id, start + gstate.rows_per_group - 1);
142171
SqliteInitInternal(context, bind_data, lstate, start, end);
143172
gstate.position = end + 1;
173+
lstate.scan_count = 0;
144174
return true;
145175
}
146176
return false;
@@ -161,8 +191,16 @@ SqliteInitLocalState(ExecutionContext &context, TableFunctionInitInput &input, G
161191

162192
static unique_ptr<GlobalTableFunctionState> SqliteInitGlobalState(ClientContext &context,
163193
TableFunctionInitInput &input) {
194+
auto &bind_data = input.bind_data->Cast<SqliteBindData>();
164195
auto result = make_uniq<SqliteGlobalState>(SqliteMaxThreads(context, input.bind_data.get()));
165196
result->position = 0;
197+
if (bind_data.rows_per_group.IsValid()) {
198+
auto min_row_id = bind_data.row_id_info.min_rowid.GetIndex();
199+
if (min_row_id > 0) {
200+
result->position = min_row_id - 1;
201+
}
202+
result->rows_per_group = bind_data.rows_per_group.GetIndex();
203+
}
166204
return std::move(result);
167205
}
168206

@@ -191,6 +229,7 @@ static void SqliteScan(ClientContext &context, TableFunctionInput &data, DataChu
191229
output.SetCardinality(out_idx);
192230
break;
193231
}
232+
state.scan_count++;
194233
for (idx_t col_idx = 0; col_idx < output.ColumnCount(); col_idx++) {
195234
auto &out_vec = output.data[col_idx];
196235
auto sqlite_column_type = stmt.GetType(col_idx);

src/sqlite_storage.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ static unique_ptr<Catalog> SQLiteAttach(StorageExtensionInfo *storage_info, Clie
1717
AccessMode access_mode) {
1818
SQLiteOpenOptions options;
1919
options.access_mode = access_mode;
20-
for(auto &entry : info.options) {
20+
for (auto &entry : info.options) {
2121
if (StringUtil::CIEquals(entry.first, "busy_timeout")) {
2222
options.busy_timeout = entry.second.GetValue<uint64_t>();
2323
} else if (StringUtil::CIEquals(entry.first, "journal_mode")) {

src/storage/sqlite_catalog.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ SQLiteCatalog::~SQLiteCatalog() {
1919
}
2020

2121
void SQLiteCatalog::Initialize(bool load_builtin) {
22-
CreateSchemaInfo info;
22+
CreateSchemaInfo info;
2323
main_schema = make_uniq<SQLiteSchemaEntry>(*this, info);
2424
}
2525

src/storage/sqlite_table_entry.cpp

+9-9
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ unique_ptr<BaseStatistics> SQLiteTableEntry::GetStatistics(ClientContext &contex
1717
}
1818

1919
void SQLiteTableEntry::BindUpdateConstraints(Binder &, LogicalGet &, LogicalProjection &, LogicalUpdate &,
20-
ClientContext &) {
20+
ClientContext &) {
2121
}
2222

2323
TableFunction SQLiteTableEntry::GetScanFunction(ClientContext &context, unique_ptr<FunctionData> &bind_data) {
@@ -34,33 +34,33 @@ TableFunction SQLiteTableEntry::GetScanFunction(ClientContext &context, unique_p
3434
auto &transaction = Transaction::Get(context, catalog).Cast<SQLiteTransaction>();
3535
auto &db = transaction.GetDB();
3636

37-
if (!db.GetMaxRowId(name, result->max_rowid)) {
38-
result->max_rowid = idx_t(-1);
39-
result->rows_per_group = idx_t(-1);
37+
if (!db.GetRowIdInfo(name, result->row_id_info)) {
38+
result->rows_per_group = optional_idx();
4039
}
4140
if (!transaction.IsReadOnly() || sqlite_catalog.InMemory()) {
4241
// for in-memory databases or if we have transaction-local changes we can
4342
// only do a single-threaded scan set up the transaction's connection object
4443
// as the global db
4544
result->global_db = &db;
46-
result->rows_per_group = idx_t(-1);
45+
result->rows_per_group = optional_idx();
4746
}
4847

4948
bind_data = std::move(result);
50-
return SqliteScanFunction();
49+
return static_cast<TableFunction>(SqliteScanFunction());
5150
}
5251

5352
TableStorageInfo SQLiteTableEntry::GetStorageInfo(ClientContext &context) {
5453
auto &transaction = Transaction::Get(context, catalog).Cast<SQLiteTransaction>();
5554
auto &db = transaction.GetDB();
5655
TableStorageInfo result;
5756

58-
idx_t cardinality;
59-
if (!db.GetMaxRowId(name, cardinality)) {
57+
RowIdInfo info;
58+
if (!db.GetRowIdInfo(name, info)) {
6059
// probably
6160
result.cardinality = 10000;
61+
} else {
62+
result.cardinality = info.max_rowid.GetIndex() - info.min_rowid.GetIndex();
6263
}
63-
result.cardinality = cardinality;
6464

6565
result.index_info = db.GetIndexInfo(name);
6666
return result;

src/storage/sqlite_transaction.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ optional_ptr<CatalogEntry> SQLiteTransaction::GetCatalogEntry(const string &entr
8282
case CatalogType::INDEX_ENTRY: {
8383
CreateIndexInfo info;
8484
info.index_name = entry_name;
85+
info.constraint_type = IndexConstraintType::NONE;
8586

8687
string table_name;
8788
string sql;
+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# name: test/sql/storage/attach_giant_row_id.test
2+
# description:
3+
# group: [sqlite_storage]
4+
5+
require sqlite_scanner
6+
7+
statement ok
8+
ATTACH 'data/db/giant_row_id.db' AS s1 (TYPE SQLITE)
9+
10+
query I
11+
SELECT * FROM s1.big_row_id
12+
----
13+
1797657063271174144
14+
15+
query I
16+
SELECT * FROM s1.sparse_row_id
17+
----
18+
0
19+
1797657063271174144
20+
21+
query I
22+
SELECT * FROM s1.sparse_row_id_smaller
23+
----
24+
0
25+
10000000000000

test/sql/storage/attach_schema_functions.test

+3-3
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,10 @@ index sqlite_autoindex_integers_1 integers 0
2828
table integers integers 0
2929
view v1 v1 0
3030

31-
query IIII
32-
SELECT database_name, table_name, has_primary_key, estimated_size FROM duckdb_tables()
31+
query III
32+
SELECT database_name, table_name, has_primary_key FROM duckdb_tables()
3333
----
34-
s integers true 3
34+
s integers true
3535

3636
statement ok
3737
SELECT * FROM duckdb_schemas()

0 commit comments

Comments
 (0)