
Commit 8652f02

github-actions[bot], Shelnutt2, and ypatia authored
[Backport release-2.27] Improve readers by parallelizing I/O and compute operations (#5401) (#5451)
Backport of #5401 to release-2.27

---

TYPE: IMPROVEMENT
DESC: Improve readers by parallelizing I/O and compute operations

---

Co-authored-by: Seth Shelnutt <[email protected]>
Co-authored-by: Ypatia Tsavliri <[email protected]>
Co-authored-by: Seth Shelnutt <[email protected]>
1 parent a2e10e1 commit 8652f02

23 files changed: +416 -143 lines
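The diffs below are easiest to follow with the overall pattern in mind: issue the I/O for the next batch of tiles while the previous batch is still being processed, and only wait on the outstanding compute task when its results are actually needed. The following is a standalone sketch of that pattern using only the standard library; read_batch, process_batch and the use of std::async are hypothetical stand-ins for the readers' ThreadPool-based tasks, not TileDB code.

// Standalone illustration (not TileDB code) of overlapping I/O with compute.
#include <cstddef>
#include <cstdint>
#include <future>
#include <utility>
#include <vector>

static std::vector<uint8_t> read_batch(size_t i) {
  // Pretend I/O: in the reader this would be reading and unfiltering tiles.
  return std::vector<uint8_t>(1024, static_cast<uint8_t>(i));
}

static void process_batch(const std::vector<uint8_t>&) {
  // Pretend compute: query condition, aggregates, copying to user buffers.
}

void pipelined_read(size_t num_batches) {
  std::future<void> compute_task;
  for (size_t i = 0; i < num_batches; ++i) {
    // I/O for this batch overlaps with the compute task of the previous one.
    auto data = read_batch(i);

    // Results must be consumed in order, so wait for the previous batch
    // before scheduling compute for the new one.
    if (compute_task.valid()) {
      compute_task.wait();
    }
    compute_task = std::async(
        std::launch::async, [d = std::move(data)]() { process_batch(d); });
  }
  if (compute_task.valid()) {
    compute_task.wait();  // drain the last batch
  }
}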

test/src/unit-ReadCellSlabIter.cc

+4 -1

@@ -183,7 +183,10 @@ void set_result_tile_dim(
       std::nullopt,
       std::nullopt,
       std::nullopt);
-  ResultTile::TileData tile_data{nullptr, nullptr, nullptr};
+  ResultTile::TileData tile_data{
+      {nullptr, ThreadPool::SharedTask()},
+      {nullptr, ThreadPool::SharedTask()},
+      {nullptr, ThreadPool::SharedTask()}};
   result_tile.init_coord_tile(
       constants::format_version,
       array_schema,
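The test update above reflects that each of the three tile buffers (fixed, var and validity data) is now paired with the task that produces it rather than being a bare pointer. A hypothetical sketch of that shape, not the actual ResultTile::TileData definition; std::shared_future stands in for ThreadPool::SharedTask.

#include <future>
#include <utility>

struct TileDataSketch {
  using Task = std::shared_future<void>;
  std::pair<void*, Task> fixed;     // fixed-size data and its pending task
  std::pair<void*, Task> var;       // var-size data and its pending task
  std::pair<void*, Task> validity;  // validity data and its pending task
};

// Mirrors the test above: no buffers yet and default (empty) tasks.
TileDataSketch tile_data{{nullptr, {}}, {nullptr, {}}, {nullptr, {}}};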

test/src/unit-cppapi-consolidation-with-timestamps.cc

+2 -2

@@ -636,7 +636,7 @@ TEST_CASE_METHOD(
 
   // Will only allow to load two tiles out of 3.
   Config cfg;
-  cfg.set("sm.mem.total_budget", "30000");
+  cfg.set("sm.mem.total_budget", "50000");
   cfg.set("sm.mem.reader.sparse_global_order.ratio_coords", "0.15");
   ctx_ = Context(cfg);
 
@@ -685,7 +685,7 @@ TEST_CASE_METHOD(
 
   // Will only allow to load two tiles out of 3.
   Config cfg;
-  cfg.set("sm.mem.total_budget", "30000");
+  cfg.set("sm.mem.total_budget", "50000");
   cfg.set("sm.mem.reader.sparse_global_order.ratio_coords", "0.15");
   ctx_ = Context(cfg);
 

test/src/unit-result-tile.cc

+16 -4

@@ -213,7 +213,10 @@ TEST_CASE_METHOD(
       0,
       std::nullopt,
       std::nullopt);
-  ResultTile::TileData tile_data{nullptr, nullptr, nullptr};
+  ResultTile::TileData tile_data{
+      {nullptr, ThreadPool::SharedTask()},
+      {nullptr, ThreadPool::SharedTask()},
+      {nullptr, ThreadPool::SharedTask()}};
   rt.init_coord_tile(
       constants::format_version,
       array_schema,
@@ -230,7 +233,10 @@ TEST_CASE_METHOD(
       0,
       std::nullopt,
       std::nullopt);
-  ResultTile::TileData tile_data{nullptr, nullptr, nullptr};
+  ResultTile::TileData tile_data{
+      {nullptr, ThreadPool::SharedTask()},
+      {nullptr, ThreadPool::SharedTask()},
+      {nullptr, ThreadPool::SharedTask()}};
   rt.init_coord_tile(
       constants::format_version,
       array_schema,
@@ -326,7 +332,10 @@ TEST_CASE_METHOD(
       0,
       std::nullopt,
       std::nullopt);
-  ResultTile::TileData tile_data{nullptr, nullptr, nullptr};
+  ResultTile::TileData tile_data{
+      {nullptr, ThreadPool::SharedTask()},
+      {nullptr, ThreadPool::SharedTask()},
+      {nullptr, ThreadPool::SharedTask()}};
   rt.init_coord_tile(
       constants::format_version,
       array_schema,
@@ -343,7 +352,10 @@ TEST_CASE_METHOD(
       0,
       std::nullopt,
       std::nullopt);
-  ResultTile::TileData tile_data{nullptr, nullptr, nullptr};
+  ResultTile::TileData tile_data{
+      {nullptr, ThreadPool::SharedTask()},
+      {nullptr, ThreadPool::SharedTask()},
+      {nullptr, ThreadPool::SharedTask()}};
   rt.init_coord_tile(
       constants::format_version,
       array_schema,

test/src/unit-sparse-global-order-reader.cc

+7 -5

@@ -1993,9 +1993,10 @@ TEST_CASE_METHOD(
   }
 
   // FIXME: there is no per fragment budget anymore
-  // Two result tile (2 * (~3000 + 8) will be bigger than the per fragment
-  // budget (1000).
-  memory_.total_budget_ = "35000";
+  // Two result tiles (2 * (2842 + 8)) = 5700 will be bigger than the per
+  // fragment budget (50000 * 0.11 / 2 fragments = 2750), so only one result
+  // tile will be loaded each time.
+  memory_.total_budget_ = "60000";
   memory_.ratio_coords_ = "0.11";
   update_config();
 
@@ -2518,8 +2519,9 @@ TEST_CASE_METHOD(
   }
 
   // FIXME: there is no per fragment budget anymore
-  // Two result tile (2 * (~4000 + 8) will be bigger than the per fragment
-  // budget (1000).
+  // Two result tiles (2 * (2842 + 8)) = 5700 will be bigger than the per
+  // fragment budget (40000 * 0.22 /2 frag = 4400), so only one will be loaded
+  // each time.
   memory_.total_budget_ = "40000";
   memory_.ratio_coords_ = "0.22";
   update_config();
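The rewritten comments above carry the budget arithmetic. A quick check of those numbers follows; the per-fragment formula (total_budget * ratio_coords / fragment_num) is an assumption read off the comments themselves, not a quote of the reader's code.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t fragment_num = 2;
  const uint64_t total_budget = 50000;  // figure quoted in the first comment
  // ratio_coords = 0.11, computed in integer form so the check stays exact.
  const uint64_t per_fragment_budget = total_budget * 11 / 100 / fragment_num;
  const uint64_t two_tiles = 2 * (2842 + 8);

  assert(per_fragment_budget == 2750);
  assert(two_tiles == 5700);
  assert(two_tiles > per_fragment_budget);  // so only one tile loads per pass
  return 0;
}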

test/src/unit-sparse-unordered-with-dups-reader.cc

+5 -2

@@ -1064,9 +1064,12 @@ TEST_CASE_METHOD(
 
   if (one_frag) {
     CHECK(1 == loop_num->second);
-  } else {
-    CHECK(9 == loop_num->second);
   }
+  /**
+   * We can't do a similar check for multiple fragments as it is architecture
+   * dependent how many tiles fit in the memory budget. And thus also
+   * architecture dependent as to how many internal loops we have.
+   */
 
   // Try to read multiple frags without partial tile offset reading. Should
   // fail

tiledb/sm/filter/compression_filter.cc

+1 -1

@@ -636,7 +636,7 @@ Status CompressionFilter::decompress_var_string_coords(
   auto output_view = span<std::byte>(
       reinterpret_cast<std::byte*>(output_buffer->data()), uncompressed_size);
   auto offsets_view = span<uint64_t>(
-      offsets_tile->data_as<offsets_t>(), uncompressed_offsets_size);
+      offsets_tile->data_as_unsafe<offsets_t>(), uncompressed_offsets_size);
 
   if (compressor_ == Compressor::RLE) {
     uint8_t rle_len_bytesize, string_len_bytesize;

tiledb/sm/filter/filter_pipeline.cc

+2 -2

@@ -464,7 +464,7 @@ Status FilterPipeline::run_reverse(
     // If the pipeline is empty, just copy input to output.
     if (filters_.empty()) {
       void* output_chunk_buffer =
-          tile->data_as<char>() + chunk_data.chunk_offsets_[i];
+          tile->data_as_unsafe<char>() + chunk_data.chunk_offsets_[i];
       RETURN_NOT_OK(input_data.copy_to(output_chunk_buffer));
       continue;
     }
@@ -487,7 +487,7 @@ Status FilterPipeline::run_reverse(
     bool last_filter = filter_idx == 0;
     if (last_filter) {
       void* output_chunk_buffer =
-          tile->data_as<char>() + chunk_data.chunk_offsets_[i];
+          tile->data_as_unsafe<char>() + chunk_data.chunk_offsets_[i];
       RETURN_NOT_OK(output_data.set_fixed_allocation(
           output_chunk_buffer, chunk.unfiltered_data_size_));
       reader_stats->add_counter(
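The data_as to data_as_unsafe swaps in the two filter files above suggest a tile accessor that normally waits on the task that fills its buffer, plus an "unsafe" variant for code that runs as part of that task, such as the filter pipeline writing its own output. A hypothetical sketch of that pattern with stand-in names; this is not the real Tile API.

#include <future>
#include <utility>

class BufferWithTask {
 public:
  BufferWithTask(void* data, std::shared_future<void> task)
      : data_(data)
      , task_(std::move(task)) {
  }

  template <class T>
  T* data_as() {  // safe: block until the producer task has finished
    if (task_.valid()) {
      task_.wait();
    }
    return static_cast<T*>(data_);
  }

  template <class T>
  T* data_as_unsafe() {  // no wait: only for the code producing the data
    return static_cast<T*>(data_);
  }

 private:
  void* data_;
  std::shared_future<void> task_;  // stand-in for ThreadPool::SharedTask
};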

tiledb/sm/filter/test/filter_test_support.cc

+2 -1

@@ -203,7 +203,8 @@ Tile create_tile_for_unfiltering(
       tile->cell_size() * nelts,
       tile->filtered_buffer().data(),
       tile->filtered_buffer().size(),
-      tracker};
+      tracker,
+      std::nullopt};
 }
 
 void run_reverse(

tiledb/sm/filter/test/tile_data_generator.h

+2 -1

@@ -99,7 +99,8 @@ class TileDataGenerator {
         original_tile_size(),
         filtered_buffer.data(),
         filtered_buffer.size(),
-        memory_tracker);
+        memory_tracker,
+        std::nullopt);
   }
 
   /** Returns the size of the original unfiltered data. */

tiledb/sm/metadata/test/unit_metadata.cc

+6 -3

@@ -123,7 +123,8 @@ TEST_CASE(
       tile1->size(),
       tile1->filtered_buffer().data(),
       tile1->filtered_buffer().size(),
-      tracker);
+      tracker,
+      ThreadPool::SharedTask());
   memcpy(metadata_tiles[0]->data(), tile1->data(), tile1->size());
 
   metadata_tiles[1] = tdb::make_shared<Tile>(
@@ -135,7 +136,8 @@ TEST_CASE(
       tile2->size(),
       tile2->filtered_buffer().data(),
       tile2->filtered_buffer().size(),
-      tracker);
+      tracker,
+      ThreadPool::SharedTask());
   memcpy(metadata_tiles[1]->data(), tile2->data(), tile2->size());
 
   metadata_tiles[2] = tdb::make_shared<Tile>(
@@ -147,7 +149,8 @@ TEST_CASE(
       tile3->size(),
       tile3->filtered_buffer().data(),
       tile3->filtered_buffer().size(),
-      tracker);
+      tracker,
+      ThreadPool::SharedTask());
   memcpy(metadata_tiles[2]->data(), tile3->data(), tile3->size());
 
   meta = Metadata::deserialize(metadata_tiles);
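The test-only constructions above (filter_test_support.cc, tile_data_generator.h and unit_metadata.cc) all gain one extra trailing argument, either std::nullopt or an empty ThreadPool::SharedTask(), which appears to be the task associated with the tile's data. A hypothetical sketch of what the tests are expressing; the struct and field names are illustrative only and do not reproduce the real Tile constructor.

#include <future>
#include <optional>

using SharedTask = std::shared_future<void>;  // stand-in for ThreadPool::SharedTask

struct SketchTile {
  void* filtered_data = nullptr;
  std::optional<SharedTask> data_task;  // the new trailing parameter
};

// Tests with no background work either pass an absent task...
SketchTile a{nullptr, std::nullopt};
// ...or an empty, default-constructed one.
SketchTile b{nullptr, SharedTask()};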

tiledb/sm/query/readers/dense_reader.cc

+32 -27

@@ -453,6 +453,9 @@ Status DenseReader::dense_read() {
     // processing.
     if (qc_coords_mode_) {
       t_start = t_end;
+      if (compute_task.valid()) {
+        throw_if_not_ok(compute_task.wait());
+      }
       continue;
     }
 
@@ -769,8 +772,8 @@ DenseReader::compute_result_space_tiles(
   const auto fragment_num = (unsigned)frag_tile_domains.size();
   const auto& tile_coords = subarray.tile_coords();
 
-  // Keep track of the required memory to load the result space tiles. Split up
-  // filtered versus unfiltered. The memory budget is combined for all
+  // Keep track of the required memory to load the result space tiles. Split
+  // up filtered versus unfiltered. The memory budget is combined for all
   // query condition attributes.
   uint64_t required_memory_query_condition_unfiltered = 0;
   std::vector<uint64_t> required_memory_unfiltered(
@@ -786,28 +789,28 @@ DenseReader::compute_result_space_tiles(
     aggregate_only_field[n - condition_names.size()] = aggregate_only(name);
   }
 
-  // Here we estimate the size of the tile structures. First, we have to account
-  // the size of the space tile structure. We could go deeper in the class to
-  // account for other things but for now we keep it simpler. Second, we try to
-  // account for the tile subarray (DenseTileSubarray). This class will have a
-  // vector of ranges per dimensions, so 1 + dim_num * sizeof(vector). Here we
-  // choose 32 for the size of the vector to anticipate the conversion to a PMR
-  // vector. We also add dim_num * 2 * sizeof(DimType) to account for at least
-  // one range per dimension (this should be improved by accounting for the
-  // exact number of ranges). Finally for the original range index member, we
-  // have to add 1 + dim_num * sizeof(vector) as well and one uint64_t per
-  // dimension (this can also be improved by accounting for the
-  // exact number of ranges).
+  // Here we estimate the size of the tile structures. First, we have to
+  // account the size of the space tile structure. We could go deeper in the
+  // class to account for other things but for now we keep it simpler. Second,
+  // we try to account for the tile subarray (DenseTileSubarray). This class
+  // will have a vector of ranges per dimensions, so 1 + dim_num *
+  // sizeof(vector). Here we choose 32 for the size of the vector to
+  // anticipate the conversion to a PMR vector. We also add dim_num * 2 *
+  // sizeof(DimType) to account for at least one range per dimension (this
+  // should be improved by accounting for the exact number of ranges). Finally
+  // for the original range index member, we have to add 1 + dim_num *
+  // sizeof(vector) as well and one uint64_t per dimension (this can also be
+  // improved by accounting for the exact number of ranges).
   uint64_t est_tile_structs_size =
       sizeof(ResultSpaceTile<DimType>) + (1 + dim_num) * 2 * 32 +
       dim_num * (2 * sizeof(DimType) + sizeof(uint64_t));
 
   // Create the vector of result tiles to operate on. We stop once we reach
-  // the end or the memory budget. We either reach the tile upper memory limit,
-  // which is only for unfiltered data, or the limit of the available budget,
-  // which is for filtered data, unfiltered data and the tile structs. We try to
-  // process two tile batches at a time so the available memory is half of what
-  // we have available.
+  // the end or the memory budget. We either reach the tile upper memory
+  // limit, which is only for unfiltered data, or the limit of the available
+  // budget, which is for filtered data, unfiltered data and the tile structs.
+  // We try to process two tile batches at a time so the available memory is
+  // half of what we have available.
   uint64_t t_end = t_start;
   bool wait_compute_task_before_read = false;
   bool done = false;
@@ -895,8 +898,8 @@ DenseReader::compute_result_space_tiles(
         uint64_t tile_memory_filtered = 0;
         uint64_t r_idx = n - condition_names.size();
 
-        // We might not need to load this tile into memory at all for aggregation
-        // only.
+        // We might not need to load this tile into memory at all for
+        // aggregation only.
         if (aggregate_only_field[r_idx] &&
             can_aggregate_tile_with_frag_md(
                 names[n], result_space_tile, tiles_cell_num[t_end])) {
@@ -953,13 +956,14 @@ DenseReader::compute_result_space_tiles(
             required_memory_unfiltered[r_idx] +
             est_tile_structs_size;
 
-        // Disable the multiple iterations if the tiles don't fit in the iteration
-        // budget.
+        // Disable the multiple iterations if the tiles don't fit in the
+        // iteration budget.
         if (total_memory > available_memory_iteration) {
           wait_compute_task_before_read = true;
         }
 
-        // If a single tile doesn't fit in the available memory, we can't proceed.
+        // If a single tile doesn't fit in the available memory, we can't
+        // proceed.
         if (total_memory > available_memory) {
           throw DenseReaderException(
               "Cannot process a single tile requiring " +
@@ -1003,7 +1007,8 @@ std::vector<ResultTile*> DenseReader::result_tiles_to_load(
   const auto& tile_coords = subarray.tile_coords();
   const bool agg_only = name.has_value() && aggregate_only(name.value());
 
-  // If the result is already loaded in query condition, return the empty list;
+  // If the result is already loaded in query condition, return the empty
+  // list;
   std::vector<ResultTile*> ret;
   if (name.has_value() && condition_names.count(name.value()) != 0) {
     return ret;
@@ -1033,8 +1038,8 @@ std::vector<ResultTile*> DenseReader::result_tiles_to_load(
 
 /**
  * Apply the query condition. The computation will be pushed on the compute
- * thread pool in `compute_task`. Callers should wait on this task before using
- * the results of the query condition.
+ * thread pool in `compute_task`. Callers should wait on this task before
+ * using the results of the query condition.
  */
 template <class DimType, class OffType>
 Status DenseReader::apply_query_condition(
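The est_tile_structs_size comment above spells out a formula. Below is a worked instantiation for a hypothetical 2-D array with uint64_t dimensions; the stub class size is made up, and only the formula itself comes from the diff.

#include <cstdint>
#include <cstdio>

template <class DimType>
struct ResultSpaceTileStub {  // stand-in for ResultSpaceTile<DimType>
  unsigned char payload[128];
};

int main() {
  using DimType = uint64_t;
  const uint64_t dim_num = 2;
  const uint64_t est_tile_structs_size =
      sizeof(ResultSpaceTileStub<DimType>) + (1 + dim_num) * 2 * 32 +
      dim_num * (2 * sizeof(DimType) + sizeof(uint64_t));
  // 128 + 3 * 2 * 32 + 2 * (16 + 8) = 128 + 192 + 48 = 368 bytes per tile.
  std::printf("estimated struct overhead: %llu bytes\n",
              static_cast<unsigned long long>(est_tile_structs_size));
  return 0;
}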
