Skip to content

Commit c90cd8c

Browse files
committed
More precise estimation of column size for ColumnString
Added PrettyPrintByteSize + some tests
1 parent 11604cf commit c90cd8c

File tree

4 files changed

+169
-15
lines changed

4 files changed

+169
-15
lines changed

ut/ColumnString_ut.cpp

Lines changed: 39 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include "utils.h"
99
#include "value_generators.h"
1010

11+
#include <algorithm>
1112
#include <cstddef>
1213
#include <initializer_list>
1314
#include <ios>
@@ -38,13 +39,25 @@ size_t EstimateColumnStringMemoryUsage(
3839
if (item_estimated_size == ColumnString::EstimatedValueSize{0} && total_items_size && number_of_items)
3940
item_estimated_size = ColumnString::EstimatedValueSize(static_cast<double>(*total_items_size) / number_of_items);
4041

41-
return number_of_items * sizeof(std::string_view)
42-
+ number_of_items * static_cast<size_t>(item_estimated_size) * value_to_estimation_average_size_ratio
42+
const size_t estimated_total_item_size = number_of_items * static_cast<size_t>(item_estimated_size) * value_to_estimation_average_size_ratio;
43+
const auto estimated_number_of_blocks = std::max<size_t>(1, estimated_total_item_size ? COLUMN_STRING_DEFAULT_BLOCK_SIZE / estimated_total_item_size : 1);
44+
45+
// space wasted in block since not all items can be fit perfectly, and there is some unused space at the end of the block.
46+
const auto estimate_lost_space_in_block = (static_cast<size_t>(item_estimated_size) != 0
47+
? COLUMN_STRING_DEFAULT_BLOCK_SIZE % static_cast<size_t>(static_cast<size_t>(item_estimated_size) * value_to_estimation_average_size_ratio)
48+
: COLUMN_STRING_DEFAULT_BLOCK_SIZE / 10);
49+
50+
const auto max_estimation_error_factor = item_estimated_size == ColumnString::NO_PREALLOCATE ? 2.5 : 2;
51+
52+
return (number_of_items * sizeof(std::string_view)
53+
+ estimated_total_item_size
54+
+ estimate_lost_space_in_block * estimated_number_of_blocks
4355
+ COLUMN_STRING_DEFAULT_BLOCK_SIZE
4456
// It is hard to compute overhead added by vector<ColumnString::Block>
4557
// (mostly because we don't know number of ColumnString::Block instances from outside, and this number depends on many factors),
4658
// so we just make a guess.
47-
+ COLUMN_STRING_MAX_EXPECTED_MEMORY_OVERHEAD;
59+
+ COLUMN_STRING_MAX_EXPECTED_MEMORY_OVERHEAD)
60+
* max_estimation_error_factor;
4861
}
4962

5063
std::string ScaleString(std::string str, size_t required_size) {
@@ -289,8 +302,15 @@ struct ColumnStringEstimatedValueSizeTest : public ::testing::TestWithParam<std:
289302
// Adjust number of items so the test doesn't use too much memory
290303
if (static_cast<size_t>(single_value_size_estimation) != 0
291304
// *2 since we store both reference values and values in column itself.
292-
&& EstimateColumnStringMemoryUsage(expected_number_of_items, single_value_size_estimation, size_ratio.average) * 2 > MAX_MEMORY_USAGE) {
305+
&& EstimateColumnStringMemoryUsage(expected_number_of_items, single_value_size_estimation, size_ratio.average) > MAX_MEMORY_USAGE) {
306+
const auto old_expected_number_of_items = expected_number_of_items;
293307
expected_number_of_items = MAX_MEMORY_USAGE / (static_cast<size_t>(single_value_size_estimation) * 2 * size_ratio.average);
308+
309+
std::cerr << "To avoid using too much memory, reduced number of items in test"
310+
<< " from " << old_expected_number_of_items
311+
<< " to " << expected_number_of_items
312+
<< ", expected item size is " << single_value_size_estimation
313+
<< std::endl;
294314
}
295315
}
296316

@@ -318,9 +338,9 @@ struct ColumnStringEstimatedValueSizeTest : public ::testing::TestWithParam<std:
318338
ASSERT_TRUE(CompareRecursive(values, column));
319339
}
320340

321-
size_t EstimateMemoryUsage(size_t total_values_size) {
341+
size_t EstimateMemoryUsage(size_t total_values_size, float expected_number_of_items_multiplier = 1.0) {
322342
const auto & [single_value_size_estimation, size_ratio] = GetParam();
323-
return EstimateColumnStringMemoryUsage(expected_number_of_items, single_value_size_estimation, size_ratio.average, total_values_size);
343+
return EstimateColumnStringMemoryUsage(expected_number_of_items * expected_number_of_items_multiplier, single_value_size_estimation, size_ratio.average, total_values_size);
324344
}
325345
};
326346

@@ -363,8 +383,10 @@ TEST_P(ColumnStringEstimatedValueSizeTest, AppendNoReserve)
363383

364384
EXPECT_NO_FATAL_FAILURE(AppendStrings(col, total_values_size));
365385

366-
// since there was no Reserve call prior, there could be more some overallocations, hence *2
367-
EXPECT_LT(col.MemoryUsage(), EstimateMemoryUsage(total_values_size) * 2);
386+
const auto max_estimation_error_factor = single_value_size_estimation == ColumnString::NO_PREALLOCATE ? 2.5 : 2;
387+
388+
// since there was no Reserve call prior, there could be some more overallocations, hence some estimation error
389+
EXPECT_LT(col.MemoryUsage(), EstimateMemoryUsage(total_values_size) * max_estimation_error_factor);
368390
}
369391

370392
TEST_P(ColumnStringEstimatedValueSizeTest, ReserveExactAndAppend)
@@ -377,8 +399,10 @@ TEST_P(ColumnStringEstimatedValueSizeTest, ReserveExactAndAppend)
377399
EXPECT_NO_THROW(col.Reserve(expected_number_of_items));
378400
EXPECT_NO_FATAL_FAILURE(AppendStrings(col, total_values_size));
379401

402+
const auto max_estimation_error_factor = single_value_size_estimation == ColumnString::NO_PREALLOCATE ? 2.5 : 2;
403+
380404
// Allow for overallocations, hence the multiplication by max_estimation_error_factor
381-
EXPECT_LT(col.MemoryUsage(), EstimateMemoryUsage(total_values_size) * 2);
405+
EXPECT_LT(col.MemoryUsage(), EstimateMemoryUsage(total_values_size) * max_estimation_error_factor);
382406
}
383407

384408
TEST_P(ColumnStringEstimatedValueSizeTest, ReserveLessAndAppend)
@@ -391,8 +415,10 @@ TEST_P(ColumnStringEstimatedValueSizeTest, ReserveLessAndAppend)
391415
EXPECT_NO_THROW(col.Reserve(expected_number_of_items * .8));
392416
EXPECT_NO_FATAL_FAILURE(AppendStrings(col, total_values_size));
393417

418+
const auto max_estimation_error_factor = single_value_size_estimation == ColumnString::NO_PREALLOCATE ? 2.5 : 2;
419+
394420
// Allow for overallocations, hence the multiplication by max_estimation_error_factor
395-
EXPECT_LT(col.MemoryUsage(), EstimateMemoryUsage(total_values_size) * 2);
421+
EXPECT_LT(col.MemoryUsage(), EstimateMemoryUsage(total_values_size) * max_estimation_error_factor);
396422
}
397423

398424
TEST_P(ColumnStringEstimatedValueSizeTest, ReserveMoreAndAppend)
@@ -405,8 +431,10 @@ TEST_P(ColumnStringEstimatedValueSizeTest, ReserveMoreAndAppend)
405431
EXPECT_NO_THROW(col.Reserve(expected_number_of_items * 1.2));
406432
EXPECT_NO_FATAL_FAILURE(AppendStrings(col, total_values_size));
407433

434+
const auto max_estimation_error_factor = single_value_size_estimation == ColumnString::NO_PREALLOCATE ? 2.5 : 2;
435+
408436
// Allow for overallocations, hence the multiplication by max_estimation_error_factor
409-
EXPECT_LT(col.MemoryUsage(), EstimateColumnStringMemoryUsage(expected_number_of_items * 1.2, single_value_size_estimation, size_ratio.average, total_values_size) * 2);
437+
EXPECT_LT(col.MemoryUsage(), EstimateMemoryUsage(total_values_size, 1.2) * max_estimation_error_factor);
410438
}
411439

412440
/** TODO more tests

ut/utils.cpp

Lines changed: 67 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@
2020

2121
#include <cinttypes>
2222
#include <iomanip>
23+
#include <ios>
2324
#include <sstream>
25+
#include <string_view>
2426
#include <type_traits>
2527

2628

@@ -256,6 +258,48 @@ std::ostream& operator<<(std::ostream & ostr, const PrettyPrintBlock & pretty_pr
256258
return ostr;
257259
}
258260

261+
std::ostream& operator<<(std::ostream & ostr, const PrettyPrintByteSize & byte_size) {
262+
static const std::pair<size_t, const char *> FACTORS[] = {
263+
{ 1, "bytes" },
264+
{ 1024, "KiB" },
265+
{ 1024*1024, "MiB" },
266+
{ 1024*1024*1024, "GiB" },
267+
};
268+
269+
auto p = std::find_if(std::begin(FACTORS), std::end(FACTORS), [&byte_size](const auto v) {
270+
return byte_size.bytes < v.first;
271+
});
272+
273+
if (p != std::begin(FACTORS)) {
274+
--p;
275+
}
276+
277+
const float resulting_size = byte_size.bytes / static_cast<float>(p->first);
278+
279+
// Trim trailing zeroes after decimal point, if present.
280+
{
281+
std::stringstream sstr;
282+
sstr << std::fixed << std::setprecision(byte_size.max_decimal_points) << resulting_size;
283+
284+
auto s = sstr.str();
285+
286+
// here we completely ignore locales and just assume that '.' is used as decimal point
287+
const auto decimal_point_position = s.find_last_of('.');
288+
const auto last_non_zero_decimal_number_pos = s.find_last_not_of('0');
289+
290+
if (decimal_point_position != s.npos && last_non_zero_decimal_number_pos != s.npos) {
291+
if (decimal_point_position == last_non_zero_decimal_number_pos)
292+
s.erase(decimal_point_position);
293+
else
294+
s.erase(std::max(decimal_point_position, last_non_zero_decimal_number_pos + 1));
295+
}
296+
297+
ostr << s;
298+
}
299+
300+
return ostr << " " << p->second;
301+
}
302+
259303
std::ostream& operator<<(std::ostream& ostr, const in_addr& addr) {
260304
char buf[INET_ADDRSTRLEN];
261305
const char* ip_str = inet_ntop(AF_INET, &addr, buf, sizeof(buf));
@@ -332,6 +376,28 @@ std::ostream & operator<<(std::ostream & ostr, const Progress & progress) {
332376
<< " written_bytes : " << progress.written_bytes;
333377
}
334378

379+
/// Prints an estimation as "ColumnString::EstimatedValueSize{ <text> }", where <text> is the name of
/// a predefined value (if the estimation equals one) or the pretty-printed byte size otherwise.
std::ostream & operator<<(std::ostream & ostr, const ColumnString::EstimatedValueSize & estimation) {
    using NamedEstimation = std::pair<ColumnString::EstimatedValueSize, const char *>;

    static const NamedEstimation NAMES_OF_DEFAULT_VALUES[] = {
        { ColumnString::NO_PREALLOCATE, "DO NOT PREALLOCATE" },
        { ColumnString::EstimatedValueSize::TINY, "TINY (8 bytes)" },
        { ColumnString::EstimatedValueSize::SMALL, "SMALL (32 bytes)" },
        { ColumnString::EstimatedValueSize::MEDIUM, "MEDIUM (128 bytes)" },
        { ColumnString::EstimatedValueSize::LARGE, "LARGE (512 bytes)" }
    };

    // Look up the name of the first predefined value equal to the estimation, if there is one.
    const char * predefined_name = nullptr;
    for (const auto & named_estimation : NAMES_OF_DEFAULT_VALUES) {
        if (named_estimation.first == estimation) {
            predefined_name = named_estimation.second;
            break;
        }
    }

    ostr << "ColumnString::EstimatedValueSize{ ";

    if (predefined_name == nullptr)
        ostr << PrettyPrintByteSize{static_cast<size_t>(estimation)};
    else
        ostr << predefined_name;

    return ostr << " }";
}
400+
335401
}
336402

337403
uint64_t versionNumber(const ServerInfo & server_info) {
@@ -350,5 +416,5 @@ std::string ToString(const clickhouse::UUID& v) {
350416
}
351417

352418
/// Writes "<prefix> <column type name> : <pretty-printed memory usage>" to std::cerr, ends the line
/// with std::endl and returns std::cerr.
std::ostream & dumpMemoryUsage(const char * prefix, const clickhouse::ColumnRef col) {
    const auto memory_usage = PrettyPrintByteSize{col->MemoryUsage()};

    std::cerr << prefix << " " << col->GetType().GetName();
    std::cerr << " : " << memory_usage;

    return std::cerr << std::endl;
}

ut/utils.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -141,19 +141,25 @@ struct PrettyPrintBlock {
141141
const clickhouse::Block & block;
142142
};
143143

144+
/// Wrapper for streaming a byte size in bytes, KiB, MiB, or GiB.
struct PrettyPrintByteSize {
    /// Size to print, in bytes.
    size_t bytes;

    /// Maximum number of digits printed after the decimal point.
    size_t max_decimal_points{3};
};
149+
144150
namespace clickhouse {
145151
std::ostream& operator<<(std::ostream & ostr, const Block & block);
146152
std::ostream& operator<<(std::ostream & ostr, const Type & type);
147153
std::ostream & operator<<(std::ostream & ostr, const ServerInfo & server_info);
148154
std::ostream & operator<<(std::ostream & ostr, const Profile & profile);
149155
std::ostream & operator<<(std::ostream & ostr, const Progress & progress);
150-
inline std::ostream & operator<<(std::ostream & ostr, const ColumnString::EstimatedValueSize & estimation) {
151-
return ostr << "ColumnString::EstimatedValueSize{ " << static_cast<std::underlying_type_t<ColumnString::EstimatedValueSize>>(estimation) << " }";
152-
}
156+
std::ostream & operator<<(std::ostream & ostr, const ColumnString::EstimatedValueSize & estimation);
153157

154158
}
155159

156160
std::ostream& operator<<(std::ostream & ostr, const PrettyPrintBlock & block);
161+
std::ostream& operator<<(std::ostream & ostr, const PrettyPrintByteSize & block);
162+
157163
std::ostream& operator<<(std::ostream& ostr, const in_addr& addr);
158164
std::ostream& operator<<(std::ostream& ostr, const in6_addr& addr);
159165

ut/utils_ut.cpp

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include "ut/value_generators.h"
33
#include "utils.h"
44

5+
#include <initializer_list>
56
#include <limits>
67
#include <optional>
78
#include <vector>
@@ -120,3 +121,56 @@ TEST(Generators, MakeArrays) {
120121
auto arrays = MakeArrays<std::string, MakeStrings>();
121122
ASSERT_LT(0u, arrays.size());
122123
}
124+
125+
/// Fixture for tests that check how values are rendered by their operator<<.
class OutputTest : public ::testing::Test {
public:
    /// Streams `t` into a fresh string stream and returns the accumulated text.
    template <typename T>
    static std::string ToString(const T & t) {
        std::ostringstream buffer;
        buffer << t;
        return buffer.str();
    }
};
135+
136+
// Checks the text produced by streaming PrettyPrintByteSize: plain byte counts first, then a set
// of whole and fractional values for each of the KiB, MiB and GiB units.
TEST_F(OutputTest, PrettyPrintByteSize)
{
    EXPECT_EQ("3 bytes", ToString(PrettyPrintByteSize{3}));

    EXPECT_EQ("30 bytes", ToString(PrettyPrintByteSize{30}));
    EXPECT_EQ("300 bytes", ToString(PrettyPrintByteSize{300}));

    EXPECT_EQ("123 bytes", ToString(PrettyPrintByteSize{123}));

    // Unit factor and the suffix expected in the output.
    const std::pair<size_t, const char*> units[] = {
        // {1, "bytes"},
        {1024, "KiB"},
        {1024*1024, "MiB"},
        {1024*1024*1024, "GiB"},
    };

    // Number of units and the text expected for that number.
    const std::pair<float, const char*> samples[] = {
        {1, "1"},
        {1.01, "1.01"},
        {1.10, "1.1"},
        {1.5, "1.5"},
        {3, "3"},
        {3.25, "3.25"},
        {13.75, "13.75"},
        {135.5, "135.5"},
        {135.125, "135.125"},
        {10, "10"},
        {100, "100"},
        {1000, "1000"},
    };

    for (const auto & unit : units)
    {
        const auto base = unit.first;
        const auto base_name = unit.second;

        for (const auto & sample : samples)
        {
            const auto value = sample.first;
            const auto value_str = sample.second;

            const auto bytes_value = static_cast<size_t>(base * value);

            std::string expected_str(value_str);
            expected_str += " ";
            expected_str += base_name;

            EXPECT_EQ(expected_str, ToString(PrettyPrintByteSize{bytes_value}))
                << "\n\tbase: " << base
                << "\n\tbase_name: " << base_name
                << "\n\tvalue: " << value
                << "\n\tvalue_str: " << value_str;
        }
    }
}

0 commit comments

Comments
 (0)