Commit 52668c3

Merge pull request #1241 from lplewa/benchmark_workload
benchmark workload redesign
2 parents dd51330 + a3522ee commit 52668c3

2 files changed, +237 -21 lines changed


benchmark/benchmark.cpp

Lines changed: 67 additions & 6 deletions
@@ -32,10 +32,9 @@
 // The exact meaning of each argument depends on the benchmark, allocator, and size components used.
 // Refer to the 'argsName()' function in each component to find detailed descriptions of these arguments.

-template <size_t max_threads = 12>
 static void multithreaded(benchmark::internal::Benchmark *benchmark) {
     benchmark->Threads(1);
-    benchmark->DenseThreadRange(4, max_threads, 4);
+    benchmark->Threads(4);
 }

 static void singlethreaded(benchmark::internal::Benchmark *benchmark) {
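
The multithreaded() helper above now registers exactly two thread counts per benchmark (1 and 4) instead of a dense 4..12 range. As a reminder of how Google Benchmark interprets these calls, here is a minimal, self-contained sketch; BM_dummy is a hypothetical benchmark used only for illustration, while the registration calls themselves are the standard Google Benchmark API.

#include <benchmark/benchmark.h>

// Hypothetical no-op benchmark, used only to illustrate thread registration.
static void BM_dummy(benchmark::State &state) {
    for (auto _ : state) {
        benchmark::DoNotOptimize(state.thread_index());
    }
}

// Old style: one run with 1 thread, then runs with 4, 8 and 12 threads.
BENCHMARK(BM_dummy)->Threads(1)->DenseThreadRange(4, 12, 4);

// New style used by multithreaded(): exactly two runs, 1 thread and 4 threads.
BENCHMARK(BM_dummy)->Threads(1)->Threads(4);

BENCHMARK_MAIN();
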
@@ -92,16 +91,14 @@ UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, disjoint_pool_fix,
                               pool_allocator<disjoint_pool<os_provider>>);
 UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, disjoint_pool_fix)
     ->Apply(&default_multiple_alloc_fix_size)
-    // Limit benchmarks to 4 threads, as the disjoint pool scales poorly with higher thread counts.
-    ->Apply(&multithreaded<4>);
+    ->Apply(&multithreaded);

 UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark,
                               disjoint_pool_uniform, uniform_alloc_size,
                               pool_allocator<disjoint_pool<os_provider>>);
 UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, disjoint_pool_uniform)
     ->Apply(&default_multiple_alloc_uniform_size)
-    // Limit benchmarks to 4 threads, as the disjoint pool scales poorly with higher thread counts.
-    ->Apply(&multithreaded<4>);
+    ->Apply(&multithreaded);

 #ifdef UMF_POOL_JEMALLOC_ENABLED
 UMF_BENCHMARK_TEMPLATE_DEFINE(multiple_malloc_free_benchmark, jemalloc_pool_fix,
@@ -159,6 +156,70 @@ UMF_BENCHMARK_REGISTER_F(multiple_malloc_free_benchmark, fixed_provider)
     // reduce iterations, to match os_provider benchmark
     ->Iterations(50000);

+// peak
+UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, glibc_fix, fixed_alloc_size,
+                              glibc_malloc);
+
+UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, glibc_fix)
+    ->Apply(&default_multiple_alloc_fix_size)
+    ->Apply(&multithreaded);
+
+UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, glibc_uniform,
+                              uniform_alloc_size, glibc_malloc);
+UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, glibc_uniform)
+    ->Apply(&default_multiple_alloc_uniform_size)
+    ->Apply(&multithreaded);
+
+UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, disjoint_pool_fix,
+                              fixed_alloc_size,
+                              pool_allocator<disjoint_pool<os_provider>>);
+UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, disjoint_pool_fix)
+    ->Apply(&default_multiple_alloc_fix_size)
+    ->Apply(&multithreaded);
+
+UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, disjoint_pool_uniform,
+                              uniform_alloc_size,
+                              pool_allocator<disjoint_pool<os_provider>>);
+UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, disjoint_pool_uniform)
+    ->Apply(&default_multiple_alloc_uniform_size)
+    ->Apply(&multithreaded);
+
+#ifdef UMF_POOL_JEMALLOC_ENABLED
+UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, jemalloc_pool_fix,
+                              fixed_alloc_size,
+                              pool_allocator<jemalloc_pool<os_provider>>);
+UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, jemalloc_pool_fix)
+    ->Apply(&default_multiple_alloc_fix_size)
+    ->Apply(&multithreaded);
+
+UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, jemalloc_pool_uniform,
+                              uniform_alloc_size,
+                              pool_allocator<jemalloc_pool<os_provider>>);
+UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, jemalloc_pool_uniform)
+    ->Apply(&default_multiple_alloc_uniform_size)
+    ->Apply(&multithreaded);
+
+#endif
+
+#ifdef UMF_POOL_SCALABLE_ENABLED
+UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, scalable_pool_fix,
+                              fixed_alloc_size,
+                              pool_allocator<scalable_pool<os_provider>>);
+
+UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, scalable_pool_fix)
+    ->Apply(&default_multiple_alloc_fix_size)
+    ->Apply(&multithreaded);
+
+UMF_BENCHMARK_TEMPLATE_DEFINE(peak_alloc_benchmark, scalable_pool_uniform,
+                              uniform_alloc_size,
+                              pool_allocator<scalable_pool<os_provider>>);
+
+UMF_BENCHMARK_REGISTER_F(peak_alloc_benchmark, scalable_pool_uniform)
+    ->Apply(&default_multiple_alloc_uniform_size)
+    ->Apply(&multithreaded);
+
+#endif
+
 //BENCHMARK_MAIN();
 int main(int argc, char **argv) {
     if (initAffinityMask()) {
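
Because these are ordinary Google Benchmark registrations, the new peak cases can be selected at run time with the standard --benchmark_filter flag (for example --benchmark_filter=peak_alloc). For orientation only, a hedged sketch of the usual shape of such a custom main() follows; it is not this file's actual implementation, and the failure handling around the project-specific initAffinityMask() is assumed:

int main(int argc, char **argv) {
    if (initAffinityMask()) {
        return -1; // assumed failure path, shown for illustration only
    }
    benchmark::Initialize(&argc, argv); // parses the --benchmark_* flags
    if (benchmark::ReportUnrecognizedArguments(argc, argv)) {
        return 1;
    }
    benchmark::RunSpecifiedBenchmarks(); // honors --benchmark_filter
    benchmark::Shutdown();
    return 0;
}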

benchmark/benchmark.hpp

Lines changed: 170 additions & 15 deletions
@@ -70,6 +70,7 @@
  * - Additional benchmarking scenarios can be created by extending `benchmark_interface`.
  */

+#include <list>
 #include <malloc.h>
 #include <random>

@@ -86,6 +87,7 @@ struct alloc_data {
 };

 struct next_alloc_data {
+    bool alloc; // true if allocation, false if deallocation
     size_t offset;
     size_t size;
 };
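
With the new alloc flag, a pre-generated workload is simply a sequence of (operation, slot index, size) records that the benchmark replays later. A minimal illustration using the struct above and std::vector (the offsets and sizes are invented for the example):

// "Allocate 64 B into slot 2, allocate 128 B into slot 7, then free slot 2."
std::vector<next_alloc_data> example_workload = {
    {true, 2, 64},
    {true, 7, 128},
    {false, 2, 0}, // size is unused for frees and recorded as 0
};
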
@@ -288,18 +290,17 @@ template <
     typename =
         std::enable_if_t<std::is_base_of<allocator_interface, Alloc>::value>>
 class multiple_malloc_free_benchmark : public benchmark_interface<Size, Alloc> {
-    using distribution = std::uniform_int_distribution<size_t>;
+  protected:
     template <class T> using vector2d = std::vector<std::vector<T>>;
     using base = benchmark_interface<Size, Alloc>;
-
     int allocsPerIterations = 10;
     bool thread_local_allocations = true;
     size_t max_allocs = 0;

     vector2d<alloc_data> allocations;
     vector2d<next_alloc_data> next;
     using next_alloc_data_iterator =
-        std::vector<next_alloc_data>::const_iterator;
+        typename std::vector<next_alloc_data>::const_iterator;
     std::vector<std::unique_ptr<next_alloc_data_iterator>> next_iter;
     int64_t iterations;

@@ -386,15 +387,20 @@ class multiple_malloc_free_benchmark : public benchmark_interface<Size, Alloc> {
         auto tid = state.thread_index();
         auto &allocation = allocations[tid];
         auto &iter = next_iter[tid];
+
         for (int i = 0; i < allocsPerIterations; i++) {
             auto &n = *(*iter)++;
             auto &alloc = allocation[n.offset];
-            base::allocator.benchFree(alloc.ptr, alloc.size);
-            alloc.size = n.size;
-            alloc.ptr = base::allocator.benchAlloc(alloc.size);
-
-            if (alloc.ptr == NULL) {
-                state.SkipWithError("allocation failed");
+            if (n.alloc) {
+                alloc.ptr = base::allocator.benchAlloc(n.size);
+                if (alloc.ptr == NULL) {
+                    state.SkipWithError("allocation failed");
+                }
+                alloc.size = n.size;
+            } else {
+                base::allocator.benchFree(alloc.ptr, alloc.size);
+                alloc.ptr = NULL;
+                alloc.size = 0;
             }
         }
     }
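
Each benchmark iteration now replays allocsPerIterations pre-recorded operations, so prepareWorkload() has to generate state.max_iterations * allocsPerIterations records per thread up front. The same replay idea, stripped of the benchmark harness and of the allocator_interface wrappers, looks roughly like the following self-contained sketch (plain malloc/free and illustrative types, not the benchmark's own):

#include <cstddef>
#include <cstdlib>
#include <vector>

struct op { bool alloc; std::size_t offset; std::size_t size; };   // like next_alloc_data
struct slot { void *ptr = nullptr; std::size_t size = 0; };        // like alloc_data

// Replay a recorded workload against an array of per-thread slots.
void replay(const std::vector<op> &ops, std::vector<slot> &slots) {
    for (const auto &o : ops) {
        auto &s = slots[o.offset];
        if (o.alloc) {
            s.ptr = std::malloc(o.size); // benchAlloc() in the real benchmark
            s.size = o.size;
        } else {
            std::free(s.ptr);            // benchFree() in the real benchmark
            s.ptr = nullptr;
            s.size = 0;
        }
    }
}
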
@@ -412,13 +418,14 @@ class multiple_malloc_free_benchmark : public benchmark_interface<Size, Alloc> {
     }

   private:
-    void prealloc(benchmark::State &state) {
+    virtual void prealloc(benchmark::State &state) {
         auto tid = state.thread_index();
         auto &i = allocations[tid];
         i.resize(max_allocs);
         auto sizeGenerator = base::alloc_sizes[tid];

-        for (size_t j = 0; j < max_allocs; j++) {
+        // Preallocate half of the available slots, for allocations
+        for (size_t j = 0; j < max_allocs / 2; j++) {
             auto size = sizeGenerator.nextSize();
             i[j].ptr = base::allocator.benchAlloc(size);
             if (i[j].ptr == NULL) {
@@ -441,20 +448,168 @@ class multiple_malloc_free_benchmark : public benchmark_interface<Size, Alloc> {
         }
     }

-    void prepareWorkload(benchmark::State &state) {
+    virtual void prepareWorkload(benchmark::State &state) {
         auto tid = state.thread_index();
         auto &n = next[tid];
+
+        // Create generators for random index selection and binary decision.
+        using distribution = std::uniform_int_distribution<size_t>;
         std::default_random_engine generator;
-        distribution dist;
+        distribution dist_offset(0, max_allocs - 1);
+        distribution dist_opt_type(0, 1);
         generator.seed(0);
-        dist.param(distribution::param_type(0, max_allocs - 1));
+
         auto sizeGenerator = base::alloc_sizes[tid];
+        std::vector<size_t> free;
+        std::vector<size_t> allocated;
+        free.reserve(max_allocs / 2);
+        allocated.reserve(max_allocs / 2);
+        // Preallocate memory: initially, half the indices are allocated.
+        // See prealloc() function;
+        size_t i = 0;
+        while (i < max_allocs / 2) {
+            allocated.push_back(i++);
+        }
+        // The remaining indices are marked as free.
+        while (i < max_allocs) {
+            free.push_back(i++);
+        }

         n.clear();
         for (int64_t j = 0; j < state.max_iterations * allocsPerIterations;
              j++) {
-            n.push_back({dist(generator), sizeGenerator.nextSize()});
+            // Decide whether to allocate or free:
+            // - If no allocations exist, allocation is forced.
+            // - If there is maximum number of allocation, free is forced
+            // - Otherwise, use a binary random choice (0 or 1)
+            if (allocated.empty() ||
+                (dist_opt_type(generator) == 0 && !free.empty())) {
+                // Allocation:
+                std::swap(free[dist_offset(generator) % free.size()],
+                          free.back());
+                auto offset = free.back();
+                free.pop_back();
+
+                n.push_back({true, offset, sizeGenerator.nextSize()});
+                allocated.push_back(offset);
+            } else {
+                // Free
+                std::swap(allocated[dist_offset(generator) % allocated.size()],
+                          allocated.back());
+                auto offset = allocated.back();
+                allocated.pop_back();
+
+                n.push_back({false, offset, 0});
+                free.push_back(offset);
+            }
         }
+
         next_iter[tid] = std::make_unique<next_alloc_data_iterator>(n.cbegin());
     }
 };
+// This class benchmarks performance by randomly allocating and freeing memory.
+// Initially, it slowly increases the memory footprint, and later decreases it.
+template <
+    typename Size, typename Alloc,
+    typename =
+        std::enable_if_t<std::is_base_of<alloc_size_interface, Size>::value>,
+    typename =
+        std::enable_if_t<std::is_base_of<allocator_interface, Alloc>::value>>
+class peak_alloc_benchmark
+    : public multiple_malloc_free_benchmark<Size, Alloc> {
+    using base = multiple_malloc_free_benchmark<Size, Alloc>;
+    virtual void prepareWorkload(benchmark::State &state) override {
+        // Retrieve the thread index and corresponding operation buffer.
+        auto tid = state.thread_index();
+        auto &n = this->next[tid];
+
+        // Set up the random generators for index selection and decision making.
+        std::default_random_engine generator;
+        std::uniform_int_distribution<size_t> dist_offset(0,
+                                                          this->max_allocs - 1);
+        std::uniform_real_distribution<double> dist_opt_type(0, 1);
+        generator.seed(0);
+        auto sizeGenerator = this->alloc_sizes[tid];
+
+        n.clear();
+        std::vector<size_t> free;
+        std::vector<size_t> allocated;
+        free.reserve(this->max_allocs);
+        // Initially, all indices are available.
+        for (size_t i = 0; i < this->max_allocs; i++) {
+            free.push_back(i);
+        }
+
+        // Total number of allocation/free operations to simulate.
+        int64_t operations_number =
+            state.max_iterations * this->allocsPerIterations;
+        for (int64_t j = 0; j < operations_number; j++) {
+            int64_t target_allocation;
+
+            // Determine the target number of allocations based on the progress of the iterations.
+            // In the first half of the iterations, the target allocation increases linearly.
+            // In the second half, it decreases linearly.
+            if (j < operations_number / 2) {
+                target_allocation = 2 * static_cast<int64_t>(this->max_allocs) *
+                                    j / operations_number;
+            } else {
+                target_allocation = -2 *
+                                        static_cast<int64_t>(this->max_allocs) *
+                                        j / operations_number +
+                                    2 * static_cast<int64_t>(this->max_allocs);
+            }
+
+            // x represents the gap between the target and current allocations.
+            auto x = static_cast<double>(target_allocation -
+                                         static_cast<double>(allocated.size()));
+
+            // Use a normal CDF with high sigma so that when x is positive,
+            // we are slightly more likely to allocate,
+            // and when x is negative, slightly more likely to free memory,
+            // keeping the overall change gradual.
+
+            const double sigma = 1000;
+            auto cdf = normalCDF(x, sigma);
+
+            // Decide whether to allocate or free:
+            // - If no allocations exist, allocation is forced.
+            // - If there is maximum number of allocation, free is forced
+            // - Otherwise, Based on the computed probability, choose whether to allocate or free
+            if (allocated.empty() ||
+                (!free.empty() && cdf > dist_opt_type(generator))) {
+                // Allocation
+                std::swap(free[dist_offset(generator) % free.size()],
+                          free.back());
+                auto offset = free.back();
+                free.pop_back();
+                n.push_back({true, offset, sizeGenerator.nextSize()});
+                allocated.push_back(offset);
+            } else {
+                // Free
+                std::swap(allocated[dist_offset(generator) % allocated.size()],
+                          allocated.back());
+                auto offset = allocated.back();
+                allocated.pop_back();
+                n.push_back({false, offset, 0});
+                free.push_back(offset);
+            }
+        }
+
+        this->next_iter[tid] =
+            std::make_unique<std::vector<next_alloc_data>::const_iterator>(
+                n.cbegin());
+    }
+
+    virtual void prealloc(benchmark::State &state) {
+        auto tid = state.thread_index();
+        auto &i = base::allocations[tid];
+        i.resize(base::max_allocs);
+    }
+    virtual std::string name() { return base::base::name() + "/peak_alloc"; }
+
+  private:
+    // Function to calculate the CDF of a normal distribution
+    double normalCDF(double x, double sigma = 1.0, double mu = 0.0) {
+        return 0.5 * (1 + std::erf((x - mu) / (sigma * std::sqrt(2.0))));
+    }
+};
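
To make the peak workload concrete: with M = max_allocs slots and N = state.max_iterations * allocsPerIterations total recorded operations, the prepareWorkload() override above follows a triangular target footprint and a normal-CDF decision rule:

    target(j) = 2*M*j / N              for j <  N/2
    target(j) = 2*M - 2*M*j / N        for j >= N/2

    P(operation j is an allocation) = Phi((target(j) - allocated_j) / sigma),  with sigma = 1000

where allocated_j is the number of slots currently held and Phi is the standard normal CDF computed by normalCDF() (mu = 0). The footprint therefore ramps up to roughly M live allocations at the midpoint and back down toward zero, with each individual step only gently biased toward the target. As an illustration (numbers invented for the example): with M = 10000 and N = 500000, at j = 125000 the target is 5000; if 4800 slots are currently allocated, then x = 200 and Phi(200 / 1000) = Phi(0.2) ≈ 0.58, so that step is recorded as an allocation with probability of about 58% and as a free otherwise.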
