From a2c56b0a57410357efe54225ae7606d4a80310f1 Mon Sep 17 00:00:00 2001 From: Michal Zurkowski Date: Sat, 15 May 2021 17:59:07 +0200 Subject: [PATCH] Properly flush memory in benchmark and after each block. --- algo/gr/cryptonote/cryptonight.c | 10 ---- algo/gr/gr-4way.c | 21 ++------ algo/gr/gr-gate.c | 83 ++++++++++++++++++++++---------- algo/gr/gr-gate.h | 5 +- algo/gr/gr.c | 9 ++-- configure | 20 ++++---- configure.ac | 2 +- cpu-miner.c | 22 +++------ virtual_memory.c | 69 ++++++++++++++++++++++---- virtual_memory.h | 9 +++- 10 files changed, 155 insertions(+), 95 deletions(-) diff --git a/algo/gr/cryptonote/cryptonight.c b/algo/gr/cryptonote/cryptonight.c index 8b226ba..924df4e 100755 --- a/algo/gr/cryptonote/cryptonight.c +++ b/algo/gr/cryptonote/cryptonight.c @@ -41,13 +41,6 @@ static void do_skein_hash(const void *input, size_t len, void *output) { static void (*const extra_hashes[4])(const void *, size_t, void *) = { do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash}; -static __attribute__((always_inline)) uint64_t -__umul128(const uint64_t *a, const uint64_t *b, uint64_t *hi) { - unsigned __int128 r = (unsigned __int128)(*a) * (unsigned __int128)(*b); - *hi = r >> 64; - return (uint64_t)r; -} - // This will shift and xor tmp1 into itself as 4 32-bit vals such as // sl_xor(a1 a2 a3 a4) = a1 (a2^a1) (a3^a2^a1) (a4^a3^a2^a1) static inline __m128i sl_xor(__m128i tmp1) { @@ -393,7 +386,6 @@ cryptonight_hash(const void *input, void *output, const uint32_t memory, cl = ((uint64_t *)(&l0[idx0]))[0]; ch = ((uint64_t *)(&l0[idx0]))[1]; - // lo = __umul128(&cxl0, &cl, &hi); __asm("mulq %3\n\t" : "=d"(hi), "=a"(lo) : "1"(cxl0), "rm"(cl) : "cc"); al0 += hi; @@ -535,7 +527,6 @@ cryptonight_2way_hash(const void *input0, const void *input1, void *output0, cl = ((uint64_t *)(&l0[idx0]))[0]; ch = ((uint64_t *)(&l0[idx0]))[1]; - // lo = __umul128(&cxl0, &cl, &hi); __asm("mulq %3\n\t" : "=d"(hi), "=a"(lo) : "1"(cxl0), "rm"(cl) : "cc"); al0 += hi; @@ -553,7 +544,6 @@ cryptonight_2way_hash(const void *input0, const void *input1, void *output0, cl = ((uint64_t *)(&l1[idx1]))[0]; ch = ((uint64_t *)(&l1[idx1]))[1]; - // lo = __umul128(&cxl1, &cl, &hi); __asm("mulq %3\n\t" : "=d"(hi), "=a"(lo) : "1"(cxl1), "rm"(cl) : "cc"); al1 += hi; diff --git a/algo/gr/gr-4way.c b/algo/gr/gr-4way.c index 94628b4..399a5de 100755 --- a/algo/gr/gr-4way.c +++ b/algo/gr/gr-4way.c @@ -1,5 +1,4 @@ #include "gr-gate.h" -#include "virtual_memory.h" #if defined(GR_4WAY) @@ -397,22 +396,6 @@ int scanhash_gr_4way(struct work *work, uint32_t max_nonce, __m256i *noncev = (__m256i *)vdata + 9; // aligned volatile uint8_t *restart = &(work_restart[thr_id].restart); - if (hp_state == NULL) { - // Allocate 4MiB instead of 2MiB in case 2way Fast is used. - if (opt_tune || opt_benchmark_config || cn_config[Fast] == 1) { - hp_state = (uint8_t *)AllocateMemory(1 << 22); - } else if (opt_tuned) { - for (int i = 0; i < 20; i++) { - if (cn_tune[i][5] == 1) { - hp_state = (uint8_t *)AllocateMemory(1 << 22); - } - } - } - if (hp_state == NULL) { - hp_state = (uint8_t *)AllocateMemory(1 << 21); - } - } - if (opt_tune) { tune(pdata, thr_id); opt_tuned = true; // Tuned. @@ -453,6 +436,10 @@ int scanhash_gr_4way(struct work *work, uint32_t max_nonce, } } + // Allocates hp_state for Cryptonight algorithms. + // Needs to be run AFTER gr_hash_order is set! + AllocateNeededMemory(); + *noncev = mm256_intrlv_blend_32( _mm256_set_epi32(n + 3, 0, n + 2, 0, n + 1, 0, n, 0), *noncev); diff --git a/algo/gr/gr-gate.c b/algo/gr/gr-gate.c index 4d53635..3eab2b5 100755 --- a/algo/gr/gr-gate.c +++ b/algo/gr/gr-gate.c @@ -1,5 +1,6 @@ #include "gr-gate.h" -#include // usleep +#include "virtual_memory.h" // Memory allocation. +#include // usleep // Only 3 CN algos are selected from available 6. __thread uint8_t gr_hash_order[GR_HASH_FUNC_COUNT - 3 + 1]; @@ -109,6 +110,40 @@ void gr_getAlgoString(const uint8_t *block, uint8_t *selectedAlgoOutput) { } } +// Mapping of gr_harh_order CN to cn-config - lightest to heaviest order. +// Config: Turtlelite, Turtle, Darklite, Dark, Lite, Fast. +// Gr_Hash: Dark, Darklite, Fast, Lite, Turtle, Turtlelite +static uint8_t cn_map[6] = {3, 2, 5, 4, 1, 0}; + +static size_t GetMaxCnSize() { + // Memory requirements for each CN variant + size_t cn_req[6] = {262144, 262144, 524288, 524288, 1048576, 2097152}; + // Check tune/config if given variant uses 2way that requires 2x memory. + // cn_config should contain only 0 values in non GR_4WAY. + for (int i = 0; i < 6; i++) { + cn_req[i] *= (cn_config[i] + 1); + } + + size_t order[3] = {cn_map[gr_hash_order[5] - 15], + cn_map[gr_hash_order[11] - 15], + cn_map[gr_hash_order[17] - 15]}; + size_t max = + cn_req[order[0]] > cn_req[order[1]] ? cn_req[order[0]] : cn_req[order[1]]; + max = max > cn_req[order[2]] ? max : cn_req[order[2]]; + + return max; +} + +void AllocateNeededMemory() { + size_t size = GetMaxCnSize(); + if (opt_debug) { + applog(LOG_DEBUG, "Current Cryptonight variants require: %lu memory", size); + } + + // Purges previous memory allocation and creates new one. + PrepareMemory((void **)&hp_state, size); +} + void select_tuned_config() { for (size_t i = 0; i < 20; i++) { if (cn[i][0] + 15 == gr_hash_order[5] || @@ -223,9 +258,7 @@ void *statistic_thread(void *arg) { } } -#ifdef __AVX2__ - -static uint8_t cn_map[6] = {3, 2, 5, 4, 1, 0}; +#if defined(GR_4WAY) static void tune_config(void *input, int thr_id, int rot) { srand(time(NULL) + thr_id); @@ -251,6 +284,10 @@ static void tune_config(void *input, int thr_id, int rot) { gr_hash_order[5] = cn[rot][0] + 15; gr_hash_order[11] = cn[rot][1] + 15; gr_hash_order[17] = cn[rot][2] + 15; + + // Purge memory for test. + AllocateNeededMemory(); + // Set desired CN config. sync_bench(); sync_bench(); @@ -309,28 +346,21 @@ void tune(void *input, int thr_id) { tune_config(input, thr_id, i); sync_conf(); if (thr_id == 0) { - // TODO - // Do not set the improvement if Fast variant is included. - // Possible bug/inaccuracy in benchmarking with it set as 1. - // Can be reproduced with 5000 series Ryzens. - if (cn_map[cn[i][0]] != 5 && cn_map[cn[i][1]] != 5 && - cn_map[cn[i][2]] != 5) { - if (best_hashrate < bench_hashrate) { - if (opt_debug) { - applog(LOG_DEBUG, "%d -> %d | %d -> %d | %d -> %d", cn[i][0], - (config & 1) >> 0, cn[i][1], (config & 2) >> 1, cn[i][2], - (config & 4) >> 2); - } - cn_tune[i][cn_map[cn[i][0]]] = (config & 1) >> 0; - cn_tune[i][cn_map[cn[i][1]]] = (config & 2) >> 1; - cn_tune[i][cn_map[cn[i][2]]] = (config & 4) >> 2; - if (opt_debug) { - applog(LOG_DEBUG, "Config for rotation %d: %d %d %d %d %d %d", i, - cn_tune[i][0], cn_tune[i][1], cn_tune[i][2], cn_tune[i][3], - cn_tune[i][4], cn_tune[i][5]); - } - best_hashrate = bench_hashrate; + if (best_hashrate < bench_hashrate) { + if (opt_debug) { + applog(LOG_DEBUG, "%d -> %d | %d -> %d | %d -> %d", cn[i][0], + (config & 1) >> 0, cn[i][1], (config & 2) >> 1, cn[i][2], + (config & 4) >> 2); } + cn_tune[i][cn_map[cn[i][0]]] = (config & 1) >> 0; + cn_tune[i][cn_map[cn[i][1]]] = (config & 2) >> 1; + cn_tune[i][cn_map[cn[i][2]]] = (config & 4) >> 2; + if (opt_debug) { + applog(LOG_DEBUG, "Config for rotation %d: %d %d %d %d %d %d", i, + cn_tune[i][0], cn_tune[i][1], cn_tune[i][2], cn_tune[i][3], + cn_tune[i][4], cn_tune[i][5]); + } + best_hashrate = bench_hashrate; } bench_hashrate = 0; bench_time = 0; @@ -403,6 +433,9 @@ void benchmark(void *input, int thr_id, long sleep_time) { select_tuned_config(); } + // Purge memory for test. + AllocateNeededMemory(); + sync_bench(); // Rotation change sync. if (rotation == 0) { if (likely(stop_benchmark)) { diff --git a/algo/gr/gr-gate.h b/algo/gr/gr-gate.h index f1dad83..be844b7 100755 --- a/algo/gr/gr-gate.h +++ b/algo/gr/gr-gate.h @@ -159,11 +159,14 @@ int scanhash_gr(struct work *work, uint32_t max_nonce, uint64_t *hashes_done, extern __thread uint8_t *hp_state; // Values for 20 CN rotations. -const static uint8_t cn[20][3] = { +static const uint8_t cn[20][3] = { {0, 1, 2}, {0, 1, 3}, {0, 1, 4}, {0, 1, 5}, {0, 2, 3}, {0, 2, 4}, {0, 2, 5}, {0, 3, 4}, {0, 3, 5}, {0, 4, 5}, {1, 2, 3}, {1, 2, 4}, {1, 2, 5}, {1, 3, 4}, {1, 3, 5}, {1, 4, 5}, {2, 3, 4}, {2, 3, 5}, {2, 4, 5}, {3, 4, 5}}; +// Uses hp_state as memory. +void AllocateNeededMemory(); + void select_tuned_config(); void tune(void *input, int thr_id); diff --git a/algo/gr/gr.c b/algo/gr/gr.c index 7b7e2fc..c7fba6f 100755 --- a/algo/gr/gr.c +++ b/algo/gr/gr.c @@ -1,5 +1,4 @@ #include "gr-gate.h" -#include "virtual_memory.h" int gr_hash(void *output, const void *input, int thrid) { uint64_t hash[8] __attribute__((aligned(64))); @@ -135,10 +134,6 @@ int scanhash_gr(struct work *work, uint32_t max_nonce, uint64_t *hashes_done, uint32_t nonce = first_nonce; volatile uint8_t *restart = &(work_restart[thr_id].restart); - if (hp_state == NULL) { - hp_state = (uint8_t *)AllocateMemory(1 << 21); - } - if (opt_benchmark_config) { benchmark_configs(pdata, thr_id); } @@ -165,6 +160,10 @@ int scanhash_gr(struct work *work, uint32_t max_nonce, uint64_t *hashes_done, } } + // Allocates hp_state for Cryptonight algorithms. + // Needs to be run AFTER gr_hash_order is set! + AllocateNeededMemory(); + do { edata[19] = nonce; if (gr_hash(hash32, edata, thr_id)) { diff --git a/configure b/configure index 84b2ecc..a3e422c 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt-gr 1.1.4g. +# Generated by GNU Autoconf 2.69 for cpuminer-opt-gr 1.1.5. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt-gr' PACKAGE_TARNAME='cpuminer-opt-gr' -PACKAGE_VERSION='1.1.4g' -PACKAGE_STRING='cpuminer-opt-gr 1.1.4g' +PACKAGE_VERSION='1.1.5' +PACKAGE_STRING='cpuminer-opt-gr 1.1.5' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt-gr 1.1.4g to adapt to many kinds of systems. +\`configure' configures cpuminer-opt-gr 1.1.5 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt-gr 1.1.4g:";; + short | recursive ) echo "Configuration of cpuminer-opt-gr 1.1.5:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt-gr configure 1.1.4g +cpuminer-opt-gr configure 1.1.5 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt-gr $as_me 1.1.4g, which was +It was created by cpuminer-opt-gr $as_me 1.1.5, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt-gr' - VERSION='1.1.4g' + VERSION='1.1.5' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt-gr $as_me 1.1.4g, which was +This file was extended by cpuminer-opt-gr $as_me 1.1.5, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt-gr config.status 1.1.4g +cpuminer-opt-gr config.status 1.1.5 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index cc1a7ac..aa19f4d 100755 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt-gr], [1.1.4g]) +AC_INIT([cpuminer-opt-gr], [1.1.5]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index ed86fb3..6a8bf92 100755 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -148,17 +148,8 @@ bool opt_verify = false; // Default config for CN variants. // 0 - Use default 1way/SSE // 1 - Use 2way algorithm. -// Use defines to keep compatibility with v1.1.1 -#if defined(GR_4WAY_HEAVY) -__thread uint8_t cn_config[6] = {1, 1, 1, 1, 1, 1}; -uint8_t cn_config_global[6] = {1, 1, 1, 1, 1, 1}; -#elif defined(GR_4WAY_MEDIUM) -__thread uint8_t cn_config[6] = {0, 1, 1, 1, 0, 0}; -uint8_t cn_config_global[6] = {0, 1, 1, 1, 0, 0}; -#else __thread uint8_t cn_config[6] = {0, 0, 0, 0, 0, 0}; uint8_t cn_config_global[6] = {0, 0, 0, 0, 0, 0}; -#endif bool opt_tune = false; bool opt_tuned = false; @@ -3200,10 +3191,12 @@ static bool load_tune_config(char *config_name) { return false; } for (int i = 0; i < 20; i++) { - fscanf(fd, "%hhd %hhd %hhd %hhd %hhd %hhd\n", &cn_tune[i][0], - &cn_tune[i][1], &cn_tune[i][2], &cn_tune[i][3], &cn_tune[i][4], - &cn_tune[i][5]); - if (ferror(fd) != 0) { + size_t read = fscanf(fd, + "%" SCNu8 " %" SCNu8 " %" SCNu8 " %" SCNu8 " %" SCNu8 + " %" SCNu8 "\n", + &cn_tune[i][0], &cn_tune[i][1], &cn_tune[i][2], + &cn_tune[i][3], &cn_tune[i][4], &cn_tune[i][5]); + if (ferror(fd) != 0 || read != 6) { applog(LOG_ERR, "Could not read from %s file", config_name); return false; } @@ -4002,8 +3995,7 @@ int main(int argc, char *argv[]) { #endif // Prepare and check Large Pages. At least 4MiB per thread. - huge_pages = InitHugePages(opt_n_threads * 2); - if (!huge_pages) { + if (!InitHugePages(opt_n_threads * 4)) { applog(LOG_ERR, "Could not prepare Huge Pages."); } else { applog(LOG_BLUE, "Huge Pages set up successfuly."); diff --git a/virtual_memory.c b/virtual_memory.c index ea8f53b..202ded8 100755 --- a/virtual_memory.c +++ b/virtual_memory.c @@ -2,6 +2,10 @@ #include "miner.h" // applog #include "stdio.h" +static bool huge_pages = false; +__thread bool allocated_hp = false; +__thread size_t currently_allocated = 0; + #ifdef __MINGW32__ // Windows #define UNICODE @@ -146,7 +150,10 @@ static BOOL TrySetLockPagesPrivilege() { return ObtainLockPagesPrivilege() && SetLockPagesPrivilege(); } -bool InitHugePages(size_t threads) { return TrySetLockPagesPrivilege(); } +bool InitHugePages(size_t threads) { + huge_pages = TrySetLockPagesPrivilege(); + return huge_pages +} void *AllocateLargePagesMemory(size_t size) { const size_t min = GetLargePageMinimum(); @@ -160,6 +167,12 @@ void *AllocateLargePagesMemory(size_t size) { return mem; } +void DeallocateLargePagesMemory(void **memory) { + VirtualFree(*memory, currently_allocated, MEM_RELEASE); + *memory = NULL; + allocated_hp = false; +} + #else // Linux #include @@ -171,9 +184,9 @@ static inline int read_hp(const char *path) { } uint64_t value = 0; - fscanf(fd, "%lu", &value); + size_t read = fscanf(fd, "%lu", &value); fclose(fd); - if (ferror(fd) != 0) { + if (ferror(fd) != 0 || read != 1) { return -2; } return (int)value; @@ -200,20 +213,25 @@ bool InitHugePages(size_t threads) { "hugepages-2048kB/free_hugepages"; int available_pages = read_hp(free_path); if (available_pages < 0) { - return false; + huge_pages = false; + return huge_pages; } if (available_pages >= (int)threads) { - return true; + huge_pages = true; + return huge_pages; } const char *nr_path = "/sys/devices/system/node/node0/hugepages/" "hugepages-2048kB/nr_hugepages"; int set_pages = read_hp(nr_path); set_pages = set_pages < 0 ? 0 : set_pages; - return write_hp(nr_path, (size_t)set_pages + threads - available_pages); + huge_pages = write_hp(nr_path, (size_t)set_pages + threads - available_pages); + return huge_pages; } #define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) void *AllocateLargePagesMemory(size_t size) { + // Needs to be multiple of Large Pages (2 MiB). + size = ((size / 2097152) * 2097152) + 2097152; #if defined(__FreeBSD__) void *mem = mmap(0, size, PROT_READ | PROT_WRITE, @@ -245,6 +263,17 @@ void *AllocateLargePagesMemory(size_t size) { return mem == MAP_FAILED ? NULL : mem; } +void DeallocateLargePagesMemory(void **memory) { + // Needs to be multiple of Large Pages (2 MiB). + size_t size = ((currently_allocated / 2097152) * 2097152) + 2097152; + int status = munmap(*memory, size); + if (status != 0) { + applog(LOG_ERR, "Could not properly deallocate memory!"); + } + *memory = NULL; + allocated_hp = false; +} + #endif // __MINGW32__ void *AllocateMemory(size_t size) { @@ -254,10 +283,30 @@ void *AllocateMemory(size_t size) { applog(LOG_NOTICE, "Using malloc as allocation method"); } mem = malloc(size); + allocated_hp = false; + if (mem == NULL) { + applog(LOG_ERR, "Could not allocate any memory for thread"); + exit(1); + } + } else { + allocated_hp = true; } - if (mem == NULL) { - applog(LOG_ERR, "Could not allocate any memory for thread"); - exit(1); - } + currently_allocated = size; return mem; } + +void DeallocateMemory(void **memory) { + if (allocated_hp) { + DeallocateLargePagesMemory(memory); + } else if (*memory != NULL) { + // No special method of allocation was used. + free(*memory); + } +} + +void PrepareMemory(void **memory, size_t size) { + if (*memory != NULL) { + DeallocateMemory(memory); + } + *memory = (void *)AllocateMemory(size); +} diff --git a/virtual_memory.h b/virtual_memory.h index bf0fb69..84f3b23 100755 --- a/virtual_memory.h +++ b/virtual_memory.h @@ -5,12 +5,19 @@ #include #include -static bool huge_pages = false; +// Store allocation method and size. +extern __thread bool allocated_hp; +extern __thread size_t currently_allocated; bool InitHugePages(size_t threads); void *AllocateLargePagesMemory(size_t size); +void DeallocateLargePagesMemory(void **memory); void *AllocateMemory(size_t size); +void DeallocateMemory(void **memory); + +void PrepareMemory(void **memory, size_t size); + #endif // VIRTUAL_MEMORY_H_