Change dev stratum switch. Cn improvements.
Added minor improvements to the Cryptonight algo, around 1-2% faster.
Fixed issues with frequent stratum switching to and from the dev pool.
Do not switch stratum if the current pool is in the dev pool list.
Updated release build scripts to include -mtune for Ryzen processors.
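The stratum fix above amounts to a guard before any pool change: if the currently connected stratum is already one of the dev pools, the miner stays put instead of bouncing back and forth. A minimal sketch of that guard in C follows; dev_pools, pool_is_dev, and switch_stratum_if_needed are illustrative names only, not the miner's actual identifiers or URLs.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* Illustrative dev-pool list; the real miner keeps its own table. */
static const char *dev_pools[] = {
    "stratum+tcp://dev-pool-1.example.com:3333",
    "stratum+tcp://dev-pool-2.example.com:3333",
};

/* Return true if the given stratum URL is one of the dev pools. */
static bool pool_is_dev(const char *url) {
  for (size_t i = 0; i < sizeof(dev_pools) / sizeof(dev_pools[0]); i++) {
    if (strcmp(url, dev_pools[i]) == 0)
      return true;
  }
  return false;
}

/* Only switch when the active connection is not already a dev pool. */
static bool switch_stratum_if_needed(const char *current_url,
                                     const char *target_url) {
  if (pool_is_dev(current_url))
    return false; /* stay put; avoids ping-ponging to and from the dev pool */
  return strcmp(current_url, target_url) != 0;
}

int main(void) {
  const char *current = "stratum+tcp://dev-pool-1.example.com:3333";
  const char *target = "stratum+tcp://user-pool.example.org:4444";
  /* Prints "switch: no" because the active connection is a dev pool. */
  printf("switch: %s\n",
         switch_stratum_if_needed(current, target) ? "yes" : "no");
  return 0;
}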
michal-zurkowski committed May 16, 2021
1 parent a2c56b0 commit 96cc604
Showing 8 changed files with 151 additions and 108 deletions.
93 changes: 42 additions & 51 deletions algo/gr/cryptonote/cryptonight.c
@@ -18,6 +18,14 @@
#include "soft_aes.h"
#endif

// Replacement macro for architectures without SSE4.2.
#ifndef __SSE42__
#define _mm_extract_epi64(a, b)                                                \
  ((b) == 1 ? _mm_cvtsi128_si64(_mm_castps_si128(                              \
                  _mm_movehl_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(a))))    \
            : _mm_cvtsi128_si64(a))
#endif // __SSE42__

extern __thread uint8_t *hp_state;

static void do_blake_hash(const void *input, size_t len, void *output) {
@@ -151,9 +159,9 @@ aes_round(const __m128i *key, __m128i *x0, __m128i *x1, __m128i *x2,
}
#endif

// Size is number of 64B words. // Must be multiple of 8.
// 128 -> 8 KiB per thread needed.
#define PREFETCH_SIZE 128
// L1 prefetch size in bytes. 4 KiB per thread.
#define PREFETCH_SIZE_B 4096
#define PREFETCH_SIZE (PREFETCH_SIZE_B / 64)
#define PREFETCH_TYPE_R _MM_HINT_T0
#define PREFETCH_TYPE_W _MM_HINT_ET0

@@ -174,9 +182,9 @@ static inline void explode_scratchpad(const __m128i *input, __m128i *output,
xin7 = _mm_load_si128(input + 11);

size_t i;
// Prefetch first X KiB of output into L2 cache.
// Prefetch first X KiB of output into L1 cache.
for (i = 0; i < PREFETCH_SIZE; i += 4) {
_mm_prefetch(output + i, _MM_HINT_ET0);
_mm_prefetch(output + i, PREFETCH_TYPE_W);
}

for (i = 0; i < (memory / sizeof(__m128i)) - PREFETCH_SIZE; i += 8) {
@@ -246,7 +254,7 @@ static inline void implode_scratchpad(const __m128i *input, __m128i *output,
xout7 = _mm_load_si128(output + 11);

size_t i;
// Prefetch first X KiB of input into L2 cache.
// Prefetch first X KiB of input into L1 cache.
for (i = 0; i < PREFETCH_SIZE; i += 4) {
_mm_prefetch(input + i, PREFETCH_TYPE_R);
}
@@ -365,26 +373,23 @@ cryptonight_hash(const void *input, void *output, const uint32_t memory,
#endif

// Post AES
__m128i tmp = _mm_xor_si128(bx0, cx0);
const __m128i tmp = _mm_xor_si128(bx0, cx0);
((uint64_t *)(&l0[idx0]))[0] = _mm_cvtsi128_si64(tmp);

tmp = _mm_castps_si128(
_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
uint64_t vh = _mm_cvtsi128_si64(tmp);
const uint64_t vh = _mm_extract_epi64(tmp, 1);

const uint8_t x = (uint8_t)(vh >> 24);
static const uint16_t table = 0x7531;
const uint8_t index = (((x >> (3)) & 6) | (x & 1)) << 1;
vh ^= ((table >> index) & 0x3) << 28;

((uint64_t *)(&l0[idx0]))[1] = vh;
((uint64_t *)(&l0[idx0]))[1] = vh ^ (((table >> index) & 0x3) << 28);

const uint64_t cxl0 = (uint64_t)(_mm_cvtsi128_si64(cx0));
idx0 = cxl0 & mask;

uint64_t hi, lo, cl, ch;
cl = ((uint64_t *)(&l0[idx0]))[0];
ch = ((uint64_t *)(&l0[idx0]))[1];
register uint64_t hi, lo;
const uint64_t cl = ((const uint64_t *)(&l0[idx0]))[0];
const uint64_t ch = ((const uint64_t *)(&l0[idx0]))[1];

__asm("mulq %3\n\t" : "=d"(hi), "=a"(lo) : "1"(cxl0), "rm"(cl) : "cc");

@@ -440,14 +445,6 @@ void cryptonight_turtlelite_hash(const void *input, void *output) {

#ifdef __AVX2__ // GR_4WAY

// GCC 7.5 does not have _mm256_set_m128i
#ifdef __GNUC__
#if __GNUC__ < 8
#define _mm256_set_m128i(a, b) \
_mm256_insertf128_si256(_mm256_castsi128_si256(b), a, 1)
#endif // __GNUC__ < 8
#endif // __GNUC__

// Requires 2x memory allocated in hp_state.
__attribute__((always_inline)) void
cryptonight_2way_hash(const void *input0, const void *input1, void *output0,
@@ -499,64 +496,58 @@ cryptonight_2way_hash(const void *input0, const void *input1, void *output0,
_mm_prefetch(&l1[cxl1 & mask], _MM_HINT_ET0);

// Post AES
__m128i tmp = _mm_xor_si128(bx0, cx0);
((uint64_t *)(&l0[idx0]))[0] = _mm_cvtsi128_si64(tmp);
tmp = _mm_castps_si128(
_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
uint64_t vh = _mm_cvtsi128_si64(tmp);
const uint8_t x0 = (uint8_t)(vh >> 24);
const __m128i tmp0 = _mm_xor_si128(bx0, cx0);
((uint64_t *)(&l0[idx0]))[0] = _mm_cvtsi128_si64(tmp0);
const uint64_t vh0 = _mm_extract_epi64(tmp0, 1);
const uint8_t x0 = (uint8_t)(vh0 >> 24);
static const uint16_t table = 0x7531;
const uint8_t index0 = (((x0 >> (3)) & 6) | (x0 & 1)) << 1;
vh ^= ((table >> index0) & 0x3) << 28;
((uint64_t *)(&l0[idx0]))[1] = vh;

tmp = _mm_xor_si128(bx1, cx1);
((uint64_t *)(&l1[idx1]))[0] = _mm_cvtsi128_si64(tmp);
tmp = _mm_castps_si128(
_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
vh = _mm_cvtsi128_si64(tmp);
const uint8_t x1 = (uint8_t)(vh >> 24);
((uint64_t *)(&l0[idx0]))[1] = vh0 ^ (((table >> index0) & 0x3) << 28);

const __m128i tmp1 = _mm_xor_si128(bx1, cx1);
((uint64_t *)(&l1[idx1]))[0] = _mm_cvtsi128_si64(tmp1);
const uint64_t vh1 = _mm_extract_epi64(tmp1, 1);
const uint8_t x1 = (uint8_t)(vh1 >> 24);
const uint8_t index1 = (((x1 >> (3)) & 6) | (x1 & 1)) << 1;
vh ^= ((table >> index1) & 0x3) << 28;
((uint64_t *)(&l1[idx1]))[1] = vh;
((uint64_t *)(&l1[idx1]))[1] = vh1 ^ (((table >> index1) & 0x3) << 28);

idx0 = cxl0 & mask;
idx1 = cxl1 & mask;

uint64_t hi, lo, cl, ch;
cl = ((uint64_t *)(&l0[idx0]))[0];
ch = ((uint64_t *)(&l0[idx0]))[1];
register uint64_t hi, lo;
const uint64_t cl0 = ((const uint64_t *)(&l0[idx0]))[0];
const uint64_t ch0 = ((const uint64_t *)(&l0[idx0]))[1];

__asm("mulq %3\n\t" : "=d"(hi), "=a"(lo) : "1"(cxl0), "rm"(cl) : "cc");
__asm("mulq %3\n\t" : "=d"(hi), "=a"(lo) : "1"(cxl0), "rm"(cl0) : "cc");

al0 += hi;
ah0 += lo;

((uint64_t *)(&l0[idx0]))[0] = al0;
((uint64_t *)(&l0[idx0]))[1] = ah0 ^ tweak1_2_0;

al0 ^= cl;
al0 ^= cl0;
idx0 = al0 & mask;
_mm_prefetch(&l0[idx0], _MM_HINT_ET0);

ah0 ^= ch;
ah0 ^= ch0;

cl = ((uint64_t *)(&l1[idx1]))[0];
ch = ((uint64_t *)(&l1[idx1]))[1];
const uint64_t cl1 = ((const uint64_t *)(&l1[idx1]))[0];
const uint64_t ch1 = ((const uint64_t *)(&l1[idx1]))[1];

__asm("mulq %3\n\t" : "=d"(hi), "=a"(lo) : "1"(cxl1), "rm"(cl) : "cc");
__asm("mulq %3\n\t" : "=d"(hi), "=a"(lo) : "1"(cxl1), "rm"(cl1) : "cc");

al1 += hi;
ah1 += lo;

((uint64_t *)(&l1[idx1]))[0] = al1;
((uint64_t *)(&l1[idx1]))[1] = ah1 ^ tweak1_2_1;

al1 ^= cl;
al1 ^= cl1;
idx1 = al1 & mask;
_mm_prefetch(&l1[idx1], _MM_HINT_ET0);

ah1 ^= ch;
ah1 ^= ch1;

bx0 = cx0;
bx1 = cx1;
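For reference, the SSE2 fallback macro added at the top of cryptonight.c obtains the upper 64 bits of a 128-bit vector by moving the high lanes down with _mm_movehl_ps and reading the low quadword, which is the same value the native _mm_extract_epi64(x, 1) intrinsic returns. The standalone sketch below is not part of the commit; it only compares the two paths, and needs -msse4.1 (or higher) to build.

#include <stdint.h>
#include <stdio.h>
#include <smmintrin.h> /* SSE4.1: _mm_extract_epi64; the fallback itself only needs SSE2 */

/* SSE2-only emulation of _mm_extract_epi64(a, 1): move the high 64 bits low, then read them. */
static inline uint64_t extract_hi64_sse2(__m128i a) {
  return (uint64_t)_mm_cvtsi128_si64(_mm_castps_si128(
      _mm_movehl_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(a))));
}

int main(void) {
  const __m128i v = _mm_set_epi64x((int64_t)0x1122334455667788ULL,  /* high */
                                   (int64_t)0x99aabbccddeeff00ULL); /* low  */
  /* Both lines should print 1122334455667788. */
  printf("%016llx\n", (unsigned long long)extract_hi64_sse2(v));
  printf("%016llx\n", (unsigned long long)_mm_extract_epi64(v, 1));
  return 0;
}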
6 changes: 3 additions & 3 deletions algo/gr/gr-4way.c
@@ -432,7 +432,7 @@ int scanhash_gr_4way(struct work *work, uint32_t max_nonce,
applog(LOG_DEBUG, "hash order %s (%08x)", order, ntime);
}
if (opt_tuned) {
select_tuned_config();
select_tuned_config(thr_id);
}
}

@@ -448,8 +448,8 @@ int scanhash_gr_4way(struct work *work, uint32_t max_nonce,
for (int i = 0; i < 4; i++) {
if (unlikely(valid_hash(hash + (i << 3), ptarget))) {
if (opt_debug) {
applog(LOG_BLUE, "Solution: %u %.10lf", bswap_32(n + i),
hash_to_diff(hash + (i << 3)));
applog(LOG_BLUE, "Solution found. Nonce: %u | Diff: %.10lf",
bswap_32(n + i), hash_to_diff(hash + (i << 3)));
}
pdata[19] = bswap_32(n + i);
submit_solution(work, hash + (i << 3), mythr);
21 changes: 10 additions & 11 deletions algo/gr/gr-gate.c
@@ -136,15 +136,12 @@ static size_t GetMaxCnSize() {

void AllocateNeededMemory() {
size_t size = GetMaxCnSize();
if (opt_debug) {
applog(LOG_DEBUG, "Current Cryptonight variants require: %lu memory", size);
}

// Purges previous memory allocation and creates new one.
PrepareMemory((void **)&hp_state, size);
}

void select_tuned_config() {
void select_tuned_config(int thr_id) {
for (size_t i = 0; i < 20; i++) {
if (cn[i][0] + 15 == gr_hash_order[5] ||
cn[i][0] + 15 == gr_hash_order[11] ||
@@ -156,7 +153,7 @@ void select_tuned_config() {
cn[i][2] + 15 == gr_hash_order[11] ||
cn[i][2] + 15 == gr_hash_order[17]) {
memcpy(cn_config, &cn_tune[i], 6);
if (opt_debug) {
if (opt_debug && !thr_id) {
applog(LOG_BLUE, "config %d: %d %d %d %d %d %d", i, cn_config[0],
cn_config[1], cn_config[2], cn_config[3], cn_config[4],
cn_config[5]);
@@ -166,9 +163,11 @@ }
}
}
}
// Should not get to this point.
applog(LOG_ERR, "Could not find any config? %d %d %d", gr_hash_order[5],
gr_hash_order[11], gr_hash_order[17]);
if (!thr_id) {
// Should not get to this point.
applog(LOG_ERR, "Could not find any config? %d %d %d", gr_hash_order[5],
gr_hash_order[11], gr_hash_order[17]);
}
return;
}

@@ -430,7 +429,7 @@ void benchmark(void *input, int thr_id, long sleep_time) {
gr_hash_order[11] = cn[rotation][1] + 15;
gr_hash_order[17] = cn[rotation][2] + 15;
if (opt_tuned) {
select_tuned_config();
select_tuned_config(thr_id);
}

// Purge memory for test.
@@ -478,7 +477,7 @@ void benchmark_configs(void *input, int thr_id) {
cn_config[3] = (i & 8) >> 3;
cn_config[4] = (i & 16) >> 4;
cn_config[5] = (i & 32) >> 5;
if (thr_id == 0) {
if (!thr_id) {
applog(LOG_NOTICE, "Testing Cryptonigh --cn-config %d,%d,%d,%d,%d,%d",
cn_config[0], cn_config[1], cn_config[2], cn_config[3],
cn_config[4], cn_config[5]);
@@ -503,7 +502,7 @@ }
}
}
// Show best config.
if (thr_id == 0) {
if (!thr_id) {
applog(LOG_NOTICE, "Best --cn-config %d,%d,%d,%d,%d,%d",
(best_config & 1) >> 0, (best_config & 2) >> 1,
(best_config & 4) >> 2, (best_config & 8) >> 3,
4 changes: 4 additions & 0 deletions algo/gr/gr.c
@@ -168,6 +168,10 @@ int scanhash_gr(struct work *work, uint32_t max_nonce, uint64_t *hashes_done,
edata[19] = nonce;
if (gr_hash(hash32, edata, thr_id)) {
if (unlikely(valid_hash(hash32, ptarget))) {
if (opt_debug) {
applog(LOG_BLUE, "Solution found. Nonce: %u | Diff: %.10lf",
bswap_32(nonce), hash_to_diff(hash32));
}
pdata[19] = bswap_32(nonce);
submit_solution(work, hash32, mythr);
}
17 changes: 10 additions & 7 deletions build-allarch.sh
@@ -50,25 +50,28 @@ compile "westmere" "aes-sse42" "-maes"
compile "corei7-avx" "avx" "-maes"


#AVX2+ Light
#AVX2+
# Haswell AVX2 AES
# GCC 9 doesn't include AES with core-avx2
compile "core-avx2" "avx2" "-maes"

# AMD Zen1 AVX2 SHA
compile "znver1" "zen"
compile "znver1" "zen" "-mtune=znver1"

# AMD Zen2 AVX2 SHA
compile "znver2" "zen2"
compile "znver2" "zen2" "-mtune=znver2"

# AMD Zen3 AVX2 SHA VAES
compile "znver2" "zen3" "-mvaes"
# GCC 10
compile "znver3" "zen3" "-mtune=znver3"
# GCC 9
# compile "znver2" "zen3" "-mvaes -mtune=znver2"

# Icelake AVX512 SHA VAES
compile "icelake-client" "avx512-sha-vaes"
compile "icelake-client" "avx512-sha-vaes" "-mtune=intel"

# Rocketlake AVX512 SHA AES
compile "cascadelake" "avx512-sha" "-msha"
compile "cascadelake" "avx512-sha" "-msha -mtune=intel"

# Skylake-X AVX512 AES
compile "skylake-avx512" "avx512"
compile "skylake-avx512" "avx512" "-mtune=intel"