Change dev stratum switch. Cn improvements.
Added minor improvements to the Cryptonight algo, around 1-2% faster.
Fixed issues with frequent stratum switching to and from the dev pool.
Do not switch stratum if the current pool is in the dev pool list.
Updated release build scripts to include -mtune for Ryzen processors.
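The stratum fix above amounts to a guard before any pool change: if the currently connected stratum is already one of the dev pools, the miner stays put instead of bouncing back and forth. A minimal sketch of that guard in C follows; dev_pools, pool_is_dev, and switch_stratum_if_needed are illustrative names only, not the miner's actual identifiers or URLs.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* Illustrative dev-pool list; the real miner keeps its own table. */
static const char *dev_pools[] = {
    "stratum+tcp://dev-pool-1.example.com:3333",
    "stratum+tcp://dev-pool-2.example.com:3333",
};

/* Return true if the given stratum URL is one of the dev pools. */
static bool pool_is_dev(const char *url) {
  for (size_t i = 0; i < sizeof(dev_pools) / sizeof(dev_pools[0]); i++) {
    if (strcmp(url, dev_pools[i]) == 0)
      return true;
  }
  return false;
}

/* Only switch when the active connection is not already a dev pool. */
static bool switch_stratum_if_needed(const char *current_url,
                                     const char *target_url) {
  if (pool_is_dev(current_url))
    return false; /* stay put; avoids ping-ponging to and from the dev pool */
  return strcmp(current_url, target_url) != 0;
}

int main(void) {
  const char *current = "stratum+tcp://dev-pool-1.example.com:3333";
  const char *target = "stratum+tcp://user-pool.example.org:4444";
  /* Prints "switch: no" because the active connection is a dev pool. */
  printf("switch: %s\n",
         switch_stratum_if_needed(current, target) ? "yes" : "no");
  return 0;
}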
michal-zurkowski committed May 16, 2021
1 parent a2c56b0 commit 96cc604
Showing 8 changed files with 151 additions and 108 deletions.
93 changes: 42 additions & 51 deletions algo/gr/cryptonote/cryptonight.c
@@ -18,6 +18,14 @@
#include "soft_aes.h"
#endif

// Replacement macro for architectures without SSE4.2.
#ifndef __SSE42__
#define _mm_extract_epi64(a, b)                                                \
  ((b) == 1 ? _mm_cvtsi128_si64(_mm_castps_si128(                              \
                  _mm_movehl_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(a))))    \
            : _mm_cvtsi128_si64(a))
#endif // __SSE42__

extern __thread uint8_t *hp_state;

static void do_blake_hash(const void *input, size_t len, void *output) {
@@ -151,9 +159,9 @@ aes_round(const __m128i *key, __m128i *x0, __m128i *x1, __m128i *x2,
}
#endif

// Size is number of 64B words. // Must be multiple of 8.
// 128 -> 8 KiB per thread needed.
#define PREFETCH_SIZE 128
// L1 prefetch size in bytes. 4 KiB per thread.
#define PREFETCH_SIZE_B 4096
#define PREFETCH_SIZE (PREFETCH_SIZE_B / 64)
#define PREFETCH_TYPE_R _MM_HINT_T0
#define PREFETCH_TYPE_W _MM_HINT_ET0

@@ -174,9 +182,9 @@ static inline void explode_scratchpad(const __m128i *input, __m128i *output,
xin7 = _mm_load_si128(input + 11);

size_t i;
// Prefetch first X KiB of output into L2 cache.
// Prefetch first X KiB of output into L1 cache.
for (i = 0; i < PREFETCH_SIZE; i += 4) {
_mm_prefetch(output + i, _MM_HINT_ET0);
_mm_prefetch(output + i, PREFETCH_TYPE_W);
}

for (i = 0; i < (memory / sizeof(__m128i)) - PREFETCH_SIZE; i += 8) {
@@ -246,7 +254,7 @@ static inline void implode_scratchpad(const __m128i *input, __m128i *output,
xout7 = _mm_load_si128(output + 11);

size_t i;
// Prefetch first X KiB of input into L2 cache.
// Prefetch first X KiB of input into L1 cache.
for (i = 0; i < PREFETCH_SIZE; i += 4) {
_mm_prefetch(input + i, PREFETCH_TYPE_R);
}
@@ -365,26 +373,23 @@ cryptonight_hash(const void *input, void *output, const uint32_t memory,
#endif

// Post AES
__m128i tmp = _mm_xor_si128(bx0, cx0);
const __m128i tmp = _mm_xor_si128(bx0, cx0);
((uint64_t *)(&l0[idx0]))[0] = _mm_cvtsi128_si64(tmp);

tmp = _mm_castps_si128(
_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
uint64_t vh = _mm_cvtsi128_si64(tmp);
const uint64_t vh = _mm_extract_epi64(tmp, 1);

const uint8_t x = (uint8_t)(vh >> 24);
static const uint16_t table = 0x7531;
const uint8_t index = (((x >> (3)) & 6) | (x & 1)) << 1;
vh ^= ((table >> index) & 0x3) << 28;

((uint64_t *)(&l0[idx0]))[1] = vh;
((uint64_t *)(&l0[idx0]))[1] = vh ^ (((table >> index) & 0x3) << 28);

const uint64_t cxl0 = (uint64_t)(_mm_cvtsi128_si64(cx0));
idx0 = cxl0 & mask;

uint64_t hi, lo, cl, ch;
cl = ((uint64_t *)(&l0[idx0]))[0];
ch = ((uint64_t *)(&l0[idx0]))[1];
register uint64_t hi, lo;
const uint64_t cl = ((const uint64_t *)(&l0[idx0]))[0];
const uint64_t ch = ((const uint64_t *)(&l0[idx0]))[1];

__asm("mulq %3\n\t" : "=d"(hi), "=a"(lo) : "1"(cxl0), "rm"(cl) : "cc");

@@ -440,14 +445,6 @@ void cryptonight_turtlelite_hash(const void *input, void *output) {

#ifdef __AVX2__ // GR_4WAY

// GCC 7.5 does not have _mm256_set_m128i
#ifdef __GNUC__
#if __GNUC__ < 8
#define _mm256_set_m128i(a, b) \
_mm256_insertf128_si256(_mm256_castsi128_si256(b), a, 1)
#endif // __GNUC__ < 8
#endif // __GNUC__

// Requires 2x memory allocated in hp_state.
__attribute__((always_inline)) void
cryptonight_2way_hash(const void *input0, const void *input1, void *output0,
@@ -499,64 +496,58 @@ cryptonight_2way_hash(const void *input0, const void *input1, void *output0,
_mm_prefetch(&l1[cxl1 & mask], _MM_HINT_ET0);

// Post AES
__m128i tmp = _mm_xor_si128(bx0, cx0);
((uint64_t *)(&l0[idx0]))[0] = _mm_cvtsi128_si64(tmp);
tmp = _mm_castps_si128(
_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
uint64_t vh = _mm_cvtsi128_si64(tmp);
const uint8_t x0 = (uint8_t)(vh >> 24);
const __m128i tmp0 = _mm_xor_si128(bx0, cx0);
((uint64_t *)(&l0[idx0]))[0] = _mm_cvtsi128_si64(tmp0);
const uint64_t vh0 = _mm_extract_epi64(tmp0, 1);
const uint8_t x0 = (uint8_t)(vh0 >> 24);
static const uint16_t table = 0x7531;
const uint8_t index0 = (((x0 >> (3)) & 6) | (x0 & 1)) << 1;
vh ^= ((table >> index0) & 0x3) << 28;
((uint64_t *)(&l0[idx0]))[1] = vh;

tmp = _mm_xor_si128(bx1, cx1);
((uint64_t *)(&l1[idx1]))[0] = _mm_cvtsi128_si64(tmp);
tmp = _mm_castps_si128(
_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
vh = _mm_cvtsi128_si64(tmp);
const uint8_t x1 = (uint8_t)(vh >> 24);
((uint64_t *)(&l0[idx0]))[1] = vh0 ^ (((table >> index0) & 0x3) << 28);

const __m128i tmp1 = _mm_xor_si128(bx1, cx1);
((uint64_t *)(&l1[idx1]))[0] = _mm_cvtsi128_si64(tmp1);
const uint64_t vh1 = _mm_extract_epi64(tmp1, 1);
const uint8_t x1 = (uint8_t)(vh1 >> 24);
const uint8_t index1 = (((x1 >> (3)) & 6) | (x1 & 1)) << 1;
vh ^= ((table >> index1) & 0x3) << 28;
((uint64_t *)(&l1[idx1]))[1] = vh;
((uint64_t *)(&l1[idx1]))[1] = vh1 ^ (((table >> index1) & 0x3) << 28);

idx0 = cxl0 & mask;
idx1 = cxl1 & mask;

uint64_t hi, lo, cl, ch;
cl = ((uint64_t *)(&l0[idx0]))[0];
ch = ((uint64_t *)(&l0[idx0]))[1];
register uint64_t hi, lo;
const uint64_t cl0 = ((const uint64_t *)(&l0[idx0]))[0];
const uint64_t ch0 = ((const uint64_t *)(&l0[idx0]))[1];

__asm("mulq %3\n\t" : "=d"(hi), "=a"(lo) : "1"(cxl0), "rm"(cl) : "cc");
__asm("mulq %3\n\t" : "=d"(hi), "=a"(lo) : "1"(cxl0), "rm"(cl0) : "cc");

al0 += hi;
ah0 += lo;

((uint64_t *)(&l0[idx0]))[0] = al0;
((uint64_t *)(&l0[idx0]))[1] = ah0 ^ tweak1_2_0;

al0 ^= cl;
al0 ^= cl0;
idx0 = al0 & mask;
_mm_prefetch(&l0[idx0], _MM_HINT_ET0);

ah0 ^= ch;
ah0 ^= ch0;

cl = ((uint64_t *)(&l1[idx1]))[0];
ch = ((uint64_t *)(&l1[idx1]))[1];
const uint64_t cl1 = ((const uint64_t *)(&l1[idx1]))[0];
const uint64_t ch1 = ((const uint64_t *)(&l1[idx1]))[1];

__asm("mulq %3\n\t" : "=d"(hi), "=a"(lo) : "1"(cxl1), "rm"(cl) : "cc");
__asm("mulq %3\n\t" : "=d"(hi), "=a"(lo) : "1"(cxl1), "rm"(cl1) : "cc");

al1 += hi;
ah1 += lo;

((uint64_t *)(&l1[idx1]))[0] = al1;
((uint64_t *)(&l1[idx1]))[1] = ah1 ^ tweak1_2_1;

al1 ^= cl;
al1 ^= cl1;
idx1 = al1 & mask;
_mm_prefetch(&l1[idx1], _MM_HINT_ET0);

ah1 ^= ch;
ah1 ^= ch1;

bx0 = cx0;
bx1 = cx1;
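For reference, the SSE2 fallback macro added at the top of cryptonight.c obtains the upper 64 bits of a 128-bit vector by moving the high lanes down with _mm_movehl_ps and reading the low quadword, which is the same value the native _mm_extract_epi64(x, 1) intrinsic returns. The standalone sketch below is not part of the commit; it only compares the two paths, and needs -msse4.1 (or higher) to build.

#include <stdint.h>
#include <stdio.h>
#include <smmintrin.h> /* SSE4.1: _mm_extract_epi64; the fallback itself only needs SSE2 */

/* SSE2-only emulation of _mm_extract_epi64(a, 1): move the high 64 bits low, then read them. */
static inline uint64_t extract_hi64_sse2(__m128i a) {
  return (uint64_t)_mm_cvtsi128_si64(_mm_castps_si128(
      _mm_movehl_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(a))));
}

int main(void) {
  const __m128i v = _mm_set_epi64x((int64_t)0x1122334455667788ULL,  /* high */
                                   (int64_t)0x99aabbccddeeff00ULL); /* low  */
  /* Both lines should print 1122334455667788. */
  printf("%016llx\n", (unsigned long long)extract_hi64_sse2(v));
  printf("%016llx\n", (unsigned long long)_mm_extract_epi64(v, 1));
  return 0;
}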
6 changes: 3 additions & 3 deletions algo/gr/gr-4way.c
@@ -432,7 +432,7 @@ int scanhash_gr_4way(struct work *work, uint32_t max_nonce,
applog(LOG_DEBUG, "hash order %s (%08x)", order, ntime);
}
if (opt_tuned) {
select_tuned_config();
select_tuned_config(thr_id);
}
}

@@ -448,8 +448,8 @@ int scanhash_gr_4way(struct work *work, uint32_t max_nonce,
for (int i = 0; i < 4; i++) {
if (unlikely(valid_hash(hash + (i << 3), ptarget))) {
if (opt_debug) {
applog(LOG_BLUE, "Solution: %u %.10lf", bswap_32(n + i),
hash_to_diff(hash + (i << 3)));
applog(LOG_BLUE, "Solution found. Nonce: %u | Diff: %.10lf",
bswap_32(n + i), hash_to_diff(hash + (i << 3)));
}
pdata[19] = bswap_32(n + i);
submit_solution(work, hash + (i << 3), mythr);
21 changes: 10 additions & 11 deletions algo/gr/gr-gate.c
@@ -136,15 +136,12 @@ static size_t GetMaxCnSize() {

void AllocateNeededMemory() {
size_t size = GetMaxCnSize();
if (opt_debug) {
applog(LOG_DEBUG, "Current Cryptonight variants require: %lu memory", size);
}

// Purges previous memory allocation and creates new one.
PrepareMemory((void **)&hp_state, size);
}

void select_tuned_config() {
void select_tuned_config(int thr_id) {
for (size_t i = 0; i < 20; i++) {
if (cn[i][0] + 15 == gr_hash_order[5] ||
cn[i][0] + 15 == gr_hash_order[11] ||
@@ -156,7 +153,7 @@ void select_tuned_config() {
cn[i][2] + 15 == gr_hash_order[11] ||
cn[i][2] + 15 == gr_hash_order[17]) {
memcpy(cn_config, &cn_tune[i], 6);
if (opt_debug) {
if (opt_debug && !thr_id) {
applog(LOG_BLUE, "config %d: %d %d %d %d %d %d", i, cn_config[0],
cn_config[1], cn_config[2], cn_config[3], cn_config[4],
cn_config[5]);
@@ -166,9 +163,11 @@ }
}
}
}
// Should not get to this point.
applog(LOG_ERR, "Could not find any config? %d %d %d", gr_hash_order[5],
gr_hash_order[11], gr_hash_order[17]);
if (!thr_id) {
// Should not get to this point.
applog(LOG_ERR, "Could not find any config? %d %d %d", gr_hash_order[5],
gr_hash_order[11], gr_hash_order[17]);
}
return;
}

@@ -430,7 +429,7 @@ void benchmark(void *input, int thr_id, long sleep_time) {
gr_hash_order[11] = cn[rotation][1] + 15;
gr_hash_order[17] = cn[rotation][2] + 15;
if (opt_tuned) {
select_tuned_config();
select_tuned_config(thr_id);
}

// Purge memory for test.
@@ -478,7 +477,7 @@ void benchmark_configs(void *input, int thr_id) {
cn_config[3] = (i & 8) >> 3;
cn_config[4] = (i & 16) >> 4;
cn_config[5] = (i & 32) >> 5;
if (thr_id == 0) {
if (!thr_id) {
applog(LOG_NOTICE, "Testing Cryptonigh --cn-config %d,%d,%d,%d,%d,%d",
cn_config[0], cn_config[1], cn_config[2], cn_config[3],
cn_config[4], cn_config[5]);
@@ -503,7 +502,7 @@ }
}
}
// Show best config.
if (thr_id == 0) {
if (!thr_id) {
applog(LOG_NOTICE, "Best --cn-config %d,%d,%d,%d,%d,%d",
(best_config & 1) >> 0, (best_config & 2) >> 1,
(best_config & 4) >> 2, (best_config & 8) >> 3,
4 changes: 4 additions & 0 deletions algo/gr/gr.c
@@ -168,6 +168,10 @@ int scanhash_gr(struct work *work, uint32_t max_nonce, uint64_t *hashes_done,
edata[19] = nonce;
if (gr_hash(hash32, edata, thr_id)) {
if (unlikely(valid_hash(hash32, ptarget))) {
if (opt_debug) {
applog(LOG_BLUE, "Solution found. Nonce: %u | Diff: %.10lf",
bswap_32(nonce), hash_to_diff(hash32));
}
pdata[19] = bswap_32(nonce);
submit_solution(work, hash32, mythr);
}
17 changes: 10 additions & 7 deletions build-allarch.sh
@@ -50,25 +50,28 @@ compile "westmere" "aes-sse42" "-maes"
compile "corei7-avx" "avx" "-maes"


#AVX2+ Light
#AVX2+
# Haswell AVX2 AES
# GCC 9 doesn't include AES with core-avx2
compile "core-avx2" "avx2" "-maes"

# AMD Zen1 AVX2 SHA
compile "znver1" "zen"
compile "znver1" "zen" "-mtune=znver1"

# AMD Zen2 AVX2 SHA
compile "znver2" "zen2"
compile "znver2" "zen2" "-mtune=znver2"

# AMD Zen3 AVX2 SHA VAES
compile "znver2" "zen3" "-mvaes"
# GCC 10
compile "znver3" "zen3" "-mtune=znver3"
# GCC 9
# compile "znver2" "zen3" "-mvaes -mtune=znver2"

# Icelake AVX512 SHA VAES
compile "icelake-client" "avx512-sha-vaes"
compile "icelake-client" "avx512-sha-vaes" "-mtune=intel"

# Rocketlake AVX512 SHA AES
compile "cascadelake" "avx512-sha" "-msha"
compile "cascadelake" "avx512-sha" "-msha -mtune=intel"

# Skylake-X AVX512 AES
compile "skylake-avx512" "avx512"
compile "skylake-avx512" "avx512" "-mtune=intel"