@@ -40,21 +40,30 @@ uint64_t K_512[80] = {
#undef xor3
#define xor3(a,b,c) (a^b^c)

+// #define ROR64_8(x) ROTR64(x,8)
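+// An 8-bit rotate of a 64-bit value is byte-aligned, so each 32-bit half of
+// the result can be assembled with a single byte-permute instead of a
+// shift/shift/or sequence.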
+__device__ __inline__
+uint64_t ROR64_8(const uint64_t u64) {
+	const uint2 a = vectorize(u64);
+	uint2 result;
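+	// __byte_perm: arg0 supplies source bytes 0-3, arg1 bytes 4-7; each
+	// selector nibble (lowest first) picks one source byte for the result.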
+	result.x = __byte_perm(a.y, a.x, 0x0765);
+	result.y = __byte_perm(a.y, a.x, 0x4321);
+	return devectorize(result);
+}
+
#define bsg5_0(x) xor3(ROTR64(x,28),ROTR64(x,34),ROTR64(x,39))
#define bsg5_1(x) xor3(ROTR64(x,14),ROTR64(x,18),ROTR64(x,41))
-#define ssg5_0(x) xor3(ROTR64(x,1),ROTR64(x,8),x>>7)
-#define ssg5_1(x) xor3(ROTR64(x,19),ROTR64(x,61),x>>6)
-
+#define ssg5_0(x) xor3(ROTR64(x,1), ROR64_8(x), x>>7)
+#define ssg5_1(x) xor3(ROTR64(x,19),ROTR64(x,61), x>>6)
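+// ssg5_0/ssg5_1 are the SHA-512 message-schedule sigmas (sigma0/sigma1 in
+// FIPS 180-4); only sigma0's rotate by 8 is byte-aligned, hence the
+// ROR64_8 fast path.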

#define andor64(a,b,c) ((a & (b | c)) | (b & c))
#define xandx64(e,f,g) (g ^ (e & (g ^ f)))
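// andor64(a,b,c) is SHA-512's Maj(a,b,c); xandx64(e,f,g) is Ch(e,f,g)
// rewritten as a single select: g ^ (e & (g ^ f)).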
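// One round of SHA-512: indexing r[] with (n - ord) & 7 renames the eight
// working variables in place of physically rotating them each round.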
static __device__ __forceinline__
-void sha512_step2(uint64_t *r,const uint64_t W,const uint64_t K, const int ord)
+void sha512_step2(uint64_t *r, const uint64_t W, const uint64_t K, const int ord)
{
	const uint64_t T1 = r[(15-ord) & 7] + K + W + bsg5_1(r[(12-ord) & 7]) + xandx64(r[(12-ord) & 7],r[(13-ord) & 7],r[(14-ord) & 7]);
-	r[(15-ord)& 7] = andor64(r[( 8-ord) & 7],r[( 9-ord) & 7],r[(10-ord) & 7]) + bsg5_0(r[( 8-ord) & 7]) + T1;
-	r[(11-ord)& 7]+= T1;
+	r[(15-ord) & 7] = andor64(r[(8-ord) & 7],r[(9-ord) & 7],r[(10-ord) & 7]) + bsg5_0(r[(8-ord) & 7]) + T1;
+	r[(11-ord) & 7] += T1;
}

/* *************************************************************************************************/
@@ -67,16 +76,17 @@ void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash)
	0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
	0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
+
	uint64_t r[8];
	uint64_t W[16];
	if (thread < threads)
	{
		uint64_t *pHash = &g_hash[thread<<3];
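		// pHash: this thread's 64-byte hash slot (8 x uint64_t)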

-		*(uint2x4*)&r[ 0] = *(uint2x4*)&IV512[ 0];
-		*(uint2x4*)&r[ 4] = *(uint2x4*)&IV512[ 4];
+		*(uint2x4*)&r[0] = *(uint2x4*)&IV512[0];
+		*(uint2x4*)&r[4] = *(uint2x4*)&IV512[4];

-		*(uint2x4*)&W[ 0] = __ldg4((uint2x4*)&pHash[ 0]);
+		*(uint2x4*)&W[0] = __ldg4((uint2x4*)pHash);
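+		// __ldg4 pulls the 32-byte input through the read-only cache (__ldg)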

		W[4] = 0x8000000000000000; // end tag

@@ -91,7 +101,7 @@ void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash)
		}

		#pragma unroll
-		for (int i = 16; i < 80; i+=16){
+		for (int i = 16; i < 80; i+=16) {
			#pragma unroll
			for (int j = 0; j<16; j++) {
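				// "& 15" keeps only a rolling 16-word window of the 80-word schedule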
				W[(i + j) & 15] += W[((i + j) - 7) & 15] + ssg5_0(W[((i + j) - 15) & 15]) + ssg5_1(W[((i + j) - 2) & 15]);