diff --git a/binding.gyp b/binding.gyp
index f8266880..46bb2482 100644
--- a/binding.gyp
+++ b/binding.gyp
@@ -58,6 +58,14 @@
             "cflags_cc": [
                 "-std=c++0x"
             ],
+            'conditions': [
+                ['OS == "mac"', {
+                    'xcode_settings': {
+                        'OTHER_CFLAGS': ['-no-integrated-as']
+                    }
+                  }
+                ]
+            ],
         }
     ]
 }
diff --git a/crypto/oaes_lib.c b/crypto/oaes_lib.c
index f3f2aac8..f615035f 100644
--- a/crypto/oaes_lib.c
+++ b/crypto/oaes_lib.c
@@ -34,7 +34,9 @@ static const char _NR[] = {
 #include 
 #include 
 #include 
+#ifdef __linux__
 #include 
+#endif
 #include 
 #include 
 #include 
diff --git a/package-lock.json b/package-lock.json
index e1e27baa..09dfc51f 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -2,6 +2,7 @@
   "name": "multi-hashing",
   "version": "0.0.9",
   "lockfileVersion": 1,
+  "requires": true,
   "dependencies": {
     "bindings": {
       "version": "1.3.0",
@@ -9,9 +10,9 @@
       "integrity": "sha512-DpLh5EzMR2kzvX1KIlVC0VkC3iZtHKTgdtZ0a3pglBZdaQFjt5S9g9xd1lE+YvXyfd6mtCeRnrUfOLYiTMlNSw=="
     },
     "nan": {
-      "version": "2.6.2",
-      "resolved": "https://registry.npmjs.org/nan/-/nan-2.6.2.tgz",
-      "integrity": "sha1-5P805slf37WuzAjeZZb0NgWn20U="
+      "version": "2.8.0",
+      "resolved": "https://registry.npmjs.org/nan/-/nan-2.8.0.tgz",
+      "integrity": "sha1-7XFfP+neArV6XmJS2QqWZ14fCFo="
     }
   }
 }
diff --git a/package.json b/package.json
index a375cc8b..211348aa 100644
--- a/package.json
+++ b/package.json
@@ -16,7 +16,7 @@
   },
   "dependencies": {
     "bindings": "*",
-    "nan": "^2.6.2"
+    "nan": "^2.8.0"
   },
   "keywords": [
     "scrypt",
diff --git a/scryptjane.c b/scryptjane.c
index 1cd702e2..de656e22 100644
--- a/scryptjane.c
+++ b/scryptjane.c
@@ -12,8 +12,7 @@
 #include "scryptjane/scrypt-jane-romix.h"
 #include "scryptjane/scrypt-jane-test-vectors.h"
 
-
-#define scrypt_maxN 30  /* (1 << (30 + 1)) = ~2 billion */
+#define scrypt_maxNfactor 30  /* (1 << (30 + 1)) = ~2 billion */
 #if (SCRYPT_BLOCK_BYTES == 64)
 #define scrypt_r_32kb 8 /* (1 << 8) = 256 * 2 blocks in a chunk * 64 bytes = Max of 32kb in a chunk */
 #elif (SCRYPT_BLOCK_BYTES == 128)
@@ -23,64 +22,64 @@
 #elif (SCRYPT_BLOCK_BYTES == 512)
 #define scrypt_r_32kb 5 /* (1 << 5) = 32 * 2 blocks in a chunk * 512 bytes = Max of 32kb in a chunk */
 #endif
-#define scrypt_maxr scrypt_r_32kb /* 32kb */
-#define scrypt_maxp 25 /* (1 << 25) = ~33 million */
+#define scrypt_maxrfactor scrypt_r_32kb /* 32kb */
+#define scrypt_maxpfactor 25 /* (1 << 25) = ~33 million */
 
 #include 
-#include 
+//#include 
 
-static void
+static void NORETURN
 scrypt_fatal_error_default(const char *msg) {
-	fprintf(stderr, "%s\n", msg);
-	exit(1);
+	fprintf(stderr, "%s\n", msg);
+	exit(1);
 }
 
 static scrypt_fatal_errorfn scrypt_fatal_error = scrypt_fatal_error_default;
 
 void
-scrypt_set_fatal_error_default(scrypt_fatal_errorfn fn) {
-	scrypt_fatal_error = fn;
+scrypt_set_fatal_error(scrypt_fatal_errorfn fn) {
+	scrypt_fatal_error = fn;
 }
 
 static int
-scrypt_power_on_self_test() {
-	const scrypt_test_setting *t;
-	uint8_t test_digest[64];
-	uint32_t i;
-	int res = 7, scrypt_valid;
+scrypt_power_on_self_test(void) {
+	const scrypt_test_setting *t;
+	uint8_t test_digest[64];
+	uint32_t i;
+	int res = 7, scrypt_valid;
 
-	if (!scrypt_test_mix()) {
+	if (!scrypt_test_mix()) {
 #if !defined(SCRYPT_TEST)
-		scrypt_fatal_error("scrypt: mix function power-on-self-test failed");
+		scrypt_fatal_error("scrypt: mix function power-on-self-test failed");
 #endif
-		res &= ~1;
-	}
+		res &= ~1;
+	}
 
-	if (!scrypt_test_hash()) {
+	if (!scrypt_test_hash()) {
 #if !defined(SCRYPT_TEST)
-		scrypt_fatal_error("scrypt: hash function power-on-self-test failed");
+		scrypt_fatal_error("scrypt: 
hash function power-on-self-test failed"); #endif - res &= ~2; - } - - for (i = 0, scrypt_valid = 1; post_settings[i].pw; i++) { - t = post_settings + i; - scrypt((uint8_t *)t->pw, strlen(t->pw), (uint8_t *)t->salt, strlen(t->salt), t->Nfactor, t->rfactor, t->pfactor, test_digest, sizeof(test_digest)); - scrypt_valid &= scrypt_verify(post_vectors[i], test_digest, sizeof(test_digest)); - } - - if (!scrypt_valid) { + res &= ~2; + } + + for (i = 0, scrypt_valid = 1; post_settings[i].pw; i++) { + t = post_settings + i; + scrypt((uint8_t *)t->pw, strlen(t->pw), (uint8_t *)t->salt, strlen(t->salt), t->Nfactor, t->rfactor, t->pfactor, test_digest, sizeof(test_digest)); + scrypt_valid &= scrypt_verify(post_vectors[i], test_digest, sizeof(test_digest)); + } + + if (!scrypt_valid) { #if !defined(SCRYPT_TEST) - scrypt_fatal_error("scrypt: scrypt power-on-self-test failed"); + scrypt_fatal_error("scrypt: scrypt power-on-self-test failed"); #endif - res &= ~4; - } + res &= ~4; + } - return res; + return res; } typedef struct scrypt_aligned_alloc_t { - uint8_t *mem, *ptr; + uint8_t *mem, *ptr; } scrypt_aligned_alloc; #if defined(SCRYPT_TEST_SPEED) @@ -90,95 +89,95 @@ static size_t mem_bump = 0; /* allocations are assumed to be multiples of 64 bytes and total allocations not to exceed ~1.01gb */ static scrypt_aligned_alloc scrypt_alloc(uint64_t size) { - scrypt_aligned_alloc aa; - if (!mem_base) { - mem_base = (uint8_t *)malloc((1024 * 1024 * 1024) + (1024 * 1024) + (SCRYPT_BLOCK_BYTES - 1)); - if (!mem_base) - scrypt_fatal_error("scrypt: out of memory"); - mem_base = (uint8_t *)(((size_t)mem_base + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1)); - } - aa.mem = mem_base + mem_bump; - aa.ptr = aa.mem; - mem_bump += (size_t)size; - return aa; + scrypt_aligned_alloc aa; + if (!mem_base) { + mem_base = (uint8_t *)malloc((1024 * 1024 * 1024) + (1024 * 1024) + (SCRYPT_BLOCK_BYTES - 1)); + if (!mem_base) + scrypt_fatal_error("scrypt: out of memory"); + mem_base = (uint8_t *)(((size_t)mem_base + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1)); + } + aa.mem = mem_base + mem_bump; + aa.ptr = aa.mem; + mem_bump += (size_t)size; + return aa; } static void scrypt_free(scrypt_aligned_alloc *aa) { - mem_bump = 0; + mem_bump = 0; } #else static scrypt_aligned_alloc scrypt_alloc(uint64_t size) { - static const size_t max_alloc = (size_t)-1; - scrypt_aligned_alloc aa; - size += (SCRYPT_BLOCK_BYTES - 1); - if (size > max_alloc) - scrypt_fatal_error("scrypt: not enough address space on this CPU to allocate required memory"); - aa.mem = (uint8_t *)malloc((size_t)size); - aa.ptr = (uint8_t *)(((size_t)aa.mem + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1)); - if (!aa.mem) - scrypt_fatal_error("scrypt: out of memory"); - return aa; + static const size_t max_alloc = (size_t)-1; + scrypt_aligned_alloc aa; + size += (SCRYPT_BLOCK_BYTES - 1); + if (size > max_alloc) + scrypt_fatal_error("scrypt: not enough address space on this CPU to allocate required memory"); + aa.mem = (uint8_t *)malloc((size_t)size); + aa.ptr = (uint8_t *)(((size_t)aa.mem + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1)); + if (!aa.mem) + scrypt_fatal_error("scrypt: out of memory"); + return aa; } static void scrypt_free(scrypt_aligned_alloc *aa) { - free(aa->mem); + free(aa->mem); } #endif void scrypt(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t Nfactor, uint8_t rfactor, uint8_t pfactor, uint8_t *out, size_t bytes) { - scrypt_aligned_alloc YX, V; - uint8_t *X, *Y; - uint32_t N, r, 
p, chunk_bytes, i; + scrypt_aligned_alloc YX, V; + uint8_t *X, *Y; + uint32_t N, r, p, chunk_bytes, i; #if !defined(SCRYPT_CHOOSE_COMPILETIME) - scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix(); + scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix(); #endif #if !defined(SCRYPT_TEST) - static int power_on_self_test = 0; - if (!power_on_self_test) { - power_on_self_test = 1; - if (!scrypt_power_on_self_test()) - scrypt_fatal_error("scrypt: power on self test failed"); - } + static int power_on_self_test = 0; + if (!power_on_self_test) { + power_on_self_test = 1; + if (!scrypt_power_on_self_test()) + scrypt_fatal_error("scrypt: power on self test failed"); + } #endif - if (Nfactor > scrypt_maxN) - scrypt_fatal_error("scrypt: N out of range"); - if (rfactor > scrypt_maxr) - scrypt_fatal_error("scrypt: r out of range"); - if (pfactor > scrypt_maxp) - scrypt_fatal_error("scrypt: p out of range"); + if (Nfactor > scrypt_maxNfactor) + scrypt_fatal_error("scrypt: N out of range"); + if (rfactor > scrypt_maxrfactor) + scrypt_fatal_error("scrypt: r out of range"); + if (pfactor > scrypt_maxpfactor) + scrypt_fatal_error("scrypt: p out of range"); - N = (1 << (Nfactor + 1)); - r = (1 << rfactor); - p = (1 << pfactor); + N = (1 << (Nfactor + 1)); + r = (1 << rfactor); + p = (1 << pfactor); - chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2; - V = scrypt_alloc((uint64_t)N * chunk_bytes); - YX = scrypt_alloc((p + 1) * chunk_bytes); + chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2; + V = scrypt_alloc((uint64_t)N * chunk_bytes); + YX = scrypt_alloc((p + 1) * chunk_bytes); - /* 1: X = PBKDF2(password, salt) */ - Y = YX.ptr; - X = Y + chunk_bytes; - scrypt_pbkdf2(password, password_len, salt, salt_len, 1, X, chunk_bytes * p); + /* 1: X = PBKDF2(password, salt) */ + Y = YX.ptr; + X = Y + chunk_bytes; + scrypt_pbkdf2(password, password_len, salt, salt_len, 1, X, chunk_bytes * p); - /* 2: X = ROMix(X) */ - for (i = 0; i < p; i++) - scrypt_ROMix((scrypt_mix_word_t *)(X + (chunk_bytes * i)), (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V.ptr, N, r); + /* 2: X = ROMix(X) */ + for (i = 0; i < p; i++) + scrypt_ROMix((scrypt_mix_word_t *)(X + (chunk_bytes * i)), (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V.ptr, N, r); - /* 3: Out = PBKDF2(password, X) */ - scrypt_pbkdf2(password, password_len, X, chunk_bytes * p, 1, out, bytes); + /* 3: Out = PBKDF2(password, X) */ + scrypt_pbkdf2(password, password_len, X, chunk_bytes * p, 1, out, bytes); - scrypt_ensure_zero(YX.ptr, (p + 1) * chunk_bytes); + scrypt_ensure_zero(YX.ptr, (p + 1) * chunk_bytes); - scrypt_free(&V); - scrypt_free(&YX); + scrypt_free(&V); + scrypt_free(&YX); } #define max(a,b) (((a) > (b)) ? (a) : (b)) diff --git a/scryptjane/scrypt-conf.h b/scryptjane/scrypt-conf.h new file mode 100644 index 00000000..46685a51 --- /dev/null +++ b/scryptjane/scrypt-conf.h @@ -0,0 +1,28 @@ +/* + pick the best algo at runtime or compile time? + ---------------------------------------------- + SCRYPT_CHOOSE_COMPILETIME (gcc only!) 
+ SCRYPT_CHOOSE_RUNTIME +*/ +#define SCRYPT_CHOOSE_RUNTIME + + +/* + hash function to use + ------------------------------- + SCRYPT_BLAKE256 + SCRYPT_BLAKE512 + SCRYPT_SHA256 + SCRYPT_SHA512 + SCRYPT_SKEIN512 +*/ +//#define SCRYPT_SHA256 + + +/* + block mixer to use + ----------------------------- + SCRYPT_CHACHA + SCRYPT_SALSA +*/ +//#define SCRYPT_SALSA diff --git a/scryptjane/scrypt-jane-chacha.h b/scryptjane/scrypt-jane-chacha.h index 41d96e5e..54234576 100644 --- a/scryptjane/scrypt-jane-chacha.h +++ b/scryptjane/scrypt-jane-chacha.h @@ -11,11 +11,21 @@ typedef uint32_t scrypt_mix_word_t; /* must have these here in case block bytes is ever != 64 */ #include "scrypt-jane-romix-basic.h" +#include "scrypt-jane-mix_chacha-xop.h" #include "scrypt-jane-mix_chacha-avx.h" #include "scrypt-jane-mix_chacha-ssse3.h" #include "scrypt-jane-mix_chacha-sse2.h" #include "scrypt-jane-mix_chacha.h" +#if defined(SCRYPT_CHACHA_XOP) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_xop + #define SCRYPT_ROMIX_FN scrypt_ROMix_xop + #define SCRYPT_MIX_FN chacha_core_xop + #define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop + #define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_nop + #include "scrypt-jane-romix-template.h" +#endif + #if defined(SCRYPT_CHACHA_AVX) #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx #define SCRYPT_ROMIX_FN scrypt_ROMix_avx @@ -52,9 +62,15 @@ typedef uint32_t scrypt_mix_word_t; #if !defined(SCRYPT_CHOOSE_COMPILETIME) static scrypt_ROMixfn -scrypt_getROMix() { +scrypt_getROMix(void) { size_t cpuflags = detect_cpu(); +#if defined(SCRYPT_CHACHA_XOP) + if (cpuflags & cpu_xop) + return scrypt_ROMix_xop; + else +#endif + #if defined(SCRYPT_CHACHA_AVX) if (cpuflags & cpu_avx) return scrypt_ROMix_avx; @@ -80,18 +96,27 @@ scrypt_getROMix() { #if defined(SCRYPT_TEST_SPEED) static size_t -available_implementations() { +available_implementations(void) { + size_t cpuflags = detect_cpu(); size_t flags = 0; +#if defined(SCRYPT_CHACHA_XOP) + if (cpuflags & cpu_xop) + flags |= cpu_xop; +#endif + #if defined(SCRYPT_CHACHA_AVX) - flags |= cpu_avx; + if (cpuflags & cpu_avx) + flags |= cpu_avx; #endif #if defined(SCRYPT_CHACHA_SSSE3) - flags |= cpu_ssse3; + if (cpuflags & cpu_ssse3) + flags |= cpu_ssse3; #endif #if defined(SCRYPT_CHACHA_SSE2) + if (cpuflags & cpu_sse2) flags |= cpu_sse2; #endif @@ -100,7 +125,7 @@ available_implementations() { #endif static int -scrypt_test_mix() { +scrypt_test_mix(void) { static const uint8_t expected[16] = { 0x48,0x2b,0x2d,0xb8,0xa1,0x33,0x22,0x73,0xcd,0x16,0xc4,0xb4,0xb0,0x7f,0xb1,0x8a, }; @@ -108,6 +133,11 @@ scrypt_test_mix() { int ret = 1; size_t cpuflags = detect_cpu(); +#if defined(SCRYPT_CHACHA_XOP) + if (cpuflags & cpu_xop) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_xop, scrypt_romix_nop, scrypt_romix_nop, expected); +#endif + #if defined(SCRYPT_CHACHA_AVX) if (cpuflags & cpu_avx) ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, scrypt_romix_nop, scrypt_romix_nop, expected); diff --git a/scryptjane/scrypt-jane-hash.h b/scryptjane/scrypt-jane-hash.h index db5c1db3..e7278148 100644 --- a/scryptjane/scrypt-jane-hash.h +++ b/scryptjane/scrypt-jane-hash.h @@ -28,7 +28,7 @@ #define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */ static int -scrypt_test_hash() { +scrypt_test_hash(void) { scrypt_hash_state st; scrypt_hash_digest hash, final; uint8_t msg[SCRYPT_TEST_HASH_LEN]; diff --git a/scryptjane/scrypt-jane-hash_blake256.h b/scryptjane/scrypt-jane-hash_blake256.h new file mode 100644 index 00000000..4690b114 --- /dev/null +++ 
b/scryptjane/scrypt-jane-hash_blake256.h @@ -0,0 +1,177 @@ +#define SCRYPT_HASH "BLAKE-256" +#define SCRYPT_HASH_BLOCK_SIZE 64 +#define SCRYPT_HASH_DIGEST_SIZE 32 + +typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + +const uint8_t blake256_sigma[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, + 14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3, + 11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4, + 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8, + 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13, + 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9, + 12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11, + 13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10, + 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5, + 10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13 ,0, +}; + +const uint32_t blake256_constants[16] = { + 0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344,0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89, + 0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c,0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 0xb5470917 +}; + +typedef struct scrypt_hash_state_t { + uint32_t H[8], T[2]; + uint32_t leftover; + uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; +} scrypt_hash_state; + +static void +blake256_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) { + const uint8_t *sigma, *sigma_end = blake256_sigma + (10 * 16); + uint32_t m[16], v[16], h[8], t[2]; + uint32_t i; + + for (i = 0; i < 8; i++) h[i] = S->H[i]; + for (i = 0; i < 2; i++) t[i] = S->T[i]; + + while (blocks--) { + t[0] += 512; + t[1] += (t[0] < 512) ? 1 : 0; + + for (i = 0; i < 8; i++) v[i ] = h[i]; + for (i = 0; i < 4; i++) v[i + 8] = blake256_constants[i]; + for (i = 0; i < 2; i++) v[i + 12] = blake256_constants[i+4] ^ t[0]; + for (i = 0; i < 2; i++) v[i + 14] = blake256_constants[i+6] ^ t[1]; + + for (i = 0; i < 16; i++) m[i] = U8TO32_BE(&in[i * 4]); + in += 64; + + #define G(a,b,c,d,e) \ + v[a] += (m[sigma[e+0]] ^ blake256_constants[sigma[e+1]]) + v[b]; \ + v[d] = ROTR32(v[d] ^ v[a],16); \ + v[c] += v[d]; \ + v[b] = ROTR32(v[b] ^ v[c],12); \ + v[a] += (m[sigma[e+1]] ^ blake256_constants[sigma[e+0]]) + v[b]; \ + v[d] = ROTR32(v[d] ^ v[a], 8); \ + v[c] += v[d]; \ + v[b] = ROTR32(v[b] ^ v[c], 7); + + for (i = 0, sigma = blake256_sigma; i < 14; i++) { + G(0, 4, 8,12, 0); + G(1, 5, 9,13, 2); + G(2, 6,10,14, 4); + G(3, 7,11,15, 6); + + G(0, 5,10,15, 8); + G(1, 6,11,12,10); + G(2, 7, 8,13,12); + G(3, 4, 9,14,14); + + sigma += 16; + if (sigma == sigma_end) + sigma = blake256_sigma; + } + + #undef G + + for (i = 0; i < 8; i++) h[i] ^= (v[i] ^ v[i + 8]); + } + + for (i = 0; i < 8; i++) S->H[i] = h[i]; + for (i = 0; i < 2; i++) S->T[i] = t[i]; +} + +static void +scrypt_hash_init(scrypt_hash_state *S) { + S->H[0] = 0x6a09e667ULL; + S->H[1] = 0xbb67ae85ULL; + S->H[2] = 0x3c6ef372ULL; + S->H[3] = 0xa54ff53aULL; + S->H[4] = 0x510e527fULL; + S->H[5] = 0x9b05688cULL; + S->H[6] = 0x1f83d9abULL; + S->H[7] = 0x5be0cd19ULL; + S->T[0] = 0; + S->T[1] = 0; + S->leftover = 0; +} + +static void +scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { + size_t blocks, want; + + /* handle the previous data */ + if (S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + want = (want < inlen) ? 
want : inlen; + memcpy(S->buffer + S->leftover, in, want); + S->leftover += (uint32_t)want; + if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) + return; + in += want; + inlen -= want; + blake256_blocks(S, S->buffer, 1); + } + + /* handle the current data */ + blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); + S->leftover = (uint32_t)(inlen - blocks); + if (blocks) { + blake256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); + in += blocks; + } + + /* handle leftover data */ + if (S->leftover) + memcpy(S->buffer, in, S->leftover); +} + +static void +scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { + uint32_t th, tl, bits; + + bits = (S->leftover << 3); + tl = S->T[0] + bits; + th = S->T[1]; + if (S->leftover == 0) { + S->T[0] = (uint32_t)0 - (uint32_t)512; + S->T[1] = (uint32_t)0 - (uint32_t)1; + } else if (S->T[0] == 0) { + S->T[0] = ((uint32_t)0 - (uint32_t)512) + bits; + S->T[1] = S->T[1] - 1; + } else { + S->T[0] -= (512 - bits); + } + + S->buffer[S->leftover] = 0x80; + if (S->leftover <= 55) { + memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover); + } else { + memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover); + blake256_blocks(S, S->buffer, 1); + S->T[0] = (uint32_t)0 - (uint32_t)512; + S->T[1] = (uint32_t)0 - (uint32_t)1; + memset(S->buffer, 0, 56); + } + S->buffer[55] |= 1; + U32TO8_BE(S->buffer + 56, th); + U32TO8_BE(S->buffer + 60, tl); + blake256_blocks(S, S->buffer, 1); + + U32TO8_BE(&hash[ 0], S->H[0]); + U32TO8_BE(&hash[ 4], S->H[1]); + U32TO8_BE(&hash[ 8], S->H[2]); + U32TO8_BE(&hash[12], S->H[3]); + U32TO8_BE(&hash[16], S->H[4]); + U32TO8_BE(&hash[20], S->H[5]); + U32TO8_BE(&hash[24], S->H[6]); + U32TO8_BE(&hash[28], S->H[7]); +} + +static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { + 0xcc,0xa9,0x1e,0xa9,0x20,0x97,0x37,0x40,0x17,0xc0,0xa0,0x52,0x87,0xfc,0x08,0x20, + 0x40,0xf5,0x81,0x86,0x62,0x75,0x78,0xb2,0x79,0xce,0xde,0x27,0x3c,0x7f,0x85,0xd8, +}; diff --git a/scryptjane/scrypt-jane-hash_blake512.h b/scryptjane/scrypt-jane-hash_blake512.h new file mode 100644 index 00000000..ea2a583d --- /dev/null +++ b/scryptjane/scrypt-jane-hash_blake512.h @@ -0,0 +1,181 @@ +#define SCRYPT_HASH "BLAKE-512" +#define SCRYPT_HASH_BLOCK_SIZE 128 +#define SCRYPT_HASH_DIGEST_SIZE 64 + +typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + +const uint8_t blake512_sigma[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, + 14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3, + 11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4, + 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8, + 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13, + 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9, + 12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11, + 13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10, + 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5, + 10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13 ,0, +}; + +const uint64_t blake512_constants[16] = { + 0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, 0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL, + 0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL, 0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL, + 0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL, 0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL, + 0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL, 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL +}; + +typedef struct scrypt_hash_state_t { + uint64_t H[8], T[2]; + uint32_t leftover; + uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; +} scrypt_hash_state; + +static void +blake512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) { + const uint8_t *sigma, 
*sigma_end = blake512_sigma + (10 * 16); + uint64_t m[16], v[16], h[8], t[2]; + uint32_t i; + + for (i = 0; i < 8; i++) h[i] = S->H[i]; + for (i = 0; i < 2; i++) t[i] = S->T[i]; + + while (blocks--) { + t[0] += 1024; + t[1] += (t[0] < 1024) ? 1 : 0; + + for (i = 0; i < 8; i++) v[i ] = h[i]; + for (i = 0; i < 4; i++) v[i + 8] = blake512_constants[i]; + for (i = 0; i < 2; i++) v[i + 12] = blake512_constants[i+4] ^ t[0]; + for (i = 0; i < 2; i++) v[i + 14] = blake512_constants[i+6] ^ t[1]; + + for (i = 0; i < 16; i++) m[i] = U8TO64_BE(&in[i * 8]); + in += 128; + + #define G(a,b,c,d,e) \ + v[a] += (m[sigma[e+0]] ^ blake512_constants[sigma[e+1]]) + v[b]; \ + v[d] = ROTR64(v[d] ^ v[a],32); \ + v[c] += v[d]; \ + v[b] = ROTR64(v[b] ^ v[c],25); \ + v[a] += (m[sigma[e+1]] ^ blake512_constants[sigma[e+0]]) + v[b]; \ + v[d] = ROTR64(v[d] ^ v[a],16); \ + v[c] += v[d]; \ + v[b] = ROTR64(v[b] ^ v[c],11); + + for (i = 0, sigma = blake512_sigma; i < 16; i++) { + G(0, 4, 8,12, 0); + G(1, 5, 9,13, 2); + G(2, 6,10,14, 4); + G(3, 7,11,15, 6); + G(0, 5,10,15, 8); + G(1, 6,11,12,10); + G(2, 7, 8,13,12); + G(3, 4, 9,14,14); + + sigma += 16; + if (sigma == sigma_end) + sigma = blake512_sigma; + } + + #undef G + + for (i = 0; i < 8; i++) h[i] ^= (v[i] ^ v[i + 8]); + } + + for (i = 0; i < 8; i++) S->H[i] = h[i]; + for (i = 0; i < 2; i++) S->T[i] = t[i]; +} + +static void +scrypt_hash_init(scrypt_hash_state *S) { + S->H[0] = 0x6a09e667f3bcc908ULL; + S->H[1] = 0xbb67ae8584caa73bULL; + S->H[2] = 0x3c6ef372fe94f82bULL; + S->H[3] = 0xa54ff53a5f1d36f1ULL; + S->H[4] = 0x510e527fade682d1ULL; + S->H[5] = 0x9b05688c2b3e6c1fULL; + S->H[6] = 0x1f83d9abfb41bd6bULL; + S->H[7] = 0x5be0cd19137e2179ULL; + S->T[0] = 0; + S->T[1] = 0; + S->leftover = 0; +} + +static void +scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { + size_t blocks, want; + + /* handle the previous data */ + if (S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + want = (want < inlen) ? 
want : inlen; + memcpy(S->buffer + S->leftover, in, want); + S->leftover += (uint32_t)want; + if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) + return; + in += want; + inlen -= want; + blake512_blocks(S, S->buffer, 1); + } + + /* handle the current data */ + blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); + S->leftover = (uint32_t)(inlen - blocks); + if (blocks) { + blake512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); + in += blocks; + } + + /* handle leftover data */ + if (S->leftover) + memcpy(S->buffer, in, S->leftover); +} + +static void +scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { + uint64_t th, tl; + size_t bits; + + bits = (S->leftover << 3); + tl = S->T[0] + bits; + th = S->T[1]; + if (S->leftover == 0) { + S->T[0] = (uint64_t)0 - (uint64_t)1024; + S->T[1] = (uint64_t)0 - (uint64_t)1; + } else if (S->T[0] == 0) { + S->T[0] = ((uint64_t)0 - (uint64_t)1024) + bits; + S->T[1] = S->T[1] - 1; + } else { + S->T[0] -= (1024 - bits); + } + + S->buffer[S->leftover] = 0x80; + if (S->leftover <= 111) { + memset(S->buffer + S->leftover + 1, 0, 111 - S->leftover); + } else { + memset(S->buffer + S->leftover + 1, 0, 127 - S->leftover); + blake512_blocks(S, S->buffer, 1); + S->T[0] = (uint64_t)0 - (uint64_t)1024; + S->T[1] = (uint64_t)0 - (uint64_t)1; + memset(S->buffer, 0, 112); + } + S->buffer[111] |= 1; + U64TO8_BE(S->buffer + 112, th); + U64TO8_BE(S->buffer + 120, tl); + blake512_blocks(S, S->buffer, 1); + + U64TO8_BE(&hash[ 0], S->H[0]); + U64TO8_BE(&hash[ 8], S->H[1]); + U64TO8_BE(&hash[16], S->H[2]); + U64TO8_BE(&hash[24], S->H[3]); + U64TO8_BE(&hash[32], S->H[4]); + U64TO8_BE(&hash[40], S->H[5]); + U64TO8_BE(&hash[48], S->H[6]); + U64TO8_BE(&hash[56], S->H[7]); +} + +static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { + 0x2f,0x9d,0x5b,0xbe,0x24,0x0d,0x63,0xd3,0xa0,0xac,0x4f,0xd3,0x01,0xc0,0x23,0x6f, + 0x6d,0xdf,0x6e,0xfb,0x60,0x6f,0xa0,0x74,0xdf,0x9f,0x25,0x65,0xb6,0x11,0x0a,0x83, + 0x23,0x96,0xba,0x91,0x68,0x4b,0x85,0x15,0x13,0x54,0xba,0x19,0xf3,0x2c,0x5a,0x4a, + 0x1f,0x78,0x31,0x02,0xc9,0x1e,0x56,0xc4,0x54,0xca,0xf9,0x8f,0x2c,0x7f,0x85,0xac +}; diff --git a/scryptjane/scrypt-jane-hash_sha512.h b/scryptjane/scrypt-jane-hash_sha512.h new file mode 100644 index 00000000..3e3997d0 --- /dev/null +++ b/scryptjane/scrypt-jane-hash_sha512.h @@ -0,0 +1,152 @@ +#define SCRYPT_HASH "SHA-2-512" +#define SCRYPT_HASH_BLOCK_SIZE 128 +#define SCRYPT_HASH_DIGEST_SIZE 64 + +typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + +typedef struct scrypt_hash_state_t { + uint64_t H[8]; + uint64_t T[2]; + uint32_t leftover; + uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; +} scrypt_hash_state; + +static const uint64_t sha512_constants[80] = { + 0x428a2f98d728ae22ull, 0x7137449123ef65cdull, 0xb5c0fbcfec4d3b2full, 0xe9b5dba58189dbbcull, + 0x3956c25bf348b538ull, 0x59f111f1b605d019ull, 0x923f82a4af194f9bull, 0xab1c5ed5da6d8118ull, + 0xd807aa98a3030242ull, 0x12835b0145706fbeull, 0x243185be4ee4b28cull, 0x550c7dc3d5ffb4e2ull, + 0x72be5d74f27b896full, 0x80deb1fe3b1696b1ull, 0x9bdc06a725c71235ull, 0xc19bf174cf692694ull, + 0xe49b69c19ef14ad2ull, 0xefbe4786384f25e3ull, 0x0fc19dc68b8cd5b5ull, 0x240ca1cc77ac9c65ull, + 0x2de92c6f592b0275ull, 0x4a7484aa6ea6e483ull, 0x5cb0a9dcbd41fbd4ull, 0x76f988da831153b5ull, + 0x983e5152ee66dfabull, 0xa831c66d2db43210ull, 0xb00327c898fb213full, 0xbf597fc7beef0ee4ull, + 0xc6e00bf33da88fc2ull, 0xd5a79147930aa725ull, 0x06ca6351e003826full, 0x142929670a0e6e70ull, + 0x27b70a8546d22ffcull, 0x2e1b21385c26c926ull, 0x4d2c6dfc5ac42aedull, 
0x53380d139d95b3dfull, + 0x650a73548baf63deull, 0x766a0abb3c77b2a8ull, 0x81c2c92e47edaee6ull, 0x92722c851482353bull, + 0xa2bfe8a14cf10364ull, 0xa81a664bbc423001ull, 0xc24b8b70d0f89791ull, 0xc76c51a30654be30ull, + 0xd192e819d6ef5218ull, 0xd69906245565a910ull, 0xf40e35855771202aull, 0x106aa07032bbd1b8ull, + 0x19a4c116b8d2d0c8ull, 0x1e376c085141ab53ull, 0x2748774cdf8eeb99ull, 0x34b0bcb5e19b48a8ull, + 0x391c0cb3c5c95a63ull, 0x4ed8aa4ae3418acbull, 0x5b9cca4f7763e373ull, 0x682e6ff3d6b2b8a3ull, + 0x748f82ee5defb2fcull, 0x78a5636f43172f60ull, 0x84c87814a1f0ab72ull, 0x8cc702081a6439ecull, + 0x90befffa23631e28ull, 0xa4506cebde82bde9ull, 0xbef9a3f7b2c67915ull, 0xc67178f2e372532bull, + 0xca273eceea26619cull, 0xd186b8c721c0c207ull, 0xeada7dd6cde0eb1eull, 0xf57d4f7fee6ed178ull, + 0x06f067aa72176fbaull, 0x0a637dc5a2c898a6ull, 0x113f9804bef90daeull, 0x1b710b35131c471bull, + 0x28db77f523047d84ull, 0x32caab7b40c72493ull, 0x3c9ebe0a15c9bebcull, 0x431d67c49c100d4cull, + 0x4cc5d4becb3e42b6ull, 0x597f299cfc657e2aull, 0x5fcb6fab3ad6faecull, 0x6c44198c4a475817ull +}; + +#define Ch(x,y,z) (z ^ (x & (y ^ z))) +#define Maj(x,y,z) (((x | y) & z) | (x & y)) +#define S0(x) (ROTR64(x, 28) ^ ROTR64(x, 34) ^ ROTR64(x, 39)) +#define S1(x) (ROTR64(x, 14) ^ ROTR64(x, 18) ^ ROTR64(x, 41)) +#define G0(x) (ROTR64(x, 1) ^ ROTR64(x, 8) ^ (x >> 7)) +#define G1(x) (ROTR64(x, 19) ^ ROTR64(x, 61) ^ (x >> 6)) +#define W0(in,i) (U8TO64_BE(&in[i * 8])) +#define W1(i) (G1(w[i - 2]) + w[i - 7] + G0(w[i - 15]) + w[i - 16]) +#define STEP(i) \ + t1 = S0(r[0]) + Maj(r[0], r[1], r[2]); \ + t0 = r[7] + S1(r[4]) + Ch(r[4], r[5], r[6]) + sha512_constants[i] + w[i]; \ + r[7] = r[6]; \ + r[6] = r[5]; \ + r[5] = r[4]; \ + r[4] = r[3] + t0; \ + r[3] = r[2]; \ + r[2] = r[1]; \ + r[1] = r[0]; \ + r[0] = t0 + t1; + +static void +sha512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) { + uint64_t r[8], w[80], t0, t1; + size_t i; + + for (i = 0; i < 8; i++) r[i] = S->H[i]; + + while (blocks--) { + for (i = 0; i < 16; i++) { w[i] = W0(in, i); } + for (i = 16; i < 80; i++) { w[i] = W1(i); } + for (i = 0; i < 80; i++) { STEP(i); } + for (i = 0; i < 8; i++) { r[i] += S->H[i]; S->H[i] = r[i]; } + S->T[0] += SCRYPT_HASH_BLOCK_SIZE * 8; + S->T[1] += (!S->T[0]) ? 1 : 0; + in += SCRYPT_HASH_BLOCK_SIZE; + } +} + +static void +scrypt_hash_init(scrypt_hash_state *S) { + S->H[0] = 0x6a09e667f3bcc908ull; + S->H[1] = 0xbb67ae8584caa73bull; + S->H[2] = 0x3c6ef372fe94f82bull; + S->H[3] = 0xa54ff53a5f1d36f1ull; + S->H[4] = 0x510e527fade682d1ull; + S->H[5] = 0x9b05688c2b3e6c1full; + S->H[6] = 0x1f83d9abfb41bd6bull; + S->H[7] = 0x5be0cd19137e2179ull; + S->T[0] = 0; + S->T[1] = 0; + S->leftover = 0; +} + +static void +scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { + size_t blocks, want; + + /* handle the previous data */ + if (S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + want = (want < inlen) ? 
want : inlen; + memcpy(S->buffer + S->leftover, in, want); + S->leftover += (uint32_t)want; + if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) + return; + in += want; + inlen -= want; + sha512_blocks(S, S->buffer, 1); + } + + /* handle the current data */ + blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); + S->leftover = (uint32_t)(inlen - blocks); + if (blocks) { + sha512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); + in += blocks; + } + + /* handle leftover data */ + if (S->leftover) + memcpy(S->buffer, in, S->leftover); +} + +static void +scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { + uint64_t t0 = S->T[0] + (S->leftover * 8), t1 = S->T[1]; + + S->buffer[S->leftover] = 0x80; + if (S->leftover <= 111) { + memset(S->buffer + S->leftover + 1, 0, 111 - S->leftover); + } else { + memset(S->buffer + S->leftover + 1, 0, 127 - S->leftover); + sha512_blocks(S, S->buffer, 1); + memset(S->buffer, 0, 112); + } + + U64TO8_BE(S->buffer + 112, t1); + U64TO8_BE(S->buffer + 120, t0); + sha512_blocks(S, S->buffer, 1); + + U64TO8_BE(&hash[ 0], S->H[0]); + U64TO8_BE(&hash[ 8], S->H[1]); + U64TO8_BE(&hash[16], S->H[2]); + U64TO8_BE(&hash[24], S->H[3]); + U64TO8_BE(&hash[32], S->H[4]); + U64TO8_BE(&hash[40], S->H[5]); + U64TO8_BE(&hash[48], S->H[6]); + U64TO8_BE(&hash[56], S->H[7]); +} + +static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { + 0xba,0xc3,0x80,0x2b,0x24,0x56,0x95,0x1f,0x19,0x7c,0xa2,0xd3,0x72,0x7c,0x9a,0x4d, + 0x1d,0x50,0x3a,0xa9,0x12,0x27,0xd8,0xe1,0xbe,0x76,0x53,0x87,0x5a,0x1e,0x82,0xec, + 0xc8,0xe1,0x6b,0x87,0xd0,0xb5,0x25,0x7e,0xe8,0x1e,0xd7,0x58,0xc6,0x2d,0xc2,0x9c, + 0x06,0x31,0x8f,0x5b,0x57,0x8e,0x76,0xba,0xd5,0xf6,0xec,0xfe,0x85,0x1f,0x34,0x0c, +}; diff --git a/scryptjane/scrypt-jane-hash_skein512.h b/scryptjane/scrypt-jane-hash_skein512.h new file mode 100644 index 00000000..736d893d --- /dev/null +++ b/scryptjane/scrypt-jane-hash_skein512.h @@ -0,0 +1,188 @@ +#define SCRYPT_HASH "Skein-512" +#define SCRYPT_HASH_BLOCK_SIZE 64 +#define SCRYPT_HASH_DIGEST_SIZE 64 + +typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + +typedef struct scrypt_hash_state_t { + uint64_t X[8], T[2]; + uint32_t leftover; + uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; +} scrypt_hash_state; + +#include + +static void +skein512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks, size_t add) { + uint64_t X[8], key[8], Xt[9+18], T[3+1]; + size_t r; + + while (blocks--) { + T[0] = S->T[0] + add; + T[1] = S->T[1]; + T[2] = T[0] ^ T[1]; + key[0] = U8TO64_LE(in + 0); Xt[0] = S->X[0]; X[0] = key[0] + Xt[0]; + key[1] = U8TO64_LE(in + 8); Xt[1] = S->X[1]; X[1] = key[1] + Xt[1]; + key[2] = U8TO64_LE(in + 16); Xt[2] = S->X[2]; X[2] = key[2] + Xt[2]; + key[3] = U8TO64_LE(in + 24); Xt[3] = S->X[3]; X[3] = key[3] + Xt[3]; + key[4] = U8TO64_LE(in + 32); Xt[4] = S->X[4]; X[4] = key[4] + Xt[4]; + key[5] = U8TO64_LE(in + 40); Xt[5] = S->X[5]; X[5] = key[5] + Xt[5] + T[0]; + key[6] = U8TO64_LE(in + 48); Xt[6] = S->X[6]; X[6] = key[6] + Xt[6] + T[1]; + key[7] = U8TO64_LE(in + 56); Xt[7] = S->X[7]; X[7] = key[7] + Xt[7]; + Xt[8] = 0x1BD11BDAA9FC1A22ull ^ Xt[0] ^ Xt[1] ^ Xt[2] ^ Xt[3] ^ Xt[4] ^ Xt[5] ^ Xt[6] ^ Xt[7]; + in += SCRYPT_HASH_BLOCK_SIZE; + + for (r = 0; r < 18; r++) + Xt[r + 9] = Xt[r + 0]; + + for (r = 0; r < 18; r += 2) { + X[0] += X[1]; X[1] = ROTL64(X[1], 46) ^ X[0]; + X[2] += X[3]; X[3] = ROTL64(X[3], 36) ^ X[2]; + X[4] += X[5]; X[5] = ROTL64(X[5], 19) ^ X[4]; + X[6] += X[7]; X[7] = ROTL64(X[7], 37) ^ X[6]; + X[2] += X[1]; X[1] = ROTL64(X[1], 33) ^ X[2]; + X[0] += X[3]; 
X[3] = ROTL64(X[3], 42) ^ X[0]; + X[6] += X[5]; X[5] = ROTL64(X[5], 14) ^ X[6]; + X[4] += X[7]; X[7] = ROTL64(X[7], 27) ^ X[4]; + X[4] += X[1]; X[1] = ROTL64(X[1], 17) ^ X[4]; + X[6] += X[3]; X[3] = ROTL64(X[3], 49) ^ X[6]; + X[0] += X[5]; X[5] = ROTL64(X[5], 36) ^ X[0]; + X[2] += X[7]; X[7] = ROTL64(X[7], 39) ^ X[2]; + X[6] += X[1]; X[1] = ROTL64(X[1], 44) ^ X[6]; + X[4] += X[3]; X[3] = ROTL64(X[3], 56) ^ X[4]; + X[2] += X[5]; X[5] = ROTL64(X[5], 54) ^ X[2]; + X[0] += X[7]; X[7] = ROTL64(X[7], 9) ^ X[0]; + + X[0] += Xt[r + 1]; + X[1] += Xt[r + 2]; + X[2] += Xt[r + 3]; + X[3] += Xt[r + 4]; + X[4] += Xt[r + 5]; + X[5] += Xt[r + 6] + T[1]; + X[6] += Xt[r + 7] + T[2]; + X[7] += Xt[r + 8] + r + 1; + + T[3] = T[0]; + T[0] = T[1]; + T[1] = T[2]; + T[2] = T[3]; + + X[0] += X[1]; X[1] = ROTL64(X[1], 39) ^ X[0]; + X[2] += X[3]; X[3] = ROTL64(X[3], 30) ^ X[2]; + X[4] += X[5]; X[5] = ROTL64(X[5], 34) ^ X[4]; + X[6] += X[7]; X[7] = ROTL64(X[7], 24) ^ X[6]; + X[2] += X[1]; X[1] = ROTL64(X[1], 13) ^ X[2]; + X[0] += X[3]; X[3] = ROTL64(X[3], 17) ^ X[0]; + X[6] += X[5]; X[5] = ROTL64(X[5], 10) ^ X[6]; + X[4] += X[7]; X[7] = ROTL64(X[7], 50) ^ X[4]; + X[4] += X[1]; X[1] = ROTL64(X[1], 25) ^ X[4]; + X[6] += X[3]; X[3] = ROTL64(X[3], 29) ^ X[6]; + X[0] += X[5]; X[5] = ROTL64(X[5], 39) ^ X[0]; + X[2] += X[7]; X[7] = ROTL64(X[7], 43) ^ X[2]; + X[6] += X[1]; X[1] = ROTL64(X[1], 8) ^ X[6]; + X[4] += X[3]; X[3] = ROTL64(X[3], 22) ^ X[4]; + X[2] += X[5]; X[5] = ROTL64(X[5], 56) ^ X[2]; + X[0] += X[7]; X[7] = ROTL64(X[7], 35) ^ X[0]; + + X[0] += Xt[r + 2]; + X[1] += Xt[r + 3]; + X[2] += Xt[r + 4]; + X[3] += Xt[r + 5]; + X[4] += Xt[r + 6]; + X[5] += Xt[r + 7] + T[1]; + X[6] += Xt[r + 8] + T[2]; + X[7] += Xt[r + 9] + r + 2; + + T[3] = T[0]; + T[0] = T[1]; + T[1] = T[2]; + T[2] = T[3]; + } + + S->X[0] = key[0] ^ X[0]; + S->X[1] = key[1] ^ X[1]; + S->X[2] = key[2] ^ X[2]; + S->X[3] = key[3] ^ X[3]; + S->X[4] = key[4] ^ X[4]; + S->X[5] = key[5] ^ X[5]; + S->X[6] = key[6] ^ X[6]; + S->X[7] = key[7] ^ X[7]; + + S->T[0] = T[0]; + S->T[1] = T[1] & ~0x4000000000000000ull; + } +} + +static void +scrypt_hash_init(scrypt_hash_state *S) { + S->X[0] = 0x4903ADFF749C51CEull; + S->X[1] = 0x0D95DE399746DF03ull; + S->X[2] = 0x8FD1934127C79BCEull; + S->X[3] = 0x9A255629FF352CB1ull; + S->X[4] = 0x5DB62599DF6CA7B0ull; + S->X[5] = 0xEABE394CA9D5C3F4ull; + S->X[6] = 0x991112C71A75B523ull; + S->X[7] = 0xAE18A40B660FCC33ull; + S->T[0] = 0x0000000000000000ull; + S->T[1] = 0x7000000000000000ull; + S->leftover = 0; +} + +static void +scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { + size_t blocks, want; + + /* skein processes the final <=64 bytes raw, so we can only update if there are at least 64+1 bytes available */ + if ((S->leftover + inlen) > SCRYPT_HASH_BLOCK_SIZE) { + /* handle the previous data, we know there is enough for at least one block */ + if (S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + memcpy(S->buffer + S->leftover, in, want); + in += want; + inlen -= want; + S->leftover = 0; + skein512_blocks(S, S->buffer, 1, SCRYPT_HASH_BLOCK_SIZE); + } + + /* handle the current data if there's more than one block */ + if (inlen > SCRYPT_HASH_BLOCK_SIZE) { + blocks = ((inlen - 1) & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); + skein512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE, SCRYPT_HASH_BLOCK_SIZE); + inlen -= blocks; + in += blocks; + } + } + + /* handle leftover data */ + memcpy(S->buffer + S->leftover, in, inlen); + S->leftover += inlen; +} + +static void +scrypt_hash_finish(scrypt_hash_state 
*S, uint8_t *hash) { + memset(S->buffer + S->leftover, 0, SCRYPT_HASH_BLOCK_SIZE - S->leftover); + S->T[1] |= 0x8000000000000000ull; + skein512_blocks(S, S->buffer, 1, S->leftover); + + memset(S->buffer, 0, SCRYPT_HASH_BLOCK_SIZE); + S->T[0] = 0; + S->T[1] = 0xff00000000000000ull; + skein512_blocks(S, S->buffer, 1, 8); + + U64TO8_LE(&hash[ 0], S->X[0]); + U64TO8_LE(&hash[ 8], S->X[1]); + U64TO8_LE(&hash[16], S->X[2]); + U64TO8_LE(&hash[24], S->X[3]); + U64TO8_LE(&hash[32], S->X[4]); + U64TO8_LE(&hash[40], S->X[5]); + U64TO8_LE(&hash[48], S->X[6]); + U64TO8_LE(&hash[56], S->X[7]); +} + + +static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { + 0x4d,0x52,0x29,0xff,0x10,0xbc,0xd2,0x62,0xd1,0x61,0x83,0xc8,0xe6,0xf0,0x83,0xc4, + 0x9f,0xf5,0x6a,0x42,0x75,0x2a,0x26,0x4e,0xf0,0x28,0x72,0x28,0x47,0xe8,0x23,0xdf, + 0x1e,0x64,0xf1,0x51,0x38,0x35,0x9d,0xc2,0x83,0xfc,0x35,0x4e,0xc0,0x52,0x5f,0x41, + 0x6a,0x0b,0x7d,0xf5,0xce,0x98,0xde,0x6f,0x36,0xd8,0x51,0x15,0x78,0x78,0x93,0x67, +}; diff --git a/scryptjane/scrypt-jane-mix_chacha-avx.h b/scryptjane/scrypt-jane-mix_chacha-avx.h index 50d6e2d2..ddd3ee11 100644 --- a/scryptjane/scrypt-jane-mix_chacha-avx.h +++ b/scryptjane/scrypt-jane-mix_chacha-avx.h @@ -1,5 +1,5 @@ /* x86 */ -#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) +#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_CHACHA_AVX @@ -20,13 +20,33 @@ asm_naked_fn(scrypt_ChunkMix_avx) a2(shl edx,6) a2(lea ecx,[edx-64]) a2(and eax, eax) - a2(vmovdqa xmm4,[ssse3_rotl16_32bit]) - a2(vmovdqa xmm5,[ssse3_rotl8_32bit]) + a2(mov ebx, 0x01000302) + a2(vmovd xmm4, ebx) + a2(mov ebx, 0x05040706) + a2(vmovd xmm0, ebx) + a2(mov ebx, 0x09080b0a) + a2(vmovd xmm1, ebx) + a2(mov ebx, 0x0d0c0f0e) + a2(vmovd xmm2, ebx) + a2(mov ebx, 0x02010003) + a2(vmovd xmm5, ebx) + a2(mov ebx, 0x06050407) + a2(vmovd xmm3, ebx) + a2(mov ebx, 0x0a09080b) + a2(vmovd xmm6, ebx) + a2(mov ebx, 0x0e0d0c0f) + a2(vmovd xmm7, ebx) + a3(vpunpckldq xmm4, xmm4, xmm0) + a3(vpunpckldq xmm5, xmm5, xmm3) + a3(vpunpckldq xmm1, xmm1, xmm2) + a3(vpunpckldq xmm6, xmm6, xmm7) + a3(vpunpcklqdq xmm4, xmm4, xmm1) + a3(vpunpcklqdq xmm5, xmm5, xmm6) a2(vmovdqa xmm0,[ecx+esi+0]) a2(vmovdqa xmm1,[ecx+esi+16]) a2(vmovdqa xmm2,[ecx+esi+32]) a2(vmovdqa xmm3,[ecx+esi+48]) - a1(jz scrypt_ChunkMix_avx_no_xor1) + aj(jz scrypt_ChunkMix_avx_no_xor1) a3(vpxor xmm0,xmm0,[ecx+eax+0]) a3(vpxor xmm1,xmm1,[ecx+eax+16]) a3(vpxor xmm2,xmm2,[ecx+eax+32]) @@ -40,7 +60,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpxor xmm1,xmm1,[esi+ecx+16]) a3(vpxor xmm2,xmm2,[esi+ecx+32]) a3(vpxor xmm3,xmm3,[esi+ecx+48]) - a1(jz scrypt_ChunkMix_avx_no_xor2) + aj(jz scrypt_ChunkMix_avx_no_xor2) a3(vpxor xmm0,xmm0,[eax+ecx+0]) a3(vpxor xmm1,xmm1,[eax+ecx+16]) a3(vpxor xmm2,xmm2,[eax+ecx+32]) @@ -71,7 +91,6 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpsrld xmm6,xmm1,25) a3(vpslld xmm1,xmm1,7) a3(vpxor xmm1,xmm1,xmm6) - a2(sub eax,2) a3(vpaddd xmm0,xmm0,xmm1) a3(vpxor xmm3,xmm3,xmm0) a3(vpshufb xmm3,xmm3,xmm4) @@ -85,13 +104,14 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpshufb xmm3,xmm3,xmm5) a3(vpshufd xmm0,xmm0,0x39) a3(vpaddd xmm2,xmm2,xmm3) - a3(pshufd xmm3,xmm3,0x4e) + a3(vpshufd xmm3,xmm3,0x4e) a3(vpxor xmm1,xmm1,xmm2) - a3(pshufd xmm2,xmm2,0x93) + a3(vpshufd xmm2,xmm2,0x93) a3(vpsrld xmm6,xmm1,25) a3(vpslld xmm1,xmm1,7) a3(vpxor xmm1,xmm1,xmm6) - a1(ja scrypt_chacha_avx_loop) + a2(sub eax,2) + aj(ja scrypt_chacha_avx_loop) 
a3(vpaddd xmm0,xmm0,[esp+0]) a3(vpaddd xmm1,xmm1,[esp+16]) a3(vpaddd xmm2,xmm2,[esp+32]) @@ -108,13 +128,13 @@ asm_naked_fn(scrypt_ChunkMix_avx) a2(vmovdqa [eax+32],xmm2) a2(vmovdqa [eax+48],xmm3) a2(mov eax,[ebp+28]) - a1(jne scrypt_ChunkMix_avx_loop) + aj(jne scrypt_ChunkMix_avx_loop) a2(mov esp,ebp) a1(pop ebp) a1(pop esi) a1(pop edi) a1(pop ebx) - a1(ret 16) + aret(16) asm_naked_fn_end(scrypt_ChunkMix_avx) #endif @@ -122,25 +142,33 @@ asm_naked_fn_end(scrypt_ChunkMix_avx) /* x64 */ -#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) +#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_CHACHA_AVX asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) asm_naked_fn(scrypt_ChunkMix_avx) - a2(lea rcx,[rcx*2]) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ a2(shl rcx,6) a2(lea r9,[rcx-64]) a2(lea rax,[rsi+r9]) a2(lea r9,[rdx+r9]) a2(and rdx, rdx) - a2(vmovdqa xmm4,[ssse3_rotl16_32bit]) - a2(vmovdqa xmm5,[ssse3_rotl8_32bit]) a2(vmovdqa xmm0,[rax+0]) a2(vmovdqa xmm1,[rax+16]) a2(vmovdqa xmm2,[rax+32]) a2(vmovdqa xmm3,[rax+48]) - a1(jz scrypt_ChunkMix_avx_no_xor1) + a2(mov r8, 0x0504070601000302) + a2(mov rax, 0x0d0c0f0e09080b0a) + a2(movd xmm4, r8) + a2(movd xmm6, rax) + a2(mov r8, 0x0605040702010003) + a2(mov rax, 0x0e0d0c0f0a09080b) + a2(movd xmm5, r8) + a2(movd xmm7, rax) + a3(vpunpcklqdq xmm4, xmm4, xmm6) + a3(vpunpcklqdq xmm5, xmm5, xmm7) + aj(jz scrypt_ChunkMix_avx_no_xor1) a3(vpxor xmm0,xmm0,[r9+0]) a3(vpxor xmm1,xmm1,[r9+16]) a3(vpxor xmm2,xmm2,[r9+32]) @@ -154,7 +182,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpxor xmm1,xmm1,[rsi+r9+16]) a3(vpxor xmm2,xmm2,[rsi+r9+32]) a3(vpxor xmm3,xmm3,[rsi+r9+48]) - a1(jz scrypt_ChunkMix_avx_no_xor2) + aj(jz scrypt_ChunkMix_avx_no_xor2) a3(vpxor xmm0,xmm0,[rdx+r9+0]) a3(vpxor xmm1,xmm1,[rdx+r9+16]) a3(vpxor xmm2,xmm2,[rdx+r9+32]) @@ -185,7 +213,6 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpsrld xmm12,xmm1,25) a3(vpslld xmm1,xmm1,7) a3(vpxor xmm1,xmm1,xmm12) - a2(sub rax,2) a3(vpaddd xmm0,xmm0,xmm1) a3(vpxor xmm3,xmm3,xmm0) a3(vpshufb xmm3,xmm3,xmm4) @@ -199,13 +226,14 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpshufb xmm3,xmm3,xmm5) a3(vpshufd xmm0,xmm0,0x39) a3(vpaddd xmm2,xmm2,xmm3) - a3(pshufd xmm3,xmm3,0x4e) + a3(vpshufd xmm3,xmm3,0x4e) a3(vpxor xmm1,xmm1,xmm2) - a3(pshufd xmm2,xmm2,0x93) + a3(vpshufd xmm2,xmm2,0x93) a3(vpsrld xmm12,xmm1,25) a3(vpslld xmm1,xmm1,7) a3(vpxor xmm1,xmm1,xmm12) - a1(ja scrypt_chacha_avx_loop) + a2(sub rax,2) + aj(ja scrypt_chacha_avx_loop) a3(vpaddd xmm0,xmm0,xmm8) a3(vpaddd xmm1,xmm1,xmm9) a3(vpaddd xmm2,xmm2,xmm10) @@ -221,7 +249,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a2(vmovdqa [rax+16],xmm1) a2(vmovdqa [rax+32],xmm2) a2(vmovdqa [rax+48],xmm3) - a1(jne scrypt_ChunkMix_avx_loop) + aj(jne scrypt_ChunkMix_avx_loop) a1(ret) asm_naked_fn_end(scrypt_ChunkMix_avx) @@ -233,7 +261,7 @@ asm_naked_fn_end(scrypt_ChunkMix_avx) #define SCRYPT_CHACHA_AVX -static void NOINLINE +static void asm_calling_convention NOINLINE scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { uint32_t i, blocksPerChunk = r * 2, half = 0; xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; diff --git a/scryptjane/scrypt-jane-mix_chacha-sse2.h b/scryptjane/scrypt-jane-mix_chacha-sse2.h 
index d2192c8f..a8c2197a 100644 --- a/scryptjane/scrypt-jane-mix_chacha-sse2.h +++ b/scryptjane/scrypt-jane-mix_chacha-sse2.h @@ -1,5 +1,5 @@ /* x86 */ -#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) +#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_CHACHA_SSE2 @@ -24,7 +24,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(movdqa xmm1,[ecx+esi+16]) a2(movdqa xmm2,[ecx+esi+32]) a2(movdqa xmm3,[ecx+esi+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor1) + aj(jz scrypt_ChunkMix_sse2_no_xor1) a2(pxor xmm0,[ecx+eax+0]) a2(pxor xmm1,[ecx+eax+16]) a2(pxor xmm2,[ecx+eax+32]) @@ -38,7 +38,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(pxor xmm1,[esi+ecx+16]) a2(pxor xmm2,[esi+ecx+32]) a2(pxor xmm3,[esi+ecx+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor2) + aj(jz scrypt_ChunkMix_sse2_no_xor2) a2(pxor xmm0,[eax+ecx+0]) a2(pxor xmm1,[eax+ecx+16]) a2(pxor xmm2,[eax+ecx+32]) @@ -52,10 +52,8 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a1(scrypt_chacha_sse2_loop: ) a2(paddd xmm0,xmm1) a2(pxor xmm3,xmm0) - a2(movdqa xmm6,xmm3) - a2(pslld xmm3,16) - a2(psrld xmm6,16) - a2(pxor xmm3,xmm6) + a3(pshuflw xmm3,xmm3,0xb1) + a3(pshufhw xmm3,xmm3,0xb1) a2(paddd xmm2,xmm3) a2(pxor xmm1,xmm2) a2(movdqa xmm6,xmm1) @@ -80,10 +78,8 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(sub eax,2) a2(paddd xmm0,xmm1) a2(pxor xmm3,xmm0) - a2(movdqa xmm6,xmm3) - a2(pslld xmm3,16) - a2(psrld xmm6,16) - a2(pxor xmm3,xmm6) + a3(pshuflw xmm3,xmm3,0xb1) + a3(pshufhw xmm3,xmm3,0xb1) a2(paddd xmm2,xmm3) a2(pxor xmm1,xmm2) a2(movdqa xmm6,xmm1) @@ -105,7 +101,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(pslld xmm1,7) a2(psrld xmm6,25) a2(pxor xmm1,xmm6) - a1(ja scrypt_chacha_sse2_loop) + aj(ja scrypt_chacha_sse2_loop) a2(paddd xmm0,[esp+0]) a2(paddd xmm1,xmm4) a2(paddd xmm2,xmm5) @@ -122,13 +118,13 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(movdqa [eax+32],xmm2) a2(movdqa [eax+48],xmm3) a2(mov eax,[ebp+28]) - a1(jne scrypt_ChunkMix_sse2_loop) + aj(jne scrypt_ChunkMix_sse2_loop) a2(mov esp,ebp) a1(pop ebp) a1(pop esi) a1(pop edi) a1(pop ebx) - a1(ret 16) + aret(16) asm_naked_fn_end(scrypt_ChunkMix_sse2) #endif @@ -136,13 +132,13 @@ asm_naked_fn_end(scrypt_ChunkMix_sse2) /* x64 */ -#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) +#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_CHACHA_SSE2 asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) asm_naked_fn(scrypt_ChunkMix_sse2) - a2(lea rcx,[rcx*2]) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ a2(shl rcx,6) a2(lea r9,[rcx-64]) a2(lea rax,[rsi+r9]) @@ -152,7 +148,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(movdqa xmm1,[rax+16]) a2(movdqa xmm2,[rax+32]) a2(movdqa xmm3,[rax+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor1) + aj(jz scrypt_ChunkMix_sse2_no_xor1) a2(pxor xmm0,[r9+0]) a2(pxor xmm1,[r9+16]) a2(pxor xmm2,[r9+32]) @@ -166,7 +162,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(pxor xmm1,[rsi+r9+16]) a2(pxor xmm2,[rsi+r9+32]) a2(pxor xmm3,[rsi+r9+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor2) + aj(jz scrypt_ChunkMix_sse2_no_xor2) a2(pxor xmm0,[rdx+r9+0]) a2(pxor xmm1,[rdx+r9+16]) a2(pxor xmm2,[rdx+r9+32]) @@ -180,10 +176,8 @@ asm_naked_fn(scrypt_ChunkMix_sse2) 
a1(scrypt_chacha_sse2_loop: ) a2(paddd xmm0,xmm1) a2(pxor xmm3,xmm0) - a2(movdqa xmm6,xmm3) - a2(pslld xmm3,16) - a2(psrld xmm6,16) - a2(pxor xmm3,xmm6) + a3(pshuflw xmm3,xmm3,0xb1) + a3(pshufhw xmm3,xmm3,0xb1) a2(paddd xmm2,xmm3) a2(pxor xmm1,xmm2) a2(movdqa xmm6,xmm1) @@ -208,10 +202,8 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(sub rax,2) a2(paddd xmm0,xmm1) a2(pxor xmm3,xmm0) - a2(movdqa xmm6,xmm3) - a2(pslld xmm3,16) - a2(psrld xmm6,16) - a2(pxor xmm3,xmm6) + a3(pshuflw xmm3,xmm3,0xb1) + a3(pshufhw xmm3,xmm3,0xb1) a2(paddd xmm2,xmm3) a2(pxor xmm1,xmm2) a2(movdqa xmm6,xmm1) @@ -233,7 +225,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(pslld xmm1,7) a2(psrld xmm6,25) a2(pxor xmm1,xmm6) - a1(ja scrypt_chacha_sse2_loop) + aj(ja scrypt_chacha_sse2_loop) a2(paddd xmm0,xmm8) a2(paddd xmm1,xmm9) a2(paddd xmm2,xmm10) @@ -249,7 +241,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(movdqa [rax+16],xmm1) a2(movdqa [rax+32],xmm2) a2(movdqa [rax+48],xmm3) - a1(jne scrypt_ChunkMix_sse2_loop) + aj(jne scrypt_ChunkMix_sse2_loop) a1(ret) asm_naked_fn_end(scrypt_ChunkMix_sse2) @@ -261,7 +253,7 @@ asm_naked_fn_end(scrypt_ChunkMix_sse2) #define SCRYPT_CHACHA_SSE2 -static void NOINLINE +static void NOINLINE asm_calling_convention scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { uint32_t i, blocksPerChunk = r * 2, half = 0; xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3; @@ -308,7 +300,7 @@ scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes] x0 = _mm_add_epi32(x0, x1); x3 = _mm_xor_si128(x3, x0); x4 = x3; - x3 = _mm_or_si128(_mm_slli_epi32(x3, 16), _mm_srli_epi32(x4, 16)); + x3 = _mm_shufflehi_epi16(_mm_shufflelo_epi16(x3, 0xb1), 0xb1); x2 = _mm_add_epi32(x2, x3); x1 = _mm_xor_si128(x1, x2); x4 = x1; @@ -327,7 +319,7 @@ scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes] x0 = _mm_add_epi32(x0, x1); x3 = _mm_xor_si128(x3, x0); x4 = x3; - x3 = _mm_or_si128(_mm_slli_epi32(x3, 16), _mm_srli_epi32(x4, 16)); + x3 = _mm_shufflehi_epi16(_mm_shufflelo_epi16(x3, 0xb1), 0xb1); x2 = _mm_add_epi32(x2, x3); x1 = _mm_xor_si128(x1, x2); x4 = x1; diff --git a/scryptjane/scrypt-jane-mix_chacha-ssse3.h b/scryptjane/scrypt-jane-mix_chacha-ssse3.h index b25e3567..894312e6 100644 --- a/scryptjane/scrypt-jane-mix_chacha-ssse3.h +++ b/scryptjane/scrypt-jane-mix_chacha-ssse3.h @@ -1,5 +1,5 @@ /* x86 */ -#if defined(X86ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) +#if defined(X86ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_CHACHA_SSSE3 @@ -20,13 +20,33 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a2(shl edx,6) a2(lea ecx,[edx-64]) a2(and eax, eax) - a2(movdqa xmm4,[ssse3_rotl16_32bit]) - a2(movdqa xmm5,[ssse3_rotl8_32bit]) + a2(mov ebx, 0x01000302) + a2(movd xmm4, ebx) + a2(mov ebx, 0x05040706) + a2(movd xmm0, ebx) + a2(mov ebx, 0x09080b0a) + a2(movd xmm1, ebx) + a2(mov ebx, 0x0d0c0f0e) + a2(movd xmm2, ebx) + a2(mov ebx, 0x02010003) + a2(movd xmm5, ebx) + a2(mov ebx, 0x06050407) + a2(movd xmm3, ebx) + a2(mov ebx, 0x0a09080b) + a2(movd xmm6, ebx) + a2(mov ebx, 0x0e0d0c0f) + a2(movd xmm7, ebx) + a2(punpckldq xmm4, xmm0) + a2(punpckldq xmm5, xmm3) + a2(punpckldq xmm1, xmm2) + a2(punpckldq xmm6, xmm7) + a2(punpcklqdq xmm4, xmm1) + a2(punpcklqdq xmm5, xmm6) a2(movdqa xmm0,[ecx+esi+0]) a2(movdqa xmm1,[ecx+esi+16]) a2(movdqa xmm2,[ecx+esi+32]) a2(movdqa xmm3,[ecx+esi+48]) - a1(jz 
scrypt_ChunkMix_ssse3_no_xor1) + aj(jz scrypt_ChunkMix_ssse3_no_xor1) a2(pxor xmm0,[ecx+eax+0]) a2(pxor xmm1,[ecx+eax+16]) a2(pxor xmm2,[ecx+eax+32]) @@ -40,7 +60,7 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a2(pxor xmm1,[esi+ecx+16]) a2(pxor xmm2,[esi+ecx+32]) a2(pxor xmm3,[esi+ecx+48]) - a1(jz scrypt_ChunkMix_ssse3_no_xor2) + aj(jz scrypt_ChunkMix_ssse3_no_xor2) a2(pxor xmm0,[eax+ecx+0]) a2(pxor xmm1,[eax+ecx+16]) a2(pxor xmm2,[eax+ecx+32]) @@ -95,7 +115,7 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a2(pslld xmm1,7) a2(psrld xmm6,25) a2(pxor xmm1,xmm6) - a1(ja scrypt_chacha_ssse3_loop) + aj(ja scrypt_chacha_ssse3_loop) a2(paddd xmm0,[esp+0]) a2(paddd xmm1,[esp+16]) a2(paddd xmm2,[esp+32]) @@ -112,13 +132,13 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a2(movdqa [eax+32],xmm2) a2(movdqa [eax+48],xmm3) a2(mov eax,[ebp+28]) - a1(jne scrypt_ChunkMix_ssse3_loop) + aj(jne scrypt_ChunkMix_ssse3_loop) a2(mov esp,ebp) a1(pop ebp) a1(pop esi) a1(pop edi) a1(pop ebx) - a1(ret 16) + aret(16) asm_naked_fn_end(scrypt_ChunkMix_ssse3) #endif @@ -126,25 +146,33 @@ asm_naked_fn_end(scrypt_ChunkMix_ssse3) /* x64 */ -#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) +#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_CHACHA_SSSE3 asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) asm_naked_fn(scrypt_ChunkMix_ssse3) - a2(lea rcx,[rcx*2]) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ a2(shl rcx,6) a2(lea r9,[rcx-64]) a2(lea rax,[rsi+r9]) a2(lea r9,[rdx+r9]) a2(and rdx, rdx) - a2(movdqa xmm4,[ssse3_rotl16_32bit]) - a2(movdqa xmm5,[ssse3_rotl8_32bit]) a2(movdqa xmm0,[rax+0]) a2(movdqa xmm1,[rax+16]) a2(movdqa xmm2,[rax+32]) a2(movdqa xmm3,[rax+48]) - a1(jz scrypt_ChunkMix_ssse3_no_xor1) + a2(mov r8, 0x0504070601000302) + a2(mov rax, 0x0d0c0f0e09080b0a) + a2(movd xmm4, r8) + a2(movd xmm6, rax) + a2(mov r8, 0x0605040702010003) + a2(mov rax, 0x0e0d0c0f0a09080b) + a2(movd xmm5, r8) + a2(movd xmm7, rax) + a2(punpcklqdq xmm4, xmm6) + a2(punpcklqdq xmm5, xmm7) + aj(jz scrypt_ChunkMix_ssse3_no_xor1) a2(pxor xmm0,[r9+0]) a2(pxor xmm1,[r9+16]) a2(pxor xmm2,[r9+32]) @@ -158,7 +186,7 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a2(pxor xmm1,[rsi+r9+16]) a2(pxor xmm2,[rsi+r9+32]) a2(pxor xmm3,[rsi+r9+48]) - a1(jz scrypt_ChunkMix_ssse3_no_xor2) + aj(jz scrypt_ChunkMix_ssse3_no_xor2) a2(pxor xmm0,[rdx+r9+0]) a2(pxor xmm1,[rdx+r9+16]) a2(pxor xmm2,[rdx+r9+32]) @@ -213,7 +241,7 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a2(pslld xmm1,7) a2(psrld xmm12,25) a2(pxor xmm1,xmm12) - a1(ja scrypt_chacha_ssse3_loop) + aj(ja scrypt_chacha_ssse3_loop) a2(paddd xmm0,xmm8) a2(paddd xmm1,xmm9) a2(paddd xmm2,xmm10) @@ -229,7 +257,7 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a2(movdqa [rax+16],xmm1) a2(movdqa [rax+32],xmm2) a2(movdqa [rax+48],xmm3) - a1(jne scrypt_ChunkMix_ssse3_loop) + aj(jne scrypt_ChunkMix_ssse3_loop) a1(ret) asm_naked_fn_end(scrypt_ChunkMix_ssse3) @@ -241,7 +269,7 @@ asm_naked_fn_end(scrypt_ChunkMix_ssse3) #define SCRYPT_CHACHA_SSSE3 -static void NOINLINE +static void NOINLINE asm_calling_convention scrypt_ChunkMix_ssse3(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { uint32_t i, blocksPerChunk = r * 2, half = 0; xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; diff --git 
a/scryptjane/scrypt-jane-mix_chacha-xop.h b/scryptjane/scrypt-jane-mix_chacha-xop.h new file mode 100644 index 00000000..4c25d887 --- /dev/null +++ b/scryptjane/scrypt-jane-mix_chacha-xop.h @@ -0,0 +1,315 @@ +/* x86 */ +#if defined(X86ASM_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_CHACHA_XOP + +asm_naked_fn_proto(void, scrypt_ChunkMix_xop)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_xop) + a1(push ebx) + a1(push edi) + a1(push esi) + a1(push ebp) + a2(mov ebp,esp) + a2(mov edi,[ebp+20]) + a2(mov esi,[ebp+24]) + a2(mov eax,[ebp+28]) + a2(mov ebx,[ebp+32]) + a2(sub esp,64) + a2(and esp,~63) + a2(lea edx,[ebx*2]) + a2(shl edx,6) + a2(lea ecx,[edx-64]) + a2(and eax, eax) + a2(vmovdqa xmm0,[ecx+esi+0]) + a2(vmovdqa xmm1,[ecx+esi+16]) + a2(vmovdqa xmm2,[ecx+esi+32]) + a2(vmovdqa xmm3,[ecx+esi+48]) + aj(jz scrypt_ChunkMix_xop_no_xor1) + a3(vpxor xmm0,xmm0,[ecx+eax+0]) + a3(vpxor xmm1,xmm1,[ecx+eax+16]) + a3(vpxor xmm2,xmm2,[ecx+eax+32]) + a3(vpxor xmm3,xmm3,[ecx+eax+48]) + a1(scrypt_ChunkMix_xop_no_xor1:) + a2(xor ecx,ecx) + a2(xor ebx,ebx) + a1(scrypt_ChunkMix_xop_loop:) + a2(and eax, eax) + a3(vpxor xmm0,xmm0,[esi+ecx+0]) + a3(vpxor xmm1,xmm1,[esi+ecx+16]) + a3(vpxor xmm2,xmm2,[esi+ecx+32]) + a3(vpxor xmm3,xmm3,[esi+ecx+48]) + aj(jz scrypt_ChunkMix_xop_no_xor2) + a3(vpxor xmm0,xmm0,[eax+ecx+0]) + a3(vpxor xmm1,xmm1,[eax+ecx+16]) + a3(vpxor xmm2,xmm2,[eax+ecx+32]) + a3(vpxor xmm3,xmm3,[eax+ecx+48]) + a1(scrypt_ChunkMix_xop_no_xor2:) + a2(vmovdqa xmm4,xmm0) + a2(vmovdqa xmm5,xmm1) + a2(vmovdqa xmm6,xmm2) + a2(vmovdqa xmm7,xmm3) + a2(mov eax,8) + a1(scrypt_chacha_xop_loop: ) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vprotd xmm3,xmm3,16) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpxor xmm1,xmm1,xmm2) + a3(vprotd xmm1,xmm1,12) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vprotd xmm3,xmm3,8) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpshufd xmm0,xmm0,0x93) + a3(vpxor xmm1,xmm1,xmm2) + a3(vprotd xmm1,xmm1,7) + a3(vpshufd xmm3,xmm3,0x4e) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpshufd xmm2,xmm2,0x39) + a3(vpxor xmm3,xmm3,xmm0) + a3(vprotd xmm3,xmm3,16) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpxor xmm1,xmm1,xmm2) + a3(vprotd xmm1,xmm1,12) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vprotd xmm3,xmm3,8) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpxor xmm1,xmm1,xmm2) + a3(vpshufd xmm0,xmm0,0x39) + a3(vprotd xmm1,xmm1,7) + a3(pshufd xmm3,xmm3,0x4e) + a3(pshufd xmm2,xmm2,0x93) + a2(sub eax,2) + aj(ja scrypt_chacha_xop_loop) + a3(vpaddd xmm0,xmm0,xmm4) + a3(vpaddd xmm1,xmm1,xmm5) + a3(vpaddd xmm2,xmm2,xmm6) + a3(vpaddd xmm3,xmm3,xmm7) + a2(lea eax,[ebx+ecx]) + a2(xor ebx,edx) + a2(and eax,~0x7f) + a2(add ecx,64) + a2(shr eax,1) + a2(add eax, edi) + a2(cmp ecx,edx) + a2(vmovdqa [eax+0],xmm0) + a2(vmovdqa [eax+16],xmm1) + a2(vmovdqa [eax+32],xmm2) + a2(vmovdqa [eax+48],xmm3) + a2(mov eax,[ebp+28]) + aj(jne scrypt_ChunkMix_xop_loop) + a2(mov esp,ebp) + a1(pop ebp) + a1(pop esi) + a1(pop edi) + a1(pop ebx) + aret(16) +asm_naked_fn_end(scrypt_ChunkMix_xop) + +#endif + + + +/* x64 */ +#if defined(X86_64ASM_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_CHACHA_XOP + +asm_naked_fn_proto(void, scrypt_ChunkMix_xop)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) 
+asm_naked_fn(scrypt_ChunkMix_xop) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ + a2(shl rcx,6) + a2(lea r9,[rcx-64]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(vmovdqa xmm0,[rax+0]) + a2(vmovdqa xmm1,[rax+16]) + a2(vmovdqa xmm2,[rax+32]) + a2(vmovdqa xmm3,[rax+48]) + aj(jz scrypt_ChunkMix_xop_no_xor1) + a3(vpxor xmm0,xmm0,[r9+0]) + a3(vpxor xmm1,xmm1,[r9+16]) + a3(vpxor xmm2,xmm2,[r9+32]) + a3(vpxor xmm3,xmm3,[r9+48]) + a1(scrypt_ChunkMix_xop_no_xor1:) + a2(xor r8,r8) + a2(xor r9,r9) + a1(scrypt_ChunkMix_xop_loop:) + a2(and rdx, rdx) + a3(vpxor xmm0,xmm0,[rsi+r9+0]) + a3(vpxor xmm1,xmm1,[rsi+r9+16]) + a3(vpxor xmm2,xmm2,[rsi+r9+32]) + a3(vpxor xmm3,xmm3,[rsi+r9+48]) + aj(jz scrypt_ChunkMix_xop_no_xor2) + a3(vpxor xmm0,xmm0,[rdx+r9+0]) + a3(vpxor xmm1,xmm1,[rdx+r9+16]) + a3(vpxor xmm2,xmm2,[rdx+r9+32]) + a3(vpxor xmm3,xmm3,[rdx+r9+48]) + a1(scrypt_ChunkMix_xop_no_xor2:) + a2(vmovdqa xmm4,xmm0) + a2(vmovdqa xmm5,xmm1) + a2(vmovdqa xmm6,xmm2) + a2(vmovdqa xmm7,xmm3) + a2(mov rax,8) + a1(scrypt_chacha_xop_loop: ) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vprotd xmm3,xmm3,16) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpxor xmm1,xmm1,xmm2) + a3(vprotd xmm1,xmm1,12) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vprotd xmm3,xmm3,8) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpshufd xmm0,xmm0,0x93) + a3(vpxor xmm1,xmm1,xmm2) + a3(vprotd xmm1,xmm1,7) + a3(vpshufd xmm3,xmm3,0x4e) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpshufd xmm2,xmm2,0x39) + a3(vpxor xmm3,xmm3,xmm0) + a3(vprotd xmm3,xmm3,16) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpxor xmm1,xmm1,xmm2) + a3(vprotd xmm1,xmm1,12) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vprotd xmm3,xmm3,8) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpxor xmm1,xmm1,xmm2) + a3(vpshufd xmm0,xmm0,0x39) + a3(vprotd xmm1,xmm1,7) + a3(pshufd xmm3,xmm3,0x4e) + a3(pshufd xmm2,xmm2,0x93) + a2(sub rax,2) + aj(ja scrypt_chacha_xop_loop) + a3(vpaddd xmm0,xmm0,xmm4) + a3(vpaddd xmm1,xmm1,xmm5) + a3(vpaddd xmm2,xmm2,xmm6) + a3(vpaddd xmm3,xmm3,xmm7) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0x7f) + a2(add r9,64) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(vmovdqa [rax+0],xmm0) + a2(vmovdqa [rax+16],xmm1) + a2(vmovdqa [rax+32],xmm2) + a2(vmovdqa [rax+48],xmm3) + aj(jne scrypt_ChunkMix_xop_loop) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_xop) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_XOP + +static void asm_calling_convention NOINLINE +scrypt_ChunkMix_xop(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + if (Bxor) { + xmmp = (xmmi 
*)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + + for (rounds = 8; rounds; rounds -= 2) { + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_roti_epi32(x3, 16); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x1 = _mm_roti_epi32(x1, 12); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_roti_epi32(x3, 8); + x2 = _mm_add_epi32(x2, x3); + x0 = _mm_shuffle_epi32(x0, 0x93); + x1 = _mm_xor_si128(x1, x2); + x1 = _mm_roti_epi32(x1, 7); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x0 = _mm_add_epi32(x0, x1); + x2 = _mm_shuffle_epi32(x2, 0x39); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_roti_epi32(x3, 16); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x1 = _mm_roti_epi32(x1, 12); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_roti_epi32(x3, 8); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x0 = _mm_shuffle_epi32(x0, 0x39); + x1 = _mm_roti_epi32(x1, 7); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x2 = _mm_shuffle_epi32(x2, 0x93); + } + + x0 = _mm_add_epi32(x0, t0); + x1 = _mm_add_epi32(x1, t1); + x2 = _mm_add_epi32(x2, t2); + x3 = _mm_add_epi32(x3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + } +} + +#endif + +#if defined(SCRYPT_CHACHA_XOP) + #undef SCRYPT_MIX + #define SCRYPT_MIX "ChaCha/8-XOP" + #undef SCRYPT_CHACHA_INCLUDED + #define SCRYPT_CHACHA_INCLUDED +#endif diff --git a/scryptjane/scrypt-jane-mix_salsa-avx.h b/scryptjane/scrypt-jane-mix_salsa-avx.h index 15fb48e3..259fae46 100644 --- a/scryptjane/scrypt-jane-mix_salsa-avx.h +++ b/scryptjane/scrypt-jane-mix_salsa-avx.h @@ -1,5 +1,5 @@ /* x86 */ -#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) +#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_SALSA_AVX @@ -24,7 +24,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a2(movdqa xmm1,[ecx+esi+16]) a2(movdqa xmm2,[ecx+esi+32]) a2(movdqa xmm3,[ecx+esi+48]) - a1(jz scrypt_ChunkMix_avx_no_xor1) + aj(jz scrypt_ChunkMix_avx_no_xor1) a3(vpxor xmm0,xmm0,[ecx+eax+0]) a3(vpxor xmm1,xmm1,[ecx+eax+16]) a3(vpxor xmm2,xmm2,[ecx+eax+32]) @@ -38,7 +38,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpxor xmm1,xmm1,[esi+ecx+16]) a3(vpxor xmm2,xmm2,[esi+ecx+32]) a3(vpxor xmm3,xmm3,[esi+ecx+48]) - a1(jz scrypt_ChunkMix_avx_no_xor2) + aj(jz scrypt_ChunkMix_avx_no_xor2) a3(vpxor xmm0,xmm0,[eax+ecx+0]) a3(vpxor xmm1,xmm1,[eax+ecx+16]) a3(vpxor xmm2,xmm2,[eax+ecx+32]) @@ -64,17 +64,16 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpsrld xmm5, xmm4, 19) a3(vpslld xmm4, xmm4, 13) a3(vpxor xmm1, xmm1, xmm5) - a3(pshufd xmm3, xmm3, 0x93) + a3(vpshufd xmm3, xmm3, 0x93) a3(vpxor xmm1, xmm1, xmm4) a3(vpaddd xmm4, xmm2, xmm1) a3(vpsrld xmm5, xmm4, 14) a3(vpslld xmm4, xmm4, 18) a3(vpxor xmm0, xmm0, xmm5) - a3(pshufd xmm2, xmm2, 0x4e) + a3(vpshufd xmm2, xmm2, 0x4e) a3(vpxor xmm0, xmm0, xmm4) - a2(sub eax, 2) a3(vpaddd xmm4, xmm3, xmm0) - a3(pshufd xmm1, xmm1, 0x39) + a3(vpshufd xmm1, xmm1, 0x39) a3(vpsrld xmm5, xmm4, 25) a3(vpslld xmm4, xmm4, 7) a3(vpxor xmm1, xmm1, xmm5) @@ -88,16 +87,17 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpsrld xmm5, xmm4, 19) a3(vpslld xmm4, xmm4, 13) 
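/* Reference sketch (not part of the patch): besides routing jumps through the new aj()
   macro, this hunk switches the remaining legacy pshufd encodings to vpshufd (keeping the
   block purely VEX-encoded, which presumably avoids SSE/AVX transition penalties) and
   moves the `sub eax,2` / `sub rax,2` decrement so it sits immediately before the `ja`
   that consumes its flags. The vpslld/vpsrld pairs implement the fixed Salsa20/8
   rotations 7, 9, 13 and 18; a scalar sketch of the quarter-round they vectorise
   (helper names are illustrative): */

#include <stdint.h>
#define ROTL32(x,n) (((x) << (n)) | ((x) >> (32 - (n))))

static void salsa20_quarter_round(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
    *b ^= ROTL32(*a + *d, 7);    /* vpslld 7  / vpsrld 25 */
    *c ^= ROTL32(*b + *a, 9);    /* vpslld 9  / vpsrld 23 */
    *d ^= ROTL32(*c + *b, 13);   /* vpslld 13 / vpsrld 19 */
    *a ^= ROTL32(*d + *c, 18);   /* vpslld 18 / vpsrld 14 */
}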
a3(vpxor xmm3, xmm3, xmm5) - a3(pshufd xmm1, xmm1, 0x93) + a3(vpshufd xmm1, xmm1, 0x93) a3(vpxor xmm3, xmm3, xmm4) a3(vpaddd xmm4, xmm2, xmm3) a3(vpsrld xmm5, xmm4, 14) a3(vpslld xmm4, xmm4, 18) a3(vpxor xmm0, xmm0, xmm5) - a3(pshufd xmm2, xmm2, 0x4e) + a3(vpshufd xmm2, xmm2, 0x4e) a3(vpxor xmm0, xmm0, xmm4) - a3(pshufd xmm3, xmm3, 0x39) - a1(ja scrypt_salsa_avx_loop) + a3(vpshufd xmm3, xmm3, 0x39) + a2(sub eax, 2) + aj(ja scrypt_salsa_avx_loop) a3(vpaddd xmm0,xmm0,[esp+0]) a3(vpaddd xmm1,xmm1,[esp+16]) a3(vpaddd xmm2,xmm2,xmm6) @@ -114,13 +114,13 @@ asm_naked_fn(scrypt_ChunkMix_avx) a2(vmovdqa [eax+32],xmm2) a2(vmovdqa [eax+48],xmm3) a2(mov eax,[ebp+28]) - a1(jne scrypt_ChunkMix_avx_loop) + aj(jne scrypt_ChunkMix_avx_loop) a2(mov esp,ebp) a1(pop ebp) a1(pop esi) a1(pop edi) a1(pop ebx) - a1(ret 16) + aret(16) asm_naked_fn_end(scrypt_ChunkMix_avx) #endif @@ -128,13 +128,13 @@ asm_naked_fn_end(scrypt_ChunkMix_avx) /* x64 */ -#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) +#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_SALSA_AVX asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) asm_naked_fn(scrypt_ChunkMix_avx) - a2(lea rcx,[rcx*2]) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ a2(shl rcx,6) a2(lea r9,[rcx-64]) a2(lea rax,[rsi+r9]) @@ -144,7 +144,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a2(vmovdqa xmm1,[rax+16]) a2(vmovdqa xmm2,[rax+32]) a2(vmovdqa xmm3,[rax+48]) - a1(jz scrypt_ChunkMix_avx_no_xor1) + aj(jz scrypt_ChunkMix_avx_no_xor1) a3(vpxor xmm0,xmm0,[r9+0]) a3(vpxor xmm1,xmm1,[r9+16]) a3(vpxor xmm2,xmm2,[r9+32]) @@ -158,7 +158,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpxor xmm1,xmm1,[rsi+r9+16]) a3(vpxor xmm2,xmm2,[rsi+r9+32]) a3(vpxor xmm3,xmm3,[rsi+r9+48]) - a1(jz scrypt_ChunkMix_avx_no_xor2) + aj(jz scrypt_ChunkMix_avx_no_xor2) a3(vpxor xmm0,xmm0,[rdx+r9+0]) a3(vpxor xmm1,xmm1,[rdx+r9+16]) a3(vpxor xmm2,xmm2,[rdx+r9+32]) @@ -184,17 +184,16 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpsrld xmm5, xmm4, 19) a3(vpslld xmm4, xmm4, 13) a3(vpxor xmm1, xmm1, xmm5) - a3(pshufd xmm3, xmm3, 0x93) + a3(vpshufd xmm3, xmm3, 0x93) a3(vpxor xmm1, xmm1, xmm4) a3(vpaddd xmm4, xmm2, xmm1) a3(vpsrld xmm5, xmm4, 14) a3(vpslld xmm4, xmm4, 18) a3(vpxor xmm0, xmm0, xmm5) - a3(pshufd xmm2, xmm2, 0x4e) + a3(vpshufd xmm2, xmm2, 0x4e) a3(vpxor xmm0, xmm0, xmm4) - a2(sub rax, 2) a3(vpaddd xmm4, xmm3, xmm0) - a3(pshufd xmm1, xmm1, 0x39) + a3(vpshufd xmm1, xmm1, 0x39) a3(vpsrld xmm5, xmm4, 25) a3(vpslld xmm4, xmm4, 7) a3(vpxor xmm1, xmm1, xmm5) @@ -208,16 +207,17 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpsrld xmm5, xmm4, 19) a3(vpslld xmm4, xmm4, 13) a3(vpxor xmm3, xmm3, xmm5) - a3(pshufd xmm1, xmm1, 0x93) + a3(vpshufd xmm1, xmm1, 0x93) a3(vpxor xmm3, xmm3, xmm4) a3(vpaddd xmm4, xmm2, xmm3) a3(vpsrld xmm5, xmm4, 14) a3(vpslld xmm4, xmm4, 18) a3(vpxor xmm0, xmm0, xmm5) - a3(pshufd xmm2, xmm2, 0x4e) + a3(vpshufd xmm2, xmm2, 0x4e) a3(vpxor xmm0, xmm0, xmm4) - a3(pshufd xmm3, xmm3, 0x39) - a1(ja scrypt_salsa_avx_loop) + a3(vpshufd xmm3, xmm3, 0x39) + a2(sub rax, 2) + aj(ja scrypt_salsa_avx_loop) a3(vpaddd xmm0,xmm0,xmm8) a3(vpaddd xmm1,xmm1,xmm9) a3(vpaddd xmm2,xmm2,xmm10) @@ -233,7 +233,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a2(vmovdqa [rax+16],xmm1) a2(vmovdqa [rax+32],xmm2) a2(vmovdqa [rax+48],xmm3) 
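/* Reference sketch (not part of the patch): every x64 routine in this change set also
   replaces `lea rcx,[rcx*2]` with `lea rcx,[ecx*2]`. As the added comment notes, r is a
   uint32_t and Win64 can leave garbage in the upper half of the register, so the 32-bit
   addressing form is used to zero-extend before the value is scaled into a byte count.
   A C equivalent of the fixed computation (function name illustrative; the salsa/chacha
   mixers here use 64-byte blocks, while the salsa64 files shift by 7 for 128-byte
   blocks): */

#include <stdint.h>

static uint64_t scrypt_chunk_bytes(uint32_t r) {
    /* the uint32_t parameter supplies the zero-extension the asm gets from using ecx */
    return (uint64_t)r * 2u * 64u;
}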
- a1(jne scrypt_ChunkMix_avx_loop) + aj(jne scrypt_ChunkMix_avx_loop) a1(ret) asm_naked_fn_end(scrypt_ChunkMix_avx) @@ -245,7 +245,7 @@ asm_naked_fn_end(scrypt_ChunkMix_avx) #define SCRYPT_SALSA_AVX -static void NOINLINE +static void asm_calling_convention NOINLINE scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { uint32_t i, blocksPerChunk = r * 2, half = 0; xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3; diff --git a/scryptjane/scrypt-jane-mix_salsa-sse2.h b/scryptjane/scrypt-jane-mix_salsa-sse2.h index 4898659e..d7ef969c 100644 --- a/scryptjane/scrypt-jane-mix_salsa-sse2.h +++ b/scryptjane/scrypt-jane-mix_salsa-sse2.h @@ -1,5 +1,5 @@ /* x86 */ -#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) +#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_SALSA_SSE2 @@ -24,7 +24,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(movdqa xmm1,[ecx+esi+16]) a2(movdqa xmm2,[ecx+esi+32]) a2(movdqa xmm3,[ecx+esi+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor1) + aj(jz scrypt_ChunkMix_sse2_no_xor1) a2(pxor xmm0,[ecx+eax+0]) a2(pxor xmm1,[ecx+eax+16]) a2(pxor xmm2,[ecx+eax+32]) @@ -38,7 +38,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(pxor xmm1,[esi+ecx+16]) a2(pxor xmm2,[esi+ecx+32]) a2(pxor xmm3,[esi+ecx+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor2) + aj(jz scrypt_ChunkMix_sse2_no_xor2) a2(pxor xmm0,[eax+ecx+0]) a2(pxor xmm1,[eax+ecx+16]) a2(pxor xmm2,[eax+ecx+32]) @@ -113,7 +113,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(pxor xmm0, xmm4) a3(pshufd xmm3, xmm3, 0x39) a2(pxor xmm0, xmm5) - a1(ja scrypt_salsa_sse2_loop) + aj(ja scrypt_salsa_sse2_loop) a2(paddd xmm0,[esp+0]) a2(paddd xmm1,[esp+16]) a2(paddd xmm2,xmm6) @@ -130,13 +130,13 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(movdqa [eax+32],xmm2) a2(movdqa [eax+48],xmm3) a2(mov eax,[ebp+28]) - a1(jne scrypt_ChunkMix_sse2_loop) + aj(jne scrypt_ChunkMix_sse2_loop) a2(mov esp,ebp) a1(pop ebp) a1(pop esi) a1(pop edi) a1(pop ebx) - a1(ret 16) + aret(16) asm_naked_fn_end(scrypt_ChunkMix_sse2) #endif @@ -144,13 +144,13 @@ asm_naked_fn_end(scrypt_ChunkMix_sse2) /* x64 */ -#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) +#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_SALSA_SSE2 asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) asm_naked_fn(scrypt_ChunkMix_sse2) - a2(lea rcx,[rcx*2]) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ a2(shl rcx,6) a2(lea r9,[rcx-64]) a2(lea rax,[rsi+r9]) @@ -160,7 +160,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(movdqa xmm1,[rax+16]) a2(movdqa xmm2,[rax+32]) a2(movdqa xmm3,[rax+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor1) + aj(jz scrypt_ChunkMix_sse2_no_xor1) a2(pxor xmm0,[r9+0]) a2(pxor xmm1,[r9+16]) a2(pxor xmm2,[r9+32]) @@ -174,7 +174,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(pxor xmm1,[rsi+r9+16]) a2(pxor xmm2,[rsi+r9+32]) a2(pxor xmm3,[rsi+r9+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor2) + aj(jz scrypt_ChunkMix_sse2_no_xor2) a2(pxor xmm0,[rdx+r9+0]) a2(pxor xmm1,[rdx+r9+16]) a2(pxor xmm2,[rdx+r9+32]) @@ -249,7 +249,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(pxor xmm0, xmm4) a3(pshufd xmm3, xmm3, 0x39) a2(pxor 
xmm0, xmm5) - a1(ja scrypt_salsa_sse2_loop) + aj(ja scrypt_salsa_sse2_loop) a2(paddd xmm0,xmm8) a2(paddd xmm1,xmm9) a2(paddd xmm2,xmm10) @@ -265,7 +265,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(movdqa [rax+16],xmm1) a2(movdqa [rax+32],xmm2) a2(movdqa [rax+48],xmm3) - a1(jne scrypt_ChunkMix_sse2_loop) + aj(jne scrypt_ChunkMix_sse2_loop) a1(ret) asm_naked_fn_end(scrypt_ChunkMix_sse2) @@ -277,7 +277,7 @@ asm_naked_fn_end(scrypt_ChunkMix_sse2) #define SCRYPT_SALSA_SSE2 -static void NOINLINE +static void NOINLINE asm_calling_convention scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { uint32_t i, blocksPerChunk = r * 2, half = 0; xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3; @@ -426,7 +426,7 @@ scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes] 4 9 14 3 */ - static void STDCALL + static void asm_calling_convention salsa_core_tangle_sse2(uint32_t *blocks, size_t count) { uint32_t t; while (count--) { diff --git a/scryptjane/scrypt-jane-mix_salsa-xop.h b/scryptjane/scrypt-jane-mix_salsa-xop.h new file mode 100644 index 00000000..1d014d2a --- /dev/null +++ b/scryptjane/scrypt-jane-mix_salsa-xop.h @@ -0,0 +1,317 @@ +/* x86 */ +#if defined(X86ASM_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_SALSA_XOP + +asm_naked_fn_proto(void, scrypt_ChunkMix_xop)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_xop) + a1(push ebx) + a1(push edi) + a1(push esi) + a1(push ebp) + a2(mov ebp,esp) + a2(mov edi,[ebp+20]) + a2(mov esi,[ebp+24]) + a2(mov eax,[ebp+28]) + a2(mov ebx,[ebp+32]) + a2(sub esp,32) + a2(and esp,~63) + a2(lea edx,[ebx*2]) + a2(shl edx,6) + a2(lea ecx,[edx-64]) + a2(and eax, eax) + a2(movdqa xmm0,[ecx+esi+0]) + a2(movdqa xmm1,[ecx+esi+16]) + a2(movdqa xmm2,[ecx+esi+32]) + a2(movdqa xmm3,[ecx+esi+48]) + aj(jz scrypt_ChunkMix_xop_no_xor1) + a3(vpxor xmm0,xmm0,[ecx+eax+0]) + a3(vpxor xmm1,xmm1,[ecx+eax+16]) + a3(vpxor xmm2,xmm2,[ecx+eax+32]) + a3(vpxor xmm3,xmm3,[ecx+eax+48]) + a1(scrypt_ChunkMix_xop_no_xor1:) + a2(xor ecx,ecx) + a2(xor ebx,ebx) + a1(scrypt_ChunkMix_xop_loop:) + a2(and eax, eax) + a3(vpxor xmm0,xmm0,[esi+ecx+0]) + a3(vpxor xmm1,xmm1,[esi+ecx+16]) + a3(vpxor xmm2,xmm2,[esi+ecx+32]) + a3(vpxor xmm3,xmm3,[esi+ecx+48]) + aj(jz scrypt_ChunkMix_xop_no_xor2) + a3(vpxor xmm0,xmm0,[eax+ecx+0]) + a3(vpxor xmm1,xmm1,[eax+ecx+16]) + a3(vpxor xmm2,xmm2,[eax+ecx+32]) + a3(vpxor xmm3,xmm3,[eax+ecx+48]) + a1(scrypt_ChunkMix_xop_no_xor2:) + a2(vmovdqa [esp+0],xmm0) + a2(vmovdqa [esp+16],xmm1) + a2(vmovdqa xmm6,xmm2) + a2(vmovdqa xmm7,xmm3) + a2(mov eax,8) + a1(scrypt_salsa_xop_loop: ) + a3(vpaddd xmm4, xmm1, xmm0) + a3(vprotd xmm4, xmm4, 7) + a3(vpxor xmm3, xmm3, xmm4) + a3(vpaddd xmm4, xmm0, xmm3) + a3(vprotd xmm4, xmm4, 9) + a3(vpxor xmm2, xmm2, xmm4) + a3(vpaddd xmm4, xmm3, xmm2) + a3(vprotd xmm4, xmm4, 13) + a3(vpxor xmm1, xmm1, xmm4) + a3(vpaddd xmm4, xmm2, xmm1) + a3(pshufd xmm3, xmm3, 0x93) + a3(vprotd xmm4, xmm4, 18) + a3(pshufd xmm2, xmm2, 0x4e) + a3(vpxor xmm0, xmm0, xmm4) + a3(pshufd xmm1, xmm1, 0x39) + a3(vpaddd xmm4, xmm3, xmm0) + a3(vprotd xmm4, xmm4, 7) + a3(vpxor xmm1, xmm1, xmm4) + a3(vpaddd xmm4, xmm0, xmm1) + a3(vprotd xmm4, xmm4, 9) + a3(vpxor xmm2, xmm2, xmm4) + a3(vpaddd xmm4, xmm1, xmm2) + a3(vprotd xmm4, xmm4, 13) + a3(vpxor xmm3, xmm3, xmm4) + a3(pshufd xmm1, xmm1, 0x93) + a3(vpaddd xmm4, xmm2, 
xmm3) + a3(pshufd xmm2, xmm2, 0x4e) + a3(vprotd xmm4, xmm4, 18) + a3(pshufd xmm3, xmm3, 0x39) + a3(vpxor xmm0, xmm0, xmm4) + a2(sub eax, 2) + aj(ja scrypt_salsa_xop_loop) + a3(vpaddd xmm0,xmm0,[esp+0]) + a3(vpaddd xmm1,xmm1,[esp+16]) + a3(vpaddd xmm2,xmm2,xmm6) + a3(vpaddd xmm3,xmm3,xmm7) + a2(lea eax,[ebx+ecx]) + a2(xor ebx,edx) + a2(and eax,~0x7f) + a2(add ecx,64) + a2(shr eax,1) + a2(add eax, edi) + a2(cmp ecx,edx) + a2(vmovdqa [eax+0],xmm0) + a2(vmovdqa [eax+16],xmm1) + a2(vmovdqa [eax+32],xmm2) + a2(vmovdqa [eax+48],xmm3) + a2(mov eax,[ebp+28]) + aj(jne scrypt_ChunkMix_xop_loop) + a2(mov esp,ebp) + a1(pop ebp) + a1(pop esi) + a1(pop edi) + a1(pop ebx) + aret(16) +asm_naked_fn_end(scrypt_ChunkMix_xop) + +#endif + + + +/* x64 */ +#if defined(X86_64ASM_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_SALSA_XOP + +asm_naked_fn_proto(void, scrypt_ChunkMix_xop)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_xop) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ + a2(shl rcx,6) + a2(lea r9,[rcx-64]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(vmovdqa xmm0,[rax+0]) + a2(vmovdqa xmm1,[rax+16]) + a2(vmovdqa xmm2,[rax+32]) + a2(vmovdqa xmm3,[rax+48]) + aj(jz scrypt_ChunkMix_xop_no_xor1) + a3(vpxor xmm0,xmm0,[r9+0]) + a3(vpxor xmm1,xmm1,[r9+16]) + a3(vpxor xmm2,xmm2,[r9+32]) + a3(vpxor xmm3,xmm3,[r9+48]) + a1(scrypt_ChunkMix_xop_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_xop_loop:) + a2(and rdx, rdx) + a3(vpxor xmm0,xmm0,[rsi+r9+0]) + a3(vpxor xmm1,xmm1,[rsi+r9+16]) + a3(vpxor xmm2,xmm2,[rsi+r9+32]) + a3(vpxor xmm3,xmm3,[rsi+r9+48]) + aj(jz scrypt_ChunkMix_xop_no_xor2) + a3(vpxor xmm0,xmm0,[rdx+r9+0]) + a3(vpxor xmm1,xmm1,[rdx+r9+16]) + a3(vpxor xmm2,xmm2,[rdx+r9+32]) + a3(vpxor xmm3,xmm3,[rdx+r9+48]) + a1(scrypt_ChunkMix_xop_no_xor2:) + a2(vmovdqa xmm8,xmm0) + a2(vmovdqa xmm9,xmm1) + a2(vmovdqa xmm10,xmm2) + a2(vmovdqa xmm11,xmm3) + a2(mov rax,8) + a1(scrypt_salsa_xop_loop: ) + a3(vpaddd xmm4, xmm1, xmm0) + a3(vprotd xmm4, xmm4, 7) + a3(vpxor xmm3, xmm3, xmm4) + a3(vpaddd xmm4, xmm0, xmm3) + a3(vprotd xmm4, xmm4, 9) + a3(vpxor xmm2, xmm2, xmm4) + a3(vpaddd xmm4, xmm3, xmm2) + a3(vprotd xmm4, xmm4, 13) + a3(vpxor xmm1, xmm1, xmm4) + a3(vpaddd xmm4, xmm2, xmm1) + a3(pshufd xmm3, xmm3, 0x93) + a3(vprotd xmm4, xmm4, 18) + a3(pshufd xmm2, xmm2, 0x4e) + a3(vpxor xmm0, xmm0, xmm4) + a3(pshufd xmm1, xmm1, 0x39) + a3(vpaddd xmm4, xmm3, xmm0) + a3(vprotd xmm4, xmm4, 7) + a3(vpxor xmm1, xmm1, xmm4) + a3(vpaddd xmm4, xmm0, xmm1) + a3(vprotd xmm4, xmm4, 9) + a3(vpxor xmm2, xmm2, xmm4) + a3(vpaddd xmm4, xmm1, xmm2) + a3(vprotd xmm4, xmm4, 13) + a3(vpxor xmm3, xmm3, xmm4) + a3(pshufd xmm1, xmm1, 0x93) + a3(vpaddd xmm4, xmm2, xmm3) + a3(pshufd xmm2, xmm2, 0x4e) + a3(vprotd xmm4, xmm4, 18) + a3(pshufd xmm3, xmm3, 0x39) + a3(vpxor xmm0, xmm0, xmm4) + a2(sub rax, 2) + aj(ja scrypt_salsa_xop_loop) + a3(vpaddd xmm0,xmm0,xmm8) + a3(vpaddd xmm1,xmm1,xmm9) + a3(vpaddd xmm2,xmm2,xmm10) + a3(vpaddd xmm3,xmm3,xmm11) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0x7f) + a2(add r9,64) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(vmovdqa [rax+0],xmm0) + a2(vmovdqa [rax+16],xmm1) + a2(vmovdqa [rax+32],xmm2) + a2(vmovdqa [rax+48],xmm3) + aj(jne scrypt_ChunkMix_xop_loop) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_xop) + +#endif + + +/* 
intrinsic */ +#if defined(X86_INTRINSIC_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) + +#define SCRYPT_SALSA_XOP + +static void asm_calling_convention NOINLINE +scrypt_ChunkMix_xop(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + + for (rounds = 8; rounds; rounds -= 2) { + x4 = _mm_add_epi32(x1, x0); + x4 = _mm_roti_epi32(x4, 7); + x3 = _mm_xor_si128(x3, x4); + x4 = _mm_add_epi32(x0, x3); + x4 = _mm_roti_epi32(x4, 9); + x2 = _mm_xor_si128(x2, x4); + x4 = _mm_add_epi32(x3, x2); + x4 = _mm_roti_epi32(x4, 13); + x1 = _mm_xor_si128(x1, x4); + x4 = _mm_add_epi32(x2, x1); + x4 = _mm_roti_epi32(x4, 18); + x0 = _mm_xor_si128(x0, x4); + x3 = _mm_shuffle_epi32(x3, 0x93); + x2 = _mm_shuffle_epi32(x2, 0x4e); + x1 = _mm_shuffle_epi32(x1, 0x39); + x4 = _mm_add_epi32(x3, x0); + x4 = _mm_roti_epi32(x4, 7); + x1 = _mm_xor_si128(x1, x4); + x4 = _mm_add_epi32(x0, x1); + x4 = _mm_roti_epi32(x4, 9); + x2 = _mm_xor_si128(x2, x4); + x4 = _mm_add_epi32(x1, x2); + x4 = _mm_roti_epi32(x4, 13); + x3 = _mm_xor_si128(x3, x4); + x4 = _mm_add_epi32(x2, x3); + x4 = _mm_roti_epi32(x4, 18); + x0 = _mm_xor_si128(x0, x4); + x1 = _mm_shuffle_epi32(x1, 0x93); + x2 = _mm_shuffle_epi32(x2, 0x4e); + x3 = _mm_shuffle_epi32(x3, 0x39); + } + + x0 = _mm_add_epi32(x0, t0); + x1 = _mm_add_epi32(x1, t1); + x2 = _mm_add_epi32(x2, t2); + x3 = _mm_add_epi32(x3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + } +} + +#endif + +#if defined(SCRYPT_SALSA_XOP) + /* uses salsa_core_tangle_sse2 */ + + #undef SCRYPT_MIX + #define SCRYPT_MIX "Salsa/8-XOP" + #undef SCRYPT_SALSA_INCLUDED + #define SCRYPT_SALSA_INCLUDED +#endif diff --git a/scryptjane/scrypt-jane-mix_salsa64-avx.h b/scryptjane/scrypt-jane-mix_salsa64-avx.h new file mode 100644 index 00000000..c6e41dc3 --- /dev/null +++ b/scryptjane/scrypt-jane-mix_salsa64-avx.h @@ -0,0 +1,367 @@ +/* x64 */ +#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_SALSA64_AVX + +asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_avx) + a1(push rbp) + a2(mov rbp, rsp) + a2(and rsp, ~63) + a2(sub rsp, 128) + a2(lea rcx,[ecx*2]) 
/* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ + a2(shl rcx,7) + a2(lea r9,[rcx-128]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(vmovdqa xmm0,[rax+0]) + a2(vmovdqa xmm1,[rax+16]) + a2(vmovdqa xmm2,[rax+32]) + a2(vmovdqa xmm3,[rax+48]) + a2(vmovdqa xmm4,[rax+64]) + a2(vmovdqa xmm5,[rax+80]) + a2(vmovdqa xmm6,[rax+96]) + a2(vmovdqa xmm7,[rax+112]) + aj(jz scrypt_ChunkMix_avx_no_xor1) + a3(vpxor xmm0,xmm0,[r9+0]) + a3(vpxor xmm1,xmm1,[r9+16]) + a3(vpxor xmm2,xmm2,[r9+32]) + a3(vpxor xmm3,xmm3,[r9+48]) + a3(vpxor xmm4,xmm4,[r9+64]) + a3(vpxor xmm5,xmm5,[r9+80]) + a3(vpxor xmm6,xmm6,[r9+96]) + a3(vpxor xmm7,xmm7,[r9+112]) + a1(scrypt_ChunkMix_avx_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_avx_loop:) + a2(and rdx, rdx) + a3(vpxor xmm0,xmm0,[rsi+r9+0]) + a3(vpxor xmm1,xmm1,[rsi+r9+16]) + a3(vpxor xmm2,xmm2,[rsi+r9+32]) + a3(vpxor xmm3,xmm3,[rsi+r9+48]) + a3(vpxor xmm4,xmm4,[rsi+r9+64]) + a3(vpxor xmm5,xmm5,[rsi+r9+80]) + a3(vpxor xmm6,xmm6,[rsi+r9+96]) + a3(vpxor xmm7,xmm7,[rsi+r9+112]) + aj(jz scrypt_ChunkMix_avx_no_xor2) + a3(vpxor xmm0,xmm0,[rdx+r9+0]) + a3(vpxor xmm1,xmm1,[rdx+r9+16]) + a3(vpxor xmm2,xmm2,[rdx+r9+32]) + a3(vpxor xmm3,xmm3,[rdx+r9+48]) + a3(vpxor xmm4,xmm4,[rdx+r9+64]) + a3(vpxor xmm5,xmm5,[rdx+r9+80]) + a3(vpxor xmm6,xmm6,[rdx+r9+96]) + a3(vpxor xmm7,xmm7,[rdx+r9+112]) + a1(scrypt_ChunkMix_avx_no_xor2:) + a2(vmovdqa [rsp+0],xmm0) + a2(vmovdqa [rsp+16],xmm1) + a2(vmovdqa [rsp+32],xmm2) + a2(vmovdqa [rsp+48],xmm3) + a2(vmovdqa [rsp+64],xmm4) + a2(vmovdqa [rsp+80],xmm5) + a2(vmovdqa [rsp+96],xmm6) + a2(vmovdqa [rsp+112],xmm7) + a2(mov rax,8) + a1(scrypt_salsa64_avx_loop: ) + a3(vpaddq xmm8, xmm0, xmm2) + a3(vpaddq xmm9, xmm1, xmm3) + a3(vpshufd xmm8, xmm8, 0xb1) + a3(vpshufd xmm9, xmm9, 0xb1) + a3(vpxor xmm6, xmm6, xmm8) + a3(vpxor xmm7, xmm7, xmm9) + a3(vpaddq xmm10, xmm0, xmm6) + a3(vpaddq xmm11, xmm1, xmm7) + a3(vpsrlq xmm8, xmm10, 51) + a3(vpsrlq xmm9, xmm11, 51) + a3(vpsllq xmm10, xmm10, 13) + a3(vpsllq xmm11, xmm11, 13) + a3(vpxor xmm4, xmm4, xmm8) + a3(vpxor xmm5, xmm5, xmm9) + a3(vpxor xmm4, xmm4, xmm10) + a3(vpxor xmm5, xmm5, xmm11) + a3(vpaddq xmm8, xmm6, xmm4) + a3(vpaddq xmm9, xmm7, xmm5) + a3(vpsrlq xmm10, xmm8, 25) + a3(vpsrlq xmm11, xmm9, 25) + a3(vpsllq xmm8, xmm8, 39) + a3(vpsllq xmm9, xmm9, 39) + a3(vpxor xmm2, xmm2, xmm10) + a3(vpxor xmm3, xmm3, xmm11) + a3(vpxor xmm2, xmm2, xmm8) + a3(vpxor xmm3, xmm3, xmm9) + a3(vpaddq xmm10, xmm4, xmm2) + a3(vpaddq xmm11, xmm5, xmm3) + a3(vpshufd xmm10, xmm10, 0xb1) + a3(vpshufd xmm11, xmm11, 0xb1) + a3(vpxor xmm0, xmm0, xmm10) + a3(vpxor xmm1, xmm1, xmm11) + a2(vmovdqa xmm8, xmm2) + a2(vmovdqa xmm9, xmm3) + a4(vpalignr xmm2, xmm6, xmm7, 8) + a4(vpalignr xmm3, xmm7, xmm6, 8) + a4(vpalignr xmm6, xmm9, xmm8, 8) + a4(vpalignr xmm7, xmm8, xmm9, 8) + a3(vpaddq xmm10, xmm0, xmm2) + a3(vpaddq xmm11, xmm1, xmm3) + a3(vpshufd xmm10, xmm10, 0xb1) + a3(vpshufd xmm11, xmm11, 0xb1) + a3(vpxor xmm6, xmm6, xmm10) + a3(vpxor xmm7, xmm7, xmm11) + a3(vpaddq xmm8, xmm0, xmm6) + a3(vpaddq xmm9, xmm1, xmm7) + a3(vpsrlq xmm10, xmm8, 51) + a3(vpsrlq xmm11, xmm9, 51) + a3(vpsllq xmm8, xmm8, 13) + a3(vpsllq xmm9, xmm9, 13) + a3(vpxor xmm5, xmm5, xmm10) + a3(vpxor xmm4, xmm4, xmm11) + a3(vpxor xmm5, xmm5, xmm8) + a3(vpxor xmm4, xmm4, xmm9) + a3(vpaddq xmm10, xmm6, xmm5) + a3(vpaddq xmm11, xmm7, xmm4) + a3(vpsrlq xmm8, xmm10, 25) + a3(vpsrlq xmm9, xmm11, 25) + a3(vpsllq xmm10, xmm10, 39) + a3(vpsllq xmm11, xmm11, 39) + a3(vpxor xmm2, xmm2, xmm8) + a3(vpxor xmm3, xmm3, 
xmm9) + a3(vpxor xmm2, xmm2, xmm10) + a3(vpxor xmm3, xmm3, xmm11) + a3(vpaddq xmm8, xmm5, xmm2) + a3(vpaddq xmm9, xmm4, xmm3) + a3(vpshufd xmm8, xmm8, 0xb1) + a3(vpshufd xmm9, xmm9, 0xb1) + a3(vpxor xmm0, xmm0, xmm8) + a3(vpxor xmm1, xmm1, xmm9) + a2(vmovdqa xmm10, xmm2) + a2(vmovdqa xmm11, xmm3) + a4(vpalignr xmm2, xmm6, xmm7, 8) + a4(vpalignr xmm3, xmm7, xmm6, 8) + a4(vpalignr xmm6, xmm11, xmm10, 8) + a4(vpalignr xmm7, xmm10, xmm11, 8) + a2(sub rax, 2) + aj(ja scrypt_salsa64_avx_loop) + a3(vpaddq xmm0,xmm0,[rsp+0]) + a3(vpaddq xmm1,xmm1,[rsp+16]) + a3(vpaddq xmm2,xmm2,[rsp+32]) + a3(vpaddq xmm3,xmm3,[rsp+48]) + a3(vpaddq xmm4,xmm4,[rsp+64]) + a3(vpaddq xmm5,xmm5,[rsp+80]) + a3(vpaddq xmm6,xmm6,[rsp+96]) + a3(vpaddq xmm7,xmm7,[rsp+112]) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0xff) + a2(add r9,128) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(vmovdqa [rax+0],xmm0) + a2(vmovdqa [rax+16],xmm1) + a2(vmovdqa [rax+32],xmm2) + a2(vmovdqa [rax+48],xmm3) + a2(vmovdqa [rax+64],xmm4) + a2(vmovdqa [rax+80],xmm5) + a2(vmovdqa [rax+96],xmm6) + a2(vmovdqa [rax+112],xmm7) + aj(jne scrypt_ChunkMix_avx_loop) + a2(mov rsp, rbp) + a1(pop rbp) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_avx) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) + +#define SCRYPT_SALSA64_AVX + +static void asm_calling_convention +scrypt_ChunkMix_avx(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + x4 = xmmp[4]; + x5 = xmmp[5]; + x6 = xmmp[6]; + x7 = xmmp[7]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + t4 = x4; + t5 = x5; + t6 = x6; + t7 = x7; + + for (rounds = 8; rounds; rounds -= 2) { + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, x0); + z1 = _mm_add_epi64(x7, x1); + z2 = _mm_srli_epi64(z0, 64-13); + z3 = _mm_srli_epi64(z1, 64-13); + z0 = _mm_slli_epi64(z0, 
13); + z1 = _mm_slli_epi64(z1, 13); + x4 = _mm_xor_si128(x4, z2); + x5 = _mm_xor_si128(x5, z3); + x4 = _mm_xor_si128(x4, z0); + x5 = _mm_xor_si128(x5, z1); + + z0 = _mm_add_epi64(x4, x6); + z1 = _mm_add_epi64(x5, x7); + z2 = _mm_srli_epi64(z0, 64-39); + z3 = _mm_srli_epi64(z1, 64-39); + z0 = _mm_slli_epi64(z0, 39); + z1 = _mm_slli_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z2); + x3 = _mm_xor_si128(x3, z3); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x4); + z1 = _mm_add_epi64(x3, x5); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x2; + z1 = x3; + x2 = _mm_alignr_epi8(x6, x7, 8); + x3 = _mm_alignr_epi8(x7, x6, 8); + x6 = _mm_alignr_epi8(z1, z0, 8); + x7 = _mm_alignr_epi8(z0, z1, 8); + + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, x0); + z1 = _mm_add_epi64(x7, x1); + z2 = _mm_srli_epi64(z0, 64-13); + z3 = _mm_srli_epi64(z1, 64-13); + z0 = _mm_slli_epi64(z0, 13); + z1 = _mm_slli_epi64(z1, 13); + x5 = _mm_xor_si128(x5, z2); + x4 = _mm_xor_si128(x4, z3); + x5 = _mm_xor_si128(x5, z0); + x4 = _mm_xor_si128(x4, z1); + + z0 = _mm_add_epi64(x5, x6); + z1 = _mm_add_epi64(x4, x7); + z2 = _mm_srli_epi64(z0, 64-39); + z3 = _mm_srli_epi64(z1, 64-39); + z0 = _mm_slli_epi64(z0, 39); + z1 = _mm_slli_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z2); + x3 = _mm_xor_si128(x3, z3); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x5); + z1 = _mm_add_epi64(x3, x4); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x2; + z1 = x3; + x2 = _mm_alignr_epi8(x6, x7, 8); + x3 = _mm_alignr_epi8(x7, x6, 8); + x6 = _mm_alignr_epi8(z1, z0, 8); + x7 = _mm_alignr_epi8(z0, z1, 8); + } + + x0 = _mm_add_epi64(x0, t0); + x1 = _mm_add_epi64(x1, t1); + x2 = _mm_add_epi64(x2, t2); + x3 = _mm_add_epi64(x3, t3); + x4 = _mm_add_epi64(x4, t4); + x5 = _mm_add_epi64(x5, t5); + x6 = _mm_add_epi64(x6, t6); + x7 = _mm_add_epi64(x7, t7); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + xmmp[4] = x4; + xmmp[5] = x5; + xmmp[6] = x6; + xmmp[7] = x7; + } +} + +#endif + +#if defined(SCRYPT_SALSA64_AVX) + /* uses salsa64_core_tangle_sse2 */ + + #undef SCRYPT_MIX + #define SCRYPT_MIX "Salsa64/8-AVX" + #undef SCRYPT_SALSA64_INCLUDED + #define SCRYPT_SALSA64_INCLUDED +#endif diff --git a/scryptjane/scrypt-jane-mix_salsa64-avx2.h b/scryptjane/scrypt-jane-mix_salsa64-avx2.h new file mode 100644 index 00000000..a42e808b --- /dev/null +++ b/scryptjane/scrypt-jane-mix_salsa64-avx2.h @@ -0,0 +1,221 @@ +/* x64 */ +#if defined(X86_64ASM_AVX2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_SALSA64_AVX2 + +asm_naked_fn_proto(void, scrypt_ChunkMix_avx2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_avx2) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ + a2(shl 
rcx,7) + a2(lea r9,[rcx-128]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(vmovdqa ymm0,[rax+0]) + a2(vmovdqa ymm1,[rax+32]) + a2(vmovdqa ymm2,[rax+64]) + a2(vmovdqa ymm3,[rax+96]) + aj(jz scrypt_ChunkMix_avx2_no_xor1) + a3(vpxor ymm0,ymm0,[r9+0]) + a3(vpxor ymm1,ymm1,[r9+32]) + a3(vpxor ymm2,ymm2,[r9+64]) + a3(vpxor ymm3,ymm3,[r9+96]) + a1(scrypt_ChunkMix_avx2_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_avx2_loop:) + a2(and rdx, rdx) + a3(vpxor ymm0,ymm0,[rsi+r9+0]) + a3(vpxor ymm1,ymm1,[rsi+r9+32]) + a3(vpxor ymm2,ymm2,[rsi+r9+64]) + a3(vpxor ymm3,ymm3,[rsi+r9+96]) + aj(jz scrypt_ChunkMix_avx2_no_xor2) + a3(vpxor ymm0,ymm0,[rdx+r9+0]) + a3(vpxor ymm1,ymm1,[rdx+r9+32]) + a3(vpxor ymm2,ymm2,[rdx+r9+64]) + a3(vpxor ymm3,ymm3,[rdx+r9+96]) + a1(scrypt_ChunkMix_avx2_no_xor2:) + a2(vmovdqa ymm6,ymm0) + a2(vmovdqa ymm7,ymm1) + a2(vmovdqa ymm8,ymm2) + a2(vmovdqa ymm9,ymm3) + a2(mov rax,4) + a1(scrypt_salsa64_avx2_loop: ) + a3(vpaddq ymm4, ymm1, ymm0) + a3(vpshufd ymm4, ymm4, 0xb1) + a3(vpxor ymm3, ymm3, ymm4) + a3(vpaddq ymm4, ymm0, ymm3) + a3(vpsrlq ymm5, ymm4, 51) + a3(vpxor ymm2, ymm2, ymm5) + a3(vpsllq ymm4, ymm4, 13) + a3(vpxor ymm2, ymm2, ymm4) + a3(vpaddq ymm4, ymm3, ymm2) + a3(vpsrlq ymm5, ymm4, 25) + a3(vpxor ymm1, ymm1, ymm5) + a3(vpsllq ymm4, ymm4, 39) + a3(vpxor ymm1, ymm1, ymm4) + a3(vpaddq ymm4, ymm2, ymm1) + a3(vpshufd ymm4, ymm4, 0xb1) + a3(vpermq ymm1, ymm1, 0x39) + a3(vpermq ymm10, ymm2, 0x4e) + a3(vpxor ymm0, ymm0, ymm4) + a3(vpermq ymm3, ymm3, 0x93) + a3(vpaddq ymm4, ymm3, ymm0) + a3(vpshufd ymm4, ymm4, 0xb1) + a3(vpxor ymm1, ymm1, ymm4) + a3(vpaddq ymm4, ymm0, ymm1) + a3(vpsrlq ymm5, ymm4, 51) + a3(vpxor ymm10, ymm10, ymm5) + a3(vpsllq ymm4, ymm4, 13) + a3(vpxor ymm10, ymm10, ymm4) + a3(vpaddq ymm4, ymm1, ymm10) + a3(vpsrlq ymm5, ymm4, 25) + a3(vpxor ymm3, ymm3, ymm5) + a3(vpsllq ymm4, ymm4, 39) + a3(vpermq ymm1, ymm1, 0x93) + a3(vpxor ymm3, ymm3, ymm4) + a3(vpermq ymm2, ymm10, 0x4e) + a3(vpaddq ymm4, ymm10, ymm3) + a3(vpshufd ymm4, ymm4, 0xb1) + a3(vpermq ymm3, ymm3, 0x39) + a3(vpxor ymm0, ymm0, ymm4) + a1(dec rax) + aj(jnz scrypt_salsa64_avx2_loop) + a3(vpaddq ymm0,ymm0,ymm6) + a3(vpaddq ymm1,ymm1,ymm7) + a3(vpaddq ymm2,ymm2,ymm8) + a3(vpaddq ymm3,ymm3,ymm9) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0xff) + a2(add r9,128) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(vmovdqa [rax+0],ymm0) + a2(vmovdqa [rax+32],ymm1) + a2(vmovdqa [rax+64],ymm2) + a2(vmovdqa [rax+96],ymm3) + aj(jne scrypt_ChunkMix_avx2_loop) + a1(vzeroupper) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_avx2) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_AVX2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) + +#define SCRYPT_SALSA64_AVX2 + +static void asm_calling_convention +scrypt_ChunkMix_avx2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + ymmi *ymmp,y0,y1,y2,y3,t0,t1,t2,t3,z0,z1; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + ymmp = (ymmi *)scrypt_block(Bin, blocksPerChunk - 1); + y0 = ymmp[0]; + y1 = ymmp[1]; + y2 = ymmp[2]; + y3 = ymmp[3]; + + if (Bxor) { + ymmp = (ymmi *)scrypt_block(Bxor, blocksPerChunk - 1); + y0 = _mm256_xor_si256(y0, ymmp[0]); + y1 = _mm256_xor_si256(y1, ymmp[1]); + y2 = _mm256_xor_si256(y2, ymmp[2]); + y3 = _mm256_xor_si256(y3, ymmp[3]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + ymmp = (ymmi 
*)scrypt_block(Bin, i); + y0 = _mm256_xor_si256(y0, ymmp[0]); + y1 = _mm256_xor_si256(y1, ymmp[1]); + y2 = _mm256_xor_si256(y2, ymmp[2]); + y3 = _mm256_xor_si256(y3, ymmp[3]); + + if (Bxor) { + ymmp = (ymmi *)scrypt_block(Bxor, i); + y0 = _mm256_xor_si256(y0, ymmp[0]); + y1 = _mm256_xor_si256(y1, ymmp[1]); + y2 = _mm256_xor_si256(y2, ymmp[2]); + y3 = _mm256_xor_si256(y3, ymmp[3]); + } + + t0 = y0; + t1 = y1; + t2 = y2; + t3 = y3; + + for (rounds = 8; rounds; rounds -= 2) { + z0 = _mm256_add_epi64(y0, y1); + z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + y3 = _mm256_xor_si256(y3, z0); + z0 = _mm256_add_epi64(y3, y0); + z1 = _mm256_srli_epi64(z0, 64-13); + y2 = _mm256_xor_si256(y2, z1); + z0 = _mm256_slli_epi64(z0, 13); + y2 = _mm256_xor_si256(y2, z0); + z0 = _mm256_add_epi64(y2, y3); + z1 = _mm256_srli_epi64(z0, 64-39); + y1 = _mm256_xor_si256(y1, z1); + z0 = _mm256_slli_epi64(z0, 39); + y1 = _mm256_xor_si256(y1, z0); + y1 = _mm256_permute4x64_epi64(y1, _MM_SHUFFLE(0,3,2,1)); + y2 = _mm256_permute4x64_epi64(y2, _MM_SHUFFLE(1,0,3,2)); + y3 = _mm256_permute4x64_epi64(y3, _MM_SHUFFLE(2,1,0,3)); + z0 = _mm256_add_epi64(y1, y2); + z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + y0 = _mm256_xor_si256(y0, z0); + z0 = _mm256_add_epi64(y0, y3); + z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + y1 = _mm256_xor_si256(y1, z0); + z0 = _mm256_add_epi64(y1, y0); + z1 = _mm256_srli_epi64(z0, 64-13); + y2 = _mm256_xor_si256(y2, z1); + z0 = _mm256_slli_epi64(z0, 13); + y2 = _mm256_xor_si256(y2, z0); + z0 = _mm256_add_epi64(y2, y1); + z1 = _mm256_srli_epi64(z0, 64-39); + y3 = _mm256_xor_si256(y3, z1); + z0 = _mm256_slli_epi64(z0, 39); + y3 = _mm256_xor_si256(y3, z0); + z0 = _mm256_add_epi64(y3, y2); + z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + y0 = _mm256_xor_si256(y0, z0); + y1 = _mm256_permute4x64_epi64(y1, _MM_SHUFFLE(2,1,0,3)); + y2 = _mm256_permute4x64_epi64(y2, _MM_SHUFFLE(1,0,3,2)); + y3 = _mm256_permute4x64_epi64(y3, _MM_SHUFFLE(0,3,2,1)); + } + + y0 = _mm256_add_epi64(y0, t0); + y1 = _mm256_add_epi64(y1, t1); + y2 = _mm256_add_epi64(y2, t2); + y3 = _mm256_add_epi64(y3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + ymmp = (ymmi *)scrypt_block(Bout, (i / 2) + half); + ymmp[0] = y0; + ymmp[1] = y1; + ymmp[2] = y2; + ymmp[3] = y3; + } +} + +#endif + +#if defined(SCRYPT_SALSA64_AVX2) + /* uses salsa64_core_tangle_sse2 */ + + #undef SCRYPT_MIX + #define SCRYPT_MIX "Salsa64/8-AVX2" + #undef SCRYPT_SALSA64_INCLUDED + #define SCRYPT_SALSA64_INCLUDED +#endif diff --git a/scryptjane/scrypt-jane-mix_salsa64-sse2.h b/scryptjane/scrypt-jane-mix_salsa64-sse2.h new file mode 100644 index 00000000..971d98a3 --- /dev/null +++ b/scryptjane/scrypt-jane-mix_salsa64-sse2.h @@ -0,0 +1,449 @@ +/* x64 */ +#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_SALSA64_SSE2 + +asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_sse2) + a1(push rbp) + a2(mov rbp, rsp) + a2(and rsp, ~63) + a2(sub rsp, 128) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ + a2(shl rcx,7) + a2(lea r9,[rcx-128]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(movdqa xmm0,[rax+0]) + a2(movdqa xmm1,[rax+16]) + a2(movdqa xmm2,[rax+32]) + a2(movdqa xmm3,[rax+48]) + 
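/* Reference sketch (not part of the patch): the 64-bit "Salsa64" rounds in this file and
   in the AVX/AVX2/SSSE3/XOP variants use rotation counts 32, 13 and 39. Here a rotate by
   32 is done with pshufd 0xb1, which swaps the 32-bit halves of each 64-bit lane, and the
   13/39 rotates with psrlq/psllq pairs (51+13 = 25+39 = 64); the XOP file further down
   uses vprotq directly. A scalar sketch of the quarter-round, as read from the intrinsic
   version below (helper names are illustrative): */

#include <stdint.h>
#define ROTL64(x,n) (((x) << (n)) | ((x) >> (64 - (n))))

static void salsa64_quarter_round(uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d) {
    *b ^= ROTL64(*a + *d, 32);   /* pshufd 0xb1 */
    *c ^= ROTL64(*b + *a, 13);   /* psllq 13 / psrlq 51 */
    *d ^= ROTL64(*c + *b, 39);   /* psllq 39 / psrlq 25 */
    *a ^= ROTL64(*d + *c, 32);   /* pshufd 0xb1 */
}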
a2(movdqa xmm4,[rax+64]) + a2(movdqa xmm5,[rax+80]) + a2(movdqa xmm6,[rax+96]) + a2(movdqa xmm7,[rax+112]) + aj(jz scrypt_ChunkMix_sse2_no_xor1) + a2(pxor xmm0,[r9+0]) + a2(pxor xmm1,[r9+16]) + a2(pxor xmm2,[r9+32]) + a2(pxor xmm3,[r9+48]) + a2(pxor xmm4,[r9+64]) + a2(pxor xmm5,[r9+80]) + a2(pxor xmm6,[r9+96]) + a2(pxor xmm7,[r9+112]) + a1(scrypt_ChunkMix_sse2_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_sse2_loop:) + a2(and rdx, rdx) + a2(pxor xmm0,[rsi+r9+0]) + a2(pxor xmm1,[rsi+r9+16]) + a2(pxor xmm2,[rsi+r9+32]) + a2(pxor xmm3,[rsi+r9+48]) + a2(pxor xmm4,[rsi+r9+64]) + a2(pxor xmm5,[rsi+r9+80]) + a2(pxor xmm6,[rsi+r9+96]) + a2(pxor xmm7,[rsi+r9+112]) + aj(jz scrypt_ChunkMix_sse2_no_xor2) + a2(pxor xmm0,[rdx+r9+0]) + a2(pxor xmm1,[rdx+r9+16]) + a2(pxor xmm2,[rdx+r9+32]) + a2(pxor xmm3,[rdx+r9+48]) + a2(pxor xmm4,[rdx+r9+64]) + a2(pxor xmm5,[rdx+r9+80]) + a2(pxor xmm6,[rdx+r9+96]) + a2(pxor xmm7,[rdx+r9+112]) + a1(scrypt_ChunkMix_sse2_no_xor2:) + a2(movdqa [rsp+0],xmm0) + a2(movdqa [rsp+16],xmm1) + a2(movdqa [rsp+32],xmm2) + a2(movdqa [rsp+48],xmm3) + a2(movdqa [rsp+64],xmm4) + a2(movdqa [rsp+80],xmm5) + a2(movdqa [rsp+96],xmm6) + a2(movdqa [rsp+112],xmm7) + a2(mov rax,8) + a1(scrypt_salsa64_sse2_loop: ) + a2(movdqa xmm8, xmm0) + a2(movdqa xmm9, xmm1) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm6, xmm8) + a2(pxor xmm7, xmm9) + a2(movdqa xmm10, xmm0) + a2(movdqa xmm11, xmm1) + a2(paddq xmm10, xmm6) + a2(paddq xmm11, xmm7) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 51) + a2(psrlq xmm11, 51) + a2(psllq xmm8, 13) + a2(psllq xmm9, 13) + a2(pxor xmm4, xmm10) + a2(pxor xmm5, xmm11) + a2(pxor xmm4, xmm8) + a2(pxor xmm5, xmm9) + a2(movdqa xmm10, xmm6) + a2(movdqa xmm11, xmm7) + a2(paddq xmm10, xmm4) + a2(paddq xmm11, xmm5) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 25) + a2(psrlq xmm11, 25) + a2(psllq xmm8, 39) + a2(psllq xmm9, 39) + a2(pxor xmm2, xmm10) + a2(pxor xmm3, xmm11) + a2(pxor xmm2, xmm8) + a2(pxor xmm3, xmm9) + a2(movdqa xmm8, xmm4) + a2(movdqa xmm9, xmm5) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm0, xmm8) + a2(pxor xmm1, xmm9) + a2(movdqa xmm8, xmm2) + a2(movdqa xmm9, xmm3) + a2(movdqa xmm10, xmm6) + a2(movdqa xmm11, xmm7) + a2(movdqa xmm2, xmm7) + a2(movdqa xmm3, xmm6) + a2(punpcklqdq xmm10, xmm6) + a2(punpcklqdq xmm11, xmm7) + a2(movdqa xmm6, xmm8) + a2(movdqa xmm7, xmm9) + a2(punpcklqdq xmm9, xmm9) + a2(punpcklqdq xmm8, xmm8) + a2(punpckhqdq xmm2, xmm10) + a2(punpckhqdq xmm3, xmm11) + a2(punpckhqdq xmm6, xmm9) + a2(punpckhqdq xmm7, xmm8) + a2(sub rax, 2) + a2(movdqa xmm8, xmm0) + a2(movdqa xmm9, xmm1) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm6, xmm8) + a2(pxor xmm7, xmm9) + a2(movdqa xmm10, xmm0) + a2(movdqa xmm11, xmm1) + a2(paddq xmm10, xmm6) + a2(paddq xmm11, xmm7) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 51) + a2(psrlq xmm11, 51) + a2(psllq xmm8, 13) + a2(psllq xmm9, 13) + a2(pxor xmm5, xmm10) + a2(pxor xmm4, xmm11) + a2(pxor xmm5, xmm8) + a2(pxor xmm4, xmm9) + a2(movdqa xmm10, xmm6) + a2(movdqa xmm11, xmm7) + a2(paddq xmm10, xmm5) + a2(paddq xmm11, xmm4) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 25) + a2(psrlq xmm11, 25) + a2(psllq xmm8, 39) + a2(psllq xmm9, 39) + a2(pxor xmm2, xmm10) + a2(pxor xmm3, xmm11) + a2(pxor xmm2, xmm8) + 
a2(pxor xmm3, xmm9) + a2(movdqa xmm8, xmm5) + a2(movdqa xmm9, xmm4) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm0, xmm8) + a2(pxor xmm1, xmm9) + a2(movdqa xmm8, xmm2) + a2(movdqa xmm9, xmm3) + a2(movdqa xmm10, xmm6) + a2(movdqa xmm11, xmm7) + a2(movdqa xmm2, xmm7) + a2(movdqa xmm3, xmm6) + a2(punpcklqdq xmm10, xmm6) + a2(punpcklqdq xmm11, xmm7) + a2(movdqa xmm6, xmm8) + a2(movdqa xmm7, xmm9) + a2(punpcklqdq xmm9, xmm9) + a2(punpcklqdq xmm8, xmm8) + a2(punpckhqdq xmm2, xmm10) + a2(punpckhqdq xmm3, xmm11) + a2(punpckhqdq xmm6, xmm9) + a2(punpckhqdq xmm7, xmm8) + aj(ja scrypt_salsa64_sse2_loop) + a2(paddq xmm0,[rsp+0]) + a2(paddq xmm1,[rsp+16]) + a2(paddq xmm2,[rsp+32]) + a2(paddq xmm3,[rsp+48]) + a2(paddq xmm4,[rsp+64]) + a2(paddq xmm5,[rsp+80]) + a2(paddq xmm6,[rsp+96]) + a2(paddq xmm7,[rsp+112]) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0xff) + a2(add r9,128) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(movdqa [rax+0],xmm0) + a2(movdqa [rax+16],xmm1) + a2(movdqa [rax+32],xmm2) + a2(movdqa [rax+48],xmm3) + a2(movdqa [rax+64],xmm4) + a2(movdqa [rax+80],xmm5) + a2(movdqa [rax+96],xmm6) + a2(movdqa [rax+112],xmm7) + aj(jne scrypt_ChunkMix_sse2_loop) + a2(mov rsp, rbp) + a1(pop rbp) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_sse2) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) + +#define SCRYPT_SALSA64_SSE2 + +static void asm_calling_convention +scrypt_ChunkMix_sse2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + x4 = xmmp[4]; + x5 = xmmp[5]; + x6 = xmmp[6]; + x7 = xmmp[7]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + t4 = x4; + t5 = x5; + t6 = x6; + t7 = x7; + + for (rounds = 8; rounds; rounds -= 2) { + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, 
x0); + z1 = _mm_add_epi64(x7, x1); + z2 = _mm_srli_epi64(z0, 64-13); + z3 = _mm_srli_epi64(z1, 64-13); + z0 = _mm_slli_epi64(z0, 13); + z1 = _mm_slli_epi64(z1, 13); + x4 = _mm_xor_si128(x4, z2); + x5 = _mm_xor_si128(x5, z3); + x4 = _mm_xor_si128(x4, z0); + x5 = _mm_xor_si128(x5, z1); + + z0 = _mm_add_epi64(x4, x6); + z1 = _mm_add_epi64(x5, x7); + z2 = _mm_srli_epi64(z0, 64-39); + z3 = _mm_srli_epi64(z1, 64-39); + z0 = _mm_slli_epi64(z0, 39); + z1 = _mm_slli_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z2); + x3 = _mm_xor_si128(x3, z3); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x4); + z1 = _mm_add_epi64(x3, x5); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x4; + z1 = x5; + z2 = x2; + z3 = x3; + x4 = z1; + x5 = z0; + x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6)); + x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7)); + x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3)); + x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2)); + + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, x0); + z1 = _mm_add_epi64(x7, x1); + z2 = _mm_srli_epi64(z0, 64-13); + z3 = _mm_srli_epi64(z1, 64-13); + z0 = _mm_slli_epi64(z0, 13); + z1 = _mm_slli_epi64(z1, 13); + x4 = _mm_xor_si128(x4, z2); + x5 = _mm_xor_si128(x5, z3); + x4 = _mm_xor_si128(x4, z0); + x5 = _mm_xor_si128(x5, z1); + + z0 = _mm_add_epi64(x4, x6); + z1 = _mm_add_epi64(x5, x7); + z2 = _mm_srli_epi64(z0, 64-39); + z3 = _mm_srli_epi64(z1, 64-39); + z0 = _mm_slli_epi64(z0, 39); + z1 = _mm_slli_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z2); + x3 = _mm_xor_si128(x3, z3); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x4); + z1 = _mm_add_epi64(x3, x5); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x4; + z1 = x5; + z2 = x2; + z3 = x3; + x4 = z1; + x5 = z0; + x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6)); + x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7)); + x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3)); + x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2)); + } + + x0 = _mm_add_epi64(x0, t0); + x1 = _mm_add_epi64(x1, t1); + x2 = _mm_add_epi64(x2, t2); + x3 = _mm_add_epi64(x3, t3); + x4 = _mm_add_epi64(x4, t4); + x5 = _mm_add_epi64(x5, t5); + x6 = _mm_add_epi64(x6, t6); + x7 = _mm_add_epi64(x7, t7); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + xmmp[4] = x4; + xmmp[5] = x5; + xmmp[6] = x6; + xmmp[7] = x7; + } +} + +#endif + +#if defined(SCRYPT_SALSA64_SSE2) + #undef SCRYPT_MIX + #define SCRYPT_MIX "Salsa64/8-SSE2" + #undef SCRYPT_SALSA64_INCLUDED + #define SCRYPT_SALSA64_INCLUDED +#endif + +/* sse3/avx use this as well */ +#if defined(SCRYPT_SALSA64_INCLUDED) + /* + Default layout: + 0 1 2 3 + 4 5 6 7 + 8 9 10 11 + 12 13 14 15 + + SSE2 layout: + 0 5 10 15 + 12 1 6 11 + 8 13 2 7 + 4 9 14 3 + */ + + + static void asm_calling_convention + salsa64_core_tangle_sse2(uint64_t *blocks, size_t count) { + uint64_t t; + while (count--) { + t = 
blocks[1]; blocks[1] = blocks[5]; blocks[5] = t; + t = blocks[2]; blocks[2] = blocks[10]; blocks[10] = t; + t = blocks[3]; blocks[3] = blocks[15]; blocks[15] = t; + t = blocks[4]; blocks[4] = blocks[12]; blocks[12] = t; + t = blocks[7]; blocks[7] = blocks[11]; blocks[11] = t; + t = blocks[9]; blocks[9] = blocks[13]; blocks[13] = t; + blocks += 16; + } + } +#endif \ No newline at end of file diff --git a/scryptjane/scrypt-jane-mix_salsa64-ssse3.h b/scryptjane/scrypt-jane-mix_salsa64-ssse3.h new file mode 100644 index 00000000..d1841283 --- /dev/null +++ b/scryptjane/scrypt-jane-mix_salsa64-ssse3.h @@ -0,0 +1,399 @@ +/* x64 */ +#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_SALSA64_SSSE3 + +asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_ssse3) + a1(push rbp) + a2(mov rbp, rsp) + a2(and rsp, ~63) + a2(sub rsp, 128) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ + a2(shl rcx,7) + a2(lea r9,[rcx-128]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(movdqa xmm0,[rax+0]) + a2(movdqa xmm1,[rax+16]) + a2(movdqa xmm2,[rax+32]) + a2(movdqa xmm3,[rax+48]) + a2(movdqa xmm4,[rax+64]) + a2(movdqa xmm5,[rax+80]) + a2(movdqa xmm6,[rax+96]) + a2(movdqa xmm7,[rax+112]) + aj(jz scrypt_ChunkMix_ssse3_no_xor1) + a2(pxor xmm0,[r9+0]) + a2(pxor xmm1,[r9+16]) + a2(pxor xmm2,[r9+32]) + a2(pxor xmm3,[r9+48]) + a2(pxor xmm4,[r9+64]) + a2(pxor xmm5,[r9+80]) + a2(pxor xmm6,[r9+96]) + a2(pxor xmm7,[r9+112]) + a1(scrypt_ChunkMix_ssse3_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_ssse3_loop:) + a2(and rdx, rdx) + a2(pxor xmm0,[rsi+r9+0]) + a2(pxor xmm1,[rsi+r9+16]) + a2(pxor xmm2,[rsi+r9+32]) + a2(pxor xmm3,[rsi+r9+48]) + a2(pxor xmm4,[rsi+r9+64]) + a2(pxor xmm5,[rsi+r9+80]) + a2(pxor xmm6,[rsi+r9+96]) + a2(pxor xmm7,[rsi+r9+112]) + aj(jz scrypt_ChunkMix_ssse3_no_xor2) + a2(pxor xmm0,[rdx+r9+0]) + a2(pxor xmm1,[rdx+r9+16]) + a2(pxor xmm2,[rdx+r9+32]) + a2(pxor xmm3,[rdx+r9+48]) + a2(pxor xmm4,[rdx+r9+64]) + a2(pxor xmm5,[rdx+r9+80]) + a2(pxor xmm6,[rdx+r9+96]) + a2(pxor xmm7,[rdx+r9+112]) + a1(scrypt_ChunkMix_ssse3_no_xor2:) + a2(movdqa [rsp+0],xmm0) + a2(movdqa [rsp+16],xmm1) + a2(movdqa [rsp+32],xmm2) + a2(movdqa [rsp+48],xmm3) + a2(movdqa [rsp+64],xmm4) + a2(movdqa [rsp+80],xmm5) + a2(movdqa [rsp+96],xmm6) + a2(movdqa [rsp+112],xmm7) + a2(mov rax,8) + a1(scrypt_salsa64_ssse3_loop: ) + a2(movdqa xmm8, xmm0) + a2(movdqa xmm9, xmm1) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm6, xmm8) + a2(pxor xmm7, xmm9) + a2(movdqa xmm10, xmm0) + a2(movdqa xmm11, xmm1) + a2(paddq xmm10, xmm6) + a2(paddq xmm11, xmm7) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 51) + a2(psrlq xmm11, 51) + a2(psllq xmm8, 13) + a2(psllq xmm9, 13) + a2(pxor xmm4, xmm10) + a2(pxor xmm5, xmm11) + a2(pxor xmm4, xmm8) + a2(pxor xmm5, xmm9) + a2(movdqa xmm10, xmm6) + a2(movdqa xmm11, xmm7) + a2(paddq xmm10, xmm4) + a2(paddq xmm11, xmm5) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 25) + a2(psrlq xmm11, 25) + a2(psllq xmm8, 39) + a2(psllq xmm9, 39) + a2(pxor xmm2, xmm10) + a2(pxor xmm3, xmm11) + a2(pxor xmm2, xmm8) + a2(pxor xmm3, xmm9) + a2(movdqa xmm8, xmm4) + a2(movdqa xmm9, xmm5) + a2(paddq xmm8, 
xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm0, xmm8) + a2(pxor xmm1, xmm9) + a2(movdqa xmm10, xmm2) + a2(movdqa xmm11, xmm3) + a2(movdqa xmm2, xmm6) + a2(movdqa xmm3, xmm7) + a3(palignr xmm2, xmm7, 8) + a3(palignr xmm3, xmm6, 8) + a2(movdqa xmm6, xmm11) + a2(movdqa xmm7, xmm10) + a3(palignr xmm6, xmm10, 8) + a3(palignr xmm7, xmm11, 8) + a2(sub rax, 2) + a2(movdqa xmm8, xmm0) + a2(movdqa xmm9, xmm1) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm6, xmm8) + a2(pxor xmm7, xmm9) + a2(movdqa xmm10, xmm0) + a2(movdqa xmm11, xmm1) + a2(paddq xmm10, xmm6) + a2(paddq xmm11, xmm7) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 51) + a2(psrlq xmm11, 51) + a2(psllq xmm8, 13) + a2(psllq xmm9, 13) + a2(pxor xmm5, xmm10) + a2(pxor xmm4, xmm11) + a2(pxor xmm5, xmm8) + a2(pxor xmm4, xmm9) + a2(movdqa xmm10, xmm6) + a2(movdqa xmm11, xmm7) + a2(paddq xmm10, xmm5) + a2(paddq xmm11, xmm4) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 25) + a2(psrlq xmm11, 25) + a2(psllq xmm8, 39) + a2(psllq xmm9, 39) + a2(pxor xmm2, xmm10) + a2(pxor xmm3, xmm11) + a2(pxor xmm2, xmm8) + a2(pxor xmm3, xmm9) + a2(movdqa xmm8, xmm5) + a2(movdqa xmm9, xmm4) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm0, xmm8) + a2(pxor xmm1, xmm9) + a2(movdqa xmm10, xmm2) + a2(movdqa xmm11, xmm3) + a2(movdqa xmm2, xmm6) + a2(movdqa xmm3, xmm7) + a3(palignr xmm2, xmm7, 8) + a3(palignr xmm3, xmm6, 8) + a2(movdqa xmm6, xmm11) + a2(movdqa xmm7, xmm10) + a3(palignr xmm6, xmm10, 8) + a3(palignr xmm7, xmm11, 8) + aj(ja scrypt_salsa64_ssse3_loop) + a2(paddq xmm0,[rsp+0]) + a2(paddq xmm1,[rsp+16]) + a2(paddq xmm2,[rsp+32]) + a2(paddq xmm3,[rsp+48]) + a2(paddq xmm4,[rsp+64]) + a2(paddq xmm5,[rsp+80]) + a2(paddq xmm6,[rsp+96]) + a2(paddq xmm7,[rsp+112]) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0xff) + a2(add r9,128) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(movdqa [rax+0],xmm0) + a2(movdqa [rax+16],xmm1) + a2(movdqa [rax+32],xmm2) + a2(movdqa [rax+48],xmm3) + a2(movdqa [rax+64],xmm4) + a2(movdqa [rax+80],xmm5) + a2(movdqa [rax+96],xmm6) + a2(movdqa [rax+112],xmm7) + aj(jne scrypt_ChunkMix_ssse3_loop) + a2(mov rsp, rbp) + a1(pop rbp) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_ssse3) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) + +#define SCRYPT_SALSA64_SSSE3 + +static void asm_calling_convention +scrypt_ChunkMix_ssse3(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + x4 = xmmp[4]; + x5 = xmmp[5]; + x6 = xmmp[6]; + x7 = xmmp[7]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, 
half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + t4 = x4; + t5 = x5; + t6 = x6; + t7 = x7; + + for (rounds = 8; rounds; rounds -= 2) { + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, x0); + z1 = _mm_add_epi64(x7, x1); + z2 = _mm_srli_epi64(z0, 64-13); + z3 = _mm_srli_epi64(z1, 64-13); + z0 = _mm_slli_epi64(z0, 13); + z1 = _mm_slli_epi64(z1, 13); + x4 = _mm_xor_si128(x4, z2); + x5 = _mm_xor_si128(x5, z3); + x4 = _mm_xor_si128(x4, z0); + x5 = _mm_xor_si128(x5, z1); + + z0 = _mm_add_epi64(x4, x6); + z1 = _mm_add_epi64(x5, x7); + z2 = _mm_srli_epi64(z0, 64-39); + z3 = _mm_srli_epi64(z1, 64-39); + z0 = _mm_slli_epi64(z0, 39); + z1 = _mm_slli_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z2); + x3 = _mm_xor_si128(x3, z3); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x4); + z1 = _mm_add_epi64(x3, x5); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x2; + z1 = x3; + x2 = _mm_alignr_epi8(x6, x7, 8); + x3 = _mm_alignr_epi8(x7, x6, 8); + x6 = _mm_alignr_epi8(z1, z0, 8); + x7 = _mm_alignr_epi8(z0, z1, 8); + + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, x0); + z1 = _mm_add_epi64(x7, x1); + z2 = _mm_srli_epi64(z0, 64-13); + z3 = _mm_srli_epi64(z1, 64-13); + z0 = _mm_slli_epi64(z0, 13); + z1 = _mm_slli_epi64(z1, 13); + x5 = _mm_xor_si128(x5, z2); + x4 = _mm_xor_si128(x4, z3); + x5 = _mm_xor_si128(x5, z0); + x4 = _mm_xor_si128(x4, z1); + + z0 = _mm_add_epi64(x5, x6); + z1 = _mm_add_epi64(x4, x7); + z2 = _mm_srli_epi64(z0, 64-39); + z3 = _mm_srli_epi64(z1, 64-39); + z0 = _mm_slli_epi64(z0, 39); + z1 = _mm_slli_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z2); + x3 = _mm_xor_si128(x3, z3); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x5); + z1 = _mm_add_epi64(x3, x4); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x2; + z1 = x3; + x2 = _mm_alignr_epi8(x6, x7, 8); + x3 = _mm_alignr_epi8(x7, x6, 8); + x6 = _mm_alignr_epi8(z1, z0, 8); + x7 = _mm_alignr_epi8(z0, z1, 8); + } + + x0 = _mm_add_epi64(x0, t0); + x1 = _mm_add_epi64(x1, t1); + x2 = _mm_add_epi64(x2, t2); + x3 = _mm_add_epi64(x3, t3); + x4 = _mm_add_epi64(x4, t4); + x5 = _mm_add_epi64(x5, t5); + x6 = _mm_add_epi64(x6, t6); + x7 = _mm_add_epi64(x7, 
t7); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + xmmp[4] = x4; + xmmp[5] = x5; + xmmp[6] = x6; + xmmp[7] = x7; + } +} + +#endif + +#if defined(SCRYPT_SALSA64_SSSE3) + /* uses salsa64_core_tangle_sse2 */ + + #undef SCRYPT_MIX + #define SCRYPT_MIX "Salsa64/8-SSSE3" + #undef SCRYPT_SALSA64_INCLUDED + #define SCRYPT_SALSA64_INCLUDED +#endif diff --git a/scryptjane/scrypt-jane-mix_salsa64-xop.h b/scryptjane/scrypt-jane-mix_salsa64-xop.h new file mode 100644 index 00000000..d51d1121 --- /dev/null +++ b/scryptjane/scrypt-jane-mix_salsa64-xop.h @@ -0,0 +1,335 @@ +/* x64 */ +#if defined(X86_64ASM_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_SALSA64_XOP + +asm_naked_fn_proto(void, scrypt_ChunkMix_xop)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_xop) + a1(push rbp) + a2(mov rbp, rsp) + a2(and rsp, ~63) + a2(sub rsp, 128) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ + a2(shl rcx,7) + a2(lea r9,[rcx-128]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(vmovdqa xmm0,[rax+0]) + a2(vmovdqa xmm1,[rax+16]) + a2(vmovdqa xmm2,[rax+32]) + a2(vmovdqa xmm3,[rax+48]) + a2(vmovdqa xmm4,[rax+64]) + a2(vmovdqa xmm5,[rax+80]) + a2(vmovdqa xmm6,[rax+96]) + a2(vmovdqa xmm7,[rax+112]) + aj(jz scrypt_ChunkMix_xop_no_xor1) + a3(vpxor xmm0,xmm0,[r9+0]) + a3(vpxor xmm1,xmm1,[r9+16]) + a3(vpxor xmm2,xmm2,[r9+32]) + a3(vpxor xmm3,xmm3,[r9+48]) + a3(vpxor xmm4,xmm4,[r9+64]) + a3(vpxor xmm5,xmm5,[r9+80]) + a3(vpxor xmm6,xmm6,[r9+96]) + a3(vpxor xmm7,xmm7,[r9+112]) + a1(scrypt_ChunkMix_xop_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_xop_loop:) + a2(and rdx, rdx) + a3(vpxor xmm0,xmm0,[rsi+r9+0]) + a3(vpxor xmm1,xmm1,[rsi+r9+16]) + a3(vpxor xmm2,xmm2,[rsi+r9+32]) + a3(vpxor xmm3,xmm3,[rsi+r9+48]) + a3(vpxor xmm4,xmm4,[rsi+r9+64]) + a3(vpxor xmm5,xmm5,[rsi+r9+80]) + a3(vpxor xmm6,xmm6,[rsi+r9+96]) + a3(vpxor xmm7,xmm7,[rsi+r9+112]) + aj(jz scrypt_ChunkMix_xop_no_xor2) + a3(vpxor xmm0,xmm0,[rdx+r9+0]) + a3(vpxor xmm1,xmm1,[rdx+r9+16]) + a3(vpxor xmm2,xmm2,[rdx+r9+32]) + a3(vpxor xmm3,xmm3,[rdx+r9+48]) + a3(vpxor xmm4,xmm4,[rdx+r9+64]) + a3(vpxor xmm5,xmm5,[rdx+r9+80]) + a3(vpxor xmm6,xmm6,[rdx+r9+96]) + a3(vpxor xmm7,xmm7,[rdx+r9+112]) + a1(scrypt_ChunkMix_xop_no_xor2:) + a2(vmovdqa [rsp+0],xmm0) + a2(vmovdqa [rsp+16],xmm1) + a2(vmovdqa [rsp+32],xmm2) + a2(vmovdqa [rsp+48],xmm3) + a2(vmovdqa [rsp+64],xmm4) + a2(vmovdqa [rsp+80],xmm5) + a2(vmovdqa [rsp+96],xmm6) + a2(vmovdqa [rsp+112],xmm7) + a2(mov rax,8) + a1(scrypt_salsa64_xop_loop: ) + a3(vpaddq xmm8, xmm0, xmm2) + a3(vpaddq xmm9, xmm1, xmm3) + a3(vpshufd xmm8, xmm8, 0xb1) + a3(vpshufd xmm9, xmm9, 0xb1) + a3(vpxor xmm6, xmm6, xmm8) + a3(vpxor xmm7, xmm7, xmm9) + a3(vpaddq xmm10, xmm0, xmm6) + a3(vpaddq xmm11, xmm1, xmm7) + a3(vprotq xmm10, xmm10, 13) + a3(vprotq xmm11, xmm11, 13) + a3(vpxor xmm4, xmm4, xmm10) + a3(vpxor xmm5, xmm5, xmm11) + a3(vpaddq xmm8, xmm6, xmm4) + a3(vpaddq xmm9, xmm7, xmm5) + a3(vprotq xmm8, xmm8, 39) + a3(vprotq xmm9, xmm9, 39) + a3(vpxor xmm2, xmm2, xmm8) + a3(vpxor xmm3, xmm3, xmm9) + a3(vpaddq xmm10, xmm4, xmm2) + a3(vpaddq xmm11, xmm5, xmm3) + a3(vpshufd xmm10, xmm10, 0xb1) + a3(vpshufd xmm11, xmm11, 0xb1) + a3(vpxor 
xmm0, xmm0, xmm10) + a3(vpxor xmm1, xmm1, xmm11) + a2(vmovdqa xmm8, xmm2) + a2(vmovdqa xmm9, xmm3) + a4(vpalignr xmm2, xmm6, xmm7, 8) + a4(vpalignr xmm3, xmm7, xmm6, 8) + a4(vpalignr xmm6, xmm9, xmm8, 8) + a4(vpalignr xmm7, xmm8, xmm9, 8) + a3(vpaddq xmm10, xmm0, xmm2) + a3(vpaddq xmm11, xmm1, xmm3) + a3(vpshufd xmm10, xmm10, 0xb1) + a3(vpshufd xmm11, xmm11, 0xb1) + a3(vpxor xmm6, xmm6, xmm10) + a3(vpxor xmm7, xmm7, xmm11) + a3(vpaddq xmm8, xmm0, xmm6) + a3(vpaddq xmm9, xmm1, xmm7) + a3(vprotq xmm8, xmm8, 13) + a3(vprotq xmm9, xmm9, 13) + a3(vpxor xmm5, xmm5, xmm8) + a3(vpxor xmm4, xmm4, xmm9) + a3(vpaddq xmm10, xmm6, xmm5) + a3(vpaddq xmm11, xmm7, xmm4) + a3(vprotq xmm10, xmm10, 39) + a3(vprotq xmm11, xmm11, 39) + a3(vpxor xmm2, xmm2, xmm10) + a3(vpxor xmm3, xmm3, xmm11) + a3(vpaddq xmm8, xmm5, xmm2) + a3(vpaddq xmm9, xmm4, xmm3) + a3(vpshufd xmm8, xmm8, 0xb1) + a3(vpshufd xmm9, xmm9, 0xb1) + a3(vpxor xmm0, xmm0, xmm8) + a3(vpxor xmm1, xmm1, xmm9) + a2(vmovdqa xmm10, xmm2) + a2(vmovdqa xmm11, xmm3) + a4(vpalignr xmm2, xmm6, xmm7, 8) + a4(vpalignr xmm3, xmm7, xmm6, 8) + a4(vpalignr xmm6, xmm11, xmm10, 8) + a4(vpalignr xmm7, xmm10, xmm11, 8) + a2(sub rax, 2) + aj(ja scrypt_salsa64_xop_loop) + a3(vpaddq xmm0,xmm0,[rsp+0]) + a3(vpaddq xmm1,xmm1,[rsp+16]) + a3(vpaddq xmm2,xmm2,[rsp+32]) + a3(vpaddq xmm3,xmm3,[rsp+48]) + a3(vpaddq xmm4,xmm4,[rsp+64]) + a3(vpaddq xmm5,xmm5,[rsp+80]) + a3(vpaddq xmm6,xmm6,[rsp+96]) + a3(vpaddq xmm7,xmm7,[rsp+112]) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0xff) + a2(add r9,128) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(vmovdqa [rax+0],xmm0) + a2(vmovdqa [rax+16],xmm1) + a2(vmovdqa [rax+32],xmm2) + a2(vmovdqa [rax+48],xmm3) + a2(vmovdqa [rax+64],xmm4) + a2(vmovdqa [rax+80],xmm5) + a2(vmovdqa [rax+96],xmm6) + a2(vmovdqa [rax+112],xmm7) + aj(jne scrypt_ChunkMix_xop_loop) + a2(mov rsp, rbp) + a1(pop rbp) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_xop) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) + +#define SCRYPT_SALSA64_XOP + +static void asm_calling_convention +scrypt_ChunkMix_xop(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + x4 = xmmp[4]; + x5 = xmmp[5]; + x6 = xmmp[6]; + x7 = xmmp[7]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = 
_mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + t4 = x4; + t5 = x5; + t6 = x6; + t7 = x7; + + for (rounds = 8; rounds; rounds -= 2) { + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, x0); + z1 = _mm_add_epi64(x7, x1); + z0 = _mm_roti_epi64(z0, 13); + z1 = _mm_roti_epi64(z1, 13); + x4 = _mm_xor_si128(x4, z0); + x5 = _mm_xor_si128(x5, z1); + + z0 = _mm_add_epi64(x4, x6); + z1 = _mm_add_epi64(x5, x7); + z0 = _mm_roti_epi64(z0, 39); + z1 = _mm_roti_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x4); + z1 = _mm_add_epi64(x3, x5); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x2; + z1 = x3; + x2 = _mm_alignr_epi8(x6, x7, 8); + x3 = _mm_alignr_epi8(x7, x6, 8); + x6 = _mm_alignr_epi8(z1, z0, 8); + x7 = _mm_alignr_epi8(z0, z1, 8); + + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, x0); + z1 = _mm_add_epi64(x7, x1); + z0 = _mm_roti_epi64(z0, 13); + z1 = _mm_roti_epi64(z1, 13); + x5 = _mm_xor_si128(x5, z0); + x4 = _mm_xor_si128(x4, z1); + + z0 = _mm_add_epi64(x5, x6); + z1 = _mm_add_epi64(x4, x7); + z0 = _mm_roti_epi64(z0, 39); + z1 = _mm_roti_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x5); + z1 = _mm_add_epi64(x3, x4); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x2; + z1 = x3; + x2 = _mm_alignr_epi8(x6, x7, 8); + x3 = _mm_alignr_epi8(x7, x6, 8); + x6 = _mm_alignr_epi8(z1, z0, 8); + x7 = _mm_alignr_epi8(z0, z1, 8); + } + + x0 = _mm_add_epi64(x0, t0); + x1 = _mm_add_epi64(x1, t1); + x2 = _mm_add_epi64(x2, t2); + x3 = _mm_add_epi64(x3, t3); + x4 = _mm_add_epi64(x4, t4); + x5 = _mm_add_epi64(x5, t5); + x6 = _mm_add_epi64(x6, t6); + x7 = _mm_add_epi64(x7, t7); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + xmmp[4] = x4; + xmmp[5] = x5; + xmmp[6] = x6; + xmmp[7] = x7; + } +} + +#endif + +#if defined(SCRYPT_SALSA64_XOP) + /* uses salsa64_core_tangle_sse2 */ + + #undef SCRYPT_MIX + #define SCRYPT_MIX "Salsa64/8-XOP" + #undef SCRYPT_SALSA64_INCLUDED + #define SCRYPT_SALSA64_INCLUDED +#endif diff --git a/scryptjane/scrypt-jane-mix_salsa64.h b/scryptjane/scrypt-jane-mix_salsa64.h new file mode 100644 index 00000000..2aec04f3 --- /dev/null +++ b/scryptjane/scrypt-jane-mix_salsa64.h @@ -0,0 +1,41 @@ +#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED) + +#undef SCRYPT_MIX +#define SCRYPT_MIX "Salsa64/8 Ref" + +#undef SCRYPT_SALSA64_INCLUDED +#define SCRYPT_SALSA64_INCLUDED +#define SCRYPT_SALSA64_BASIC + +static void +salsa64_core_basic(uint64_t state[16]) { + const size_t 
rounds = 8; + uint64_t v[16], t; + size_t i; + + for (i = 0; i < 16; i++) v[i] = state[i]; + + #define G(a,b,c,d) \ + t = v[a]+v[d]; t = ROTL64(t, 32); v[b] ^= t; \ + t = v[b]+v[a]; t = ROTL64(t, 13); v[c] ^= t; \ + t = v[c]+v[b]; t = ROTL64(t, 39); v[d] ^= t; \ + t = v[d]+v[c]; t = ROTL64(t, 32); v[a] ^= t; \ + + for (i = 0; i < rounds; i += 2) { + G( 0, 4, 8,12); + G( 5, 9,13, 1); + G(10,14, 2, 6); + G(15, 3, 7,11); + G( 0, 1, 2, 3); + G( 5, 6, 7, 4); + G(10,11, 8, 9); + G(15,12,13,14); + } + + for (i = 0; i < 16; i++) state[i] += v[i]; + + #undef G +} + +#endif + diff --git a/scryptjane/scrypt-jane-portable-x86.h b/scryptjane/scrypt-jane-portable-x86.h index 03282fa8..396a7bd8 100644 --- a/scryptjane/scrypt-jane-portable-x86.h +++ b/scryptjane/scrypt-jane-portable-x86.h @@ -1,15 +1,20 @@ #if defined(CPU_X86) && (defined(COMPILER_MSVC) || defined(COMPILER_GCC)) #define X86ASM + /* gcc 2.95 royally screws up stack alignments on variables */ - #if (defined(COMPILER_MSVC6PP_AND_LATER) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 30000))) + #if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS6PP)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 30000))) #define X86ASM_SSE #define X86ASM_SSE2 #endif - #if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= 1400)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40102))) + #if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2005)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40102))) #define X86ASM_SSSE3 #endif - #if ((defined(COMPILER_GCC) && (COMPILER_GCC >= 40400))) + #if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2010SP1)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40400))) #define X86ASM_AVX + #define X86ASM_XOP + #endif + #if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2012)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40700))) + #define X86ASM_AVX2 #endif #endif @@ -21,10 +26,14 @@ #endif #if (COMPILER_GCC >= 40400) #define X86_64ASM_AVX + #define X86_64ASM_XOP + #endif + #if (COMPILER_GCC >= 40700) + #define X86_64ASM_AVX2 #endif #endif -#if defined(COMPILER_MSVC) +#if defined(COMPILER_MSVC) && (defined(CPU_X86_FORCE_INTRINSICS) || defined(CPU_X86_64)) #define X86_INTRINSIC #if defined(CPU_X86_64) || defined(X86ASM_SSE) #define X86_INTRINSIC_SSE @@ -32,17 +41,16 @@ #if defined(CPU_X86_64) || defined(X86ASM_SSE2) #define X86_INTRINSIC_SSE2 #endif - #if (COMPILER_MSVC >= 1400) + #if (COMPILER_MSVC >= COMPILER_MSVC_VS2005) #define X86_INTRINSIC_SSSE3 #endif -#endif - -#if defined(COMPILER_MSVC) && defined(CPU_X86_64) - #define X86_64USE_INTRINSIC -#endif - -#if defined(COMPILER_MSVC) && defined(CPU_X86_64) - #define X86_64USE_INTRINSIC + #if (COMPILER_MSVC >= COMPILER_MSVC_VS2010SP1) + #define X86_INTRINSIC_AVX + #define X86_INTRINSIC_XOP + #endif + #if (COMPILER_MSVC >= COMPILER_MSVC_VS2012) + #define X86_INTRINSIC_AVX2 + #endif #endif #if defined(COMPILER_GCC) && defined(CPU_X86_FORCE_INTRINSICS) @@ -59,12 +67,17 @@ #if defined(__AVX__) #define X86_INTRINSIC_AVX #endif + #if defined(__XOP__) + #define X86_INTRINSIC_XOP + #endif + #if defined(__AVX2__) + #define X86_INTRINSIC_AVX2 + #endif #endif /* only use simd on windows (or SSE2 on gcc)! 
*/ #if defined(CPU_X86_FORCE_INTRINSICS) || defined(X86_INTRINSIC) #if defined(X86_INTRINSIC_SSE) - #define X86_INTRINSIC #include #include typedef __m64 qmm; @@ -72,17 +85,27 @@ typedef __m128d xmmd; #endif #if defined(X86_INTRINSIC_SSE2) - #define X86_INTRINSIC_SSE2 #include typedef __m128i xmmi; #endif #if defined(X86_INTRINSIC_SSSE3) - #define X86_INTRINSIC_SSSE3 #include #endif + #if defined(X86_INTRINSIC_AVX) + #include + #endif + #if defined(X86_INTRINSIC_XOP) + #if defined(COMPILER_MSVC) + #include + #else + #include + #endif + #endif + #if defined(X86_INTRINSIC_AVX2) + typedef __m256i ymmi; + #endif #endif - #if defined(X86_INTRINSIC_SSE2) typedef union packedelem8_t { uint8_t u[16]; @@ -115,11 +138,9 @@ } packedelem64; #endif -#if defined(X86_INTRINSIC_SSSE3) || defined(X86ASM_SSSE3) || defined(X86_64ASM_SSSE3) - const packedelem8 MM16 ssse3_rotr16_64bit = {{2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9}}; - const packedelem8 MM16 ssse3_rotl16_32bit = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}}; - const packedelem8 MM16 ssse3_rotl8_32bit = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}}; - const packedelem8 MM16 ssse3_endian_swap_64bit = {{7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8}}; +#if defined(X86_INTRINSIC_SSSE3) + static const packedelem8 ALIGN(16) ssse3_rotl16_32bit = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}}; + static const packedelem8 ALIGN(16) ssse3_rotl8_32bit = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}}; #endif /* @@ -130,7 +151,8 @@ a1(..) a2(.., ..) a3(.., .., ..) - a1(ret) + 64bit OR 0 paramters: a1(ret) + 32bit AND n parameters: aret(4n), eg aret(16) for 4 parameters asm_naked_fn_end(name) */ @@ -142,12 +164,13 @@ #define a2(x, y) __asm {x, y} #define a3(x, y, z) __asm {x, y, z} #define a4(x, y, z, w) __asm {x, y, z, w} - #define al(x) __asm {label##x:} - #define aj(x, y, z) __asm {x label##y} + #define aj(x) __asm {x} #define asm_align8 a1(ALIGN 8) #define asm_align16 a1(ALIGN 16) - #define asm_naked_fn_proto(type, fn) static NAKED type STDCALL fn + #define asm_calling_convention STDCALL + #define aret(n) a1(ret n) + #define asm_naked_fn_proto(type, fn) static NAKED type asm_calling_convention fn #define asm_naked_fn(fn) { #define asm_naked_fn_end(fn) } #elif defined(COMPILER_GCC) @@ -155,21 +178,66 @@ #define GNU_AS2(x, y) #x ", " #y ";\n" #define GNU_AS3(x, y, z) #x ", " #y ", " #z ";\n" #define GNU_AS4(x, y, z, w) #x ", " #y ", " #z ", " #w ";\n" - #define GNU_ASL(x) "\n" #x ":\n" - #define GNU_ASJ(x, y, z) #x " " #y #z ";" + #define GNU_ASFN(x) "\n_" #x ":\n" #x ":\n" + #define GNU_ASJ(x) ".att_syntax prefix\n" #x "\n.intel_syntax noprefix\n" #define a1(x) GNU_AS1(x) #define a2(x, y) GNU_AS2(x, y) #define a3(x, y, z) GNU_AS3(x, y, z) #define a4(x, y, z, w) GNU_AS4(x, y, z, w) - #define al(x) GNU_ASL(x) - #define aj(x, y, z) GNU_ASJ(x, y, z) - #define asm_align8 a1(.align 8) - #define asm_align16 a1(.align 16) - - #define asm_naked_fn_proto(type, fn) extern type STDCALL fn - #define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASL(fn) - #define asm_naked_fn_end(fn) ".att_syntax prefix;\n.type " #fn ",@function\n.size " #fn ",.-" #fn "\n" ); + #define aj(x) GNU_ASJ(x) + #define asm_align8 ".p2align 3,,7" + #define asm_align16 ".p2align 4,,15" + + #if defined(OS_WINDOWS) + #define asm_calling_convention CDECL + #define aret(n) a1(ret) + + #if defined(X86_64ASM) + #define asm_naked_fn(fn) ; __asm__ ( \ + ".text\n" \ + asm_align16 GNU_ASFN(fn) \ + "subq $136, %rsp;" \ + "movdqa %xmm6, 0(%rsp);" \ + "movdqa %xmm7, 16(%rsp);" \ + "movdqa %xmm8, 
32(%rsp);" \ + "movdqa %xmm9, 48(%rsp);" \ + "movdqa %xmm10, 64(%rsp);" \ + "movdqa %xmm11, 80(%rsp);" \ + "movdqa %xmm12, 96(%rsp);" \ + "movq %rdi, 112(%rsp);" \ + "movq %rsi, 120(%rsp);" \ + "movq %rcx, %rdi;" \ + "movq %rdx, %rsi;" \ + "movq %r8, %rdx;" \ + "movq %r9, %rcx;" \ + "call 1f;" \ + "movdqa 0(%rsp), %xmm6;" \ + "movdqa 16(%rsp), %xmm7;" \ + "movdqa 32(%rsp), %xmm8;" \ + "movdqa 48(%rsp), %xmm9;" \ + "movdqa 64(%rsp), %xmm10;" \ + "movdqa 80(%rsp), %xmm11;" \ + "movdqa 96(%rsp), %xmm12;" \ + "movq 112(%rsp), %rdi;" \ + "movq 120(%rsp), %rsi;" \ + "addq $136, %rsp;" \ + "ret;" \ + ".intel_syntax noprefix;" \ + ".p2align 4,,15;" \ + "1:;" + #else + #define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn) + #endif + #else + #define asm_calling_convention STDCALL + #define aret(n) a1(ret n) + #define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn) + #endif + + #define asm_naked_fn_proto(type, fn) extern type asm_calling_convention fn + #define asm_naked_fn_end(fn) ".att_syntax prefix;\n" ); + #define asm_gcc() __asm__ __volatile__(".intel_syntax noprefix;\n" #define asm_gcc_parms() ".att_syntax prefix;" #define asm_gcc_trashed() __asm__ __volatile__("" ::: @@ -191,7 +259,9 @@ typedef enum cpu_flags_x86_t { cpu_ssse3 = 1 << 4, cpu_sse4_1 = 1 << 5, cpu_sse4_2 = 1 << 6, - cpu_avx = 1 << 7 + cpu_avx = 1 << 7, + cpu_xop = 1 << 8, + cpu_avx2 = 1 << 9 } cpu_flags_x86; typedef enum cpu_vendors_x86_t { @@ -238,6 +308,7 @@ get_cpuid(x86_regs *regs, uint32_t flags) { asm_gcc() a1(push cpuid_bx) + a2(xor ecx, ecx) a1(cpuid) a2(mov [%1 + 0], eax) a2(mov [%1 + 4], ebx) @@ -274,7 +345,7 @@ detect_cpu(void) { union { uint8_t s[12]; uint32_t i[3]; } vendor_string; cpu_vendors_x86 vendor = cpu_nobody; x86_regs regs; - uint32_t max_level; + uint32_t max_level, max_ext_level; size_t cpu_flags = 0; #if defined(X86ASM_AVX) || defined(X86_64ASM_AVX) uint64_t xgetbv_flags; @@ -320,7 +391,22 @@ detect_cpu(void) { if (regs.edx & (1 << 26)) cpu_flags |= cpu_sse2; if (regs.edx & (1 << 25)) cpu_flags |= cpu_sse; if (regs.edx & (1 << 23)) cpu_flags |= cpu_mmx; - + + if (cpu_flags & cpu_avx) { + if (max_level >= 7) { + get_cpuid(®s, 7); + if (regs.ebx & (1 << 5)) cpu_flags |= cpu_avx2; + } + + get_cpuid(®s, 0x80000000); + max_ext_level = regs.eax; + if (max_ext_level >= 0x80000001) { + get_cpuid(®s, 0x80000001); + if (regs.ecx & (1 << 11)) cpu_flags |= cpu_xop; + } + } + + #if defined(SCRYPT_TEST_SPEED) cpu_flags &= cpu_detect_mask; #endif @@ -331,7 +417,9 @@ detect_cpu(void) { #if defined(SCRYPT_TEST_SPEED) static const char * get_top_cpuflag_desc(size_t flag) { - if (flag & cpu_avx) return "AVX"; + if (flag & cpu_avx2) return "AVX2"; + else if (flag & cpu_xop) return "XOP"; + else if (flag & cpu_avx) return "AVX"; else if (flag & cpu_sse4_2) return "SSE4.2"; else if (flag & cpu_sse4_1) return "SSE4.1"; else if (flag & cpu_ssse3) return "SSSE3"; @@ -344,6 +432,16 @@ get_top_cpuflag_desc(size_t flag) { /* enable the highest system-wide option */ #if defined(SCRYPT_CHOOSE_COMPILETIME) + #if !defined(__AVX2__) + #undef X86_64ASM_AVX2 + #undef X86ASM_AVX2 + #undef X86_INTRINSIC_AVX2 + #endif + #if !defined(__XOP__) + #undef X86_64ASM_XOP + #undef X86ASM_XOP + #undef X86_INTRINSIC_XOP + #endif #if !defined(__AVX__) #undef X86_64ASM_AVX #undef X86ASM_AVX diff --git a/scryptjane/scrypt-jane-portable.h b/scryptjane/scrypt-jane-portable.h index 33c8c2ca..e83e3149 100644 --- a/scryptjane/scrypt-jane-portable.h +++ 
b/scryptjane/scrypt-jane-portable.h @@ -36,14 +36,29 @@ /* determine compiler */ #if defined(_MSC_VER) - #define COMPILER_MSVC _MSC_VER - #if ((COMPILER_MSVC > 1200) || defined(_mm_free)) - #define COMPILER_MSVC6PP_AND_LATER + #define COMPILER_MSVC_VS6 120000000 + #define COMPILER_MSVC_VS6PP 121000000 + #define COMPILER_MSVC_VS2002 130000000 + #define COMPILER_MSVC_VS2003 131000000 + #define COMPILER_MSVC_VS2005 140050727 + #define COMPILER_MSVC_VS2008 150000000 + #define COMPILER_MSVC_VS2008SP1 150030729 + #define COMPILER_MSVC_VS2010 160000000 + #define COMPILER_MSVC_VS2010SP1 160040219 + #define COMPILER_MSVC_VS2012RC 170000000 + #define COMPILER_MSVC_VS2012 170050727 + + #if _MSC_FULL_VER > 100000000 + #define COMPILER_MSVC (_MSC_FULL_VER) + #else + #define COMPILER_MSVC (_MSC_FULL_VER * 10) #endif - #if (COMPILER_MSVC >= 1500) - #define COMPILER_HAS_TMMINTRIN + + #if ((_MSC_VER == 1200) && defined(_mm_free)) + #undef COMPILER_MSVC + #define COMPILER_MSVC COMPILER_MSVC_VS6PP #endif - + #pragma warning(disable : 4127) /* conditional expression is constant */ #pragma warning(disable : 4100) /* unreferenced formal parameter */ @@ -65,6 +80,8 @@ #define ROTR64(a,b) _rotr64(a,b) #undef NOINLINE #define NOINLINE __declspec(noinline) + #undef NORETURN + #define NORETURN #undef INLINE #define INLINE __forceinline #undef FASTCALL @@ -75,7 +92,7 @@ #define STDCALL __stdcall #undef NAKED #define NAKED __declspec(naked) - #define MM16 __declspec(align(16)) + #define ALIGN(n) __declspec(align(n)) #endif #if defined(__ICC) #define COMPILER_INTEL @@ -97,6 +114,12 @@ #else #define NOINLINE #endif + #undef NORETURN + #if (COMPILER_GCC >= 30000) + #define NORETURN __attribute__((noreturn)) + #else + #define NORETURN + #endif #undef INLINE #if (COMPILER_GCC >= 30000) #define INLINE __attribute__((always_inline)) @@ -113,7 +136,7 @@ #define CDECL __attribute__((cdecl)) #undef STDCALL #define STDCALL __attribute__((stdcall)) - #define MM16 __attribute__((aligned(16))) + #define ALIGN(n) __attribute__((aligned(n))) #include #endif #if defined(__MINGW32__) || defined(__MINGW64__) @@ -247,7 +270,7 @@ scrypt_verify(const uint8_t *x, const uint8_t *y, size_t len) { return (1 & ((differentbits - 1) >> 8)); } -void +static void scrypt_ensure_zero(void *p, size_t len) { #if ((defined(CPU_X86) || defined(CPU_X86_64)) && defined(COMPILER_MSVC)) __stosb((unsigned char *)p, 0, len); @@ -279,3 +302,6 @@ scrypt_ensure_zero(void *p, size_t len) { #include "scrypt-jane-portable-x86.h" +#if !defined(asm_calling_convention) +#define asm_calling_convention +#endif diff --git a/scryptjane/scrypt-jane-romix-basic.h b/scryptjane/scrypt-jane-romix-basic.h index ca1df02d..57ba649f 100644 --- a/scryptjane/scrypt-jane-romix-basic.h +++ b/scryptjane/scrypt-jane-romix-basic.h @@ -4,12 +4,13 @@ typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, sc #endif /* romix pre/post nop function */ -static void STDCALL +static void asm_calling_convention scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) { + (void)blocks; (void)nblocks; } /* romix pre/post endian conversion function */ -static void STDCALL +static void asm_calling_convention scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) { #if !defined(CPU_LE) static const union { uint8_t b[2]; uint16_t w; } endian_test = {{1,0}}; @@ -20,18 +21,24 @@ scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) { SCRYPT_WORD_ENDIAN_SWAP(blocks[i]); } } +#else + (void)blocks; (void)nblocks; #endif } /* chunkmix test function */ 
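
A note on the calling-convention change carried through this hunk and the next: the chunkmix/blockfix typedefs that follow, like the nop and endian-convert helpers just above, drop the hard-coded STDCALL in favour of the new asm_calling_convention macro, and scrypt-jane-portable.h now defines that macro as empty whenever no x86-specific header supplies one. A minimal standalone sketch of the pattern; only the macro fallback mirrors the patch, and everything named toy_* is a hypothetical stand-in for the real ChunkMix implementations:

    /* Sketch: asm_calling_convention composed with a function-pointer
     * typedef.  The #if/#define fallback mirrors scrypt-jane-portable.h;
     * toy_mixfn/toy_mix are illustrative names only. */
    #include <stdint.h>
    #include <stdio.h>

    #if !defined(asm_calling_convention)
    #define asm_calling_convention   /* expands to nothing on plain builds */
    #endif

    typedef void (asm_calling_convention *toy_mixfn)(uint32_t *out, const uint32_t *in, uint32_t r);

    static void asm_calling_convention
    toy_mix(uint32_t *out, const uint32_t *in, uint32_t r) {
        uint32_t i;
        for (i = 0; i < r; i++)
            out[i] = in[i] ^ 0xa5a5a5a5u;    /* arbitrary mixing, for illustration only */
    }

    int main(void) {
        uint32_t in[4] = {1, 2, 3, 4}, out[4];
        toy_mixfn fn = toy_mix;              /* pointer and target share the convention */
        fn(out, in, 4);
        printf("%08x\n", (unsigned)out[0]);
        return 0;
    }
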
-typedef void (STDCALL *chunkmixfn)(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r); -typedef void (STDCALL *blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks); +typedef void (asm_calling_convention *chunkmixfn)(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r); +typedef void (asm_calling_convention *blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks); static int scrypt_test_mix_instance(chunkmixfn mixfn, blockfixfn prefn, blockfixfn postfn, const uint8_t expected[16]) { /* r = 2, (2 * r) = 4 blocks in a chunk, 4 * SCRYPT_BLOCK_WORDS total */ const uint32_t r = 2, blocks = 2 * r, words = blocks * SCRYPT_BLOCK_WORDS; - scrypt_mix_word_t MM16 chunk[2][4 * SCRYPT_BLOCK_WORDS], v; +#if (defined(X86ASM_AVX2) || defined(X86_64ASM_AVX2) || defined(X86_INTRINSIC_AVX2)) + scrypt_mix_word_t ALIGN(32) chunk[2][4 * SCRYPT_BLOCK_WORDS], v; +#else + scrypt_mix_word_t ALIGN(16) chunk[2][4 * SCRYPT_BLOCK_WORDS], v; +#endif uint8_t final[16]; size_t i; diff --git a/scryptjane/scrypt-jane-romix-template.h b/scryptjane/scrypt-jane-romix-template.h index 2fd7674e..6bbda621 100644 --- a/scryptjane/scrypt-jane-romix-template.h +++ b/scryptjane/scrypt-jane-romix-template.h @@ -17,9 +17,13 @@ 2*r: number of blocks in the chunk */ -static void STDCALL +static void asm_calling_convention SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r) { - scrypt_mix_word_t MM16 X[SCRYPT_BLOCK_WORDS], *block; +#if (defined(X86ASM_AVX2) || defined(X86_64ASM_AVX2) || defined(X86_INTRINSIC_AVX2)) + scrypt_mix_word_t ALIGN(32) X[SCRYPT_BLOCK_WORDS], *block; +#else + scrypt_mix_word_t ALIGN(16) X[SCRYPT_BLOCK_WORDS], *block; +#endif uint32_t i, j, blocksPerChunk = r * 2, half = 0; /* 1: X = B_{2r - 1} */ @@ -69,7 +73,7 @@ SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *B static void NOINLINE FASTCALL SCRYPT_ROMIX_FN(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N, uint32_t r) { - uint32_t i, j, chunkWords = SCRYPT_BLOCK_WORDS * r * 2; + uint32_t i, j, chunkWords = (uint32_t)(SCRYPT_BLOCK_WORDS * r * 2); scrypt_mix_word_t *block = V; SCRYPT_ROMIX_TANGLE_FN(X, r * 2); diff --git a/scryptjane/scrypt-jane-romix.h b/scryptjane/scrypt-jane-romix.h index faa655a0..84cf6120 100644 --- a/scryptjane/scrypt-jane-romix.h +++ b/scryptjane/scrypt-jane-romix.h @@ -13,11 +13,11 @@ #define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) #if !defined(SCRYPT_CHOOSE_COMPILETIME) static void FASTCALL scrypt_ROMix_error(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r) {} - static scrypt_ROMixfn scrypt_getROMix() { return scrypt_ROMix_error; } + static scrypt_ROMixfn scrypt_getROMix(void) { return scrypt_ROMix_error; } #else static void FASTCALL scrypt_ROMix(scrypt_mix_word_t *X, scrypt_mix_word_t *Y, scrypt_mix_word_t *V, uint32_t N, uint32_t r) {} #endif - static int scrypt_test_mix() { return 0; } + static int scrypt_test_mix(void) { return 0; } #error must define a mix function! 
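
The buffers above also switch from the old MM16 attribute to the parameterized ALIGN(n), so AVX2 builds can request 32-byte alignment for the wider loads while everything else keeps 16. A standalone sketch of how that macro resolves on the two compiler families the patch targets; the macro bodies mirror scrypt-jane-portable.h, while demo_chunk and its size are illustrative, not one of the real scrypt-jane arrays:

    /* Sketch: the ALIGN(n) macro that replaces MM16. */
    #include <stdint.h>
    #include <stdio.h>

    #if defined(_MSC_VER)
    #define ALIGN(n) __declspec(align(n))
    #else
    #define ALIGN(n) __attribute__((aligned(n)))
    #endif

    int main(void) {
        /* 32-byte alignment, as the AVX2 code paths now request */
        ALIGN(32) uint64_t demo_chunk[4 * 16];
        printf("aligned to 32: %s\n",
               (((uintptr_t)demo_chunk & 31) == 0) ? "yes" : "no");
        return 0;
    }
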
#endif diff --git a/scryptjane/scrypt-jane-salsa.h b/scryptjane/scrypt-jane-salsa.h index 0c1604ba..df0a3e0c 100644 --- a/scryptjane/scrypt-jane-salsa.h +++ b/scryptjane/scrypt-jane-salsa.h @@ -11,10 +11,19 @@ typedef uint32_t scrypt_mix_word_t; /* must have these here in case block bytes is ever != 64 */ #include "scrypt-jane-romix-basic.h" +#include "scrypt-jane-mix_salsa-xop.h" #include "scrypt-jane-mix_salsa-avx.h" #include "scrypt-jane-mix_salsa-sse2.h" #include "scrypt-jane-mix_salsa.h" +#if defined(SCRYPT_SALSA_XOP) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_xop + #define SCRYPT_ROMIX_FN scrypt_ROMix_xop + #define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + #if defined(SCRYPT_SALSA_AVX) #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx #define SCRYPT_ROMIX_FN scrypt_ROMix_avx @@ -41,9 +50,15 @@ typedef uint32_t scrypt_mix_word_t; #if !defined(SCRYPT_CHOOSE_COMPILETIME) static scrypt_ROMixfn -scrypt_getROMix() { +scrypt_getROMix(void) { size_t cpuflags = detect_cpu(); +#if defined(SCRYPT_SALSA_XOP) + if (cpuflags & cpu_xop) + return scrypt_ROMix_xop; + else +#endif + #if defined(SCRYPT_SALSA_AVX) if (cpuflags & cpu_avx) return scrypt_ROMix_avx; @@ -63,14 +78,22 @@ scrypt_getROMix() { #if defined(SCRYPT_TEST_SPEED) static size_t -available_implementations() { +available_implementations(void) { + size_t cpuflags = detect_cpu(); size_t flags = 0; +#if defined(SCRYPT_SALSA_XOP) + if (cpuflags & cpu_xop) + flags |= cpu_xop; +#endif + #if defined(SCRYPT_SALSA_AVX) + if (cpuflags & cpu_avx) flags |= cpu_avx; #endif #if defined(SCRYPT_SALSA_SSE2) + if (cpuflags & cpu_sse2) flags |= cpu_sse2; #endif @@ -80,7 +103,7 @@ available_implementations() { static int -scrypt_test_mix() { +scrypt_test_mix(void) { static const uint8_t expected[16] = { 0x41,0x1f,0x2e,0xa3,0xab,0xa3,0x1a,0x34,0x87,0x1d,0x8a,0x1c,0x76,0xa0,0x27,0x66, }; @@ -88,6 +111,11 @@ scrypt_test_mix() { int ret = 1; size_t cpuflags = detect_cpu(); +#if defined(SCRYPT_SALSA_XOP) + if (cpuflags & cpu_xop) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_xop, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected); +#endif + #if defined(SCRYPT_SALSA_AVX) if (cpuflags & cpu_avx) ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected); diff --git a/scryptjane/scrypt-jane-salsa64.h b/scryptjane/scrypt-jane-salsa64.h new file mode 100644 index 00000000..96b78136 --- /dev/null +++ b/scryptjane/scrypt-jane-salsa64.h @@ -0,0 +1,183 @@ +#define SCRYPT_MIX_BASE "Salsa64/8" + +typedef uint64_t scrypt_mix_word_t; + +#define SCRYPT_WORDTO8_LE U64TO8_LE +#define SCRYPT_WORD_ENDIAN_SWAP U64_SWAP + +#define SCRYPT_BLOCK_BYTES 128 +#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) + +/* must have these here in case block bytes is ever != 64 */ +#include "scrypt-jane-romix-basic.h" + +#include "scrypt-jane-mix_salsa64-avx2.h" +#include "scrypt-jane-mix_salsa64-xop.h" +#include "scrypt-jane-mix_salsa64-avx.h" +#include "scrypt-jane-mix_salsa64-ssse3.h" +#include "scrypt-jane-mix_salsa64-sse2.h" +#include "scrypt-jane-mix_salsa64.h" + +#if defined(SCRYPT_SALSA64_AVX2) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx2 + #define SCRYPT_ROMIX_FN scrypt_ROMix_avx2 + #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + +#if defined(SCRYPT_SALSA64_XOP) + 
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_xop + #define SCRYPT_ROMIX_FN scrypt_ROMix_xop + #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + +#if defined(SCRYPT_SALSA64_AVX) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx + #define SCRYPT_ROMIX_FN scrypt_ROMix_avx + #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + +#if defined(SCRYPT_SALSA64_SSSE3) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_ssse3 + #define SCRYPT_ROMIX_FN scrypt_ROMix_ssse3 + #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + +#if defined(SCRYPT_SALSA64_SSE2) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2 + #define SCRYPT_ROMIX_FN scrypt_ROMix_sse2 + #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + +/* cpu agnostic */ +#define SCRYPT_ROMIX_FN scrypt_ROMix_basic +#define SCRYPT_MIX_FN salsa64_core_basic +#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian +#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian +#include "scrypt-jane-romix-template.h" + +#if !defined(SCRYPT_CHOOSE_COMPILETIME) +static scrypt_ROMixfn +scrypt_getROMix(void) { + size_t cpuflags = detect_cpu(); + +#if defined(SCRYPT_SALSA64_AVX2) + if (cpuflags & cpu_avx2) + return scrypt_ROMix_avx2; + else +#endif + +#if defined(SCRYPT_SALSA64_XOP) + if (cpuflags & cpu_xop) + return scrypt_ROMix_xop; + else +#endif + +#if defined(SCRYPT_SALSA64_AVX) + if (cpuflags & cpu_avx) + return scrypt_ROMix_avx; + else +#endif + +#if defined(SCRYPT_SALSA64_SSSE3) + if (cpuflags & cpu_ssse3) + return scrypt_ROMix_ssse3; + else +#endif + +#if defined(SCRYPT_SALSA64_SSE2) + if (cpuflags & cpu_sse2) + return scrypt_ROMix_sse2; + else +#endif + + return scrypt_ROMix_basic; +} +#endif + + +#if defined(SCRYPT_TEST_SPEED) +static size_t +available_implementations(void) { + size_t cpuflags = detect_cpu(); + size_t flags = 0; + +#if defined(SCRYPT_SALSA64_AVX2) + if (cpuflags & cpu_avx2) + flags |= cpu_avx2; +#endif + +#if defined(SCRYPT_SALSA64_XOP) + if (cpuflags & cpu_xop) + flags |= cpu_xop; +#endif + +#if defined(SCRYPT_SALSA64_AVX) + if (cpuflags & cpu_avx) + flags |= cpu_avx; +#endif + +#if defined(SCRYPT_SALSA64_SSSE3) + if (cpuflags & cpu_ssse3) + flags |= cpu_ssse3; +#endif + +#if defined(SCRYPT_SALSA64_SSE2) + if (cpuflags & cpu_sse2) + flags |= cpu_sse2; +#endif + + return flags; +} +#endif + +static int +scrypt_test_mix(void) { + static const uint8_t expected[16] = { + 0xf8,0x92,0x9b,0xf8,0xcc,0x1d,0xce,0x2e,0x13,0x82,0xac,0x96,0xb2,0x6c,0xee,0x2c, + }; + + int ret = 1; + size_t cpuflags = detect_cpu(); + +#if defined(SCRYPT_SALSA64_AVX2) + if (cpuflags & cpu_avx2) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx2, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); +#endif + +#if defined(SCRYPT_SALSA64_XOP) + if (cpuflags & cpu_xop) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_xop, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); +#endif + +#if defined(SCRYPT_SALSA64_AVX) + if (cpuflags & cpu_avx) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); +#endif + +#if 
defined(SCRYPT_SALSA64_SSSE3) + if (cpuflags & cpu_ssse3) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_ssse3, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); +#endif + +#if defined(SCRYPT_SALSA64_SSE2) + if (cpuflags & cpu_sse2) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); +#endif + +#if defined(SCRYPT_SALSA64_BASIC) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected); +#endif + + return ret; +} + diff --git a/scryptjane/scrypt-jane-test-vectors.h b/scryptjane/scrypt-jane-test-vectors.h index a1e4c619..72a72763 100644 --- a/scryptjane/scrypt-jane-test-vectors.h +++ b/scryptjane/scrypt-jane-test-vectors.h @@ -6,7 +6,7 @@ typedef struct scrypt_test_setting_t { static const scrypt_test_setting post_settings[] = { {"", "", 3, 0, 0}, {"password", "NaCl", 9, 3, 4}, - {0} + {0, 0, 0, 0, 0} }; #if defined(SCRYPT_SHA256)
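
The new scrypt-jane-salsa64.h above wires all of the Salsa64/8 kernels into the same dispatch scheme as the 32-bit Salsa path: scrypt_getROMix walks the detect_cpu() flags from fastest to slowest (AVX2, XOP, AVX, SSSE3, SSE2) and falls back to the portable ROMix, and scrypt_test_mix exercises every implementation that was compiled in against a single expected digest. A compressed sketch of that selection idiom; the romix_* functions are simplified stand-ins and only the compile-time-guard-plus-runtime-flag shape mirrors the patch:

    /* Sketch of the scrypt_getROMix selection cascade. */
    #include <stdio.h>

    enum { cpu_sse2 = 1 << 2, cpu_ssse3 = 1 << 4, cpu_avx = 1 << 7,
           cpu_xop = 1 << 8, cpu_avx2 = 1 << 9 };

    typedef void (*romix_fn)(void);

    static void romix_avx2(void)  { puts("Salsa64/8-AVX2"); }
    static void romix_xop(void)   { puts("Salsa64/8-XOP"); }
    static void romix_sse2(void)  { puts("Salsa64/8-SSE2"); }
    static void romix_basic(void) { puts("Salsa64/8 Ref"); }

    static romix_fn
    pick_romix(unsigned long cpuflags) {
        /* each branch sits behind #if defined(SCRYPT_SALSA64_xxx) in the real header */
        if (cpuflags & cpu_avx2) return romix_avx2;
        if (cpuflags & cpu_xop)  return romix_xop;
        if (cpuflags & cpu_sse2) return romix_sse2;
        return romix_basic;        /* cpu-agnostic fallback always present */
    }

    int main(void) {
        pick_romix(cpu_sse2 | cpu_xop)();   /* prints Salsa64/8-XOP */
        return 0;
    }
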