diff --git a/binding.gyp b/binding.gyp
index f8266880..46bb2482 100644
--- a/binding.gyp
+++ b/binding.gyp
@@ -58,6 +58,14 @@
             "cflags_cc": [
                 "-std=c++0x"
             ],
+            'conditions': [
+                ['OS == "mac"', {
+                    'xcode_settings': {
+                        'OTHER_CFLAGS': ['-no-integrated-as']
+                    }
+                  }
+                ]
+            ],
         }
     ]
 }
diff --git a/crypto/oaes_lib.c b/crypto/oaes_lib.c
index f3f2aac8..f615035f 100644
--- a/crypto/oaes_lib.c
+++ b/crypto/oaes_lib.c
@@ -34,7 +34,9 @@ static const char _NR[] = {
 #include 
 #include 
 #include 
+#ifdef __linux__
 #include 
+#endif
 #include 
 #include 
 #include 
diff --git a/package-lock.json b/package-lock.json
index e1e27baa..09dfc51f 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -2,6 +2,7 @@
   "name": "multi-hashing",
   "version": "0.0.9",
   "lockfileVersion": 1,
+  "requires": true,
   "dependencies": {
     "bindings": {
       "version": "1.3.0",
@@ -9,9 +10,9 @@
       "integrity": "sha512-DpLh5EzMR2kzvX1KIlVC0VkC3iZtHKTgdtZ0a3pglBZdaQFjt5S9g9xd1lE+YvXyfd6mtCeRnrUfOLYiTMlNSw=="
     },
     "nan": {
-      "version": "2.6.2",
-      "resolved": "https://registry.npmjs.org/nan/-/nan-2.6.2.tgz",
-      "integrity": "sha1-5P805slf37WuzAjeZZb0NgWn20U="
+      "version": "2.8.0",
+      "resolved": "https://registry.npmjs.org/nan/-/nan-2.8.0.tgz",
+      "integrity": "sha1-7XFfP+neArV6XmJS2QqWZ14fCFo="
     }
   }
 }
diff --git a/package.json b/package.json
index a375cc8b..211348aa 100644
--- a/package.json
+++ b/package.json
@@ -16,7 +16,7 @@
   },
   "dependencies": {
     "bindings": "*",
-    "nan": "^2.6.2"
+    "nan": "^2.8.0"
   },
   "keywords": [
     "scrypt",
diff --git a/scryptjane.c b/scryptjane.c
index 1cd702e2..de656e22 100644
--- a/scryptjane.c
+++ b/scryptjane.c
@@ -12,8 +12,7 @@
 #include "scryptjane/scrypt-jane-romix.h"
 #include "scryptjane/scrypt-jane-test-vectors.h"
 
-
-#define scrypt_maxN 30  /* (1 << (30 + 1)) = ~2 billion */
+#define scrypt_maxNfactor 30  /* (1 << (30 + 1)) = ~2 billion */
 #if (SCRYPT_BLOCK_BYTES == 64)
 #define scrypt_r_32kb 8 /* (1 << 8) = 256 * 2 blocks in a chunk * 64 bytes = Max of 32kb in a chunk */
 #elif (SCRYPT_BLOCK_BYTES == 128)
@@ -23,64 +22,64 @@
 #elif (SCRYPT_BLOCK_BYTES == 512)
 #define scrypt_r_32kb 5 /* (1 << 5) = 32 * 2 blocks in a chunk * 512 bytes = Max of 32kb in a chunk */
 #endif
-#define scrypt_maxr scrypt_r_32kb /* 32kb */
-#define scrypt_maxp 25 /* (1 << 25) = ~33 million */
+#define scrypt_maxrfactor scrypt_r_32kb /* 32kb */
+#define scrypt_maxpfactor 25 /* (1 << 25) = ~33 million */
 
 #include 
-#include 
+//#include 
 
-static void
+static void NORETURN
 scrypt_fatal_error_default(const char *msg) {
-	fprintf(stderr, "%s\n", msg);
-	exit(1);
+	fprintf(stderr, "%s\n", msg);
+	exit(1);
 }
 
 static scrypt_fatal_errorfn scrypt_fatal_error = scrypt_fatal_error_default;
 
 void
-scrypt_set_fatal_error_default(scrypt_fatal_errorfn fn) {
-	scrypt_fatal_error = fn;
+scrypt_set_fatal_error(scrypt_fatal_errorfn fn) {
+	scrypt_fatal_error = fn;
 }
 
 static int
-scrypt_power_on_self_test() {
-	const scrypt_test_setting *t;
-	uint8_t test_digest[64];
-	uint32_t i;
-	int res = 7, scrypt_valid;
+scrypt_power_on_self_test(void) {
+	const scrypt_test_setting *t;
+	uint8_t test_digest[64];
+	uint32_t i;
+	int res = 7, scrypt_valid;
 
-	if (!scrypt_test_mix()) {
+	if (!scrypt_test_mix()) {
 #if !defined(SCRYPT_TEST)
-		scrypt_fatal_error("scrypt: mix function power-on-self-test failed");
+		scrypt_fatal_error("scrypt: mix function power-on-self-test failed");
 #endif
-		res &= ~1;
-	}
+		res &= ~1;
+	}
 
-	if (!scrypt_test_hash()) {
+	if (!scrypt_test_hash()) {
 #if !defined(SCRYPT_TEST)
-		scrypt_fatal_error("scrypt: hash function power-on-self-test failed");
+		scrypt_fatal_error("scrypt: 
hash function power-on-self-test failed"); #endif - res &= ~2; - } - - for (i = 0, scrypt_valid = 1; post_settings[i].pw; i++) { - t = post_settings + i; - scrypt((uint8_t *)t->pw, strlen(t->pw), (uint8_t *)t->salt, strlen(t->salt), t->Nfactor, t->rfactor, t->pfactor, test_digest, sizeof(test_digest)); - scrypt_valid &= scrypt_verify(post_vectors[i], test_digest, sizeof(test_digest)); - } - - if (!scrypt_valid) { + res &= ~2; + } + + for (i = 0, scrypt_valid = 1; post_settings[i].pw; i++) { + t = post_settings + i; + scrypt((uint8_t *)t->pw, strlen(t->pw), (uint8_t *)t->salt, strlen(t->salt), t->Nfactor, t->rfactor, t->pfactor, test_digest, sizeof(test_digest)); + scrypt_valid &= scrypt_verify(post_vectors[i], test_digest, sizeof(test_digest)); + } + + if (!scrypt_valid) { #if !defined(SCRYPT_TEST) - scrypt_fatal_error("scrypt: scrypt power-on-self-test failed"); + scrypt_fatal_error("scrypt: scrypt power-on-self-test failed"); #endif - res &= ~4; - } + res &= ~4; + } - return res; + return res; } typedef struct scrypt_aligned_alloc_t { - uint8_t *mem, *ptr; + uint8_t *mem, *ptr; } scrypt_aligned_alloc; #if defined(SCRYPT_TEST_SPEED) @@ -90,95 +89,95 @@ static size_t mem_bump = 0; /* allocations are assumed to be multiples of 64 bytes and total allocations not to exceed ~1.01gb */ static scrypt_aligned_alloc scrypt_alloc(uint64_t size) { - scrypt_aligned_alloc aa; - if (!mem_base) { - mem_base = (uint8_t *)malloc((1024 * 1024 * 1024) + (1024 * 1024) + (SCRYPT_BLOCK_BYTES - 1)); - if (!mem_base) - scrypt_fatal_error("scrypt: out of memory"); - mem_base = (uint8_t *)(((size_t)mem_base + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1)); - } - aa.mem = mem_base + mem_bump; - aa.ptr = aa.mem; - mem_bump += (size_t)size; - return aa; + scrypt_aligned_alloc aa; + if (!mem_base) { + mem_base = (uint8_t *)malloc((1024 * 1024 * 1024) + (1024 * 1024) + (SCRYPT_BLOCK_BYTES - 1)); + if (!mem_base) + scrypt_fatal_error("scrypt: out of memory"); + mem_base = (uint8_t *)(((size_t)mem_base + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1)); + } + aa.mem = mem_base + mem_bump; + aa.ptr = aa.mem; + mem_bump += (size_t)size; + return aa; } static void scrypt_free(scrypt_aligned_alloc *aa) { - mem_bump = 0; + mem_bump = 0; } #else static scrypt_aligned_alloc scrypt_alloc(uint64_t size) { - static const size_t max_alloc = (size_t)-1; - scrypt_aligned_alloc aa; - size += (SCRYPT_BLOCK_BYTES - 1); - if (size > max_alloc) - scrypt_fatal_error("scrypt: not enough address space on this CPU to allocate required memory"); - aa.mem = (uint8_t *)malloc((size_t)size); - aa.ptr = (uint8_t *)(((size_t)aa.mem + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1)); - if (!aa.mem) - scrypt_fatal_error("scrypt: out of memory"); - return aa; + static const size_t max_alloc = (size_t)-1; + scrypt_aligned_alloc aa; + size += (SCRYPT_BLOCK_BYTES - 1); + if (size > max_alloc) + scrypt_fatal_error("scrypt: not enough address space on this CPU to allocate required memory"); + aa.mem = (uint8_t *)malloc((size_t)size); + aa.ptr = (uint8_t *)(((size_t)aa.mem + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1)); + if (!aa.mem) + scrypt_fatal_error("scrypt: out of memory"); + return aa; } static void scrypt_free(scrypt_aligned_alloc *aa) { - free(aa->mem); + free(aa->mem); } #endif void scrypt(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t Nfactor, uint8_t rfactor, uint8_t pfactor, uint8_t *out, size_t bytes) { - scrypt_aligned_alloc YX, V; - uint8_t *X, *Y; - uint32_t N, r, 
p, chunk_bytes, i; + scrypt_aligned_alloc YX, V; + uint8_t *X, *Y; + uint32_t N, r, p, chunk_bytes, i; #if !defined(SCRYPT_CHOOSE_COMPILETIME) - scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix(); + scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix(); #endif #if !defined(SCRYPT_TEST) - static int power_on_self_test = 0; - if (!power_on_self_test) { - power_on_self_test = 1; - if (!scrypt_power_on_self_test()) - scrypt_fatal_error("scrypt: power on self test failed"); - } + static int power_on_self_test = 0; + if (!power_on_self_test) { + power_on_self_test = 1; + if (!scrypt_power_on_self_test()) + scrypt_fatal_error("scrypt: power on self test failed"); + } #endif - if (Nfactor > scrypt_maxN) - scrypt_fatal_error("scrypt: N out of range"); - if (rfactor > scrypt_maxr) - scrypt_fatal_error("scrypt: r out of range"); - if (pfactor > scrypt_maxp) - scrypt_fatal_error("scrypt: p out of range"); + if (Nfactor > scrypt_maxNfactor) + scrypt_fatal_error("scrypt: N out of range"); + if (rfactor > scrypt_maxrfactor) + scrypt_fatal_error("scrypt: r out of range"); + if (pfactor > scrypt_maxpfactor) + scrypt_fatal_error("scrypt: p out of range"); - N = (1 << (Nfactor + 1)); - r = (1 << rfactor); - p = (1 << pfactor); + N = (1 << (Nfactor + 1)); + r = (1 << rfactor); + p = (1 << pfactor); - chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2; - V = scrypt_alloc((uint64_t)N * chunk_bytes); - YX = scrypt_alloc((p + 1) * chunk_bytes); + chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2; + V = scrypt_alloc((uint64_t)N * chunk_bytes); + YX = scrypt_alloc((p + 1) * chunk_bytes); - /* 1: X = PBKDF2(password, salt) */ - Y = YX.ptr; - X = Y + chunk_bytes; - scrypt_pbkdf2(password, password_len, salt, salt_len, 1, X, chunk_bytes * p); + /* 1: X = PBKDF2(password, salt) */ + Y = YX.ptr; + X = Y + chunk_bytes; + scrypt_pbkdf2(password, password_len, salt, salt_len, 1, X, chunk_bytes * p); - /* 2: X = ROMix(X) */ - for (i = 0; i < p; i++) - scrypt_ROMix((scrypt_mix_word_t *)(X + (chunk_bytes * i)), (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V.ptr, N, r); + /* 2: X = ROMix(X) */ + for (i = 0; i < p; i++) + scrypt_ROMix((scrypt_mix_word_t *)(X + (chunk_bytes * i)), (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V.ptr, N, r); - /* 3: Out = PBKDF2(password, X) */ - scrypt_pbkdf2(password, password_len, X, chunk_bytes * p, 1, out, bytes); + /* 3: Out = PBKDF2(password, X) */ + scrypt_pbkdf2(password, password_len, X, chunk_bytes * p, 1, out, bytes); - scrypt_ensure_zero(YX.ptr, (p + 1) * chunk_bytes); + scrypt_ensure_zero(YX.ptr, (p + 1) * chunk_bytes); - scrypt_free(&V); - scrypt_free(&YX); + scrypt_free(&V); + scrypt_free(&YX); } #define max(a,b) (((a) > (b)) ? (a) : (b)) diff --git a/scryptjane/scrypt-conf.h b/scryptjane/scrypt-conf.h new file mode 100644 index 00000000..46685a51 --- /dev/null +++ b/scryptjane/scrypt-conf.h @@ -0,0 +1,28 @@ +/* + pick the best algo at runtime or compile time? + ---------------------------------------------- + SCRYPT_CHOOSE_COMPILETIME (gcc only!) 
+ SCRYPT_CHOOSE_RUNTIME +*/ +#define SCRYPT_CHOOSE_RUNTIME + + +/* + hash function to use + ------------------------------- + SCRYPT_BLAKE256 + SCRYPT_BLAKE512 + SCRYPT_SHA256 + SCRYPT_SHA512 + SCRYPT_SKEIN512 +*/ +//#define SCRYPT_SHA256 + + +/* + block mixer to use + ----------------------------- + SCRYPT_CHACHA + SCRYPT_SALSA +*/ +//#define SCRYPT_SALSA diff --git a/scryptjane/scrypt-jane-chacha.h b/scryptjane/scrypt-jane-chacha.h index 41d96e5e..54234576 100644 --- a/scryptjane/scrypt-jane-chacha.h +++ b/scryptjane/scrypt-jane-chacha.h @@ -11,11 +11,21 @@ typedef uint32_t scrypt_mix_word_t; /* must have these here in case block bytes is ever != 64 */ #include "scrypt-jane-romix-basic.h" +#include "scrypt-jane-mix_chacha-xop.h" #include "scrypt-jane-mix_chacha-avx.h" #include "scrypt-jane-mix_chacha-ssse3.h" #include "scrypt-jane-mix_chacha-sse2.h" #include "scrypt-jane-mix_chacha.h" +#if defined(SCRYPT_CHACHA_XOP) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_xop + #define SCRYPT_ROMIX_FN scrypt_ROMix_xop + #define SCRYPT_MIX_FN chacha_core_xop + #define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop + #define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_nop + #include "scrypt-jane-romix-template.h" +#endif + #if defined(SCRYPT_CHACHA_AVX) #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx #define SCRYPT_ROMIX_FN scrypt_ROMix_avx @@ -52,9 +62,15 @@ typedef uint32_t scrypt_mix_word_t; #if !defined(SCRYPT_CHOOSE_COMPILETIME) static scrypt_ROMixfn -scrypt_getROMix() { +scrypt_getROMix(void) { size_t cpuflags = detect_cpu(); +#if defined(SCRYPT_CHACHA_XOP) + if (cpuflags & cpu_xop) + return scrypt_ROMix_xop; + else +#endif + #if defined(SCRYPT_CHACHA_AVX) if (cpuflags & cpu_avx) return scrypt_ROMix_avx; @@ -80,18 +96,27 @@ scrypt_getROMix() { #if defined(SCRYPT_TEST_SPEED) static size_t -available_implementations() { +available_implementations(void) { + size_t cpuflags = detect_cpu(); size_t flags = 0; +#if defined(SCRYPT_CHACHA_XOP) + if (cpuflags & cpu_xop) + flags |= cpu_xop; +#endif + #if defined(SCRYPT_CHACHA_AVX) - flags |= cpu_avx; + if (cpuflags & cpu_avx) + flags |= cpu_avx; #endif #if defined(SCRYPT_CHACHA_SSSE3) - flags |= cpu_ssse3; + if (cpuflags & cpu_ssse3) + flags |= cpu_ssse3; #endif #if defined(SCRYPT_CHACHA_SSE2) + if (cpuflags & cpu_sse2) flags |= cpu_sse2; #endif @@ -100,7 +125,7 @@ available_implementations() { #endif static int -scrypt_test_mix() { +scrypt_test_mix(void) { static const uint8_t expected[16] = { 0x48,0x2b,0x2d,0xb8,0xa1,0x33,0x22,0x73,0xcd,0x16,0xc4,0xb4,0xb0,0x7f,0xb1,0x8a, }; @@ -108,6 +133,11 @@ scrypt_test_mix() { int ret = 1; size_t cpuflags = detect_cpu(); +#if defined(SCRYPT_CHACHA_XOP) + if (cpuflags & cpu_xop) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_xop, scrypt_romix_nop, scrypt_romix_nop, expected); +#endif + #if defined(SCRYPT_CHACHA_AVX) if (cpuflags & cpu_avx) ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, scrypt_romix_nop, scrypt_romix_nop, expected); diff --git a/scryptjane/scrypt-jane-hash.h b/scryptjane/scrypt-jane-hash.h index db5c1db3..e7278148 100644 --- a/scryptjane/scrypt-jane-hash.h +++ b/scryptjane/scrypt-jane-hash.h @@ -28,7 +28,7 @@ #define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */ static int -scrypt_test_hash() { +scrypt_test_hash(void) { scrypt_hash_state st; scrypt_hash_digest hash, final; uint8_t msg[SCRYPT_TEST_HASH_LEN]; diff --git a/scryptjane/scrypt-jane-hash_blake256.h b/scryptjane/scrypt-jane-hash_blake256.h new file mode 100644 index 00000000..4690b114 --- /dev/null +++ 
b/scryptjane/scrypt-jane-hash_blake256.h @@ -0,0 +1,177 @@ +#define SCRYPT_HASH "BLAKE-256" +#define SCRYPT_HASH_BLOCK_SIZE 64 +#define SCRYPT_HASH_DIGEST_SIZE 32 + +typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + +const uint8_t blake256_sigma[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, + 14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3, + 11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4, + 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8, + 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13, + 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9, + 12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11, + 13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10, + 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5, + 10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13 ,0, +}; + +const uint32_t blake256_constants[16] = { + 0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344,0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89, + 0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c,0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 0xb5470917 +}; + +typedef struct scrypt_hash_state_t { + uint32_t H[8], T[2]; + uint32_t leftover; + uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; +} scrypt_hash_state; + +static void +blake256_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) { + const uint8_t *sigma, *sigma_end = blake256_sigma + (10 * 16); + uint32_t m[16], v[16], h[8], t[2]; + uint32_t i; + + for (i = 0; i < 8; i++) h[i] = S->H[i]; + for (i = 0; i < 2; i++) t[i] = S->T[i]; + + while (blocks--) { + t[0] += 512; + t[1] += (t[0] < 512) ? 1 : 0; + + for (i = 0; i < 8; i++) v[i ] = h[i]; + for (i = 0; i < 4; i++) v[i + 8] = blake256_constants[i]; + for (i = 0; i < 2; i++) v[i + 12] = blake256_constants[i+4] ^ t[0]; + for (i = 0; i < 2; i++) v[i + 14] = blake256_constants[i+6] ^ t[1]; + + for (i = 0; i < 16; i++) m[i] = U8TO32_BE(&in[i * 4]); + in += 64; + + #define G(a,b,c,d,e) \ + v[a] += (m[sigma[e+0]] ^ blake256_constants[sigma[e+1]]) + v[b]; \ + v[d] = ROTR32(v[d] ^ v[a],16); \ + v[c] += v[d]; \ + v[b] = ROTR32(v[b] ^ v[c],12); \ + v[a] += (m[sigma[e+1]] ^ blake256_constants[sigma[e+0]]) + v[b]; \ + v[d] = ROTR32(v[d] ^ v[a], 8); \ + v[c] += v[d]; \ + v[b] = ROTR32(v[b] ^ v[c], 7); + + for (i = 0, sigma = blake256_sigma; i < 14; i++) { + G(0, 4, 8,12, 0); + G(1, 5, 9,13, 2); + G(2, 6,10,14, 4); + G(3, 7,11,15, 6); + + G(0, 5,10,15, 8); + G(1, 6,11,12,10); + G(2, 7, 8,13,12); + G(3, 4, 9,14,14); + + sigma += 16; + if (sigma == sigma_end) + sigma = blake256_sigma; + } + + #undef G + + for (i = 0; i < 8; i++) h[i] ^= (v[i] ^ v[i + 8]); + } + + for (i = 0; i < 8; i++) S->H[i] = h[i]; + for (i = 0; i < 2; i++) S->T[i] = t[i]; +} + +static void +scrypt_hash_init(scrypt_hash_state *S) { + S->H[0] = 0x6a09e667ULL; + S->H[1] = 0xbb67ae85ULL; + S->H[2] = 0x3c6ef372ULL; + S->H[3] = 0xa54ff53aULL; + S->H[4] = 0x510e527fULL; + S->H[5] = 0x9b05688cULL; + S->H[6] = 0x1f83d9abULL; + S->H[7] = 0x5be0cd19ULL; + S->T[0] = 0; + S->T[1] = 0; + S->leftover = 0; +} + +static void +scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { + size_t blocks, want; + + /* handle the previous data */ + if (S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + want = (want < inlen) ? 
want : inlen; + memcpy(S->buffer + S->leftover, in, want); + S->leftover += (uint32_t)want; + if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) + return; + in += want; + inlen -= want; + blake256_blocks(S, S->buffer, 1); + } + + /* handle the current data */ + blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); + S->leftover = (uint32_t)(inlen - blocks); + if (blocks) { + blake256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); + in += blocks; + } + + /* handle leftover data */ + if (S->leftover) + memcpy(S->buffer, in, S->leftover); +} + +static void +scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { + uint32_t th, tl, bits; + + bits = (S->leftover << 3); + tl = S->T[0] + bits; + th = S->T[1]; + if (S->leftover == 0) { + S->T[0] = (uint32_t)0 - (uint32_t)512; + S->T[1] = (uint32_t)0 - (uint32_t)1; + } else if (S->T[0] == 0) { + S->T[0] = ((uint32_t)0 - (uint32_t)512) + bits; + S->T[1] = S->T[1] - 1; + } else { + S->T[0] -= (512 - bits); + } + + S->buffer[S->leftover] = 0x80; + if (S->leftover <= 55) { + memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover); + } else { + memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover); + blake256_blocks(S, S->buffer, 1); + S->T[0] = (uint32_t)0 - (uint32_t)512; + S->T[1] = (uint32_t)0 - (uint32_t)1; + memset(S->buffer, 0, 56); + } + S->buffer[55] |= 1; + U32TO8_BE(S->buffer + 56, th); + U32TO8_BE(S->buffer + 60, tl); + blake256_blocks(S, S->buffer, 1); + + U32TO8_BE(&hash[ 0], S->H[0]); + U32TO8_BE(&hash[ 4], S->H[1]); + U32TO8_BE(&hash[ 8], S->H[2]); + U32TO8_BE(&hash[12], S->H[3]); + U32TO8_BE(&hash[16], S->H[4]); + U32TO8_BE(&hash[20], S->H[5]); + U32TO8_BE(&hash[24], S->H[6]); + U32TO8_BE(&hash[28], S->H[7]); +} + +static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { + 0xcc,0xa9,0x1e,0xa9,0x20,0x97,0x37,0x40,0x17,0xc0,0xa0,0x52,0x87,0xfc,0x08,0x20, + 0x40,0xf5,0x81,0x86,0x62,0x75,0x78,0xb2,0x79,0xce,0xde,0x27,0x3c,0x7f,0x85,0xd8, +}; diff --git a/scryptjane/scrypt-jane-hash_blake512.h b/scryptjane/scrypt-jane-hash_blake512.h new file mode 100644 index 00000000..ea2a583d --- /dev/null +++ b/scryptjane/scrypt-jane-hash_blake512.h @@ -0,0 +1,181 @@ +#define SCRYPT_HASH "BLAKE-512" +#define SCRYPT_HASH_BLOCK_SIZE 128 +#define SCRYPT_HASH_DIGEST_SIZE 64 + +typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + +const uint8_t blake512_sigma[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, + 14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3, + 11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4, + 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8, + 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13, + 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9, + 12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11, + 13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10, + 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5, + 10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13 ,0, +}; + +const uint64_t blake512_constants[16] = { + 0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, 0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL, + 0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL, 0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL, + 0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL, 0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL, + 0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL, 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL +}; + +typedef struct scrypt_hash_state_t { + uint64_t H[8], T[2]; + uint32_t leftover; + uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; +} scrypt_hash_state; + +static void +blake512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) { + const uint8_t *sigma, 
*sigma_end = blake512_sigma + (10 * 16); + uint64_t m[16], v[16], h[8], t[2]; + uint32_t i; + + for (i = 0; i < 8; i++) h[i] = S->H[i]; + for (i = 0; i < 2; i++) t[i] = S->T[i]; + + while (blocks--) { + t[0] += 1024; + t[1] += (t[0] < 1024) ? 1 : 0; + + for (i = 0; i < 8; i++) v[i ] = h[i]; + for (i = 0; i < 4; i++) v[i + 8] = blake512_constants[i]; + for (i = 0; i < 2; i++) v[i + 12] = blake512_constants[i+4] ^ t[0]; + for (i = 0; i < 2; i++) v[i + 14] = blake512_constants[i+6] ^ t[1]; + + for (i = 0; i < 16; i++) m[i] = U8TO64_BE(&in[i * 8]); + in += 128; + + #define G(a,b,c,d,e) \ + v[a] += (m[sigma[e+0]] ^ blake512_constants[sigma[e+1]]) + v[b]; \ + v[d] = ROTR64(v[d] ^ v[a],32); \ + v[c] += v[d]; \ + v[b] = ROTR64(v[b] ^ v[c],25); \ + v[a] += (m[sigma[e+1]] ^ blake512_constants[sigma[e+0]]) + v[b]; \ + v[d] = ROTR64(v[d] ^ v[a],16); \ + v[c] += v[d]; \ + v[b] = ROTR64(v[b] ^ v[c],11); + + for (i = 0, sigma = blake512_sigma; i < 16; i++) { + G(0, 4, 8,12, 0); + G(1, 5, 9,13, 2); + G(2, 6,10,14, 4); + G(3, 7,11,15, 6); + G(0, 5,10,15, 8); + G(1, 6,11,12,10); + G(2, 7, 8,13,12); + G(3, 4, 9,14,14); + + sigma += 16; + if (sigma == sigma_end) + sigma = blake512_sigma; + } + + #undef G + + for (i = 0; i < 8; i++) h[i] ^= (v[i] ^ v[i + 8]); + } + + for (i = 0; i < 8; i++) S->H[i] = h[i]; + for (i = 0; i < 2; i++) S->T[i] = t[i]; +} + +static void +scrypt_hash_init(scrypt_hash_state *S) { + S->H[0] = 0x6a09e667f3bcc908ULL; + S->H[1] = 0xbb67ae8584caa73bULL; + S->H[2] = 0x3c6ef372fe94f82bULL; + S->H[3] = 0xa54ff53a5f1d36f1ULL; + S->H[4] = 0x510e527fade682d1ULL; + S->H[5] = 0x9b05688c2b3e6c1fULL; + S->H[6] = 0x1f83d9abfb41bd6bULL; + S->H[7] = 0x5be0cd19137e2179ULL; + S->T[0] = 0; + S->T[1] = 0; + S->leftover = 0; +} + +static void +scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { + size_t blocks, want; + + /* handle the previous data */ + if (S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + want = (want < inlen) ? 
want : inlen; + memcpy(S->buffer + S->leftover, in, want); + S->leftover += (uint32_t)want; + if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) + return; + in += want; + inlen -= want; + blake512_blocks(S, S->buffer, 1); + } + + /* handle the current data */ + blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); + S->leftover = (uint32_t)(inlen - blocks); + if (blocks) { + blake512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); + in += blocks; + } + + /* handle leftover data */ + if (S->leftover) + memcpy(S->buffer, in, S->leftover); +} + +static void +scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { + uint64_t th, tl; + size_t bits; + + bits = (S->leftover << 3); + tl = S->T[0] + bits; + th = S->T[1]; + if (S->leftover == 0) { + S->T[0] = (uint64_t)0 - (uint64_t)1024; + S->T[1] = (uint64_t)0 - (uint64_t)1; + } else if (S->T[0] == 0) { + S->T[0] = ((uint64_t)0 - (uint64_t)1024) + bits; + S->T[1] = S->T[1] - 1; + } else { + S->T[0] -= (1024 - bits); + } + + S->buffer[S->leftover] = 0x80; + if (S->leftover <= 111) { + memset(S->buffer + S->leftover + 1, 0, 111 - S->leftover); + } else { + memset(S->buffer + S->leftover + 1, 0, 127 - S->leftover); + blake512_blocks(S, S->buffer, 1); + S->T[0] = (uint64_t)0 - (uint64_t)1024; + S->T[1] = (uint64_t)0 - (uint64_t)1; + memset(S->buffer, 0, 112); + } + S->buffer[111] |= 1; + U64TO8_BE(S->buffer + 112, th); + U64TO8_BE(S->buffer + 120, tl); + blake512_blocks(S, S->buffer, 1); + + U64TO8_BE(&hash[ 0], S->H[0]); + U64TO8_BE(&hash[ 8], S->H[1]); + U64TO8_BE(&hash[16], S->H[2]); + U64TO8_BE(&hash[24], S->H[3]); + U64TO8_BE(&hash[32], S->H[4]); + U64TO8_BE(&hash[40], S->H[5]); + U64TO8_BE(&hash[48], S->H[6]); + U64TO8_BE(&hash[56], S->H[7]); +} + +static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { + 0x2f,0x9d,0x5b,0xbe,0x24,0x0d,0x63,0xd3,0xa0,0xac,0x4f,0xd3,0x01,0xc0,0x23,0x6f, + 0x6d,0xdf,0x6e,0xfb,0x60,0x6f,0xa0,0x74,0xdf,0x9f,0x25,0x65,0xb6,0x11,0x0a,0x83, + 0x23,0x96,0xba,0x91,0x68,0x4b,0x85,0x15,0x13,0x54,0xba,0x19,0xf3,0x2c,0x5a,0x4a, + 0x1f,0x78,0x31,0x02,0xc9,0x1e,0x56,0xc4,0x54,0xca,0xf9,0x8f,0x2c,0x7f,0x85,0xac +}; diff --git a/scryptjane/scrypt-jane-hash_sha512.h b/scryptjane/scrypt-jane-hash_sha512.h new file mode 100644 index 00000000..3e3997d0 --- /dev/null +++ b/scryptjane/scrypt-jane-hash_sha512.h @@ -0,0 +1,152 @@ +#define SCRYPT_HASH "SHA-2-512" +#define SCRYPT_HASH_BLOCK_SIZE 128 +#define SCRYPT_HASH_DIGEST_SIZE 64 + +typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + +typedef struct scrypt_hash_state_t { + uint64_t H[8]; + uint64_t T[2]; + uint32_t leftover; + uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; +} scrypt_hash_state; + +static const uint64_t sha512_constants[80] = { + 0x428a2f98d728ae22ull, 0x7137449123ef65cdull, 0xb5c0fbcfec4d3b2full, 0xe9b5dba58189dbbcull, + 0x3956c25bf348b538ull, 0x59f111f1b605d019ull, 0x923f82a4af194f9bull, 0xab1c5ed5da6d8118ull, + 0xd807aa98a3030242ull, 0x12835b0145706fbeull, 0x243185be4ee4b28cull, 0x550c7dc3d5ffb4e2ull, + 0x72be5d74f27b896full, 0x80deb1fe3b1696b1ull, 0x9bdc06a725c71235ull, 0xc19bf174cf692694ull, + 0xe49b69c19ef14ad2ull, 0xefbe4786384f25e3ull, 0x0fc19dc68b8cd5b5ull, 0x240ca1cc77ac9c65ull, + 0x2de92c6f592b0275ull, 0x4a7484aa6ea6e483ull, 0x5cb0a9dcbd41fbd4ull, 0x76f988da831153b5ull, + 0x983e5152ee66dfabull, 0xa831c66d2db43210ull, 0xb00327c898fb213full, 0xbf597fc7beef0ee4ull, + 0xc6e00bf33da88fc2ull, 0xd5a79147930aa725ull, 0x06ca6351e003826full, 0x142929670a0e6e70ull, + 0x27b70a8546d22ffcull, 0x2e1b21385c26c926ull, 0x4d2c6dfc5ac42aedull, 
0x53380d139d95b3dfull, + 0x650a73548baf63deull, 0x766a0abb3c77b2a8ull, 0x81c2c92e47edaee6ull, 0x92722c851482353bull, + 0xa2bfe8a14cf10364ull, 0xa81a664bbc423001ull, 0xc24b8b70d0f89791ull, 0xc76c51a30654be30ull, + 0xd192e819d6ef5218ull, 0xd69906245565a910ull, 0xf40e35855771202aull, 0x106aa07032bbd1b8ull, + 0x19a4c116b8d2d0c8ull, 0x1e376c085141ab53ull, 0x2748774cdf8eeb99ull, 0x34b0bcb5e19b48a8ull, + 0x391c0cb3c5c95a63ull, 0x4ed8aa4ae3418acbull, 0x5b9cca4f7763e373ull, 0x682e6ff3d6b2b8a3ull, + 0x748f82ee5defb2fcull, 0x78a5636f43172f60ull, 0x84c87814a1f0ab72ull, 0x8cc702081a6439ecull, + 0x90befffa23631e28ull, 0xa4506cebde82bde9ull, 0xbef9a3f7b2c67915ull, 0xc67178f2e372532bull, + 0xca273eceea26619cull, 0xd186b8c721c0c207ull, 0xeada7dd6cde0eb1eull, 0xf57d4f7fee6ed178ull, + 0x06f067aa72176fbaull, 0x0a637dc5a2c898a6ull, 0x113f9804bef90daeull, 0x1b710b35131c471bull, + 0x28db77f523047d84ull, 0x32caab7b40c72493ull, 0x3c9ebe0a15c9bebcull, 0x431d67c49c100d4cull, + 0x4cc5d4becb3e42b6ull, 0x597f299cfc657e2aull, 0x5fcb6fab3ad6faecull, 0x6c44198c4a475817ull +}; + +#define Ch(x,y,z) (z ^ (x & (y ^ z))) +#define Maj(x,y,z) (((x | y) & z) | (x & y)) +#define S0(x) (ROTR64(x, 28) ^ ROTR64(x, 34) ^ ROTR64(x, 39)) +#define S1(x) (ROTR64(x, 14) ^ ROTR64(x, 18) ^ ROTR64(x, 41)) +#define G0(x) (ROTR64(x, 1) ^ ROTR64(x, 8) ^ (x >> 7)) +#define G1(x) (ROTR64(x, 19) ^ ROTR64(x, 61) ^ (x >> 6)) +#define W0(in,i) (U8TO64_BE(&in[i * 8])) +#define W1(i) (G1(w[i - 2]) + w[i - 7] + G0(w[i - 15]) + w[i - 16]) +#define STEP(i) \ + t1 = S0(r[0]) + Maj(r[0], r[1], r[2]); \ + t0 = r[7] + S1(r[4]) + Ch(r[4], r[5], r[6]) + sha512_constants[i] + w[i]; \ + r[7] = r[6]; \ + r[6] = r[5]; \ + r[5] = r[4]; \ + r[4] = r[3] + t0; \ + r[3] = r[2]; \ + r[2] = r[1]; \ + r[1] = r[0]; \ + r[0] = t0 + t1; + +static void +sha512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) { + uint64_t r[8], w[80], t0, t1; + size_t i; + + for (i = 0; i < 8; i++) r[i] = S->H[i]; + + while (blocks--) { + for (i = 0; i < 16; i++) { w[i] = W0(in, i); } + for (i = 16; i < 80; i++) { w[i] = W1(i); } + for (i = 0; i < 80; i++) { STEP(i); } + for (i = 0; i < 8; i++) { r[i] += S->H[i]; S->H[i] = r[i]; } + S->T[0] += SCRYPT_HASH_BLOCK_SIZE * 8; + S->T[1] += (!S->T[0]) ? 1 : 0; + in += SCRYPT_HASH_BLOCK_SIZE; + } +} + +static void +scrypt_hash_init(scrypt_hash_state *S) { + S->H[0] = 0x6a09e667f3bcc908ull; + S->H[1] = 0xbb67ae8584caa73bull; + S->H[2] = 0x3c6ef372fe94f82bull; + S->H[3] = 0xa54ff53a5f1d36f1ull; + S->H[4] = 0x510e527fade682d1ull; + S->H[5] = 0x9b05688c2b3e6c1full; + S->H[6] = 0x1f83d9abfb41bd6bull; + S->H[7] = 0x5be0cd19137e2179ull; + S->T[0] = 0; + S->T[1] = 0; + S->leftover = 0; +} + +static void +scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { + size_t blocks, want; + + /* handle the previous data */ + if (S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + want = (want < inlen) ? 
want : inlen; + memcpy(S->buffer + S->leftover, in, want); + S->leftover += (uint32_t)want; + if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) + return; + in += want; + inlen -= want; + sha512_blocks(S, S->buffer, 1); + } + + /* handle the current data */ + blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); + S->leftover = (uint32_t)(inlen - blocks); + if (blocks) { + sha512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); + in += blocks; + } + + /* handle leftover data */ + if (S->leftover) + memcpy(S->buffer, in, S->leftover); +} + +static void +scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { + uint64_t t0 = S->T[0] + (S->leftover * 8), t1 = S->T[1]; + + S->buffer[S->leftover] = 0x80; + if (S->leftover <= 111) { + memset(S->buffer + S->leftover + 1, 0, 111 - S->leftover); + } else { + memset(S->buffer + S->leftover + 1, 0, 127 - S->leftover); + sha512_blocks(S, S->buffer, 1); + memset(S->buffer, 0, 112); + } + + U64TO8_BE(S->buffer + 112, t1); + U64TO8_BE(S->buffer + 120, t0); + sha512_blocks(S, S->buffer, 1); + + U64TO8_BE(&hash[ 0], S->H[0]); + U64TO8_BE(&hash[ 8], S->H[1]); + U64TO8_BE(&hash[16], S->H[2]); + U64TO8_BE(&hash[24], S->H[3]); + U64TO8_BE(&hash[32], S->H[4]); + U64TO8_BE(&hash[40], S->H[5]); + U64TO8_BE(&hash[48], S->H[6]); + U64TO8_BE(&hash[56], S->H[7]); +} + +static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { + 0xba,0xc3,0x80,0x2b,0x24,0x56,0x95,0x1f,0x19,0x7c,0xa2,0xd3,0x72,0x7c,0x9a,0x4d, + 0x1d,0x50,0x3a,0xa9,0x12,0x27,0xd8,0xe1,0xbe,0x76,0x53,0x87,0x5a,0x1e,0x82,0xec, + 0xc8,0xe1,0x6b,0x87,0xd0,0xb5,0x25,0x7e,0xe8,0x1e,0xd7,0x58,0xc6,0x2d,0xc2,0x9c, + 0x06,0x31,0x8f,0x5b,0x57,0x8e,0x76,0xba,0xd5,0xf6,0xec,0xfe,0x85,0x1f,0x34,0x0c, +}; diff --git a/scryptjane/scrypt-jane-hash_skein512.h b/scryptjane/scrypt-jane-hash_skein512.h new file mode 100644 index 00000000..736d893d --- /dev/null +++ b/scryptjane/scrypt-jane-hash_skein512.h @@ -0,0 +1,188 @@ +#define SCRYPT_HASH "Skein-512" +#define SCRYPT_HASH_BLOCK_SIZE 64 +#define SCRYPT_HASH_DIGEST_SIZE 64 + +typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + +typedef struct scrypt_hash_state_t { + uint64_t X[8], T[2]; + uint32_t leftover; + uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; +} scrypt_hash_state; + +#include + +static void +skein512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks, size_t add) { + uint64_t X[8], key[8], Xt[9+18], T[3+1]; + size_t r; + + while (blocks--) { + T[0] = S->T[0] + add; + T[1] = S->T[1]; + T[2] = T[0] ^ T[1]; + key[0] = U8TO64_LE(in + 0); Xt[0] = S->X[0]; X[0] = key[0] + Xt[0]; + key[1] = U8TO64_LE(in + 8); Xt[1] = S->X[1]; X[1] = key[1] + Xt[1]; + key[2] = U8TO64_LE(in + 16); Xt[2] = S->X[2]; X[2] = key[2] + Xt[2]; + key[3] = U8TO64_LE(in + 24); Xt[3] = S->X[3]; X[3] = key[3] + Xt[3]; + key[4] = U8TO64_LE(in + 32); Xt[4] = S->X[4]; X[4] = key[4] + Xt[4]; + key[5] = U8TO64_LE(in + 40); Xt[5] = S->X[5]; X[5] = key[5] + Xt[5] + T[0]; + key[6] = U8TO64_LE(in + 48); Xt[6] = S->X[6]; X[6] = key[6] + Xt[6] + T[1]; + key[7] = U8TO64_LE(in + 56); Xt[7] = S->X[7]; X[7] = key[7] + Xt[7]; + Xt[8] = 0x1BD11BDAA9FC1A22ull ^ Xt[0] ^ Xt[1] ^ Xt[2] ^ Xt[3] ^ Xt[4] ^ Xt[5] ^ Xt[6] ^ Xt[7]; + in += SCRYPT_HASH_BLOCK_SIZE; + + for (r = 0; r < 18; r++) + Xt[r + 9] = Xt[r + 0]; + + for (r = 0; r < 18; r += 2) { + X[0] += X[1]; X[1] = ROTL64(X[1], 46) ^ X[0]; + X[2] += X[3]; X[3] = ROTL64(X[3], 36) ^ X[2]; + X[4] += X[5]; X[5] = ROTL64(X[5], 19) ^ X[4]; + X[6] += X[7]; X[7] = ROTL64(X[7], 37) ^ X[6]; + X[2] += X[1]; X[1] = ROTL64(X[1], 33) ^ X[2]; + X[0] += X[3]; 
X[3] = ROTL64(X[3], 42) ^ X[0]; + X[6] += X[5]; X[5] = ROTL64(X[5], 14) ^ X[6]; + X[4] += X[7]; X[7] = ROTL64(X[7], 27) ^ X[4]; + X[4] += X[1]; X[1] = ROTL64(X[1], 17) ^ X[4]; + X[6] += X[3]; X[3] = ROTL64(X[3], 49) ^ X[6]; + X[0] += X[5]; X[5] = ROTL64(X[5], 36) ^ X[0]; + X[2] += X[7]; X[7] = ROTL64(X[7], 39) ^ X[2]; + X[6] += X[1]; X[1] = ROTL64(X[1], 44) ^ X[6]; + X[4] += X[3]; X[3] = ROTL64(X[3], 56) ^ X[4]; + X[2] += X[5]; X[5] = ROTL64(X[5], 54) ^ X[2]; + X[0] += X[7]; X[7] = ROTL64(X[7], 9) ^ X[0]; + + X[0] += Xt[r + 1]; + X[1] += Xt[r + 2]; + X[2] += Xt[r + 3]; + X[3] += Xt[r + 4]; + X[4] += Xt[r + 5]; + X[5] += Xt[r + 6] + T[1]; + X[6] += Xt[r + 7] + T[2]; + X[7] += Xt[r + 8] + r + 1; + + T[3] = T[0]; + T[0] = T[1]; + T[1] = T[2]; + T[2] = T[3]; + + X[0] += X[1]; X[1] = ROTL64(X[1], 39) ^ X[0]; + X[2] += X[3]; X[3] = ROTL64(X[3], 30) ^ X[2]; + X[4] += X[5]; X[5] = ROTL64(X[5], 34) ^ X[4]; + X[6] += X[7]; X[7] = ROTL64(X[7], 24) ^ X[6]; + X[2] += X[1]; X[1] = ROTL64(X[1], 13) ^ X[2]; + X[0] += X[3]; X[3] = ROTL64(X[3], 17) ^ X[0]; + X[6] += X[5]; X[5] = ROTL64(X[5], 10) ^ X[6]; + X[4] += X[7]; X[7] = ROTL64(X[7], 50) ^ X[4]; + X[4] += X[1]; X[1] = ROTL64(X[1], 25) ^ X[4]; + X[6] += X[3]; X[3] = ROTL64(X[3], 29) ^ X[6]; + X[0] += X[5]; X[5] = ROTL64(X[5], 39) ^ X[0]; + X[2] += X[7]; X[7] = ROTL64(X[7], 43) ^ X[2]; + X[6] += X[1]; X[1] = ROTL64(X[1], 8) ^ X[6]; + X[4] += X[3]; X[3] = ROTL64(X[3], 22) ^ X[4]; + X[2] += X[5]; X[5] = ROTL64(X[5], 56) ^ X[2]; + X[0] += X[7]; X[7] = ROTL64(X[7], 35) ^ X[0]; + + X[0] += Xt[r + 2]; + X[1] += Xt[r + 3]; + X[2] += Xt[r + 4]; + X[3] += Xt[r + 5]; + X[4] += Xt[r + 6]; + X[5] += Xt[r + 7] + T[1]; + X[6] += Xt[r + 8] + T[2]; + X[7] += Xt[r + 9] + r + 2; + + T[3] = T[0]; + T[0] = T[1]; + T[1] = T[2]; + T[2] = T[3]; + } + + S->X[0] = key[0] ^ X[0]; + S->X[1] = key[1] ^ X[1]; + S->X[2] = key[2] ^ X[2]; + S->X[3] = key[3] ^ X[3]; + S->X[4] = key[4] ^ X[4]; + S->X[5] = key[5] ^ X[5]; + S->X[6] = key[6] ^ X[6]; + S->X[7] = key[7] ^ X[7]; + + S->T[0] = T[0]; + S->T[1] = T[1] & ~0x4000000000000000ull; + } +} + +static void +scrypt_hash_init(scrypt_hash_state *S) { + S->X[0] = 0x4903ADFF749C51CEull; + S->X[1] = 0x0D95DE399746DF03ull; + S->X[2] = 0x8FD1934127C79BCEull; + S->X[3] = 0x9A255629FF352CB1ull; + S->X[4] = 0x5DB62599DF6CA7B0ull; + S->X[5] = 0xEABE394CA9D5C3F4ull; + S->X[6] = 0x991112C71A75B523ull; + S->X[7] = 0xAE18A40B660FCC33ull; + S->T[0] = 0x0000000000000000ull; + S->T[1] = 0x7000000000000000ull; + S->leftover = 0; +} + +static void +scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { + size_t blocks, want; + + /* skein processes the final <=64 bytes raw, so we can only update if there are at least 64+1 bytes available */ + if ((S->leftover + inlen) > SCRYPT_HASH_BLOCK_SIZE) { + /* handle the previous data, we know there is enough for at least one block */ + if (S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + memcpy(S->buffer + S->leftover, in, want); + in += want; + inlen -= want; + S->leftover = 0; + skein512_blocks(S, S->buffer, 1, SCRYPT_HASH_BLOCK_SIZE); + } + + /* handle the current data if there's more than one block */ + if (inlen > SCRYPT_HASH_BLOCK_SIZE) { + blocks = ((inlen - 1) & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); + skein512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE, SCRYPT_HASH_BLOCK_SIZE); + inlen -= blocks; + in += blocks; + } + } + + /* handle leftover data */ + memcpy(S->buffer + S->leftover, in, inlen); + S->leftover += inlen; +} + +static void +scrypt_hash_finish(scrypt_hash_state 
*S, uint8_t *hash) { + memset(S->buffer + S->leftover, 0, SCRYPT_HASH_BLOCK_SIZE - S->leftover); + S->T[1] |= 0x8000000000000000ull; + skein512_blocks(S, S->buffer, 1, S->leftover); + + memset(S->buffer, 0, SCRYPT_HASH_BLOCK_SIZE); + S->T[0] = 0; + S->T[1] = 0xff00000000000000ull; + skein512_blocks(S, S->buffer, 1, 8); + + U64TO8_LE(&hash[ 0], S->X[0]); + U64TO8_LE(&hash[ 8], S->X[1]); + U64TO8_LE(&hash[16], S->X[2]); + U64TO8_LE(&hash[24], S->X[3]); + U64TO8_LE(&hash[32], S->X[4]); + U64TO8_LE(&hash[40], S->X[5]); + U64TO8_LE(&hash[48], S->X[6]); + U64TO8_LE(&hash[56], S->X[7]); +} + + +static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { + 0x4d,0x52,0x29,0xff,0x10,0xbc,0xd2,0x62,0xd1,0x61,0x83,0xc8,0xe6,0xf0,0x83,0xc4, + 0x9f,0xf5,0x6a,0x42,0x75,0x2a,0x26,0x4e,0xf0,0x28,0x72,0x28,0x47,0xe8,0x23,0xdf, + 0x1e,0x64,0xf1,0x51,0x38,0x35,0x9d,0xc2,0x83,0xfc,0x35,0x4e,0xc0,0x52,0x5f,0x41, + 0x6a,0x0b,0x7d,0xf5,0xce,0x98,0xde,0x6f,0x36,0xd8,0x51,0x15,0x78,0x78,0x93,0x67, +}; diff --git a/scryptjane/scrypt-jane-mix_chacha-avx.h b/scryptjane/scrypt-jane-mix_chacha-avx.h index 50d6e2d2..ddd3ee11 100644 --- a/scryptjane/scrypt-jane-mix_chacha-avx.h +++ b/scryptjane/scrypt-jane-mix_chacha-avx.h @@ -1,5 +1,5 @@ /* x86 */ -#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) +#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_CHACHA_AVX @@ -20,13 +20,33 @@ asm_naked_fn(scrypt_ChunkMix_avx) a2(shl edx,6) a2(lea ecx,[edx-64]) a2(and eax, eax) - a2(vmovdqa xmm4,[ssse3_rotl16_32bit]) - a2(vmovdqa xmm5,[ssse3_rotl8_32bit]) + a2(mov ebx, 0x01000302) + a2(vmovd xmm4, ebx) + a2(mov ebx, 0x05040706) + a2(vmovd xmm0, ebx) + a2(mov ebx, 0x09080b0a) + a2(vmovd xmm1, ebx) + a2(mov ebx, 0x0d0c0f0e) + a2(vmovd xmm2, ebx) + a2(mov ebx, 0x02010003) + a2(vmovd xmm5, ebx) + a2(mov ebx, 0x06050407) + a2(vmovd xmm3, ebx) + a2(mov ebx, 0x0a09080b) + a2(vmovd xmm6, ebx) + a2(mov ebx, 0x0e0d0c0f) + a2(vmovd xmm7, ebx) + a3(vpunpckldq xmm4, xmm4, xmm0) + a3(vpunpckldq xmm5, xmm5, xmm3) + a3(vpunpckldq xmm1, xmm1, xmm2) + a3(vpunpckldq xmm6, xmm6, xmm7) + a3(vpunpcklqdq xmm4, xmm4, xmm1) + a3(vpunpcklqdq xmm5, xmm5, xmm6) a2(vmovdqa xmm0,[ecx+esi+0]) a2(vmovdqa xmm1,[ecx+esi+16]) a2(vmovdqa xmm2,[ecx+esi+32]) a2(vmovdqa xmm3,[ecx+esi+48]) - a1(jz scrypt_ChunkMix_avx_no_xor1) + aj(jz scrypt_ChunkMix_avx_no_xor1) a3(vpxor xmm0,xmm0,[ecx+eax+0]) a3(vpxor xmm1,xmm1,[ecx+eax+16]) a3(vpxor xmm2,xmm2,[ecx+eax+32]) @@ -40,7 +60,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpxor xmm1,xmm1,[esi+ecx+16]) a3(vpxor xmm2,xmm2,[esi+ecx+32]) a3(vpxor xmm3,xmm3,[esi+ecx+48]) - a1(jz scrypt_ChunkMix_avx_no_xor2) + aj(jz scrypt_ChunkMix_avx_no_xor2) a3(vpxor xmm0,xmm0,[eax+ecx+0]) a3(vpxor xmm1,xmm1,[eax+ecx+16]) a3(vpxor xmm2,xmm2,[eax+ecx+32]) @@ -71,7 +91,6 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpsrld xmm6,xmm1,25) a3(vpslld xmm1,xmm1,7) a3(vpxor xmm1,xmm1,xmm6) - a2(sub eax,2) a3(vpaddd xmm0,xmm0,xmm1) a3(vpxor xmm3,xmm3,xmm0) a3(vpshufb xmm3,xmm3,xmm4) @@ -85,13 +104,14 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpshufb xmm3,xmm3,xmm5) a3(vpshufd xmm0,xmm0,0x39) a3(vpaddd xmm2,xmm2,xmm3) - a3(pshufd xmm3,xmm3,0x4e) + a3(vpshufd xmm3,xmm3,0x4e) a3(vpxor xmm1,xmm1,xmm2) - a3(pshufd xmm2,xmm2,0x93) + a3(vpshufd xmm2,xmm2,0x93) a3(vpsrld xmm6,xmm1,25) a3(vpslld xmm1,xmm1,7) a3(vpxor xmm1,xmm1,xmm6) - a1(ja scrypt_chacha_avx_loop) + a2(sub eax,2) + aj(ja scrypt_chacha_avx_loop) 
a3(vpaddd xmm0,xmm0,[esp+0]) a3(vpaddd xmm1,xmm1,[esp+16]) a3(vpaddd xmm2,xmm2,[esp+32]) @@ -108,13 +128,13 @@ asm_naked_fn(scrypt_ChunkMix_avx) a2(vmovdqa [eax+32],xmm2) a2(vmovdqa [eax+48],xmm3) a2(mov eax,[ebp+28]) - a1(jne scrypt_ChunkMix_avx_loop) + aj(jne scrypt_ChunkMix_avx_loop) a2(mov esp,ebp) a1(pop ebp) a1(pop esi) a1(pop edi) a1(pop ebx) - a1(ret 16) + aret(16) asm_naked_fn_end(scrypt_ChunkMix_avx) #endif @@ -122,25 +142,33 @@ asm_naked_fn_end(scrypt_ChunkMix_avx) /* x64 */ -#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) +#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_CHACHA_AVX asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) asm_naked_fn(scrypt_ChunkMix_avx) - a2(lea rcx,[rcx*2]) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ a2(shl rcx,6) a2(lea r9,[rcx-64]) a2(lea rax,[rsi+r9]) a2(lea r9,[rdx+r9]) a2(and rdx, rdx) - a2(vmovdqa xmm4,[ssse3_rotl16_32bit]) - a2(vmovdqa xmm5,[ssse3_rotl8_32bit]) a2(vmovdqa xmm0,[rax+0]) a2(vmovdqa xmm1,[rax+16]) a2(vmovdqa xmm2,[rax+32]) a2(vmovdqa xmm3,[rax+48]) - a1(jz scrypt_ChunkMix_avx_no_xor1) + a2(mov r8, 0x0504070601000302) + a2(mov rax, 0x0d0c0f0e09080b0a) + a2(movd xmm4, r8) + a2(movd xmm6, rax) + a2(mov r8, 0x0605040702010003) + a2(mov rax, 0x0e0d0c0f0a09080b) + a2(movd xmm5, r8) + a2(movd xmm7, rax) + a3(vpunpcklqdq xmm4, xmm4, xmm6) + a3(vpunpcklqdq xmm5, xmm5, xmm7) + aj(jz scrypt_ChunkMix_avx_no_xor1) a3(vpxor xmm0,xmm0,[r9+0]) a3(vpxor xmm1,xmm1,[r9+16]) a3(vpxor xmm2,xmm2,[r9+32]) @@ -154,7 +182,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpxor xmm1,xmm1,[rsi+r9+16]) a3(vpxor xmm2,xmm2,[rsi+r9+32]) a3(vpxor xmm3,xmm3,[rsi+r9+48]) - a1(jz scrypt_ChunkMix_avx_no_xor2) + aj(jz scrypt_ChunkMix_avx_no_xor2) a3(vpxor xmm0,xmm0,[rdx+r9+0]) a3(vpxor xmm1,xmm1,[rdx+r9+16]) a3(vpxor xmm2,xmm2,[rdx+r9+32]) @@ -185,7 +213,6 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpsrld xmm12,xmm1,25) a3(vpslld xmm1,xmm1,7) a3(vpxor xmm1,xmm1,xmm12) - a2(sub rax,2) a3(vpaddd xmm0,xmm0,xmm1) a3(vpxor xmm3,xmm3,xmm0) a3(vpshufb xmm3,xmm3,xmm4) @@ -199,13 +226,14 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpshufb xmm3,xmm3,xmm5) a3(vpshufd xmm0,xmm0,0x39) a3(vpaddd xmm2,xmm2,xmm3) - a3(pshufd xmm3,xmm3,0x4e) + a3(vpshufd xmm3,xmm3,0x4e) a3(vpxor xmm1,xmm1,xmm2) - a3(pshufd xmm2,xmm2,0x93) + a3(vpshufd xmm2,xmm2,0x93) a3(vpsrld xmm12,xmm1,25) a3(vpslld xmm1,xmm1,7) a3(vpxor xmm1,xmm1,xmm12) - a1(ja scrypt_chacha_avx_loop) + a2(sub rax,2) + aj(ja scrypt_chacha_avx_loop) a3(vpaddd xmm0,xmm0,xmm8) a3(vpaddd xmm1,xmm1,xmm9) a3(vpaddd xmm2,xmm2,xmm10) @@ -221,7 +249,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a2(vmovdqa [rax+16],xmm1) a2(vmovdqa [rax+32],xmm2) a2(vmovdqa [rax+48],xmm3) - a1(jne scrypt_ChunkMix_avx_loop) + aj(jne scrypt_ChunkMix_avx_loop) a1(ret) asm_naked_fn_end(scrypt_ChunkMix_avx) @@ -233,7 +261,7 @@ asm_naked_fn_end(scrypt_ChunkMix_avx) #define SCRYPT_CHACHA_AVX -static void NOINLINE +static void asm_calling_convention NOINLINE scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { uint32_t i, blocksPerChunk = r * 2, half = 0; xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; diff --git a/scryptjane/scrypt-jane-mix_chacha-sse2.h b/scryptjane/scrypt-jane-mix_chacha-sse2.h 
index d2192c8f..a8c2197a 100644 --- a/scryptjane/scrypt-jane-mix_chacha-sse2.h +++ b/scryptjane/scrypt-jane-mix_chacha-sse2.h @@ -1,5 +1,5 @@ /* x86 */ -#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) +#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_CHACHA_SSE2 @@ -24,7 +24,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(movdqa xmm1,[ecx+esi+16]) a2(movdqa xmm2,[ecx+esi+32]) a2(movdqa xmm3,[ecx+esi+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor1) + aj(jz scrypt_ChunkMix_sse2_no_xor1) a2(pxor xmm0,[ecx+eax+0]) a2(pxor xmm1,[ecx+eax+16]) a2(pxor xmm2,[ecx+eax+32]) @@ -38,7 +38,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(pxor xmm1,[esi+ecx+16]) a2(pxor xmm2,[esi+ecx+32]) a2(pxor xmm3,[esi+ecx+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor2) + aj(jz scrypt_ChunkMix_sse2_no_xor2) a2(pxor xmm0,[eax+ecx+0]) a2(pxor xmm1,[eax+ecx+16]) a2(pxor xmm2,[eax+ecx+32]) @@ -52,10 +52,8 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a1(scrypt_chacha_sse2_loop: ) a2(paddd xmm0,xmm1) a2(pxor xmm3,xmm0) - a2(movdqa xmm6,xmm3) - a2(pslld xmm3,16) - a2(psrld xmm6,16) - a2(pxor xmm3,xmm6) + a3(pshuflw xmm3,xmm3,0xb1) + a3(pshufhw xmm3,xmm3,0xb1) a2(paddd xmm2,xmm3) a2(pxor xmm1,xmm2) a2(movdqa xmm6,xmm1) @@ -80,10 +78,8 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(sub eax,2) a2(paddd xmm0,xmm1) a2(pxor xmm3,xmm0) - a2(movdqa xmm6,xmm3) - a2(pslld xmm3,16) - a2(psrld xmm6,16) - a2(pxor xmm3,xmm6) + a3(pshuflw xmm3,xmm3,0xb1) + a3(pshufhw xmm3,xmm3,0xb1) a2(paddd xmm2,xmm3) a2(pxor xmm1,xmm2) a2(movdqa xmm6,xmm1) @@ -105,7 +101,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(pslld xmm1,7) a2(psrld xmm6,25) a2(pxor xmm1,xmm6) - a1(ja scrypt_chacha_sse2_loop) + aj(ja scrypt_chacha_sse2_loop) a2(paddd xmm0,[esp+0]) a2(paddd xmm1,xmm4) a2(paddd xmm2,xmm5) @@ -122,13 +118,13 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(movdqa [eax+32],xmm2) a2(movdqa [eax+48],xmm3) a2(mov eax,[ebp+28]) - a1(jne scrypt_ChunkMix_sse2_loop) + aj(jne scrypt_ChunkMix_sse2_loop) a2(mov esp,ebp) a1(pop ebp) a1(pop esi) a1(pop edi) a1(pop ebx) - a1(ret 16) + aret(16) asm_naked_fn_end(scrypt_ChunkMix_sse2) #endif @@ -136,13 +132,13 @@ asm_naked_fn_end(scrypt_ChunkMix_sse2) /* x64 */ -#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) +#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_CHACHA_SSE2 asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) asm_naked_fn(scrypt_ChunkMix_sse2) - a2(lea rcx,[rcx*2]) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ a2(shl rcx,6) a2(lea r9,[rcx-64]) a2(lea rax,[rsi+r9]) @@ -152,7 +148,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(movdqa xmm1,[rax+16]) a2(movdqa xmm2,[rax+32]) a2(movdqa xmm3,[rax+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor1) + aj(jz scrypt_ChunkMix_sse2_no_xor1) a2(pxor xmm0,[r9+0]) a2(pxor xmm1,[r9+16]) a2(pxor xmm2,[r9+32]) @@ -166,7 +162,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(pxor xmm1,[rsi+r9+16]) a2(pxor xmm2,[rsi+r9+32]) a2(pxor xmm3,[rsi+r9+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor2) + aj(jz scrypt_ChunkMix_sse2_no_xor2) a2(pxor xmm0,[rdx+r9+0]) a2(pxor xmm1,[rdx+r9+16]) a2(pxor xmm2,[rdx+r9+32]) @@ -180,10 +176,8 @@ asm_naked_fn(scrypt_ChunkMix_sse2) 
a1(scrypt_chacha_sse2_loop: ) a2(paddd xmm0,xmm1) a2(pxor xmm3,xmm0) - a2(movdqa xmm6,xmm3) - a2(pslld xmm3,16) - a2(psrld xmm6,16) - a2(pxor xmm3,xmm6) + a3(pshuflw xmm3,xmm3,0xb1) + a3(pshufhw xmm3,xmm3,0xb1) a2(paddd xmm2,xmm3) a2(pxor xmm1,xmm2) a2(movdqa xmm6,xmm1) @@ -208,10 +202,8 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(sub rax,2) a2(paddd xmm0,xmm1) a2(pxor xmm3,xmm0) - a2(movdqa xmm6,xmm3) - a2(pslld xmm3,16) - a2(psrld xmm6,16) - a2(pxor xmm3,xmm6) + a3(pshuflw xmm3,xmm3,0xb1) + a3(pshufhw xmm3,xmm3,0xb1) a2(paddd xmm2,xmm3) a2(pxor xmm1,xmm2) a2(movdqa xmm6,xmm1) @@ -233,7 +225,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(pslld xmm1,7) a2(psrld xmm6,25) a2(pxor xmm1,xmm6) - a1(ja scrypt_chacha_sse2_loop) + aj(ja scrypt_chacha_sse2_loop) a2(paddd xmm0,xmm8) a2(paddd xmm1,xmm9) a2(paddd xmm2,xmm10) @@ -249,7 +241,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(movdqa [rax+16],xmm1) a2(movdqa [rax+32],xmm2) a2(movdqa [rax+48],xmm3) - a1(jne scrypt_ChunkMix_sse2_loop) + aj(jne scrypt_ChunkMix_sse2_loop) a1(ret) asm_naked_fn_end(scrypt_ChunkMix_sse2) @@ -261,7 +253,7 @@ asm_naked_fn_end(scrypt_ChunkMix_sse2) #define SCRYPT_CHACHA_SSE2 -static void NOINLINE +static void NOINLINE asm_calling_convention scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { uint32_t i, blocksPerChunk = r * 2, half = 0; xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3; @@ -308,7 +300,7 @@ scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes] x0 = _mm_add_epi32(x0, x1); x3 = _mm_xor_si128(x3, x0); x4 = x3; - x3 = _mm_or_si128(_mm_slli_epi32(x3, 16), _mm_srli_epi32(x4, 16)); + x3 = _mm_shufflehi_epi16(_mm_shufflelo_epi16(x3, 0xb1), 0xb1); x2 = _mm_add_epi32(x2, x3); x1 = _mm_xor_si128(x1, x2); x4 = x1; @@ -327,7 +319,7 @@ scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes] x0 = _mm_add_epi32(x0, x1); x3 = _mm_xor_si128(x3, x0); x4 = x3; - x3 = _mm_or_si128(_mm_slli_epi32(x3, 16), _mm_srli_epi32(x4, 16)); + x3 = _mm_shufflehi_epi16(_mm_shufflelo_epi16(x3, 0xb1), 0xb1); x2 = _mm_add_epi32(x2, x3); x1 = _mm_xor_si128(x1, x2); x4 = x1; diff --git a/scryptjane/scrypt-jane-mix_chacha-ssse3.h b/scryptjane/scrypt-jane-mix_chacha-ssse3.h index b25e3567..894312e6 100644 --- a/scryptjane/scrypt-jane-mix_chacha-ssse3.h +++ b/scryptjane/scrypt-jane-mix_chacha-ssse3.h @@ -1,5 +1,5 @@ /* x86 */ -#if defined(X86ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) +#if defined(X86ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_CHACHA_SSSE3 @@ -20,13 +20,33 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a2(shl edx,6) a2(lea ecx,[edx-64]) a2(and eax, eax) - a2(movdqa xmm4,[ssse3_rotl16_32bit]) - a2(movdqa xmm5,[ssse3_rotl8_32bit]) + a2(mov ebx, 0x01000302) + a2(movd xmm4, ebx) + a2(mov ebx, 0x05040706) + a2(movd xmm0, ebx) + a2(mov ebx, 0x09080b0a) + a2(movd xmm1, ebx) + a2(mov ebx, 0x0d0c0f0e) + a2(movd xmm2, ebx) + a2(mov ebx, 0x02010003) + a2(movd xmm5, ebx) + a2(mov ebx, 0x06050407) + a2(movd xmm3, ebx) + a2(mov ebx, 0x0a09080b) + a2(movd xmm6, ebx) + a2(mov ebx, 0x0e0d0c0f) + a2(movd xmm7, ebx) + a2(punpckldq xmm4, xmm0) + a2(punpckldq xmm5, xmm3) + a2(punpckldq xmm1, xmm2) + a2(punpckldq xmm6, xmm7) + a2(punpcklqdq xmm4, xmm1) + a2(punpcklqdq xmm5, xmm6) a2(movdqa xmm0,[ecx+esi+0]) a2(movdqa xmm1,[ecx+esi+16]) a2(movdqa xmm2,[ecx+esi+32]) a2(movdqa xmm3,[ecx+esi+48]) - a1(jz 
scrypt_ChunkMix_ssse3_no_xor1) + aj(jz scrypt_ChunkMix_ssse3_no_xor1) a2(pxor xmm0,[ecx+eax+0]) a2(pxor xmm1,[ecx+eax+16]) a2(pxor xmm2,[ecx+eax+32]) @@ -40,7 +60,7 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a2(pxor xmm1,[esi+ecx+16]) a2(pxor xmm2,[esi+ecx+32]) a2(pxor xmm3,[esi+ecx+48]) - a1(jz scrypt_ChunkMix_ssse3_no_xor2) + aj(jz scrypt_ChunkMix_ssse3_no_xor2) a2(pxor xmm0,[eax+ecx+0]) a2(pxor xmm1,[eax+ecx+16]) a2(pxor xmm2,[eax+ecx+32]) @@ -95,7 +115,7 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a2(pslld xmm1,7) a2(psrld xmm6,25) a2(pxor xmm1,xmm6) - a1(ja scrypt_chacha_ssse3_loop) + aj(ja scrypt_chacha_ssse3_loop) a2(paddd xmm0,[esp+0]) a2(paddd xmm1,[esp+16]) a2(paddd xmm2,[esp+32]) @@ -112,13 +132,13 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a2(movdqa [eax+32],xmm2) a2(movdqa [eax+48],xmm3) a2(mov eax,[ebp+28]) - a1(jne scrypt_ChunkMix_ssse3_loop) + aj(jne scrypt_ChunkMix_ssse3_loop) a2(mov esp,ebp) a1(pop ebp) a1(pop esi) a1(pop edi) a1(pop ebx) - a1(ret 16) + aret(16) asm_naked_fn_end(scrypt_ChunkMix_ssse3) #endif @@ -126,25 +146,33 @@ asm_naked_fn_end(scrypt_ChunkMix_ssse3) /* x64 */ -#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) +#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_CHACHA_SSSE3 asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) asm_naked_fn(scrypt_ChunkMix_ssse3) - a2(lea rcx,[rcx*2]) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ a2(shl rcx,6) a2(lea r9,[rcx-64]) a2(lea rax,[rsi+r9]) a2(lea r9,[rdx+r9]) a2(and rdx, rdx) - a2(movdqa xmm4,[ssse3_rotl16_32bit]) - a2(movdqa xmm5,[ssse3_rotl8_32bit]) a2(movdqa xmm0,[rax+0]) a2(movdqa xmm1,[rax+16]) a2(movdqa xmm2,[rax+32]) a2(movdqa xmm3,[rax+48]) - a1(jz scrypt_ChunkMix_ssse3_no_xor1) + a2(mov r8, 0x0504070601000302) + a2(mov rax, 0x0d0c0f0e09080b0a) + a2(movd xmm4, r8) + a2(movd xmm6, rax) + a2(mov r8, 0x0605040702010003) + a2(mov rax, 0x0e0d0c0f0a09080b) + a2(movd xmm5, r8) + a2(movd xmm7, rax) + a2(punpcklqdq xmm4, xmm6) + a2(punpcklqdq xmm5, xmm7) + aj(jz scrypt_ChunkMix_ssse3_no_xor1) a2(pxor xmm0,[r9+0]) a2(pxor xmm1,[r9+16]) a2(pxor xmm2,[r9+32]) @@ -158,7 +186,7 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a2(pxor xmm1,[rsi+r9+16]) a2(pxor xmm2,[rsi+r9+32]) a2(pxor xmm3,[rsi+r9+48]) - a1(jz scrypt_ChunkMix_ssse3_no_xor2) + aj(jz scrypt_ChunkMix_ssse3_no_xor2) a2(pxor xmm0,[rdx+r9+0]) a2(pxor xmm1,[rdx+r9+16]) a2(pxor xmm2,[rdx+r9+32]) @@ -213,7 +241,7 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a2(pslld xmm1,7) a2(psrld xmm12,25) a2(pxor xmm1,xmm12) - a1(ja scrypt_chacha_ssse3_loop) + aj(ja scrypt_chacha_ssse3_loop) a2(paddd xmm0,xmm8) a2(paddd xmm1,xmm9) a2(paddd xmm2,xmm10) @@ -229,7 +257,7 @@ asm_naked_fn(scrypt_ChunkMix_ssse3) a2(movdqa [rax+16],xmm1) a2(movdqa [rax+32],xmm2) a2(movdqa [rax+48],xmm3) - a1(jne scrypt_ChunkMix_ssse3_loop) + aj(jne scrypt_ChunkMix_ssse3_loop) a1(ret) asm_naked_fn_end(scrypt_ChunkMix_ssse3) @@ -241,7 +269,7 @@ asm_naked_fn_end(scrypt_ChunkMix_ssse3) #define SCRYPT_CHACHA_SSSE3 -static void NOINLINE +static void NOINLINE asm_calling_convention scrypt_ChunkMix_ssse3(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { uint32_t i, blocksPerChunk = r * 2, half = 0; xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; diff --git 
a/scryptjane/scrypt-jane-mix_chacha-xop.h b/scryptjane/scrypt-jane-mix_chacha-xop.h new file mode 100644 index 00000000..4c25d887 --- /dev/null +++ b/scryptjane/scrypt-jane-mix_chacha-xop.h @@ -0,0 +1,315 @@ +/* x86 */ +#if defined(X86ASM_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_CHACHA_XOP + +asm_naked_fn_proto(void, scrypt_ChunkMix_xop)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_xop) + a1(push ebx) + a1(push edi) + a1(push esi) + a1(push ebp) + a2(mov ebp,esp) + a2(mov edi,[ebp+20]) + a2(mov esi,[ebp+24]) + a2(mov eax,[ebp+28]) + a2(mov ebx,[ebp+32]) + a2(sub esp,64) + a2(and esp,~63) + a2(lea edx,[ebx*2]) + a2(shl edx,6) + a2(lea ecx,[edx-64]) + a2(and eax, eax) + a2(vmovdqa xmm0,[ecx+esi+0]) + a2(vmovdqa xmm1,[ecx+esi+16]) + a2(vmovdqa xmm2,[ecx+esi+32]) + a2(vmovdqa xmm3,[ecx+esi+48]) + aj(jz scrypt_ChunkMix_xop_no_xor1) + a3(vpxor xmm0,xmm0,[ecx+eax+0]) + a3(vpxor xmm1,xmm1,[ecx+eax+16]) + a3(vpxor xmm2,xmm2,[ecx+eax+32]) + a3(vpxor xmm3,xmm3,[ecx+eax+48]) + a1(scrypt_ChunkMix_xop_no_xor1:) + a2(xor ecx,ecx) + a2(xor ebx,ebx) + a1(scrypt_ChunkMix_xop_loop:) + a2(and eax, eax) + a3(vpxor xmm0,xmm0,[esi+ecx+0]) + a3(vpxor xmm1,xmm1,[esi+ecx+16]) + a3(vpxor xmm2,xmm2,[esi+ecx+32]) + a3(vpxor xmm3,xmm3,[esi+ecx+48]) + aj(jz scrypt_ChunkMix_xop_no_xor2) + a3(vpxor xmm0,xmm0,[eax+ecx+0]) + a3(vpxor xmm1,xmm1,[eax+ecx+16]) + a3(vpxor xmm2,xmm2,[eax+ecx+32]) + a3(vpxor xmm3,xmm3,[eax+ecx+48]) + a1(scrypt_ChunkMix_xop_no_xor2:) + a2(vmovdqa xmm4,xmm0) + a2(vmovdqa xmm5,xmm1) + a2(vmovdqa xmm6,xmm2) + a2(vmovdqa xmm7,xmm3) + a2(mov eax,8) + a1(scrypt_chacha_xop_loop: ) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vprotd xmm3,xmm3,16) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpxor xmm1,xmm1,xmm2) + a3(vprotd xmm1,xmm1,12) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vprotd xmm3,xmm3,8) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpshufd xmm0,xmm0,0x93) + a3(vpxor xmm1,xmm1,xmm2) + a3(vprotd xmm1,xmm1,7) + a3(vpshufd xmm3,xmm3,0x4e) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpshufd xmm2,xmm2,0x39) + a3(vpxor xmm3,xmm3,xmm0) + a3(vprotd xmm3,xmm3,16) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpxor xmm1,xmm1,xmm2) + a3(vprotd xmm1,xmm1,12) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vprotd xmm3,xmm3,8) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpxor xmm1,xmm1,xmm2) + a3(vpshufd xmm0,xmm0,0x39) + a3(vprotd xmm1,xmm1,7) + a3(pshufd xmm3,xmm3,0x4e) + a3(pshufd xmm2,xmm2,0x93) + a2(sub eax,2) + aj(ja scrypt_chacha_xop_loop) + a3(vpaddd xmm0,xmm0,xmm4) + a3(vpaddd xmm1,xmm1,xmm5) + a3(vpaddd xmm2,xmm2,xmm6) + a3(vpaddd xmm3,xmm3,xmm7) + a2(lea eax,[ebx+ecx]) + a2(xor ebx,edx) + a2(and eax,~0x7f) + a2(add ecx,64) + a2(shr eax,1) + a2(add eax, edi) + a2(cmp ecx,edx) + a2(vmovdqa [eax+0],xmm0) + a2(vmovdqa [eax+16],xmm1) + a2(vmovdqa [eax+32],xmm2) + a2(vmovdqa [eax+48],xmm3) + a2(mov eax,[ebp+28]) + aj(jne scrypt_ChunkMix_xop_loop) + a2(mov esp,ebp) + a1(pop ebp) + a1(pop esi) + a1(pop edi) + a1(pop ebx) + aret(16) +asm_naked_fn_end(scrypt_ChunkMix_xop) + +#endif + + + +/* x64 */ +#if defined(X86_64ASM_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_CHACHA_XOP + +asm_naked_fn_proto(void, scrypt_ChunkMix_xop)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) 
+asm_naked_fn(scrypt_ChunkMix_xop) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ + a2(shl rcx,6) + a2(lea r9,[rcx-64]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(vmovdqa xmm0,[rax+0]) + a2(vmovdqa xmm1,[rax+16]) + a2(vmovdqa xmm2,[rax+32]) + a2(vmovdqa xmm3,[rax+48]) + aj(jz scrypt_ChunkMix_xop_no_xor1) + a3(vpxor xmm0,xmm0,[r9+0]) + a3(vpxor xmm1,xmm1,[r9+16]) + a3(vpxor xmm2,xmm2,[r9+32]) + a3(vpxor xmm3,xmm3,[r9+48]) + a1(scrypt_ChunkMix_xop_no_xor1:) + a2(xor r8,r8) + a2(xor r9,r9) + a1(scrypt_ChunkMix_xop_loop:) + a2(and rdx, rdx) + a3(vpxor xmm0,xmm0,[rsi+r9+0]) + a3(vpxor xmm1,xmm1,[rsi+r9+16]) + a3(vpxor xmm2,xmm2,[rsi+r9+32]) + a3(vpxor xmm3,xmm3,[rsi+r9+48]) + aj(jz scrypt_ChunkMix_xop_no_xor2) + a3(vpxor xmm0,xmm0,[rdx+r9+0]) + a3(vpxor xmm1,xmm1,[rdx+r9+16]) + a3(vpxor xmm2,xmm2,[rdx+r9+32]) + a3(vpxor xmm3,xmm3,[rdx+r9+48]) + a1(scrypt_ChunkMix_xop_no_xor2:) + a2(vmovdqa xmm4,xmm0) + a2(vmovdqa xmm5,xmm1) + a2(vmovdqa xmm6,xmm2) + a2(vmovdqa xmm7,xmm3) + a2(mov rax,8) + a1(scrypt_chacha_xop_loop: ) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vprotd xmm3,xmm3,16) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpxor xmm1,xmm1,xmm2) + a3(vprotd xmm1,xmm1,12) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vprotd xmm3,xmm3,8) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpshufd xmm0,xmm0,0x93) + a3(vpxor xmm1,xmm1,xmm2) + a3(vprotd xmm1,xmm1,7) + a3(vpshufd xmm3,xmm3,0x4e) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpshufd xmm2,xmm2,0x39) + a3(vpxor xmm3,xmm3,xmm0) + a3(vprotd xmm3,xmm3,16) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpxor xmm1,xmm1,xmm2) + a3(vprotd xmm1,xmm1,12) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vprotd xmm3,xmm3,8) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpxor xmm1,xmm1,xmm2) + a3(vpshufd xmm0,xmm0,0x39) + a3(vprotd xmm1,xmm1,7) + a3(pshufd xmm3,xmm3,0x4e) + a3(pshufd xmm2,xmm2,0x93) + a2(sub rax,2) + aj(ja scrypt_chacha_xop_loop) + a3(vpaddd xmm0,xmm0,xmm4) + a3(vpaddd xmm1,xmm1,xmm5) + a3(vpaddd xmm2,xmm2,xmm6) + a3(vpaddd xmm3,xmm3,xmm7) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0x7f) + a2(add r9,64) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(vmovdqa [rax+0],xmm0) + a2(vmovdqa [rax+16],xmm1) + a2(vmovdqa [rax+32],xmm2) + a2(vmovdqa [rax+48],xmm3) + aj(jne scrypt_ChunkMix_xop_loop) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_xop) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_XOP + +static void asm_calling_convention NOINLINE +scrypt_ChunkMix_xop(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + if (Bxor) { + xmmp = (xmmi 
*)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + + for (rounds = 8; rounds; rounds -= 2) { + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_roti_epi32(x3, 16); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x1 = _mm_roti_epi32(x1, 12); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_roti_epi32(x3, 8); + x2 = _mm_add_epi32(x2, x3); + x0 = _mm_shuffle_epi32(x0, 0x93); + x1 = _mm_xor_si128(x1, x2); + x1 = _mm_roti_epi32(x1, 7); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x0 = _mm_add_epi32(x0, x1); + x2 = _mm_shuffle_epi32(x2, 0x39); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_roti_epi32(x3, 16); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x1 = _mm_roti_epi32(x1, 12); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_roti_epi32(x3, 8); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x0 = _mm_shuffle_epi32(x0, 0x39); + x1 = _mm_roti_epi32(x1, 7); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x2 = _mm_shuffle_epi32(x2, 0x93); + } + + x0 = _mm_add_epi32(x0, t0); + x1 = _mm_add_epi32(x1, t1); + x2 = _mm_add_epi32(x2, t2); + x3 = _mm_add_epi32(x3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + } +} + +#endif + +#if defined(SCRYPT_CHACHA_XOP) + #undef SCRYPT_MIX + #define SCRYPT_MIX "ChaCha/8-XOP" + #undef SCRYPT_CHACHA_INCLUDED + #define SCRYPT_CHACHA_INCLUDED +#endif diff --git a/scryptjane/scrypt-jane-mix_salsa-avx.h b/scryptjane/scrypt-jane-mix_salsa-avx.h index 15fb48e3..259fae46 100644 --- a/scryptjane/scrypt-jane-mix_salsa-avx.h +++ b/scryptjane/scrypt-jane-mix_salsa-avx.h @@ -1,5 +1,5 @@ /* x86 */ -#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) +#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_SALSA_AVX @@ -24,7 +24,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a2(movdqa xmm1,[ecx+esi+16]) a2(movdqa xmm2,[ecx+esi+32]) a2(movdqa xmm3,[ecx+esi+48]) - a1(jz scrypt_ChunkMix_avx_no_xor1) + aj(jz scrypt_ChunkMix_avx_no_xor1) a3(vpxor xmm0,xmm0,[ecx+eax+0]) a3(vpxor xmm1,xmm1,[ecx+eax+16]) a3(vpxor xmm2,xmm2,[ecx+eax+32]) @@ -38,7 +38,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpxor xmm1,xmm1,[esi+ecx+16]) a3(vpxor xmm2,xmm2,[esi+ecx+32]) a3(vpxor xmm3,xmm3,[esi+ecx+48]) - a1(jz scrypt_ChunkMix_avx_no_xor2) + aj(jz scrypt_ChunkMix_avx_no_xor2) a3(vpxor xmm0,xmm0,[eax+ecx+0]) a3(vpxor xmm1,xmm1,[eax+ecx+16]) a3(vpxor xmm2,xmm2,[eax+ecx+32]) @@ -64,17 +64,16 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpsrld xmm5, xmm4, 19) a3(vpslld xmm4, xmm4, 13) a3(vpxor xmm1, xmm1, xmm5) - a3(pshufd xmm3, xmm3, 0x93) + a3(vpshufd xmm3, xmm3, 0x93) a3(vpxor xmm1, xmm1, xmm4) a3(vpaddd xmm4, xmm2, xmm1) a3(vpsrld xmm5, xmm4, 14) a3(vpslld xmm4, xmm4, 18) a3(vpxor xmm0, xmm0, xmm5) - a3(pshufd xmm2, xmm2, 0x4e) + a3(vpshufd xmm2, xmm2, 0x4e) a3(vpxor xmm0, xmm0, xmm4) - a2(sub eax, 2) a3(vpaddd xmm4, xmm3, xmm0) - a3(pshufd xmm1, xmm1, 0x39) + a3(vpshufd xmm1, xmm1, 0x39) a3(vpsrld xmm5, xmm4, 25) a3(vpslld xmm4, xmm4, 7) a3(vpxor xmm1, xmm1, xmm5) @@ -88,16 +87,17 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpsrld xmm5, xmm4, 19) a3(vpslld xmm4, xmm4, 13) 
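/* Reference sketch (not part of the patch): besides routing jumps through the new aj()
   macro, this hunk switches the remaining legacy pshufd encodings to vpshufd (keeping the
   block purely VEX-encoded, which presumably avoids SSE/AVX transition penalties) and
   moves the `sub eax,2` / `sub rax,2` decrement so it sits immediately before the `ja`
   that consumes its flags. The vpslld/vpsrld pairs implement the fixed Salsa20/8
   rotations 7, 9, 13 and 18; a scalar sketch of the quarter-round they vectorise
   (helper names are illustrative): */

#include <stdint.h>
#define ROTL32(x,n) (((x) << (n)) | ((x) >> (32 - (n))))

static void salsa20_quarter_round(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
    *b ^= ROTL32(*a + *d, 7);    /* vpslld 7  / vpsrld 25 */
    *c ^= ROTL32(*b + *a, 9);    /* vpslld 9  / vpsrld 23 */
    *d ^= ROTL32(*c + *b, 13);   /* vpslld 13 / vpsrld 19 */
    *a ^= ROTL32(*d + *c, 18);   /* vpslld 18 / vpsrld 14 */
}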
a3(vpxor xmm3, xmm3, xmm5) - a3(pshufd xmm1, xmm1, 0x93) + a3(vpshufd xmm1, xmm1, 0x93) a3(vpxor xmm3, xmm3, xmm4) a3(vpaddd xmm4, xmm2, xmm3) a3(vpsrld xmm5, xmm4, 14) a3(vpslld xmm4, xmm4, 18) a3(vpxor xmm0, xmm0, xmm5) - a3(pshufd xmm2, xmm2, 0x4e) + a3(vpshufd xmm2, xmm2, 0x4e) a3(vpxor xmm0, xmm0, xmm4) - a3(pshufd xmm3, xmm3, 0x39) - a1(ja scrypt_salsa_avx_loop) + a3(vpshufd xmm3, xmm3, 0x39) + a2(sub eax, 2) + aj(ja scrypt_salsa_avx_loop) a3(vpaddd xmm0,xmm0,[esp+0]) a3(vpaddd xmm1,xmm1,[esp+16]) a3(vpaddd xmm2,xmm2,xmm6) @@ -114,13 +114,13 @@ asm_naked_fn(scrypt_ChunkMix_avx) a2(vmovdqa [eax+32],xmm2) a2(vmovdqa [eax+48],xmm3) a2(mov eax,[ebp+28]) - a1(jne scrypt_ChunkMix_avx_loop) + aj(jne scrypt_ChunkMix_avx_loop) a2(mov esp,ebp) a1(pop ebp) a1(pop esi) a1(pop edi) a1(pop ebx) - a1(ret 16) + aret(16) asm_naked_fn_end(scrypt_ChunkMix_avx) #endif @@ -128,13 +128,13 @@ asm_naked_fn_end(scrypt_ChunkMix_avx) /* x64 */ -#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) +#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_SALSA_AVX asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) asm_naked_fn(scrypt_ChunkMix_avx) - a2(lea rcx,[rcx*2]) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ a2(shl rcx,6) a2(lea r9,[rcx-64]) a2(lea rax,[rsi+r9]) @@ -144,7 +144,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a2(vmovdqa xmm1,[rax+16]) a2(vmovdqa xmm2,[rax+32]) a2(vmovdqa xmm3,[rax+48]) - a1(jz scrypt_ChunkMix_avx_no_xor1) + aj(jz scrypt_ChunkMix_avx_no_xor1) a3(vpxor xmm0,xmm0,[r9+0]) a3(vpxor xmm1,xmm1,[r9+16]) a3(vpxor xmm2,xmm2,[r9+32]) @@ -158,7 +158,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpxor xmm1,xmm1,[rsi+r9+16]) a3(vpxor xmm2,xmm2,[rsi+r9+32]) a3(vpxor xmm3,xmm3,[rsi+r9+48]) - a1(jz scrypt_ChunkMix_avx_no_xor2) + aj(jz scrypt_ChunkMix_avx_no_xor2) a3(vpxor xmm0,xmm0,[rdx+r9+0]) a3(vpxor xmm1,xmm1,[rdx+r9+16]) a3(vpxor xmm2,xmm2,[rdx+r9+32]) @@ -184,17 +184,16 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpsrld xmm5, xmm4, 19) a3(vpslld xmm4, xmm4, 13) a3(vpxor xmm1, xmm1, xmm5) - a3(pshufd xmm3, xmm3, 0x93) + a3(vpshufd xmm3, xmm3, 0x93) a3(vpxor xmm1, xmm1, xmm4) a3(vpaddd xmm4, xmm2, xmm1) a3(vpsrld xmm5, xmm4, 14) a3(vpslld xmm4, xmm4, 18) a3(vpxor xmm0, xmm0, xmm5) - a3(pshufd xmm2, xmm2, 0x4e) + a3(vpshufd xmm2, xmm2, 0x4e) a3(vpxor xmm0, xmm0, xmm4) - a2(sub rax, 2) a3(vpaddd xmm4, xmm3, xmm0) - a3(pshufd xmm1, xmm1, 0x39) + a3(vpshufd xmm1, xmm1, 0x39) a3(vpsrld xmm5, xmm4, 25) a3(vpslld xmm4, xmm4, 7) a3(vpxor xmm1, xmm1, xmm5) @@ -208,16 +207,17 @@ asm_naked_fn(scrypt_ChunkMix_avx) a3(vpsrld xmm5, xmm4, 19) a3(vpslld xmm4, xmm4, 13) a3(vpxor xmm3, xmm3, xmm5) - a3(pshufd xmm1, xmm1, 0x93) + a3(vpshufd xmm1, xmm1, 0x93) a3(vpxor xmm3, xmm3, xmm4) a3(vpaddd xmm4, xmm2, xmm3) a3(vpsrld xmm5, xmm4, 14) a3(vpslld xmm4, xmm4, 18) a3(vpxor xmm0, xmm0, xmm5) - a3(pshufd xmm2, xmm2, 0x4e) + a3(vpshufd xmm2, xmm2, 0x4e) a3(vpxor xmm0, xmm0, xmm4) - a3(pshufd xmm3, xmm3, 0x39) - a1(ja scrypt_salsa_avx_loop) + a3(vpshufd xmm3, xmm3, 0x39) + a2(sub rax, 2) + aj(ja scrypt_salsa_avx_loop) a3(vpaddd xmm0,xmm0,xmm8) a3(vpaddd xmm1,xmm1,xmm9) a3(vpaddd xmm2,xmm2,xmm10) @@ -233,7 +233,7 @@ asm_naked_fn(scrypt_ChunkMix_avx) a2(vmovdqa [rax+16],xmm1) a2(vmovdqa [rax+32],xmm2) a2(vmovdqa [rax+48],xmm3) 
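/* Reference sketch (not part of the patch): every x64 routine in this change set also
   replaces `lea rcx,[rcx*2]` with `lea rcx,[ecx*2]`. As the added comment notes, r is a
   uint32_t and Win64 can leave garbage in the upper half of the register, so the 32-bit
   addressing form is used to zero-extend before the value is scaled into a byte count.
   A C equivalent of the fixed computation (function name illustrative; the salsa/chacha
   mixers here use 64-byte blocks, while the salsa64 files shift by 7 for 128-byte
   blocks): */

#include <stdint.h>

static uint64_t scrypt_chunk_bytes(uint32_t r) {
    /* the uint32_t parameter supplies the zero-extension the asm gets from using ecx */
    return (uint64_t)r * 2u * 64u;
}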
- a1(jne scrypt_ChunkMix_avx_loop) + aj(jne scrypt_ChunkMix_avx_loop) a1(ret) asm_naked_fn_end(scrypt_ChunkMix_avx) @@ -245,7 +245,7 @@ asm_naked_fn_end(scrypt_ChunkMix_avx) #define SCRYPT_SALSA_AVX -static void NOINLINE +static void asm_calling_convention NOINLINE scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { uint32_t i, blocksPerChunk = r * 2, half = 0; xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3; diff --git a/scryptjane/scrypt-jane-mix_salsa-sse2.h b/scryptjane/scrypt-jane-mix_salsa-sse2.h index 4898659e..d7ef969c 100644 --- a/scryptjane/scrypt-jane-mix_salsa-sse2.h +++ b/scryptjane/scrypt-jane-mix_salsa-sse2.h @@ -1,5 +1,5 @@ /* x86 */ -#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) +#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_SALSA_SSE2 @@ -24,7 +24,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(movdqa xmm1,[ecx+esi+16]) a2(movdqa xmm2,[ecx+esi+32]) a2(movdqa xmm3,[ecx+esi+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor1) + aj(jz scrypt_ChunkMix_sse2_no_xor1) a2(pxor xmm0,[ecx+eax+0]) a2(pxor xmm1,[ecx+eax+16]) a2(pxor xmm2,[ecx+eax+32]) @@ -38,7 +38,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(pxor xmm1,[esi+ecx+16]) a2(pxor xmm2,[esi+ecx+32]) a2(pxor xmm3,[esi+ecx+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor2) + aj(jz scrypt_ChunkMix_sse2_no_xor2) a2(pxor xmm0,[eax+ecx+0]) a2(pxor xmm1,[eax+ecx+16]) a2(pxor xmm2,[eax+ecx+32]) @@ -113,7 +113,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(pxor xmm0, xmm4) a3(pshufd xmm3, xmm3, 0x39) a2(pxor xmm0, xmm5) - a1(ja scrypt_salsa_sse2_loop) + aj(ja scrypt_salsa_sse2_loop) a2(paddd xmm0,[esp+0]) a2(paddd xmm1,[esp+16]) a2(paddd xmm2,xmm6) @@ -130,13 +130,13 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(movdqa [eax+32],xmm2) a2(movdqa [eax+48],xmm3) a2(mov eax,[ebp+28]) - a1(jne scrypt_ChunkMix_sse2_loop) + aj(jne scrypt_ChunkMix_sse2_loop) a2(mov esp,ebp) a1(pop ebp) a1(pop esi) a1(pop edi) a1(pop ebx) - a1(ret 16) + aret(16) asm_naked_fn_end(scrypt_ChunkMix_sse2) #endif @@ -144,13 +144,13 @@ asm_naked_fn_end(scrypt_ChunkMix_sse2) /* x64 */ -#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) +#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) #define SCRYPT_SALSA_SSE2 asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) asm_naked_fn(scrypt_ChunkMix_sse2) - a2(lea rcx,[rcx*2]) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ a2(shl rcx,6) a2(lea r9,[rcx-64]) a2(lea rax,[rsi+r9]) @@ -160,7 +160,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(movdqa xmm1,[rax+16]) a2(movdqa xmm2,[rax+32]) a2(movdqa xmm3,[rax+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor1) + aj(jz scrypt_ChunkMix_sse2_no_xor1) a2(pxor xmm0,[r9+0]) a2(pxor xmm1,[r9+16]) a2(pxor xmm2,[r9+32]) @@ -174,7 +174,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(pxor xmm1,[rsi+r9+16]) a2(pxor xmm2,[rsi+r9+32]) a2(pxor xmm3,[rsi+r9+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor2) + aj(jz scrypt_ChunkMix_sse2_no_xor2) a2(pxor xmm0,[rdx+r9+0]) a2(pxor xmm1,[rdx+r9+16]) a2(pxor xmm2,[rdx+r9+32]) @@ -249,7 +249,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(pxor xmm0, xmm4) a3(pshufd xmm3, xmm3, 0x39) a2(pxor 
xmm0, xmm5) - a1(ja scrypt_salsa_sse2_loop) + aj(ja scrypt_salsa_sse2_loop) a2(paddd xmm0,xmm8) a2(paddd xmm1,xmm9) a2(paddd xmm2,xmm10) @@ -265,7 +265,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2) a2(movdqa [rax+16],xmm1) a2(movdqa [rax+32],xmm2) a2(movdqa [rax+48],xmm3) - a1(jne scrypt_ChunkMix_sse2_loop) + aj(jne scrypt_ChunkMix_sse2_loop) a1(ret) asm_naked_fn_end(scrypt_ChunkMix_sse2) @@ -277,7 +277,7 @@ asm_naked_fn_end(scrypt_ChunkMix_sse2) #define SCRYPT_SALSA_SSE2 -static void NOINLINE +static void NOINLINE asm_calling_convention scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { uint32_t i, blocksPerChunk = r * 2, half = 0; xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3; @@ -426,7 +426,7 @@ scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes] 4 9 14 3 */ - static void STDCALL + static void asm_calling_convention salsa_core_tangle_sse2(uint32_t *blocks, size_t count) { uint32_t t; while (count--) { diff --git a/scryptjane/scrypt-jane-mix_salsa-xop.h b/scryptjane/scrypt-jane-mix_salsa-xop.h new file mode 100644 index 00000000..1d014d2a --- /dev/null +++ b/scryptjane/scrypt-jane-mix_salsa-xop.h @@ -0,0 +1,317 @@ +/* x86 */ +#if defined(X86ASM_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_SALSA_XOP + +asm_naked_fn_proto(void, scrypt_ChunkMix_xop)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_xop) + a1(push ebx) + a1(push edi) + a1(push esi) + a1(push ebp) + a2(mov ebp,esp) + a2(mov edi,[ebp+20]) + a2(mov esi,[ebp+24]) + a2(mov eax,[ebp+28]) + a2(mov ebx,[ebp+32]) + a2(sub esp,32) + a2(and esp,~63) + a2(lea edx,[ebx*2]) + a2(shl edx,6) + a2(lea ecx,[edx-64]) + a2(and eax, eax) + a2(movdqa xmm0,[ecx+esi+0]) + a2(movdqa xmm1,[ecx+esi+16]) + a2(movdqa xmm2,[ecx+esi+32]) + a2(movdqa xmm3,[ecx+esi+48]) + aj(jz scrypt_ChunkMix_xop_no_xor1) + a3(vpxor xmm0,xmm0,[ecx+eax+0]) + a3(vpxor xmm1,xmm1,[ecx+eax+16]) + a3(vpxor xmm2,xmm2,[ecx+eax+32]) + a3(vpxor xmm3,xmm3,[ecx+eax+48]) + a1(scrypt_ChunkMix_xop_no_xor1:) + a2(xor ecx,ecx) + a2(xor ebx,ebx) + a1(scrypt_ChunkMix_xop_loop:) + a2(and eax, eax) + a3(vpxor xmm0,xmm0,[esi+ecx+0]) + a3(vpxor xmm1,xmm1,[esi+ecx+16]) + a3(vpxor xmm2,xmm2,[esi+ecx+32]) + a3(vpxor xmm3,xmm3,[esi+ecx+48]) + aj(jz scrypt_ChunkMix_xop_no_xor2) + a3(vpxor xmm0,xmm0,[eax+ecx+0]) + a3(vpxor xmm1,xmm1,[eax+ecx+16]) + a3(vpxor xmm2,xmm2,[eax+ecx+32]) + a3(vpxor xmm3,xmm3,[eax+ecx+48]) + a1(scrypt_ChunkMix_xop_no_xor2:) + a2(vmovdqa [esp+0],xmm0) + a2(vmovdqa [esp+16],xmm1) + a2(vmovdqa xmm6,xmm2) + a2(vmovdqa xmm7,xmm3) + a2(mov eax,8) + a1(scrypt_salsa_xop_loop: ) + a3(vpaddd xmm4, xmm1, xmm0) + a3(vprotd xmm4, xmm4, 7) + a3(vpxor xmm3, xmm3, xmm4) + a3(vpaddd xmm4, xmm0, xmm3) + a3(vprotd xmm4, xmm4, 9) + a3(vpxor xmm2, xmm2, xmm4) + a3(vpaddd xmm4, xmm3, xmm2) + a3(vprotd xmm4, xmm4, 13) + a3(vpxor xmm1, xmm1, xmm4) + a3(vpaddd xmm4, xmm2, xmm1) + a3(pshufd xmm3, xmm3, 0x93) + a3(vprotd xmm4, xmm4, 18) + a3(pshufd xmm2, xmm2, 0x4e) + a3(vpxor xmm0, xmm0, xmm4) + a3(pshufd xmm1, xmm1, 0x39) + a3(vpaddd xmm4, xmm3, xmm0) + a3(vprotd xmm4, xmm4, 7) + a3(vpxor xmm1, xmm1, xmm4) + a3(vpaddd xmm4, xmm0, xmm1) + a3(vprotd xmm4, xmm4, 9) + a3(vpxor xmm2, xmm2, xmm4) + a3(vpaddd xmm4, xmm1, xmm2) + a3(vprotd xmm4, xmm4, 13) + a3(vpxor xmm3, xmm3, xmm4) + a3(pshufd xmm1, xmm1, 0x93) + a3(vpaddd xmm4, xmm2, 
xmm3) + a3(pshufd xmm2, xmm2, 0x4e) + a3(vprotd xmm4, xmm4, 18) + a3(pshufd xmm3, xmm3, 0x39) + a3(vpxor xmm0, xmm0, xmm4) + a2(sub eax, 2) + aj(ja scrypt_salsa_xop_loop) + a3(vpaddd xmm0,xmm0,[esp+0]) + a3(vpaddd xmm1,xmm1,[esp+16]) + a3(vpaddd xmm2,xmm2,xmm6) + a3(vpaddd xmm3,xmm3,xmm7) + a2(lea eax,[ebx+ecx]) + a2(xor ebx,edx) + a2(and eax,~0x7f) + a2(add ecx,64) + a2(shr eax,1) + a2(add eax, edi) + a2(cmp ecx,edx) + a2(vmovdqa [eax+0],xmm0) + a2(vmovdqa [eax+16],xmm1) + a2(vmovdqa [eax+32],xmm2) + a2(vmovdqa [eax+48],xmm3) + a2(mov eax,[ebp+28]) + aj(jne scrypt_ChunkMix_xop_loop) + a2(mov esp,ebp) + a1(pop ebp) + a1(pop esi) + a1(pop edi) + a1(pop ebx) + aret(16) +asm_naked_fn_end(scrypt_ChunkMix_xop) + +#endif + + + +/* x64 */ +#if defined(X86_64ASM_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_SALSA_XOP + +asm_naked_fn_proto(void, scrypt_ChunkMix_xop)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_xop) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ + a2(shl rcx,6) + a2(lea r9,[rcx-64]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(vmovdqa xmm0,[rax+0]) + a2(vmovdqa xmm1,[rax+16]) + a2(vmovdqa xmm2,[rax+32]) + a2(vmovdqa xmm3,[rax+48]) + aj(jz scrypt_ChunkMix_xop_no_xor1) + a3(vpxor xmm0,xmm0,[r9+0]) + a3(vpxor xmm1,xmm1,[r9+16]) + a3(vpxor xmm2,xmm2,[r9+32]) + a3(vpxor xmm3,xmm3,[r9+48]) + a1(scrypt_ChunkMix_xop_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_xop_loop:) + a2(and rdx, rdx) + a3(vpxor xmm0,xmm0,[rsi+r9+0]) + a3(vpxor xmm1,xmm1,[rsi+r9+16]) + a3(vpxor xmm2,xmm2,[rsi+r9+32]) + a3(vpxor xmm3,xmm3,[rsi+r9+48]) + aj(jz scrypt_ChunkMix_xop_no_xor2) + a3(vpxor xmm0,xmm0,[rdx+r9+0]) + a3(vpxor xmm1,xmm1,[rdx+r9+16]) + a3(vpxor xmm2,xmm2,[rdx+r9+32]) + a3(vpxor xmm3,xmm3,[rdx+r9+48]) + a1(scrypt_ChunkMix_xop_no_xor2:) + a2(vmovdqa xmm8,xmm0) + a2(vmovdqa xmm9,xmm1) + a2(vmovdqa xmm10,xmm2) + a2(vmovdqa xmm11,xmm3) + a2(mov rax,8) + a1(scrypt_salsa_xop_loop: ) + a3(vpaddd xmm4, xmm1, xmm0) + a3(vprotd xmm4, xmm4, 7) + a3(vpxor xmm3, xmm3, xmm4) + a3(vpaddd xmm4, xmm0, xmm3) + a3(vprotd xmm4, xmm4, 9) + a3(vpxor xmm2, xmm2, xmm4) + a3(vpaddd xmm4, xmm3, xmm2) + a3(vprotd xmm4, xmm4, 13) + a3(vpxor xmm1, xmm1, xmm4) + a3(vpaddd xmm4, xmm2, xmm1) + a3(pshufd xmm3, xmm3, 0x93) + a3(vprotd xmm4, xmm4, 18) + a3(pshufd xmm2, xmm2, 0x4e) + a3(vpxor xmm0, xmm0, xmm4) + a3(pshufd xmm1, xmm1, 0x39) + a3(vpaddd xmm4, xmm3, xmm0) + a3(vprotd xmm4, xmm4, 7) + a3(vpxor xmm1, xmm1, xmm4) + a3(vpaddd xmm4, xmm0, xmm1) + a3(vprotd xmm4, xmm4, 9) + a3(vpxor xmm2, xmm2, xmm4) + a3(vpaddd xmm4, xmm1, xmm2) + a3(vprotd xmm4, xmm4, 13) + a3(vpxor xmm3, xmm3, xmm4) + a3(pshufd xmm1, xmm1, 0x93) + a3(vpaddd xmm4, xmm2, xmm3) + a3(pshufd xmm2, xmm2, 0x4e) + a3(vprotd xmm4, xmm4, 18) + a3(pshufd xmm3, xmm3, 0x39) + a3(vpxor xmm0, xmm0, xmm4) + a2(sub rax, 2) + aj(ja scrypt_salsa_xop_loop) + a3(vpaddd xmm0,xmm0,xmm8) + a3(vpaddd xmm1,xmm1,xmm9) + a3(vpaddd xmm2,xmm2,xmm10) + a3(vpaddd xmm3,xmm3,xmm11) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0x7f) + a2(add r9,64) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(vmovdqa [rax+0],xmm0) + a2(vmovdqa [rax+16],xmm1) + a2(vmovdqa [rax+32],xmm2) + a2(vmovdqa [rax+48],xmm3) + aj(jne scrypt_ChunkMix_xop_loop) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_xop) + +#endif + + +/* 
intrinsic */ +#if defined(X86_INTRINSIC_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) + +#define SCRYPT_SALSA_XOP + +static void asm_calling_convention NOINLINE +scrypt_ChunkMix_xop(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + + for (rounds = 8; rounds; rounds -= 2) { + x4 = _mm_add_epi32(x1, x0); + x4 = _mm_roti_epi32(x4, 7); + x3 = _mm_xor_si128(x3, x4); + x4 = _mm_add_epi32(x0, x3); + x4 = _mm_roti_epi32(x4, 9); + x2 = _mm_xor_si128(x2, x4); + x4 = _mm_add_epi32(x3, x2); + x4 = _mm_roti_epi32(x4, 13); + x1 = _mm_xor_si128(x1, x4); + x4 = _mm_add_epi32(x2, x1); + x4 = _mm_roti_epi32(x4, 18); + x0 = _mm_xor_si128(x0, x4); + x3 = _mm_shuffle_epi32(x3, 0x93); + x2 = _mm_shuffle_epi32(x2, 0x4e); + x1 = _mm_shuffle_epi32(x1, 0x39); + x4 = _mm_add_epi32(x3, x0); + x4 = _mm_roti_epi32(x4, 7); + x1 = _mm_xor_si128(x1, x4); + x4 = _mm_add_epi32(x0, x1); + x4 = _mm_roti_epi32(x4, 9); + x2 = _mm_xor_si128(x2, x4); + x4 = _mm_add_epi32(x1, x2); + x4 = _mm_roti_epi32(x4, 13); + x3 = _mm_xor_si128(x3, x4); + x4 = _mm_add_epi32(x2, x3); + x4 = _mm_roti_epi32(x4, 18); + x0 = _mm_xor_si128(x0, x4); + x1 = _mm_shuffle_epi32(x1, 0x93); + x2 = _mm_shuffle_epi32(x2, 0x4e); + x3 = _mm_shuffle_epi32(x3, 0x39); + } + + x0 = _mm_add_epi32(x0, t0); + x1 = _mm_add_epi32(x1, t1); + x2 = _mm_add_epi32(x2, t2); + x3 = _mm_add_epi32(x3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + } +} + +#endif + +#if defined(SCRYPT_SALSA_XOP) + /* uses salsa_core_tangle_sse2 */ + + #undef SCRYPT_MIX + #define SCRYPT_MIX "Salsa/8-XOP" + #undef SCRYPT_SALSA_INCLUDED + #define SCRYPT_SALSA_INCLUDED +#endif diff --git a/scryptjane/scrypt-jane-mix_salsa64-avx.h b/scryptjane/scrypt-jane-mix_salsa64-avx.h new file mode 100644 index 00000000..c6e41dc3 --- /dev/null +++ b/scryptjane/scrypt-jane-mix_salsa64-avx.h @@ -0,0 +1,367 @@ +/* x64 */ +#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_SALSA64_AVX + +asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_avx) + a1(push rbp) + a2(mov rbp, rsp) + a2(and rsp, ~63) + a2(sub rsp, 128) + a2(lea rcx,[ecx*2]) 
/* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ + a2(shl rcx,7) + a2(lea r9,[rcx-128]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(vmovdqa xmm0,[rax+0]) + a2(vmovdqa xmm1,[rax+16]) + a2(vmovdqa xmm2,[rax+32]) + a2(vmovdqa xmm3,[rax+48]) + a2(vmovdqa xmm4,[rax+64]) + a2(vmovdqa xmm5,[rax+80]) + a2(vmovdqa xmm6,[rax+96]) + a2(vmovdqa xmm7,[rax+112]) + aj(jz scrypt_ChunkMix_avx_no_xor1) + a3(vpxor xmm0,xmm0,[r9+0]) + a3(vpxor xmm1,xmm1,[r9+16]) + a3(vpxor xmm2,xmm2,[r9+32]) + a3(vpxor xmm3,xmm3,[r9+48]) + a3(vpxor xmm4,xmm4,[r9+64]) + a3(vpxor xmm5,xmm5,[r9+80]) + a3(vpxor xmm6,xmm6,[r9+96]) + a3(vpxor xmm7,xmm7,[r9+112]) + a1(scrypt_ChunkMix_avx_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_avx_loop:) + a2(and rdx, rdx) + a3(vpxor xmm0,xmm0,[rsi+r9+0]) + a3(vpxor xmm1,xmm1,[rsi+r9+16]) + a3(vpxor xmm2,xmm2,[rsi+r9+32]) + a3(vpxor xmm3,xmm3,[rsi+r9+48]) + a3(vpxor xmm4,xmm4,[rsi+r9+64]) + a3(vpxor xmm5,xmm5,[rsi+r9+80]) + a3(vpxor xmm6,xmm6,[rsi+r9+96]) + a3(vpxor xmm7,xmm7,[rsi+r9+112]) + aj(jz scrypt_ChunkMix_avx_no_xor2) + a3(vpxor xmm0,xmm0,[rdx+r9+0]) + a3(vpxor xmm1,xmm1,[rdx+r9+16]) + a3(vpxor xmm2,xmm2,[rdx+r9+32]) + a3(vpxor xmm3,xmm3,[rdx+r9+48]) + a3(vpxor xmm4,xmm4,[rdx+r9+64]) + a3(vpxor xmm5,xmm5,[rdx+r9+80]) + a3(vpxor xmm6,xmm6,[rdx+r9+96]) + a3(vpxor xmm7,xmm7,[rdx+r9+112]) + a1(scrypt_ChunkMix_avx_no_xor2:) + a2(vmovdqa [rsp+0],xmm0) + a2(vmovdqa [rsp+16],xmm1) + a2(vmovdqa [rsp+32],xmm2) + a2(vmovdqa [rsp+48],xmm3) + a2(vmovdqa [rsp+64],xmm4) + a2(vmovdqa [rsp+80],xmm5) + a2(vmovdqa [rsp+96],xmm6) + a2(vmovdqa [rsp+112],xmm7) + a2(mov rax,8) + a1(scrypt_salsa64_avx_loop: ) + a3(vpaddq xmm8, xmm0, xmm2) + a3(vpaddq xmm9, xmm1, xmm3) + a3(vpshufd xmm8, xmm8, 0xb1) + a3(vpshufd xmm9, xmm9, 0xb1) + a3(vpxor xmm6, xmm6, xmm8) + a3(vpxor xmm7, xmm7, xmm9) + a3(vpaddq xmm10, xmm0, xmm6) + a3(vpaddq xmm11, xmm1, xmm7) + a3(vpsrlq xmm8, xmm10, 51) + a3(vpsrlq xmm9, xmm11, 51) + a3(vpsllq xmm10, xmm10, 13) + a3(vpsllq xmm11, xmm11, 13) + a3(vpxor xmm4, xmm4, xmm8) + a3(vpxor xmm5, xmm5, xmm9) + a3(vpxor xmm4, xmm4, xmm10) + a3(vpxor xmm5, xmm5, xmm11) + a3(vpaddq xmm8, xmm6, xmm4) + a3(vpaddq xmm9, xmm7, xmm5) + a3(vpsrlq xmm10, xmm8, 25) + a3(vpsrlq xmm11, xmm9, 25) + a3(vpsllq xmm8, xmm8, 39) + a3(vpsllq xmm9, xmm9, 39) + a3(vpxor xmm2, xmm2, xmm10) + a3(vpxor xmm3, xmm3, xmm11) + a3(vpxor xmm2, xmm2, xmm8) + a3(vpxor xmm3, xmm3, xmm9) + a3(vpaddq xmm10, xmm4, xmm2) + a3(vpaddq xmm11, xmm5, xmm3) + a3(vpshufd xmm10, xmm10, 0xb1) + a3(vpshufd xmm11, xmm11, 0xb1) + a3(vpxor xmm0, xmm0, xmm10) + a3(vpxor xmm1, xmm1, xmm11) + a2(vmovdqa xmm8, xmm2) + a2(vmovdqa xmm9, xmm3) + a4(vpalignr xmm2, xmm6, xmm7, 8) + a4(vpalignr xmm3, xmm7, xmm6, 8) + a4(vpalignr xmm6, xmm9, xmm8, 8) + a4(vpalignr xmm7, xmm8, xmm9, 8) + a3(vpaddq xmm10, xmm0, xmm2) + a3(vpaddq xmm11, xmm1, xmm3) + a3(vpshufd xmm10, xmm10, 0xb1) + a3(vpshufd xmm11, xmm11, 0xb1) + a3(vpxor xmm6, xmm6, xmm10) + a3(vpxor xmm7, xmm7, xmm11) + a3(vpaddq xmm8, xmm0, xmm6) + a3(vpaddq xmm9, xmm1, xmm7) + a3(vpsrlq xmm10, xmm8, 51) + a3(vpsrlq xmm11, xmm9, 51) + a3(vpsllq xmm8, xmm8, 13) + a3(vpsllq xmm9, xmm9, 13) + a3(vpxor xmm5, xmm5, xmm10) + a3(vpxor xmm4, xmm4, xmm11) + a3(vpxor xmm5, xmm5, xmm8) + a3(vpxor xmm4, xmm4, xmm9) + a3(vpaddq xmm10, xmm6, xmm5) + a3(vpaddq xmm11, xmm7, xmm4) + a3(vpsrlq xmm8, xmm10, 25) + a3(vpsrlq xmm9, xmm11, 25) + a3(vpsllq xmm10, xmm10, 39) + a3(vpsllq xmm11, xmm11, 39) + a3(vpxor xmm2, xmm2, xmm8) + a3(vpxor xmm3, xmm3, 
xmm9) + a3(vpxor xmm2, xmm2, xmm10) + a3(vpxor xmm3, xmm3, xmm11) + a3(vpaddq xmm8, xmm5, xmm2) + a3(vpaddq xmm9, xmm4, xmm3) + a3(vpshufd xmm8, xmm8, 0xb1) + a3(vpshufd xmm9, xmm9, 0xb1) + a3(vpxor xmm0, xmm0, xmm8) + a3(vpxor xmm1, xmm1, xmm9) + a2(vmovdqa xmm10, xmm2) + a2(vmovdqa xmm11, xmm3) + a4(vpalignr xmm2, xmm6, xmm7, 8) + a4(vpalignr xmm3, xmm7, xmm6, 8) + a4(vpalignr xmm6, xmm11, xmm10, 8) + a4(vpalignr xmm7, xmm10, xmm11, 8) + a2(sub rax, 2) + aj(ja scrypt_salsa64_avx_loop) + a3(vpaddq xmm0,xmm0,[rsp+0]) + a3(vpaddq xmm1,xmm1,[rsp+16]) + a3(vpaddq xmm2,xmm2,[rsp+32]) + a3(vpaddq xmm3,xmm3,[rsp+48]) + a3(vpaddq xmm4,xmm4,[rsp+64]) + a3(vpaddq xmm5,xmm5,[rsp+80]) + a3(vpaddq xmm6,xmm6,[rsp+96]) + a3(vpaddq xmm7,xmm7,[rsp+112]) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0xff) + a2(add r9,128) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(vmovdqa [rax+0],xmm0) + a2(vmovdqa [rax+16],xmm1) + a2(vmovdqa [rax+32],xmm2) + a2(vmovdqa [rax+48],xmm3) + a2(vmovdqa [rax+64],xmm4) + a2(vmovdqa [rax+80],xmm5) + a2(vmovdqa [rax+96],xmm6) + a2(vmovdqa [rax+112],xmm7) + aj(jne scrypt_ChunkMix_avx_loop) + a2(mov rsp, rbp) + a1(pop rbp) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_avx) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) + +#define SCRYPT_SALSA64_AVX + +static void asm_calling_convention +scrypt_ChunkMix_avx(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + x4 = xmmp[4]; + x5 = xmmp[5]; + x6 = xmmp[6]; + x7 = xmmp[7]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + t4 = x4; + t5 = x5; + t6 = x6; + t7 = x7; + + for (rounds = 8; rounds; rounds -= 2) { + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, x0); + z1 = _mm_add_epi64(x7, x1); + z2 = _mm_srli_epi64(z0, 64-13); + z3 = _mm_srli_epi64(z1, 64-13); + z0 = _mm_slli_epi64(z0, 
13); + z1 = _mm_slli_epi64(z1, 13); + x4 = _mm_xor_si128(x4, z2); + x5 = _mm_xor_si128(x5, z3); + x4 = _mm_xor_si128(x4, z0); + x5 = _mm_xor_si128(x5, z1); + + z0 = _mm_add_epi64(x4, x6); + z1 = _mm_add_epi64(x5, x7); + z2 = _mm_srli_epi64(z0, 64-39); + z3 = _mm_srli_epi64(z1, 64-39); + z0 = _mm_slli_epi64(z0, 39); + z1 = _mm_slli_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z2); + x3 = _mm_xor_si128(x3, z3); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x4); + z1 = _mm_add_epi64(x3, x5); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x2; + z1 = x3; + x2 = _mm_alignr_epi8(x6, x7, 8); + x3 = _mm_alignr_epi8(x7, x6, 8); + x6 = _mm_alignr_epi8(z1, z0, 8); + x7 = _mm_alignr_epi8(z0, z1, 8); + + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, x0); + z1 = _mm_add_epi64(x7, x1); + z2 = _mm_srli_epi64(z0, 64-13); + z3 = _mm_srli_epi64(z1, 64-13); + z0 = _mm_slli_epi64(z0, 13); + z1 = _mm_slli_epi64(z1, 13); + x5 = _mm_xor_si128(x5, z2); + x4 = _mm_xor_si128(x4, z3); + x5 = _mm_xor_si128(x5, z0); + x4 = _mm_xor_si128(x4, z1); + + z0 = _mm_add_epi64(x5, x6); + z1 = _mm_add_epi64(x4, x7); + z2 = _mm_srli_epi64(z0, 64-39); + z3 = _mm_srli_epi64(z1, 64-39); + z0 = _mm_slli_epi64(z0, 39); + z1 = _mm_slli_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z2); + x3 = _mm_xor_si128(x3, z3); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x5); + z1 = _mm_add_epi64(x3, x4); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x2; + z1 = x3; + x2 = _mm_alignr_epi8(x6, x7, 8); + x3 = _mm_alignr_epi8(x7, x6, 8); + x6 = _mm_alignr_epi8(z1, z0, 8); + x7 = _mm_alignr_epi8(z0, z1, 8); + } + + x0 = _mm_add_epi64(x0, t0); + x1 = _mm_add_epi64(x1, t1); + x2 = _mm_add_epi64(x2, t2); + x3 = _mm_add_epi64(x3, t3); + x4 = _mm_add_epi64(x4, t4); + x5 = _mm_add_epi64(x5, t5); + x6 = _mm_add_epi64(x6, t6); + x7 = _mm_add_epi64(x7, t7); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + xmmp[4] = x4; + xmmp[5] = x5; + xmmp[6] = x6; + xmmp[7] = x7; + } +} + +#endif + +#if defined(SCRYPT_SALSA64_AVX) + /* uses salsa64_core_tangle_sse2 */ + + #undef SCRYPT_MIX + #define SCRYPT_MIX "Salsa64/8-AVX" + #undef SCRYPT_SALSA64_INCLUDED + #define SCRYPT_SALSA64_INCLUDED +#endif diff --git a/scryptjane/scrypt-jane-mix_salsa64-avx2.h b/scryptjane/scrypt-jane-mix_salsa64-avx2.h new file mode 100644 index 00000000..a42e808b --- /dev/null +++ b/scryptjane/scrypt-jane-mix_salsa64-avx2.h @@ -0,0 +1,221 @@ +/* x64 */ +#if defined(X86_64ASM_AVX2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_SALSA64_AVX2 + +asm_naked_fn_proto(void, scrypt_ChunkMix_avx2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_avx2) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ + a2(shl 
rcx,7) + a2(lea r9,[rcx-128]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(vmovdqa ymm0,[rax+0]) + a2(vmovdqa ymm1,[rax+32]) + a2(vmovdqa ymm2,[rax+64]) + a2(vmovdqa ymm3,[rax+96]) + aj(jz scrypt_ChunkMix_avx2_no_xor1) + a3(vpxor ymm0,ymm0,[r9+0]) + a3(vpxor ymm1,ymm1,[r9+32]) + a3(vpxor ymm2,ymm2,[r9+64]) + a3(vpxor ymm3,ymm3,[r9+96]) + a1(scrypt_ChunkMix_avx2_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_avx2_loop:) + a2(and rdx, rdx) + a3(vpxor ymm0,ymm0,[rsi+r9+0]) + a3(vpxor ymm1,ymm1,[rsi+r9+32]) + a3(vpxor ymm2,ymm2,[rsi+r9+64]) + a3(vpxor ymm3,ymm3,[rsi+r9+96]) + aj(jz scrypt_ChunkMix_avx2_no_xor2) + a3(vpxor ymm0,ymm0,[rdx+r9+0]) + a3(vpxor ymm1,ymm1,[rdx+r9+32]) + a3(vpxor ymm2,ymm2,[rdx+r9+64]) + a3(vpxor ymm3,ymm3,[rdx+r9+96]) + a1(scrypt_ChunkMix_avx2_no_xor2:) + a2(vmovdqa ymm6,ymm0) + a2(vmovdqa ymm7,ymm1) + a2(vmovdqa ymm8,ymm2) + a2(vmovdqa ymm9,ymm3) + a2(mov rax,4) + a1(scrypt_salsa64_avx2_loop: ) + a3(vpaddq ymm4, ymm1, ymm0) + a3(vpshufd ymm4, ymm4, 0xb1) + a3(vpxor ymm3, ymm3, ymm4) + a3(vpaddq ymm4, ymm0, ymm3) + a3(vpsrlq ymm5, ymm4, 51) + a3(vpxor ymm2, ymm2, ymm5) + a3(vpsllq ymm4, ymm4, 13) + a3(vpxor ymm2, ymm2, ymm4) + a3(vpaddq ymm4, ymm3, ymm2) + a3(vpsrlq ymm5, ymm4, 25) + a3(vpxor ymm1, ymm1, ymm5) + a3(vpsllq ymm4, ymm4, 39) + a3(vpxor ymm1, ymm1, ymm4) + a3(vpaddq ymm4, ymm2, ymm1) + a3(vpshufd ymm4, ymm4, 0xb1) + a3(vpermq ymm1, ymm1, 0x39) + a3(vpermq ymm10, ymm2, 0x4e) + a3(vpxor ymm0, ymm0, ymm4) + a3(vpermq ymm3, ymm3, 0x93) + a3(vpaddq ymm4, ymm3, ymm0) + a3(vpshufd ymm4, ymm4, 0xb1) + a3(vpxor ymm1, ymm1, ymm4) + a3(vpaddq ymm4, ymm0, ymm1) + a3(vpsrlq ymm5, ymm4, 51) + a3(vpxor ymm10, ymm10, ymm5) + a3(vpsllq ymm4, ymm4, 13) + a3(vpxor ymm10, ymm10, ymm4) + a3(vpaddq ymm4, ymm1, ymm10) + a3(vpsrlq ymm5, ymm4, 25) + a3(vpxor ymm3, ymm3, ymm5) + a3(vpsllq ymm4, ymm4, 39) + a3(vpermq ymm1, ymm1, 0x93) + a3(vpxor ymm3, ymm3, ymm4) + a3(vpermq ymm2, ymm10, 0x4e) + a3(vpaddq ymm4, ymm10, ymm3) + a3(vpshufd ymm4, ymm4, 0xb1) + a3(vpermq ymm3, ymm3, 0x39) + a3(vpxor ymm0, ymm0, ymm4) + a1(dec rax) + aj(jnz scrypt_salsa64_avx2_loop) + a3(vpaddq ymm0,ymm0,ymm6) + a3(vpaddq ymm1,ymm1,ymm7) + a3(vpaddq ymm2,ymm2,ymm8) + a3(vpaddq ymm3,ymm3,ymm9) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0xff) + a2(add r9,128) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(vmovdqa [rax+0],ymm0) + a2(vmovdqa [rax+32],ymm1) + a2(vmovdqa [rax+64],ymm2) + a2(vmovdqa [rax+96],ymm3) + aj(jne scrypt_ChunkMix_avx2_loop) + a1(vzeroupper) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_avx2) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_AVX2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) + +#define SCRYPT_SALSA64_AVX2 + +static void asm_calling_convention +scrypt_ChunkMix_avx2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + ymmi *ymmp,y0,y1,y2,y3,t0,t1,t2,t3,z0,z1; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + ymmp = (ymmi *)scrypt_block(Bin, blocksPerChunk - 1); + y0 = ymmp[0]; + y1 = ymmp[1]; + y2 = ymmp[2]; + y3 = ymmp[3]; + + if (Bxor) { + ymmp = (ymmi *)scrypt_block(Bxor, blocksPerChunk - 1); + y0 = _mm256_xor_si256(y0, ymmp[0]); + y1 = _mm256_xor_si256(y1, ymmp[1]); + y2 = _mm256_xor_si256(y2, ymmp[2]); + y3 = _mm256_xor_si256(y3, ymmp[3]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + ymmp = (ymmi 
*)scrypt_block(Bin, i); + y0 = _mm256_xor_si256(y0, ymmp[0]); + y1 = _mm256_xor_si256(y1, ymmp[1]); + y2 = _mm256_xor_si256(y2, ymmp[2]); + y3 = _mm256_xor_si256(y3, ymmp[3]); + + if (Bxor) { + ymmp = (ymmi *)scrypt_block(Bxor, i); + y0 = _mm256_xor_si256(y0, ymmp[0]); + y1 = _mm256_xor_si256(y1, ymmp[1]); + y2 = _mm256_xor_si256(y2, ymmp[2]); + y3 = _mm256_xor_si256(y3, ymmp[3]); + } + + t0 = y0; + t1 = y1; + t2 = y2; + t3 = y3; + + for (rounds = 8; rounds; rounds -= 2) { + z0 = _mm256_add_epi64(y0, y1); + z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + y3 = _mm256_xor_si256(y3, z0); + z0 = _mm256_add_epi64(y3, y0); + z1 = _mm256_srli_epi64(z0, 64-13); + y2 = _mm256_xor_si256(y2, z1); + z0 = _mm256_slli_epi64(z0, 13); + y2 = _mm256_xor_si256(y2, z0); + z0 = _mm256_add_epi64(y2, y3); + z1 = _mm256_srli_epi64(z0, 64-39); + y1 = _mm256_xor_si256(y1, z1); + z0 = _mm256_slli_epi64(z0, 39); + y1 = _mm256_xor_si256(y1, z0); + y1 = _mm256_permute4x64_epi64(y1, _MM_SHUFFLE(0,3,2,1)); + y2 = _mm256_permute4x64_epi64(y2, _MM_SHUFFLE(1,0,3,2)); + y3 = _mm256_permute4x64_epi64(y3, _MM_SHUFFLE(2,1,0,3)); + z0 = _mm256_add_epi64(y1, y2); + z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + y0 = _mm256_xor_si256(y0, z0); + z0 = _mm256_add_epi64(y0, y3); + z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + y1 = _mm256_xor_si256(y1, z0); + z0 = _mm256_add_epi64(y1, y0); + z1 = _mm256_srli_epi64(z0, 64-13); + y2 = _mm256_xor_si256(y2, z1); + z0 = _mm256_slli_epi64(z0, 13); + y2 = _mm256_xor_si256(y2, z0); + z0 = _mm256_add_epi64(y2, y1); + z1 = _mm256_srli_epi64(z0, 64-39); + y3 = _mm256_xor_si256(y3, z1); + z0 = _mm256_slli_epi64(z0, 39); + y3 = _mm256_xor_si256(y3, z0); + z0 = _mm256_add_epi64(y3, y2); + z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + y0 = _mm256_xor_si256(y0, z0); + y1 = _mm256_permute4x64_epi64(y1, _MM_SHUFFLE(2,1,0,3)); + y2 = _mm256_permute4x64_epi64(y2, _MM_SHUFFLE(1,0,3,2)); + y3 = _mm256_permute4x64_epi64(y3, _MM_SHUFFLE(0,3,2,1)); + } + + y0 = _mm256_add_epi64(y0, t0); + y1 = _mm256_add_epi64(y1, t1); + y2 = _mm256_add_epi64(y2, t2); + y3 = _mm256_add_epi64(y3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + ymmp = (ymmi *)scrypt_block(Bout, (i / 2) + half); + ymmp[0] = y0; + ymmp[1] = y1; + ymmp[2] = y2; + ymmp[3] = y3; + } +} + +#endif + +#if defined(SCRYPT_SALSA64_AVX2) + /* uses salsa64_core_tangle_sse2 */ + + #undef SCRYPT_MIX + #define SCRYPT_MIX "Salsa64/8-AVX2" + #undef SCRYPT_SALSA64_INCLUDED + #define SCRYPT_SALSA64_INCLUDED +#endif diff --git a/scryptjane/scrypt-jane-mix_salsa64-sse2.h b/scryptjane/scrypt-jane-mix_salsa64-sse2.h new file mode 100644 index 00000000..971d98a3 --- /dev/null +++ b/scryptjane/scrypt-jane-mix_salsa64-sse2.h @@ -0,0 +1,449 @@ +/* x64 */ +#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_SALSA64_SSE2 + +asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_sse2) + a1(push rbp) + a2(mov rbp, rsp) + a2(and rsp, ~63) + a2(sub rsp, 128) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ + a2(shl rcx,7) + a2(lea r9,[rcx-128]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(movdqa xmm0,[rax+0]) + a2(movdqa xmm1,[rax+16]) + a2(movdqa xmm2,[rax+32]) + a2(movdqa xmm3,[rax+48]) + 
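/* Reference sketch (not part of the patch): the 64-bit "Salsa64" rounds in this file and
   in the AVX/AVX2/SSSE3/XOP variants use rotation counts 32, 13 and 39. Here a rotate by
   32 is done with pshufd 0xb1, which swaps the 32-bit halves of each 64-bit lane, and the
   13/39 rotates with psrlq/psllq pairs (51+13 = 25+39 = 64); the XOP file further down
   uses vprotq directly. A scalar sketch of the quarter-round, as read from the intrinsic
   version below (helper names are illustrative): */

#include <stdint.h>
#define ROTL64(x,n) (((x) << (n)) | ((x) >> (64 - (n))))

static void salsa64_quarter_round(uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d) {
    *b ^= ROTL64(*a + *d, 32);   /* pshufd 0xb1 */
    *c ^= ROTL64(*b + *a, 13);   /* psllq 13 / psrlq 51 */
    *d ^= ROTL64(*c + *b, 39);   /* psllq 39 / psrlq 25 */
    *a ^= ROTL64(*d + *c, 32);   /* pshufd 0xb1 */
}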
a2(movdqa xmm4,[rax+64]) + a2(movdqa xmm5,[rax+80]) + a2(movdqa xmm6,[rax+96]) + a2(movdqa xmm7,[rax+112]) + aj(jz scrypt_ChunkMix_sse2_no_xor1) + a2(pxor xmm0,[r9+0]) + a2(pxor xmm1,[r9+16]) + a2(pxor xmm2,[r9+32]) + a2(pxor xmm3,[r9+48]) + a2(pxor xmm4,[r9+64]) + a2(pxor xmm5,[r9+80]) + a2(pxor xmm6,[r9+96]) + a2(pxor xmm7,[r9+112]) + a1(scrypt_ChunkMix_sse2_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_sse2_loop:) + a2(and rdx, rdx) + a2(pxor xmm0,[rsi+r9+0]) + a2(pxor xmm1,[rsi+r9+16]) + a2(pxor xmm2,[rsi+r9+32]) + a2(pxor xmm3,[rsi+r9+48]) + a2(pxor xmm4,[rsi+r9+64]) + a2(pxor xmm5,[rsi+r9+80]) + a2(pxor xmm6,[rsi+r9+96]) + a2(pxor xmm7,[rsi+r9+112]) + aj(jz scrypt_ChunkMix_sse2_no_xor2) + a2(pxor xmm0,[rdx+r9+0]) + a2(pxor xmm1,[rdx+r9+16]) + a2(pxor xmm2,[rdx+r9+32]) + a2(pxor xmm3,[rdx+r9+48]) + a2(pxor xmm4,[rdx+r9+64]) + a2(pxor xmm5,[rdx+r9+80]) + a2(pxor xmm6,[rdx+r9+96]) + a2(pxor xmm7,[rdx+r9+112]) + a1(scrypt_ChunkMix_sse2_no_xor2:) + a2(movdqa [rsp+0],xmm0) + a2(movdqa [rsp+16],xmm1) + a2(movdqa [rsp+32],xmm2) + a2(movdqa [rsp+48],xmm3) + a2(movdqa [rsp+64],xmm4) + a2(movdqa [rsp+80],xmm5) + a2(movdqa [rsp+96],xmm6) + a2(movdqa [rsp+112],xmm7) + a2(mov rax,8) + a1(scrypt_salsa64_sse2_loop: ) + a2(movdqa xmm8, xmm0) + a2(movdqa xmm9, xmm1) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm6, xmm8) + a2(pxor xmm7, xmm9) + a2(movdqa xmm10, xmm0) + a2(movdqa xmm11, xmm1) + a2(paddq xmm10, xmm6) + a2(paddq xmm11, xmm7) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 51) + a2(psrlq xmm11, 51) + a2(psllq xmm8, 13) + a2(psllq xmm9, 13) + a2(pxor xmm4, xmm10) + a2(pxor xmm5, xmm11) + a2(pxor xmm4, xmm8) + a2(pxor xmm5, xmm9) + a2(movdqa xmm10, xmm6) + a2(movdqa xmm11, xmm7) + a2(paddq xmm10, xmm4) + a2(paddq xmm11, xmm5) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 25) + a2(psrlq xmm11, 25) + a2(psllq xmm8, 39) + a2(psllq xmm9, 39) + a2(pxor xmm2, xmm10) + a2(pxor xmm3, xmm11) + a2(pxor xmm2, xmm8) + a2(pxor xmm3, xmm9) + a2(movdqa xmm8, xmm4) + a2(movdqa xmm9, xmm5) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm0, xmm8) + a2(pxor xmm1, xmm9) + a2(movdqa xmm8, xmm2) + a2(movdqa xmm9, xmm3) + a2(movdqa xmm10, xmm6) + a2(movdqa xmm11, xmm7) + a2(movdqa xmm2, xmm7) + a2(movdqa xmm3, xmm6) + a2(punpcklqdq xmm10, xmm6) + a2(punpcklqdq xmm11, xmm7) + a2(movdqa xmm6, xmm8) + a2(movdqa xmm7, xmm9) + a2(punpcklqdq xmm9, xmm9) + a2(punpcklqdq xmm8, xmm8) + a2(punpckhqdq xmm2, xmm10) + a2(punpckhqdq xmm3, xmm11) + a2(punpckhqdq xmm6, xmm9) + a2(punpckhqdq xmm7, xmm8) + a2(sub rax, 2) + a2(movdqa xmm8, xmm0) + a2(movdqa xmm9, xmm1) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm6, xmm8) + a2(pxor xmm7, xmm9) + a2(movdqa xmm10, xmm0) + a2(movdqa xmm11, xmm1) + a2(paddq xmm10, xmm6) + a2(paddq xmm11, xmm7) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 51) + a2(psrlq xmm11, 51) + a2(psllq xmm8, 13) + a2(psllq xmm9, 13) + a2(pxor xmm5, xmm10) + a2(pxor xmm4, xmm11) + a2(pxor xmm5, xmm8) + a2(pxor xmm4, xmm9) + a2(movdqa xmm10, xmm6) + a2(movdqa xmm11, xmm7) + a2(paddq xmm10, xmm5) + a2(paddq xmm11, xmm4) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 25) + a2(psrlq xmm11, 25) + a2(psllq xmm8, 39) + a2(psllq xmm9, 39) + a2(pxor xmm2, xmm10) + a2(pxor xmm3, xmm11) + a2(pxor xmm2, xmm8) + 
a2(pxor xmm3, xmm9) + a2(movdqa xmm8, xmm5) + a2(movdqa xmm9, xmm4) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm0, xmm8) + a2(pxor xmm1, xmm9) + a2(movdqa xmm8, xmm2) + a2(movdqa xmm9, xmm3) + a2(movdqa xmm10, xmm6) + a2(movdqa xmm11, xmm7) + a2(movdqa xmm2, xmm7) + a2(movdqa xmm3, xmm6) + a2(punpcklqdq xmm10, xmm6) + a2(punpcklqdq xmm11, xmm7) + a2(movdqa xmm6, xmm8) + a2(movdqa xmm7, xmm9) + a2(punpcklqdq xmm9, xmm9) + a2(punpcklqdq xmm8, xmm8) + a2(punpckhqdq xmm2, xmm10) + a2(punpckhqdq xmm3, xmm11) + a2(punpckhqdq xmm6, xmm9) + a2(punpckhqdq xmm7, xmm8) + aj(ja scrypt_salsa64_sse2_loop) + a2(paddq xmm0,[rsp+0]) + a2(paddq xmm1,[rsp+16]) + a2(paddq xmm2,[rsp+32]) + a2(paddq xmm3,[rsp+48]) + a2(paddq xmm4,[rsp+64]) + a2(paddq xmm5,[rsp+80]) + a2(paddq xmm6,[rsp+96]) + a2(paddq xmm7,[rsp+112]) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0xff) + a2(add r9,128) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(movdqa [rax+0],xmm0) + a2(movdqa [rax+16],xmm1) + a2(movdqa [rax+32],xmm2) + a2(movdqa [rax+48],xmm3) + a2(movdqa [rax+64],xmm4) + a2(movdqa [rax+80],xmm5) + a2(movdqa [rax+96],xmm6) + a2(movdqa [rax+112],xmm7) + aj(jne scrypt_ChunkMix_sse2_loop) + a2(mov rsp, rbp) + a1(pop rbp) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_sse2) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) + +#define SCRYPT_SALSA64_SSE2 + +static void asm_calling_convention +scrypt_ChunkMix_sse2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + x4 = xmmp[4]; + x5 = xmmp[5]; + x6 = xmmp[6]; + x7 = xmmp[7]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + t4 = x4; + t5 = x5; + t6 = x6; + t7 = x7; + + for (rounds = 8; rounds; rounds -= 2) { + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, 
x0); + z1 = _mm_add_epi64(x7, x1); + z2 = _mm_srli_epi64(z0, 64-13); + z3 = _mm_srli_epi64(z1, 64-13); + z0 = _mm_slli_epi64(z0, 13); + z1 = _mm_slli_epi64(z1, 13); + x4 = _mm_xor_si128(x4, z2); + x5 = _mm_xor_si128(x5, z3); + x4 = _mm_xor_si128(x4, z0); + x5 = _mm_xor_si128(x5, z1); + + z0 = _mm_add_epi64(x4, x6); + z1 = _mm_add_epi64(x5, x7); + z2 = _mm_srli_epi64(z0, 64-39); + z3 = _mm_srli_epi64(z1, 64-39); + z0 = _mm_slli_epi64(z0, 39); + z1 = _mm_slli_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z2); + x3 = _mm_xor_si128(x3, z3); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x4); + z1 = _mm_add_epi64(x3, x5); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x4; + z1 = x5; + z2 = x2; + z3 = x3; + x4 = z1; + x5 = z0; + x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6)); + x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7)); + x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3)); + x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2)); + + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, x0); + z1 = _mm_add_epi64(x7, x1); + z2 = _mm_srli_epi64(z0, 64-13); + z3 = _mm_srli_epi64(z1, 64-13); + z0 = _mm_slli_epi64(z0, 13); + z1 = _mm_slli_epi64(z1, 13); + x4 = _mm_xor_si128(x4, z2); + x5 = _mm_xor_si128(x5, z3); + x4 = _mm_xor_si128(x4, z0); + x5 = _mm_xor_si128(x5, z1); + + z0 = _mm_add_epi64(x4, x6); + z1 = _mm_add_epi64(x5, x7); + z2 = _mm_srli_epi64(z0, 64-39); + z3 = _mm_srli_epi64(z1, 64-39); + z0 = _mm_slli_epi64(z0, 39); + z1 = _mm_slli_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z2); + x3 = _mm_xor_si128(x3, z3); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x4); + z1 = _mm_add_epi64(x3, x5); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x4; + z1 = x5; + z2 = x2; + z3 = x3; + x4 = z1; + x5 = z0; + x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6)); + x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7)); + x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3)); + x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2)); + } + + x0 = _mm_add_epi64(x0, t0); + x1 = _mm_add_epi64(x1, t1); + x2 = _mm_add_epi64(x2, t2); + x3 = _mm_add_epi64(x3, t3); + x4 = _mm_add_epi64(x4, t4); + x5 = _mm_add_epi64(x5, t5); + x6 = _mm_add_epi64(x6, t6); + x7 = _mm_add_epi64(x7, t7); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + xmmp[4] = x4; + xmmp[5] = x5; + xmmp[6] = x6; + xmmp[7] = x7; + } +} + +#endif + +#if defined(SCRYPT_SALSA64_SSE2) + #undef SCRYPT_MIX + #define SCRYPT_MIX "Salsa64/8-SSE2" + #undef SCRYPT_SALSA64_INCLUDED + #define SCRYPT_SALSA64_INCLUDED +#endif + +/* sse3/avx use this as well */ +#if defined(SCRYPT_SALSA64_INCLUDED) + /* + Default layout: + 0 1 2 3 + 4 5 6 7 + 8 9 10 11 + 12 13 14 15 + + SSE2 layout: + 0 5 10 15 + 12 1 6 11 + 8 13 2 7 + 4 9 14 3 + */ + + + static void asm_calling_convention + salsa64_core_tangle_sse2(uint64_t *blocks, size_t count) { + uint64_t t; + while (count--) { + t = 
blocks[1]; blocks[1] = blocks[5]; blocks[5] = t; + t = blocks[2]; blocks[2] = blocks[10]; blocks[10] = t; + t = blocks[3]; blocks[3] = blocks[15]; blocks[15] = t; + t = blocks[4]; blocks[4] = blocks[12]; blocks[12] = t; + t = blocks[7]; blocks[7] = blocks[11]; blocks[11] = t; + t = blocks[9]; blocks[9] = blocks[13]; blocks[13] = t; + blocks += 16; + } + } +#endif \ No newline at end of file diff --git a/scryptjane/scrypt-jane-mix_salsa64-ssse3.h b/scryptjane/scrypt-jane-mix_salsa64-ssse3.h new file mode 100644 index 00000000..d1841283 --- /dev/null +++ b/scryptjane/scrypt-jane-mix_salsa64-ssse3.h @@ -0,0 +1,399 @@ +/* x64 */ +#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_SALSA64_SSSE3 + +asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_ssse3) + a1(push rbp) + a2(mov rbp, rsp) + a2(and rsp, ~63) + a2(sub rsp, 128) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ + a2(shl rcx,7) + a2(lea r9,[rcx-128]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(movdqa xmm0,[rax+0]) + a2(movdqa xmm1,[rax+16]) + a2(movdqa xmm2,[rax+32]) + a2(movdqa xmm3,[rax+48]) + a2(movdqa xmm4,[rax+64]) + a2(movdqa xmm5,[rax+80]) + a2(movdqa xmm6,[rax+96]) + a2(movdqa xmm7,[rax+112]) + aj(jz scrypt_ChunkMix_ssse3_no_xor1) + a2(pxor xmm0,[r9+0]) + a2(pxor xmm1,[r9+16]) + a2(pxor xmm2,[r9+32]) + a2(pxor xmm3,[r9+48]) + a2(pxor xmm4,[r9+64]) + a2(pxor xmm5,[r9+80]) + a2(pxor xmm6,[r9+96]) + a2(pxor xmm7,[r9+112]) + a1(scrypt_ChunkMix_ssse3_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_ssse3_loop:) + a2(and rdx, rdx) + a2(pxor xmm0,[rsi+r9+0]) + a2(pxor xmm1,[rsi+r9+16]) + a2(pxor xmm2,[rsi+r9+32]) + a2(pxor xmm3,[rsi+r9+48]) + a2(pxor xmm4,[rsi+r9+64]) + a2(pxor xmm5,[rsi+r9+80]) + a2(pxor xmm6,[rsi+r9+96]) + a2(pxor xmm7,[rsi+r9+112]) + aj(jz scrypt_ChunkMix_ssse3_no_xor2) + a2(pxor xmm0,[rdx+r9+0]) + a2(pxor xmm1,[rdx+r9+16]) + a2(pxor xmm2,[rdx+r9+32]) + a2(pxor xmm3,[rdx+r9+48]) + a2(pxor xmm4,[rdx+r9+64]) + a2(pxor xmm5,[rdx+r9+80]) + a2(pxor xmm6,[rdx+r9+96]) + a2(pxor xmm7,[rdx+r9+112]) + a1(scrypt_ChunkMix_ssse3_no_xor2:) + a2(movdqa [rsp+0],xmm0) + a2(movdqa [rsp+16],xmm1) + a2(movdqa [rsp+32],xmm2) + a2(movdqa [rsp+48],xmm3) + a2(movdqa [rsp+64],xmm4) + a2(movdqa [rsp+80],xmm5) + a2(movdqa [rsp+96],xmm6) + a2(movdqa [rsp+112],xmm7) + a2(mov rax,8) + a1(scrypt_salsa64_ssse3_loop: ) + a2(movdqa xmm8, xmm0) + a2(movdqa xmm9, xmm1) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm6, xmm8) + a2(pxor xmm7, xmm9) + a2(movdqa xmm10, xmm0) + a2(movdqa xmm11, xmm1) + a2(paddq xmm10, xmm6) + a2(paddq xmm11, xmm7) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 51) + a2(psrlq xmm11, 51) + a2(psllq xmm8, 13) + a2(psllq xmm9, 13) + a2(pxor xmm4, xmm10) + a2(pxor xmm5, xmm11) + a2(pxor xmm4, xmm8) + a2(pxor xmm5, xmm9) + a2(movdqa xmm10, xmm6) + a2(movdqa xmm11, xmm7) + a2(paddq xmm10, xmm4) + a2(paddq xmm11, xmm5) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 25) + a2(psrlq xmm11, 25) + a2(psllq xmm8, 39) + a2(psllq xmm9, 39) + a2(pxor xmm2, xmm10) + a2(pxor xmm3, xmm11) + a2(pxor xmm2, xmm8) + a2(pxor xmm3, xmm9) + a2(movdqa xmm8, xmm4) + a2(movdqa xmm9, xmm5) + a2(paddq xmm8, 
xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm0, xmm8) + a2(pxor xmm1, xmm9) + a2(movdqa xmm10, xmm2) + a2(movdqa xmm11, xmm3) + a2(movdqa xmm2, xmm6) + a2(movdqa xmm3, xmm7) + a3(palignr xmm2, xmm7, 8) + a3(palignr xmm3, xmm6, 8) + a2(movdqa xmm6, xmm11) + a2(movdqa xmm7, xmm10) + a3(palignr xmm6, xmm10, 8) + a3(palignr xmm7, xmm11, 8) + a2(sub rax, 2) + a2(movdqa xmm8, xmm0) + a2(movdqa xmm9, xmm1) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm6, xmm8) + a2(pxor xmm7, xmm9) + a2(movdqa xmm10, xmm0) + a2(movdqa xmm11, xmm1) + a2(paddq xmm10, xmm6) + a2(paddq xmm11, xmm7) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 51) + a2(psrlq xmm11, 51) + a2(psllq xmm8, 13) + a2(psllq xmm9, 13) + a2(pxor xmm5, xmm10) + a2(pxor xmm4, xmm11) + a2(pxor xmm5, xmm8) + a2(pxor xmm4, xmm9) + a2(movdqa xmm10, xmm6) + a2(movdqa xmm11, xmm7) + a2(paddq xmm10, xmm5) + a2(paddq xmm11, xmm4) + a2(movdqa xmm8, xmm10) + a2(movdqa xmm9, xmm11) + a2(psrlq xmm10, 25) + a2(psrlq xmm11, 25) + a2(psllq xmm8, 39) + a2(psllq xmm9, 39) + a2(pxor xmm2, xmm10) + a2(pxor xmm3, xmm11) + a2(pxor xmm2, xmm8) + a2(pxor xmm3, xmm9) + a2(movdqa xmm8, xmm5) + a2(movdqa xmm9, xmm4) + a2(paddq xmm8, xmm2) + a2(paddq xmm9, xmm3) + a3(pshufd xmm8, xmm8, 0xb1) + a3(pshufd xmm9, xmm9, 0xb1) + a2(pxor xmm0, xmm8) + a2(pxor xmm1, xmm9) + a2(movdqa xmm10, xmm2) + a2(movdqa xmm11, xmm3) + a2(movdqa xmm2, xmm6) + a2(movdqa xmm3, xmm7) + a3(palignr xmm2, xmm7, 8) + a3(palignr xmm3, xmm6, 8) + a2(movdqa xmm6, xmm11) + a2(movdqa xmm7, xmm10) + a3(palignr xmm6, xmm10, 8) + a3(palignr xmm7, xmm11, 8) + aj(ja scrypt_salsa64_ssse3_loop) + a2(paddq xmm0,[rsp+0]) + a2(paddq xmm1,[rsp+16]) + a2(paddq xmm2,[rsp+32]) + a2(paddq xmm3,[rsp+48]) + a2(paddq xmm4,[rsp+64]) + a2(paddq xmm5,[rsp+80]) + a2(paddq xmm6,[rsp+96]) + a2(paddq xmm7,[rsp+112]) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0xff) + a2(add r9,128) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(movdqa [rax+0],xmm0) + a2(movdqa [rax+16],xmm1) + a2(movdqa [rax+32],xmm2) + a2(movdqa [rax+48],xmm3) + a2(movdqa [rax+64],xmm4) + a2(movdqa [rax+80],xmm5) + a2(movdqa [rax+96],xmm6) + a2(movdqa [rax+112],xmm7) + aj(jne scrypt_ChunkMix_ssse3_loop) + a2(mov rsp, rbp) + a1(pop rbp) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_ssse3) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) + +#define SCRYPT_SALSA64_SSSE3 + +static void asm_calling_convention +scrypt_ChunkMix_ssse3(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + x4 = xmmp[4]; + x5 = xmmp[5]; + x6 = xmmp[6]; + x7 = xmmp[7]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, 
half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + t4 = x4; + t5 = x5; + t6 = x6; + t7 = x7; + + for (rounds = 8; rounds; rounds -= 2) { + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, x0); + z1 = _mm_add_epi64(x7, x1); + z2 = _mm_srli_epi64(z0, 64-13); + z3 = _mm_srli_epi64(z1, 64-13); + z0 = _mm_slli_epi64(z0, 13); + z1 = _mm_slli_epi64(z1, 13); + x4 = _mm_xor_si128(x4, z2); + x5 = _mm_xor_si128(x5, z3); + x4 = _mm_xor_si128(x4, z0); + x5 = _mm_xor_si128(x5, z1); + + z0 = _mm_add_epi64(x4, x6); + z1 = _mm_add_epi64(x5, x7); + z2 = _mm_srli_epi64(z0, 64-39); + z3 = _mm_srli_epi64(z1, 64-39); + z0 = _mm_slli_epi64(z0, 39); + z1 = _mm_slli_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z2); + x3 = _mm_xor_si128(x3, z3); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x4); + z1 = _mm_add_epi64(x3, x5); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x2; + z1 = x3; + x2 = _mm_alignr_epi8(x6, x7, 8); + x3 = _mm_alignr_epi8(x7, x6, 8); + x6 = _mm_alignr_epi8(z1, z0, 8); + x7 = _mm_alignr_epi8(z0, z1, 8); + + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, x0); + z1 = _mm_add_epi64(x7, x1); + z2 = _mm_srli_epi64(z0, 64-13); + z3 = _mm_srli_epi64(z1, 64-13); + z0 = _mm_slli_epi64(z0, 13); + z1 = _mm_slli_epi64(z1, 13); + x5 = _mm_xor_si128(x5, z2); + x4 = _mm_xor_si128(x4, z3); + x5 = _mm_xor_si128(x5, z0); + x4 = _mm_xor_si128(x4, z1); + + z0 = _mm_add_epi64(x5, x6); + z1 = _mm_add_epi64(x4, x7); + z2 = _mm_srli_epi64(z0, 64-39); + z3 = _mm_srli_epi64(z1, 64-39); + z0 = _mm_slli_epi64(z0, 39); + z1 = _mm_slli_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z2); + x3 = _mm_xor_si128(x3, z3); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x5); + z1 = _mm_add_epi64(x3, x4); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x2; + z1 = x3; + x2 = _mm_alignr_epi8(x6, x7, 8); + x3 = _mm_alignr_epi8(x7, x6, 8); + x6 = _mm_alignr_epi8(z1, z0, 8); + x7 = _mm_alignr_epi8(z0, z1, 8); + } + + x0 = _mm_add_epi64(x0, t0); + x1 = _mm_add_epi64(x1, t1); + x2 = _mm_add_epi64(x2, t2); + x3 = _mm_add_epi64(x3, t3); + x4 = _mm_add_epi64(x4, t4); + x5 = _mm_add_epi64(x5, t5); + x6 = _mm_add_epi64(x6, t6); + x7 = _mm_add_epi64(x7, 
t7); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + xmmp[4] = x4; + xmmp[5] = x5; + xmmp[6] = x6; + xmmp[7] = x7; + } +} + +#endif + +#if defined(SCRYPT_SALSA64_SSSE3) + /* uses salsa64_core_tangle_sse2 */ + + #undef SCRYPT_MIX + #define SCRYPT_MIX "Salsa64/8-SSSE3" + #undef SCRYPT_SALSA64_INCLUDED + #define SCRYPT_SALSA64_INCLUDED +#endif diff --git a/scryptjane/scrypt-jane-mix_salsa64-xop.h b/scryptjane/scrypt-jane-mix_salsa64-xop.h new file mode 100644 index 00000000..d51d1121 --- /dev/null +++ b/scryptjane/scrypt-jane-mix_salsa64-xop.h @@ -0,0 +1,335 @@ +/* x64 */ +#if defined(X86_64ASM_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS) + +#define SCRYPT_SALSA64_XOP + +asm_naked_fn_proto(void, scrypt_ChunkMix_xop)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_xop) + a1(push rbp) + a2(mov rbp, rsp) + a2(and rsp, ~63) + a2(sub rsp, 128) + a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */ + a2(shl rcx,7) + a2(lea r9,[rcx-128]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(vmovdqa xmm0,[rax+0]) + a2(vmovdqa xmm1,[rax+16]) + a2(vmovdqa xmm2,[rax+32]) + a2(vmovdqa xmm3,[rax+48]) + a2(vmovdqa xmm4,[rax+64]) + a2(vmovdqa xmm5,[rax+80]) + a2(vmovdqa xmm6,[rax+96]) + a2(vmovdqa xmm7,[rax+112]) + aj(jz scrypt_ChunkMix_xop_no_xor1) + a3(vpxor xmm0,xmm0,[r9+0]) + a3(vpxor xmm1,xmm1,[r9+16]) + a3(vpxor xmm2,xmm2,[r9+32]) + a3(vpxor xmm3,xmm3,[r9+48]) + a3(vpxor xmm4,xmm4,[r9+64]) + a3(vpxor xmm5,xmm5,[r9+80]) + a3(vpxor xmm6,xmm6,[r9+96]) + a3(vpxor xmm7,xmm7,[r9+112]) + a1(scrypt_ChunkMix_xop_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_xop_loop:) + a2(and rdx, rdx) + a3(vpxor xmm0,xmm0,[rsi+r9+0]) + a3(vpxor xmm1,xmm1,[rsi+r9+16]) + a3(vpxor xmm2,xmm2,[rsi+r9+32]) + a3(vpxor xmm3,xmm3,[rsi+r9+48]) + a3(vpxor xmm4,xmm4,[rsi+r9+64]) + a3(vpxor xmm5,xmm5,[rsi+r9+80]) + a3(vpxor xmm6,xmm6,[rsi+r9+96]) + a3(vpxor xmm7,xmm7,[rsi+r9+112]) + aj(jz scrypt_ChunkMix_xop_no_xor2) + a3(vpxor xmm0,xmm0,[rdx+r9+0]) + a3(vpxor xmm1,xmm1,[rdx+r9+16]) + a3(vpxor xmm2,xmm2,[rdx+r9+32]) + a3(vpxor xmm3,xmm3,[rdx+r9+48]) + a3(vpxor xmm4,xmm4,[rdx+r9+64]) + a3(vpxor xmm5,xmm5,[rdx+r9+80]) + a3(vpxor xmm6,xmm6,[rdx+r9+96]) + a3(vpxor xmm7,xmm7,[rdx+r9+112]) + a1(scrypt_ChunkMix_xop_no_xor2:) + a2(vmovdqa [rsp+0],xmm0) + a2(vmovdqa [rsp+16],xmm1) + a2(vmovdqa [rsp+32],xmm2) + a2(vmovdqa [rsp+48],xmm3) + a2(vmovdqa [rsp+64],xmm4) + a2(vmovdqa [rsp+80],xmm5) + a2(vmovdqa [rsp+96],xmm6) + a2(vmovdqa [rsp+112],xmm7) + a2(mov rax,8) + a1(scrypt_salsa64_xop_loop: ) + a3(vpaddq xmm8, xmm0, xmm2) + a3(vpaddq xmm9, xmm1, xmm3) + a3(vpshufd xmm8, xmm8, 0xb1) + a3(vpshufd xmm9, xmm9, 0xb1) + a3(vpxor xmm6, xmm6, xmm8) + a3(vpxor xmm7, xmm7, xmm9) + a3(vpaddq xmm10, xmm0, xmm6) + a3(vpaddq xmm11, xmm1, xmm7) + a3(vprotq xmm10, xmm10, 13) + a3(vprotq xmm11, xmm11, 13) + a3(vpxor xmm4, xmm4, xmm10) + a3(vpxor xmm5, xmm5, xmm11) + a3(vpaddq xmm8, xmm6, xmm4) + a3(vpaddq xmm9, xmm7, xmm5) + a3(vprotq xmm8, xmm8, 39) + a3(vprotq xmm9, xmm9, 39) + a3(vpxor xmm2, xmm2, xmm8) + a3(vpxor xmm3, xmm3, xmm9) + a3(vpaddq xmm10, xmm4, xmm2) + a3(vpaddq xmm11, xmm5, xmm3) + a3(vpshufd xmm10, xmm10, 0xb1) + a3(vpshufd xmm11, xmm11, 0xb1) + a3(vpxor 
xmm0, xmm0, xmm10) + a3(vpxor xmm1, xmm1, xmm11) + a2(vmovdqa xmm8, xmm2) + a2(vmovdqa xmm9, xmm3) + a4(vpalignr xmm2, xmm6, xmm7, 8) + a4(vpalignr xmm3, xmm7, xmm6, 8) + a4(vpalignr xmm6, xmm9, xmm8, 8) + a4(vpalignr xmm7, xmm8, xmm9, 8) + a3(vpaddq xmm10, xmm0, xmm2) + a3(vpaddq xmm11, xmm1, xmm3) + a3(vpshufd xmm10, xmm10, 0xb1) + a3(vpshufd xmm11, xmm11, 0xb1) + a3(vpxor xmm6, xmm6, xmm10) + a3(vpxor xmm7, xmm7, xmm11) + a3(vpaddq xmm8, xmm0, xmm6) + a3(vpaddq xmm9, xmm1, xmm7) + a3(vprotq xmm8, xmm8, 13) + a3(vprotq xmm9, xmm9, 13) + a3(vpxor xmm5, xmm5, xmm8) + a3(vpxor xmm4, xmm4, xmm9) + a3(vpaddq xmm10, xmm6, xmm5) + a3(vpaddq xmm11, xmm7, xmm4) + a3(vprotq xmm10, xmm10, 39) + a3(vprotq xmm11, xmm11, 39) + a3(vpxor xmm2, xmm2, xmm10) + a3(vpxor xmm3, xmm3, xmm11) + a3(vpaddq xmm8, xmm5, xmm2) + a3(vpaddq xmm9, xmm4, xmm3) + a3(vpshufd xmm8, xmm8, 0xb1) + a3(vpshufd xmm9, xmm9, 0xb1) + a3(vpxor xmm0, xmm0, xmm8) + a3(vpxor xmm1, xmm1, xmm9) + a2(vmovdqa xmm10, xmm2) + a2(vmovdqa xmm11, xmm3) + a4(vpalignr xmm2, xmm6, xmm7, 8) + a4(vpalignr xmm3, xmm7, xmm6, 8) + a4(vpalignr xmm6, xmm11, xmm10, 8) + a4(vpalignr xmm7, xmm10, xmm11, 8) + a2(sub rax, 2) + aj(ja scrypt_salsa64_xop_loop) + a3(vpaddq xmm0,xmm0,[rsp+0]) + a3(vpaddq xmm1,xmm1,[rsp+16]) + a3(vpaddq xmm2,xmm2,[rsp+32]) + a3(vpaddq xmm3,xmm3,[rsp+48]) + a3(vpaddq xmm4,xmm4,[rsp+64]) + a3(vpaddq xmm5,xmm5,[rsp+80]) + a3(vpaddq xmm6,xmm6,[rsp+96]) + a3(vpaddq xmm7,xmm7,[rsp+112]) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0xff) + a2(add r9,128) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(vmovdqa [rax+0],xmm0) + a2(vmovdqa [rax+16],xmm1) + a2(vmovdqa [rax+32],xmm2) + a2(vmovdqa [rax+48],xmm3) + a2(vmovdqa [rax+64],xmm4) + a2(vmovdqa [rax+80],xmm5) + a2(vmovdqa [rax+96],xmm6) + a2(vmovdqa [rax+112],xmm7) + aj(jne scrypt_ChunkMix_xop_loop) + a2(mov rsp, rbp) + a1(pop rbp) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_xop) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) + +#define SCRYPT_SALSA64_XOP + +static void asm_calling_convention +scrypt_ChunkMix_xop(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + x4 = xmmp[4]; + x5 = xmmp[5]; + x6 = xmmp[6]; + x7 = xmmp[7]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = 
_mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + x4 = _mm_xor_si128(x4, xmmp[4]); + x5 = _mm_xor_si128(x5, xmmp[5]); + x6 = _mm_xor_si128(x6, xmmp[6]); + x7 = _mm_xor_si128(x7, xmmp[7]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + t4 = x4; + t5 = x5; + t6 = x6; + t7 = x7; + + for (rounds = 8; rounds; rounds -= 2) { + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, x0); + z1 = _mm_add_epi64(x7, x1); + z0 = _mm_roti_epi64(z0, 13); + z1 = _mm_roti_epi64(z1, 13); + x4 = _mm_xor_si128(x4, z0); + x5 = _mm_xor_si128(x5, z1); + + z0 = _mm_add_epi64(x4, x6); + z1 = _mm_add_epi64(x5, x7); + z0 = _mm_roti_epi64(z0, 39); + z1 = _mm_roti_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x4); + z1 = _mm_add_epi64(x3, x5); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x2; + z1 = x3; + x2 = _mm_alignr_epi8(x6, x7, 8); + x3 = _mm_alignr_epi8(x7, x6, 8); + x6 = _mm_alignr_epi8(z1, z0, 8); + x7 = _mm_alignr_epi8(z0, z1, 8); + + z0 = _mm_add_epi64(x0, x2); + z1 = _mm_add_epi64(x1, x3); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x6 = _mm_xor_si128(x6, z0); + x7 = _mm_xor_si128(x7, z1); + + z0 = _mm_add_epi64(x6, x0); + z1 = _mm_add_epi64(x7, x1); + z0 = _mm_roti_epi64(z0, 13); + z1 = _mm_roti_epi64(z1, 13); + x5 = _mm_xor_si128(x5, z0); + x4 = _mm_xor_si128(x4, z1); + + z0 = _mm_add_epi64(x5, x6); + z1 = _mm_add_epi64(x4, x7); + z0 = _mm_roti_epi64(z0, 39); + z1 = _mm_roti_epi64(z1, 39); + x2 = _mm_xor_si128(x2, z0); + x3 = _mm_xor_si128(x3, z1); + + z0 = _mm_add_epi64(x2, x5); + z1 = _mm_add_epi64(x3, x4); + z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); + z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); + x0 = _mm_xor_si128(x0, z0); + x1 = _mm_xor_si128(x1, z1); + + z0 = x2; + z1 = x3; + x2 = _mm_alignr_epi8(x6, x7, 8); + x3 = _mm_alignr_epi8(x7, x6, 8); + x6 = _mm_alignr_epi8(z1, z0, 8); + x7 = _mm_alignr_epi8(z0, z1, 8); + } + + x0 = _mm_add_epi64(x0, t0); + x1 = _mm_add_epi64(x1, t1); + x2 = _mm_add_epi64(x2, t2); + x3 = _mm_add_epi64(x3, t3); + x4 = _mm_add_epi64(x4, t4); + x5 = _mm_add_epi64(x5, t5); + x6 = _mm_add_epi64(x6, t6); + x7 = _mm_add_epi64(x7, t7); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + xmmp[4] = x4; + xmmp[5] = x5; + xmmp[6] = x6; + xmmp[7] = x7; + } +} + +#endif + +#if defined(SCRYPT_SALSA64_XOP) + /* uses salsa64_core_tangle_sse2 */ + + #undef SCRYPT_MIX + #define SCRYPT_MIX "Salsa64/8-XOP" + #undef SCRYPT_SALSA64_INCLUDED + #define SCRYPT_SALSA64_INCLUDED +#endif diff --git a/scryptjane/scrypt-jane-mix_salsa64.h b/scryptjane/scrypt-jane-mix_salsa64.h new file mode 100644 index 00000000..2aec04f3 --- /dev/null +++ b/scryptjane/scrypt-jane-mix_salsa64.h @@ -0,0 +1,41 @@ +#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED) + +#undef SCRYPT_MIX +#define SCRYPT_MIX "Salsa64/8 Ref" + +#undef SCRYPT_SALSA64_INCLUDED +#define SCRYPT_SALSA64_INCLUDED +#define SCRYPT_SALSA64_BASIC + +static void +salsa64_core_basic(uint64_t state[16]) { + const size_t 
rounds = 8; + uint64_t v[16], t; + size_t i; + + for (i = 0; i < 16; i++) v[i] = state[i]; + + #define G(a,b,c,d) \ + t = v[a]+v[d]; t = ROTL64(t, 32); v[b] ^= t; \ + t = v[b]+v[a]; t = ROTL64(t, 13); v[c] ^= t; \ + t = v[c]+v[b]; t = ROTL64(t, 39); v[d] ^= t; \ + t = v[d]+v[c]; t = ROTL64(t, 32); v[a] ^= t; \ + + for (i = 0; i < rounds; i += 2) { + G( 0, 4, 8,12); + G( 5, 9,13, 1); + G(10,14, 2, 6); + G(15, 3, 7,11); + G( 0, 1, 2, 3); + G( 5, 6, 7, 4); + G(10,11, 8, 9); + G(15,12,13,14); + } + + for (i = 0; i < 16; i++) state[i] += v[i]; + + #undef G +} + +#endif + diff --git a/scryptjane/scrypt-jane-portable-x86.h b/scryptjane/scrypt-jane-portable-x86.h index 03282fa8..396a7bd8 100644 --- a/scryptjane/scrypt-jane-portable-x86.h +++ b/scryptjane/scrypt-jane-portable-x86.h @@ -1,15 +1,20 @@ #if defined(CPU_X86) && (defined(COMPILER_MSVC) || defined(COMPILER_GCC)) #define X86ASM + /* gcc 2.95 royally screws up stack alignments on variables */ - #if (defined(COMPILER_MSVC6PP_AND_LATER) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 30000))) + #if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS6PP)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 30000))) #define X86ASM_SSE #define X86ASM_SSE2 #endif - #if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= 1400)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40102))) + #if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2005)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40102))) #define X86ASM_SSSE3 #endif - #if ((defined(COMPILER_GCC) && (COMPILER_GCC >= 40400))) + #if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2010SP1)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40400))) #define X86ASM_AVX + #define X86ASM_XOP + #endif + #if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2012)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40700))) + #define X86ASM_AVX2 #endif #endif @@ -21,10 +26,14 @@ #endif #if (COMPILER_GCC >= 40400) #define X86_64ASM_AVX + #define X86_64ASM_XOP + #endif + #if (COMPILER_GCC >= 40700) + #define X86_64ASM_AVX2 #endif #endif -#if defined(COMPILER_MSVC) +#if defined(COMPILER_MSVC) && (defined(CPU_X86_FORCE_INTRINSICS) || defined(CPU_X86_64)) #define X86_INTRINSIC #if defined(CPU_X86_64) || defined(X86ASM_SSE) #define X86_INTRINSIC_SSE @@ -32,17 +41,16 @@ #if defined(CPU_X86_64) || defined(X86ASM_SSE2) #define X86_INTRINSIC_SSE2 #endif - #if (COMPILER_MSVC >= 1400) + #if (COMPILER_MSVC >= COMPILER_MSVC_VS2005) #define X86_INTRINSIC_SSSE3 #endif -#endif - -#if defined(COMPILER_MSVC) && defined(CPU_X86_64) - #define X86_64USE_INTRINSIC -#endif - -#if defined(COMPILER_MSVC) && defined(CPU_X86_64) - #define X86_64USE_INTRINSIC + #if (COMPILER_MSVC >= COMPILER_MSVC_VS2010SP1) + #define X86_INTRINSIC_AVX + #define X86_INTRINSIC_XOP + #endif + #if (COMPILER_MSVC >= COMPILER_MSVC_VS2012) + #define X86_INTRINSIC_AVX2 + #endif #endif #if defined(COMPILER_GCC) && defined(CPU_X86_FORCE_INTRINSICS) @@ -59,12 +67,17 @@ #if defined(__AVX__) #define X86_INTRINSIC_AVX #endif + #if defined(__XOP__) + #define X86_INTRINSIC_XOP + #endif + #if defined(__AVX2__) + #define X86_INTRINSIC_AVX2 + #endif #endif /* only use simd on windows (or SSE2 on gcc)! 
*/ #if defined(CPU_X86_FORCE_INTRINSICS) || defined(X86_INTRINSIC) #if defined(X86_INTRINSIC_SSE) - #define X86_INTRINSIC #include #include typedef __m64 qmm; @@ -72,17 +85,27 @@ typedef __m128d xmmd; #endif #if defined(X86_INTRINSIC_SSE2) - #define X86_INTRINSIC_SSE2 #include typedef __m128i xmmi; #endif #if defined(X86_INTRINSIC_SSSE3) - #define X86_INTRINSIC_SSSE3 #include #endif + #if defined(X86_INTRINSIC_AVX) + #include + #endif + #if defined(X86_INTRINSIC_XOP) + #if defined(COMPILER_MSVC) + #include + #else + #include + #endif + #endif + #if defined(X86_INTRINSIC_AVX2) + typedef __m256i ymmi; + #endif #endif - #if defined(X86_INTRINSIC_SSE2) typedef union packedelem8_t { uint8_t u[16]; @@ -115,11 +138,9 @@ } packedelem64; #endif -#if defined(X86_INTRINSIC_SSSE3) || defined(X86ASM_SSSE3) || defined(X86_64ASM_SSSE3) - const packedelem8 MM16 ssse3_rotr16_64bit = {{2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9}}; - const packedelem8 MM16 ssse3_rotl16_32bit = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}}; - const packedelem8 MM16 ssse3_rotl8_32bit = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}}; - const packedelem8 MM16 ssse3_endian_swap_64bit = {{7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8}}; +#if defined(X86_INTRINSIC_SSSE3) + static const packedelem8 ALIGN(16) ssse3_rotl16_32bit = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}}; + static const packedelem8 ALIGN(16) ssse3_rotl8_32bit = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}}; #endif /* @@ -130,7 +151,8 @@ a1(..) a2(.., ..) a3(.., .., ..) - a1(ret) + 64bit OR 0 paramters: a1(ret) + 32bit AND n parameters: aret(4n), eg aret(16) for 4 parameters asm_naked_fn_end(name) */ @@ -142,12 +164,13 @@ #define a2(x, y) __asm {x, y} #define a3(x, y, z) __asm {x, y, z} #define a4(x, y, z, w) __asm {x, y, z, w} - #define al(x) __asm {label##x:} - #define aj(x, y, z) __asm {x label##y} + #define aj(x) __asm {x} #define asm_align8 a1(ALIGN 8) #define asm_align16 a1(ALIGN 16) - #define asm_naked_fn_proto(type, fn) static NAKED type STDCALL fn + #define asm_calling_convention STDCALL + #define aret(n) a1(ret n) + #define asm_naked_fn_proto(type, fn) static NAKED type asm_calling_convention fn #define asm_naked_fn(fn) { #define asm_naked_fn_end(fn) } #elif defined(COMPILER_GCC) @@ -155,21 +178,66 @@ #define GNU_AS2(x, y) #x ", " #y ";\n" #define GNU_AS3(x, y, z) #x ", " #y ", " #z ";\n" #define GNU_AS4(x, y, z, w) #x ", " #y ", " #z ", " #w ";\n" - #define GNU_ASL(x) "\n" #x ":\n" - #define GNU_ASJ(x, y, z) #x " " #y #z ";" + #define GNU_ASFN(x) "\n_" #x ":\n" #x ":\n" + #define GNU_ASJ(x) ".att_syntax prefix\n" #x "\n.intel_syntax noprefix\n" #define a1(x) GNU_AS1(x) #define a2(x, y) GNU_AS2(x, y) #define a3(x, y, z) GNU_AS3(x, y, z) #define a4(x, y, z, w) GNU_AS4(x, y, z, w) - #define al(x) GNU_ASL(x) - #define aj(x, y, z) GNU_ASJ(x, y, z) - #define asm_align8 a1(.align 8) - #define asm_align16 a1(.align 16) - - #define asm_naked_fn_proto(type, fn) extern type STDCALL fn - #define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASL(fn) - #define asm_naked_fn_end(fn) ".att_syntax prefix;\n.type " #fn ",@function\n.size " #fn ",.-" #fn "\n" ); + #define aj(x) GNU_ASJ(x) + #define asm_align8 ".p2align 3,,7" + #define asm_align16 ".p2align 4,,15" + + #if defined(OS_WINDOWS) + #define asm_calling_convention CDECL + #define aret(n) a1(ret) + + #if defined(X86_64ASM) + #define asm_naked_fn(fn) ; __asm__ ( \ + ".text\n" \ + asm_align16 GNU_ASFN(fn) \ + "subq $136, %rsp;" \ + "movdqa %xmm6, 0(%rsp);" \ + "movdqa %xmm7, 16(%rsp);" \ + "movdqa %xmm8, 
32(%rsp);" \ + "movdqa %xmm9, 48(%rsp);" \ + "movdqa %xmm10, 64(%rsp);" \ + "movdqa %xmm11, 80(%rsp);" \ + "movdqa %xmm12, 96(%rsp);" \ + "movq %rdi, 112(%rsp);" \ + "movq %rsi, 120(%rsp);" \ + "movq %rcx, %rdi;" \ + "movq %rdx, %rsi;" \ + "movq %r8, %rdx;" \ + "movq %r9, %rcx;" \ + "call 1f;" \ + "movdqa 0(%rsp), %xmm6;" \ + "movdqa 16(%rsp), %xmm7;" \ + "movdqa 32(%rsp), %xmm8;" \ + "movdqa 48(%rsp), %xmm9;" \ + "movdqa 64(%rsp), %xmm10;" \ + "movdqa 80(%rsp), %xmm11;" \ + "movdqa 96(%rsp), %xmm12;" \ + "movq 112(%rsp), %rdi;" \ + "movq 120(%rsp), %rsi;" \ + "addq $136, %rsp;" \ + "ret;" \ + ".intel_syntax noprefix;" \ + ".p2align 4,,15;" \ + "1:;" + #else + #define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn) + #endif + #else + #define asm_calling_convention STDCALL + #define aret(n) a1(ret n) + #define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn) + #endif + + #define asm_naked_fn_proto(type, fn) extern type asm_calling_convention fn + #define asm_naked_fn_end(fn) ".att_syntax prefix;\n" ); + #define asm_gcc() __asm__ __volatile__(".intel_syntax noprefix;\n" #define asm_gcc_parms() ".att_syntax prefix;" #define asm_gcc_trashed() __asm__ __volatile__("" ::: @@ -191,7 +259,9 @@ typedef enum cpu_flags_x86_t { cpu_ssse3 = 1 << 4, cpu_sse4_1 = 1 << 5, cpu_sse4_2 = 1 << 6, - cpu_avx = 1 << 7 + cpu_avx = 1 << 7, + cpu_xop = 1 << 8, + cpu_avx2 = 1 << 9 } cpu_flags_x86; typedef enum cpu_vendors_x86_t { @@ -238,6 +308,7 @@ get_cpuid(x86_regs *regs, uint32_t flags) { asm_gcc() a1(push cpuid_bx) + a2(xor ecx, ecx) a1(cpuid) a2(mov [%1 + 0], eax) a2(mov [%1 + 4], ebx) @@ -274,7 +345,7 @@ detect_cpu(void) { union { uint8_t s[12]; uint32_t i[3]; } vendor_string; cpu_vendors_x86 vendor = cpu_nobody; x86_regs regs; - uint32_t max_level; + uint32_t max_level, max_ext_level; size_t cpu_flags = 0; #if defined(X86ASM_AVX) || defined(X86_64ASM_AVX) uint64_t xgetbv_flags; @@ -320,7 +391,22 @@ detect_cpu(void) { if (regs.edx & (1 << 26)) cpu_flags |= cpu_sse2; if (regs.edx & (1 << 25)) cpu_flags |= cpu_sse; if (regs.edx & (1 << 23)) cpu_flags |= cpu_mmx; - + + if (cpu_flags & cpu_avx) { + if (max_level >= 7) { + get_cpuid(®s, 7); + if (regs.ebx & (1 << 5)) cpu_flags |= cpu_avx2; + } + + get_cpuid(®s, 0x80000000); + max_ext_level = regs.eax; + if (max_ext_level >= 0x80000001) { + get_cpuid(®s, 0x80000001); + if (regs.ecx & (1 << 11)) cpu_flags |= cpu_xop; + } + } + + #if defined(SCRYPT_TEST_SPEED) cpu_flags &= cpu_detect_mask; #endif @@ -331,7 +417,9 @@ detect_cpu(void) { #if defined(SCRYPT_TEST_SPEED) static const char * get_top_cpuflag_desc(size_t flag) { - if (flag & cpu_avx) return "AVX"; + if (flag & cpu_avx2) return "AVX2"; + else if (flag & cpu_xop) return "XOP"; + else if (flag & cpu_avx) return "AVX"; else if (flag & cpu_sse4_2) return "SSE4.2"; else if (flag & cpu_sse4_1) return "SSE4.1"; else if (flag & cpu_ssse3) return "SSSE3"; @@ -344,6 +432,16 @@ get_top_cpuflag_desc(size_t flag) { /* enable the highest system-wide option */ #if defined(SCRYPT_CHOOSE_COMPILETIME) + #if !defined(__AVX2__) + #undef X86_64ASM_AVX2 + #undef X86ASM_AVX2 + #undef X86_INTRINSIC_AVX2 + #endif + #if !defined(__XOP__) + #undef X86_64ASM_XOP + #undef X86ASM_XOP + #undef X86_INTRINSIC_XOP + #endif #if !defined(__AVX__) #undef X86_64ASM_AVX #undef X86ASM_AVX diff --git a/scryptjane/scrypt-jane-portable.h b/scryptjane/scrypt-jane-portable.h index 33c8c2ca..e83e3149 100644 --- a/scryptjane/scrypt-jane-portable.h +++ 
b/scryptjane/scrypt-jane-portable.h @@ -36,14 +36,29 @@ /* determine compiler */ #if defined(_MSC_VER) - #define COMPILER_MSVC _MSC_VER - #if ((COMPILER_MSVC > 1200) || defined(_mm_free)) - #define COMPILER_MSVC6PP_AND_LATER + #define COMPILER_MSVC_VS6 120000000 + #define COMPILER_MSVC_VS6PP 121000000 + #define COMPILER_MSVC_VS2002 130000000 + #define COMPILER_MSVC_VS2003 131000000 + #define COMPILER_MSVC_VS2005 140050727 + #define COMPILER_MSVC_VS2008 150000000 + #define COMPILER_MSVC_VS2008SP1 150030729 + #define COMPILER_MSVC_VS2010 160000000 + #define COMPILER_MSVC_VS2010SP1 160040219 + #define COMPILER_MSVC_VS2012RC 170000000 + #define COMPILER_MSVC_VS2012 170050727 + + #if _MSC_FULL_VER > 100000000 + #define COMPILER_MSVC (_MSC_FULL_VER) + #else + #define COMPILER_MSVC (_MSC_FULL_VER * 10) #endif - #if (COMPILER_MSVC >= 1500) - #define COMPILER_HAS_TMMINTRIN + + #if ((_MSC_VER == 1200) && defined(_mm_free)) + #undef COMPILER_MSVC + #define COMPILER_MSVC COMPILER_MSVC_VS6PP #endif - + #pragma warning(disable : 4127) /* conditional expression is constant */ #pragma warning(disable : 4100) /* unreferenced formal parameter */ @@ -65,6 +80,8 @@ #define ROTR64(a,b) _rotr64(a,b) #undef NOINLINE #define NOINLINE __declspec(noinline) + #undef NORETURN + #define NORETURN #undef INLINE #define INLINE __forceinline #undef FASTCALL @@ -75,7 +92,7 @@ #define STDCALL __stdcall #undef NAKED #define NAKED __declspec(naked) - #define MM16 __declspec(align(16)) + #define ALIGN(n) __declspec(align(n)) #endif #if defined(__ICC) #define COMPILER_INTEL @@ -97,6 +114,12 @@ #else #define NOINLINE #endif + #undef NORETURN + #if (COMPILER_GCC >= 30000) + #define NORETURN __attribute__((noreturn)) + #else + #define NORETURN + #endif #undef INLINE #if (COMPILER_GCC >= 30000) #define INLINE __attribute__((always_inline)) @@ -113,7 +136,7 @@ #define CDECL __attribute__((cdecl)) #undef STDCALL #define STDCALL __attribute__((stdcall)) - #define MM16 __attribute__((aligned(16))) + #define ALIGN(n) __attribute__((aligned(n))) #include #endif #if defined(__MINGW32__) || defined(__MINGW64__) @@ -247,7 +270,7 @@ scrypt_verify(const uint8_t *x, const uint8_t *y, size_t len) { return (1 & ((differentbits - 1) >> 8)); } -void +static void scrypt_ensure_zero(void *p, size_t len) { #if ((defined(CPU_X86) || defined(CPU_X86_64)) && defined(COMPILER_MSVC)) __stosb((unsigned char *)p, 0, len); @@ -279,3 +302,6 @@ scrypt_ensure_zero(void *p, size_t len) { #include "scrypt-jane-portable-x86.h" +#if !defined(asm_calling_convention) +#define asm_calling_convention +#endif diff --git a/scryptjane/scrypt-jane-romix-basic.h b/scryptjane/scrypt-jane-romix-basic.h index ca1df02d..57ba649f 100644 --- a/scryptjane/scrypt-jane-romix-basic.h +++ b/scryptjane/scrypt-jane-romix-basic.h @@ -4,12 +4,13 @@ typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, sc #endif /* romix pre/post nop function */ -static void STDCALL +static void asm_calling_convention scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) { + (void)blocks; (void)nblocks; } /* romix pre/post endian conversion function */ -static void STDCALL +static void asm_calling_convention scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) { #if !defined(CPU_LE) static const union { uint8_t b[2]; uint16_t w; } endian_test = {{1,0}}; @@ -20,18 +21,24 @@ scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) { SCRYPT_WORD_ENDIAN_SWAP(blocks[i]); } } +#else + (void)blocks; (void)nblocks; #endif } /* chunkmix test function */ 
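
A note on the calling-convention change carried through this hunk and the next: the chunkmix/blockfix typedefs that follow, like the nop and endian-convert helpers just above, drop the hard-coded STDCALL in favour of the new asm_calling_convention macro, and scrypt-jane-portable.h now defines that macro as empty whenever no x86-specific header supplies one. A minimal standalone sketch of the pattern; only the macro fallback mirrors the patch, and everything named toy_* is a hypothetical stand-in for the real ChunkMix implementations:

    /* Sketch: asm_calling_convention composed with a function-pointer
     * typedef.  The #if/#define fallback mirrors scrypt-jane-portable.h;
     * toy_mixfn/toy_mix are illustrative names only. */
    #include <stdint.h>
    #include <stdio.h>

    #if !defined(asm_calling_convention)
    #define asm_calling_convention   /* expands to nothing on plain builds */
    #endif

    typedef void (asm_calling_convention *toy_mixfn)(uint32_t *out, const uint32_t *in, uint32_t r);

    static void asm_calling_convention
    toy_mix(uint32_t *out, const uint32_t *in, uint32_t r) {
        uint32_t i;
        for (i = 0; i < r; i++)
            out[i] = in[i] ^ 0xa5a5a5a5u;    /* arbitrary mixing, for illustration only */
    }

    int main(void) {
        uint32_t in[4] = {1, 2, 3, 4}, out[4];
        toy_mixfn fn = toy_mix;              /* pointer and target share the convention */
        fn(out, in, 4);
        printf("%08x\n", (unsigned)out[0]);
        return 0;
    }
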
-typedef void (STDCALL *chunkmixfn)(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r); -typedef void (STDCALL *blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks); +typedef void (asm_calling_convention *chunkmixfn)(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r); +typedef void (asm_calling_convention *blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks); static int scrypt_test_mix_instance(chunkmixfn mixfn, blockfixfn prefn, blockfixfn postfn, const uint8_t expected[16]) { /* r = 2, (2 * r) = 4 blocks in a chunk, 4 * SCRYPT_BLOCK_WORDS total */ const uint32_t r = 2, blocks = 2 * r, words = blocks * SCRYPT_BLOCK_WORDS; - scrypt_mix_word_t MM16 chunk[2][4 * SCRYPT_BLOCK_WORDS], v; +#if (defined(X86ASM_AVX2) || defined(X86_64ASM_AVX2) || defined(X86_INTRINSIC_AVX2)) + scrypt_mix_word_t ALIGN(32) chunk[2][4 * SCRYPT_BLOCK_WORDS], v; +#else + scrypt_mix_word_t ALIGN(16) chunk[2][4 * SCRYPT_BLOCK_WORDS], v; +#endif uint8_t final[16]; size_t i; diff --git a/scryptjane/scrypt-jane-romix-template.h b/scryptjane/scrypt-jane-romix-template.h index 2fd7674e..6bbda621 100644 --- a/scryptjane/scrypt-jane-romix-template.h +++ b/scryptjane/scrypt-jane-romix-template.h @@ -17,9 +17,13 @@ 2*r: number of blocks in the chunk */ -static void STDCALL +static void asm_calling_convention SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r) { - scrypt_mix_word_t MM16 X[SCRYPT_BLOCK_WORDS], *block; +#if (defined(X86ASM_AVX2) || defined(X86_64ASM_AVX2) || defined(X86_INTRINSIC_AVX2)) + scrypt_mix_word_t ALIGN(32) X[SCRYPT_BLOCK_WORDS], *block; +#else + scrypt_mix_word_t ALIGN(16) X[SCRYPT_BLOCK_WORDS], *block; +#endif uint32_t i, j, blocksPerChunk = r * 2, half = 0; /* 1: X = B_{2r - 1} */ @@ -69,7 +73,7 @@ SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *B static void NOINLINE FASTCALL SCRYPT_ROMIX_FN(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N, uint32_t r) { - uint32_t i, j, chunkWords = SCRYPT_BLOCK_WORDS * r * 2; + uint32_t i, j, chunkWords = (uint32_t)(SCRYPT_BLOCK_WORDS * r * 2); scrypt_mix_word_t *block = V; SCRYPT_ROMIX_TANGLE_FN(X, r * 2); diff --git a/scryptjane/scrypt-jane-romix.h b/scryptjane/scrypt-jane-romix.h index faa655a0..84cf6120 100644 --- a/scryptjane/scrypt-jane-romix.h +++ b/scryptjane/scrypt-jane-romix.h @@ -13,11 +13,11 @@ #define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) #if !defined(SCRYPT_CHOOSE_COMPILETIME) static void FASTCALL scrypt_ROMix_error(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r) {} - static scrypt_ROMixfn scrypt_getROMix() { return scrypt_ROMix_error; } + static scrypt_ROMixfn scrypt_getROMix(void) { return scrypt_ROMix_error; } #else static void FASTCALL scrypt_ROMix(scrypt_mix_word_t *X, scrypt_mix_word_t *Y, scrypt_mix_word_t *V, uint32_t N, uint32_t r) {} #endif - static int scrypt_test_mix() { return 0; } + static int scrypt_test_mix(void) { return 0; } #error must define a mix function! 
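
The buffers above also switch from the old MM16 attribute to the parameterized ALIGN(n), so AVX2 builds can request 32-byte alignment for the wider loads while everything else keeps 16. A standalone sketch of how that macro resolves on the two compiler families the patch targets; the macro bodies mirror scrypt-jane-portable.h, while demo_chunk and its size are illustrative, not one of the real scrypt-jane arrays:

    /* Sketch: the ALIGN(n) macro that replaces MM16. */
    #include <stdint.h>
    #include <stdio.h>

    #if defined(_MSC_VER)
    #define ALIGN(n) __declspec(align(n))
    #else
    #define ALIGN(n) __attribute__((aligned(n)))
    #endif

    int main(void) {
        /* 32-byte alignment, as the AVX2 code paths now request */
        ALIGN(32) uint64_t demo_chunk[4 * 16];
        printf("aligned to 32: %s\n",
               (((uintptr_t)demo_chunk & 31) == 0) ? "yes" : "no");
        return 0;
    }
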
#endif diff --git a/scryptjane/scrypt-jane-salsa.h b/scryptjane/scrypt-jane-salsa.h index 0c1604ba..df0a3e0c 100644 --- a/scryptjane/scrypt-jane-salsa.h +++ b/scryptjane/scrypt-jane-salsa.h @@ -11,10 +11,19 @@ typedef uint32_t scrypt_mix_word_t; /* must have these here in case block bytes is ever != 64 */ #include "scrypt-jane-romix-basic.h" +#include "scrypt-jane-mix_salsa-xop.h" #include "scrypt-jane-mix_salsa-avx.h" #include "scrypt-jane-mix_salsa-sse2.h" #include "scrypt-jane-mix_salsa.h" +#if defined(SCRYPT_SALSA_XOP) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_xop + #define SCRYPT_ROMIX_FN scrypt_ROMix_xop + #define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + #if defined(SCRYPT_SALSA_AVX) #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx #define SCRYPT_ROMIX_FN scrypt_ROMix_avx @@ -41,9 +50,15 @@ typedef uint32_t scrypt_mix_word_t; #if !defined(SCRYPT_CHOOSE_COMPILETIME) static scrypt_ROMixfn -scrypt_getROMix() { +scrypt_getROMix(void) { size_t cpuflags = detect_cpu(); +#if defined(SCRYPT_SALSA_XOP) + if (cpuflags & cpu_xop) + return scrypt_ROMix_xop; + else +#endif + #if defined(SCRYPT_SALSA_AVX) if (cpuflags & cpu_avx) return scrypt_ROMix_avx; @@ -63,14 +78,22 @@ scrypt_getROMix() { #if defined(SCRYPT_TEST_SPEED) static size_t -available_implementations() { +available_implementations(void) { + size_t cpuflags = detect_cpu(); size_t flags = 0; +#if defined(SCRYPT_SALSA_XOP) + if (cpuflags & cpu_xop) + flags |= cpu_xop; +#endif + #if defined(SCRYPT_SALSA_AVX) + if (cpuflags & cpu_avx) flags |= cpu_avx; #endif #if defined(SCRYPT_SALSA_SSE2) + if (cpuflags & cpu_sse2) flags |= cpu_sse2; #endif @@ -80,7 +103,7 @@ available_implementations() { static int -scrypt_test_mix() { +scrypt_test_mix(void) { static const uint8_t expected[16] = { 0x41,0x1f,0x2e,0xa3,0xab,0xa3,0x1a,0x34,0x87,0x1d,0x8a,0x1c,0x76,0xa0,0x27,0x66, }; @@ -88,6 +111,11 @@ scrypt_test_mix() { int ret = 1; size_t cpuflags = detect_cpu(); +#if defined(SCRYPT_SALSA_XOP) + if (cpuflags & cpu_xop) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_xop, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected); +#endif + #if defined(SCRYPT_SALSA_AVX) if (cpuflags & cpu_avx) ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected); diff --git a/scryptjane/scrypt-jane-salsa64.h b/scryptjane/scrypt-jane-salsa64.h new file mode 100644 index 00000000..96b78136 --- /dev/null +++ b/scryptjane/scrypt-jane-salsa64.h @@ -0,0 +1,183 @@ +#define SCRYPT_MIX_BASE "Salsa64/8" + +typedef uint64_t scrypt_mix_word_t; + +#define SCRYPT_WORDTO8_LE U64TO8_LE +#define SCRYPT_WORD_ENDIAN_SWAP U64_SWAP + +#define SCRYPT_BLOCK_BYTES 128 +#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) + +/* must have these here in case block bytes is ever != 64 */ +#include "scrypt-jane-romix-basic.h" + +#include "scrypt-jane-mix_salsa64-avx2.h" +#include "scrypt-jane-mix_salsa64-xop.h" +#include "scrypt-jane-mix_salsa64-avx.h" +#include "scrypt-jane-mix_salsa64-ssse3.h" +#include "scrypt-jane-mix_salsa64-sse2.h" +#include "scrypt-jane-mix_salsa64.h" + +#if defined(SCRYPT_SALSA64_AVX2) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx2 + #define SCRYPT_ROMIX_FN scrypt_ROMix_avx2 + #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + +#if defined(SCRYPT_SALSA64_XOP) + 
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_xop + #define SCRYPT_ROMIX_FN scrypt_ROMix_xop + #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + +#if defined(SCRYPT_SALSA64_AVX) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx + #define SCRYPT_ROMIX_FN scrypt_ROMix_avx + #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + +#if defined(SCRYPT_SALSA64_SSSE3) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_ssse3 + #define SCRYPT_ROMIX_FN scrypt_ROMix_ssse3 + #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + +#if defined(SCRYPT_SALSA64_SSE2) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2 + #define SCRYPT_ROMIX_FN scrypt_ROMix_sse2 + #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + +/* cpu agnostic */ +#define SCRYPT_ROMIX_FN scrypt_ROMix_basic +#define SCRYPT_MIX_FN salsa64_core_basic +#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian +#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian +#include "scrypt-jane-romix-template.h" + +#if !defined(SCRYPT_CHOOSE_COMPILETIME) +static scrypt_ROMixfn +scrypt_getROMix(void) { + size_t cpuflags = detect_cpu(); + +#if defined(SCRYPT_SALSA64_AVX2) + if (cpuflags & cpu_avx2) + return scrypt_ROMix_avx2; + else +#endif + +#if defined(SCRYPT_SALSA64_XOP) + if (cpuflags & cpu_xop) + return scrypt_ROMix_xop; + else +#endif + +#if defined(SCRYPT_SALSA64_AVX) + if (cpuflags & cpu_avx) + return scrypt_ROMix_avx; + else +#endif + +#if defined(SCRYPT_SALSA64_SSSE3) + if (cpuflags & cpu_ssse3) + return scrypt_ROMix_ssse3; + else +#endif + +#if defined(SCRYPT_SALSA64_SSE2) + if (cpuflags & cpu_sse2) + return scrypt_ROMix_sse2; + else +#endif + + return scrypt_ROMix_basic; +} +#endif + + +#if defined(SCRYPT_TEST_SPEED) +static size_t +available_implementations(void) { + size_t cpuflags = detect_cpu(); + size_t flags = 0; + +#if defined(SCRYPT_SALSA64_AVX2) + if (cpuflags & cpu_avx2) + flags |= cpu_avx2; +#endif + +#if defined(SCRYPT_SALSA64_XOP) + if (cpuflags & cpu_xop) + flags |= cpu_xop; +#endif + +#if defined(SCRYPT_SALSA64_AVX) + if (cpuflags & cpu_avx) + flags |= cpu_avx; +#endif + +#if defined(SCRYPT_SALSA64_SSSE3) + if (cpuflags & cpu_ssse3) + flags |= cpu_ssse3; +#endif + +#if defined(SCRYPT_SALSA64_SSE2) + if (cpuflags & cpu_sse2) + flags |= cpu_sse2; +#endif + + return flags; +} +#endif + +static int +scrypt_test_mix(void) { + static const uint8_t expected[16] = { + 0xf8,0x92,0x9b,0xf8,0xcc,0x1d,0xce,0x2e,0x13,0x82,0xac,0x96,0xb2,0x6c,0xee,0x2c, + }; + + int ret = 1; + size_t cpuflags = detect_cpu(); + +#if defined(SCRYPT_SALSA64_AVX2) + if (cpuflags & cpu_avx2) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx2, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); +#endif + +#if defined(SCRYPT_SALSA64_XOP) + if (cpuflags & cpu_xop) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_xop, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); +#endif + +#if defined(SCRYPT_SALSA64_AVX) + if (cpuflags & cpu_avx) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); +#endif + +#if 
defined(SCRYPT_SALSA64_SSSE3) + if (cpuflags & cpu_ssse3) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_ssse3, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); +#endif + +#if defined(SCRYPT_SALSA64_SSE2) + if (cpuflags & cpu_sse2) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); +#endif + +#if defined(SCRYPT_SALSA64_BASIC) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected); +#endif + + return ret; +} + diff --git a/scryptjane/scrypt-jane-test-vectors.h b/scryptjane/scrypt-jane-test-vectors.h index a1e4c619..72a72763 100644 --- a/scryptjane/scrypt-jane-test-vectors.h +++ b/scryptjane/scrypt-jane-test-vectors.h @@ -6,7 +6,7 @@ typedef struct scrypt_test_setting_t { static const scrypt_test_setting post_settings[] = { {"", "", 3, 0, 0}, {"password", "NaCl", 9, 3, 4}, - {0} + {0, 0, 0, 0, 0} }; #if defined(SCRYPT_SHA256)
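
The new scrypt-jane-salsa64.h above wires all of the Salsa64/8 kernels into the same dispatch scheme as the 32-bit Salsa path: scrypt_getROMix walks the detect_cpu() flags from fastest to slowest (AVX2, XOP, AVX, SSSE3, SSE2) and falls back to the portable ROMix, and scrypt_test_mix exercises every implementation that was compiled in against a single expected digest. A compressed sketch of that selection idiom; the romix_* functions are simplified stand-ins and only the compile-time-guard-plus-runtime-flag shape mirrors the patch:

    /* Sketch of the scrypt_getROMix selection cascade. */
    #include <stdio.h>

    enum { cpu_sse2 = 1 << 2, cpu_ssse3 = 1 << 4, cpu_avx = 1 << 7,
           cpu_xop = 1 << 8, cpu_avx2 = 1 << 9 };

    typedef void (*romix_fn)(void);

    static void romix_avx2(void)  { puts("Salsa64/8-AVX2"); }
    static void romix_xop(void)   { puts("Salsa64/8-XOP"); }
    static void romix_sse2(void)  { puts("Salsa64/8-SSE2"); }
    static void romix_basic(void) { puts("Salsa64/8 Ref"); }

    static romix_fn
    pick_romix(unsigned long cpuflags) {
        /* each branch sits behind #if defined(SCRYPT_SALSA64_xxx) in the real header */
        if (cpuflags & cpu_avx2) return romix_avx2;
        if (cpuflags & cpu_xop)  return romix_xop;
        if (cpuflags & cpu_sse2) return romix_sse2;
        return romix_basic;        /* cpu-agnostic fallback always present */
    }

    int main(void) {
        pick_romix(cpu_sse2 | cpu_xop)();   /* prints Salsa64/8-XOP */
        return 0;
    }
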