Initial upload v3.4.7

2025-09-17 23:44:27 +00:00 · 2016-09-22 13:16:18 -04:00
parent a3c8079774
commit a35039bc05
480 changed files with 211015 additions and 3 deletions
--- a/algo/argon2/ar2/sj/scrypt-jane-hash.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-hash.h
@@ -0,0 +1,38 @@
+#if defined(SCRYPT_SKEIN512) /* the only hash backend this header can select */
+#include "scrypt-jane-hash_skein512.h"
+#else /* no backend selected: declare empty placeholders for the hash interface, then abort the build with #error */
+	#define SCRYPT_HASH "ERROR"
+	#define SCRYPT_HASH_BLOCK_SIZE 64
+	#define SCRYPT_HASH_DIGEST_SIZE 64
+	typedef struct scrypt_hash_state_t { size_t dummy; } scrypt_hash_state; /* placeholder state with a single unused member */
+	typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
+	static void scrypt_hash_init(scrypt_hash_state *S) {} /* placeholder, does nothing */
+	static void scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {} /* placeholder, does nothing */
+	static void scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {} /* placeholder, leaves hash untouched */
+	static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {0};
+	#error must define a hash function!
+#endif /* SCRYPT_SKEIN512 */
+
+#include "scrypt-jane-pbkdf2.h"
+
+#define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */
+
+static int
+scrypt_test_hash(void) {
+	/* Self-test: hash every prefix (length 0..SCRYPT_TEST_HASH_LEN) of a counting byte pattern, hash the */
+	/* concatenation of those digests, and hand the result plus the expected vector to scrypt_verify(). */
+	uint8_t pattern[SCRYPT_TEST_HASH_LEN];
+	scrypt_hash_digest prefix_digest, combined;
+	scrypt_hash_state outer;
+	size_t len;
+	for (len = SCRYPT_TEST_HASH_LEN; len-- > 0; )
+		pattern[len] = (uint8_t)len; /* byte k holds k truncated to 8 bits */
+	scrypt_hash_init(&outer);
+	for (len = 0; len <= SCRYPT_TEST_HASH_LEN; len++) {
+		scrypt_hash(prefix_digest, pattern, len);
+		scrypt_hash_update(&outer, prefix_digest, sizeof(prefix_digest));
+	}
+	scrypt_hash_finish(&outer, combined);
+	return scrypt_verify(combined, scrypt_test_hash_expected, SCRYPT_HASH_DIGEST_SIZE);
+}
+
--- a/algo/argon2/ar2/sj/scrypt-jane-hash_skein512.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-hash_skein512.h
@@ -0,0 +1,188 @@
+#define SCRYPT_HASH "Skein-512" /* name string of this hash backend */
+#define SCRYPT_HASH_BLOCK_SIZE 64 /* bytes consumed per block by skein512_blocks() */
+#define SCRYPT_HASH_DIGEST_SIZE 64 /* bytes written by scrypt_hash_finish() */
+
+typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
+
+typedef struct scrypt_hash_state_t {
+	uint64_t X[8], T[2]; /* X: chaining words; T: tweak words (T[0] accumulates the byte count, T[1] carries flag bits) */
+	uint32_t leftover; /* number of valid bytes currently held in buffer */
+	uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; /* input that has not been compressed yet, at most one block */
+} scrypt_hash_state;
+
+#include <stdio.h>
+
+static void
+skein512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks, size_t add) { /* compress `blocks` consecutive 64-byte blocks from `in` into S; `add` is added to the tweak counter before each block */
+	uint64_t X[8], key[8], Xt[9+18], T[3+1]; /* X: working state, key: input words, Xt: extended chaining words, T: tweak words plus one scratch slot */
+	size_t r;
+
+	while (blocks--) {
+		T[0] = S->T[0] + add; /* advance the running counter for this block */
+		T[1] = S->T[1];
+		T[2] = T[0] ^ T[1]; /* third tweak word is the XOR of the first two */
+		key[0] = U8TO64_LE(in +  0); Xt[0] = S->X[0]; X[0] = key[0] + Xt[0]; /* X[k] starts as input word + chaining word */
+		key[1] = U8TO64_LE(in +  8); Xt[1] = S->X[1]; X[1] = key[1] + Xt[1];
+		key[2] = U8TO64_LE(in + 16); Xt[2] = S->X[2]; X[2] = key[2] + Xt[2];
+		key[3] = U8TO64_LE(in + 24); Xt[3] = S->X[3]; X[3] = key[3] + Xt[3];
+		key[4] = U8TO64_LE(in + 32); Xt[4] = S->X[4]; X[4] = key[4] + Xt[4];
+		key[5] = U8TO64_LE(in + 40); Xt[5] = S->X[5]; X[5] = key[5] + Xt[5] + T[0]; /* words 5 and 6 also absorb the tweak */
+		key[6] = U8TO64_LE(in + 48); Xt[6] = S->X[6]; X[6] = key[6] + Xt[6] + T[1];
+		key[7] = U8TO64_LE(in + 56); Xt[7] = S->X[7]; X[7] = key[7] + Xt[7];
+		Xt[8] = 0x1BD11BDAA9FC1A22ull ^ Xt[0] ^ Xt[1] ^ Xt[2] ^ Xt[3] ^ Xt[4] ^ Xt[5] ^ Xt[6] ^ Xt[7]; /* ninth word: constant XORed with all eight chaining words */
+		in += SCRYPT_HASH_BLOCK_SIZE;
+
+		for (r = 0; r < 18; r++) /* repeat the nine words so that Xt[r + k] below needs no modulo */
+			Xt[r + 9] = Xt[r + 0];
+
+		for (r = 0; r < 18; r += 2) { /* 9 iterations, each: 16 mix steps, subkey add, 16 mix steps, subkey add */
+			X[0] += X[1]; X[1] = ROTL64(X[1], 46) ^ X[0];
+			X[2] += X[3]; X[3] = ROTL64(X[3], 36) ^ X[2];
+			X[4] += X[5]; X[5] = ROTL64(X[5], 19) ^ X[4];
+			X[6] += X[7]; X[7] = ROTL64(X[7], 37) ^ X[6];
+			X[2] += X[1]; X[1] = ROTL64(X[1], 33) ^ X[2];
+			X[0] += X[3]; X[3] = ROTL64(X[3], 42) ^ X[0];
+			X[6] += X[5]; X[5] = ROTL64(X[5], 14) ^ X[6];
+			X[4] += X[7]; X[7] = ROTL64(X[7], 27) ^ X[4];
+			X[4] += X[1]; X[1] = ROTL64(X[1], 17) ^ X[4];
+			X[6] += X[3]; X[3] = ROTL64(X[3], 49) ^ X[6];
+			X[0] += X[5]; X[5] = ROTL64(X[5], 36) ^ X[0];
+			X[2] += X[7]; X[7] = ROTL64(X[7], 39) ^ X[2];
+			X[6] += X[1]; X[1] = ROTL64(X[1], 44) ^ X[6];
+			X[4] += X[3]; X[3] = ROTL64(X[3], 56) ^ X[4];
+			X[2] += X[5]; X[5] = ROTL64(X[5], 54) ^ X[2];
+			X[0] += X[7]; X[7] = ROTL64(X[7],  9) ^ X[0];
+
+			X[0] += Xt[r + 1]; /* subkey number r + 1: chaining words shifted by r + 1, tweak on words 5 and 6, counter on word 7 */
+			X[1] += Xt[r + 2];
+			X[2] += Xt[r + 3];
+			X[3] += Xt[r + 4];
+			X[4] += Xt[r + 5];
+			X[5] += Xt[r + 6] + T[1];
+			X[6] += Xt[r + 7] + T[2];
+			X[7] += Xt[r + 8] + r + 1;
+
+			T[3] = T[0]; /* rotate the three tweak words by one position, T[3] is scratch */
+			T[0] = T[1];
+			T[1] = T[2];
+			T[2] = T[3];
+
+			X[0] += X[1]; X[1] = ROTL64(X[1], 39) ^ X[0];
+			X[2] += X[3]; X[3] = ROTL64(X[3], 30) ^ X[2];
+			X[4] += X[5]; X[5] = ROTL64(X[5], 34) ^ X[4];
+			X[6] += X[7]; X[7] = ROTL64(X[7], 24) ^ X[6];
+			X[2] += X[1]; X[1] = ROTL64(X[1], 13) ^ X[2];
+			X[0] += X[3]; X[3] = ROTL64(X[3], 17) ^ X[0];
+			X[6] += X[5]; X[5] = ROTL64(X[5], 10) ^ X[6];
+			X[4] += X[7]; X[7] = ROTL64(X[7], 50) ^ X[4];
+			X[4] += X[1]; X[1] = ROTL64(X[1], 25) ^ X[4];
+			X[6] += X[3]; X[3] = ROTL64(X[3], 29) ^ X[6];
+			X[0] += X[5]; X[5] = ROTL64(X[5], 39) ^ X[0];
+			X[2] += X[7]; X[7] = ROTL64(X[7], 43) ^ X[2];
+			X[6] += X[1]; X[1] = ROTL64(X[1],  8) ^ X[6];
+			X[4] += X[3]; X[3] = ROTL64(X[3], 22) ^ X[4];
+			X[2] += X[5]; X[5] = ROTL64(X[5], 56) ^ X[2];
+			X[0] += X[7]; X[7] = ROTL64(X[7], 35) ^ X[0];
+
+			X[0] += Xt[r + 2]; /* subkey number r + 2 */
+			X[1] += Xt[r + 3];
+			X[2] += Xt[r + 4];
+			X[3] += Xt[r + 5];
+			X[4] += Xt[r + 6];
+			X[5] += Xt[r + 7] + T[1];
+			X[6] += Xt[r + 8] + T[2];
+			X[7] += Xt[r + 9] + r + 2;
+
+			T[3] = T[0]; /* second tweak rotation of this iteration */
+			T[0] = T[1];
+			T[1] = T[2];
+			T[2] = T[3];
+		}
+
+		S->X[0] = key[0] ^ X[0]; /* new chaining words = input words XOR the mixed state */
+		S->X[1] = key[1] ^ X[1];
+		S->X[2] = key[2] ^ X[2];
+		S->X[3] = key[3] ^ X[3];
+		S->X[4] = key[4] ^ X[4];
+		S->X[5] = key[5] ^ X[5];
+		S->X[6] = key[6] ^ X[6];
+		S->X[7] = key[7] ^ X[7];
+
+		S->T[0] = T[0]; /* 18 rotations of three words restore the original order, so this stores the advanced counter */
+		S->T[1] = T[1] & ~0x4000000000000000ull; /* clear bit 62 for all following blocks -- presumably Skein's "first block" flag; verify against the Skein spec */
+	}
+}
+
+static void
+scrypt_hash_init(scrypt_hash_state *S) {
+	/* Reset S to the fixed starting chaining words and tweak, with an empty input buffer. */
+	static const uint64_t initial_chain[8] = {
+		0x4903ADFF749C51CEull, 0x0D95DE399746DF03ull, 0x8FD1934127C79BCEull, 0x9A255629FF352CB1ull,
+		0x5DB62599DF6CA7B0ull, 0xEABE394CA9D5C3F4ull, 0x991112C71A75B523ull, 0xAE18A40B660FCC33ull
+	};
+	size_t w;
+	for (w = 0; w < 8; w++)
+		S->X[w] = initial_chain[w];
+	S->leftover = 0; /* nothing buffered yet */
+	S->T[0] = 0; /* byte counter starts at zero */
+	S->T[1] = 0x7000000000000000ull; /* starting flag word; bit 62 is cleared again by skein512_blocks() after the first block */
+}
+
+static void
+scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
+	/* Absorb inlen bytes from in into S; afterwards 0..SCRYPT_HASH_BLOCK_SIZE bytes remain in S->buffer for scrypt_hash_finish(). */
+	size_t blocks, want;
+	/* skein processes the final <=64 bytes raw, so we can only update if there are at least 64+1 bytes available */
+	if ((S->leftover + inlen) > SCRYPT_HASH_BLOCK_SIZE) {
+		/* handle the previous data, we know there is enough for at least one block */
+		if (S->leftover) {
+			want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); /* bytes needed to top the buffer up to a full block */
+			memcpy(S->buffer + S->leftover, in, want);
+			in += want;
+			inlen -= want;
+			S->leftover = 0;
+			skein512_blocks(S, S->buffer, 1, SCRYPT_HASH_BLOCK_SIZE);
+		}
+		/* handle the current data if there's more than one block */
+		if (inlen > SCRYPT_HASH_BLOCK_SIZE) {
+			blocks = ((inlen - 1) & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); /* whole-block byte count that still leaves 1..64 bytes behind */
+			skein512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE, SCRYPT_HASH_BLOCK_SIZE);
+			inlen -= blocks;
+			in += blocks;
+		}
+	}
+	/* handle leftover data; skipped when empty so memcpy is never handed a NULL source by a (NULL, 0) call */
+	if (inlen) {
+		memcpy(S->buffer + S->leftover, in, inlen);
+		S->leftover += (uint32_t)inlen; /* cast matches the field type; inlen is at most one block here */
+	}
+}
+
+static void
+scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { /* compress the buffered tail, run one extra all-zero block, then write the 64-byte digest to hash */
+	memset(S->buffer + S->leftover, 0, SCRYPT_HASH_BLOCK_SIZE - S->leftover); /* zero-pad the tail up to a full block */
+	S->T[1] |= 0x8000000000000000ull; /* set bit 63 of the flag word -- presumably Skein's "final block" flag; verify against the Skein spec */
+	skein512_blocks(S, S->buffer, 1, S->leftover); /* the counter advances only by the real, unpadded byte count */
+
+	memset(S->buffer, 0, SCRYPT_HASH_BLOCK_SIZE);
+	S->T[0] = 0;
+	S->T[1] = 0xff00000000000000ull; /* fresh tweak for the extra block -- presumably Skein's output stage with first+final flags; verify */
+	skein512_blocks(S, S->buffer, 1, 8); /* all-zero block with the counter advanced by 8 */
+
+	U64TO8_LE(&hash[ 0], S->X[0]); /* store the eight chaining words at 8-byte steps via U64TO8_LE (defined elsewhere) */
+	U64TO8_LE(&hash[ 8], S->X[1]);
+	U64TO8_LE(&hash[16], S->X[2]);
+	U64TO8_LE(&hash[24], S->X[3]);
+	U64TO8_LE(&hash[32], S->X[4]);
+	U64TO8_LE(&hash[40], S->X[5]);
+	U64TO8_LE(&hash[48], S->X[6]);
+	U64TO8_LE(&hash[56], S->X[7]);
+}
+
+
+static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { /* digest that scrypt_test_hash() passes to scrypt_verify() as the expected result */
+	0x4d,0x52,0x29,0xff,0x10,0xbc,0xd2,0x62,0xd1,0x61,0x83,0xc8,0xe6,0xf0,0x83,0xc4,
+	0x9f,0xf5,0x6a,0x42,0x75,0x2a,0x26,0x4e,0xf0,0x28,0x72,0x28,0x47,0xe8,0x23,0xdf,
+	0x1e,0x64,0xf1,0x51,0x38,0x35,0x9d,0xc2,0x83,0xfc,0x35,0x4e,0xc0,0x52,0x5f,0x41,
+	0x6a,0x0b,0x7d,0xf5,0xce,0x98,0xde,0x6f,0x36,0xd8,0x51,0x15,0x78,0x78,0x93,0x67,
+};
--- a/algo/argon2/ar2/sj/scrypt-jane-mix_salsa64-avx.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-mix_salsa64-avx.h
@@ -0,0 +1,367 @@
+/* x64 */
+#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
+
+#define SCRYPT_SALSA64_AVX
+/* x64 AVX assembly ChunkMix. As used below: rdi = Bout, rsi = Bin, rdx = Bxor (may be NULL), ecx = r. Each 128-byte block lives in xmm0..xmm7. */
+asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
+asm_naked_fn(scrypt_ChunkMix_avx)
+	a1(push rbp)
+	a2(mov rbp, rsp) /* remember the caller's rsp so it can be restored before ret */
+	a2(and rsp, ~63) /* align the stack to 64 bytes */
+	a2(sub rsp, 128) /* 128 bytes of scratch: one saved block */
+	a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
+	a2(shl rcx,7) /* rcx = 2 * r * 128 = chunk size in bytes */
+	a2(lea r9,[rcx-128]) /* r9 = byte offset of the last block of the chunk */
+	a2(lea rax,[rsi+r9]) /* rax = address of the last block of Bin */
+	a2(lea r9,[rdx+r9]) /* r9 = address of the last block of Bxor, only dereferenced when rdx is non-zero */
+	a2(and rdx, rdx) /* sets ZF when Bxor is NULL; the vector loads below do not touch the flags */
+	a2(vmovdqa xmm0,[rax+0]) /* X = last block of Bin */
+	a2(vmovdqa xmm1,[rax+16])
+	a2(vmovdqa xmm2,[rax+32])
+	a2(vmovdqa xmm3,[rax+48])
+	a2(vmovdqa xmm4,[rax+64])
+	a2(vmovdqa xmm5,[rax+80])
+	a2(vmovdqa xmm6,[rax+96])
+	a2(vmovdqa xmm7,[rax+112])
+	aj(jz scrypt_ChunkMix_avx_no_xor1) /* Bxor == NULL: skip the extra xor */
+	a3(vpxor xmm0,xmm0,[r9+0]) /* X ^= last block of Bxor */
+	a3(vpxor xmm1,xmm1,[r9+16])
+	a3(vpxor xmm2,xmm2,[r9+32])
+	a3(vpxor xmm3,xmm3,[r9+48])
+	a3(vpxor xmm4,xmm4,[r9+64])
+	a3(vpxor xmm5,xmm5,[r9+80])
+	a3(vpxor xmm6,xmm6,[r9+96])
+	a3(vpxor xmm7,xmm7,[r9+112])
+	a1(scrypt_ChunkMix_avx_no_xor1:)
+	a2(xor r9,r9) /* r9 = byte offset of input block i, starting at 0 */
+	a2(xor r8,r8) /* r8 = output-half toggle, alternates between 0 and rcx */
+	a1(scrypt_ChunkMix_avx_loop:)
+		a2(and rdx, rdx) /* refresh ZF from Bxor for the jz below */
+		a3(vpxor xmm0,xmm0,[rsi+r9+0]) /* X ^= Bin block i */
+		a3(vpxor xmm1,xmm1,[rsi+r9+16])
+		a3(vpxor xmm2,xmm2,[rsi+r9+32])
+		a3(vpxor xmm3,xmm3,[rsi+r9+48])
+		a3(vpxor xmm4,xmm4,[rsi+r9+64])
+		a3(vpxor xmm5,xmm5,[rsi+r9+80])
+		a3(vpxor xmm6,xmm6,[rsi+r9+96])
+		a3(vpxor xmm7,xmm7,[rsi+r9+112])
+		aj(jz scrypt_ChunkMix_avx_no_xor2)
+		a3(vpxor xmm0,xmm0,[rdx+r9+0]) /* X ^= Bxor block i */
+		a3(vpxor xmm1,xmm1,[rdx+r9+16])
+		a3(vpxor xmm2,xmm2,[rdx+r9+32])
+		a3(vpxor xmm3,xmm3,[rdx+r9+48])
+		a3(vpxor xmm4,xmm4,[rdx+r9+64])
+		a3(vpxor xmm5,xmm5,[rdx+r9+80])
+		a3(vpxor xmm6,xmm6,[rdx+r9+96])
+		a3(vpxor xmm7,xmm7,[rdx+r9+112])
+		a1(scrypt_ChunkMix_avx_no_xor2:)
+		a2(vmovdqa [rsp+0],xmm0) /* save the round input on the stack for the addition after the rounds */
+		a2(vmovdqa [rsp+16],xmm1)
+		a2(vmovdqa [rsp+32],xmm2)
+		a2(vmovdqa [rsp+48],xmm3)
+		a2(vmovdqa [rsp+64],xmm4)
+		a2(vmovdqa [rsp+80],xmm5)
+		a2(vmovdqa [rsp+96],xmm6)
+		a2(vmovdqa [rsp+112],xmm7)
+		a2(mov rax,8) /* round counter: 8 rounds, two per loop pass */
+		a1(scrypt_salsa64_avx_loop: )
+			a3(vpaddq xmm8, xmm0, xmm2)
+			a3(vpaddq xmm9, xmm1, xmm3)
+			a3(vpshufd xmm8, xmm8, 0xb1) /* 0xb1 swaps the 32-bit halves of each qword: rotate by 32 */
+			a3(vpshufd xmm9, xmm9, 0xb1)
+			a3(vpxor xmm6, xmm6, xmm8)
+			a3(vpxor xmm7, xmm7, xmm9)
+			a3(vpaddq xmm10, xmm0, xmm6)
+			a3(vpaddq xmm11, xmm1, xmm7)
+			a3(vpsrlq xmm8, xmm10, 51) /* >>51 and <<13 are xored in separately: rotate left by 13 */
+			a3(vpsrlq xmm9, xmm11, 51)
+			a3(vpsllq xmm10, xmm10, 13)
+			a3(vpsllq xmm11, xmm11, 13)
+			a3(vpxor xmm4, xmm4, xmm8)
+			a3(vpxor xmm5, xmm5, xmm9)
+			a3(vpxor xmm4, xmm4, xmm10)
+			a3(vpxor xmm5, xmm5, xmm11)
+			a3(vpaddq xmm8, xmm6, xmm4)
+			a3(vpaddq xmm9, xmm7, xmm5)
+			a3(vpsrlq xmm10, xmm8, 25) /* >>25 and <<39: rotate left by 39 */
+			a3(vpsrlq xmm11, xmm9, 25)
+			a3(vpsllq xmm8, xmm8, 39)
+			a3(vpsllq xmm9, xmm9, 39)
+			a3(vpxor xmm2, xmm2, xmm10)
+			a3(vpxor xmm3, xmm3, xmm11)
+			a3(vpxor xmm2, xmm2, xmm8)
+			a3(vpxor xmm3, xmm3, xmm9)
+			a3(vpaddq xmm10, xmm4, xmm2)
+			a3(vpaddq xmm11, xmm5, xmm3)
+			a3(vpshufd xmm10, xmm10, 0xb1)
+			a3(vpshufd xmm11, xmm11, 0xb1)
+			a3(vpxor xmm0, xmm0, xmm10)
+			a3(vpxor xmm1, xmm1, xmm11)
+			a2(vmovdqa xmm8, xmm2) /* keep copies: xmm2/xmm3 are overwritten by the qword re-arrangement below */
+			a2(vmovdqa xmm9, xmm3)
+			a4(vpalignr xmm2, xmm6, xmm7, 8) /* re-arrange qwords across the xmm2/3/6/7 register pairs for the second round */
+			a4(vpalignr xmm3, xmm7, xmm6, 8)
+			a4(vpalignr xmm6, xmm9, xmm8, 8)
+			a4(vpalignr xmm7, xmm8, xmm9, 8)
+			a3(vpaddq xmm10, xmm0, xmm2) /* second round: same steps, with xmm4 and xmm5 in swapped roles */
+			a3(vpaddq xmm11, xmm1, xmm3)
+			a3(vpshufd xmm10, xmm10, 0xb1)
+			a3(vpshufd xmm11, xmm11, 0xb1)
+			a3(vpxor xmm6, xmm6, xmm10)
+			a3(vpxor xmm7, xmm7, xmm11)
+			a3(vpaddq xmm8, xmm0, xmm6)
+			a3(vpaddq xmm9, xmm1, xmm7)
+			a3(vpsrlq xmm10, xmm8, 51)
+			a3(vpsrlq xmm11, xmm9, 51)
+			a3(vpsllq xmm8, xmm8, 13)
+			a3(vpsllq xmm9, xmm9, 13)
+			a3(vpxor xmm5, xmm5, xmm10)
+			a3(vpxor xmm4, xmm4, xmm11)
+			a3(vpxor xmm5, xmm5, xmm8)
+			a3(vpxor xmm4, xmm4, xmm9)
+			a3(vpaddq xmm10, xmm6, xmm5)
+			a3(vpaddq xmm11, xmm7, xmm4)
+			a3(vpsrlq xmm8, xmm10, 25)
+			a3(vpsrlq xmm9, xmm11, 25)
+			a3(vpsllq xmm10, xmm10, 39)
+			a3(vpsllq xmm11, xmm11, 39)
+			a3(vpxor xmm2, xmm2, xmm8)
+			a3(vpxor xmm3, xmm3, xmm9)
+			a3(vpxor xmm2, xmm2, xmm10)
+			a3(vpxor xmm3, xmm3, xmm11)
+			a3(vpaddq xmm8, xmm5, xmm2)
+			a3(vpaddq xmm9, xmm4, xmm3)
+			a3(vpshufd xmm8, xmm8, 0xb1)
+			a3(vpshufd xmm9, xmm9, 0xb1)
+			a3(vpxor xmm0, xmm0, xmm8)
+			a3(vpxor xmm1, xmm1, xmm9)
+			a2(vmovdqa xmm10, xmm2)
+			a2(vmovdqa xmm11, xmm3)
+			a4(vpalignr xmm2, xmm6, xmm7, 8)
+			a4(vpalignr xmm3, xmm7, xmm6, 8)
+			a4(vpalignr xmm6, xmm11, xmm10, 8)
+			a4(vpalignr xmm7, xmm10, xmm11, 8)
+			a2(sub rax, 2) /* two rounds done */
+			aj(ja scrypt_salsa64_avx_loop) /* repeat while the remaining round count is above zero */
+		a3(vpaddq xmm0,xmm0,[rsp+0]) /* X += saved round input */
+		a3(vpaddq xmm1,xmm1,[rsp+16])
+		a3(vpaddq xmm2,xmm2,[rsp+32])
+		a3(vpaddq xmm3,xmm3,[rsp+48])
+		a3(vpaddq xmm4,xmm4,[rsp+64])
+		a3(vpaddq xmm5,xmm5,[rsp+80])
+		a3(vpaddq xmm6,xmm6,[rsp+96])
+		a3(vpaddq xmm7,xmm7,[rsp+112])
+		a2(lea rax,[r8+r9]) /* output offset = ((r8 + r9) & ~0xff) / 2: block i / 2, plus r blocks when r8 == rcx */
+		a2(xor r8,rcx) /* flip the half toggle for the next block */
+		a2(and rax,~0xff)
+		a2(add r9,128) /* advance to input block i + 1 */
+		a2(shr rax,1)
+		a2(add rax, rdi) /* rax = destination block inside Bout */
+		a2(cmp r9,rcx) /* flags for the jne below; the vector stores do not modify them */
+		a2(vmovdqa [rax+0],xmm0)
+		a2(vmovdqa [rax+16],xmm1)
+		a2(vmovdqa [rax+32],xmm2)
+		a2(vmovdqa [rax+48],xmm3)
+		a2(vmovdqa [rax+64],xmm4)
+		a2(vmovdqa [rax+80],xmm5)
+		a2(vmovdqa [rax+96],xmm6)
+		a2(vmovdqa [rax+112],xmm7)
+		aj(jne scrypt_ChunkMix_avx_loop) /* until all 2 * r blocks are processed */
+	a2(mov rsp, rbp) /* drop the aligned scratch area */
+	a1(pop rbp)
+	a1(ret)
+asm_naked_fn_end(scrypt_ChunkMix_avx)
+
+#endif
+
+
+/* intrinsic */
+#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
+
+#define SCRYPT_SALSA64_AVX
+/* Intrinsic ChunkMix: mixes the 2 * r blocks of Bin (xored with Bxor when it is non-NULL) into Bout, even-indexed results first, odd-indexed results r blocks later. */
+static void asm_calling_convention
+scrypt_ChunkMix_avx(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
+	uint32_t i, blocksPerChunk = r * 2, half = 0; /* half alternates between 0 and r to select the output half */
+	xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3; /* x: working block, t: saved round input, z: temporaries */
+	size_t rounds;
+
+	/* 1: X = B_{2r - 1} */
+	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
+	x0 = xmmp[0];
+	x1 = xmmp[1];
+	x2 = xmmp[2];
+	x3 = xmmp[3];
+	x4 = xmmp[4];
+	x5 = xmmp[5];
+	x6 = xmmp[6];
+	x7 = xmmp[7];
+
+	if (Bxor) { /* optional second input: X ^= last block of Bxor */
+		xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
+		x0 = _mm_xor_si128(x0, xmmp[0]);
+		x1 = _mm_xor_si128(x1, xmmp[1]);
+		x2 = _mm_xor_si128(x2, xmmp[2]);
+		x3 = _mm_xor_si128(x3, xmmp[3]);
+		x4 = _mm_xor_si128(x4, xmmp[4]);
+		x5 = _mm_xor_si128(x5, xmmp[5]);
+		x6 = _mm_xor_si128(x6, xmmp[6]);
+		x7 = _mm_xor_si128(x7, xmmp[7]);
+	}
+
+	/* 2: for i = 0 to 2r - 1 do */
+	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
+		/* 3: X = H(X ^ B_i) */
+		xmmp = (xmmi *)scrypt_block(Bin, i);
+		x0 = _mm_xor_si128(x0, xmmp[0]);
+		x1 = _mm_xor_si128(x1, xmmp[1]);
+		x2 = _mm_xor_si128(x2, xmmp[2]);
+		x3 = _mm_xor_si128(x3, xmmp[3]);
+		x4 = _mm_xor_si128(x4, xmmp[4]);
+		x5 = _mm_xor_si128(x5, xmmp[5]);
+		x6 = _mm_xor_si128(x6, xmmp[6]);
+		x7 = _mm_xor_si128(x7, xmmp[7]);
+
+		if (Bxor) { /* X ^= block i of Bxor */
+			xmmp = (xmmi *)scrypt_block(Bxor, i);
+			x0 = _mm_xor_si128(x0, xmmp[0]);
+			x1 = _mm_xor_si128(x1, xmmp[1]);
+			x2 = _mm_xor_si128(x2, xmmp[2]);
+			x3 = _mm_xor_si128(x3, xmmp[3]);
+			x4 = _mm_xor_si128(x4, xmmp[4]);
+			x5 = _mm_xor_si128(x5, xmmp[5]);
+			x6 = _mm_xor_si128(x6, xmmp[6]);
+			x7 = _mm_xor_si128(x7, xmmp[7]);
+		}
+
+		t0 = x0; /* remember the round input; it is added back after the rounds */
+		t1 = x1;
+		t2 = x2;
+		t3 = x3;
+		t4 = x4;
+		t5 = x5;
+		t6 = x6;
+		t7 = x7;
+
+		for (rounds = 8; rounds; rounds -= 2) { /* 8 rounds, two per pass */
+			z0 = _mm_add_epi64(x0, x2);
+			z1 = _mm_add_epi64(x1, x3);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); /* swap the 32-bit halves of each qword: rotate by 32 */
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x6 = _mm_xor_si128(x6, z0);
+			x7 = _mm_xor_si128(x7, z1);
+
+			z0 = _mm_add_epi64(x6, x0);
+			z1 = _mm_add_epi64(x7, x1);
+			z2 = _mm_srli_epi64(z0, 64-13); /* right and left shifts xored in separately: rotate left by 13 */
+			z3 = _mm_srli_epi64(z1, 64-13);
+			z0 = _mm_slli_epi64(z0, 13);
+			z1 = _mm_slli_epi64(z1, 13);
+			x4 = _mm_xor_si128(x4, z2);
+			x5 = _mm_xor_si128(x5, z3);
+			x4 = _mm_xor_si128(x4, z0);
+			x5 = _mm_xor_si128(x5, z1);
+
+			z0 = _mm_add_epi64(x4, x6);
+			z1 = _mm_add_epi64(x5, x7);
+			z2 = _mm_srli_epi64(z0, 64-39); /* rotate left by 39 */
+			z3 = _mm_srli_epi64(z1, 64-39);
+			z0 = _mm_slli_epi64(z0, 39);
+			z1 = _mm_slli_epi64(z1, 39);
+			x2 = _mm_xor_si128(x2, z2);
+			x3 = _mm_xor_si128(x3, z3);
+			x2 = _mm_xor_si128(x2, z0);
+			x3 = _mm_xor_si128(x3, z1);
+
+			z0 = _mm_add_epi64(x2, x4);
+			z1 = _mm_add_epi64(x3, x5);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x0 = _mm_xor_si128(x0, z0);
+			x1 = _mm_xor_si128(x1, z1);
+
+			z0 = x2; /* re-arrange qwords across the x2/x3/x6/x7 pairs for the second round */
+			z1 = x3;
+			x2 = _mm_alignr_epi8(x6, x7, 8);
+			x3 = _mm_alignr_epi8(x7, x6, 8);
+			x6 = _mm_alignr_epi8(z1, z0, 8);
+			x7 = _mm_alignr_epi8(z0, z1, 8);
+
+			z0 = _mm_add_epi64(x0, x2); /* second round: same steps, with x4 and x5 in swapped roles */
+			z1 = _mm_add_epi64(x1, x3);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x6 = _mm_xor_si128(x6, z0);
+			x7 = _mm_xor_si128(x7, z1);
+
+			z0 = _mm_add_epi64(x6, x0);
+			z1 = _mm_add_epi64(x7, x1);
+			z2 = _mm_srli_epi64(z0, 64-13);
+			z3 = _mm_srli_epi64(z1, 64-13);
+			z0 = _mm_slli_epi64(z0, 13);
+			z1 = _mm_slli_epi64(z1, 13);
+			x5 = _mm_xor_si128(x5, z2);
+			x4 = _mm_xor_si128(x4, z3);
+			x5 = _mm_xor_si128(x5, z0);
+			x4 = _mm_xor_si128(x4, z1);
+
+			z0 = _mm_add_epi64(x5, x6);
+			z1 = _mm_add_epi64(x4, x7);
+			z2 = _mm_srli_epi64(z0, 64-39);
+			z3 = _mm_srli_epi64(z1, 64-39);
+			z0 = _mm_slli_epi64(z0, 39);
+			z1 = _mm_slli_epi64(z1, 39);
+			x2 = _mm_xor_si128(x2, z2);
+			x3 = _mm_xor_si128(x3, z3);
+			x2 = _mm_xor_si128(x2, z0);
+			x3 = _mm_xor_si128(x3, z1);
+
+			z0 = _mm_add_epi64(x2, x5);
+			z1 = _mm_add_epi64(x3, x4);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x0 = _mm_xor_si128(x0, z0);
+			x1 = _mm_xor_si128(x1, z1);
+
+			z0 = x2;
+			z1 = x3;
+			x2 = _mm_alignr_epi8(x6, x7, 8);
+			x3 = _mm_alignr_epi8(x7, x6, 8);
+			x6 = _mm_alignr_epi8(z1, z0, 8);
+			x7 = _mm_alignr_epi8(z0, z1, 8);
+		}
+
+		x0 = _mm_add_epi64(x0, t0); /* X += saved round input */
+		x1 = _mm_add_epi64(x1, t1);
+		x2 = _mm_add_epi64(x2, t2);
+		x3 = _mm_add_epi64(x3, t3);
+		x4 = _mm_add_epi64(x4, t4);
+		x5 = _mm_add_epi64(x5, t5);
+		x6 = _mm_add_epi64(x6, t6);
+		x7 = _mm_add_epi64(x7, t7);
+
+		/* 4: Y_i = X */
+		/* 6: B'[0..r-1] = Y_even */
+		/* 6: B'[r..2r-1] = Y_odd */
+		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); /* half is 0 for even i and r for odd i */
+		xmmp[0] = x0;
+		xmmp[1] = x1;
+		xmmp[2] = x2;
+		xmmp[3] = x3;
+		xmmp[4] = x4;
+		xmmp[5] = x5;
+		xmmp[6] = x6;
+		xmmp[7] = x7;
+	}
+}
+
+#endif
+
+#if defined(SCRYPT_SALSA64_AVX) /* set by whichever AVX variant above was compiled */
+	/* uses salsa64_core_tangle_sse2 */
+
+	#undef SCRYPT_MIX /* drop any mix label defined earlier */
+	#define SCRYPT_MIX "Salsa64/8-AVX"
+	#undef SCRYPT_SALSA64_INCLUDED
+	#define SCRYPT_SALSA64_INCLUDED /* checked by the #if guards of the Salsa64 variants in these headers */
+#endif
--- a/algo/argon2/ar2/sj/scrypt-jane-mix_salsa64-avx2.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-mix_salsa64-avx2.h
@@ -0,0 +1,221 @@
+/* x64 */
+#if defined(X86_64ASM_AVX2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
+
+#define SCRYPT_SALSA64_AVX2
+/* x64 AVX2 assembly ChunkMix. As used below: rdi = Bout, rsi = Bin, rdx = Bxor (may be NULL), ecx = r. Each 128-byte block lives in ymm0..ymm3. */
+asm_naked_fn_proto(void, scrypt_ChunkMix_avx2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
+asm_naked_fn(scrypt_ChunkMix_avx2)
+	a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
+	a2(shl rcx,7) /* rcx = 2 * r * 128 = chunk size in bytes */
+	a2(lea r9,[rcx-128]) /* r9 = byte offset of the last block of the chunk */
+	a2(lea rax,[rsi+r9]) /* rax = address of the last block of Bin */
+	a2(lea r9,[rdx+r9]) /* r9 = address of the last block of Bxor, only dereferenced when rdx is non-zero */
+	a2(and rdx, rdx) /* sets ZF when Bxor is NULL; the vector loads below do not touch the flags */
+	a2(vmovdqa ymm0,[rax+0]) /* X = last block of Bin */
+	a2(vmovdqa ymm1,[rax+32])
+	a2(vmovdqa ymm2,[rax+64])
+	a2(vmovdqa ymm3,[rax+96])
+	aj(jz scrypt_ChunkMix_avx2_no_xor1) /* Bxor == NULL: skip the extra xor */
+	a3(vpxor ymm0,ymm0,[r9+0]) /* X ^= last block of Bxor */
+	a3(vpxor ymm1,ymm1,[r9+32])
+	a3(vpxor ymm2,ymm2,[r9+64])
+	a3(vpxor ymm3,ymm3,[r9+96])
+	a1(scrypt_ChunkMix_avx2_no_xor1:)
+	a2(xor r9,r9) /* r9 = byte offset of input block i, starting at 0 */
+	a2(xor r8,r8) /* r8 = output-half toggle, alternates between 0 and rcx */
+	a1(scrypt_ChunkMix_avx2_loop:)
+		a2(and rdx, rdx) /* refresh ZF from Bxor for the jz below */
+		a3(vpxor ymm0,ymm0,[rsi+r9+0]) /* X ^= Bin block i */
+		a3(vpxor ymm1,ymm1,[rsi+r9+32])
+		a3(vpxor ymm2,ymm2,[rsi+r9+64])
+		a3(vpxor ymm3,ymm3,[rsi+r9+96])
+		aj(jz scrypt_ChunkMix_avx2_no_xor2)
+		a3(vpxor ymm0,ymm0,[rdx+r9+0]) /* X ^= Bxor block i */
+		a3(vpxor ymm1,ymm1,[rdx+r9+32])
+		a3(vpxor ymm2,ymm2,[rdx+r9+64])
+		a3(vpxor ymm3,ymm3,[rdx+r9+96])
+		a1(scrypt_ChunkMix_avx2_no_xor2:)
+		a2(vmovdqa ymm6,ymm0) /* keep the round input in ymm6..ymm9 for the addition after the rounds */
+		a2(vmovdqa ymm7,ymm1)
+		a2(vmovdqa ymm8,ymm2)
+		a2(vmovdqa ymm9,ymm3)
+		a2(mov rax,4) /* 4 passes over the two-round body below */
+		a1(scrypt_salsa64_avx2_loop: )
+			a3(vpaddq ymm4, ymm1, ymm0)
+			a3(vpshufd ymm4, ymm4, 0xb1) /* 0xb1 swaps the 32-bit halves of each qword: rotate by 32 */
+			a3(vpxor ymm3, ymm3, ymm4)
+			a3(vpaddq ymm4, ymm0, ymm3)
+			a3(vpsrlq ymm5, ymm4, 51) /* >>51 and <<13 are xored in separately: rotate left by 13 */
+			a3(vpxor ymm2, ymm2, ymm5)
+			a3(vpsllq ymm4, ymm4, 13)
+			a3(vpxor ymm2, ymm2, ymm4)
+			a3(vpaddq ymm4, ymm3, ymm2)
+			a3(vpsrlq ymm5, ymm4, 25) /* >>25 and <<39: rotate left by 39 */
+			a3(vpxor ymm1, ymm1, ymm5)
+			a3(vpsllq ymm4, ymm4, 39)
+			a3(vpxor ymm1, ymm1, ymm4)
+			a3(vpaddq ymm4, ymm2, ymm1)
+			a3(vpshufd ymm4, ymm4, 0xb1)
+			a3(vpermq ymm1, ymm1, 0x39) /* vpermq rotates the four qword lanes: 0x39, 0x4e, 0x93 = by one, two, three positions */
+			a3(vpermq ymm10, ymm2, 0x4e) /* rotated copy of ymm2, used as the third row of the second round */
+			a3(vpxor ymm0, ymm0, ymm4)
+			a3(vpermq ymm3, ymm3, 0x93)
+			a3(vpaddq ymm4, ymm3, ymm0) /* second round on the re-arranged rows */
+			a3(vpshufd ymm4, ymm4, 0xb1)
+			a3(vpxor ymm1, ymm1, ymm4)
+			a3(vpaddq ymm4, ymm0, ymm1)
+			a3(vpsrlq ymm5, ymm4, 51)
+			a3(vpxor ymm10, ymm10, ymm5)
+			a3(vpsllq ymm4, ymm4, 13)
+			a3(vpxor ymm10, ymm10, ymm4)
+			a3(vpaddq ymm4, ymm1, ymm10)
+			a3(vpsrlq ymm5, ymm4, 25)
+			a3(vpxor ymm3, ymm3, ymm5)
+			a3(vpsllq ymm4, ymm4, 39)
+			a3(vpermq ymm1, ymm1, 0x93) /* rotate the rows back to their original lane order */
+			a3(vpxor ymm3, ymm3, ymm4)
+			a3(vpermq ymm2, ymm10, 0x4e)
+			a3(vpaddq ymm4, ymm10, ymm3)
+			a3(vpshufd ymm4, ymm4, 0xb1)
+			a3(vpermq ymm3, ymm3, 0x39)
+			a3(vpxor ymm0, ymm0, ymm4)
+			a1(dec rax)
+			aj(jnz scrypt_salsa64_avx2_loop)
+		a3(vpaddq ymm0,ymm0,ymm6) /* X += saved round input */
+		a3(vpaddq ymm1,ymm1,ymm7)
+		a3(vpaddq ymm2,ymm2,ymm8)
+		a3(vpaddq ymm3,ymm3,ymm9)
+		a2(lea rax,[r8+r9]) /* output offset = ((r8 + r9) & ~0xff) / 2: block i / 2, plus r blocks when r8 == rcx */
+		a2(xor r8,rcx) /* flip the half toggle for the next block */
+		a2(and rax,~0xff)
+		a2(add r9,128) /* advance to input block i + 1 */
+		a2(shr rax,1)
+		a2(add rax, rdi) /* rax = destination block inside Bout */
+		a2(cmp r9,rcx) /* flags for the jne below; the vector stores do not modify them */
+		a2(vmovdqa [rax+0],ymm0)
+		a2(vmovdqa [rax+32],ymm1)
+		a2(vmovdqa [rax+64],ymm2)
+		a2(vmovdqa [rax+96],ymm3)
+		aj(jne scrypt_ChunkMix_avx2_loop) /* until all 2 * r blocks are processed */
+	a1(vzeroupper) /* clear the upper ymm halves before returning */
+	a1(ret)
+asm_naked_fn_end(scrypt_ChunkMix_avx2)
+
+#endif
+
+
+/* intrinsic */
+#if defined(X86_INTRINSIC_AVX2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
+
+#define SCRYPT_SALSA64_AVX2
+/* Intrinsic ChunkMix: mixes the 2 * r blocks of Bin (xored with Bxor when it is non-NULL) into Bout, even-indexed results first, odd-indexed results r blocks later. */
+static void asm_calling_convention
+scrypt_ChunkMix_avx2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
+	uint32_t i, blocksPerChunk = r * 2, half = 0; /* half alternates between 0 and r to select the output half */
+	ymmi *ymmp,y0,y1,y2,y3,t0,t1,t2,t3,z0,z1; /* y: working block, t: saved round input, z: temporaries */
+	size_t rounds;
+
+	/* 1: X = B_{2r - 1} */
+	ymmp = (ymmi *)scrypt_block(Bin, blocksPerChunk - 1);
+	y0 = ymmp[0];
+	y1 = ymmp[1];
+	y2 = ymmp[2];
+	y3 = ymmp[3];
+
+	if (Bxor) { /* optional second input: X ^= last block of Bxor */
+		ymmp = (ymmi *)scrypt_block(Bxor, blocksPerChunk - 1);
+		y0 = _mm256_xor_si256(y0, ymmp[0]);
+		y1 = _mm256_xor_si256(y1, ymmp[1]);
+		y2 = _mm256_xor_si256(y2, ymmp[2]);
+		y3 = _mm256_xor_si256(y3, ymmp[3]);
+	}
+
+	/* 2: for i = 0 to 2r - 1 do */
+	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
+		/* 3: X = H(X ^ B_i) */
+		ymmp = (ymmi *)scrypt_block(Bin, i);
+		y0 = _mm256_xor_si256(y0, ymmp[0]);
+		y1 = _mm256_xor_si256(y1, ymmp[1]);
+		y2 = _mm256_xor_si256(y2, ymmp[2]);
+		y3 = _mm256_xor_si256(y3, ymmp[3]);
+
+		if (Bxor) { /* X ^= block i of Bxor */
+			ymmp = (ymmi *)scrypt_block(Bxor, i);
+			y0 = _mm256_xor_si256(y0, ymmp[0]);
+			y1 = _mm256_xor_si256(y1, ymmp[1]);
+			y2 = _mm256_xor_si256(y2, ymmp[2]);
+			y3 = _mm256_xor_si256(y3, ymmp[3]);
+		}
+
+		t0 = y0; /* remember the round input; it is added back after the rounds */
+		t1 = y1;
+		t2 = y2;
+		t3 = y3;
+
+		for (rounds = 8; rounds; rounds -= 2) { /* 8 rounds, two per pass */
+			z0 = _mm256_add_epi64(y0, y1);
+			z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); /* swap the 32-bit halves of each qword: rotate by 32 */
+			y3 = _mm256_xor_si256(y3, z0);
+			z0 = _mm256_add_epi64(y3, y0);
+			z1 = _mm256_srli_epi64(z0, 64-13); /* right and left shifts xored in separately: rotate left by 13 */
+			y2 = _mm256_xor_si256(y2, z1);
+			z0 = _mm256_slli_epi64(z0, 13);
+			y2 = _mm256_xor_si256(y2, z0);
+			z0 = _mm256_add_epi64(y2, y3);
+			z1 = _mm256_srli_epi64(z0, 64-39); /* rotate left by 39 */
+			y1 = _mm256_xor_si256(y1, z1);
+			z0 = _mm256_slli_epi64(z0, 39);
+			y1 = _mm256_xor_si256(y1, z0);
+			y1 = _mm256_permute4x64_epi64(y1, _MM_SHUFFLE(0,3,2,1)); /* rotate the qword lanes of y1, y2, y3 by one, two, three positions */
+			y2 = _mm256_permute4x64_epi64(y2, _MM_SHUFFLE(1,0,3,2));
+			y3 = _mm256_permute4x64_epi64(y3, _MM_SHUFFLE(2,1,0,3));
+			z0 = _mm256_add_epi64(y1, y2);
+			z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			y0 = _mm256_xor_si256(y0, z0);
+			z0 = _mm256_add_epi64(y0, y3); /* second round on the re-arranged rows */
+			z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			y1 = _mm256_xor_si256(y1, z0);
+			z0 = _mm256_add_epi64(y1, y0);
+			z1 = _mm256_srli_epi64(z0, 64-13);
+			y2 = _mm256_xor_si256(y2, z1);
+			z0 = _mm256_slli_epi64(z0, 13);
+			y2 = _mm256_xor_si256(y2, z0);
+			z0 = _mm256_add_epi64(y2, y1);
+			z1 = _mm256_srli_epi64(z0, 64-39);
+			y3 = _mm256_xor_si256(y3, z1);
+			z0 = _mm256_slli_epi64(z0, 39);
+			y3 = _mm256_xor_si256(y3, z0);
+			z0 = _mm256_add_epi64(y3, y2);
+			z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			y0 = _mm256_xor_si256(y0, z0);
+			y1 = _mm256_permute4x64_epi64(y1, _MM_SHUFFLE(2,1,0,3)); /* inverse lane rotations: restore the original lane order */
+			y2 = _mm256_permute4x64_epi64(y2, _MM_SHUFFLE(1,0,3,2));
+			y3 = _mm256_permute4x64_epi64(y3, _MM_SHUFFLE(0,3,2,1));
+		}
+
+		y0 = _mm256_add_epi64(y0, t0); /* X += saved round input */
+		y1 = _mm256_add_epi64(y1, t1);
+		y2 = _mm256_add_epi64(y2, t2);
+		y3 = _mm256_add_epi64(y3, t3);
+
+		/* 4: Y_i = X */
+		/* 6: B'[0..r-1] = Y_even */
+		/* 6: B'[r..2r-1] = Y_odd */
+		ymmp = (ymmi *)scrypt_block(Bout, (i / 2) + half); /* half is 0 for even i and r for odd i */
+		ymmp[0] = y0;
+		ymmp[1] = y1;
+		ymmp[2] = y2;
+		ymmp[3] = y3;
+	}
+}
+
+#endif
+
+#if defined(SCRYPT_SALSA64_AVX2) /* set by whichever AVX2 variant above was compiled */
+	/* uses salsa64_core_tangle_sse2 */
+
+	#undef SCRYPT_MIX /* drop any mix label defined earlier */
+	#define SCRYPT_MIX "Salsa64/8-AVX2"
+	#undef SCRYPT_SALSA64_INCLUDED
+	#define SCRYPT_SALSA64_INCLUDED /* checked by the #if guards of the Salsa64 variants in these headers */
+#endif
--- a/algo/argon2/ar2/sj/scrypt-jane-mix_salsa64-sse2.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-mix_salsa64-sse2.h
@@ -0,0 +1,449 @@
+/* x64 */
+#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
+
+#define SCRYPT_SALSA64_SSE2
+
+/*
+	scrypt_ChunkMix_sse2, hand-written x86-64 version (Salsa64/8 core, SSE2 only).
+
+	Register roles as used by the body below. How the C arguments reach these
+	registers is decided by the asm_naked_fn macros, which are defined
+	elsewhere (presumably rdi=Bout, rsi=Bin, rdx=Bxor, ecx=r -- confirm there):
+		rdi        base of the output chunk
+		rsi        base of the input chunk
+		rdx        base of the optional xor chunk; zero means "no xor"
+		rcx        r * 2 * 128 = size of a chunk in bytes
+		r9         byte offset of the current 128-byte input block
+		r8         alternates between 0 and rcx, selecting the output half
+		rax        scratch: last-block pointer, round counter, output pointer
+		xmm0-xmm7  the 128-byte working state X
+		xmm8-xmm11 temporaries
+		[rsp+0..127] copy of X taken before the rounds, added back afterwards
+
+	All state loads/stores use movdqa, so the buffers must be 16-byte aligned.
+*/
+asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
+asm_naked_fn(scrypt_ChunkMix_sse2)
+	/* prologue: keep the old rsp in rbp, then carve out a 64-byte aligned 128-byte scratch area */
+	a1(push rbp)
+	a2(mov rbp, rsp)
+	a2(and rsp, ~63)
+	a2(sub rsp, 128)
+	a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
+	a2(shl rcx,7)
+	/* rax / r9 = address of the last block of the input / xor chunk */
+	a2(lea r9,[rcx-128])
+	a2(lea rax,[rsi+r9])
+	a2(lea r9,[rdx+r9])
+	/* sets ZF when rdx is zero; the movdqa loads below leave the flags alone, so the jz still sees it */
+	a2(and rdx, rdx)
+	/* X = last input block */
+	a2(movdqa xmm0,[rax+0])
+	a2(movdqa xmm1,[rax+16])
+	a2(movdqa xmm2,[rax+32])
+	a2(movdqa xmm3,[rax+48])
+	a2(movdqa xmm4,[rax+64])
+	a2(movdqa xmm5,[rax+80])
+	a2(movdqa xmm6,[rax+96])
+	a2(movdqa xmm7,[rax+112])
+	aj(jz scrypt_ChunkMix_sse2_no_xor1)
+	/* X ^= last xor block */
+	a2(pxor xmm0,[r9+0])
+	a2(pxor xmm1,[r9+16])
+	a2(pxor xmm2,[r9+32])
+	a2(pxor xmm3,[r9+48])
+	a2(pxor xmm4,[r9+64])
+	a2(pxor xmm5,[r9+80])
+	a2(pxor xmm6,[r9+96])
+	a2(pxor xmm7,[r9+112])
+	a1(scrypt_ChunkMix_sse2_no_xor1:)
+	/* block offset and output-half selector both start at zero */
+	a2(xor r9,r9)
+	a2(xor r8,r8)
+	a1(scrypt_ChunkMix_sse2_loop:)
+		/* ZF = (rdx == 0); consumed by the jz after the pxor group */
+		a2(and rdx, rdx)
+		/* X ^= input block at offset r9 */
+		a2(pxor xmm0,[rsi+r9+0])
+		a2(pxor xmm1,[rsi+r9+16])
+		a2(pxor xmm2,[rsi+r9+32])
+		a2(pxor xmm3,[rsi+r9+48])
+		a2(pxor xmm4,[rsi+r9+64])
+		a2(pxor xmm5,[rsi+r9+80])
+		a2(pxor xmm6,[rsi+r9+96])
+		a2(pxor xmm7,[rsi+r9+112])
+		aj(jz scrypt_ChunkMix_sse2_no_xor2)
+		/* X ^= xor block at offset r9 */
+		a2(pxor xmm0,[rdx+r9+0])
+		a2(pxor xmm1,[rdx+r9+16])
+		a2(pxor xmm2,[rdx+r9+32])
+		a2(pxor xmm3,[rdx+r9+48])
+		a2(pxor xmm4,[rdx+r9+64])
+		a2(pxor xmm5,[rdx+r9+80])
+		a2(pxor xmm6,[rdx+r9+96])
+		a2(pxor xmm7,[rdx+r9+112])
+		a1(scrypt_ChunkMix_sse2_no_xor2:)
+		/* remember the pre-round state for the feed-forward addition */
+		a2(movdqa [rsp+0],xmm0)
+		a2(movdqa [rsp+16],xmm1)
+		a2(movdqa [rsp+32],xmm2)
+		a2(movdqa [rsp+48],xmm3)
+		a2(movdqa [rsp+64],xmm4)
+		a2(movdqa [rsp+80],xmm5)
+		a2(movdqa [rsp+96],xmm6)
+		a2(movdqa [rsp+112],xmm7)
+		/* 8 rounds, two per pass through the inner loop */
+		a2(mov rax,8)
+		a1(scrypt_salsa64_sse2_loop: )
+			/* first half: x6,x7 ^= rot32(x0+x2, x1+x3); pshufd 0xb1 swaps the 32-bit halves of each qword */
+			a2(movdqa xmm8, xmm0)
+			a2(movdqa xmm9, xmm1)
+			a2(paddq xmm8, xmm2)
+			a2(paddq xmm9, xmm3)
+			a3(pshufd xmm8, xmm8, 0xb1)
+			a3(pshufd xmm9, xmm9, 0xb1)
+			a2(pxor xmm6, xmm8)
+			a2(pxor xmm7, xmm9)
+			/* x4,x5 ^= rotl13(x0+x6, x1+x7); the rotate is a psrlq 51 / psllq 13 pair */
+			a2(movdqa xmm10, xmm0)
+			a2(movdqa xmm11, xmm1)
+			a2(paddq xmm10, xmm6)
+			a2(paddq xmm11, xmm7)
+			a2(movdqa xmm8, xmm10)
+			a2(movdqa xmm9, xmm11)
+			a2(psrlq xmm10, 51)
+			a2(psrlq xmm11, 51)
+			a2(psllq xmm8, 13)
+			a2(psllq xmm9, 13)
+			a2(pxor xmm4, xmm10)
+			a2(pxor xmm5, xmm11)
+			a2(pxor xmm4, xmm8)
+			a2(pxor xmm5, xmm9)
+			/* x2,x3 ^= rotl39(x6+x4, x7+x5); psrlq 25 / psllq 39 pair */
+			a2(movdqa xmm10, xmm6)
+			a2(movdqa xmm11, xmm7)
+			a2(paddq xmm10, xmm4)
+			a2(paddq xmm11, xmm5)
+			a2(movdqa xmm8, xmm10)
+			a2(movdqa xmm9, xmm11)
+			a2(psrlq xmm10, 25)
+			a2(psrlq xmm11, 25)
+			a2(psllq xmm8, 39)
+			a2(psllq xmm9, 39)
+			a2(pxor xmm2, xmm10)
+			a2(pxor xmm3, xmm11)
+			a2(pxor xmm2, xmm8)
+			a2(pxor xmm3, xmm9)
+			/* x0,x1 ^= rot32(x4+x2, x5+x3) */
+			a2(movdqa xmm8, xmm4)
+			a2(movdqa xmm9, xmm5)
+			a2(paddq xmm8, xmm2)
+			a2(paddq xmm9, xmm3)
+			a3(pshufd xmm8, xmm8, 0xb1)
+			a3(pshufd xmm9, xmm9, 0xb1)
+			a2(pxor xmm0, xmm8)
+			a2(pxor xmm1, xmm9)
+			/*
+				word shuffle between the two half rounds, built from SSE2 unpacks:
+				new x2 = (x7.hi, x6.lo), new x3 = (x6.hi, x7.lo),
+				new x6 = (x2.hi, x3.lo), new x7 = (x3.hi, x2.lo)
+			*/
+			a2(movdqa xmm8, xmm2)
+			a2(movdqa xmm9, xmm3)
+			a2(movdqa xmm10, xmm6)
+			a2(movdqa xmm11, xmm7)
+			a2(movdqa xmm2, xmm7)
+			a2(movdqa xmm3, xmm6)
+			a2(punpcklqdq xmm10, xmm6)
+			a2(punpcklqdq xmm11, xmm7)
+			a2(movdqa xmm6, xmm8)
+			a2(movdqa xmm7, xmm9)
+			a2(punpcklqdq xmm9, xmm9)
+			a2(punpcklqdq xmm8, xmm8)
+			a2(punpckhqdq xmm2, xmm10)
+			a2(punpckhqdq xmm3, xmm11)
+			a2(punpckhqdq xmm6, xmm9)
+			a2(punpckhqdq xmm7, xmm8)
+			/* round counter -= 2; the SSE instructions up to the ja do not touch the flags set here */
+			a2(sub rax, 2)
+			/* second half: same sequence, but xmm4 and xmm5 trade roles instead of being moved */
+			a2(movdqa xmm8, xmm0)
+			a2(movdqa xmm9, xmm1)
+			a2(paddq xmm8, xmm2)
+			a2(paddq xmm9, xmm3)
+			a3(pshufd xmm8, xmm8, 0xb1)
+			a3(pshufd xmm9, xmm9, 0xb1)
+			a2(pxor xmm6, xmm8)
+			a2(pxor xmm7, xmm9)
+			a2(movdqa xmm10, xmm0)
+			a2(movdqa xmm11, xmm1)
+			a2(paddq xmm10, xmm6)
+			a2(paddq xmm11, xmm7)
+			a2(movdqa xmm8, xmm10)
+			a2(movdqa xmm9, xmm11)
+			a2(psrlq xmm10, 51)
+			a2(psrlq xmm11, 51)
+			a2(psllq xmm8, 13)
+			a2(psllq xmm9, 13)
+			a2(pxor xmm5, xmm10)
+			a2(pxor xmm4, xmm11)
+			a2(pxor xmm5, xmm8)
+			a2(pxor xmm4, xmm9)
+			a2(movdqa xmm10, xmm6)
+			a2(movdqa xmm11, xmm7)
+			a2(paddq xmm10, xmm5)
+			a2(paddq xmm11, xmm4)
+			a2(movdqa xmm8, xmm10)
+			a2(movdqa xmm9, xmm11)
+			a2(psrlq xmm10, 25)
+			a2(psrlq xmm11, 25)
+			a2(psllq xmm8, 39)
+			a2(psllq xmm9, 39)
+			a2(pxor xmm2, xmm10)
+			a2(pxor xmm3, xmm11)
+			a2(pxor xmm2, xmm8)
+			a2(pxor xmm3, xmm9)
+			a2(movdqa xmm8, xmm5)
+			a2(movdqa xmm9, xmm4)
+			a2(paddq xmm8, xmm2)
+			a2(paddq xmm9, xmm3)
+			a3(pshufd xmm8, xmm8, 0xb1)
+			a3(pshufd xmm9, xmm9, 0xb1)
+			a2(pxor xmm0, xmm8)
+			a2(pxor xmm1, xmm9)
+			/* same word shuffle as after the first half */
+			a2(movdqa xmm8, xmm2)
+			a2(movdqa xmm9, xmm3)
+			a2(movdqa xmm10, xmm6)
+			a2(movdqa xmm11, xmm7)
+			a2(movdqa xmm2, xmm7)
+			a2(movdqa xmm3, xmm6)
+			a2(punpcklqdq xmm10, xmm6)
+			a2(punpcklqdq xmm11, xmm7)
+			a2(movdqa xmm6, xmm8)
+			a2(movdqa xmm7, xmm9)
+			a2(punpcklqdq xmm9, xmm9)
+			a2(punpcklqdq xmm8, xmm8)
+			a2(punpckhqdq xmm2, xmm10)
+			a2(punpckhqdq xmm3, xmm11)
+			a2(punpckhqdq xmm6, xmm9)
+			a2(punpckhqdq xmm7, xmm8)
+			/* loops while the counter decremented above is still non-zero */
+			aj(ja scrypt_salsa64_sse2_loop)
+		/* feed-forward: X += state saved before the rounds */
+		a2(paddq xmm0,[rsp+0])
+		a2(paddq xmm1,[rsp+16])
+		a2(paddq xmm2,[rsp+32])
+		a2(paddq xmm3,[rsp+48])
+		a2(paddq xmm4,[rsp+64])
+		a2(paddq xmm5,[rsp+80])
+		a2(paddq xmm6,[rsp+96])
+		a2(paddq xmm7,[rsp+112])
+		/*
+			output offset = ((r8 + r9) & ~0xff) / 2, i.e. block (i / 2) for even i
+			and block (i / 2) + r for odd i, where i = r9 / 128
+		*/
+		a2(lea rax,[r8+r9])
+		a2(xor r8,rcx)
+		a2(and rax,~0xff)
+		a2(add r9,128)
+		a2(shr rax,1)
+		a2(add rax, rdi)
+		/* end-of-chunk test; the movdqa stores below do not disturb the flags for the jne */
+		a2(cmp r9,rcx)
+		a2(movdqa [rax+0],xmm0)
+		a2(movdqa [rax+16],xmm1)
+		a2(movdqa [rax+32],xmm2)
+		a2(movdqa [rax+48],xmm3)
+		a2(movdqa [rax+64],xmm4)
+		a2(movdqa [rax+80],xmm5)
+		a2(movdqa [rax+96],xmm6)
+		a2(movdqa [rax+112],xmm7)
+		aj(jne scrypt_ChunkMix_sse2_loop)
+	/* epilogue: drop the scratch area and restore the caller's frame */
+	a2(mov rsp, rbp)
+	a1(pop rbp)
+	a1(ret)
+asm_naked_fn_end(scrypt_ChunkMix_sse2)
+
+#endif
+
+
+/* intrinsic */
+#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
+
+#define SCRYPT_SALSA64_SSE2
+
+/*
+	ChunkMix built on the Salsa64/8 core, SSE2 intrinsics version.
+
+	Bout  receives the mixed chunk (2*r blocks)
+	Bin   input chunk (2*r blocks); only read
+	Bxor  optional second chunk; when non-NULL every block read from Bin is
+	      additionally XORed with the matching Bxor block, so the input
+	      behaves as Bin ^ Bxor
+	r     block-size parameter; a chunk holds 2*r blocks
+
+	Each block is handled as eight xmmi values (x0..x7). Block addresses come
+	from scrypt_block(), which is defined elsewhere.
+	NOTE(review): blocks are read and written through plain xmmi pointers, so
+	the buffers presumably have to be 16-byte aligned -- confirm against the
+	definition of xmmi.
+*/
+static void asm_calling_convention
+scrypt_ChunkMix_sse2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
+	/* half flips between 0 and r on every block and selects the output half */
+	uint32_t i, blocksPerChunk = r * 2, half = 0;
+	xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
+	size_t rounds;
+
+	/* 1: X = B_{2r - 1} */
+	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
+	x0 = xmmp[0];
+	x1 = xmmp[1];
+	x2 = xmmp[2];
+	x3 = xmmp[3];
+	x4 = xmmp[4];
+	x5 = xmmp[5];
+	x6 = xmmp[6];
+	x7 = xmmp[7];
+
+	if (Bxor) {
+		xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
+		x0 = _mm_xor_si128(x0, xmmp[0]);
+		x1 = _mm_xor_si128(x1, xmmp[1]);
+		x2 = _mm_xor_si128(x2, xmmp[2]);
+		x3 = _mm_xor_si128(x3, xmmp[3]);
+		x4 = _mm_xor_si128(x4, xmmp[4]);
+		x5 = _mm_xor_si128(x5, xmmp[5]);
+		x6 = _mm_xor_si128(x6, xmmp[6]);
+		x7 = _mm_xor_si128(x7, xmmp[7]);
+	}
+
+	/* 2: for i = 0 to 2r - 1 do */
+	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
+		/* 3: X = H(X ^ B_i) */
+		xmmp = (xmmi *)scrypt_block(Bin, i);
+		x0 = _mm_xor_si128(x0, xmmp[0]);
+		x1 = _mm_xor_si128(x1, xmmp[1]);
+		x2 = _mm_xor_si128(x2, xmmp[2]);
+		x3 = _mm_xor_si128(x3, xmmp[3]);
+		x4 = _mm_xor_si128(x4, xmmp[4]);
+		x5 = _mm_xor_si128(x5, xmmp[5]);
+		x6 = _mm_xor_si128(x6, xmmp[6]);
+		x7 = _mm_xor_si128(x7, xmmp[7]);
+
+		if (Bxor) {
+			xmmp = (xmmi *)scrypt_block(Bxor, i);
+			x0 = _mm_xor_si128(x0, xmmp[0]);
+			x1 = _mm_xor_si128(x1, xmmp[1]);
+			x2 = _mm_xor_si128(x2, xmmp[2]);
+			x3 = _mm_xor_si128(x3, xmmp[3]);
+			x4 = _mm_xor_si128(x4, xmmp[4]);
+			x5 = _mm_xor_si128(x5, xmmp[5]);
+			x6 = _mm_xor_si128(x6, xmmp[6]);
+			x7 = _mm_xor_si128(x7, xmmp[7]);
+		}
+
+		/* keep the pre-round state for the feed-forward addition after the rounds */
+		t0 = x0;
+		t1 = x1;
+		t2 = x2;
+		t3 = x3;
+		t4 = x4;
+		t5 = x5;
+		t6 = x6;
+		t7 = x7;
+
+		/* 8 rounds, two per loop iteration */
+		for (rounds = 8; rounds; rounds -= 2) {
+			/* x6,x7 ^= rot32(x0+x2, x1+x3); the shuffle swaps the 32-bit halves of each 64-bit lane */
+			z0 = _mm_add_epi64(x0, x2);
+			z1 = _mm_add_epi64(x1, x3);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x6 = _mm_xor_si128(x6, z0);
+			x7 = _mm_xor_si128(x7, z1);
+
+			/* x4,x5 ^= rotl13(x6+x0, x7+x1); rotate built from a right/left shift pair */
+			z0 = _mm_add_epi64(x6, x0);
+			z1 = _mm_add_epi64(x7, x1);
+			z2 = _mm_srli_epi64(z0, 64-13);
+			z3 = _mm_srli_epi64(z1, 64-13);
+			z0 = _mm_slli_epi64(z0, 13);
+			z1 = _mm_slli_epi64(z1, 13);
+			x4 = _mm_xor_si128(x4, z2);
+			x5 = _mm_xor_si128(x5, z3);
+			x4 = _mm_xor_si128(x4, z0);
+			x5 = _mm_xor_si128(x5, z1);
+
+			/* x2,x3 ^= rotl39(x4+x6, x5+x7) */
+			z0 = _mm_add_epi64(x4, x6);
+			z1 = _mm_add_epi64(x5, x7);
+			z2 = _mm_srli_epi64(z0, 64-39);
+			z3 = _mm_srli_epi64(z1, 64-39);
+			z0 = _mm_slli_epi64(z0, 39);
+			z1 = _mm_slli_epi64(z1, 39);
+			x2 = _mm_xor_si128(x2, z2);
+			x3 = _mm_xor_si128(x3, z3);
+			x2 = _mm_xor_si128(x2, z0);
+			x3 = _mm_xor_si128(x3, z1);
+
+			/* x0,x1 ^= rot32(x2+x4, x3+x5) */
+			z0 = _mm_add_epi64(x2, x4);
+			z1 = _mm_add_epi64(x3, x5);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x0 = _mm_xor_si128(x0, z0);
+			x1 = _mm_xor_si128(x1, z1);
+
+			/*
+				word shuffle between the two half rounds: x4 and x5 are exchanged,
+				new x2 = (x7.hi, x6.lo), new x3 = (x6.hi, x7.lo),
+				new x6 = (x2.hi, x3.lo), new x7 = (x3.hi, x2.lo)
+			*/
+			z0 = x4;
+			z1 = x5;
+			z2 = x2;
+			z3 = x3;
+			x4 = z1;
+			x5 = z0;
+			x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6));
+			x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7));
+			x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3));
+			x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2));
+
+			/* second half round: the same four steps on the shuffled state */
+			z0 = _mm_add_epi64(x0, x2);
+			z1 = _mm_add_epi64(x1, x3);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x6 = _mm_xor_si128(x6, z0);
+			x7 = _mm_xor_si128(x7, z1);
+
+			z0 = _mm_add_epi64(x6, x0);
+			z1 = _mm_add_epi64(x7, x1);
+			z2 = _mm_srli_epi64(z0, 64-13);
+			z3 = _mm_srli_epi64(z1, 64-13);
+			z0 = _mm_slli_epi64(z0, 13);
+			z1 = _mm_slli_epi64(z1, 13);
+			x4 = _mm_xor_si128(x4, z2);
+			x5 = _mm_xor_si128(x5, z3);
+			x4 = _mm_xor_si128(x4, z0);
+			x5 = _mm_xor_si128(x5, z1);
+
+			z0 = _mm_add_epi64(x4, x6);
+			z1 = _mm_add_epi64(x5, x7);
+			z2 = _mm_srli_epi64(z0, 64-39);
+			z3 = _mm_srli_epi64(z1, 64-39);
+			z0 = _mm_slli_epi64(z0, 39);
+			z1 = _mm_slli_epi64(z1, 39);
+			x2 = _mm_xor_si128(x2, z2);
+			x3 = _mm_xor_si128(x3, z3);
+			x2 = _mm_xor_si128(x2, z0);
+			x3 = _mm_xor_si128(x3, z1);
+
+			z0 = _mm_add_epi64(x2, x4);
+			z1 = _mm_add_epi64(x3, x5);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x0 = _mm_xor_si128(x0, z0);
+			x1 = _mm_xor_si128(x1, z1);
+
+			/* same word shuffle again, including the x4/x5 exchange */
+			z0 = x4;
+			z1 = x5;
+			z2 = x2;
+			z3 = x3;
+			x4 = z1;
+			x5 = z0;
+			x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6));
+			x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7));
+			x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3));
+			x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2));
+		}
+
+		/* feed-forward: add the pre-round state back in */
+		x0 = _mm_add_epi64(x0, t0);
+		x1 = _mm_add_epi64(x1, t1);
+		x2 = _mm_add_epi64(x2, t2);
+		x3 = _mm_add_epi64(x3, t3);
+		x4 = _mm_add_epi64(x4, t4);
+		x5 = _mm_add_epi64(x5, t5);
+		x6 = _mm_add_epi64(x6, t6);
+		x7 = _mm_add_epi64(x7, t7);
+
+		/* 4: Y_i = X */
+		/* 6: B'[0..r-1] = Y_even */
+		/* 6: B'[r..2r-1] = Y_odd */
+		/* half is 0 for even i and r for odd i, so odd results land in the upper half of Bout */
+		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
+		xmmp[0] = x0;
+		xmmp[1] = x1;
+		xmmp[2] = x2;
+		xmmp[3] = x3;
+		xmmp[4] = x4;
+		xmmp[5] = x5;
+		xmmp[6] = x6;
+		xmmp[7] = x7;
+	}
+}
+
+#endif
+
+#if defined(SCRYPT_SALSA64_SSE2)
+	/*
+		An SSE2 ChunkMix was defined above (asm or intrinsic): label the active
+		mix "Salsa64/8-SSE2" and (re)define SCRYPT_SALSA64_INCLUDED to record
+		that a Salsa64 implementation is present.
+	*/
+	#undef SCRYPT_MIX
+	#define SCRYPT_MIX "Salsa64/8-SSE2"
+	#undef SCRYPT_SALSA64_INCLUDED
+	#define SCRYPT_SALSA64_INCLUDED
+#endif
+
+/* sse3/avx use this as well */
+#if defined(SCRYPT_SALSA64_INCLUDED)
+	/*
+		Word order inside one 16-word block.
+
+		Default layout:
+		 0  1  2  3
+		 4  5  6  7
+		 8  9 10 11
+		12 13 14 15
+
+		SSE2 layout:
+		 0  5 10 15
+		12  1  6 11
+		 8 13  2  7
+		 4  9 14  3
+
+		The two layouts differ by six pairwise exchanges (words 0, 6, 8 and 14
+		stay put), so applying the routine twice restores the original order.
+	*/
+
+	/*
+		Converts `count` consecutive 16-word blocks starting at `blocks`, in
+		place, from one of the layouts above to the other.
+	*/
+	static void asm_calling_convention
+	salsa64_core_tangle_sse2(uint64_t *blocks, size_t count) {
+		/* each row names two word positions that trade places */
+		static const uint8_t exchange[6][2] = {
+			{1, 5}, {2, 10}, {3, 15}, {4, 12}, {7, 11}, {9, 13}
+		};
+		uint64_t *block = blocks;
+		uint64_t held;
+		size_t done, pair, a, b;
+
+		for (done = 0; done < count; done++, block += 16) {
+			for (pair = 0; pair < 6; pair++) {
+				a = exchange[pair][0];
+				b = exchange[pair][1];
+				held = block[a];
+				block[a] = block[b];
+				block[b] = held;
+			}
+		}
+	}
+#endif
--- a/algo/argon2/ar2/sj/scrypt-jane-mix_salsa64-ssse3.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-mix_salsa64-ssse3.h
@@ -0,0 +1,399 @@
+/* x64 */
+#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
+
+#define SCRYPT_SALSA64_SSSE3
+
+/*
+	scrypt_ChunkMix_ssse3, hand-written x86-64 version (Salsa64/8 core; uses
+	the SSSE3 palignr instruction for the word shuffle between half rounds).
+
+	Register roles as used by the body below. How the C arguments reach these
+	registers is decided by the asm_naked_fn macros, which are defined
+	elsewhere (presumably rdi=Bout, rsi=Bin, rdx=Bxor, ecx=r -- confirm there):
+		rdi        base of the output chunk
+		rsi        base of the input chunk
+		rdx        base of the optional xor chunk; zero means "no xor"
+		rcx        r * 2 * 128 = size of a chunk in bytes
+		r9         byte offset of the current 128-byte input block
+		r8         alternates between 0 and rcx, selecting the output half
+		rax        scratch: last-block pointer, round counter, output pointer
+		xmm0-xmm7  the 128-byte working state X
+		xmm8-xmm11 temporaries
+		[rsp+0..127] copy of X taken before the rounds, added back afterwards
+
+	All state loads/stores use movdqa, so the buffers must be 16-byte aligned.
+*/
+asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
+asm_naked_fn(scrypt_ChunkMix_ssse3)
+	/* prologue: keep the old rsp in rbp, then carve out a 64-byte aligned 128-byte scratch area */
+	a1(push rbp)
+	a2(mov rbp, rsp)
+	a2(and rsp, ~63)
+	a2(sub rsp, 128)
+	a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
+	a2(shl rcx,7)
+	/* rax / r9 = address of the last block of the input / xor chunk */
+	a2(lea r9,[rcx-128])
+	a2(lea rax,[rsi+r9])
+	a2(lea r9,[rdx+r9])
+	/* sets ZF when rdx is zero; the movdqa loads below leave the flags alone, so the jz still sees it */
+	a2(and rdx, rdx)
+	/* X = last input block */
+	a2(movdqa xmm0,[rax+0])
+	a2(movdqa xmm1,[rax+16])
+	a2(movdqa xmm2,[rax+32])
+	a2(movdqa xmm3,[rax+48])
+	a2(movdqa xmm4,[rax+64])
+	a2(movdqa xmm5,[rax+80])
+	a2(movdqa xmm6,[rax+96])
+	a2(movdqa xmm7,[rax+112])
+	aj(jz scrypt_ChunkMix_ssse3_no_xor1)
+	/* X ^= last xor block */
+	a2(pxor xmm0,[r9+0])
+	a2(pxor xmm1,[r9+16])
+	a2(pxor xmm2,[r9+32])
+	a2(pxor xmm3,[r9+48])
+	a2(pxor xmm4,[r9+64])
+	a2(pxor xmm5,[r9+80])
+	a2(pxor xmm6,[r9+96])
+	a2(pxor xmm7,[r9+112])
+	a1(scrypt_ChunkMix_ssse3_no_xor1:)
+	/* block offset and output-half selector both start at zero */
+	a2(xor r9,r9)
+	a2(xor r8,r8)
+	a1(scrypt_ChunkMix_ssse3_loop:)
+		/* ZF = (rdx == 0); consumed by the jz after the pxor group */
+		a2(and rdx, rdx)
+		/* X ^= input block at offset r9 */
+		a2(pxor xmm0,[rsi+r9+0])
+		a2(pxor xmm1,[rsi+r9+16])
+		a2(pxor xmm2,[rsi+r9+32])
+		a2(pxor xmm3,[rsi+r9+48])
+		a2(pxor xmm4,[rsi+r9+64])
+		a2(pxor xmm5,[rsi+r9+80])
+		a2(pxor xmm6,[rsi+r9+96])
+		a2(pxor xmm7,[rsi+r9+112])
+		aj(jz scrypt_ChunkMix_ssse3_no_xor2)
+		/* X ^= xor block at offset r9 */
+		a2(pxor xmm0,[rdx+r9+0])
+		a2(pxor xmm1,[rdx+r9+16])
+		a2(pxor xmm2,[rdx+r9+32])
+		a2(pxor xmm3,[rdx+r9+48])
+		a2(pxor xmm4,[rdx+r9+64])
+		a2(pxor xmm5,[rdx+r9+80])
+		a2(pxor xmm6,[rdx+r9+96])
+		a2(pxor xmm7,[rdx+r9+112])
+		a1(scrypt_ChunkMix_ssse3_no_xor2:)
+		/* remember the pre-round state for the feed-forward addition */
+		a2(movdqa [rsp+0],xmm0)
+		a2(movdqa [rsp+16],xmm1)
+		a2(movdqa [rsp+32],xmm2)
+		a2(movdqa [rsp+48],xmm3)
+		a2(movdqa [rsp+64],xmm4)
+		a2(movdqa [rsp+80],xmm5)
+		a2(movdqa [rsp+96],xmm6)
+		a2(movdqa [rsp+112],xmm7)
+		/* 8 rounds, two per pass through the inner loop */
+		a2(mov rax,8)
+		a1(scrypt_salsa64_ssse3_loop: )
+			/* first half: x6,x7 ^= rot32(x0+x2, x1+x3); pshufd 0xb1 swaps the 32-bit halves of each qword */
+			a2(movdqa xmm8, xmm0)
+			a2(movdqa xmm9, xmm1)
+			a2(paddq xmm8, xmm2)
+			a2(paddq xmm9, xmm3)
+			a3(pshufd xmm8, xmm8, 0xb1)
+			a3(pshufd xmm9, xmm9, 0xb1)
+			a2(pxor xmm6, xmm8)
+			a2(pxor xmm7, xmm9)
+			/* x4,x5 ^= rotl13(x0+x6, x1+x7); the rotate is a psrlq 51 / psllq 13 pair */
+			a2(movdqa xmm10, xmm0)
+			a2(movdqa xmm11, xmm1)
+			a2(paddq xmm10, xmm6)
+			a2(paddq xmm11, xmm7)
+			a2(movdqa xmm8, xmm10)
+			a2(movdqa xmm9, xmm11)
+			a2(psrlq xmm10, 51)
+			a2(psrlq xmm11, 51)
+			a2(psllq xmm8, 13)
+			a2(psllq xmm9, 13)
+			a2(pxor xmm4, xmm10)
+			a2(pxor xmm5, xmm11)
+			a2(pxor xmm4, xmm8)
+			a2(pxor xmm5, xmm9)
+			/* x2,x3 ^= rotl39(x6+x4, x7+x5); psrlq 25 / psllq 39 pair */
+			a2(movdqa xmm10, xmm6)
+			a2(movdqa xmm11, xmm7)
+			a2(paddq xmm10, xmm4)
+			a2(paddq xmm11, xmm5)
+			a2(movdqa xmm8, xmm10)
+			a2(movdqa xmm9, xmm11)
+			a2(psrlq xmm10, 25)
+			a2(psrlq xmm11, 25)
+			a2(psllq xmm8, 39)
+			a2(psllq xmm9, 39)
+			a2(pxor xmm2, xmm10)
+			a2(pxor xmm3, xmm11)
+			a2(pxor xmm2, xmm8)
+			a2(pxor xmm3, xmm9)
+			/* x0,x1 ^= rot32(x4+x2, x5+x3) */
+			a2(movdqa xmm8, xmm4)
+			a2(movdqa xmm9, xmm5)
+			a2(paddq xmm8, xmm2)
+			a2(paddq xmm9, xmm3)
+			a3(pshufd xmm8, xmm8, 0xb1)
+			a3(pshufd xmm9, xmm9, 0xb1)
+			a2(pxor xmm0, xmm8)
+			a2(pxor xmm1, xmm9)
+			/*
+				word shuffle between the two half rounds, via 8-byte palignr:
+				new x2 = (x7.hi, x6.lo), new x3 = (x6.hi, x7.lo),
+				new x6 = (x2.hi, x3.lo), new x7 = (x3.hi, x2.lo)
+			*/
+			a2(movdqa xmm10, xmm2)
+			a2(movdqa xmm11, xmm3)
+			a2(movdqa xmm2, xmm6)
+			a2(movdqa xmm3, xmm7)
+			a3(palignr xmm2, xmm7, 8)
+			a3(palignr xmm3, xmm6, 8)
+			a2(movdqa xmm6, xmm11)
+			a2(movdqa xmm7, xmm10)
+			a3(palignr xmm6, xmm10, 8)
+			a3(palignr xmm7, xmm11, 8)
+			/* round counter -= 2; the SSE instructions up to the ja do not touch the flags set here */
+			a2(sub rax, 2)
+			/* second half: same sequence, but xmm4 and xmm5 trade roles instead of being moved */
+			a2(movdqa xmm8, xmm0)
+			a2(movdqa xmm9, xmm1)
+			a2(paddq xmm8, xmm2)
+			a2(paddq xmm9, xmm3)
+			a3(pshufd xmm8, xmm8, 0xb1)
+			a3(pshufd xmm9, xmm9, 0xb1)
+			a2(pxor xmm6, xmm8)
+			a2(pxor xmm7, xmm9)
+			a2(movdqa xmm10, xmm0)
+			a2(movdqa xmm11, xmm1)
+			a2(paddq xmm10, xmm6)
+			a2(paddq xmm11, xmm7)
+			a2(movdqa xmm8, xmm10)
+			a2(movdqa xmm9, xmm11)
+			a2(psrlq xmm10, 51)
+			a2(psrlq xmm11, 51)
+			a2(psllq xmm8, 13)
+			a2(psllq xmm9, 13)
+			a2(pxor xmm5, xmm10)
+			a2(pxor xmm4, xmm11)
+			a2(pxor xmm5, xmm8)
+			a2(pxor xmm4, xmm9)
+			a2(movdqa xmm10, xmm6)
+			a2(movdqa xmm11, xmm7)
+			a2(paddq xmm10, xmm5)
+			a2(paddq xmm11, xmm4)
+			a2(movdqa xmm8, xmm10)
+			a2(movdqa xmm9, xmm11)
+			a2(psrlq xmm10, 25)
+			a2(psrlq xmm11, 25)
+			a2(psllq xmm8, 39)
+			a2(psllq xmm9, 39)
+			a2(pxor xmm2, xmm10)
+			a2(pxor xmm3, xmm11)
+			a2(pxor xmm2, xmm8)
+			a2(pxor xmm3, xmm9)
+			a2(movdqa xmm8, xmm5)
+			a2(movdqa xmm9, xmm4)
+			a2(paddq xmm8, xmm2)
+			a2(paddq xmm9, xmm3)
+			a3(pshufd xmm8, xmm8, 0xb1)
+			a3(pshufd xmm9, xmm9, 0xb1)
+			a2(pxor xmm0, xmm8)
+			a2(pxor xmm1, xmm9)
+			/* same word shuffle as after the first half */
+			a2(movdqa xmm10, xmm2)
+			a2(movdqa xmm11, xmm3)
+			a2(movdqa xmm2, xmm6)
+			a2(movdqa xmm3, xmm7)
+			a3(palignr xmm2, xmm7, 8)
+			a3(palignr xmm3, xmm6, 8)
+			a2(movdqa xmm6, xmm11)
+			a2(movdqa xmm7, xmm10)
+			a3(palignr xmm6, xmm10, 8)
+			a3(palignr xmm7, xmm11, 8)
+			/* loops while the counter decremented above is still non-zero */
+			aj(ja scrypt_salsa64_ssse3_loop)
+		/* feed-forward: X += state saved before the rounds */
+		a2(paddq xmm0,[rsp+0])
+		a2(paddq xmm1,[rsp+16])
+		a2(paddq xmm2,[rsp+32])
+		a2(paddq xmm3,[rsp+48])
+		a2(paddq xmm4,[rsp+64])
+		a2(paddq xmm5,[rsp+80])
+		a2(paddq xmm6,[rsp+96])
+		a2(paddq xmm7,[rsp+112])
+		/*
+			output offset = ((r8 + r9) & ~0xff) / 2, i.e. block (i / 2) for even i
+			and block (i / 2) + r for odd i, where i = r9 / 128
+		*/
+		a2(lea rax,[r8+r9])
+		a2(xor r8,rcx)
+		a2(and rax,~0xff)
+		a2(add r9,128)
+		a2(shr rax,1)
+		a2(add rax, rdi)
+		/* end-of-chunk test; the movdqa stores below do not disturb the flags for the jne */
+		a2(cmp r9,rcx)
+		a2(movdqa [rax+0],xmm0)
+		a2(movdqa [rax+16],xmm1)
+		a2(movdqa [rax+32],xmm2)
+		a2(movdqa [rax+48],xmm3)
+		a2(movdqa [rax+64],xmm4)
+		a2(movdqa [rax+80],xmm5)
+		a2(movdqa [rax+96],xmm6)
+		a2(movdqa [rax+112],xmm7)
+		aj(jne scrypt_ChunkMix_ssse3_loop)
+	/* epilogue: drop the scratch area and restore the caller's frame */
+	a2(mov rsp, rbp)
+	a1(pop rbp)
+	a1(ret)
+asm_naked_fn_end(scrypt_ChunkMix_ssse3)
+
+#endif
+
+
+/* intrinsic */
+#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
+
+#define SCRYPT_SALSA64_SSSE3
+
+/*
+	ChunkMix built on the Salsa64/8 core, SSSE3 intrinsics version
+	(_mm_alignr_epi8 performs the word shuffle between half rounds).
+
+	Bout  receives the mixed chunk (2*r blocks)
+	Bin   input chunk (2*r blocks); only read
+	Bxor  optional second chunk; when non-NULL every block read from Bin is
+	      additionally XORed with the matching Bxor block, so the input
+	      behaves as Bin ^ Bxor
+	r     block-size parameter; a chunk holds 2*r blocks
+
+	Each block is handled as eight xmmi values (x0..x7). Block addresses come
+	from scrypt_block(), which is defined elsewhere.
+	NOTE(review): blocks are read and written through plain xmmi pointers, so
+	the buffers presumably have to be 16-byte aligned -- confirm against the
+	definition of xmmi.
+*/
+static void asm_calling_convention
+scrypt_ChunkMix_ssse3(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
+	/* half flips between 0 and r on every block and selects the output half */
+	uint32_t i, blocksPerChunk = r * 2, half = 0;
+	xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
+	size_t rounds;
+
+	/* 1: X = B_{2r - 1} */
+	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
+	x0 = xmmp[0];
+	x1 = xmmp[1];
+	x2 = xmmp[2];
+	x3 = xmmp[3];
+	x4 = xmmp[4];
+	x5 = xmmp[5];
+	x6 = xmmp[6];
+	x7 = xmmp[7];
+
+	if (Bxor) {
+		xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
+		x0 = _mm_xor_si128(x0, xmmp[0]);
+		x1 = _mm_xor_si128(x1, xmmp[1]);
+		x2 = _mm_xor_si128(x2, xmmp[2]);
+		x3 = _mm_xor_si128(x3, xmmp[3]);
+		x4 = _mm_xor_si128(x4, xmmp[4]);
+		x5 = _mm_xor_si128(x5, xmmp[5]);
+		x6 = _mm_xor_si128(x6, xmmp[6]);
+		x7 = _mm_xor_si128(x7, xmmp[7]);
+	}
+
+	/* 2: for i = 0 to 2r - 1 do */
+	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
+		/* 3: X = H(X ^ B_i) */
+		xmmp = (xmmi *)scrypt_block(Bin, i);
+		x0 = _mm_xor_si128(x0, xmmp[0]);
+		x1 = _mm_xor_si128(x1, xmmp[1]);
+		x2 = _mm_xor_si128(x2, xmmp[2]);
+		x3 = _mm_xor_si128(x3, xmmp[3]);
+		x4 = _mm_xor_si128(x4, xmmp[4]);
+		x5 = _mm_xor_si128(x5, xmmp[5]);
+		x6 = _mm_xor_si128(x6, xmmp[6]);
+		x7 = _mm_xor_si128(x7, xmmp[7]);
+
+		if (Bxor) {
+			xmmp = (xmmi *)scrypt_block(Bxor, i);
+			x0 = _mm_xor_si128(x0, xmmp[0]);
+			x1 = _mm_xor_si128(x1, xmmp[1]);
+			x2 = _mm_xor_si128(x2, xmmp[2]);
+			x3 = _mm_xor_si128(x3, xmmp[3]);
+			x4 = _mm_xor_si128(x4, xmmp[4]);
+			x5 = _mm_xor_si128(x5, xmmp[5]);
+			x6 = _mm_xor_si128(x6, xmmp[6]);
+			x7 = _mm_xor_si128(x7, xmmp[7]);
+		}
+
+		/* keep the pre-round state for the feed-forward addition after the rounds */
+		t0 = x0;
+		t1 = x1;
+		t2 = x2;
+		t3 = x3;
+		t4 = x4;
+		t5 = x5;
+		t6 = x6;
+		t7 = x7;
+
+		/* 8 rounds, two per loop iteration */
+		for (rounds = 8; rounds; rounds -= 2) {
+			/* x6,x7 ^= rot32(x0+x2, x1+x3); the shuffle swaps the 32-bit halves of each 64-bit lane */
+			z0 = _mm_add_epi64(x0, x2);
+			z1 = _mm_add_epi64(x1, x3);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x6 = _mm_xor_si128(x6, z0);
+			x7 = _mm_xor_si128(x7, z1);
+
+			/* x4,x5 ^= rotl13(x6+x0, x7+x1); rotate built from a right/left shift pair */
+			z0 = _mm_add_epi64(x6, x0);
+			z1 = _mm_add_epi64(x7, x1);
+			z2 = _mm_srli_epi64(z0, 64-13);
+			z3 = _mm_srli_epi64(z1, 64-13);
+			z0 = _mm_slli_epi64(z0, 13);
+			z1 = _mm_slli_epi64(z1, 13);
+			x4 = _mm_xor_si128(x4, z2);
+			x5 = _mm_xor_si128(x5, z3);
+			x4 = _mm_xor_si128(x4, z0);
+			x5 = _mm_xor_si128(x5, z1);
+
+			/* x2,x3 ^= rotl39(x4+x6, x5+x7) */
+			z0 = _mm_add_epi64(x4, x6);
+			z1 = _mm_add_epi64(x5, x7);
+			z2 = _mm_srli_epi64(z0, 64-39);
+			z3 = _mm_srli_epi64(z1, 64-39);
+			z0 = _mm_slli_epi64(z0, 39);
+			z1 = _mm_slli_epi64(z1, 39);
+			x2 = _mm_xor_si128(x2, z2);
+			x3 = _mm_xor_si128(x3, z3);
+			x2 = _mm_xor_si128(x2, z0);
+			x3 = _mm_xor_si128(x3, z1);
+
+			/* x0,x1 ^= rot32(x2+x4, x3+x5) */
+			z0 = _mm_add_epi64(x2, x4);
+			z1 = _mm_add_epi64(x3, x5);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x0 = _mm_xor_si128(x0, z0);
+			x1 = _mm_xor_si128(x1, z1);
+
+			/*
+				word shuffle between the two half rounds:
+				new x2 = (x7.hi, x6.lo), new x3 = (x6.hi, x7.lo),
+				new x6 = (x2.hi, x3.lo), new x7 = (x3.hi, x2.lo)
+			*/
+			z0 = x2;
+			z1 = x3;
+			x2 = _mm_alignr_epi8(x6, x7, 8);
+			x3 = _mm_alignr_epi8(x7, x6, 8);
+			x6 = _mm_alignr_epi8(z1, z0, 8);
+			x7 = _mm_alignr_epi8(z0, z1, 8);
+
+			/* second half round: same steps, with x4 and x5 trading roles rather than being swapped */
+			z0 = _mm_add_epi64(x0, x2);
+			z1 = _mm_add_epi64(x1, x3);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x6 = _mm_xor_si128(x6, z0);
+			x7 = _mm_xor_si128(x7, z1);
+
+			z0 = _mm_add_epi64(x6, x0);
+			z1 = _mm_add_epi64(x7, x1);
+			z2 = _mm_srli_epi64(z0, 64-13);
+			z3 = _mm_srli_epi64(z1, 64-13);
+			z0 = _mm_slli_epi64(z0, 13);
+			z1 = _mm_slli_epi64(z1, 13);
+			x5 = _mm_xor_si128(x5, z2);
+			x4 = _mm_xor_si128(x4, z3);
+			x5 = _mm_xor_si128(x5, z0);
+			x4 = _mm_xor_si128(x4, z1);
+
+			z0 = _mm_add_epi64(x5, x6);
+			z1 = _mm_add_epi64(x4, x7);
+			z2 = _mm_srli_epi64(z0, 64-39);
+			z3 = _mm_srli_epi64(z1, 64-39);
+			z0 = _mm_slli_epi64(z0, 39);
+			z1 = _mm_slli_epi64(z1, 39);
+			x2 = _mm_xor_si128(x2, z2);
+			x3 = _mm_xor_si128(x3, z3);
+			x2 = _mm_xor_si128(x2, z0);
+			x3 = _mm_xor_si128(x3, z1);
+
+			z0 = _mm_add_epi64(x2, x5);
+			z1 = _mm_add_epi64(x3, x4);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x0 = _mm_xor_si128(x0, z0);
+			x1 = _mm_xor_si128(x1, z1);
+
+			/* same word shuffle again */
+			z0 = x2;
+			z1 = x3;
+			x2 = _mm_alignr_epi8(x6, x7, 8);
+			x3 = _mm_alignr_epi8(x7, x6, 8);
+			x6 = _mm_alignr_epi8(z1, z0, 8);
+			x7 = _mm_alignr_epi8(z0, z1, 8);
+		}
+
+		/* feed-forward: add the pre-round state back in */
+		x0 = _mm_add_epi64(x0, t0);
+		x1 = _mm_add_epi64(x1, t1);
+		x2 = _mm_add_epi64(x2, t2);
+		x3 = _mm_add_epi64(x3, t3);
+		x4 = _mm_add_epi64(x4, t4);
+		x5 = _mm_add_epi64(x5, t5);
+		x6 = _mm_add_epi64(x6, t6);
+		x7 = _mm_add_epi64(x7, t7);
+
+		/* 4: Y_i = X */
+		/* 6: B'[0..r-1] = Y_even */
+		/* 6: B'[r..2r-1] = Y_odd */
+		/* half is 0 for even i and r for odd i, so odd results land in the upper half of Bout */
+		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
+		xmmp[0] = x0;
+		xmmp[1] = x1;
+		xmmp[2] = x2;
+		xmmp[3] = x3;
+		xmmp[4] = x4;
+		xmmp[5] = x5;
+		xmmp[6] = x6;
+		xmmp[7] = x7;
+	}
+}
+
+#endif
+
+#if defined(SCRYPT_SALSA64_SSSE3)
+	/*
+		An SSSE3 ChunkMix was defined above (asm or intrinsic): label the
+		active mix "Salsa64/8-SSSE3" and (re)define SCRYPT_SALSA64_INCLUDED to
+		record that a Salsa64 implementation is present.
+	*/
+	/* uses salsa64_core_tangle_sse2 */
+
+	#undef SCRYPT_MIX
+	#define SCRYPT_MIX "Salsa64/8-SSSE3"
+	#undef SCRYPT_SALSA64_INCLUDED
+	#define SCRYPT_SALSA64_INCLUDED
+#endif
--- a/algo/argon2/ar2/sj/scrypt-jane-mix_salsa64-xop.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-mix_salsa64-xop.h
@@ -0,0 +1,335 @@
+/* x64 */
+#if defined(X86_64ASM_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
+
+#define SCRYPT_SALSA64_XOP
+
+/*
+	scrypt_ChunkMix_xop, hand-written x86-64 version (Salsa64/8 core; uses
+	VEX three-operand forms and the XOP vprotq rotate instruction).
+
+	Register roles as used by the body below. How the C arguments reach these
+	registers is decided by the asm_naked_fn macros, which are defined
+	elsewhere (presumably rdi=Bout, rsi=Bin, rdx=Bxor, ecx=r -- confirm there):
+		rdi        base of the output chunk
+		rsi        base of the input chunk
+		rdx        base of the optional xor chunk; zero means "no xor"
+		rcx        r * 2 * 128 = size of a chunk in bytes
+		r9         byte offset of the current 128-byte input block
+		r8         alternates between 0 and rcx, selecting the output half
+		rax        scratch: last-block pointer, round counter, output pointer
+		xmm0-xmm7  the 128-byte working state X
+		xmm8-xmm11 temporaries
+		[rsp+0..127] copy of X taken before the rounds, added back afterwards
+
+	All state loads/stores use vmovdqa, so the buffers must be 16-byte aligned.
+*/
+asm_naked_fn_proto(void, scrypt_ChunkMix_xop)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
+asm_naked_fn(scrypt_ChunkMix_xop)
+	/* prologue: keep the old rsp in rbp, then carve out a 64-byte aligned 128-byte scratch area */
+	a1(push rbp)
+	a2(mov rbp, rsp)
+	a2(and rsp, ~63)
+	a2(sub rsp, 128)
+	a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
+	a2(shl rcx,7)
+	/* rax / r9 = address of the last block of the input / xor chunk */
+	a2(lea r9,[rcx-128])
+	a2(lea rax,[rsi+r9])
+	a2(lea r9,[rdx+r9])
+	/* sets ZF when rdx is zero; the vmovdqa loads below leave the flags alone, so the jz still sees it */
+	a2(and rdx, rdx)
+	/* X = last input block */
+	a2(vmovdqa xmm0,[rax+0])
+	a2(vmovdqa xmm1,[rax+16])
+	a2(vmovdqa xmm2,[rax+32])
+	a2(vmovdqa xmm3,[rax+48])
+	a2(vmovdqa xmm4,[rax+64])
+	a2(vmovdqa xmm5,[rax+80])
+	a2(vmovdqa xmm6,[rax+96])
+	a2(vmovdqa xmm7,[rax+112])
+	aj(jz scrypt_ChunkMix_xop_no_xor1)
+	/* X ^= last xor block */
+	a3(vpxor xmm0,xmm0,[r9+0])
+	a3(vpxor xmm1,xmm1,[r9+16])
+	a3(vpxor xmm2,xmm2,[r9+32])
+	a3(vpxor xmm3,xmm3,[r9+48])
+	a3(vpxor xmm4,xmm4,[r9+64])
+	a3(vpxor xmm5,xmm5,[r9+80])
+	a3(vpxor xmm6,xmm6,[r9+96])
+	a3(vpxor xmm7,xmm7,[r9+112])
+	a1(scrypt_ChunkMix_xop_no_xor1:)
+	/* block offset and output-half selector both start at zero */
+	a2(xor r9,r9)
+	a2(xor r8,r8)
+	a1(scrypt_ChunkMix_xop_loop:)
+		/* ZF = (rdx == 0); consumed by the jz after the vpxor group */
+		a2(and rdx, rdx)
+		/* X ^= input block at offset r9 */
+		a3(vpxor xmm0,xmm0,[rsi+r9+0])
+		a3(vpxor xmm1,xmm1,[rsi+r9+16])
+		a3(vpxor xmm2,xmm2,[rsi+r9+32])
+		a3(vpxor xmm3,xmm3,[rsi+r9+48])
+		a3(vpxor xmm4,xmm4,[rsi+r9+64])
+		a3(vpxor xmm5,xmm5,[rsi+r9+80])
+		a3(vpxor xmm6,xmm6,[rsi+r9+96])
+		a3(vpxor xmm7,xmm7,[rsi+r9+112])
+		aj(jz scrypt_ChunkMix_xop_no_xor2)
+		/* X ^= xor block at offset r9 */
+		a3(vpxor xmm0,xmm0,[rdx+r9+0])
+		a3(vpxor xmm1,xmm1,[rdx+r9+16])
+		a3(vpxor xmm2,xmm2,[rdx+r9+32])
+		a3(vpxor xmm3,xmm3,[rdx+r9+48])
+		a3(vpxor xmm4,xmm4,[rdx+r9+64])
+		a3(vpxor xmm5,xmm5,[rdx+r9+80])
+		a3(vpxor xmm6,xmm6,[rdx+r9+96])
+		a3(vpxor xmm7,xmm7,[rdx+r9+112])
+		a1(scrypt_ChunkMix_xop_no_xor2:)
+		/* remember the pre-round state for the feed-forward addition */
+		a2(vmovdqa [rsp+0],xmm0)
+		a2(vmovdqa [rsp+16],xmm1)
+		a2(vmovdqa [rsp+32],xmm2)
+		a2(vmovdqa [rsp+48],xmm3)
+		a2(vmovdqa [rsp+64],xmm4)
+		a2(vmovdqa [rsp+80],xmm5)
+		a2(vmovdqa [rsp+96],xmm6)
+		a2(vmovdqa [rsp+112],xmm7)
+		/* 8 rounds, two per pass through the inner loop */
+		a2(mov rax,8)
+		a1(scrypt_salsa64_xop_loop: )
+			/* first half: x6,x7 ^= rot32(x0+x2, x1+x3); vpshufd 0xb1 swaps the 32-bit halves of each qword */
+			a3(vpaddq xmm8, xmm0, xmm2)
+			a3(vpaddq xmm9, xmm1, xmm3)
+			a3(vpshufd xmm8, xmm8, 0xb1)
+			a3(vpshufd xmm9, xmm9, 0xb1)
+			a3(vpxor xmm6, xmm6, xmm8)
+			a3(vpxor xmm7, xmm7, xmm9)
+			/* x4,x5 ^= rotl13(x0+x6, x1+x7) */
+			a3(vpaddq xmm10, xmm0, xmm6)
+			a3(vpaddq xmm11, xmm1, xmm7)
+			a3(vprotq xmm10, xmm10, 13)
+			a3(vprotq xmm11, xmm11, 13)
+			a3(vpxor xmm4, xmm4, xmm10)
+			a3(vpxor xmm5, xmm5, xmm11)
+			/* x2,x3 ^= rotl39(x6+x4, x7+x5) */
+			a3(vpaddq xmm8, xmm6, xmm4)
+			a3(vpaddq xmm9, xmm7, xmm5)
+			a3(vprotq xmm8, xmm8, 39)
+			a3(vprotq xmm9, xmm9, 39)
+			a3(vpxor xmm2, xmm2, xmm8)
+			a3(vpxor xmm3, xmm3, xmm9)
+			/* x0,x1 ^= rot32(x4+x2, x5+x3) */
+			a3(vpaddq xmm10, xmm4, xmm2)
+			a3(vpaddq xmm11, xmm5, xmm3)
+			a3(vpshufd xmm10, xmm10, 0xb1)
+			a3(vpshufd xmm11, xmm11, 0xb1)
+			a3(vpxor xmm0, xmm0, xmm10)
+			a3(vpxor xmm1, xmm1, xmm11)
+			/*
+				word shuffle between the two half rounds, via 8-byte vpalignr:
+				new x2 = (x7.hi, x6.lo), new x3 = (x6.hi, x7.lo),
+				new x6 = (x2.hi, x3.lo), new x7 = (x3.hi, x2.lo)
+			*/
+			a2(vmovdqa xmm8, xmm2)
+			a2(vmovdqa xmm9, xmm3)
+			a4(vpalignr xmm2, xmm6, xmm7, 8)
+			a4(vpalignr xmm3, xmm7, xmm6, 8)
+			a4(vpalignr xmm6, xmm9, xmm8, 8)
+			a4(vpalignr xmm7, xmm8, xmm9, 8)
+			/* second half: same sequence, but xmm4 and xmm5 trade roles instead of being moved */
+			a3(vpaddq xmm10, xmm0, xmm2)
+			a3(vpaddq xmm11, xmm1, xmm3)
+			a3(vpshufd xmm10, xmm10, 0xb1)
+			a3(vpshufd xmm11, xmm11, 0xb1)
+			a3(vpxor xmm6, xmm6, xmm10)
+			a3(vpxor xmm7, xmm7, xmm11)
+			a3(vpaddq xmm8, xmm0, xmm6)
+			a3(vpaddq xmm9, xmm1, xmm7)
+			a3(vprotq xmm8, xmm8, 13)
+			a3(vprotq xmm9, xmm9, 13)
+			a3(vpxor xmm5, xmm5, xmm8)
+			a3(vpxor xmm4, xmm4, xmm9)
+			a3(vpaddq xmm10, xmm6, xmm5)
+			a3(vpaddq xmm11, xmm7, xmm4)
+			a3(vprotq xmm10, xmm10, 39)
+			a3(vprotq xmm11, xmm11, 39)
+			a3(vpxor xmm2, xmm2, xmm10)
+			a3(vpxor xmm3, xmm3, xmm11)
+			a3(vpaddq xmm8, xmm5, xmm2)
+			a3(vpaddq xmm9, xmm4, xmm3)
+			a3(vpshufd xmm8, xmm8, 0xb1)
+			a3(vpshufd xmm9, xmm9, 0xb1)
+			a3(vpxor xmm0, xmm0, xmm8)
+			a3(vpxor xmm1, xmm1, xmm9)
+			/* same word shuffle as after the first half */
+			a2(vmovdqa xmm10, xmm2)
+			a2(vmovdqa xmm11, xmm3)
+			a4(vpalignr xmm2, xmm6, xmm7, 8)
+			a4(vpalignr xmm3, xmm7, xmm6, 8)
+			a4(vpalignr xmm6, xmm11, xmm10, 8)
+			a4(vpalignr xmm7, xmm10, xmm11, 8)
+			/* round counter -= 2; loop while it is still non-zero */
+			a2(sub rax, 2)
+			aj(ja scrypt_salsa64_xop_loop)
+		/* feed-forward: X += state saved before the rounds */
+		a3(vpaddq xmm0,xmm0,[rsp+0])
+		a3(vpaddq xmm1,xmm1,[rsp+16])
+		a3(vpaddq xmm2,xmm2,[rsp+32])
+		a3(vpaddq xmm3,xmm3,[rsp+48])
+		a3(vpaddq xmm4,xmm4,[rsp+64])
+		a3(vpaddq xmm5,xmm5,[rsp+80])
+		a3(vpaddq xmm6,xmm6,[rsp+96])
+		a3(vpaddq xmm7,xmm7,[rsp+112])
+		/*
+			output offset = ((r8 + r9) & ~0xff) / 2, i.e. block (i / 2) for even i
+			and block (i / 2) + r for odd i, where i = r9 / 128
+		*/
+		a2(lea rax,[r8+r9])
+		a2(xor r8,rcx)
+		a2(and rax,~0xff)
+		a2(add r9,128)
+		a2(shr rax,1)
+		a2(add rax, rdi)
+		/* end-of-chunk test; the vmovdqa stores below do not disturb the flags for the jne */
+		a2(cmp r9,rcx)
+		a2(vmovdqa [rax+0],xmm0)
+		a2(vmovdqa [rax+16],xmm1)
+		a2(vmovdqa [rax+32],xmm2)
+		a2(vmovdqa [rax+48],xmm3)
+		a2(vmovdqa [rax+64],xmm4)
+		a2(vmovdqa [rax+80],xmm5)
+		a2(vmovdqa [rax+96],xmm6)
+		a2(vmovdqa [rax+112],xmm7)
+		aj(jne scrypt_ChunkMix_xop_loop)
+	/* epilogue: drop the scratch area and restore the caller's frame */
+	a2(mov rsp, rbp)
+	a1(pop rbp)
+	a1(ret)
+asm_naked_fn_end(scrypt_ChunkMix_xop)
+
+#endif
+
+
+/* intrinsic */
+#if defined(X86_INTRINSIC_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
+
+#define SCRYPT_SALSA64_XOP
+
+/*
+	ChunkMix built on the Salsa64/8 core, XOP intrinsics version
+	(_mm_roti_epi64 performs the 13- and 39-bit rotates directly).
+
+	Bout  receives the mixed chunk (2*r blocks)
+	Bin   input chunk (2*r blocks); only read
+	Bxor  optional second chunk; when non-NULL every block read from Bin is
+	      additionally XORed with the matching Bxor block, so the input
+	      behaves as Bin ^ Bxor
+	r     block-size parameter; a chunk holds 2*r blocks
+
+	Each block is handled as eight xmmi values (x0..x7). Block addresses come
+	from scrypt_block(), which is defined elsewhere.
+	NOTE(review): blocks are read and written through plain xmmi pointers, so
+	the buffers presumably have to be 16-byte aligned -- confirm against the
+	definition of xmmi.
+*/
+static void asm_calling_convention
+scrypt_ChunkMix_xop(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
+	/* half flips between 0 and r on every block and selects the output half */
+	uint32_t i, blocksPerChunk = r * 2, half = 0;
+	xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1;
+	size_t rounds;
+
+	/* 1: X = B_{2r - 1} */
+	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
+	x0 = xmmp[0];
+	x1 = xmmp[1];
+	x2 = xmmp[2];
+	x3 = xmmp[3];
+	x4 = xmmp[4];
+	x5 = xmmp[5];
+	x6 = xmmp[6];
+	x7 = xmmp[7];
+
+	if (Bxor) {
+		xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
+		x0 = _mm_xor_si128(x0, xmmp[0]);
+		x1 = _mm_xor_si128(x1, xmmp[1]);
+		x2 = _mm_xor_si128(x2, xmmp[2]);
+		x3 = _mm_xor_si128(x3, xmmp[3]);
+		x4 = _mm_xor_si128(x4, xmmp[4]);
+		x5 = _mm_xor_si128(x5, xmmp[5]);
+		x6 = _mm_xor_si128(x6, xmmp[6]);
+		x7 = _mm_xor_si128(x7, xmmp[7]);
+	}
+
+	/* 2: for i = 0 to 2r - 1 do */
+	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
+		/* 3: X = H(X ^ B_i) */
+		xmmp = (xmmi *)scrypt_block(Bin, i);
+		x0 = _mm_xor_si128(x0, xmmp[0]);
+		x1 = _mm_xor_si128(x1, xmmp[1]);
+		x2 = _mm_xor_si128(x2, xmmp[2]);
+		x3 = _mm_xor_si128(x3, xmmp[3]);
+		x4 = _mm_xor_si128(x4, xmmp[4]);
+		x5 = _mm_xor_si128(x5, xmmp[5]);
+		x6 = _mm_xor_si128(x6, xmmp[6]);
+		x7 = _mm_xor_si128(x7, xmmp[7]);
+
+		if (Bxor) {
+			xmmp = (xmmi *)scrypt_block(Bxor, i);
+			x0 = _mm_xor_si128(x0, xmmp[0]);
+			x1 = _mm_xor_si128(x1, xmmp[1]);
+			x2 = _mm_xor_si128(x2, xmmp[2]);
+			x3 = _mm_xor_si128(x3, xmmp[3]);
+			x4 = _mm_xor_si128(x4, xmmp[4]);
+			x5 = _mm_xor_si128(x5, xmmp[5]);
+			x6 = _mm_xor_si128(x6, xmmp[6]);
+			x7 = _mm_xor_si128(x7, xmmp[7]);
+		}
+
+		/* keep the pre-round state for the feed-forward addition after the rounds */
+		t0 = x0;
+		t1 = x1;
+		t2 = x2;
+		t3 = x3;
+		t4 = x4;
+		t5 = x5;
+		t6 = x6;
+		t7 = x7;
+
+		/* 8 rounds, two per loop iteration */
+		for (rounds = 8; rounds; rounds -= 2) {
+			/* x6,x7 ^= rot32(x0+x2, x1+x3); the shuffle swaps the 32-bit halves of each 64-bit lane */
+			z0 = _mm_add_epi64(x0, x2);
+			z1 = _mm_add_epi64(x1, x3);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x6 = _mm_xor_si128(x6, z0);
+			x7 = _mm_xor_si128(x7, z1);
+
+			/* x4,x5 ^= rotl13(x6+x0, x7+x1) */
+			z0 = _mm_add_epi64(x6, x0);
+			z1 = _mm_add_epi64(x7, x1);
+			z0 = _mm_roti_epi64(z0, 13);
+			z1 = _mm_roti_epi64(z1, 13);
+			x4 = _mm_xor_si128(x4, z0);
+			x5 = _mm_xor_si128(x5, z1);
+
+			/* x2,x3 ^= rotl39(x4+x6, x5+x7) */
+			z0 = _mm_add_epi64(x4, x6);
+			z1 = _mm_add_epi64(x5, x7);
+			z0 = _mm_roti_epi64(z0, 39);
+			z1 = _mm_roti_epi64(z1, 39);
+			x2 = _mm_xor_si128(x2, z0);
+			x3 = _mm_xor_si128(x3, z1);
+
+			/* x0,x1 ^= rot32(x2+x4, x3+x5) */
+			z0 = _mm_add_epi64(x2, x4);
+			z1 = _mm_add_epi64(x3, x5);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x0 = _mm_xor_si128(x0, z0);
+			x1 = _mm_xor_si128(x1, z1);
+
+			/*
+				word shuffle between the two half rounds:
+				new x2 = (x7.hi, x6.lo), new x3 = (x6.hi, x7.lo),
+				new x6 = (x2.hi, x3.lo), new x7 = (x3.hi, x2.lo)
+			*/
+			z0 = x2;
+			z1 = x3;
+			x2 = _mm_alignr_epi8(x6, x7, 8);
+			x3 = _mm_alignr_epi8(x7, x6, 8);
+			x6 = _mm_alignr_epi8(z1, z0, 8);
+			x7 = _mm_alignr_epi8(z0, z1, 8);
+
+			/* second half round: same steps, with x4 and x5 trading roles rather than being swapped */
+			z0 = _mm_add_epi64(x0, x2);
+			z1 = _mm_add_epi64(x1, x3);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x6 = _mm_xor_si128(x6, z0);
+			x7 = _mm_xor_si128(x7, z1);
+
+			z0 = _mm_add_epi64(x6, x0);
+			z1 = _mm_add_epi64(x7, x1);
+			z0 = _mm_roti_epi64(z0, 13);
+			z1 = _mm_roti_epi64(z1, 13);
+			x5 = _mm_xor_si128(x5, z0);
+			x4 = _mm_xor_si128(x4, z1);
+
+			z0 = _mm_add_epi64(x5, x6);
+			z1 = _mm_add_epi64(x4, x7);
+			z0 = _mm_roti_epi64(z0, 39);
+			z1 = _mm_roti_epi64(z1, 39);
+			x2 = _mm_xor_si128(x2, z0);
+			x3 = _mm_xor_si128(x3, z1);
+
+			z0 = _mm_add_epi64(x2, x5);
+			z1 = _mm_add_epi64(x3, x4);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x0 = _mm_xor_si128(x0, z0);
+			x1 = _mm_xor_si128(x1, z1);
+
+			/* same word shuffle again */
+			z0 = x2;
+			z1 = x3;
+			x2 = _mm_alignr_epi8(x6, x7, 8);
+			x3 = _mm_alignr_epi8(x7, x6, 8);
+			x6 = _mm_alignr_epi8(z1, z0, 8);
+			x7 = _mm_alignr_epi8(z0, z1, 8);
+		}
+
+		/* feed-forward: add the pre-round state back in */
+		x0 = _mm_add_epi64(x0, t0);
+		x1 = _mm_add_epi64(x1, t1);
+		x2 = _mm_add_epi64(x2, t2);
+		x3 = _mm_add_epi64(x3, t3);
+		x4 = _mm_add_epi64(x4, t4);
+		x5 = _mm_add_epi64(x5, t5);
+		x6 = _mm_add_epi64(x6, t6);
+		x7 = _mm_add_epi64(x7, t7);
+
+		/* 4: Y_i = X */
+		/* 6: B'[0..r-1] = Y_even */
+		/* 6: B'[r..2r-1] = Y_odd */
+		/* half is 0 for even i and r for odd i, so odd results land in the upper half of Bout */
+		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
+		xmmp[0] = x0;
+		xmmp[1] = x1;
+		xmmp[2] = x2;
+		xmmp[3] = x3;
+		xmmp[4] = x4;
+		xmmp[5] = x5;
+		xmmp[6] = x6;
+		xmmp[7] = x7;
+	}
+}
+
+#endif
+
+#if defined(SCRYPT_SALSA64_XOP)
+	/*
+		An XOP ChunkMix was defined above (asm or intrinsic): label the active
+		mix "Salsa64/8-XOP" and (re)define SCRYPT_SALSA64_INCLUDED to record
+		that a Salsa64 implementation is present.
+	*/
+	/* uses salsa64_core_tangle_sse2 */
+
+	#undef SCRYPT_MIX
+	#define SCRYPT_MIX "Salsa64/8-XOP"
+	#undef SCRYPT_SALSA64_INCLUDED
+	#define SCRYPT_SALSA64_INCLUDED
+#endif
--- a/algo/argon2/ar2/sj/scrypt-jane-mix_salsa64.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-mix_salsa64.h
@@ -0,0 +1,41 @@
+#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)
+
+#undef SCRYPT_MIX
+#define SCRYPT_MIX "Salsa64/8 Ref"
+
+#undef SCRYPT_SALSA64_INCLUDED
+#define SCRYPT_SALSA64_INCLUDED
+#define SCRYPT_SALSA64_BASIC
+
+static void /* Salsa64/8 reference core: mixes a copy of the 16-word state, then adds the copy back into state */
+salsa64_core_basic(uint64_t state[16]) {
+	const size_t rounds = 8; /* the loop below steps by 2, so its body runs rounds / 2 = 4 times */
+	uint64_t v[16], t;
+	size_t i;
+
+	for (i = 0; i < 16; i++) v[i] = state[i]; /* work on a copy; state itself is only touched by the final add */
+
+	#define G(a,b,c,d) \
+		t = v[a]+v[d]; t = ROTL64(t, 32); v[b] ^= t; \
+		t = v[b]+v[a]; t = ROTL64(t, 13); v[c] ^= t; \
+		t = v[c]+v[b]; t = ROTL64(t, 39); v[d] ^= t; \
+		t = v[d]+v[c]; t = ROTL64(t, 32); v[a] ^= t; \
+
+	for (i = 0; i < rounds; i += 2) {
+		G( 0, 4, 8,12); /* first four calls: index groups that step by 4 through v (wrapping at 16) */
+		G( 5, 9,13, 1);
+		G(10,14, 2, 6);
+		G(15, 3, 7,11);
+		G( 0, 1, 2, 3); /* last four calls: index groups that stay inside one run of four consecutive words */
+		G( 5, 6, 7, 4);
+		G(10,11, 8, 9);
+		G(15,12,13,14);
+	}
+
+	for (i = 0; i < 16; i++) state[i] += v[i]; /* feed-forward: add the mixed copy into the caller's state */
+
+	#undef G
+}
+
+#endif
+
--- a/algo/argon2/ar2/sj/scrypt-jane-pbkdf2.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-pbkdf2.h
@@ -0,0 +1,112 @@
+typedef struct scrypt_hmac_state_t {
+	scrypt_hash_state inner, outer; /* two running hashes: inner absorbs (key ^ 0x36) then the message, outer absorbs (key ^ 0x5c) then the inner digest */
+} scrypt_hmac_state;
+
+
+static void /* one-shot hash: hash = H(m[0..mlen)) computed with a temporary state */
+scrypt_hash(scrypt_hash_digest hash, const uint8_t *m, size_t mlen) {
+	scrypt_hash_state st;
+	scrypt_hash_init(&st);
+	scrypt_hash_update(&st, m, mlen);
+	scrypt_hash_finish(&st, hash);
+}
+
+/* hmac: keys both running hashes of st from key (zero-padded to one block, or replaced by its digest when longer than a block) */
+static void
+scrypt_hmac_init(scrypt_hmac_state *st, const uint8_t *key, size_t keylen) {
+	uint8_t block[SCRYPT_HASH_BLOCK_SIZE] = {0};
+	size_t k;
+
+	scrypt_hash_init(&st->inner);
+	scrypt_hash_init(&st->outer);
+
+	if (keylen > SCRYPT_HASH_BLOCK_SIZE) {
+		/* oversized key: the block holds the digest of the key */
+		scrypt_hash(block, key, keylen);
+	} else {
+		/* key fits: the block holds the key itself, the tail stays zero */
+		memcpy(block, key, keylen);
+	}
+
+	/* inner pad: block ^ 0x36 */
+	/* the inner hash starts as h(inner pad || ...) */
+	for (k = 0; k != SCRYPT_HASH_BLOCK_SIZE; ++k)
+		block[k] ^= 0x36;
+	scrypt_hash_update(&st->inner, block, SCRYPT_HASH_BLOCK_SIZE);
+
+	/* outer pad: block ^ 0x5c; xor-ing with (0x36 ^ 0x5c) removes the inner pad and applies the outer one in one pass */
+	/* the outer hash starts as h(outer pad || ...) */
+	for (k = 0; k != SCRYPT_HASH_BLOCK_SIZE; ++k)
+		block[k] ^= (0x36 ^ 0x5c);
+	scrypt_hash_update(&st->outer, block, SCRYPT_HASH_BLOCK_SIZE);
+
+	scrypt_ensure_zero(block, sizeof(block));
+}
+
+static void
+scrypt_hmac_update(scrypt_hmac_state *st, const uint8_t *m, size_t mlen) {
+	/* h(inner || m...): message bytes only go into the inner hash; the outer hash is not touched here */
+	scrypt_hash_update(&st->inner, m, mlen);
+}
+
+static void /* completes the HMAC: writes the tag to mac and wipes *st, so st must be re-initialised before any reuse */
+scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac) {
+	/* h(inner || m) */
+	scrypt_hash_digest innerhash;
+	scrypt_hash_finish(&st->inner, innerhash);
+
+	/* h(outer || h(inner || m)) */
+	scrypt_hash_update(&st->outer, innerhash, sizeof(innerhash));
+	scrypt_hash_finish(&st->outer, mac);
+	scrypt_ensure_zero(innerhash, sizeof(innerhash)); /* the inner digest is key-dependent too; do not leave it on the stack */
+	scrypt_ensure_zero(st, sizeof(*st));
+}
+
+static void /* PBKDF2 built on this file's HMAC: fills out[0..bytes) from password and salt with N iterations per output block (N == 0 behaves like N == 1) */
+scrypt_pbkdf2(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint64_t N, uint8_t *out, size_t bytes) {
+	scrypt_hmac_state hmac_pw, hmac_pw_salt, work;
+	scrypt_hash_digest ti, u;
+	uint8_t be[4];
+	uint32_t i, j, blocks;
+	uint64_t c;
+
+	/* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */
+
+	/* hmac(password, ...): keyed once, then copied so the key schedule is not redone per iteration */
+	scrypt_hmac_init(&hmac_pw, password, password_len);
+
+	/* hmac(password, salt...) */
+	hmac_pw_salt = hmac_pw;
+	scrypt_hmac_update(&hmac_pw_salt, salt, salt_len);
+
+	blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE;
+	for (i = 1; i <= blocks; i++) {
+		/* U1 = hmac(password, salt || be(i)) */
+		U32TO8_BE(be, i);
+		work = hmac_pw_salt;
+		scrypt_hmac_update(&work, be, 4);
+		scrypt_hmac_finish(&work, ti);
+		memcpy(u, ti, sizeof(u));
+
+		/* T[i] = U1 ^ U2 ^ U3... */
+		for (c = 0; c + 1 < N; c++) { /* not "c < N - 1": that wraps to 2^64 - 1 iterations when N == 0 */
+			/* UX = hmac(password, U{X-1}) */
+			work = hmac_pw;
+			scrypt_hmac_update(&work, u, SCRYPT_HASH_DIGEST_SIZE);
+			scrypt_hmac_finish(&work, u);
+
+			/* T[i] ^= UX */
+			for (j = 0; j < sizeof(u); j++)
+				ti[j] ^= u[j];
+		}
+
+		memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : bytes);
+		out += SCRYPT_HASH_DIGEST_SIZE;
+		bytes -= SCRYPT_HASH_DIGEST_SIZE; /* may wrap after a final partial block; the loop has ended by then and bytes is not read again */
+	}
+
+	scrypt_ensure_zero(ti, sizeof(ti));
+	scrypt_ensure_zero(u, sizeof(u));
+	scrypt_ensure_zero(&hmac_pw, sizeof(hmac_pw));
+	scrypt_ensure_zero(&hmac_pw_salt, sizeof(hmac_pw_salt));
+}
--- a/algo/argon2/ar2/sj/scrypt-jane-portable-x86.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-portable-x86.h
@@ -0,0 +1,463 @@
+#if defined(CPU_X86) && (defined(COMPILER_MSVC) || defined(COMPILER_GCC))
+	#define X86ASM
+
+	/* gcc 2.95 royally screws up stack alignments on variables */
+	#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS6PP)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 30000)))
+		#define X86ASM_SSE
+		#define X86ASM_SSE2
+	#endif
+	#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2005)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40102)))
+		#define X86ASM_SSSE3
+	#endif
+	#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2010SP1)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40400)))
+		#define X86ASM_AVX
+		#define X86ASM_XOP
+	#endif
+	#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2012)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40700)))
+		#define X86ASM_AVX2
+	#endif
+#endif
+
+#if defined(CPU_X86_64) && defined(COMPILER_GCC)
+	#define X86_64ASM
+	#define X86_64ASM_SSE2
+	#if (COMPILER_GCC >= 40102)
+		#define X86_64ASM_SSSE3
+	#endif
+	#if (COMPILER_GCC >= 40400)
+		#define X86_64ASM_AVX
+		#define X86_64ASM_XOP
+	#endif
+	#if (COMPILER_GCC >= 40700)
+		#define X86_64ASM_AVX2
+	#endif
+#endif
+
+#if defined(COMPILER_MSVC) && (defined(CPU_X86_FORCE_INTRINSICS) || defined(CPU_X86_64))
+	#define X86_INTRINSIC
+	#if defined(CPU_X86_64) || defined(X86ASM_SSE)
+		#define X86_INTRINSIC_SSE
+	#endif
+	#if defined(CPU_X86_64) || defined(X86ASM_SSE2)
+		#define X86_INTRINSIC_SSE2
+	#endif
+	#if (COMPILER_MSVC >= COMPILER_MSVC_VS2005)
+		#define X86_INTRINSIC_SSSE3
+	#endif
+	#if (COMPILER_MSVC >= COMPILER_MSVC_VS2010SP1)
+		#define X86_INTRINSIC_AVX
+		#define X86_INTRINSIC_XOP
+	#endif
+	#if (COMPILER_MSVC >= COMPILER_MSVC_VS2012)
+		#define X86_INTRINSIC_AVX2
+	#endif
+#endif
+
+#if defined(COMPILER_GCC) && defined(CPU_X86_FORCE_INTRINSICS)
+	#define X86_INTRINSIC
+	#if defined(__SSE__)
+		#define X86_INTRINSIC_SSE
+	#endif
+	#if defined(__SSE2__)
+		#define X86_INTRINSIC_SSE2
+	#endif
+	#if defined(__SSSE3__)
+		#define X86_INTRINSIC_SSSE3
+	#endif
+	#if defined(__AVX__)
+		#define X86_INTRINSIC_AVX
+	#endif
+	#if defined(__XOP__)
+		#define X86_INTRINSIC_XOP
+	#endif
+	#if defined(__AVX2__)
+		#define X86_INTRINSIC_AVX2
+	#endif
+#endif
+
+/* only use simd on windows (or SSE2 on gcc)! */
+#if defined(CPU_X86_FORCE_INTRINSICS) || defined(X86_INTRINSIC)
+	#if defined(X86_INTRINSIC_SSE)
+		#include <mmintrin.h>
+		#include <xmmintrin.h>
+		typedef __m64 qmm;
+		typedef __m128 xmm;
+		typedef __m128d xmmd;
+	#endif
+	#if defined(X86_INTRINSIC_SSE2)
+		#include <emmintrin.h>
+		typedef __m128i xmmi;
+	#endif
+	#if defined(X86_INTRINSIC_SSSE3)
+		#include <tmmintrin.h>
+	#endif
+	#if defined(X86_INTRINSIC_AVX)
+		#include <immintrin.h>
+	#endif
+	#if defined(X86_INTRINSIC_XOP)
+		#if defined(COMPILER_MSVC)
+			#include <intrin.h>
+		#else
+			#include <x86intrin.h>
+		#endif
+	#endif
+	#if defined(X86_INTRINSIC_AVX2)
+		typedef __m256i ymmi;
+	#endif
+#endif
+
+#if defined(X86_INTRINSIC_SSE2) /* 16-byte unions: an element view (u) overlaid with an SSE2 vector view (v) */
+	typedef union packedelem8_t {
+		uint8_t u[16];
+		xmmi v;
+	} packedelem8;
+
+	typedef union packedelem32_t {
+		uint32_t u[4];
+		xmmi v;
+	} packedelem32;
+
+	typedef union packedelem64_t {
+		uint64_t u[2];
+		xmmi v;
+	} packedelem64;
+#else /* no SSE2 intrinsics: same type names and the same u member, with a plain integer/byte array in place of the vector member */
+	typedef union packedelem8_t {
+		uint8_t u[16];
+		uint32_t dw[4];
+	} packedelem8;
+
+	typedef union packedelem32_t {
+		uint32_t u[4];
+		uint8_t b[16];
+	} packedelem32;
+
+	typedef union packedelem64_t {
+		uint64_t u[2];
+		uint8_t b[16];
+	} packedelem64;
+#endif
+
+#if defined(X86_INTRINSIC_SSSE3) /* byte-index tables; NOTE(review): names suggest byte-shuffle masks rotating each 32-bit lane left by 16 / 8 bits -- confirm at the use sites */
+	static const packedelem8 ALIGN(16) ssse3_rotl16_32bit      = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}};
+	static const packedelem8 ALIGN(16) ssse3_rotl8_32bit       = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}};
+#endif
+
+/*
+	x86 inline asm for gcc/msvc. usage:
+
+	asm_naked_fn_proto(return_type, name) (type parm1, type parm2..)
+	asm_naked_fn(name)
+		a1(..)
+		a2(.., ..)
+		a3(.., .., ..)
+		64bit OR 0 parameters: a1(ret)
+		32bit AND n parameters: aret(4n), eg aret(16) for 4 parameters
+	asm_naked_fn_end(name)
+*/
+
+#if defined(X86ASM) || defined(X86_64ASM)
+
+#if defined(COMPILER_MSVC)
+	#pragma warning(disable : 4731) /* frame pointer modified by inline assembly */
+	#define a1(x) __asm {x}
+	#define a2(x, y) __asm {x, y}
+	#define a3(x, y, z) __asm {x, y, z}
+	#define a4(x, y, z, w) __asm {x, y, z, w}
+	#define aj(x) __asm {x}
+	#define asm_align8 a1(ALIGN 8)
+	#define asm_align16 a1(ALIGN 16)
+
+	#define asm_calling_convention STDCALL
+	#define aret(n) a1(ret n)
+	#define asm_naked_fn_proto(type, fn) static NAKED type asm_calling_convention fn
+	#define asm_naked_fn(fn) {
+	#define asm_naked_fn_end(fn) }
+#elif defined(COMPILER_GCC)
+	#define GNU_AS1(x) #x ";\n"
+	#define GNU_AS2(x, y) #x ", " #y ";\n"
+	#define GNU_AS3(x, y, z) #x ", " #y ", " #z ";\n"
+	#define GNU_AS4(x, y, z, w) #x ", " #y ", " #z ", " #w ";\n"
+	#define GNU_ASFN(x) "\n_" #x ":\n" #x ":\n"
+	#define GNU_ASJ(x) ".att_syntax prefix\n" #x "\n.intel_syntax noprefix\n"
+
+	#define a1(x) GNU_AS1(x)
+	#define a2(x, y) GNU_AS2(x, y)
+	#define a3(x, y, z) GNU_AS3(x, y, z)
+	#define a4(x, y, z, w) GNU_AS4(x, y, z, w)
+	#define aj(x) GNU_ASJ(x)
+	#define asm_align8 ".p2align 3,,7"
+	#define asm_align16 ".p2align 4,,15"
+
+	#if defined(OS_WINDOWS)
+		#define asm_calling_convention CDECL
+		#define aret(n) a1(ret)
+
+		#if defined(X86_64ASM)
+			#define asm_naked_fn(fn) ; __asm__ ( \
+				".text\n"                        \
+				asm_align16 GNU_ASFN(fn)         \
+				"subq $136, %rsp;"               \
+			 	"movdqa %xmm6, 0(%rsp);"         \
+				"movdqa %xmm7, 16(%rsp);"        \
+			 	"movdqa %xmm8, 32(%rsp);"        \
+				"movdqa %xmm9, 48(%rsp);"        \
+			 	"movdqa %xmm10, 64(%rsp);"       \
+				"movdqa %xmm11, 80(%rsp);"       \
+				"movdqa %xmm12, 96(%rsp);"       \
+				"movq %rdi, 112(%rsp);"          \
+				"movq %rsi, 120(%rsp);"          \
+				"movq %rcx, %rdi;"               \
+				"movq %rdx, %rsi;"               \
+				"movq %r8, %rdx;"                \
+				"movq %r9, %rcx;"                \
+				"call 1f;"                       \
+				"movdqa 0(%rsp), %xmm6;"         \
+				"movdqa 16(%rsp), %xmm7;"        \
+				"movdqa 32(%rsp), %xmm8;"        \
+				"movdqa 48(%rsp), %xmm9;"        \
+				"movdqa 64(%rsp), %xmm10;"       \
+				"movdqa 80(%rsp), %xmm11;"       \
+				"movdqa 96(%rsp), %xmm12;"       \
+				"movq 112(%rsp), %rdi;"          \
+				"movq 120(%rsp), %rsi;"          \
+				"addq $136, %rsp;"               \
+				"ret;"                           \
+				".intel_syntax noprefix;"        \
+				".p2align 4,,15;"                \
+				"1:;"
+		#else
+			#define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn)
+		#endif
+	#else
+		#define asm_calling_convention STDCALL
+		#define aret(n) a1(ret n)
+		#define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn)
+	#endif
+
+	#define asm_naked_fn_proto(type, fn) extern type asm_calling_convention fn
+	#define asm_naked_fn_end(fn) ".att_syntax prefix;\n" );
+
+	#define asm_gcc() __asm__ __volatile__(".intel_syntax noprefix;\n"
+	#define asm_gcc_parms() ".att_syntax prefix;"
+	#define asm_gcc_trashed() __asm__ __volatile__("" :::
+	#define asm_gcc_end() );
+#else
+	need x86 asm
+#endif
+
+#endif /* X86ASM || X86_64ASM */
+
+
+#if defined(CPU_X86) || defined(CPU_X86_64)
+
+typedef enum cpu_flags_x86_t { /* one bit per instruction-set extension; detect_cpu() ORs these into its size_t result */
+	cpu_mmx = 1 << 0,
+	cpu_sse = 1 << 1,
+	cpu_sse2 = 1 << 2,
+	cpu_sse3 = 1 << 3,
+	cpu_ssse3 = 1 << 4,
+	cpu_sse4_1 = 1 << 5,
+	cpu_sse4_2 = 1 << 6,
+	cpu_avx = 1 << 7,
+	cpu_xop = 1 << 8, /* only probed by detect_cpu() after cpu_avx has been set */
+	cpu_avx2 = 1 << 9 /* only probed by detect_cpu() after cpu_avx has been set */
+} cpu_flags_x86;
+
+typedef enum cpu_vendors_x86_t { /* in this header only referenced by the disabled (#if 0) vendor-string check in detect_cpu() */
+	cpu_nobody,
+	cpu_intel,
+	cpu_amd
+} cpu_vendors_x86;
+
+typedef struct x86_regs_t { /* CPUID result registers; get_cpuid() stores them at offsets 0, 4, 8, 12 in exactly this order */
+	uint32_t eax, ebx, ecx, edx;
+} x86_regs;
+
+#if defined(X86ASM)
+asm_naked_fn_proto(int, has_cpuid)(void) /* returns 1 when bit 21 of EFLAGS can be toggled, 0 otherwise; detect_cpu() treats 0 as "no CPUID" */
+asm_naked_fn(has_cpuid)
+	a1(pushfd)
+	a1(pop eax) /* eax = current EFLAGS */
+	a2(mov ecx, eax) /* ecx keeps the original value for the compare and the final restore */
+	a2(xor eax, 0x200000) /* flip bit 21 */
+	a1(push eax)
+	a1(popfd) /* try to write the modified flags back */
+	a1(pushfd)
+	a1(pop eax) /* re-read EFLAGS */
+	a2(xor eax, ecx) /* bits that actually changed */
+	a2(shr eax, 21)
+	a2(and eax, 1) /* eax = 1 if bit 21 kept the new value */
+	a1(push ecx)
+	a1(popfd) /* restore the caller's EFLAGS */
+	a1(ret)
+asm_naked_fn_end(has_cpuid)
+#endif /* X86ASM */
+
+
+static void NOINLINE /* runs CPUID for leaf `flags` and stores eax, ebx, ecx, edx into *regs; the GCC path also zeroes ecx (sub-leaf 0) first */
+get_cpuid(x86_regs *regs, uint32_t flags) {
+#if defined(COMPILER_MSVC)
+	__cpuid((int *)regs, (int)flags);
+#else
+	#if defined(CPU_X86_64)
+		#define cpuid_bx rbx
+	#else
+		#define cpuid_bx ebx
+	#endif
+
+	asm_gcc()
+		a1(push cpuid_bx) /* (e/r)bx is saved by hand instead of being listed as a clobber */
+		a2(xor ecx, ecx)
+		a1(cpuid)
+		a2(mov [%1 + 0], eax)
+		a2(mov [%1 + 4], ebx)
+		a2(mov [%1 + 8], ecx)
+		a2(mov [%1 + 12], edx)
+		a1(pop cpuid_bx)
+		asm_gcc_parms() : "+a"(flags) : "S"(regs)  : "%ecx", "%edx", "cc", "memory" /* "memory": the asm stores through regs, which is only an input operand */
+	asm_gcc_end()
+#endif
+}
+
+#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
+static uint64_t NOINLINE /* executes XGETBV with ecx = flags and returns edx:eax combined into one 64-bit value */
+get_xgetbv(uint32_t flags) {
+#if defined(COMPILER_MSVC)
+	return _xgetbv(flags);
+#else
+	uint32_t lo, hi;
+	asm_gcc()
+		a1(xgetbv)
+		asm_gcc_parms() : "+c"(flags), "=a" (lo), "=d" (hi) /* ecx carries the argument in; eax and edx carry the result out */
+	asm_gcc_end()
+	return ((uint64_t)lo | ((uint64_t)hi << 32));
+#endif
+}
+#endif // AVX support
+
+#if defined(SCRYPT_TEST_SPEED)
+size_t cpu_detect_mask = (size_t)-1; /* ANDed into detect_cpu()'s result; all bits set by default, so nothing is masked out */
+#endif
+
+static size_t /* returns the OR of cpu_flags_x86 bits for every extension CPUID reports (AVX additionally requires OS support via XGETBV) */
+detect_cpu(void) {
+	//union { uint8_t s[12]; uint32_t i[3]; } vendor_string;
+	//cpu_vendors_x86 vendor = cpu_nobody;
+	x86_regs regs;
+	uint32_t max_level, max_ext_level;
+	size_t cpu_flags = 0;
+#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
+	uint64_t xgetbv_flags;
+#endif
+
+#if defined(CPU_X86)
+	if (!has_cpuid())
+		return cpu_flags;
+#endif
+
+	get_cpuid(&regs, 0);
+	max_level = regs.eax;
+#if 0
+	vendor_string.i[0] = regs.ebx;
+	vendor_string.i[1] = regs.edx;
+	vendor_string.i[2] = regs.ecx;
+
+	if (scrypt_verify(vendor_string.s, (const uint8_t *)"GenuineIntel", 12))
+		vendor = cpu_intel;
+	else if (scrypt_verify(vendor_string.s, (const uint8_t *)"AuthenticAMD", 12))
+		vendor = cpu_amd;
+#endif
+	if ((max_level & 0xffffff00) == 0x00000500) {
+		/* "Intel P5 pre-B0": leaf 0 returns 0x000005xx in eax instead of a maximum leaf number */
+		cpu_flags |= cpu_mmx;
+		return cpu_flags;
+	}
+
+	if (max_level < 1)
+		return cpu_flags;
+
+	get_cpuid(&regs, 1);
+#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
+	/* xsave/xrestore: bit 27 (OSXSAVE) makes XGETBV usable; AVX needs CPU bit 28 plus XCR0 bits 1 and 2 both enabled by the OS */
+	if (regs.ecx & (1 << 27)) {
+		xgetbv_flags = get_xgetbv(0);
+		if ((regs.ecx & (1 << 28)) && ((xgetbv_flags & 0x6) == 0x6)) cpu_flags |= cpu_avx;
+	}
+#endif
+	if (regs.ecx & (1 << 20)) cpu_flags |= cpu_sse4_2;
+	if (regs.ecx & (1 << 19)) cpu_flags |= cpu_sse4_1; /* bit 19 is SSE4.1, bit 20 above is SSE4.2 */
+	if (regs.ecx & (1 <<  9)) cpu_flags |= cpu_ssse3;
+	if (regs.ecx & (1      )) cpu_flags |= cpu_sse3;
+	if (regs.edx & (1 << 26)) cpu_flags |= cpu_sse2;
+	if (regs.edx & (1 << 25)) cpu_flags |= cpu_sse;
+	if (regs.edx & (1 << 23)) cpu_flags |= cpu_mmx;
+
+	if (cpu_flags & cpu_avx) { /* AVX2 and XOP are only reported on top of a usable AVX */
+		if (max_level >= 7) {
+			get_cpuid(&regs, 7);
+			if (regs.ebx & (1 << 5)) cpu_flags |= cpu_avx2;
+		}
+
+		get_cpuid(&regs, 0x80000000);
+		max_ext_level = regs.eax;
+		if (max_ext_level >= 0x80000001) {
+			get_cpuid(&regs, 0x80000001);
+			if (regs.ecx & (1 << 11)) cpu_flags |= cpu_xop;
+		}
+	}
+
+
+#if defined(SCRYPT_TEST_SPEED)
+	cpu_flags &= cpu_detect_mask;
+#endif
+
+	return cpu_flags;
+}
+
+#if defined(SCRYPT_TEST_SPEED)
+/* name of the highest-ranked feature bit set in flag, or "Basic" when none of the listed bits is set */
+static const char *get_top_cpuflag_desc(size_t flag) {
+	static const struct { size_t bit; const char *desc; } ranked[] = {
+		{cpu_avx2, "AVX2"}, {cpu_xop, "XOP"}, {cpu_avx, "AVX"},
+		{cpu_sse4_2, "SSE4.2"}, {cpu_sse4_1, "SSE4.1"}, {cpu_ssse3, "SSSE3"},
+		{cpu_sse2, "SSE2"}, {cpu_sse, "SSE"}, {cpu_mmx, "MMX"}
+	};
+	size_t k;
+	for (k = 0; k < sizeof(ranked) / sizeof(ranked[0]); k++) {
+		if (flag & ranked[k].bit) return ranked[k].desc;
+	}
+	return "Basic";
+}
+#endif
+
+/* enable the highest system-wide option */
+#if defined(SCRYPT_CHOOSE_COMPILETIME)
+	#if !defined(__AVX2__)
+		#undef X86_64ASM_AVX2
+		#undef X86ASM_AVX2
+		#undef X86_INTRINSIC_AVX2
+	#endif
+	#if !defined(__XOP__)
+		#undef X86_64ASM_XOP
+		#undef X86ASM_XOP
+		#undef X86_INTRINSIC_XOP
+	#endif
+	#if !defined(__AVX__)
+		#undef X86_64ASM_AVX
+		#undef X86ASM_AVX
+		#undef X86_INTRINSIC_AVX
+	#endif
+	#if !defined(__SSSE3__)
+		#undef X86_64ASM_SSSE3
+		#undef X86ASM_SSSE3
+		#undef X86_INTRINSIC_SSSE3
+	#endif
+	#if !defined(__SSE2__)
+		#undef X86_64ASM_SSE2
+		#undef X86ASM_SSE2
+		#undef X86_INTRINSIC_SSE2
+	#endif
+#endif
+
+#endif /* defined(CPU_X86) || defined(CPU_X86_64) */
--- a/algo/argon2/ar2/sj/scrypt-jane-portable.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-portable.h
@@ -0,0 +1,310 @@
+/* determine os */
+#if defined(_WIN32)	|| defined(_WIN64) || defined(__TOS_WIN__) || defined(__WINDOWS__)
+	#include <windows.h>
+	#include <wincrypt.h>
+	#define OS_WINDOWS
+#elif defined(sun) || defined(__sun) || defined(__SVR4) || defined(__svr4__)
+	#include <sys/mman.h>
+	#include <sys/time.h>
+	#include <fcntl.h>
+
+	#define OS_SOLARIS
+#else
+	#include <sys/mman.h>
+	#include <sys/time.h>
+	#include <sys/param.h> /* need this to define BSD */
+	#include <unistd.h>
+	#include <fcntl.h>
+
+	#define OS_NIX
+	#if defined(__linux__)
+		#include <endian.h>
+		#define OS_LINUX
+	#elif defined(BSD)
+		#define OS_BSD
+
+		#if defined(MACOS_X) || (defined(__APPLE__) && defined(__MACH__)) /* refine OS_BSD: OS X needs both Apple macros (logical AND) */
+			#define OS_OSX
+		#elif defined(macintosh) || defined(Macintosh)
+			#define OS_MAC
+		#elif defined(__OpenBSD__)
+			#define OS_OPENBSD
+		#endif
+	#endif
+#endif
+
+
+/* determine compiler */
+#if defined(_MSC_VER)
+	#define COMPILER_MSVC_VS6       120000000
+	#define COMPILER_MSVC_VS6PP     121000000
+	#define COMPILER_MSVC_VS2002    130000000
+	#define COMPILER_MSVC_VS2003    131000000
+	#define COMPILER_MSVC_VS2005    140050727
+	#define COMPILER_MSVC_VS2008    150000000
+	#define COMPILER_MSVC_VS2008SP1 150030729
+	#define COMPILER_MSVC_VS2010    160000000
+	#define COMPILER_MSVC_VS2010SP1 160040219
+	#define COMPILER_MSVC_VS2012RC  170000000
+	#define COMPILER_MSVC_VS2012    170050727
+
+	#if _MSC_FULL_VER > 100000000
+		#define COMPILER_MSVC (_MSC_FULL_VER)
+	#else
+		#define COMPILER_MSVC (_MSC_FULL_VER * 10)
+	#endif
+
+	#if ((_MSC_VER == 1200) && defined(_mm_free))
+		#undef COMPILER_MSVC
+		#define COMPILER_MSVC COMPILER_MSVC_VS6PP
+	#endif
+
+	#pragma warning(disable : 4127) /* conditional expression is constant */
+	#pragma warning(disable : 4100) /* unreferenced formal parameter */
+
+	#ifndef _CRT_SECURE_NO_WARNINGS
+	#define _CRT_SECURE_NO_WARNINGS
+	#endif
+
+	#include <float.h>
+	#include <stdlib.h> /* _rotl */
+	#include <intrin.h>
+
+	typedef unsigned char uint8_t;
+	typedef unsigned short uint16_t;
+	typedef unsigned int uint32_t;
+	typedef signed int int32_t;
+	typedef unsigned __int64 uint64_t;
+	typedef signed __int64 int64_t;
+
+	#define ROTL32(a,b) _rotl(a,b)
+	#define ROTR32(a,b) _rotr(a,b)
+	#define ROTL64(a,b) _rotl64(a,b)
+	#define ROTR64(a,b) _rotr64(a,b)
+	#undef NOINLINE
+	#define NOINLINE __declspec(noinline)
+	#undef NORETURN
+	#define NORETURN
+	#undef INLINE
+	#define INLINE __forceinline
+	#undef FASTCALL
+	#define FASTCALL __fastcall
+	#undef CDECL
+	#define CDECL __cdecl
+	#undef STDCALL
+	#define STDCALL __stdcall
+	#undef NAKED
+	#define NAKED __declspec(naked)
+	#define ALIGN(n) __declspec(align(n))
+#endif
+#if defined(__ICC)
+	#define COMPILER_INTEL
+#endif
+#if defined(__GNUC__)
+	#if (__GNUC__ >= 3)
+		#define COMPILER_GCC_PATCHLEVEL __GNUC_PATCHLEVEL__
+	#else
+		#define COMPILER_GCC_PATCHLEVEL 0
+	#endif
+	#define COMPILER_GCC (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + COMPILER_GCC_PATCHLEVEL)
+	#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) /* b must be 1..31 (0 would shift by the full width); b is parenthesized so expression arguments work */
+	#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - (b)))) /* a must be an unsigned 32-bit value */
+	#define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - (b)))) /* b must be 1..63 */
+	#define ROTR64(a,b) (((a) >> (b)) | ((a) << (64 - (b)))) /* a must be an unsigned 64-bit value */
+	#undef NOINLINE
+	#if (COMPILER_GCC >= 30000)
+		#define NOINLINE __attribute__((noinline))
+	#else
+		#define NOINLINE
+	#endif
+	#undef NORETURN
+	#if (COMPILER_GCC >= 30000)
+		#define NORETURN __attribute__((noreturn))
+	#else
+		#define NORETURN
+	#endif
+	#undef INLINE
+	#if (COMPILER_GCC >= 30000)
+		#define INLINE __attribute__((always_inline))
+	#else
+		#define INLINE inline
+	#endif
+	#undef FASTCALL
+	#if (COMPILER_GCC >= 30400)
+		#define FASTCALL __attribute__((fastcall))
+	#else
+		#define FASTCALL
+	#endif
+	#undef CDECL
+	#define CDECL __attribute__((cdecl))
+	#undef STDCALL
+	#define STDCALL __attribute__((stdcall))
+	#define ALIGN(n) __attribute__((aligned(n)))
+	#include <stdint.h>
+#endif
+#if defined(__MINGW32__) || defined(__MINGW64__)
+	#define COMPILER_MINGW
+#endif
+#if defined(__PATHCC__)
+	#define COMPILER_PATHCC
+#endif
+
+#define OPTIONAL_INLINE
+#if defined(OPTIONAL_INLINE)
+	#undef OPTIONAL_INLINE
+	#define OPTIONAL_INLINE INLINE
+#else
+	#define OPTIONAL_INLINE
+#endif
+
+#define CRYPTO_FN NOINLINE STDCALL
+
+/* determine cpu */
+#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__ ) || defined(_M_X64)
+	#define CPU_X86_64
+#elif defined(__i586__) || defined(__i686__) || (defined(_M_IX86) && (_M_IX86 >= 500))
+	#define CPU_X86 500
+#elif defined(__i486__) || (defined(_M_IX86) && (_M_IX86 >= 400))
+	#define CPU_X86 400
+#elif defined(__i386__) || (defined(_M_IX86) && (_M_IX86 >= 300)) || defined(__X86__) || defined(_X86_) || defined(__I86__)
+	#define CPU_X86 300
+#elif defined(__ia64__) || defined(_IA64) || defined(__IA64__) || defined(_M_IA64) || defined(__ia64)
+	#define CPU_IA64
+#endif
+
+#if defined(__sparc__) || defined(__sparc) || defined(__sparcv9)
+	#define CPU_SPARC
+	#if defined(__sparcv9)
+		#define CPU_SPARC64
+	#endif
+#endif
+
+#if defined(CPU_X86_64) || defined(CPU_IA64) || defined(CPU_SPARC64) || defined(__64BIT__) || defined(__LP64__) || defined(_LP64) || (defined(_MIPS_SZLONG) && (_MIPS_SZLONG == 64))
+	#define CPU_64BITS
+	#undef FASTCALL
+	#define FASTCALL
+	#undef CDECL
+	#define CDECL
+	#undef STDCALL
+	#define STDCALL
+#endif
+
+#if defined(powerpc) || defined(__PPC__) || defined(__ppc__) || defined(_ARCH_PPC) || defined(__powerpc__) || defined(__powerpc) || defined(POWERPC) || defined(_M_PPC)
+	#define CPU_PPC
+	#if defined(_ARCH_PWR7)
+		#define CPU_POWER7
+	#elif defined(__64BIT__)
+		#define CPU_PPC64
+	#else
+		#define CPU_PPC32
+	#endif
+#endif
+
+#if defined(__hppa__) || defined(__hppa)
+	#define CPU_HPPA
+#endif
+
+#if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA)
+	#define CPU_ALPHA
+#endif
+
+/* endian */
+
+#if ((defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && (__BYTE_ORDER == __LITTLE_ENDIAN)) || \
+	 (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN)) || \
+	 (defined(CPU_X86) || defined(CPU_X86_64)) || \
+	 (defined(vax) || defined(MIPSEL) || defined(_MIPSEL)))
+#define CPU_LE
+#elif ((defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && (__BYTE_ORDER == __BIG_ENDIAN)) || \
+	   (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)) || \
+	   (defined(CPU_SPARC) || defined(CPU_PPC) || defined(mc68000) || defined(sel)) || defined(_MIPSEB))
+#define CPU_BE
+#else
+	/* unknown endian! */
+#endif
+
+
+#define U8TO32_BE(p)                                            \
+	(((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) |  \
+	 ((uint32_t)((p)[2]) <<  8) | ((uint32_t)((p)[3])      ))
+
+#define U8TO32_LE(p)                                            \
+	(((uint32_t)((p)[0])      ) | ((uint32_t)((p)[1]) <<  8) |  \
+	 ((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24))
+
+#define U32TO8_BE(p, v)                                           \
+	(p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \
+	(p)[2] = (uint8_t)((v) >>  8); (p)[3] = (uint8_t)((v)      );
+
+#define U32TO8_LE(p, v)                                           \
+	(p)[0] = (uint8_t)((v)      ); (p)[1] = (uint8_t)((v) >>  8); \
+	(p)[2] = (uint8_t)((v) >> 16); (p)[3] = (uint8_t)((v) >> 24);
+
+#define U8TO64_BE(p)                                                  \
+	(((uint64_t)U8TO32_BE(p) << 32) | (uint64_t)U8TO32_BE((p) + 4))
+
+#define U8TO64_LE(p)                                                  \
+	(((uint64_t)U8TO32_LE(p)) | ((uint64_t)U8TO32_LE((p) + 4) << 32))
+
+#define U64TO8_BE(p, v)                        \
+	U32TO8_BE((p),     (uint32_t)((v) >> 32)); \
+	U32TO8_BE((p) + 4, (uint32_t)((v)      ));
+
+#define U64TO8_LE(p, v)                        \
+	U32TO8_LE((p),     (uint32_t)((v)      )); \
+	U32TO8_LE((p) + 4, (uint32_t)((v) >> 32));
+
+#define U32_SWAP(v) {                                             \
+	(v) = (((v) << 8) & 0xFF00FF00 ) | (((v) >> 8) & 0xFF00FF );  \
+    (v) = ((v) << 16) | ((v) >> 16);                              \
+}
+
+#define U64_SWAP(v) {                                                                       \
+	(v) = (((v) <<  8) & 0xFF00FF00FF00FF00ull ) | (((v) >>  8) & 0x00FF00FF00FF00FFull );  \
+	(v) = (((v) << 16) & 0xFFFF0000FFFF0000ull ) | (((v) >> 16) & 0x0000FFFF0000FFFFull );  \
+    (v) = ((v) << 32) | ((v) >> 32);                                                        \
+}
+
+/* equality check without an early exit: returns 1 when x and y agree on all len bytes (or len is 0), else 0 */
+static int scrypt_verify(const uint8_t *x, const uint8_t *y, size_t len) {
+	uint32_t diff = 0;
+	size_t k;
+	for (k = 0; k < len; k++) diff |= (uint32_t)(x[k] ^ y[k]);
+	return (int)(((diff - 1) >> 8) & 1); /* diff is 0..255; only 0 - 1 wraps and leaves bit 8 set */
+}
+
+/*
+	Zero-fills len bytes at p; used to wipe key material and hash state.
+
+	The GCC x86 path is a volatile asm with a "memory" clobber and the
+	portable path stores through a volatile pointer, so the compiler
+	cannot drop the stores as dead even when the buffer is never read again.
+*/
+static void
+scrypt_ensure_zero(void *p, size_t len) {
+#if ((defined(CPU_X86) || defined(CPU_X86_64)) && defined(COMPILER_MSVC))
+		__stosb((unsigned char *)p, 0, len);
+#elif ((defined(CPU_X86) || defined(CPU_X86_64)) && defined(COMPILER_GCC))
+	/*
+		rep stosb advances (e/r)di and counts (e/r)cx down to zero, so both
+		are declared read-write ("+D", "+c") and the compiler accounts for
+		the change itself. No push/pop around the instruction: using the
+		stack inside inline asm would overwrite the x86-64 red zone.
+	*/
+	__asm__ __volatile__(
+		"rep stosb;\n"
+		: "+D"(p), "+c"(len) : "a"(0) : "cc", "memory"
+	);
+#else
+	volatile uint8_t *b = (volatile uint8_t *)p;
+	size_t i;
+	for (i = 0; i < len; i++)
+		b[i] = 0;
+#endif
+}
+
+#include "scrypt-jane-portable-x86.h"
+
+#if !defined(asm_calling_convention)
+#define asm_calling_convention
+#endif
--- a/algo/argon2/ar2/sj/scrypt-jane-romix-basic.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-romix-basic.h
@@ -0,0 +1,74 @@
+#if !defined(SCRYPT_CHOOSE_COMPILETIME)
+/* function type returned by scrypt_getROMix, used with cpu detection; only declared when the implementation is not fixed at compile time */
+typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r);
+#endif
+
+/* romix pre/post nop function: same parameter list as scrypt_romix_convert_endian, leaves the blocks untouched */
+static void asm_calling_convention
+scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) {
+	(void)blocks; (void)nblocks;
+}
+
+/* romix pre/post endian conversion: byte-swaps every word of the blocks when the build is not known little-endian and the runtime probe reads big-endian */
+static void asm_calling_convention
+scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) {
+#if defined(CPU_LE)
+	(void)blocks; (void)nblocks;
+#else
+	static const union { uint8_t b[2]; uint16_t w; } probe = {{1,0}};
+	size_t w, total;
+	if (probe.w != 0x100)
+		return;
+	total = nblocks * SCRYPT_BLOCK_WORDS;
+	for (w = 0; w < total; w++) {
+		SCRYPT_WORD_ENDIAN_SWAP(blocks[w]);
+	}
+#endif
+}
+
+/* function-pointer types taken by scrypt_test_mix_instance: a chunkmix implementation and a block pre/post fix-up */
+typedef void (asm_calling_convention *chunkmixfn)(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r);
+typedef void (asm_calling_convention *blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks);
+
+static int /* runs mixfn once over a fixed test pattern and compares the last 16 output bytes with expected; returns scrypt_verify's result (1 on match) */
+scrypt_test_mix_instance(chunkmixfn mixfn, blockfixfn prefn, blockfixfn postfn, const uint8_t expected[16]) {
+	/* r = 2, (2 * r) = 4 blocks in a chunk, 4 * SCRYPT_BLOCK_WORDS total. NOTE(review): scrypt_ChunkMix_basic in this tree ignores r and writes 2 blocks, so with it the bytes compared below are never written -- confirm this self-test is still expected to pass */
+	const uint32_t r = 2, blocks = 2 * r, words = blocks * SCRYPT_BLOCK_WORDS;
+#if (defined(X86ASM_AVX2) || defined(X86_64ASM_AVX2) || defined(X86_INTRINSIC_AVX2))
+	scrypt_mix_word_t ALIGN(32) chunk[2][4 * SCRYPT_BLOCK_WORDS], v;
+#else
+	scrypt_mix_word_t ALIGN(16) chunk[2][4 * SCRYPT_BLOCK_WORDS], v;
+#endif
+	uint8_t final[16];
+	size_t i;
+
+	for (i = 0; i < words; i++) { /* input pattern: the word index, replicated across the word by two shift-and-OR steps */
+		v = (scrypt_mix_word_t)i;
+		v = (v << 8) | v;
+		v = (v << 16) | v;
+		chunk[0][i] = v;
+	}
+
+	prefn(chunk[0], blocks); /* chunk[0] is the input, chunk[1] receives the output; no xor chunk is supplied */
+	mixfn(chunk[1], chunk[0], NULL, r);
+	postfn(chunk[1], blocks);
+
+	/* grab the last 16 bytes of the final block, serialised little-endian word by word */
+	for (i = 0; i < 16; i += sizeof(scrypt_mix_word_t)) {
+		SCRYPT_WORDTO8_LE(final + i, chunk[1][words - (16 / sizeof(scrypt_mix_word_t)) + (i / sizeof(scrypt_mix_word_t))]);
+	}
+
+	return scrypt_verify(expected, final, 16);
+}
+
+/* address of item i in base, each item spanning len scrypt_mix_word_t's */
+static scrypt_mix_word_t *
+scrypt_item(scrypt_mix_word_t *base, scrypt_mix_word_t i, scrypt_mix_word_t len) {
+	return &base[i * len];
+}
+
+/* address of block i in base, each block spanning SCRYPT_BLOCK_WORDS words */
+static scrypt_mix_word_t *
+scrypt_block(scrypt_mix_word_t *base, scrypt_mix_word_t i) {
+	return &base[i * SCRYPT_BLOCK_WORDS];
+}
--- a/algo/argon2/ar2/sj/scrypt-jane-romix-template.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-romix-template.h
@@ -0,0 +1,122 @@
+#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX)
+
+#if defined(SCRYPT_CHOOSE_COMPILETIME)
+#undef SCRYPT_ROMIX_FN
+#define SCRYPT_ROMIX_FN scrypt_ROMix
+#endif
+
+#undef SCRYPT_HAVE_ROMIX
+#define SCRYPT_HAVE_ROMIX
+
+#if !defined(SCRYPT_CHUNKMIX_FN)
+
+#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_basic
+
+/*
+	Bout = ChunkMix(Bin), or ChunkMix(Bin ^ Bxor) when Bxor is not NULL
+
+	r is ignored: the chunk is fixed at 2 blocks (the generic 2*r is commented out below)
+*/
+static void asm_calling_convention
+SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r) {
+#if (defined(X86ASM_AVX2) || defined(X86_64ASM_AVX2) || defined(X86_INTRINSIC_AVX2))
+	scrypt_mix_word_t ALIGN(32) X[SCRYPT_BLOCK_WORDS], *block;
+#else
+	scrypt_mix_word_t ALIGN(16) X[SCRYPT_BLOCK_WORDS], *block;
+#endif
+	uint32_t i, j, blocksPerChunk = /*r * 2*/2, half = 0;
+
+	/* 1: X = B_{2r - 1}, i.e. the last block of the input chunk */
+	block = scrypt_block(Bin, blocksPerChunk - 1);
+	for (i = 0; i < SCRYPT_BLOCK_WORDS; i++)
+		X[i] = block[i];
+
+	if (Bxor) { /* the optional second input is xor-ed in wherever Bin is read */
+		block = scrypt_block(Bxor, blocksPerChunk - 1);
+		for (i = 0; i < SCRYPT_BLOCK_WORDS; i++)
+			X[i] ^= block[i];
+	}
+
+	/* 2: for i = 0 to 2r - 1 do */
+	for (i = 0; i < blocksPerChunk; i++, half ^= /*r*/1) {
+		/* 3: X = H(X ^ B_i) */
+		block = scrypt_block(Bin, i);
+		for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
+			X[j] ^= block[j];
+
+		if (Bxor) {
+			block = scrypt_block(Bxor, i);
+			for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
+				X[j] ^= block[j];
+		}
+		SCRYPT_MIX_FN(X);
+
+		/* 4: Y_i = X */
+		/* 6: B'[0..r-1] = Y_even */
+		/* 6: B'[r..2r-1] = Y_odd; half alternates 0, 1 so with 2 blocks the output index equals i */
+		block = scrypt_block(Bout, (i / 2) + half);
+		for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
+			block[j] = X[j];
+	}
+}
+#endif
+
+/*
+	X = ROMix(X)
+
+	X: chunk to mix, overwritten with the result
+	Y: scratch chunk
+	V: array of chunks to randomly index in to; 512 chunks are written, then read
+	N, r: ignored -- this build hard-codes N = 512 and r = 1 (see the commented-out originals)
+	chunk size: 2 blocks (chunkWords = 2 * SCRYPT_BLOCK_WORDS)
+*/
+
+static void NOINLINE FASTCALL
+SCRYPT_ROMIX_FN(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N, uint32_t r) {
+	uint32_t i, j, chunkWords = (uint32_t)(SCRYPT_BLOCK_WORDS * 2);
+	scrypt_mix_word_t *block = V;
+
+	SCRYPT_ROMIX_TANGLE_FN(X, 2);
+
+	/* 1: X = B */
+	/* implicit */
+
+	/* 2: for i = 0 to N - 1 do; V_0 = X, then each V_{i+1} is mixed directly from V_i inside V */
+	memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t));
+	for (i = 0; i < /*N - 1*/511; i++, block += chunkWords) {
+		/* 3: V_i = X */
+		/* 4: X = H(X) */
+		SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, /*r*/1);
+	}
+	SCRYPT_CHUNKMIX_FN(X, block, NULL, 1); /* block now points at the last chunk of V; its mix lands in X */
+
+	/* 6: for i = 0 to N - 1 do; two steps per pass so X and Y can swap roles without a copy */
+	for (i = 0; i < /*N*/512; i += 2) {
+		/* 7: j = Integerify(X) % N */
+		j = X[chunkWords - SCRYPT_BLOCK_WORDS] & /*(N - 1)*/511;
+
+		/* 8: Y = H(X ^ V_j) */
+		SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), 1);
+
+		/* 7: j = Integerify(Y) % N */
+		j = Y[chunkWords - SCRYPT_BLOCK_WORDS] & /*(N - 1)*/511;
+
+		/* 8: X = H(Y ^ V_j) */
+		SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), 1);
+	}
+
+	/* 10: B' = X */
+	/* implicit */
+
+	SCRYPT_ROMIX_UNTANGLE_FN(X, 2);
+}
+
+#endif /* !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) */
+
+
+#undef SCRYPT_CHUNKMIX_FN
+#undef SCRYPT_ROMIX_FN
+#undef SCRYPT_MIX_FN
+#undef SCRYPT_ROMIX_TANGLE_FN
+#undef SCRYPT_ROMIX_UNTANGLE_FN
+
--- a/algo/argon2/ar2/sj/scrypt-jane-romix.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-romix.h
@@ -0,0 +1,23 @@
+#if defined(SCRYPT_SALSA64) /* the only mix this header knows how to select */
+#include "scrypt-jane-salsa64.h"
+#else /* no mix selected: placeholder definitions, then the #error below stops the build */
+	#define SCRYPT_MIX_BASE "ERROR"
+	typedef uint32_t scrypt_mix_word_t;
+	#define SCRYPT_WORDTO8_LE U32TO8_LE
+	#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP
+	#define SCRYPT_BLOCK_BYTES 64
+	#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))
+	#if !defined(SCRYPT_CHOOSE_COMPILETIME) /* runtime dispatch: a getter that returns a do-nothing ROMix */
+		static void FASTCALL scrypt_ROMix_error(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r) {}
+		static scrypt_ROMixfn scrypt_getROMix(void) { return scrypt_ROMix_error; }
+	#else /* compile-time choice: the do-nothing ROMix under its fixed name */
+		static void FASTCALL scrypt_ROMix(scrypt_mix_word_t *X, scrypt_mix_word_t *Y, scrypt_mix_word_t *V, uint32_t N, uint32_t r) {}
+	#endif
+	static int scrypt_test_mix(void) { return 0; } /* stub: always returns 0 */
+	#error must define a mix function!
+#endif
+/* Without SCRYPT_CHOOSE_COMPILETIME, any earlier SCRYPT_MIX is discarded and replaced by the base mix name. */
+#if !defined(SCRYPT_CHOOSE_COMPILETIME)
+#undef SCRYPT_MIX
+#define SCRYPT_MIX SCRYPT_MIX_BASE
+#endif
--- a/algo/argon2/ar2/sj/scrypt-jane-salsa64.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-salsa64.h
@@ -0,0 +1,183 @@
+#define SCRYPT_MIX_BASE "Salsa64/8"
+/* a mix word is 64 bits, so one 128-byte block is 16 words */
+typedef uint64_t scrypt_mix_word_t;
+
+#define SCRYPT_WORDTO8_LE U64TO8_LE
+#define SCRYPT_WORD_ENDIAN_SWAP U64_SWAP
+
+#define SCRYPT_BLOCK_BYTES 128
+#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))
+
+/* must have these here in case block bytes is ever != 64 */
+#include "scrypt-jane-romix-basic.h"
+/* one header per instruction-set variant, plain one last; presumably each defines its SCRYPT_SALSA64_* macro only when it can be built (confirm) */
+#include "scrypt-jane-mix_salsa64-avx2.h"
+#include "scrypt-jane-mix_salsa64-xop.h"
+#include "scrypt-jane-mix_salsa64-avx.h"
+#include "scrypt-jane-mix_salsa64-ssse3.h"
+#include "scrypt-jane-mix_salsa64-sse2.h"
+#include "scrypt-jane-mix_salsa64.h"
+/* one ROMix template instantiation per enabled variant; all five use salsa64_core_tangle_sse2 for both tangle and untangle */
+#if defined(SCRYPT_SALSA64_AVX2)
+	#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx2
+	#define SCRYPT_ROMIX_FN scrypt_ROMix_avx2
+	#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
+	#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
+	#include "scrypt-jane-romix-template.h"
+#endif
+
+#if defined(SCRYPT_SALSA64_XOP)
+	#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_xop
+	#define SCRYPT_ROMIX_FN scrypt_ROMix_xop
+	#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
+	#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
+	#include "scrypt-jane-romix-template.h"
+#endif
+
+#if defined(SCRYPT_SALSA64_AVX)
+	#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx
+	#define SCRYPT_ROMIX_FN scrypt_ROMix_avx
+	#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
+	#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
+	#include "scrypt-jane-romix-template.h"
+#endif
+
+#if defined(SCRYPT_SALSA64_SSSE3)
+	#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_ssse3
+	#define SCRYPT_ROMIX_FN scrypt_ROMix_ssse3
+	#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
+	#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
+	#include "scrypt-jane-romix-template.h"
+#endif
+
+#if defined(SCRYPT_SALSA64_SSE2)
+	#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2
+	#define SCRYPT_ROMIX_FN scrypt_ROMix_sse2
+	#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
+	#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
+	#include "scrypt-jane-romix-template.h"
+#endif
+
+/* cpu agnostic: always instantiated; SCRYPT_CHUNKMIX_FN is not set here, so the template presumably supplies its own default (confirm) */
+#define SCRYPT_ROMIX_FN scrypt_ROMix_basic
+#define SCRYPT_MIX_FN salsa64_core_basic
+#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian
+#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian
+#include "scrypt-jane-romix-template.h"
+
+#if !defined(SCRYPT_CHOOSE_COMPILETIME)
+/* Returns the first compiled-in ROMix, in the order below, whose flag detect_cpu() reports; scrypt_ROMix_basic otherwise. */
+static scrypt_ROMixfn
+scrypt_getROMix(void) {
+	const size_t features = detect_cpu();
+
+#if defined(SCRYPT_SALSA64_AVX2)
+	if ((features & cpu_avx2) != 0) {
+		return scrypt_ROMix_avx2;
+	}
+#endif
+
+#if defined(SCRYPT_SALSA64_XOP)
+	if ((features & cpu_xop) != 0) {
+		return scrypt_ROMix_xop;
+	}
+#endif
+
+#if defined(SCRYPT_SALSA64_AVX)
+	if ((features & cpu_avx) != 0) {
+		return scrypt_ROMix_avx;
+	}
+#endif
+
+#if defined(SCRYPT_SALSA64_SSSE3)
+	if ((features & cpu_ssse3) != 0) {
+		return scrypt_ROMix_ssse3;
+	}
+#endif
+
+#if defined(SCRYPT_SALSA64_SSE2)
+	if ((features & cpu_sse2) != 0) {
+		return scrypt_ROMix_sse2;
+	}
+#endif
+	return scrypt_ROMix_basic;
+}
+#endif
+
+
+#if defined(SCRYPT_TEST_SPEED)
+/* Returns the OR of the cpu_* flags for every variant that is both compiled in and reported by detect_cpu(). */
+static size_t
+available_implementations(void) {
+	const size_t features = detect_cpu();
+	size_t usable = 0;
+#if defined(SCRYPT_SALSA64_AVX2)
+	if ((features & cpu_avx2) != 0)
+		usable = usable | cpu_avx2;
+#endif
+
+#if defined(SCRYPT_SALSA64_XOP)
+	if ((features & cpu_xop) != 0)
+		usable = usable | cpu_xop;
+#endif
+
+#if defined(SCRYPT_SALSA64_AVX)
+	if ((features & cpu_avx) != 0)
+		usable = usable | cpu_avx;
+#endif
+
+#if defined(SCRYPT_SALSA64_SSSE3)
+	if ((features & cpu_ssse3) != 0)
+		usable = usable | cpu_ssse3;
+#endif
+
+#if defined(SCRYPT_SALSA64_SSE2)
+	if ((features & cpu_sse2) != 0)
+		usable = usable | cpu_sse2;
+#endif
+
+	return usable;
+}
+#endif
+
+/* Runs scrypt_test_mix_instance on each compiled-in chunk mixer the CPU reports (plus basic, if built) and ANDs the results, starting from 1. */
+static int
+scrypt_test_mix(void) {
+	static const uint8_t expected[16] = {
+		0xf8,0x92,0x9b,0xf8,0xcc,0x1d,0xce,0x2e,0x13,0x82,0xac,0x96,0xb2,0x6c,0xee,0x2c,
+	};
+	const size_t features = detect_cpu();
+	int ok = 1;
+
+#if defined(SCRYPT_SALSA64_AVX2)
+	if ((features & cpu_avx2) != 0)
+		ok = ok & scrypt_test_mix_instance(scrypt_ChunkMix_avx2, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
+#endif
+
+#if defined(SCRYPT_SALSA64_XOP)
+	if ((features & cpu_xop) != 0)
+		ok = ok & scrypt_test_mix_instance(scrypt_ChunkMix_xop, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
+#endif
+
+#if defined(SCRYPT_SALSA64_AVX)
+	if ((features & cpu_avx) != 0)
+		ok = ok & scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
+#endif
+
+#if defined(SCRYPT_SALSA64_SSSE3)
+	if ((features & cpu_ssse3) != 0)
+		ok = ok & scrypt_test_mix_instance(scrypt_ChunkMix_ssse3, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
+#endif
+
+#if defined(SCRYPT_SALSA64_SSE2)
+	if ((features & cpu_sse2) != 0)
+		ok = ok & scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
+#endif
+
+#if defined(SCRYPT_SALSA64_BASIC)
+	ok = ok & scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected);
+#endif
+
+	return ok;
+}
+
--- a/algo/argon2/ar2/sj/scrypt-jane-test-vectors.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-test-vectors.h
@@ -0,0 +1,28 @@
+typedef struct scrypt_test_setting_t {
+	const char *pw, *salt; /* password and salt strings */
+	uint8_t Nfactor, rfactor, pfactor; /* three cost factors; how they are decoded is not visible here */
+} scrypt_test_setting;
+/* Test inputs; the consumer of this table is not visible in this header. */
+static const scrypt_test_setting post_settings[] = {
+	{"", "", 3, 0, 0},
+	{"password", "NaCl", 9, 3, 4},
+	{0, 0, 0, 0, 0} /* all-zero entry, presumably the end-of-list marker (confirm against the caller) */
+};
+/* 64-byte rows; presumably one expected output per non-zero post_settings entry, in the same order (confirm). */
+#if defined(SCRYPT_SKEIN512)
+	#if defined(SCRYPT_SALSA64)
+		static const uint8_t post_vectors[][64] = {
+			{0xd2,0xad,0x32,0x05,0xee,0x80,0xe3,0x44,0x70,0xc6,0x34,0xde,0x05,0xb6,0xcf,0x60,
+			 0x89,0x98,0x70,0xc0,0xb8,0xf5,0x54,0xf1,0xa6,0xb2,0xc8,0x76,0x34,0xec,0xc4,0x59,
+			 0x8e,0x64,0x42,0xd0,0xa9,0xed,0xe7,0x19,0xb2,0x8a,0x11,0xc6,0xa6,0xbf,0xa7,0xa9,
+			 0x4e,0x44,0x32,0x7e,0x12,0x91,0x9d,0xfe,0x52,0x48,0xa8,0x27,0xb3,0xfc,0xb1,0x89},
+			{0xd6,0x67,0xd2,0x3e,0x30,0x1e,0x9d,0xe2,0x55,0x68,0x17,0x3d,0x2b,0x75,0x5a,0xe5,
+			 0x04,0xfb,0x3d,0x0e,0x86,0xe0,0xaa,0x1d,0xd4,0x72,0xda,0xb0,0x79,0x41,0xb7,0x99,
+			 0x68,0xe5,0xd9,0x55,0x79,0x7d,0xc3,0xd1,0xa6,0x56,0xc1,0xbe,0x0b,0x6c,0x62,0x23,
+			 0x66,0x67,0x91,0x47,0x99,0x13,0x6b,0xe3,0xda,0x59,0x55,0x18,0x67,0x8f,0x2e,0x3b}
+		};
+	#endif /* SCRYPT_SALSA64: when it is undefined, post_vectors is not defined at all */
+#else /* no SCRYPT_SKEIN512: a single all-zero placeholder row */
+	static const uint8_t post_vectors[][64] = {{0}};
+#endif
+