Initial upload v3.4.7

2026-02-22 16:33:08 +00:00 · 2016-09-22 13:16:18 -04:00
parent a3c8079774
commit a35039bc05
480 changed files with 211015 additions and 3 deletions
--- a/algo/argon2/.dirstamp
+++ b/algo/argon2/.dirstamp
--- a/algo/argon2/ar2/.dirstamp
+++ b/algo/argon2/ar2/.dirstamp
--- a/algo/argon2/ar2/ar2-scrypt-jane.c
+++ b/algo/argon2/ar2/ar2-scrypt-jane.c
@@ -0,0 +1,249 @@
+/*
+	scrypt-jane by Andrew M, https://github.com/floodyberry/scrypt-jane
+
+	Public Domain or MIT License, whichever is easier
+*/
+
+#include <string.h>
+
+#if defined( _WINDOWS )
+#if !defined( QT_GUI )
+extern "C" {
+#endif
+#endif
+
+#include "ar2-scrypt-jane.h"
+
+#include "sj/scrypt-jane-portable.h"
+#include "sj/scrypt-jane-hash.h"
+#include "sj/scrypt-jane-romix.h"
+#include "sj/scrypt-jane-test-vectors.h"
+
+#define scrypt_maxNfactor 30  /* (1 << (30 + 1)) = ~2 billion */
+#if (SCRYPT_BLOCK_BYTES == 64)
+#define scrypt_r_32kb 8 /* (1 << 8) = 256 * 2 blocks in a chunk * 64 bytes = Max of 32kb in a chunk */
+#elif (SCRYPT_BLOCK_BYTES == 128)
+#define scrypt_r_32kb 7 /* (1 << 7) = 128 * 2 blocks in a chunk * 128 bytes = Max of 32kb in a chunk */
+#elif (SCRYPT_BLOCK_BYTES == 256)
+#define scrypt_r_32kb 6 /* (1 << 6) = 64 * 2 blocks in a chunk * 256 bytes = Max of 32kb in a chunk */
+#elif (SCRYPT_BLOCK_BYTES == 512)
+#define scrypt_r_32kb 5 /* (1 << 5) = 32 * 2 blocks in a chunk * 512 bytes = Max of 32kb in a chunk */
+#endif
+#define scrypt_maxrfactor scrypt_r_32kb /* 32kb */
+#define scrypt_maxpfactor 25  /* (1 << 25) = ~33 million */
+
+#include <stdio.h>
+//#include <malloc.h>
+
+static void NORETURN
+scrypt_fatal_error_default(const char *msg) {
+	fprintf(stderr, "%s\n", msg);
+	exit(1);
+}
+
+static scrypt_fatal_errorfn scrypt_fatal_error = scrypt_fatal_error_default;
+
+void scrypt_set_fatal_error(scrypt_fatal_errorfn fn) {
+	scrypt_fatal_error = fn;
+}
+
+static int scrypt_power_on_self_test(void)
+{
+	const scrypt_test_setting *t;
+	uint8_t test_digest[64];
+	uint32_t i;
+	int res = 7, scrypt_valid;
+
+	if (!scrypt_test_mix()) {
+#if !defined(SCRYPT_TEST)
+		scrypt_fatal_error("scrypt: mix function power-on-self-test failed");
+#endif
+		res &= ~1;
+	}
+
+	if (!scrypt_test_hash()) {
+#if !defined(SCRYPT_TEST)
+		scrypt_fatal_error("scrypt: hash function power-on-self-test failed");
+#endif
+		res &= ~2;
+	}
+
+	for (i = 0, scrypt_valid = 1; post_settings[i].pw; i++) {
+		t = post_settings + i;
+		scrypt((uint8_t *)t->pw, strlen(t->pw), (uint8_t *)t->salt, strlen(t->salt), t->Nfactor, t->rfactor, t->pfactor, test_digest, sizeof(test_digest));
+		scrypt_valid &= scrypt_verify(post_vectors[i], test_digest, sizeof(test_digest));
+	}
+
+	if (!scrypt_valid) {
+#if !defined(SCRYPT_TEST)
+		scrypt_fatal_error("scrypt: scrypt power-on-self-test failed");
+#endif
+		res &= ~4;
+	}
+
+	return res;
+}
+
+typedef struct scrypt_aligned_alloc_t {
+	uint8_t *mem, *ptr;
+} scrypt_aligned_alloc;
+
+#ifdef SCRYPT_TEST_SPEED
+
+static uint8_t *mem_base = (uint8_t *)0;
+static size_t mem_bump = 0;
+
+/* allocations are assumed to be multiples of 64 bytes and total allocations not to exceed ~1.01gb */
+static scrypt_aligned_alloc scrypt_alloc(uint64_t size)
+{
+	scrypt_aligned_alloc aa;
+	if (!mem_base) {
+		mem_base = (uint8_t *)malloc((1024 * 1024 * 1024) + (1024 * 1024) + (SCRYPT_BLOCK_BYTES - 1));
+		if (!mem_base)
+			scrypt_fatal_error("scrypt: out of memory");
+		mem_base = (uint8_t *)(((size_t)mem_base + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1));
+	}
+	aa.mem = mem_base + mem_bump;
+	aa.ptr = aa.mem;
+	mem_bump += (size_t)size;
+	return aa;
+}
+
+static void scrypt_free(scrypt_aligned_alloc *aa) {
+	mem_bump = 0;
+}
+
+#else
+
+/* Returns at least `size` usable bytes; aa.ptr is aa.mem rounded up to a SCRYPT_BLOCK_BYTES boundary. */
+static scrypt_aligned_alloc scrypt_alloc(uint64_t size) {
+	static const size_t max_alloc = (size_t)-1;
+	scrypt_aligned_alloc aa;
+	size += (SCRYPT_BLOCK_BYTES - 1); /* slack consumed by rounding aa.ptr up */
+	if (size > max_alloc)
+		scrypt_fatal_error("scrypt: not enough address space on this CPU to allocate required memory");
+	aa.mem = (uint8_t *)malloc((size_t)size);
+	if (!aa.mem) /* checked before the result is used for pointer arithmetic */
+		scrypt_fatal_error("scrypt: out of memory");
+	aa.ptr = (uint8_t *)(((uintptr_t)aa.mem + (SCRYPT_BLOCK_BYTES - 1)) & ~(uintptr_t)(SCRYPT_BLOCK_BYTES - 1));
+	return aa;
+}
+
+static void scrypt_free(scrypt_aligned_alloc *aa)
+{
+	free(aa->mem);
+}
+
+#endif /* SCRYPT_TEST_SPEED */
+
+
+/*
+	scrypt(password, salt) with N = 2^(Nfactor+1), r = 2^rfactor, p = 2^pfactor; writes
+	`bytes` bytes to out. Out-of-range factors are reported via scrypt_fatal_error.
+	Buffer sizes are computed in 64-bit/size_t so (p + 1) * chunk_bytes cannot wrap at 32 bits.
+*/
+void scrypt(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len,
+	uint8_t Nfactor, uint8_t rfactor, uint8_t pfactor, uint8_t *out, size_t bytes)
+{
+	scrypt_aligned_alloc YX, V;
+	uint8_t *X, *Y;
+	uint32_t N, r, p, chunk_bytes, i;
+#if !defined(SCRYPT_CHOOSE_COMPILETIME)
+	scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix();
+#endif
+#if !defined(SCRYPT_TEST)
+	static int power_on_self_test = 0;
+	if (!power_on_self_test) {
+		power_on_self_test = 1; /* set first: the self test calls scrypt() itself */
+		if (!scrypt_power_on_self_test())
+			scrypt_fatal_error("scrypt: power on self test failed");
+	}
+#endif
+
+	if (Nfactor > scrypt_maxNfactor)
+		scrypt_fatal_error("scrypt: N out of range");
+	if (rfactor > scrypt_maxrfactor)
+		scrypt_fatal_error("scrypt: r out of range");
+	if (pfactor > scrypt_maxpfactor)
+		scrypt_fatal_error("scrypt: p out of range");
+
+	N = ((uint32_t)1 << (Nfactor + 1)); /* unsigned: 1 << 31 overflows a signed int when Nfactor == 30 */
+	r = ((uint32_t)1 << rfactor);
+	p = ((uint32_t)1 << pfactor);
+	chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2;
+	V = scrypt_alloc((uint64_t)N * chunk_bytes);
+	YX = scrypt_alloc((uint64_t)(p + 1) * chunk_bytes); /* one Y chunk followed by p X chunks */
+
+	/* 1: X = PBKDF2(password, salt) */
+	Y = YX.ptr;
+	X = Y + chunk_bytes;
+	scrypt_pbkdf2(password, password_len, salt, salt_len, 1, X, (size_t)chunk_bytes * p);
+
+	/* 2: X = ROMix(X) */
+	for (i = 0; i < p; i++)
+		scrypt_ROMix((scrypt_mix_word_t *)(X + ((size_t)chunk_bytes * i)), (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V.ptr, N, r);
+
+	/* 3: Out = PBKDF2(password, X) */
+	scrypt_pbkdf2(password, password_len, X, (size_t)chunk_bytes * p, 1, out, bytes);
+	scrypt_ensure_zero(YX.ptr, (size_t)(p + 1) * chunk_bytes);
+	scrypt_free(&V);
+	scrypt_free(&YX);
+}
+
+#define Nfactor 8
+#define rfactor 0
+#define pfactor 0
+#if (SCRYPT_BLOCK_BYTES == 64)
+#define chunk_bytes 128
+#elif (SCRYPT_BLOCK_BYTES == 128)
+#define chunk_bytes 256
+#elif (SCRYPT_BLOCK_BYTES == 256)
+#define chunk_bytes 512
+#elif (SCRYPT_BLOCK_BYTES == 512)
+#define chunk_bytes 1024
+#endif
+
+void my_scrypt(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t *out)
+{
+	scrypt_aligned_alloc YX, V;
+	uint8_t *X, *Y;
+
+#if !defined(SCRYPT_CHOOSE_COMPILETIME)
+	scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix();
+#endif
+
+/*
+#if !defined(SCRYPT_TEST)
+	static int power_on_self_test = 0;
+	if (!power_on_self_test) {
+		power_on_self_test = 1;
+		if (!scrypt_power_on_self_test())
+			scrypt_fatal_error("scrypt: power on self test failed");
+	}
+#endif
+*/
+	V = scrypt_alloc((uint64_t)512 * chunk_bytes);
+	YX = scrypt_alloc(2 * chunk_bytes);
+
+	/* 1: X = PBKDF2(password, salt) */
+	Y = YX.ptr;
+	X = Y + chunk_bytes;
+	scrypt_pbkdf2(password, password_len, salt, salt_len, 1, X, chunk_bytes);
+
+	/* 2: X = ROMix(X) */
+	scrypt_ROMix((scrypt_mix_word_t *)X, (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V.ptr, 512, 1);
+
+	/* 3: Out = PBKDF2(password, X) */
+	scrypt_pbkdf2(password, password_len, X, chunk_bytes, 1, out, 32);
+
+	scrypt_ensure_zero(YX.ptr, 2 * chunk_bytes);
+
+	scrypt_free(&V);
+	scrypt_free(&YX);
+}
+
+#if defined( _WINDOWS )
+#if !defined( QT_GUI )
+} /* extern "C" */
+#endif
+#endif
--- a/algo/argon2/ar2/ar2-scrypt-jane.h
+++ b/algo/argon2/ar2/ar2-scrypt-jane.h
@@ -0,0 +1,35 @@
+#ifndef AR2_SCRYPT_JANE_H
+#define AR2_SCRYPT_JANE_H
+
+#ifdef _MSC_VER
+#undef SCRYPT_CHOOSE_COMPILETIME
+#endif
+//#define SCRYPT_TEST
+#define SCRYPT_SKEIN512
+#define SCRYPT_SALSA64
+
+/*
+	Nfactor: Increases CPU & Memory Hardness
+	N = (1 << (Nfactor + 1)): How many times to mix a chunk and how many temporary chunks are used
+
+	rfactor: Increases Memory Hardness
+	r = (1 << rfactor): How large a chunk is
+
+	pfactor: Increases CPU Hardness
+	p = (1 << pfactor): Number of times to mix the main chunk
+
+	A block is the basic mixing unit (salsa/chacha block = 64 bytes)
+	A chunk is (2 * r) blocks
+
+	~Memory used = (N + 2) * ((2 * r) * block size)
+*/
+
+#include <stdlib.h>
+#include <stdint.h>
+
+typedef void (*scrypt_fatal_errorfn)(const char *msg);
+void scrypt_set_fatal_error(scrypt_fatal_errorfn fn);
+
+void scrypt(const unsigned char *password, size_t password_len, const unsigned char *salt, size_t salt_len, unsigned char Nfactor, unsigned char rfactor, unsigned char pfactor, unsigned char *out, size_t bytes);
+void my_scrypt(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t *out);
+#endif /* AR2_SCRYPT_JANE_H */
--- a/algo/argon2/ar2/argon2.c
+++ b/algo/argon2/ar2/argon2.c
@@ -0,0 +1,284 @@
+/*
+ * Argon2 source code package
+ *
+ * Written by Daniel Dinu and Dmitry Khovratovich, 2015
+ *
+ * This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
+ *
+ * You should have received a copy of the CC0 Public Domain Dedication along
+ * with
+ * this software. If not, see
+ * <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <limits.h>
+
+#include "argon2.h"
+#include "cores.h"
+
+/* Error messages, indexed by argon2_error_codes value */
+static const char *Argon2_ErrorMessage[] = {
+	/*{ARGON2_OK, */ "OK",
+	/*},
+
+	{ARGON2_OUTPUT_PTR_NULL, */ "Output pointer is NULL",
+	/*},
+
+{ARGON2_OUTPUT_TOO_SHORT, */ "Output is too short",
+	/*},
+{ARGON2_OUTPUT_TOO_LONG, */ "Output is too long",
+	/*},
+
+{ARGON2_PWD_TOO_SHORT, */ "Password is too short",
+	/*},
+{ARGON2_PWD_TOO_LONG, */ "Password is too long",
+	/*},
+
+{ARGON2_SALT_TOO_SHORT, */ "Salt is too short",
+	/*},
+{ARGON2_SALT_TOO_LONG, */ "Salt is too long",
+	/*},
+
+{ARGON2_AD_TOO_SHORT, */ "Associated data is too short",
+	/*},
+{ARGON2_AD_TOO_LONG, */ "Associated data is too long",
+	/*},
+
+{ARGON2_SECRET_TOO_SHORT, */ "Secret is too short",
+	/*},
+{ARGON2_SECRET_TOO_LONG, */ "Secret is too long",
+	/*},
+
+{ARGON2_TIME_TOO_SMALL, */ "Time cost is too small",
+	/*},
+{ARGON2_TIME_TOO_LARGE, */ "Time cost is too large",
+	/*},
+
+{ARGON2_MEMORY_TOO_LITTLE, */ "Memory cost is too small",
+	/*},
+{ARGON2_MEMORY_TOO_MUCH, */ "Memory cost is too large",
+	/*},
+
+{ARGON2_LANES_TOO_FEW, */ "Too few lanes",
+	/*},
+{ARGON2_LANES_TOO_MANY, */ "Too many lanes",
+	/*},
+
+{ARGON2_PWD_PTR_MISMATCH, */ "Password pointer is NULL, but password length is not 0",
+	/*},
+{ARGON2_SALT_PTR_MISMATCH, */ "Salt pointer is NULL, but salt length is not 0",
+	/*},
+{ARGON2_SECRET_PTR_MISMATCH, */ "Secret pointer is NULL, but secret length is not 0",
+	/*},
+{ARGON2_AD_PTR_MISMATCH, */ "Associated data pointer is NULL, but ad length is not 0",
+	/*},
+
+{ARGON2_MEMORY_ALLOCATION_ERROR, */ "Memory allocation error",
+	/*},
+
+{ARGON2_FREE_MEMORY_CBK_NULL, */ "The free memory callback is NULL",
+	/*},
+{ARGON2_ALLOCATE_MEMORY_CBK_NULL, */ "The allocate memory callback is NULL",
+	/*},
+
+{ARGON2_INCORRECT_PARAMETER, */ "Argon2_Context context is NULL",
+	/*},
+{ARGON2_INCORRECT_TYPE, */ "There is no such version of Argon2",
+	/*},
+
+{ARGON2_OUT_PTR_MISMATCH, */ "Output pointer mismatch",
+	/*},
+
+{ARGON2_THREADS_TOO_FEW, */ "Not enough threads",
+	/*},
+{ARGON2_THREADS_TOO_MANY, */ "Too many threads",
+	/*},
+{ARGON2_MISSING_ARGS, */ "Missing arguments", /*},*/
+};
+
+int argon2d(argon2_context *context) { return argon2_core(context, Argon2_d); }
+
+int argon2i(argon2_context *context) { return argon2_core(context, Argon2_i); }
+
+int verify_d(argon2_context *context, const char *hash)
+{
+	int result;
+	/*if (0 == context->outlen || NULL == hash) {
+		return ARGON2_OUT_PTR_MISMATCH;
+	}*/
+	/* Run the Argon2d core on context (presumably filling context->out), then compare with hash. */
+	result = argon2_core(context, Argon2_d);
+
+	if (ARGON2_OK != result) {
+		return result; /* hashing failed: hand back argon2_core's result unchanged */
+	}
+	/* Yields 1 when the first 32 bytes match and 0 otherwise -- not 0-on-success. */
+	return 0 == memcmp(hash, context->out, 32);
+}
+
+/* Returns the message for error_code, or "Unknown error code." when it is negative or too large. */
+const char *error_message(int error_code)
+{
+	enum {
+		/* Compile-time check: a division by zero here means the message table
+		   and argon2_error_codes have different lengths */
+		ERROR_STRING_CHECK =
+			1 / !!((sizeof(Argon2_ErrorMessage) / sizeof(Argon2_ErrorMessage[0])) ==
+				   ARGON2_ERROR_CODES_LENGTH)
+	};
+	if (error_code >= 0 && error_code < ARGON2_ERROR_CODES_LENGTH) {
+		return Argon2_ErrorMessage[error_code];
+	}
+	return "Unknown error code.";
+}
+
+/* encoding/decoding helpers */
+
+/*
+ * Some macros for constant-time comparisons. These work over values in
+ * the 0..255 range. Returned value is 0x00 on "false", 0xFF on "true".
+ */
+#define EQ(x, y) ((((0U - ((unsigned)(x) ^ (unsigned)(y))) >> 8) & 0xFF) ^ 0xFF)
+#define GT(x, y) ((((unsigned)(y) - (unsigned)(x)) >> 8) & 0xFF)
+#define GE(x, y) (GT(y, x) ^ 0xFF)
+#define LT(x, y) GT(y, x)
+#define LE(x, y) GE(y, x)
+
+/*
+ * Convert value x (0..63) to corresponding Base64 character.
+ */
+static int b64_byte_to_char(unsigned x) {
+//static inline int b64_byte_to_char(unsigned x) {
+	return (LT(x, 26) & (x + 'A')) |
+		   (GE(x, 26) & LT(x, 52) & (x + ('a' - 26))) |
+		   (GE(x, 52) & LT(x, 62) & (x + ('0' - 52))) | (EQ(x, 62) & '+') |
+		   (EQ(x, 63) & '/');
+}
+
+/*
+ * Convert some bytes to Base64. 'dst_len' is the length (in characters)
+ * of the output buffer 'dst'; if that buffer is not large enough to
+ * receive the result (including the terminating 0), then (size_t)-1
+ * is returned. Otherwise, the zero-terminated Base64 string is written
+ * in the buffer, and the output length (counted WITHOUT the terminating
+ * zero) is returned.
+ */
+static size_t to_base64(char *dst, size_t dst_len, const void *src)
+{
+	size_t olen;
+	const unsigned char *buf;
+	unsigned acc, acc_len;
+
+	olen = 43;
+	/*switch (32 % 3) {
+	case 2:
+		olen++;*/
+	/* fall through */
+	/*case 1:
+		olen += 2;
+		break;
+	}*/
+	if (dst_len <= olen) {
+		return (size_t)-1;
+	}
+	acc = 0;
+	acc_len = 0;
+	buf = (const unsigned char *)src;
+	size_t src_len = 32;
+	while (src_len-- > 0) {
+		acc = (acc << 8) + (*buf++);
+		acc_len += 8;
+		while (acc_len >= 6) {
+			acc_len -= 6;
+			*dst++ = b64_byte_to_char((acc >> acc_len) & 0x3F);
+		}
+	}
+	if (acc_len > 0) {
+		*dst++ = b64_byte_to_char((acc << (6 - acc_len)) & 0x3F);
+	}
+	*dst++ = 0;
+	return olen;
+}
+
+/* ==================================================================== */
+/*
+ * Code specific to Argon2i.
+ *
+ * The code below applies the following format:
+ *
+ *  $argon2i$m=<num>,t=<num>,p=<num>[,keyid=<bin>][,data=<bin>][$<bin>[$<bin>]]
+ *
+ * where <num> is a decimal integer (positive, fits in an 'unsigned long')
+ * and <bin> is Base64-encoded data (no '=' padding characters, no newline
+ * or whitespace). The "keyid" is a binary identifier for a key (up to 8
+ * bytes); "data" is associated data (up to 32 bytes). When the 'keyid'
+ * (resp. the 'data') is empty, then it is omitted from the output.
+ *
+ * The last two binary chunks (encoded in Base64) are, in that order,
+ * the salt and the output. Both are optional, but you cannot have an
+ * output without a salt. The binary salt length is between 8 and 48 bytes.
+ * The output length is always exactly 32 bytes.
+ */
+
+int encode_string(char *dst, size_t dst_len, argon2_context *ctx)
+{
+#define SS(str)                                                                \
+	do {                                                                       \
+		size_t pp_len = strlen(str);                                           \
+		if (pp_len >= dst_len) {                                               \
+			return 0;                                                          \
+		}                                                                      \
+		memcpy(dst, str, pp_len + 1);                                          \
+		dst += pp_len;                                                         \
+		dst_len -= pp_len;                                                     \
+	} while (0)
+
+#define SX(x)                                                                  \
+	do {                                                                       \
+		char tmp[30];                                                          \
+		sprintf(tmp, "%lu", (unsigned long)(x));                               \
+		SS(tmp);                                                               \
+	} while (0);
+
+#define SB(buf)                                                                \
+	do {                                                                       \
+		size_t sb_len = to_base64(dst, dst_len, buf);                          \
+		if (sb_len == (size_t)-1) {                                            \
+			return 0;                                                          \
+		}                                                                      \
+		dst += sb_len;                                                         \
+		dst_len -= sb_len;                                                     \
+	} while (0);
+
+	SS("$argon2i$m=");
+	SX(16);
+	SS(",t=");
+	SX(2);
+	SS(",p=");
+	SX(1);
+
+	/*if (ctx->adlen > 0) {
+		SS(",data=");
+		SB(ctx->ad, ctx->adlen);
+	}*/
+
+	/*if (ctx->saltlen == 0)
+		return 1;*/
+
+	SS("$");
+	SB(ctx->salt);
+
+	/*if (ctx->outlen32 == 0)
+		return 1;*/
+
+	SS("$");
+	SB(ctx->out);
+	return 1;
+
+#undef SS
+#undef SX
+#undef SB
+}
--- a/algo/argon2/ar2/argon2.h
+++ b/algo/argon2/ar2/argon2.h
@@ -0,0 +1,292 @@
+/*
+ * Argon2 source code package
+ *
+ * Written by Daniel Dinu and Dmitry Khovratovich, 2015
+ *
+ * This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
+ *
+ * You should have received a copy of the CC0 Public Domain Dedication along
+ * with
+ * this software. If not, see
+ * <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+#ifndef ARGON2_H
+#define ARGON2_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include <limits.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*************************Argon2 input parameter
+ * restrictions**************************************************/
+
+/* Minimum and maximum number of lanes (degree of parallelism) */
+#define ARGON2_MIN_LANES UINT32_C(1)
+#define ARGON2_MAX_LANES UINT32_C(0xFFFFFF)
+
+/* Minimum and maximum number of threads */
+#define ARGON2_MIN_THREADS UINT32_C(1)
+#define ARGON2_MAX_THREADS UINT32_C(0xFFFFFF)
+
+/* Number of synchronization points between lanes per pass */
+#define ARGON2_SYNC_POINTS UINT32_C(4)
+
+/* Minimum and maximum digest size in bytes */
+#define ARGON2_MIN_OUTLEN UINT32_C(4)
+#define ARGON2_MAX_OUTLEN UINT32_C(0xFFFFFFFF)
+
+/* Minimum and maximum number of memory blocks (each of BLOCK_SIZE bytes) */
+#define ARGON2_MIN_MEMORY (2 * ARGON2_SYNC_POINTS) /* 2 blocks per slice */
+
+#define ARGON2_MIN(a, b) ((a) < (b) ? (a) : (b))
+/* Max memory size is half the addressing space, topping at 2^32 blocks (4 TB)
+ */
+#define ARGON2_MAX_MEMORY_BITS                                                 \
+    ARGON2_MIN(UINT32_C(32), (sizeof(void *) * CHAR_BIT - 10 - 1))
+#define ARGON2_MAX_MEMORY                                                      \
+    ARGON2_MIN(UINT32_C(0xFFFFFFFF), UINT64_C(1) << ARGON2_MAX_MEMORY_BITS)
+
+/* Minimum and maximum number of passes */
+#define ARGON2_MIN_TIME UINT32_C(1)
+#define ARGON2_MAX_TIME UINT32_C(0xFFFFFFFF)
+
+/* Minimum and maximum password length in bytes */
+#define ARGON2_MIN_PWD_LENGTH UINT32_C(0)
+#define ARGON2_MAX_PWD_LENGTH UINT32_C(0xFFFFFFFF)
+
+/* Minimum and maximum associated data length in bytes */
+#define ARGON2_MIN_AD_LENGTH UINT32_C(0)
+#define ARGON2_MAX_AD_LENGTH UINT32_C(0xFFFFFFFF)
+
+/* Minimum and maximum salt length in bytes */
+#define ARGON2_MIN_SALT_LENGTH UINT32_C(8)
+#define ARGON2_MAX_SALT_LENGTH UINT32_C(0xFFFFFFFF)
+
+/* Minimum and maximum key length in bytes */
+#define ARGON2_MIN_SECRET UINT32_C(0)
+#define ARGON2_MAX_SECRET UINT32_C(0xFFFFFFFF)
+
+#define ARGON2_FLAG_CLEAR_PASSWORD (UINT32_C(1) << 0)
+#define ARGON2_FLAG_CLEAR_SECRET (UINT32_C(1) << 1)
+#define ARGON2_FLAG_CLEAR_MEMORY (UINT32_C(1) << 2)
+#define ARGON2_DEFAULT_FLAGS                                                   \
+    (ARGON2_FLAG_CLEAR_PASSWORD | ARGON2_FLAG_CLEAR_MEMORY)
+
+/* Error codes */
+typedef enum Argon2_ErrorCodes {
+    ARGON2_OK = 0,
+
+    ARGON2_OUTPUT_PTR_NULL = 1,
+
+    ARGON2_OUTPUT_TOO_SHORT = 2,
+    ARGON2_OUTPUT_TOO_LONG = 3,
+
+    ARGON2_PWD_TOO_SHORT = 4,
+    ARGON2_PWD_TOO_LONG = 5,
+
+    ARGON2_SALT_TOO_SHORT = 6,
+    ARGON2_SALT_TOO_LONG = 7,
+
+    ARGON2_AD_TOO_SHORT = 8,
+    ARGON2_AD_TOO_LONG = 9,
+
+    ARGON2_SECRET_TOO_SHORT = 10,
+    ARGON2_SECRET_TOO_LONG = 11,
+
+    ARGON2_TIME_TOO_SMALL = 12,
+    ARGON2_TIME_TOO_LARGE = 13,
+
+    ARGON2_MEMORY_TOO_LITTLE = 14,
+    ARGON2_MEMORY_TOO_MUCH = 15,
+
+    ARGON2_LANES_TOO_FEW = 16,
+    ARGON2_LANES_TOO_MANY = 17,
+
+    ARGON2_PWD_PTR_MISMATCH = 18,    /* NULL ptr with non-zero length */
+    ARGON2_SALT_PTR_MISMATCH = 19,   /* NULL ptr with non-zero length */
+    ARGON2_SECRET_PTR_MISMATCH = 20, /* NULL ptr with non-zero length */
+    ARGON2_AD_PTR_MISMATCH = 21,     /* NULL ptr with non-zero length */
+
+    ARGON2_MEMORY_ALLOCATION_ERROR = 22,
+
+    ARGON2_FREE_MEMORY_CBK_NULL = 23,
+    ARGON2_ALLOCATE_MEMORY_CBK_NULL = 24,
+
+    ARGON2_INCORRECT_PARAMETER = 25,
+    ARGON2_INCORRECT_TYPE = 26,
+
+    ARGON2_OUT_PTR_MISMATCH = 27,
+
+    ARGON2_THREADS_TOO_FEW = 28,
+    ARGON2_THREADS_TOO_MANY = 29,
+
+    ARGON2_MISSING_ARGS = 30,
+
+    ARGON2_ERROR_CODES_LENGTH /* Do NOT remove; Do NOT add error codes after
+                                 this
+                                 error code */
+} argon2_error_codes;
+
+/* Memory allocator types --- for external allocation */
+typedef int (*allocate_fptr)(uint8_t **memory, size_t bytes_to_allocate);
+typedef void (*deallocate_fptr)(uint8_t *memory, size_t bytes_to_allocate);
+
+/* Argon2 external data structures */
+
+/*
+ *****Context: structure to hold Argon2 inputs:
+ * output array and its length,
+ * password and its length,
+ * salt and its length,
+ * secret and its length,
+ * associated data and its length,
+ * number of passes, amount of used memory (in KBytes, can be rounded up a bit)
+ * number of parallel threads that will be run.
+ * All the parameters above affect the output hash value.
+ * Additionally, two function pointers can be provided to allocate and
+ deallocate the memory (if NULL, memory will be allocated internally).
+ * Also, three flags indicate whether to erase password, secret as soon as they
+ are pre-hashed (and thus not needed anymore), and the entire memory
+ ****************************
+ Simplest situation: you have output array out[8], password is stored in
+ pwd[32], salt is stored in salt[16], you do not have keys nor associated data.
+ You need to spend 1 GB of RAM and you run 5 passes of Argon2d with 4 parallel
+ lanes.
+ You want to erase the password, but you're OK with last pass not being erased.
+ You want to use the default memory allocator.
+ */
+typedef struct Argon2_Context {
+    uint8_t *out;    /* output array */
+    uint8_t *pwd;    /* password array */
+    uint8_t *salt;    /* salt array */
+    /*uint8_t *secret;*/    /* key array */
+    /*uint8_t *ad;*/   /* associated data array */
+
+    allocate_fptr allocate_cbk; /* pointer to memory allocator */
+    deallocate_fptr free_cbk;   /* pointer to memory deallocator */
+
+    /*uint32_t outlen;*/ /* digest length */
+    uint32_t pwdlen;  /* password length */
+    /*uint32_t saltlen;*/ /* salt length */
+    /*uint32_t secretlen;*/ /* key length */
+    /*uint32_t adlen;*/ /* associated data length */
+    /*uint32_t t_cost;*/  /* number of passes */
+    /*uint32_t m_cost;*/  /* amount of memory requested (KB) */
+    /*uint32_t lanes;*/   /* number of lanes */
+    /*uint32_t threads;*/ /* maximum number of threads */
+    /*uint32_t flags;*/ /* array of bool options */
+
+} argon2_context;
+
+/**
+ * Function to hash the inputs in the memory-hard fashion (uses Argon2i)
+ * @param  out  Pointer to the memory where the hash digest will be written
+ * @param  outlen Digest length in bytes
+ * @param  in Pointer to the input (password)
+ * @param  inlen Input length in bytes
+ * @param  salt Pointer to the salt
+ * @param  saltlen Salt length in bytes
+ * @pre    @a out must have at least @a outlen bytes allocated
+ * @pre    @a in must be at least @a inlen bytes long
+ * @pre    @a salt must be at least @a saltlen bytes long
+ * @return Zero if successful, 1 otherwise.
+ */
+/*int hash_argon2i(void *out, size_t outlen, const void *in, size_t inlen,
+                 const void *salt, size_t saltlen, unsigned int t_cost,
+                 unsigned int m_cost);*/
+
+/* same for argon2d */
+/*int hash_argon2d(void *out, size_t outlen, const void *in, size_t inlen,
+                 const void *salt, size_t saltlen, unsigned int t_cost,
+                 unsigned int m_cost);*/
+
+/*
+ * **************Argon2d: Version of Argon2 that picks memory blocks depending
+ * on the password and salt. Only for side-channel-free
+ * environment!!***************
+ * @param  context  Pointer to current Argon2 context
+ * @return  Zero if successful, a non zero error code otherwise
+ */
+int argon2d(argon2_context *context);
+
+/*
+ *  * **************Argon2i: Version of Argon2 that picks memory blocks
+ *independent on the password and salt. Good for side-channels,
+ ******************* but worse w.r.t. tradeoff attacks if
+ *******************only one pass is used***************
+ * @param  context  Pointer to current Argon2 context
+ * @return  Zero if successful, a non zero error code otherwise
+ */
+int argon2i(argon2_context *context);
+
+/*
+ *   * **************Argon2di: Reserved name***************
+ * @param  context  Pointer to current Argon2 context
+ * @return  Zero if successful, a non zero error code otherwise
+ */
+int argon2di(argon2_context *context);
+
+/*
+ *   * **************Argon2ds: Argon2d hardened against GPU attacks, 20%
+ * slower***************
+ * @param  context  Pointer to current Argon2 context
+ * @return  Zero if successful, a non zero error code otherwise
+ */
+int argon2ds(argon2_context *context);
+
+/*
+ *   * **************Argon2id: First half-pass over memory is
+ *password-independent, the rest are password-dependent
+ ********************OK against side channels: they reduce to 1/2-pass
+ *Argon2i***************
+ * @param  context  Pointer to current Argon2 context
+ * @return  Zero if successful, a non zero error code otherwise
+ */
+int argon2id(argon2_context *context);
+
+/*
+ * Verify if a given password is correct for Argon2d hashing
+ * @param  context  Pointer to current Argon2 context
+ * @param  hash  The password hash to verify; exactly 32 bytes are compared
+ * @return  1 if the hash matches, 0 if it does not; when hashing itself
+ * fails, the non zero result of the Argon2 core is returned instead
+ */
+int verify_d(argon2_context *context, const char *hash);
+
+/*
+ * Get the associated error message for given error code
+ * @return  The error message associated with the given error code
+ */
+const char *error_message(int error_code);
+
+/* ==================================================================== */
+/*
+ * Code specific to Argon2i.
+ *
+ * The code below applies the following format:
+ *
+ *  $argon2i$m=<num>,t=<num>,p=<num>[,keyid=<bin>][,data=<bin>][$<bin>[$<bin>]]
+ *
+ * where <num> is a decimal integer (positive, fits in an 'unsigned long')
+ * and <bin> is Base64-encoded data (no '=' padding characters, no newline
+ * or whitespace). The "keyid" is a binary identifier for a key (up to 8
+ * bytes); "data" is associated data (up to 32 bytes). When the 'keyid'
+ * (resp. the 'data') is empty, then it is omitted from the output.
+ *
+ * The last two binary chunks (encoded in Base64) are, in that order,
+ * the salt and the output. Both are optional, but you cannot have an
+ * output without a salt. The binary salt length is between 8 and 48 bytes.
+ * The output length is always exactly 32 bytes.
+ */
+
+int encode_string(char *dst, size_t dst_len, argon2_context *ctx);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
--- a/algo/argon2/ar2/bench.c
+++ b/algo/argon2/ar2/bench.c
@@ -0,0 +1,114 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+#include "argon2.h"
+
+static uint64_t rdtsc(void)
+{
+#ifdef _MSC_VER
+	return __rdtsc();
+#else
+#if defined(__amd64__) || defined(__x86_64__)
+	uint64_t rax, rdx;
+	__asm__ __volatile__("rdtsc" : "=a"(rax), "=d"(rdx) : :);
+	return (rdx << 32) | rax;
+#elif defined(__i386__) || defined(__i386) || defined(__X86__)
+	uint64_t rax;
+	__asm__ __volatile__("rdtsc" : "=A"(rax) : :);
+	return rax;
+#else
+#error "Not implemented!"
+#endif
+#endif
+}
+
+/*
+ * Benchmarks Argon2 with salt length 16, password length 16, t_cost 1,
+   and different m_cost and threads
+ */
+static void benchmark()
+{
+#define BENCH_OUTLEN 16
+#define BENCH_INLEN 16
+	const uint32_t inlen = BENCH_INLEN;
+	const unsigned outlen = BENCH_OUTLEN;
+	unsigned char out[BENCH_OUTLEN];
+	unsigned char pwd_array[BENCH_INLEN];
+	unsigned char salt_array[BENCH_INLEN];
+#undef BENCH_INLEN
+#undef BENCH_OUTLEN
+
+	uint32_t t_cost = 1;
+	uint32_t m_cost;
+	uint32_t thread_test[6] = {1, 2, 4, 6, 8, 16};
+
+	memset(pwd_array, 0, inlen);
+	memset(salt_array, 1, inlen);
+
+	for (m_cost = (uint32_t)1 << 10; m_cost <= (uint32_t)1 << 22; m_cost *= 2) {
+		unsigned i;
+		for (i = 0; i < 6; ++i) {
+			argon2_context context;
+			uint32_t thread_n = thread_test[i];
+			uint64_t stop_cycles, stop_cycles_i;
+			clock_t stop_time;
+			uint64_t delta_d, delta_i;
+			double mcycles_d, mcycles_i, run_time;
+
+			clock_t start_time = clock();
+			uint64_t start_cycles = rdtsc();
+
+			context.out = out;
+			context.outlen = outlen;
+			context.pwd = pwd_array;
+			context.pwdlen = inlen;
+			context.salt = salt_array;
+			context.saltlen = inlen;
+			context.secret = NULL;
+			context.secretlen = 0;
+			context.ad = NULL;
+			context.adlen = 0;
+			context.t_cost = t_cost;
+			context.m_cost = m_cost;
+			context.lanes = thread_n;
+			context.threads = thread_n;
+			context.allocate_cbk = NULL;
+			context.free_cbk = NULL;
+			context.flags = 0;
+
+			argon2d(&context);
+			stop_cycles = rdtsc();
+			argon2i(&context);
+			stop_cycles_i = rdtsc();
+			stop_time = clock();
+
+			delta_d = (stop_cycles - start_cycles) / (m_cost);
+			delta_i = (stop_cycles_i - stop_cycles) / (m_cost);
+			mcycles_d = (double)(stop_cycles - start_cycles) / (1UL << 20);
+			mcycles_i = (double)(stop_cycles_i - stop_cycles) / (1UL << 20);
+			printf("Argon2d %d iterations  %d MiB %d threads:  %2.2f cpb %2.2f "
+				   "Mcycles \n",
+				   t_cost, m_cost >> 10, thread_n, (float)delta_d / 1024,
+				   mcycles_d);
+			printf("Argon2i %d iterations  %d MiB %d threads:  %2.2f cpb %2.2f "
+				   "Mcycles \n",
+				   t_cost, m_cost >> 10, thread_n, (float)delta_i / 1024,
+				   mcycles_i);
+
+			run_time = ((double)stop_time - start_time) / (CLOCKS_PER_SEC);
+			printf("%2.4f seconds\n\n", run_time);
+		}
+	}
+}
+
+int main()
+{
+	benchmark();
+	return ARGON2_OK;
+}
--- a/algo/argon2/ar2/blake2/blake2-impl.h
+++ b/algo/argon2/ar2/blake2/blake2-impl.h
@@ -0,0 +1,143 @@
+#ifndef PORTABLE_BLAKE2_IMPL_H
+#define PORTABLE_BLAKE2_IMPL_H
+
+#include <stdint.h>
+#include <string.h>
+
+/*
+ * BLAKE2_INLINE: compiler-specific spelling of the inline hint used by the
+ * static helpers in this header. It expands to nothing on compilers that
+ * match neither branch.
+ */
+#if defined(_MSC_VER)
+#define BLAKE2_INLINE __inline
+#elif defined(__GNUC__) || defined(__clang__)
+#define BLAKE2_INLINE __inline__
+#else
+#define BLAKE2_INLINE
+#endif
+
+/* Argon2 Team - Begin Code */
+/*
+   Defines NATIVE_LITTLE_ENDIAN when the target is recognised as little-endian.
+   Not an exhaustive list, but should cover the majority of modern platforms.
+   The load/store helpers below stay correct when the macro is left undefined
+   (they fall back to byte-by-byte access), so this is only a performance
+   tweak.
+*/
+#if (defined(__BYTE_ORDER__) &&                                                \
+	 (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) ||                           \
+	defined(__LITTLE_ENDIAN__) || defined(__ARMEL__) || defined(__MIPSEL__) || \
+	defined(__AARCH64EL__) || defined(__amd64__) || defined(__i386__) ||       \
+	defined(_M_IX86) || defined(_M_X64) || defined(_M_AMD64) ||                \
+	defined(_M_ARM)
+#define NATIVE_LITTLE_ENDIAN
+#endif
+/* Argon2 Team - End Code */
+
+/* Reads four bytes at src as a little-endian 32-bit value. */
+static BLAKE2_INLINE uint32_t load32(const void *src) {
+#if defined(NATIVE_LITTLE_ENDIAN)
+	uint32_t value;
+	memcpy(&value, src, sizeof value);
+	return value;
+#else
+	const uint8_t *bytes = (const uint8_t *)src;
+	return (uint32_t)bytes[0] |
+		   ((uint32_t)bytes[1] << 8) |
+		   ((uint32_t)bytes[2] << 16) |
+		   ((uint32_t)bytes[3] << 24);
+#endif
+}
+
+/* Reads eight bytes at src as a little-endian 64-bit value. */
+static BLAKE2_INLINE uint64_t load64(const void *src) {
+#if defined(NATIVE_LITTLE_ENDIAN)
+	uint64_t value;
+	memcpy(&value, src, sizeof value);
+	return value;
+#else
+	const uint8_t *bytes = (const uint8_t *)src;
+	uint64_t value = 0;
+	unsigned k;
+	for (k = 0; k < 8; ++k) {
+		value |= (uint64_t)bytes[k] << (8 * k);
+	}
+	return value;
+#endif
+}
+
+/* Writes w to dst as four bytes, least-significant byte first. */
+static BLAKE2_INLINE void store32(void *dst, uint32_t w) {
+#if defined(NATIVE_LITTLE_ENDIAN)
+	memcpy(dst, &w, sizeof w);
+#else
+	uint8_t *bytes = (uint8_t *)dst;
+	unsigned k;
+	for (k = 0; k < 4; ++k) {
+		bytes[k] = (uint8_t)(w >> (8 * k));
+	}
+#endif
+}
+
+/* Writes w to dst as eight bytes, least-significant byte first. */
+static BLAKE2_INLINE void store64(void *dst, uint64_t w) {
+#if defined(NATIVE_LITTLE_ENDIAN)
+	memcpy(dst, &w, sizeof w);
+#else
+	uint8_t *bytes = (uint8_t *)dst;
+	unsigned k;
+	for (k = 0; k < 8; ++k) {
+		bytes[k] = (uint8_t)(w >> (8 * k));
+	}
+#endif
+}
+
+/* Reads six bytes at src as a little-endian value; the top 16 bits are 0. */
+static BLAKE2_INLINE uint64_t load48(const void *src) {
+	const uint8_t *bytes = (const uint8_t *)src;
+	uint64_t value = 0;
+	unsigned k;
+	for (k = 0; k < 6; ++k) {
+		value |= (uint64_t)bytes[k] << (8 * k);
+	}
+	return value;
+}
+
+/* Writes the low 48 bits of w to dst as six bytes, least-significant first. */
+static BLAKE2_INLINE void store48(void *dst, uint64_t w) {
+	uint8_t *bytes = (uint8_t *)dst;
+	unsigned k;
+	for (k = 0; k < 6; ++k) {
+		bytes[k] = (uint8_t)(w >> (8 * k));
+	}
+}
+
+/*
+ * Rotates the 32-bit word w right by c bits.
+ * The count is reduced modulo 32, so c == 0 (or any multiple of 32) returns
+ * w unchanged instead of performing an undefined full-width shift.
+ */
+static BLAKE2_INLINE uint32_t rotr32(const uint32_t w, const unsigned c) {
+	const unsigned n = c & 31u;
+	return (w >> n) | (w << ((32u - n) & 31u));
+}
+
+/*
+ * Rotates the 64-bit word w right by c bits.
+ * The count is reduced modulo 64, so c == 0 (or any multiple of 64) returns
+ * w unchanged instead of performing an undefined full-width shift.
+ */
+static BLAKE2_INLINE uint64_t rotr64(const uint64_t w, const unsigned c) {
+	const unsigned n = c & 63u;
+	return (w >> n) | (w << ((64u - n) & 63u));
+}
+
+/*
+ * Zeroes n bytes at v. memset is called through a volatile function pointer,
+ * which is intended to keep the compiler from optimising the wipe away.
+ */
+static BLAKE2_INLINE void burn(void *v, size_t n) {
+	typedef void *(*memset_fn)(void *, int, size_t);
+	static const volatile memset_fn wipe = &memset;
+	wipe(v, 0, n);
+}
+
+#endif
--- a/algo/argon2/ar2/blake2/blake2.h
+++ b/algo/argon2/ar2/blake2/blake2.h
@@ -0,0 +1,76 @@
+#ifndef PORTABLE_BLAKE2_H
+#define PORTABLE_BLAKE2_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <limits.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Size constants, in bytes, used by the BLAKE2b declarations below. */
+enum blake2b_constant {
+	BLAKE2B_BLOCKBYTES = 128, /* size of blake2b_state.buf */
+	BLAKE2B_OUTBYTES = 64,
+	BLAKE2B_KEYBYTES = 64,
+	BLAKE2B_SALTBYTES = 16,    /* size of blake2b_param.salt */
+	BLAKE2B_PERSONALBYTES = 16 /* size of blake2b_param.personal */
+};
+
+/*
+ * BLAKE2b parameter block.
+ * Packed to 1-byte alignment so the fields are laid out without padding; the
+ * number after each field is the running byte total, ending at 64 (checked by
+ * the size-check enum in this header). blake2b_init_param reads the struct as
+ * eight little-endian 64-bit words.
+ */
+#pragma pack(push, 1)
+typedef struct __blake2b_param {
+	uint8_t digest_length;                   /* 1 */
+	uint8_t key_length;                      /* 2 */
+	uint8_t fanout;                          /* 3 */
+	uint8_t depth;                           /* 4 */
+	uint32_t leaf_length;                    /* 8 */
+	uint64_t node_offset;                    /* 16 */
+	uint8_t node_depth;                      /* 17 */
+	uint8_t inner_length;                    /* 18 */
+	uint8_t reserved[14];                    /* 32 */
+	uint8_t salt[BLAKE2B_SALTBYTES];         /* 48 */
+	uint8_t personal[BLAKE2B_PERSONALBYTES]; /* 64 */
+} blake2b_param;
+#pragma pack(pop)
+
+/* Working state of one BLAKE2b hash computation. */
+typedef struct __blake2b_state {
+	uint64_t h[8];     /* chaining value, updated by each compression */
+	uint64_t t[2];     /* byte counter: t[0] low word, t[1] carry word */
+	uint64_t f[2];     /* finalisation flags: f[0] last block, f[1] last node */
+	unsigned buflen;   /* number of bytes currently held in buf */
+	unsigned outlen;   /* digest length recorded at initialisation */
+	uint8_t last_node; /* non-zero: also set f[1] when finalising */
+	uint8_t buf[BLAKE2B_BLOCKBYTES]; /* pending input block */
+} blake2b_state;
+
+/* Ensure param structs have not been wrongly padded */
+/*
+ * Poor man's static_assert: each enumerator divides 1 by !!(condition), so a
+ * false condition becomes a division by zero in a constant expression and
+ * the build fails. The second check requires sizeof(blake2b_param) to equal
+ * sizeof(uint64_t) * CHAR_BIT, i.e. 64 bytes when the first check holds.
+ */
+enum {
+	blake2_size_check_0 = 1 / !!(CHAR_BIT == 8),
+	blake2_size_check_2 =
+		1 / !!(sizeof(blake2b_param) == sizeof(uint64_t) * CHAR_BIT)
+};
+
+/*
+ * Streaming API.
+ * NOTE(review): the implementations in blake2b.c are specialised rather than
+ * general-purpose: blake2b_update never reads inlen, blake2b_init_key is a
+ * stub that only returns 0, and my_blake2b_update just appends to the state
+ * buffer without compressing. Confirm against blake2b.c before reusing these
+ * as a generic BLAKE2b interface.
+ */
+int blake2b_init(blake2b_state *S, size_t outlen);
+int blake2b_init_key(blake2b_state *S, size_t outlen, const void *key,
+					 size_t keylen);
+int blake2b_init_param(blake2b_state *S, const blake2b_param *P);
+int blake2b_update(blake2b_state *S, const void *in, size_t inlen);
+void my_blake2b_update(blake2b_state *S, const void *in, size_t inlen);
+int blake2b_final(blake2b_state *S, void *out, size_t outlen);
+
+/* Simple API: one-shot hash of a 64-byte input; key and keylen are unused. */
+int blake2b(void *out, const void *in, const void *key, size_t keylen);
+
+/* Argon2 Team - Begin Code */
+/* Hashes a 4-byte length prefix followed by 1024 bytes read from in. */
+int blake2b_long(void *out, const void *in);
+/* Argon2 Team - End Code */
+/* Miouyouyou: fixed-size variant that reads 72 bytes and writes 1024 bytes. */
+void blake2b_too(void *out, const void *in);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
--- a/algo/argon2/ar2/blake2/blamka-round-opt.h
+++ b/algo/argon2/ar2/blake2/blamka-round-opt.h
@@ -0,0 +1,162 @@
+#ifndef BLAKE_ROUND_MKA_OPT_H
+#define BLAKE_ROUND_MKA_OPT_H
+
+#include "blake2-impl.h"
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
+
+#include <immintrin.h>
+#if defined(__XOP__) && (defined(__GNUC__) || defined(__clang__))
+#include <x86intrin.h>
+#endif
+
+/*
+ * _mm_roti_epi64(x, c): rotate each 64-bit lane of x. Callers in this header
+ * pass a negative count (-32, -24, -16, -63); the macro negates it and
+ * rotates right by that amount.
+ *   - SSSE3: 32 is a 32-bit lane swap, 24 and 16 are byte shuffles (r24/r16),
+ *     63 is (x >> 63) ^ (x + x), anything else is the generic shift pair.
+ *   - SSE2 only: always the generic shift pair.
+ * Nothing is defined here when __XOP__ is set; presumably the intrinsic of
+ * the same name from <x86intrin.h> is used instead -- TODO confirm.
+ * NOTE: the SSSE3 expansion is an unparenthesised conditional expression, so
+ * it is only safe as the whole right-hand side of an assignment.
+ */
+#if !defined(__XOP__)
+#if defined(__SSSE3__)
+#define r16                                                                    \
+    (_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
+#define r24                                                                    \
+    (_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
+#define _mm_roti_epi64(x, c)                                                   \
+    (-(c) == 32)                                                               \
+        ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1))                      \
+        : (-(c) == 24)                                                         \
+              ? _mm_shuffle_epi8((x), r24)                                     \
+              : (-(c) == 16)                                                   \
+                    ? _mm_shuffle_epi8((x), r16)                               \
+                    : (-(c) == 63)                                             \
+                          ? _mm_xor_si128(_mm_srli_epi64((x), -(c)),           \
+                                          _mm_add_epi64((x), (x)))             \
+                          : _mm_xor_si128(_mm_srli_epi64((x), -(c)),           \
+                                          _mm_slli_epi64((x), 64 - (-(c))))
+#else /* defined(__SSE2__) */
+#define _mm_roti_epi64(r, c)                                                   \
+    _mm_xor_si128(_mm_srli_epi64((r), -(c)), _mm_slli_epi64((r), 64 - (-(c))))
+#endif
+#else
+#endif
+
+/*
+ * BlaMka mixing step on two 64-bit lanes at once:
+ * per lane, x + y + 2 * (low 32 bits of x) * (low 32 bits of y).
+ */
+static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
+    const __m128i sum = _mm_add_epi64(x, y);
+    const __m128i product = _mm_mul_epu32(x, y);
+    const __m128i doubled = _mm_add_epi64(product, product);
+    return _mm_add_epi64(sum, doubled);
+}
+
+/*
+ * G1: first half of the mixing function, applied to two register sets
+ * (suffix 0 and suffix 1) in lock step. A and C are updated with fBlaMka;
+ * D is rotated by 32 and B by 24 after being XORed with the fresh A / C.
+ * All eight arguments are modified in place and must be lvalues.
+ */
+#define G1(A0, B0, C0, D0, A1, B1, C1, D1)                                     \
+    do {                                                                       \
+        A0 = fBlaMka(A0, B0);                                                  \
+        A1 = fBlaMka(A1, B1);                                                  \
+                                                                               \
+        D0 = _mm_xor_si128(D0, A0);                                            \
+        D1 = _mm_xor_si128(D1, A1);                                            \
+                                                                               \
+        D0 = _mm_roti_epi64(D0, -32);                                          \
+        D1 = _mm_roti_epi64(D1, -32);                                          \
+                                                                               \
+        C0 = fBlaMka(C0, D0);                                                  \
+        C1 = fBlaMka(C1, D1);                                                  \
+                                                                               \
+        B0 = _mm_xor_si128(B0, C0);                                            \
+        B1 = _mm_xor_si128(B1, C1);                                            \
+                                                                               \
+        B0 = _mm_roti_epi64(B0, -24);                                          \
+        B1 = _mm_roti_epi64(B1, -24);                                          \
+    } while ((void)0, 0)
+
+/*
+ * G2: second half of the mixing function. Same structure as G1, but D is
+ * rotated by 16 and B by 63. All eight arguments are modified in place and
+ * must be lvalues.
+ */
+#define G2(A0, B0, C0, D0, A1, B1, C1, D1)                                     \
+    do {                                                                       \
+        A0 = fBlaMka(A0, B0);                                                  \
+        A1 = fBlaMka(A1, B1);                                                  \
+                                                                               \
+        D0 = _mm_xor_si128(D0, A0);                                            \
+        D1 = _mm_xor_si128(D1, A1);                                            \
+                                                                               \
+        D0 = _mm_roti_epi64(D0, -16);                                          \
+        D1 = _mm_roti_epi64(D1, -16);                                          \
+                                                                               \
+        C0 = fBlaMka(C0, D0);                                                  \
+        C1 = fBlaMka(C1, D1);                                                  \
+                                                                               \
+        B0 = _mm_xor_si128(B0, C0);                                            \
+        B1 = _mm_xor_si128(B1, C1);                                            \
+                                                                               \
+        B0 = _mm_roti_epi64(B0, -63);                                          \
+        B1 = _mm_roti_epi64(B1, -63);                                          \
+    } while ((void)0, 0)
+
+/*
+ * DIAGONALIZE / UNDIAGONALIZE: lane permutations of the B, C and D register
+ * pairs, used by BLAKE2_ROUND around its second G1/G2 pass. The C pair is
+ * swapped; the B and D pairs have their 64-bit lanes re-paired (via
+ * _mm_alignr_epi8 on SSSE3, via unpack instructions on plain SSE2).
+ * UNDIAGONALIZE is intended to restore the order DIAGONALIZE changed.
+ * A0 and A1 are accepted for symmetry but are not touched by either macro.
+ */
+#if defined(__SSSE3__)
+#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)                            \
+    do {                                                                       \
+        __m128i t0 = _mm_alignr_epi8(B1, B0, 8);                               \
+        __m128i t1 = _mm_alignr_epi8(B0, B1, 8);                               \
+        B0 = t0;                                                               \
+        B1 = t1;                                                               \
+                                                                               \
+        t0 = C0;                                                               \
+        C0 = C1;                                                               \
+        C1 = t0;                                                               \
+                                                                               \
+        t0 = _mm_alignr_epi8(D1, D0, 8);                                       \
+        t1 = _mm_alignr_epi8(D0, D1, 8);                                       \
+        D0 = t1;                                                               \
+        D1 = t0;                                                               \
+    } while ((void)0, 0)
+
+#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)                          \
+    do {                                                                       \
+        __m128i t0 = _mm_alignr_epi8(B0, B1, 8);                               \
+        __m128i t1 = _mm_alignr_epi8(B1, B0, 8);                               \
+        B0 = t0;                                                               \
+        B1 = t1;                                                               \
+                                                                               \
+        t0 = C0;                                                               \
+        C0 = C1;                                                               \
+        C1 = t0;                                                               \
+                                                                               \
+        t0 = _mm_alignr_epi8(D0, D1, 8);                                       \
+        t1 = _mm_alignr_epi8(D1, D0, 8);                                       \
+        D0 = t1;                                                               \
+        D1 = t0;                                                               \
+    } while ((void)0, 0)
+#else /* SSE2 */
+#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)                            \
+    do {                                                                       \
+        __m128i t0 = D0;                                                       \
+        __m128i t1 = B0;                                                       \
+        D0 = C0;                                                               \
+        C0 = C1;                                                               \
+        C1 = D0;                                                               \
+        D0 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t0, t0));               \
+        D1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(D1, D1));               \
+        B0 = _mm_unpackhi_epi64(B0, _mm_unpacklo_epi64(B1, B1));               \
+        B1 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(t1, t1));               \
+    } while ((void)0, 0)
+
+#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)                          \
+    do {                                                                       \
+        __m128i t0 = C0;                                                       \
+        C0 = C1;                                                               \
+        C1 = t0;                                                               \
+        t0 = B0;                                                               \
+        __m128i t1 = D0;                                                       \
+        B0 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(B0, B0));               \
+        B1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(B1, B1));               \
+        D0 = _mm_unpackhi_epi64(D0, _mm_unpacklo_epi64(D1, D1));               \
+        D1 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t1, t1));               \
+    } while ((void)0, 0)
+#endif
+
+/*
+ * BLAKE2_ROUND: one full round on eight registers, modified in place:
+ * G1+G2, DIAGONALIZE, G1+G2 again, UNDIAGONALIZE.
+ * Note the argument order here is (A0, A1, B0, B1, ...) while the helper
+ * macros take (A0, B0, C0, D0, A1, ...); the body does the reordering.
+ */
+#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1)                           \
+    do {                                                                       \
+        G1(A0, B0, C0, D0, A1, B1, C1, D1);                                    \
+        G2(A0, B0, C0, D0, A1, B1, C1, D1);                                    \
+                                                                               \
+        DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1);                           \
+                                                                               \
+        G1(A0, B0, C0, D0, A1, B1, C1, D1);                                    \
+        G2(A0, B0, C0, D0, A1, B1, C1, D1);                                    \
+                                                                               \
+        UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1);                         \
+    } while ((void)0, 0)
+
+#endif
--- a/algo/argon2/ar2/blake2/blamka-round-ref.h
+++ b/algo/argon2/ar2/blake2/blamka-round-ref.h
@@ -0,0 +1,39 @@
+#ifndef BLAKE_ROUND_MKA_H
+#define BLAKE_ROUND_MKA_H
+
+#include "blake2.h"
+#include "blake2-impl.h"
+
+/*
+ * BlaMka mixing function, designed by the Lyra PHC team:
+ * x + y + 2 * (low 32 bits of x) * (low 32 bits of y), modulo 2^64.
+ */
+static BLAKE2_INLINE uint64_t fBlaMka(uint64_t x, uint64_t y) {
+    const uint64_t lo_x = x & UINT64_C(0xFFFFFFFF);
+    const uint64_t lo_y = y & UINT64_C(0xFFFFFFFF);
+    return x + y + 2 * (lo_x * lo_y);
+}
+
+/*
+ * G: mixing function on four 64-bit words, modified in place. Additions are
+ * done with fBlaMka and the XOR results are rotated right by 32, 24, 16 and
+ * 63 bits in turn. Arguments are evaluated several times and must be lvalues
+ * without side effects.
+ */
+#define G(a, b, c, d)                                                          \
+    do {                                                                       \
+        a = fBlaMka(a, b);                                                     \
+        d = rotr64(d ^ a, 32);                                                 \
+        c = fBlaMka(c, d);                                                     \
+        b = rotr64(b ^ c, 24);                                                 \
+        a = fBlaMka(a, b);                                                     \
+        d = rotr64(d ^ a, 16);                                                 \
+        c = fBlaMka(c, d);                                                     \
+        b = rotr64(b ^ c, 63);                                                 \
+    } while ((void)0, 0)
+
+/*
+ * BLAKE2_ROUND_NOMSG: one round over sixteen 64-bit words with no message
+ * input. The first four G calls take (v0,v4,v8,v12) ... (v3,v7,v11,v15);
+ * the last four take the shifted groupings (v0,v5,v10,v15) ... (v3,v4,v9,v14).
+ * All sixteen arguments are modified in place.
+ */
+#define BLAKE2_ROUND_NOMSG(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,   \
+                           v12, v13, v14, v15)                                 \
+    do {                                                                       \
+        G(v0, v4, v8, v12);                                                    \
+        G(v1, v5, v9, v13);                                                    \
+        G(v2, v6, v10, v14);                                                   \
+        G(v3, v7, v11, v15);                                                   \
+        G(v0, v5, v10, v15);                                                   \
+        G(v1, v6, v11, v12);                                                   \
+        G(v2, v7, v8, v13);                                                    \
+        G(v3, v4, v9, v14);                                                    \
+    } while ((void)0, 0)
+
+#endif
--- a/algo/argon2/ar2/blake2b.c
+++ b/algo/argon2/ar2/blake2b.c
@@ -0,0 +1,316 @@
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <inttypes.h>
+
+#include "blake2/blake2.h"
+#include "blake2/blake2-impl.h"
+
+#if defined(_MSC_VER)
+/*
+ * Fallback for MSVC toolchains whose <inttypes.h> does not provide the PRI
+ * macros. The macros hold only the length modifier and conversion letter:
+ * every use site already writes the '%' itself ("%" PRIu64), so embedding a
+ * second '%' here would print the literal text "%llu".
+ */
+#ifndef PRIu64
+#define PRIu64 "llu"
+#endif
+#ifndef PRIx64
+#define PRIx64 "llx"
+#endif
+#endif
+
+/*
+ * Initialisation vector: copied into h[] by blake2b_init0 and used for
+ * v[8..15] at the start of every compression.
+ */
+static const uint64_t blake2b_IV[8] = {
+	UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b),
+	UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1),
+	UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f),
+	UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179)
+};
+
+/*
+ * Message word schedule: row r lists the order in which the 16 message words
+ * are fed to the G calls of round r. Rows 10 and 11 repeat rows 0 and 1.
+ */
+static const unsigned int blake2b_sigma[12][16] = {
+	{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
+	{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
+	{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
+	{9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
+	{2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
+	{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
+	{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
+	{6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
+	{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
+	{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
+};
+
+/* Sets the last-node flag word f[1] to all ones. */
+static BLAKE2_INLINE void blake2b_set_lastnode(blake2b_state *S) {
+	S->f[1] = UINT64_MAX;
+}
+
+/*
+ * Sets the last-block flag word f[0] to all ones; when S->last_node is
+ * non-zero the last-node flag is set as well.
+ */
+static BLAKE2_INLINE void blake2b_set_lastblock(blake2b_state *S) {
+	if (S->last_node != 0) {
+		blake2b_set_lastnode(S);
+	}
+	S->f[0] = UINT64_MAX;
+}
+
+/* Adds inc to the 128-bit counter t[1]:t[0], carrying into t[1] on wrap. */
+static BLAKE2_INLINE void blake2b_increment_counter(blake2b_state *S, uint64_t inc) {
+	const uint64_t low = S->t[0] + inc;
+
+	S->t[0] = low;
+	if (low < inc) { /* unsigned wrap-around means a carry occurred */
+		S->t[1] += 1;
+	}
+}
+
+/*
+ * Wipes the whole state, then marks it as finalised so that it is not
+ * mistaken for a fresh state.
+ */
+static BLAKE2_INLINE void blake2b_invalidate_state(blake2b_state *S) {
+	burn(S, sizeof(blake2b_state));
+	blake2b_set_lastblock(S);
+}
+
+/* Clears the whole state and loads the initialisation vector into h[]. */
+static BLAKE2_INLINE void blake2b_init0(blake2b_state *S) {
+	unsigned int k;
+
+	memset(S, 0, sizeof(*S));
+	for (k = 0; k < 8; ++k) {
+		S->h[k] = blake2b_IV[k];
+	}
+}
+
+/*
+void print_state(blake2b_state BlakeHash)
+{
+	printf(".h = {UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 "),\n"
+				"UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 "),\n"
+				"UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 "),\n"
+				"UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 ")},\n"
+		".t = {UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 ")},\n"
+		".f = {UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 ")}\n",
+		BlakeHash.h[0], BlakeHash.h[1], BlakeHash.h[2], BlakeHash.h[3],
+		BlakeHash.h[4], BlakeHash.h[5], BlakeHash.h[6], BlakeHash.h[7],
+		BlakeHash.t[0], BlakeHash.t[1],
+		BlakeHash.f[0], BlakeHash.f[1]);
+	printf(".buf = {");
+	for (register uint8_t i = 0; i < BLAKE2B_BLOCKBYTES; i++)
+		printf("%" PRIu8 ", ", BlakeHash.buf[i]);
+	puts("\n");
+	printf("}\n.buflen = %d\n.outlen = %d\n",
+		  BlakeHash.buflen, BlakeHash.outlen);
+	printf(".last_node = %" PRIu8 "\n", BlakeHash.last_node);
+	fflush(stdout);
+}
+*/
+
+/*
+ * Precomputed initial state that blake2b_init copies instead of building a
+ * parameter block on every call.
+ * NOTE(review): h[] looks like blake2b_IV with the fanout = 1 / depth = 1
+ * parameter bytes (0x01010000) already XORed into h[0] and the digest-length
+ * byte left at zero -- confirm before changing any value.
+ */
+static const blake2b_state miou = {
+	.h = {
+		UINT64_C(7640891576939301128), UINT64_C(13503953896175478587),
+		UINT64_C(4354685564936845355), UINT64_C(11912009170470909681),
+		UINT64_C(5840696475078001361), UINT64_C(11170449401992604703),
+		UINT64_C(2270897969802886507), UINT64_C(6620516959819538809)
+	},
+	.t = {UINT64_C(0), UINT64_C(0)},
+	.f = {UINT64_C(0), UINT64_C(0)},
+	.buf = {
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+	},
+	.buflen = 0,
+	.outlen = 64, /* digest length stored in every state copied from here */
+	.last_node = 0
+};
+
+
+/*
+ * Initialises S from the parameter block P.
+ * The state is reset to the IV, then XORed with P read as eight
+ * little-endian 64-bit words; S->outlen is taken from P->digest_length.
+ * Returns 0 on success, -1 when either pointer is NULL.
+ */
+int blake2b_init_param(blake2b_state *S, const blake2b_param *P)
+{
+	const unsigned char *param_bytes;
+	unsigned int word;
+
+	if (S == NULL || P == NULL) {
+		return -1;
+	}
+	param_bytes = (const unsigned char *)P;
+
+	blake2b_init0(S);
+	/* IV XOR parameter block */
+	for (word = 0; word < 8; ++word) {
+		S->h[word] ^= load64(param_bytes + word * sizeof(S->h[word]));
+	}
+	S->outlen = P->digest_length;
+	return 0;
+}
+
+/*
+ * Debug helper: prints the first chaining word of the precomputed template
+ * state next to h[0], followed by outlen, then flushes stdout.
+ * outlen is a size_t, so it is converted explicitly to match the %lu
+ * conversion (the previous %ld did not match the argument type).
+ */
+void compare_buffs(uint64_t *h, size_t outlen)
+{
+	printf("miou : %" PRIu64 " - h : %" PRIu64 " - outlen : %lu\n",
+		   miou.h[0], h[0], (unsigned long)outlen);
+	fflush(stdout);
+}
+
+/*
+ * Sequential blake2b initialization.
+ * Copies the precomputed template state and mixes the requested digest
+ * length into h[0]. The length is XORed in, as for every other parameter
+ * byte; an addition only gives the same word while no carry occurs (true for
+ * 32 and 64, false for e.g. 8 or 24).
+ * S->outlen keeps the template's value; only h[0] depends on outlen.
+ * Returns 0 on success, -1 when S is NULL or outlen is 0 or larger than
+ * BLAKE2B_OUTBYTES (S is left untouched in that case).
+ */
+int blake2b_init(blake2b_state *S, size_t outlen)
+{
+	if (NULL == S || 0 == outlen || outlen > BLAKE2B_OUTBYTES) {
+		return -1;
+	}
+
+	memcpy(S, &miou, sizeof(*S));
+	S->h[0] ^= (uint64_t)outlen;
+	return 0;
+}
+
+/*
+ * Debug helper: prints `size` 64-bit words from `array` as a C initialiser
+ * list named `name`, each wrapped in UINT64_C(...).
+ * The index has the same width as `size`; a narrower 8-bit index would wrap
+ * before reaching any size above 255 and the loop would never end.
+ */
+void print64(const char *name, const uint64_t *array, uint16_t size)
+{
+	uint16_t i;
+
+	printf("%s = {", name);
+	for (i = 0; i < size; i++) {
+		printf("UINT64_C(%" PRIu64 "), ", array[i]);
+	}
+	printf("};\n");
+}
+
+/*
+ * Keyed initialisation is not implemented: no argument is read, S is left
+ * untouched, and 0 is returned.
+ * NOTE(review): a caller that treats 0 as "state is ready" will go on to
+ * hash with whatever S already contained -- confirm nobody relies on this.
+ */
+int blake2b_init_key(blake2b_state *S, size_t outlen, const void *key, size_t keylen)
+{
+	(void)S;
+	(void)outlen;
+	(void)key;
+	(void)keylen;
+	return 0;
+}
+
+/*
+ * Compression function: mixes one 128-byte block into the chaining value
+ * S->h. The block is read as sixteen little-endian 64-bit message words;
+ * twelve rounds are run over the 16-word working vector v, and the result is
+ * folded back as h[i] ^= v[i] ^ v[i + 8].
+ * Only t[0] and f[0] are mixed into v; the t[1] and f[1] terms are commented
+ * out, so the high counter word and the last-node flag do not influence the
+ * output of this implementation.
+ */
+static void blake2b_compress(blake2b_state *S, const uint8_t *block)
+{
+	uint64_t m[16];
+	uint64_t v[16];
+	unsigned int i, r;
+
+	/* Message words. */
+	for (i = 0; i < 16; ++i) {
+		m[i] = load64(block + i * 8);
+	}
+
+	/* Lower half of v: current chaining value. */
+	for (i = 0; i < 8; ++i) {
+		v[i] = S->h[i];
+	}
+
+	/* Upper half of v: IV, with the counter and finalisation flag mixed in. */
+	v[8] = blake2b_IV[0];
+	v[9] = blake2b_IV[1];
+	v[10] = blake2b_IV[2];
+	v[11] = blake2b_IV[3];
+	v[12] = blake2b_IV[4] ^ S->t[0];
+	v[13] = blake2b_IV[5]/* ^ S->t[1]*/;
+	v[14] = blake2b_IV[6] ^ S->f[0];
+	v[15] = blake2b_IV[7]/* ^ S->f[1]*/;
+
+/* G: quarter-round i of round r; adds two message words chosen by sigma. */
+#define G(r, i, a, b, c, d)                                                    \
+	do {                                                                       \
+		a = a + b + m[blake2b_sigma[r][2 * i + 0]];                            \
+		d = rotr64(d ^ a, 32);                                                 \
+		c = c + d;                                                             \
+		b = rotr64(b ^ c, 24);                                                 \
+		a = a + b + m[blake2b_sigma[r][2 * i + 1]];                            \
+		d = rotr64(d ^ a, 16);                                                 \
+		c = c + d;                                                             \
+		b = rotr64(b ^ c, 63);                                                 \
+	} while ((void)0, 0)
+
+/* ROUND: four G calls on (0,4,8,12)-style groups, then four on shifted groups. */
+#define ROUND(r)                                                               \
+	do {                                                                       \
+		G(r, 0, v[0], v[4], v[8], v[12]);                                      \
+		G(r, 1, v[1], v[5], v[9], v[13]);                                      \
+		G(r, 2, v[2], v[6], v[10], v[14]);                                     \
+		G(r, 3, v[3], v[7], v[11], v[15]);                                     \
+		G(r, 4, v[0], v[5], v[10], v[15]);                                     \
+		G(r, 5, v[1], v[6], v[11], v[12]);                                     \
+		G(r, 6, v[2], v[7], v[8], v[13]);                                      \
+		G(r, 7, v[3], v[4], v[9], v[14]);                                      \
+	} while ((void)0, 0)
+
+	for (r = 0; r < 12; ++r) ROUND(r);
+
+	/* Feed-forward into the chaining value. */
+	for (i = 0; i < 8; ++i) S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
+
+#undef G
+#undef ROUND
+}
+
+/*
+ * Absorbs a fixed 1024-byte message from `in`; `inlen` is never consulted.
+ * The first 124 input bytes are placed at buf[4], so the routine assumes
+ * that exactly 4 bytes are already buffered there (blake2b_long arranges
+ * this). After eight compressions the last 4 input bytes remain buffered.
+ * Always returns 0.
+ */
+int blake2b_update(blake2b_state *S, const void *in, size_t inlen)
+{
+	const uint8_t *cursor = (const uint8_t *)in;
+	int block;
+
+	(void)inlen;
+
+	/* First block: the 4 buffered bytes plus 124 fresh ones. */
+	memcpy(&S->buf[4], cursor, 124);
+	cursor += 124;
+	blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES);
+	blake2b_compress(S, S->buf);
+	S->buflen = 0;
+
+	/* Seven whole blocks are compressed straight from the input. */
+	for (block = 0; block < 7; ++block) {
+		blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES);
+		blake2b_compress(S, cursor);
+		cursor += BLAKE2B_BLOCKBYTES;
+	}
+
+	/* The final 4 input bytes wait in the buffer for blake2b_final. */
+	memcpy(&S->buf[0], cursor, 4);
+	S->buflen = 4;
+	return 0;
+}
+
+/*
+ * Appends inlen bytes from `in` to the pending block buffer and advances
+ * buflen. Nothing is compressed and no bounds check is made: the caller must
+ * ensure that buflen + inlen does not exceed the buffer size.
+ */
+void my_blake2b_update(blake2b_state *S, const void *in, size_t inlen)
+{
+	uint8_t *tail = S->buf + S->buflen;
+
+	memcpy(tail, in, inlen);
+	S->buflen = S->buflen + (unsigned int)inlen;
+}
+
+/*
+ * Finalises the hash: counts the buffered bytes, sets the last-block flag,
+ * zero-pads the buffer, runs the last compression and writes the digest.
+ *
+ * At most `outlen` bytes are written to `out`: the copy length is the
+ * smaller of S->outlen and outlen. Copying S->outlen bytes unconditionally
+ * would write past a caller buffer that is shorter than the state's digest
+ * length, which is what a caller announcing 32 bytes on a 64-byte state
+ * would otherwise get.
+ * The temporary digest, the block buffer and the chaining value are wiped
+ * before returning. Always returns 0.
+ */
+int blake2b_final(blake2b_state *S, void *out, size_t outlen)
+{
+	uint8_t buffer[BLAKE2B_OUTBYTES] = {0};
+	size_t copy_len;
+	unsigned int i;
+
+	blake2b_increment_counter(S, S->buflen);
+	blake2b_set_lastblock(S);
+	memset(&S->buf[S->buflen], 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */
+	blake2b_compress(S, S->buf);
+
+	for (i = 0; i < 8; ++i) { /* Output full hash to temp buffer */
+		store64(buffer + sizeof(S->h[i]) * i, S->h[i]);
+	}
+
+	/* Never copy more than the caller announced or the digest holds. */
+	copy_len = S->outlen;
+	if (outlen < copy_len) {
+		copy_len = outlen;
+	}
+	if (copy_len > sizeof(buffer)) {
+		copy_len = sizeof(buffer);
+	}
+	memcpy(out, buffer, copy_len);
+
+	burn(buffer, sizeof(buffer));
+	burn(S->buf, sizeof(S->buf));
+	burn(S->h, sizeof(S->h));
+	return 0;
+}
+
+/*
+ * One-shot hash of exactly 64 bytes read from `in`, using a state
+ * initialised for a 64-byte digest; the result is written to `out`.
+ * `key` and `keylen` are accepted but never used. Always returns 0.
+ */
+int blake2b(void *out, const void *in, const void *key, size_t keylen)
+{
+	blake2b_state state;
+
+	(void)key;
+	(void)keylen;
+
+	blake2b_init(&state, 64);
+	my_blake2b_update(&state, in, 64);
+	blake2b_final(&state, out, 64);
+	burn(&state, sizeof(state));
+	return 0;
+}
+
+/*
+ * Fixed-size chained hash: reads 72 bytes from `in` and writes 1024 bytes
+ * to `pout`.
+ * The first hash covers a 4-byte prefix (bytes 00 04 00 00, i.e. 1024 as a
+ * little-endian 32-bit value) followed by the 72 input bytes. Each further
+ * digest is the hash of the previous 64-byte digest. The first 30 digests
+ * contribute their first 32 bytes each; the last one contributes all 64.
+ */
+void blake2b_too(void *pout, const void *in)
+{
+	uint8_t *dst = (uint8_t *)pout;
+	uint8_t digest[64];
+	uint8_t previous[64];
+	blake2b_state first;
+	unsigned round;
+
+	blake2b_init(&first, 64);
+	/* Pre-load the length prefix straight into the block buffer. */
+	first.buf[1] = 4;
+	first.buflen = 4;
+	my_blake2b_update(&first, in, 72);
+	blake2b_final(&first, digest, 64);
+	memcpy(dst, digest, 32);
+	dst += 32;
+
+	for (round = 0; round < 29; ++round) {
+		memcpy(previous, digest, 64);
+		blake2b(digest, previous, NULL, 0);
+		memcpy(dst, digest, 32);
+		dst += 32;
+	}
+
+	/* Last link of the chain is emitted in full. */
+	memcpy(previous, digest, 64);
+	blake2b(digest, previous, NULL, 0);
+	memcpy(dst, digest, 64);
+
+	burn(&first, sizeof(first));
+}
+
+/* Argon2 Team - Begin Code */
+/*
+ * Fixed-size variant: hashes the 4-byte little-endian value 32 followed by
+ * 1024 bytes read from `in`, using a state initialised for a 32-byte digest.
+ * The result is written to `pout` by blake2b_final. Always returns 0.
+ */
+int blake2b_long(void *pout, const void *in)
+{
+	blake2b_state state;
+	uint8_t length_prefix[sizeof(uint32_t)] = {0};
+
+	store32(length_prefix, 32);
+
+	blake2b_init(&state, 32);
+	my_blake2b_update(&state, length_prefix, sizeof(length_prefix));
+	blake2b_update(&state, in, 1024);
+	blake2b_final(&state, pout, 32);
+	burn(&state, sizeof(state));
+	return 0;
+}
+/* Argon2 Team - End Code */
--- a/algo/argon2/ar2/cores.c
+++ b/algo/argon2/ar2/cores.c
@@ -0,0 +1,349 @@
+/*
+ * Argon2 source code package
+ *
+ * Written by Daniel Dinu and Dmitry Khovratovich, 2015
+ *
+ * This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
+ *
+ * You should have received a copy of the CC0 Public Domain Dedication along
+ * with
+ * this software. If not, see
+ * <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+/*For memory wiping*/
+#ifdef _MSC_VER
+#include <windows.h>
+#include <winbase.h> /* For SecureZeroMemory */
+#endif
+#if defined __STDC_LIB_EXT1__
+#define __STDC_WANT_LIB_EXT1__ 1
+#endif
+#define VC_GE_2005(version) (version >= 1400)
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "argon2.h"
+#include "cores.h"
+#include "blake2/blake2.h"
+#include "blake2/blake2-impl.h"
+
+#ifdef GENKAT
+#include "genkat.h"
+#endif
+
+#if defined(__clang__)
+#if __has_attribute(optnone)
+#define NOT_OPTIMIZED __attribute__((optnone))
+#endif
+#elif defined(__GNUC__)
+#define GCC_VERSION                                                            \
+    (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#if GCC_VERSION >= 40400
+#define NOT_OPTIMIZED __attribute__((optimize("O0")))
+#endif
+#endif
+#ifndef NOT_OPTIMIZED
+#define NOT_OPTIMIZED
+#endif
+
+/***************Instance and Position constructors**********/
+/* Set every byte of block @b to the value @in. */
+void init_block_value(block *b, uint8_t in) {
+    memset(b->v, in, sizeof(b->v));
+}
+//inline void init_block_value(block *b, uint8_t in) { memset(b->v, in, sizeof(b->v)); }
+
+/* Copy all words of block @src into block @dst. */
+void copy_block(block *dst, const block *src) {
+    memcpy(dst->v, src->v, sizeof(dst->v));
+}
+
+/* XOR every 64-bit word of @src into the matching word of @dst. */
+void xor_block(block *dst, const block *src) {
+    uint64_t *out_word = dst->v;
+    const uint64_t *in_word = src->v;
+    const uint64_t *const in_end = in_word + ARGON2_WORDS_IN_BLOCK;
+
+    while (in_word != in_end) {
+        *out_word++ ^= *in_word++;
+    }
+}
+
+/* Fill @dst by reading ARGON2_WORDS_IN_BLOCK consecutive 64-bit values from
+ * @input with load64(). */
+static void load_block(block *dst, const void *input) {
+    const uint8_t *cursor = (const uint8_t *)input;
+    unsigned word;
+
+    for (word = 0; word < ARGON2_WORDS_IN_BLOCK; ++word) {
+        dst->v[word] = load64(cursor);
+        cursor += sizeof(dst->v[word]);
+    }
+}
+
+/* Write the ARGON2_WORDS_IN_BLOCK 64-bit words of @src to @output, one after
+ * another, with store64(). */
+static void store_block(void *output, const block *src) {
+    uint8_t *cursor = (uint8_t *)output;
+    unsigned word;
+
+    for (word = 0; word < ARGON2_WORDS_IN_BLOCK; ++word) {
+        store64(cursor, src->v[word]);
+        cursor += sizeof(src->v[word]);
+    }
+}
+
+/***************Memory allocators*****************/
+/*
+ * Allocates @m_cost blocks with malloc() and stores the pointer in *@memory.
+ * Returns ARGON2_OK on success, or ARGON2_MEMORY_ALLOCATION_ERROR when
+ * @memory is NULL, the byte count overflows size_t, or malloc() returns NULL.
+ */
+int allocate_memory(block **memory, uint32_t m_cost) {
+    size_t bytes;
+
+    if (memory == NULL) {
+        return ARGON2_MEMORY_ALLOCATION_ERROR;
+    }
+
+    bytes = sizeof(block) * m_cost;
+
+    /* Dividing back detects a wrapped multiplication. */
+    if (m_cost != 0 && bytes / m_cost != sizeof(block)) {
+        return ARGON2_MEMORY_ALLOCATION_ERROR;
+    }
+
+    *memory = (block *)malloc(bytes);
+
+    return (*memory != NULL) ? ARGON2_OK : ARGON2_MEMORY_ALLOCATION_ERROR;
+}
+
+/*
+ * Overwrites @n bytes starting at @v with zeros.
+ *
+ * The call goes through a volatile function pointer so the compiler cannot
+ * prove the target is memset() and drop the write as a dead store when the
+ * buffer is not read again (which is exactly how this function is used on
+ * stack buffers that are about to go out of scope).
+ *
+ * @param v Pointer to the memory to wipe
+ * @param n Number of bytes to wipe
+ */
+void secure_wipe_memory(void *v, size_t n) {
+    static void *(*const volatile memset_sec)(void *, int, size_t) = &memset;
+    memset_sec(v, 0, n);
+}
+//inline void secure_wipe_memory(void *v, size_t n) { memset(v, 0, n); }
+
+/*********Memory functions*/
+
+/*
+ * Wipes the first 16 blocks of @instance->memory when @clear is non-zero and
+ * the memory pointer is set; otherwise does nothing.
+ */
+void clear_memory(argon2_instance_t *instance, int clear) {
+    if (instance->memory == NULL || !clear) {
+        return;
+    }
+
+    secure_wipe_memory(instance->memory, sizeof(block) * 16);
+}
+
+/* Release block memory with free(). */
+void free_memory(block *memory) {
+    free(memory);
+}
+//inline void free_memory(block *memory) { free(memory); }
+
+/*
+ * Produces the tag from memory block 15 and releases the instance memory.
+ * Block 15 is serialized, hashed into @context->out with blake2b_long(), and
+ * both temporary copies are wiped. Does nothing when either pointer is NULL.
+ * The working memory is freed without being wiped first.
+ */
+void finalize(const argon2_context *context, argon2_instance_t *instance) {
+    block last_block;
+    uint8_t last_block_bytes[ARGON2_BLOCK_SIZE];
+
+    if (context == NULL || instance == NULL) {
+        return;
+    }
+
+    /* Hash the result */
+    copy_block(&last_block, instance->memory + 15);
+    store_block(last_block_bytes, &last_block);
+    blake2b_long(context->out, last_block_bytes);
+
+    /* Drop the temporary copies of the final block */
+    secure_wipe_memory(last_block.v, ARGON2_BLOCK_SIZE);
+    secure_wipe_memory(last_block_bytes, ARGON2_BLOCK_SIZE);
+
+#ifdef GENKAT
+    print_tag(context->out, context->outlen);
+#endif
+
+    free_memory(instance->memory);
+}
+
+/*
+ * Maps a 32-bit pseudo-random value to the index (0..15) of the block that the
+ * block at @position will reference. The segment length (4) and the lane
+ * length (16) are hard-coded here; @instance is not read.
+ * @param instance Unused
+ * @param position Pass / slice / index of the block being built
+ * @param pseudo_rand 32-bit value steering the choice
+ * @param same_lane Non-zero to count the blocks already built in the current
+ * segment as candidates
+ * @return Block index, reduced modulo 16
+ */
+uint32_t index_alpha(const argon2_instance_t *instance,
+                     const argon2_position_t *position, uint32_t pseudo_rand,
+                     int same_lane) {
+    /*
+     * Pass 0:
+     *      This lane : all already finished segments plus already constructed
+     * blocks in this segment
+     *      Other lanes : all already finished segments
+     * Pass 1+:
+     *      This lane : (SYNC_POINTS - 1) last segments plus already constructed
+     * blocks in this segment
+     *      Other lanes : (SYNC_POINTS - 1) last segments
+     */
+    uint32_t reference_area_size;
+    uint64_t relative_position;
+    uint32_t start_position, absolute_position;
+
+    if (0 == position->pass) {
+        /* First pass */
+        if (0 == position->slice) {
+            /* First slice */
+            /* Unsigned arithmetic: an index of 0 would wrap around here. */
+            reference_area_size =
+                position->index - 1; /* all but the previous */
+        } else {
+            if (same_lane) {
+                /* The same lane => add current segment */
+                reference_area_size =
+                    position->slice * 4 +
+                    position->index - 1;
+            } else {
+                /* Other lane: finished slices only, one less at index 0 */
+                reference_area_size =
+                    position->slice * 4 +
+                    ((position->index == 0) ? (-1) : 0);
+            }
+        }
+    } else {
+        /* Second pass */
+        /* 12 = three slices of 4 blocks; same lane adds index - 1 on top */
+        if (same_lane) {reference_area_size = 11 + position->index;}
+        else {reference_area_size = 12 - (position->index == 0);}
+    }
+
+    /* 1.2.4. Mapping pseudo_rand to 0..<reference_area_size-1> and produce
+     * relative position */
+    /* Square the value in 64 bits and keep the high 32 bits, then scale the
+     * result into the reference area, counting back from its last entry. */
+    relative_position = pseudo_rand;
+    relative_position = relative_position * relative_position >> 32;
+    relative_position = reference_area_size - 1 -
+                        (reference_area_size * relative_position >> 32);
+
+    /* 1.2.5 Computing starting position */
+    start_position = 0;
+
+    /* From the second pass on, the area starts at the slice after the
+     * current one (or at 0 when the current slice is the last one). */
+    if (0 != position->pass) {
+        start_position = (position->slice == ARGON2_SYNC_POINTS - 1)
+                             ? 0 : (position->slice + 1) * 4;
+    }
+
+    /* 1.2.6. Computing absolute position */
+    absolute_position = (start_position + relative_position) % 16;
+    return absolute_position;
+}
+
+/*
+ * Runs two passes over the memory; each pass calls fill_segment() once per
+ * slice (ARGON2_SYNC_POINTS slices), always for lane 0 and starting at
+ * index 0.
+ */
+void fill_memory_blocks(argon2_instance_t *instance) {
+    uint32_t pass;
+
+    for (pass = 0; pass < 2; ++pass) {
+        uint32_t slice;
+
+        for (slice = 0; slice < ARGON2_SYNC_POINTS; ++slice) {
+            argon2_position_t where;
+
+            where.pass = pass;
+            where.lane = 0;
+            where.slice = (uint8_t)slice;
+            where.index = 0;
+            fill_segment(instance, where);
+        }
+
+#ifdef GENKAT
+        internal_kat(instance, pass); /* Print all memory blocks */
+#endif
+    }
+}
+
+/*
+ * Builds memory blocks 0 and 1. For each of them the two 32-bit fields that
+ * follow the digest inside @blockhash are set to (block number, 0), the
+ * buffer is expanded with blake2b_too(), and the result is loaded into
+ * instance->memory. @blockhash is modified; the temporary bytes are wiped.
+ */
+void fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance) {
+    uint8_t expanded[ARGON2_BLOCK_SIZE];
+    uint32_t block_no;
+
+    /* Second field stays 0 for both blocks */
+    store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH + 4, 0);
+
+    for (block_no = 0; block_no < 2; ++block_no) {
+        store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, block_no);
+        blake2b_too(expanded, blockhash);
+        load_block(&instance->memory[block_no], expanded);
+    }
+
+    secure_wipe_memory(expanded, ARGON2_BLOCK_SIZE);
+}
+
+
+/*
+ * Template BLAKE2b state that initial_hash() copies as its starting point.
+ * The buffer already holds 28 bytes (buflen = 28): read as little-endian
+ * 32-bit words they are 1, 32, 16, 2, 16, 1, 32. initial_hash() overwrites
+ * buf[20] with the Argon2 type before absorbing any further input.
+ * NOTE(review): the seven words appear to be lanes, tag length, memory
+ * blocks, passes, version (0x10), type and password length, and .h appears to
+ * be the BLAKE2b initial chaining value for a 64-byte digest - confirm
+ * against the Argon2 and BLAKE2 specifications.
+ */
+static const blake2b_state base_hash = {
+ .h = {
+  UINT64_C(7640891576939301192), UINT64_C(13503953896175478587),
+  UINT64_C(4354685564936845355), UINT64_C(11912009170470909681),
+  UINT64_C(5840696475078001361), UINT64_C(11170449401992604703),
+  UINT64_C(2270897969802886507), UINT64_C(6620516959819538809)
+ },
+ .t = {UINT64_C(0),UINT64_C(0)},
+ .f = {UINT64_C(0),UINT64_C(0)},
+ /* 128 bytes in total; only the first 28 are non-padding (see buflen) */
+ .buf = {
+  1, 0, 0, 0, 32, 0, 0, 0, 16, 0, 0, 0, 2, 0, 0, 0, 16, 0, 0, 0, 1, 0,
+  0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ .buflen = 28,
+ .outlen = 64,
+ .last_node = 0
+};
+
+#define PWDLEN 32
+#define SALTLEN 32
+#define SECRETLEN 0
+#define ADLEN 0
+/*
+ * Computes the pre-hashing digest into @blockhash
+ * (ARGON2_PREHASH_DIGEST_LENGTH bytes). Starts from a copy of base_hash with
+ * the type byte patched in, then absorbs: PWDLEN password bytes, the salt
+ * length, SALTLEN salt bytes, the secret length and the associated-data
+ * length (both lengths are 0 and no data follows them).
+ * Side effects: wipes PWDLEN bytes at context->pwd and sets context->pwdlen
+ * to 0.
+ */
+void initial_hash(uint8_t *blockhash, argon2_context *context,
+                  argon2_type type) {
+    uint8_t len_field[sizeof(uint32_t)];
+    /* Per-call copy of the shared template state */
+    blake2b_state state = base_hash;
+
+    state.buf[20] = (uint8_t)type;
+
+    /* Password bytes (their length is already part of the template) */
+    my_blake2b_update(&state, (const uint8_t *)context->pwd, PWDLEN);
+    secure_wipe_memory(context->pwd, PWDLEN);
+    context->pwdlen = 0;
+
+    /* Salt: length, then bytes */
+    store32(len_field, SALTLEN);
+    my_blake2b_update(&state, (const uint8_t *)len_field, sizeof(len_field));
+    my_blake2b_update(&state, (const uint8_t *)context->salt, SALTLEN);
+
+    /* Secret: length only */
+    store32(len_field, SECRETLEN);
+    my_blake2b_update(&state, (const uint8_t *)len_field, sizeof(len_field));
+
+    /* Associated data: length only */
+    store32(len_field, ADLEN);
+    my_blake2b_update(&state, (const uint8_t *)len_field, sizeof(len_field));
+
+    blake2b_final(&state, blockhash, ARGON2_PREHASH_DIGEST_LENGTH);
+}
+
+/*
+ * Allocates the 16-block working memory, computes the pre-hashing digest and
+ * fills the first two blocks.
+ * @param instance Instance whose memory pointer is set by this call
+ * @param context Hash inputs; see initial_hash() for the fields it modifies
+ * @return ARGON2_OK on success, otherwise the error returned by
+ * allocate_memory(); in that case nothing else has been touched
+ */
+int initialize(argon2_instance_t *instance, argon2_context *context) {
+    uint8_t blockhash[ARGON2_PREHASH_SEED_LENGTH];
+    int result;
+
+    /* 1. Memory allocation */
+    /* Bail out on failure: the following steps write through
+     * instance->memory and would dereference NULL. */
+    result = allocate_memory(&(instance->memory), 16);
+    if (ARGON2_OK != result) {
+        return result;
+    }
+
+    /* 2. Initial hashing */
+    /* H_0 + 8 extra bytes to produce the first blocks */
+    /* Hashing all inputs */
+    initial_hash(blockhash, context, instance->type);
+    /* Zeroing 8 extra bytes */
+    secure_wipe_memory(blockhash + ARGON2_PREHASH_DIGEST_LENGTH,
+                       ARGON2_PREHASH_SEED_LENGTH -
+                           ARGON2_PREHASH_DIGEST_LENGTH);
+
+#ifdef GENKAT
+    initial_kat(blockhash, context, instance->type);
+#endif
+
+    /* 3. Creating first blocks, we always have at least two blocks in a slice
+     */
+    fill_first_blocks(blockhash, instance);
+    /* Clearing the hash */
+    secure_wipe_memory(blockhash, ARGON2_PREHASH_SEED_LENGTH);
+
+    return ARGON2_OK;
+}
+
+/*
+ * Runs one complete hash: initialize(), fill_memory_blocks(), finalize().
+ * @param context Hash inputs and output buffer
+ * @param type Argon2 variant stored in the instance
+ * @return The error from initialize() if it fails, ARGON2_OK otherwise
+ */
+int argon2_core(argon2_context *context, argon2_type type) {
+    argon2_instance_t instance;
+    int status;
+
+    instance.memory = NULL;
+    instance.type = type;
+
+    /* Hash the inputs, allocate memory, build the first blocks */
+    status = initialize(&instance, context);
+    if (status != ARGON2_OK) {
+        return status;
+    }
+
+    /* Fill the memory */
+    fill_memory_blocks(&instance);
+
+    /* Produce the tag and release the memory */
+    finalize(context, &instance);
+
+    return ARGON2_OK;
+}
--- a/algo/argon2/ar2/cores.h
+++ b/algo/argon2/ar2/cores.h
@@ -0,0 +1,216 @@
+/*
+ * Argon2 source code package
+ *
+ * Written by Daniel Dinu and Dmitry Khovratovich, 2015
+ *
+ * This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
+ *
+ * You should have received a copy of the CC0 Public Domain Dedication along
+ * with
+ * this software. If not, see
+ * <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+#ifndef ARGON2_CORES_H
+#define ARGON2_CORES_H
+
+/*
+ * ALIGN(n): compiler-specific alignment request, empty on unknown compilers.
+ * Clang identifies itself with __clang__ (trailing underscores included).
+ */
+#if defined(_MSC_VER)
+#include <Windows.h>
+#include <process.h>
+#define ALIGN(n) __declspec(align(n))
+#elif defined(__GNUC__) || defined(__clang__)
+#define ALIGN(x) __attribute__((__aligned__(x)))
+#else
+#define ALIGN(x)
+#endif
+
+/*************************Argon2 internal
+ * constants**************************************************/
+
+/* Compile-time sizes shared by the core, the block filler and the KAT code. */
+enum argon2_core_constants {
+    /* Version of the algorithm */
+    ARGON2_VERSION_NUMBER = 0x10,
+
+    /* Memory block size in bytes */
+    ARGON2_BLOCK_SIZE = 1024,
+    /* Number of 64-bit words in a block (length of block.v) */
+    ARGON2_WORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 8,
+    /* Number of 128-bit words in a block (1024 / 16); loop bound used with
+       __m128i data in fill_block() */
+    ARGON2_QWORDS_IN_BLOCK = 64,
+
+    /* Number of pseudo-random values generated by one call to Blake in Argon2i
+       to
+       generate reference block positions */
+    ARGON2_ADDRESSES_IN_BLOCK = 128,
+
+    /* Pre-hashing digest length and its extension*/
+    /* The seed is the digest plus 8 extra bytes (two 32-bit fields) */
+    ARGON2_PREHASH_DIGEST_LENGTH = 64,
+    ARGON2_PREHASH_SEED_LENGTH = 72
+};
+
+/*
+ * Argon2 primitive type. fill_segment() takes its pseudo-random values from
+ * generate_addresses() for Argon2_i and from the previous block otherwise.
+ */
+typedef enum Argon2_type {
+    Argon2_d = 0,
+    Argon2_i = 1
+} argon2_type;
+
+/*************************Argon2 internal data
+ * types**************************************************/
+
+/*
+ * Structure for the (1KB) memory block implemented as 128 64-bit words.
+ * Memory blocks can be copied, XORed. Internal words can be accessed by [] (no
+ * bounds checking).
+ * The type requests 16-byte alignment through ALIGN(16); ALIGN expands to
+ * nothing on compilers it does not recognise.
+ */
+typedef struct _block { uint64_t v[ARGON2_WORDS_IN_BLOCK]; } ALIGN(16) block;
+
+/*****************Functions that work with the block******************/
+
+/* Initialize each byte of the block with @in */
+void init_block_value(block *b, uint8_t in);
+
+/* Copy block @src to block @dst */
+void copy_block(block *dst, const block *src);
+
+/* XOR @src onto @dst bytewise */
+void xor_block(block *dst, const block *src);
+
+/*
+ * Argon2 instance: the working-memory pointer and the Argon2 type.
+ * Created in argon2_core() and passed to the initialize / fill / finalize
+ * steps.
+ */
+typedef struct Argon2_instance_t {
+    block *memory;          /* Memory pointer */
+    argon2_type type;       /* Argon2_d or Argon2_i */
+    int print_internals; /* whether to print the memory blocks */
+} argon2_instance_t;
+
+/*
+ * Argon2 position: where we construct the block right now. Used to distribute
+ * work between threads.
+ */
+typedef struct Argon2_position_t {
+    uint32_t pass;  /* pass number, starting at 0 */
+    uint32_t lane;  /* lane number */
+    uint8_t slice;  /* slice within the pass */
+    uint32_t index; /* block index within the segment */
+} argon2_position_t;
+
+/*************************Argon2 core
+ * functions**************************************************/
+
+/* Allocates memory to the given pointer
+ * @param memory pointer to the pointer to the memory
+ * @param m_cost number of blocks to allocate in the memory
+ * @return ARGON2_OK if @memory is a valid pointer and memory is allocated
+ */
+int allocate_memory(block **memory, uint32_t m_cost);
+
+/* Function that securely cleans the memory
+ * @param v Pointer to the memory
+ * @param n Memory size in bytes
+ */
+void secure_wipe_memory(void *v, size_t n);
+
+/* Clears memory
+ * @param instance pointer to the current instance
+ * @param clear_memory indicates if we clear the memory with zeros.
+ */
+void clear_memory(argon2_instance_t *instance, int clear);
+
+/* Deallocates memory
+ * @param memory pointer to the blocks
+ */
+void free_memory(block *memory);
+
+/*
+ * Computes absolute position of reference block in the lane following a skewed
+ * distribution and using a pseudo-random value as input
+ * @param instance Pointer to the current instance
+ * @param position Pointer to the current position
+ * @param pseudo_rand 32-bit pseudo-random value used to determine the position
+ * @param same_lane Indicates if the block will be taken from the current lane.
+ * If so we can reference the current segment
+ * @pre All pointers must be valid
+ */
+uint32_t index_alpha(const argon2_instance_t *instance,
+                     const argon2_position_t *position, uint32_t pseudo_rand,
+                     int same_lane);
+
+/*
+ * Function that validates all inputs against predefined restrictions and return
+ * an error code
+ * @param context Pointer to current Argon2 context
+ * @return ARGON2_OK if everything is all right, otherwise one of error codes
+ * (all defined in <argon2.h>
+ */
+int validate_inputs(const argon2_context *context);
+
+/*
+ * Hashes all the inputs into @a blockhash[PREHASH_DIGEST_LENGTH], clears
+ * password and secret if needed
+ * @param  context  Pointer to the Argon2 internal structure containing memory
+ * pointer, and parameters for time and space requirements.
+ * @param  blockhash Buffer for pre-hashing digest
+ * @param  type Argon2 type
+ * @pre    @a blockhash must have at least @a PREHASH_DIGEST_LENGTH bytes
+ * allocated
+ */
+void initial_hash(uint8_t *blockhash, argon2_context *context,
+                  argon2_type type);
+
+/*
+ * Function creates the first 2 blocks of the memory
+ * @param blockhash Pointer to the pre-hashing digest; the 8 bytes following
+ * the digest are overwritten
+ * @param instance Pointer to the current instance
+ * @pre blockhash must point to @a PREHASH_SEED_LENGTH allocated values
+ * @pre instance->memory must point to at least two allocated blocks
+ */
+void fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance);
+
+/*
+ * Function allocates memory, hashes the inputs with Blake,  and creates first
+ * two blocks. Returns the pointer to the main memory with 2 blocks per lane
+ * initialized
+ * @param  context  Pointer to the Argon2 internal structure containing memory
+ * pointer, and parameters for time and space requirements.
+ * @param  instance Current Argon2 instance
+ * @return Zero if successful, -1 if memory failed to allocate. @context->state
+ * will be modified if successful.
+ */
+int initialize(argon2_instance_t *instance, argon2_context *context);
+
+/*
+ * Hashes the last memory block (index 15) into the tag. Deallocates
+ * the memory.
+ * @param context Pointer to current Argon2 context (use only the out parameters
+ * from it)
+ * @param instance Pointer to current instance of Argon2
+ * @pre instance->memory must point to the 16 allocated blocks
+ * @pre context->out must point to at least 32 bytes of memory
+ */
+void finalize(const argon2_context *context, argon2_instance_t *instance);
+
+/*
+ * Function that fills the segment using previous segments also from other
+ * threads
+ * @param instance Pointer to the current instance
+ * @param position Current position
+ * @pre all block pointers must be valid
+ */
+void fill_segment(const argon2_instance_t *instance,
+                  argon2_position_t position);
+
+/*
+ * Function that fills the entire memory t_cost times based on the first two
+ * blocks in each lane
+ * @param instance Pointer to the current instance
+ */
+void fill_memory_blocks(argon2_instance_t *instance);
+
+/*
+ * Function that performs memory-hard hashing with certain degree of parallelism
+ * @param  context  Pointer to the Argon2 internal structure
+ * @return Error code if smth is wrong, ARGON2_OK otherwise
+ */
+int argon2_core(argon2_context *context, argon2_type type);
+
+#endif
--- a/algo/argon2/ar2/genkat.c
+++ b/algo/argon2/ar2/genkat.c
@@ -0,0 +1,186 @@
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "argon2.h"
+#include "cores.h"
+
+/* Print @len bytes of @data as two-digit hex values separated by spaces,
+ * followed by a newline. */
+static void print_hex_bytes(const void *data, unsigned len)
+{
+    const unsigned char *bytes = (const unsigned char *)data;
+    unsigned i;
+
+    for (i = 0; i < len; ++i) {
+        printf("%2.2x ", bytes[i]);
+    }
+
+    printf("\n");
+}
+
+/*
+ * Prints the hash inputs and the pre-hashing digest to stdout: type, cost
+ * parameters, password, salt, secret, associated data and digest. Password
+ * and secret are shown as "CLEARED" when the matching ARGON2_FLAG_CLEAR_*
+ * flag is set. Does nothing when either pointer is NULL.
+ * @param blockhash Pre-hashing digest (ARGON2_PREHASH_DIGEST_LENGTH bytes)
+ * @param context Inputs to print
+ * @param type Argon2 type
+ */
+void initial_kat(const uint8_t *blockhash, const argon2_context *context,
+                 argon2_type type)
+{
+    if (blockhash != NULL && context != NULL) {
+        printf("=======================================");
+
+        switch (type) {
+        case Argon2_d:
+            printf("Argon2d\n");
+            break;
+
+        case Argon2_i:
+            printf("Argon2i\n");
+            break;
+
+        default:
+            break;
+        }
+
+        printf("Memory: %u KiB, Iterations: %u, Parallelism: %u lanes, Tag "
+               "length: %u bytes\n",
+               context->m_cost, context->t_cost, context->lanes,
+               context->outlen);
+
+        printf("Password[%u]: ", context->pwdlen);
+
+        if (context->flags & ARGON2_FLAG_CLEAR_PASSWORD) {
+            printf("CLEARED\n");
+        } else {
+            print_hex_bytes(context->pwd, context->pwdlen);
+        }
+
+        printf("Salt[%u]: ", context->saltlen);
+        print_hex_bytes(context->salt, context->saltlen);
+
+        printf("Secret[%u]: ", context->secretlen);
+
+        if (context->flags & ARGON2_FLAG_CLEAR_SECRET) {
+            printf("CLEARED\n");
+        } else {
+            print_hex_bytes(context->secret, context->secretlen);
+        }
+
+        printf("Associated data[%u]: ", context->adlen);
+        print_hex_bytes(context->ad, context->adlen);
+
+        printf("Pre-hashing digest: ");
+        print_hex_bytes(blockhash, ARGON2_PREHASH_DIGEST_LENGTH);
+    }
+}
+
+/*
+ * Prints "Tag: " followed by @outlen bytes of @out in hex and a newline.
+ * Does nothing when @out is NULL.
+ */
+void print_tag(const void *out, uint32_t outlen)
+{
+    const uint8_t *bytes = (const uint8_t *)out;
+    unsigned pos;
+
+    if (bytes == NULL) {
+        return;
+    }
+
+    printf("Tag: ");
+
+    for (pos = 0; pos < outlen; ++pos) {
+        printf("%2.2x ", bytes[pos]);
+    }
+
+    printf("\n");
+}
+
+/*
+ * Prints the contents of the working memory after pass @pass. Every word of
+ * every block is printed unless there are more blocks than words per block,
+ * in which case only the first word of each block is shown.
+ * Does nothing when @instance is NULL.
+ * @param instance Instance whose memory is dumped
+ * @param pass Pass number shown in the heading
+ */
+void internal_kat(const argon2_instance_t *instance, uint32_t pass)
+{
+    /* argon2_instance_t carries no block count; initialize() always
+     * allocates 16 blocks. */
+    const uint32_t memory_blocks = 16;
+
+    if (instance != NULL) {
+        uint32_t i, j;
+        const uint32_t how_many_words =
+            (memory_blocks > ARGON2_WORDS_IN_BLOCK) ? 1
+                                                    : ARGON2_WORDS_IN_BLOCK;
+
+        printf("\n After pass %u:\n", pass);
+
+        for (i = 0; i < memory_blocks; ++i) {
+            for (j = 0; j < how_many_words; ++j)
+                printf("Block %.4u [%3u]: %016" PRIx64 "\n", i, j,
+                       instance->memory[i].v[j]);
+        }
+    }
+}
+
+static void fatal(const char *error) {
+    fprintf(stderr, "Error: %s\n", error);
+    exit(1);
+}
+
+/*
+ * Builds a fixed test context (password bytes 1, salt bytes 2, secret bytes
+ * 3, associated-data bytes 4) and runs argon2d() or argon2i() on it.
+ * @param type "d" or "i"; any other string ends the program through fatal()
+ */
+static void generate_testvectors(const char *type)
+{
+#define TEST_OUTLEN 32
+#define TEST_PWDLEN 32
+#define TEST_SALTLEN 16
+#define TEST_SECRETLEN 8
+#define TEST_ADLEN 12
+    argon2_context context;
+
+    unsigned char out[TEST_OUTLEN];
+    unsigned char pwd[TEST_PWDLEN];
+    unsigned char salt[TEST_SALTLEN];
+    unsigned char secret[TEST_SECRETLEN];
+    unsigned char ad[TEST_ADLEN];
+    const allocate_fptr myown_allocator = NULL;
+    const deallocate_fptr myown_deallocator = NULL;
+
+    unsigned t_cost = 3;
+    unsigned m_cost = 16;
+    unsigned lanes = 4;
+
+    /* Each buffer is filled using its own length constant */
+    memset(pwd, 1, TEST_PWDLEN);
+    memset(salt, 2, TEST_SALTLEN);
+    memset(secret, 3, TEST_SECRETLEN);
+    memset(ad, 4, TEST_ADLEN);
+
+    context.out = out;
+    context.outlen = TEST_OUTLEN;
+    context.pwd = pwd;
+    context.pwdlen = TEST_PWDLEN;
+    context.salt = salt;
+    context.saltlen = TEST_SALTLEN;
+    context.secret = secret;
+    context.secretlen = TEST_SECRETLEN;
+    context.ad = ad;
+    context.adlen = TEST_ADLEN;
+    context.t_cost = t_cost;
+    context.m_cost = m_cost;
+    context.lanes = lanes;
+    context.threads = lanes;
+    context.allocate_cbk = myown_allocator;
+    context.free_cbk = myown_deallocator;
+    context.flags = 0;
+
+#undef TEST_OUTLEN
+#undef TEST_PWDLEN
+#undef TEST_SALTLEN
+#undef TEST_SECRETLEN
+#undef TEST_ADLEN
+
+    if (!strcmp(type, "d")) {
+        argon2d(&context);
+    } else if (!strcmp(type, "i")) {
+        argon2i(&context);
+    } else
+        fatal("wrong Argon2 type");
+}
+
+/* Entry point: the first argument selects the type and defaults to "i". */
+int main(int argc, char *argv[])
+{
+    const char *type = "i";
+
+    if (argc > 1) {
+        type = argv[1];
+    }
+
+    generate_testvectors(type);
+    return ARGON2_OK;
+}
--- a/algo/argon2/ar2/genkat.h
+++ b/algo/argon2/ar2/genkat.h
@@ -0,0 +1,45 @@
+/*
+ * Argon2 source code package
+ *
+ * Written by Daniel Dinu and Dmitry Khovratovich, 2015
+ *
+ * This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
+ *
+ * You should have received a copy of the CC0 Public Domain Dedication along
+ * with
+ * this software. If not, see
+ * <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+#ifndef ARGON2_KAT_H
+#define ARGON2_KAT_H
+
+/*
+ * Initial KAT function that prints the inputs to the file
+ * @param  blockhash  Array that contains pre-hashing digest
+ * @param  context Holds inputs
+ * @param  type Argon2 type
+ * @pre blockhash must point to INPUT_INITIAL_HASH_LENGTH bytes
+ * @pre context member pointers must point to allocated memory of size according
+ * to the length values
+ */
+void initial_kat(const uint8_t *blockhash, const argon2_context *context,
+                 argon2_type type);
+
+/*
+ * Function that prints the output tag
+ * @param  out  output array pointer
+ * @param  outlen digest length
+ * @pre out must point to @a outlen bytes
+ **/
+void print_tag(const void *out, uint32_t outlen);
+
+/*
+ * Function that prints the internal state at given moment
+ * @param  instance pointer to the current instance
+ * @param  pass current pass number
+ * @pre instance must have necessary memory allocated
+ **/
+void internal_kat(const argon2_instance_t *instance, uint32_t pass);
+
+#endif
--- a/algo/argon2/ar2/opt.c
+++ b/algo/argon2/ar2/opt.c
@@ -0,0 +1,185 @@
+/*
+ * Argon2 source code package
+ *
+ * Written by Daniel Dinu and Dmitry Khovratovich, 2015
+ *
+ * This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
+ *
+ * You should have received a copy of the CC0 Public Domain Dedication along
+ * with
+ * this software. If not, see
+ * <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <inttypes.h>
+
+#include <immintrin.h>
+
+#include "argon2.h"
+#include "cores.h"
+#include "opt.h"
+
+#include "blake2/blake2.h"
+#include "blake2/blamka-round-opt.h"
+
+/*
+ * Builds one new memory block.
+ * XORs @ref_block into @state, applies BLAKE2_ROUND to the eight groups of
+ * consecutive words and then to the eight strided groups, XORs the pre-round
+ * value back in and writes the result to both @state and @next_block.
+ * @param state 64 x 128-bit working state (previous block); updated in place
+ * @param ref_block Reference block, 64 x 128-bit words; no alignment required
+ * @param next_block Destination block, 64 x 128-bit words; no alignment
+ * required
+ */
+void fill_block(__m128i *state, __m128i const *ref_block, __m128i *next_block)
+{
+    __m128i ALIGN(16) block_XY[ARGON2_QWORDS_IN_BLOCK];
+    uint32_t i;
+    for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; i++) {
+        /* Unaligned load: the block memory comes from malloc(), which is not
+         * guaranteed to be 16-byte aligned on every target. This matches the
+         * unaligned store used for next_block below. */
+        block_XY[i] = state[i] = _mm_xor_si128(
+            state[i], _mm_loadu_si128(&ref_block[i]));
+    }
+
+    /* Groups of eight consecutive words: state[8 * i + 0 .. 8 * i + 7] */
+    BLAKE2_ROUND(state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7]);
+    BLAKE2_ROUND(state[8], state[9], state[10], state[11], state[12], state[13], state[14], state[15]);
+    BLAKE2_ROUND(state[16], state[17], state[18], state[19], state[20], state[21], state[22], state[23]);
+    BLAKE2_ROUND(state[24], state[25], state[26], state[27], state[28], state[29], state[30], state[31]);
+    BLAKE2_ROUND(state[32], state[33], state[34], state[35], state[36], state[37], state[38], state[39]);
+    BLAKE2_ROUND(state[40], state[41], state[42], state[43], state[44], state[45], state[46], state[47]);
+    BLAKE2_ROUND(state[48], state[49], state[50], state[51], state[52], state[53], state[54], state[55]);
+    BLAKE2_ROUND(state[56], state[57], state[58], state[59], state[60], state[61], state[62], state[63]);
+
+    /* Groups with stride eight: state[8 * 0 + i], ..., state[8 * 7 + i] */
+    BLAKE2_ROUND(state[0], state[8], state[16], state[24], state[32], state[40], state[48], state[56]);
+    BLAKE2_ROUND(state[1], state[9], state[17], state[25], state[33], state[41], state[49], state[57]);
+    BLAKE2_ROUND(state[2], state[10], state[18], state[26], state[34], state[42], state[50], state[58]);
+    BLAKE2_ROUND(state[3], state[11], state[19], state[27], state[35], state[43], state[51], state[59]);
+    BLAKE2_ROUND(state[4], state[12], state[20], state[28], state[36], state[44], state[52], state[60]);
+    BLAKE2_ROUND(state[5], state[13], state[21], state[29], state[37], state[45], state[53], state[61]);
+    BLAKE2_ROUND(state[6], state[14], state[22], state[30], state[38], state[46], state[54], state[62]);
+    BLAKE2_ROUND(state[7], state[15], state[23], state[31], state[39], state[47], state[55], state[63]);
+
+    /* Feed the pre-round value forward and emit the new block */
+    for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; i++) {
+        state[i] = _mm_xor_si128(state[i], block_XY[i]);
+        _mm_storeu_si128(&next_block[i], state[i]);
+    }
+}
+
+/*
+ * Fixed table of 64-bit values handed out by generate_addresses(). The four
+ * entries for a segment start at index pass * 16 + slice * 4, so each row
+ * pair below (four values) belongs to one (pass, slice) combination.
+ * NOTE(review): presumably these are the precomputed Argon2i address values
+ * for the fixed parameters used by this build - confirm how they were
+ * generated.
+ */
+static const uint64_t bad_rands[32] = {
+    UINT64_C(17023632018251376180), UINT64_C(4911461131397773491),
+    UINT64_C(15927076453364631751), UINT64_C(7860239898779391109),
+
+    UINT64_C(11820267568857244377), UINT64_C(12188179869468676617),
+    UINT64_C(3732913385414474778),  UINT64_C(7651458777762572084),
+
+    UINT64_C(3062274162574341415),  UINT64_C(17922653540258786897),
+    UINT64_C(17393848266100524980), UINT64_C(8539695715554563839),
+
+    UINT64_C(13824538050656654359), UINT64_C(12078939433126460936),
+    UINT64_C(15331979418564540430), UINT64_C(12058346794217174273),
+
+    UINT64_C(13593922096015221049), UINT64_C(18356682276374416500),
+    UINT64_C(4968040514092703824),  UINT64_C(11202790346130235567),
+
+    UINT64_C(2276229735041314644), UINT64_C(220837743321691382),
+    UINT64_C(4861211596230784273), UINT64_C(6330592584132590331),
+
+    UINT64_C(3515580430960296763), UINT64_C(9869356316971855173),
+    UINT64_C(485533243489193056),  UINT64_C(14596447761048148032),
+
+    UINT64_C(16531790085730132900), UINT64_C(17328824500878824371),
+    UINT64_C(8548260058287621283),  UINT64_C(8641748798041936364)
+};
+
+/*
+ * Copies the four table entries for the segment at @position into
+ * @pseudo_rands[0..3]. The entries start at index
+ * pass * 16 + slice * 4 of bad_rands. @instance is not read.
+ */
+void generate_addresses(const argon2_instance_t *instance,
+                        const argon2_position_t *position,
+                        uint64_t *pseudo_rands)
+{
+    const uint8_t first = position->pass * 16 + position->slice * 4;
+    unsigned k;
+
+    (void)instance;
+
+    for (k = 0; k < 4; ++k) {
+        /* The index is kept in 8 bits, as a uint8_t counter would be */
+        pseudo_rands[k] = bad_rands[(uint8_t)(first + k)];
+    }
+}
+
+#define SEGMENT_LENGTH 4
+#define LANE_LENGTH 16
+#define POS_LANE 0
+
+/*
+ * Fills the blocks of one segment (SEGMENT_LENGTH blocks; the very first
+ * segment skips its first two blocks, which already exist).
+ * For every block a reference block is chosen with index_alpha() and combined
+ * with the running state by fill_block().
+ * @param instance Current instance; its memory must hold LANE_LENGTH blocks
+ * @param position Pass and slice to fill; the index field is overwritten on
+ * the local copy
+ */
+void fill_segment(const argon2_instance_t *instance,
+                  argon2_position_t position)
+{
+    block *ref_block = NULL, *curr_block = NULL;
+    uint64_t pseudo_rand, ref_index;
+    uint32_t prev_offset, curr_offset;
+    uint8_t i;
+    __m128i state[64];
+    int data_independent_addressing = (instance->type == Argon2_i);
+
+    /* Pseudo-random values that determine the reference block position.
+     * Kept on the stack: four values only, so no allocation (and no
+     * allocation failure) is involved. */
+    uint64_t pseudo_rands[SEGMENT_LENGTH] = {0};
+
+    if (data_independent_addressing) {
+        generate_addresses(instance, &position, pseudo_rands);
+    }
+
+    i = 0;
+
+    if ((0 == position.pass) && (0 == position.slice)) {
+        i = 2; /* we have already generated the first two blocks */
+    }
+
+    /* Offset of the current block */
+    curr_offset = position.slice * 4 + i;
+
+    if (0 == curr_offset % 16) {
+        /* Last block in this lane */
+        prev_offset = curr_offset + 15;
+    } else {
+        /* Previous block */
+        prev_offset = curr_offset - 1;
+    }
+
+    /* The running state starts as a copy of the previous block */
+    memcpy(state, ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE);
+
+    for (; i < SEGMENT_LENGTH;
+         ++i, ++curr_offset, ++prev_offset) {
+        /*1.1 Rotating prev_offset if needed */
+        if (curr_offset % LANE_LENGTH == 1) {
+            prev_offset = curr_offset - 1;
+        }
+
+        /* 1.2 Computing the index of the reference block */
+        /* 1.2.1 Taking pseudo-random value from the previous block */
+        if (data_independent_addressing) {
+            pseudo_rand = pseudo_rands[i];
+        } else {
+            pseudo_rand = instance->memory[prev_offset].v[0];
+        }
+
+        /* 1.2.2 Computing the lane of the reference block */
+
+        /* 1.2.3 Computing the number of possible reference block within the
+         * lane.
+         */
+        position.index = i;
+        ref_index = index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,1);
+
+        /* 2 Creating a new block */
+        ref_block = instance->memory + ref_index;
+        curr_block = instance->memory + curr_offset;
+        fill_block(state, (__m128i const *)ref_block->v, (__m128i *)curr_block->v);
+    }
+}
--- a/algo/argon2/ar2/opt.h
+++ b/algo/argon2/ar2/opt.h
@@ -0,0 +1,49 @@
+/*
+ * Argon2 source code package
+ *
+ * Written by Daniel Dinu and Dmitry Khovratovich, 2015
+ *
+ * This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
+ *
+ * You should have received a copy of the CC0 Public Domain Dedication along
+ * with
+ * this software. If not, see
+ * <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+#ifndef ARGON2_OPT_H
+#define ARGON2_OPT_H
+
+/*
+ * Function fills a new memory block. Differs from the reference variant in
+ * that the previous block is not passed as a read-only input: it is carried
+ * in @a state, which the function overwrites.
+ * @param state Pointer to the just produced block. Content will be updated(!)
+ * @param ref_block Pointer to the reference block
+ * @param next_block Pointer to the block to be constructed
+ * @pre all block pointers must be valid
+ */
+void fill_block(__m128i *state, __m128i const *ref_block, __m128i *next_block);
+
+/*
+ * Generate pseudo-random values to reference blocks in the segment and puts
+ * them into the array
+ * @param instance Pointer to the current instance
+ * @param position Pointer to the current position
+ * @param pseudo_rands Pointer to the array of 64-bit values
+ * @pre pseudo_rands must point to @a instance->segment_length allocated values
+ * NOTE(review): the optimized fill_segment() allocates only 32 bytes (four
+ * 64-bit values) for this array - confirm the implementation never writes
+ * more than four entries.
+ */
+void generate_addresses(const argon2_instance_t *instance,
+                        const argon2_position_t *position,
+                        uint64_t *pseudo_rands);
+
+/*
+ * Function that fills the segment using previous segments also from other
+ * threads.
+ * Identical to the reference code except that it calls optimized FillBlock()
+ * @param instance Pointer to the current instance
+ * @param position Current position (passed by value; the callee may modify
+ * its own copy)
+ * @pre all block pointers must be valid
+ */
+void fill_segment(const argon2_instance_t *instance,
+                  argon2_position_t position);
+
+#endif /* ARGON2_OPT_H */
--- a/algo/argon2/ar2/ref.c
+++ b/algo/argon2/ar2/ref.c
@@ -0,0 +1,174 @@
+/*
+ * Argon2 source code package
+ *
+ * Written by Daniel Dinu and Dmitry Khovratovich, 2015
+ *
+ * This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
+ *
+ * You should have received a copy of the CC0 Public Domain Dedication along
+ * with
+ * this software. If not, see
+ * <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "argon2.h"
+#include "cores.h"
+#include "ref.h"
+
+#include "blake2/blamka-round-ref.h"
+#include "blake2/blake2-impl.h"
+#include "blake2/blake2.h"
+
+/*
+ * Portable block compression: next_block = P(prev ^ ref) ^ (prev ^ ref),
+ * where P applies BLAKE2_ROUND_NOMSG first to eight groups of 16 consecutive
+ * words and then to eight interleaved groups of word pairs.
+ *
+ * The inputs are copied before anything is written to next_block, so
+ * next_block may alias ref_block.
+ */
+void fill_block(const block *prev_block, const block *ref_block,
+                block *next_block) {
+    block mixed, saved;
+    unsigned col, row;
+
+    /* mixed = ref ^ prev; keep an untouched copy for the final feed-forward */
+    copy_block(&mixed, ref_block);
+    xor_block(&mixed, prev_block);
+    copy_block(&saved, &mixed);
+
+    /* First pass: words (0..15), (16..31), ..., (112..127) */
+    for (col = 0; col < 8; col++) {
+        const unsigned base = 16 * col;
+
+        BLAKE2_ROUND_NOMSG(
+            mixed.v[base], mixed.v[base + 1], mixed.v[base + 2],
+            mixed.v[base + 3], mixed.v[base + 4], mixed.v[base + 5],
+            mixed.v[base + 6], mixed.v[base + 7], mixed.v[base + 8],
+            mixed.v[base + 9], mixed.v[base + 10], mixed.v[base + 11],
+            mixed.v[base + 12], mixed.v[base + 13], mixed.v[base + 14],
+            mixed.v[base + 15]);
+    }
+
+    /* Second pass: words (0,1,16,17,...,112,113), (2,3,18,19,...,114,115),
+       ..., (14,15,30,31,...,126,127) */
+    for (row = 0; row < 8; row++) {
+        const unsigned base = 2 * row;
+
+        BLAKE2_ROUND_NOMSG(
+            mixed.v[base], mixed.v[base + 1], mixed.v[base + 16],
+            mixed.v[base + 17], mixed.v[base + 32], mixed.v[base + 33],
+            mixed.v[base + 48], mixed.v[base + 49], mixed.v[base + 64],
+            mixed.v[base + 65], mixed.v[base + 80], mixed.v[base + 81],
+            mixed.v[base + 96], mixed.v[base + 97], mixed.v[base + 112],
+            mixed.v[base + 113]);
+    }
+
+    /* Feed-forward: result = saved ^ permuted */
+    copy_block(next_block, &saved);
+    xor_block(next_block, &mixed);
+}
+
+/*
+ * Derives the four data-independent pseudo-random values for one segment
+ * and stores them in pseudo_rands[0..3].
+ *
+ * The input block encodes pass, lane, slice, the constants 16 and 2, the
+ * instance type and a counter; a fresh address block is produced (by two
+ * fill_block() calls) every ARGON2_ADDRESSES_IN_BLOCK values.
+ * Nothing is written when instance or position is NULL.
+ */
+void generate_addresses(const argon2_instance_t *instance,
+                        const argon2_position_t *position,
+                        uint64_t *pseudo_rands) {
+    block zeros, counter_block, addresses;
+    uint32_t n;
+
+    init_block_value(&zeros, 0);
+    init_block_value(&counter_block, 0);
+    init_block_value(&addresses, 0);
+
+    if (instance == NULL || position == NULL) {
+        return;
+    }
+
+    counter_block.v[0] = position->pass;
+    counter_block.v[1] = position->lane;
+    counter_block.v[2] = position->slice;
+    counter_block.v[3] = 16;
+    counter_block.v[4] = 2;
+    counter_block.v[5] = instance->type;
+
+    for (n = 0; n < 4; ++n) {
+        const uint32_t slot = n % ARGON2_ADDRESSES_IN_BLOCK;
+
+        if (slot == 0) {
+            /* Address block exhausted: bump the counter and regenerate */
+            counter_block.v[6]++;
+            fill_block(&zeros, &counter_block, &addresses);
+            fill_block(&zeros, &addresses, &addresses);
+        }
+
+        pseudo_rands[n] = addresses.v[slot];
+    }
+}
+
+/*
+ * Fills one segment (4 blocks) of a lane (16 blocks) in instance->memory.
+ *
+ * @param instance Pointer to the current instance; a NULL instance is a no-op
+ * @param position Current pass/lane/slice; the local copy's index field is
+ *                 updated for every block before calling index_alpha()
+ *
+ * For Argon2_i the reference indices come from generate_addresses();
+ * otherwise they are taken from word 0 of the previous block. In pass 0,
+ * slice 0 the first two blocks are assumed to exist already and are skipped.
+ */
+void fill_segment(const argon2_instance_t *instance,
+                  argon2_position_t position) {
+    block *ref_block = NULL, *curr_block = NULL;
+    uint64_t pseudo_rand, ref_index, ref_lane;
+    uint32_t prev_offset, curr_offset;
+    uint32_t starting_index;
+    uint32_t i;
+    int data_independent_addressing;
+    /* Pseudo-random values that determine the reference block position.
+     * Four values (one per block of the segment) fit comfortably on the
+     * stack, so no heap allocation - and no allocation failure path that
+     * would silently leave the segment unfilled - is needed. */
+    uint64_t pseudo_rands[4];
+
+    /* Validate before the first dereference of instance */
+    if (instance == NULL) {
+        return;
+    }
+
+    data_independent_addressing = (instance->type == Argon2_i);
+
+    if (data_independent_addressing) {
+        generate_addresses(instance, &position, pseudo_rands);
+    }
+
+    starting_index = 0;
+
+    if ((0 == position.pass) && (0 == position.slice)) {
+        starting_index = 2; /* we have already generated the first two blocks */
+    }
+
+    /* Offset of the current block */
+    curr_offset = position.lane * 16 +
+                  position.slice * 4 + starting_index;
+
+    if (0 == curr_offset % 16) {
+        /* Last block in this lane */
+        prev_offset = curr_offset + 16 - 1;
+    } else {
+        /* Previous block */
+        prev_offset = curr_offset - 1;
+    }
+
+    for (i = starting_index; i < 4; ++i, ++curr_offset, ++prev_offset) {
+        /*1.1 Rotating prev_offset if needed */
+        if (curr_offset % 16 == 1) {
+            prev_offset = curr_offset - 1;
+        }
+
+        /* 1.2 Computing the index of the reference block */
+        /* 1.2.1 Taking pseudo-random value from the previous block */
+        if (data_independent_addressing) {
+            pseudo_rand = pseudo_rands[i];
+        } else {
+            pseudo_rand = instance->memory[prev_offset].v[0];
+        }
+
+        /* 1.2.2 Computing the lane of the reference block.
+         * The modulus is the constant 1, so this always yields lane 0. */
+        ref_lane = ((pseudo_rand >> 32)) % 1;
+
+        if ((position.pass == 0) && (position.slice == 0)) {
+            /* Can not reference other lanes yet */
+            ref_lane = position.lane;
+        }
+
+        /* 1.2.3 Computing the number of possible reference block within the
+         * lane.
+         */
+        position.index = i;
+        ref_index = index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,
+                                ref_lane == position.lane);
+
+        /* 2 Creating a new block */
+        ref_block =
+            instance->memory + 16 * ref_lane + ref_index;
+        curr_block = instance->memory + curr_offset;
+        fill_block(instance->memory + prev_offset, ref_block, curr_block);
+    }
+}
--- a/algo/argon2/ar2/ref.h
+++ b/algo/argon2/ar2/ref.h
@@ -0,0 +1,49 @@
+/*
+ * Argon2 source code package
+ *
+ * Written by Daniel Dinu and Dmitry Khovratovich, 2015
+ *
+ * This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
+ *
+ * You should have received a copy of the CC0 Public Domain Dedication along
+ * with
+ * this software. If not, see
+ * <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+#ifndef ARGON2_REF_H
+#define ARGON2_REF_H
+
+/*
+ * Function fills a new memory block
+ * @param prev_block Pointer to the previous block
+ * @param ref_block Pointer to the reference block
+ * @param next_block Pointer to the block to be constructed
+ * @pre all block pointers must be valid
+ */
+void fill_block(const block *prev_block, const block *ref_block,
+                block *next_block);
+
+/*
+ * Generate pseudo-random values to reference blocks in the segment and puts
+ * them into the array
+ * @param instance Pointer to the current instance
+ * @param position Pointer to the current position
+ * @param pseudo_rands Pointer to the array of 64-bit values
+ * @pre pseudo_rands must point to at least 4 allocated values; the
+ * implementation writes exactly four entries
+ * @note nothing is written when @a instance or @a position is NULL
+ */
+void generate_addresses(const argon2_instance_t *instance,
+                        const argon2_position_t *position,
+                        uint64_t *pseudo_rands);
+
+/*
+ * Function that fills the segment using previous segments also from other
+ * threads
+ * @param instance Pointer to the current instance
+ * @param position Current position (passed by value; the callee updates the
+ * index field of its own copy)
+ * @pre all block pointers must be valid
+ */
+void fill_segment(const argon2_instance_t *instance,
+                  argon2_position_t position);
+
+#endif /* ARGON2_REF_H */
--- a/algo/argon2/ar2/run.c
+++ b/algo/argon2/ar2/run.c
@@ -0,0 +1,223 @@
+/*
+ * Argon2 source code package
+ *
+ * Written by Daniel Dinu and Dmitry Khovratovich, 2015
+ *
+ * This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
+ *
+ * You should have received a copy of the CC0 Public Domain Dedication along
+ * with
+ * this software. If not, see
+ * <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "argon2.h"
+#include "cores.h"
+
+#define T_COST_DEF 3
+#define LOG_M_COST_DEF 12 /* 2^12 = 4 MiB */
+#define LANES_DEF 1
+#define THREADS_DEF 1
+#define OUT_LEN 32
+#define SALT_LEN 16
+
+#define UNUSED_PARAMETER(x) (void)(x)
+
+/*
+ * Prints the command-line synopsis and option summary to stdout.
+ * @param cmd Program name to show in the synopsis (normally argv[0])
+ *
+ * The synopsis lists exactly the switches that main() accepts
+ * (-d, -t, -m, -p).
+ */
+static void usage(const char *cmd) {
+    printf("Usage:  %s pwd salt [-d] [-t iterations] [-m memory] [-p "
+           "parallelism]\n",
+           cmd);
+
+    printf("Parameters:\n");
+    printf("\tpwd\t\tThe password to hash\n");
+    printf("\tsalt\t\tThe salt to use, at most 16 characters\n");
+    printf("\t-d\t\tUse Argon2d instead of Argon2i (which is the default)\n");
+    printf("\t-t N\t\tSets the number of iterations to N (default = %d)\n",
+           T_COST_DEF);
+    printf("\t-m N\t\tSets the memory usage of 2^N KiB (default %d)\n",
+           LOG_M_COST_DEF);
+    printf("\t-p N\t\tSets parallelism to N threads (default %d)\n",
+           THREADS_DEF);
+}
+
+static void fatal(const char *error) {
+    fprintf(stderr, "Error: %s\n", error);
+    exit(1);
+}
+
+/*
+Hashes one password with Argon2 and prints the digest (hex) and the elapsed
+CPU time. Any failure is reported through fatal(), which exits.
+@out output array with at least OUT_LEN bytes allocated
+@pwd NULL-terminated string, presumably from argv[]
+@salt salt array with at least SALT_LEN bytes allocated
+@t_cost number of iterations
+@m_cost amount of requested memory in KB
+@lanes amount of requested parallelism (also used as the thread count)
+@threads ignored
+@type String, only "d" and "i" are accepted
+*/
+static void run(uint8_t *out, char *pwd, uint8_t *salt, uint32_t t_cost,
+                uint32_t m_cost, uint32_t lanes, uint32_t threads,
+                const char *type) {
+    clock_t started, finished;
+    unsigned pwd_length;
+    argon2_context context;
+    int status = ARGON2_OK;
+    int i;
+
+    started = clock();
+
+    if (pwd == NULL) {
+        fatal("password missing");
+    }
+
+    if (salt == NULL) {
+        secure_wipe_memory(pwd, strlen(pwd));
+        fatal("salt missing");
+    }
+
+    pwd_length = strlen(pwd);
+
+    UNUSED_PARAMETER(threads);
+
+    /* Output and inputs */
+    context.out = out;
+    context.outlen = OUT_LEN;
+    context.pwd = (uint8_t *)pwd;
+    context.pwdlen = pwd_length;
+    context.salt = salt;
+    context.saltlen = SALT_LEN;
+    /* No secret key and no associated data */
+    context.secret = NULL;
+    context.secretlen = 0;
+    context.ad = NULL;
+    context.adlen = 0;
+    /* Cost parameters */
+    context.t_cost = t_cost;
+    context.m_cost = m_cost;
+    context.lanes = lanes;
+    context.threads = lanes;
+    /* No custom allocation callbacks */
+    context.allocate_cbk = NULL;
+    context.free_cbk = NULL;
+    context.flags = ARGON2_FLAG_CLEAR_PASSWORD;
+
+    if (strcmp(type, "d") == 0) {
+        status = argon2d(&context);
+    } else if (strcmp(type, "i") == 0) {
+        status = argon2i(&context);
+    } else {
+        secure_wipe_memory(pwd, strlen(pwd));
+        fatal("wrong Argon2 type");
+    }
+
+    if (status != ARGON2_OK) {
+        fatal(error_message(status));
+    }
+
+    finished = clock();
+
+    /* add back when proper decoding */
+    /*
+    char encoded[300];
+    encode_string(encoded, sizeof encoded, &context);
+    printf("%s\n", encoded);
+    */
+    printf("Hash:\t\t");
+    for (i = 0; i < context.outlen; ++i) {
+        printf("%02x", context.out[i]);
+    }
+    printf("\n");
+
+    printf("%2.3f seconds\n",
+           ((double)finished - started) / (CLOCKS_PER_SEC));
+}
+
+/*
+ * Parses text as a base-10 unsigned long, rejecting empty input and any
+ * trailing non-numeric characters (plain strtoul() would accept "12abc").
+ * @param text  NUL-terminated option value
+ * @param value Receives the strtoul() result (ULONG_MAX on overflow)
+ * @return non-zero if the whole string was consumed as a number, 0 otherwise
+ */
+static int parse_ulong_strict(const char *text, unsigned long *value) {
+    char *end = NULL;
+
+    *value = strtoul(text, &end, 10);
+    return end != text && *end == '\0';
+}
+
+/*
+ * Command-line driver: argv[1] is the password, argv[2] the salt (at most
+ * SALT_LEN characters, zero padded), followed by optional -m, -t, -p and -d
+ * switches. Prints the chosen parameters and then calls run().
+ * @return ARGON2_MISSING_ARGS when fewer than two positional arguments are
+ *         given, ARGON2_OK otherwise; invalid options terminate via fatal()
+ */
+int main(int argc, char *argv[]) {
+    unsigned char out[OUT_LEN];
+    uint32_t m_cost = 1 << LOG_M_COST_DEF;
+    uint32_t t_cost = T_COST_DEF;
+    uint32_t lanes = LANES_DEF;
+    uint32_t threads = THREADS_DEF;
+    char *pwd = NULL;
+    uint8_t salt[SALT_LEN];
+    const char *type = "i";
+    int i;
+
+    if (argc < 3) {
+        usage(argv[0]);
+        return ARGON2_MISSING_ARGS;
+    }
+
+    /* get password and salt from command line */
+    pwd = argv[1];
+    if (strlen(argv[2]) > SALT_LEN) {
+        fatal("salt too long");
+    }
+    memset(salt, 0x00, SALT_LEN); /* pad with null bytes */
+    memcpy(salt, argv[2], strlen(argv[2]));
+
+    /* parse options */
+    for (i = 3; i < argc; i++) {
+        const char *a = argv[i];
+        unsigned long input = 0;
+        if (!strcmp(a, "-m")) {
+            if (i < argc - 1) {
+                i++;
+                /* -m takes log2 of the memory size in KiB */
+                if (!parse_ulong_strict(argv[i], &input) || input == 0 ||
+                    input == ULONG_MAX || input > ARGON2_MAX_MEMORY_BITS) {
+                    fatal("bad numeric input for -m");
+                }
+                m_cost = ARGON2_MIN(UINT64_C(1) << input, UINT32_C(0xFFFFFFFF));
+                if (m_cost > ARGON2_MAX_MEMORY) {
+                    fatal("m_cost overflow");
+                }
+                continue;
+            } else {
+                fatal("missing -m argument");
+            }
+        } else if (!strcmp(a, "-t")) {
+            if (i < argc - 1) {
+                i++;
+                if (!parse_ulong_strict(argv[i], &input) || input == 0 ||
+                    input == ULONG_MAX || input > ARGON2_MAX_TIME) {
+                    fatal("bad numeric input for -t");
+                }
+                t_cost = input;
+                continue;
+            } else {
+                fatal("missing -t argument");
+            }
+        } else if (!strcmp(a, "-p")) {
+            if (i < argc - 1) {
+                i++;
+                if (!parse_ulong_strict(argv[i], &input) || input == 0 ||
+                    input == ULONG_MAX || input > ARGON2_MAX_THREADS ||
+                    input > ARGON2_MAX_LANES) {
+                    fatal("bad numeric input for -p");
+                }
+                /* one lane per thread */
+                threads = input;
+                lanes = threads;
+                continue;
+            } else {
+                fatal("missing -p argument");
+            }
+        } else if (!strcmp(a, "-d")) {
+            type = "d";
+        } else {
+            fatal("unknown argument");
+        }
+    }
+    printf("Type:\t\tArgon2%c\n", type[0]);
+    printf("Iterations:\t%" PRIu32 " \n", t_cost);
+    printf("Memory:\t\t%" PRIu32 " KiB\n", m_cost);
+    printf("Parallelism:\t%" PRIu32 " \n", lanes);
+    run(out, pwd, salt, t_cost, m_cost, lanes, threads, type);
+
+    return ARGON2_OK;
+}
--- a/algo/argon2/ar2/sj/scrypt-jane-hash.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-hash.h
@@ -0,0 +1,38 @@
+/*
+ * Hash selection. Only Skein-512 is available; any other configuration ends
+ * in the #error below. The dummy definitions in the #else branch mirror the
+ * names a real hash header provides (presumably so the #error is not buried
+ * under follow-on "undeclared identifier" diagnostics).
+ */
+#if defined(SCRYPT_SKEIN512)
+#include "scrypt-jane-hash_skein512.h"
+#else
+	#define SCRYPT_HASH "ERROR"
+	#define SCRYPT_HASH_BLOCK_SIZE 64
+	#define SCRYPT_HASH_DIGEST_SIZE 64
+	typedef struct scrypt_hash_state_t { size_t dummy; } scrypt_hash_state;
+	typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
+	static void scrypt_hash_init(scrypt_hash_state *S) {}
+	static void scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {}
+	static void scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {}
+	static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {0};
+	#error must define a hash function!
+#endif
+
+#include "scrypt-jane-pbkdf2.h"
+
+#define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */
+
+/*
+ * Self-test of the configured hash: hashes every prefix of the byte sequence
+ * 0,1,2,... with lengths 0..SCRYPT_TEST_HASH_LEN inclusive, feeds each
+ * digest into one running hash, and compares the final digest with
+ * scrypt_test_hash_expected. Returns the result of scrypt_verify().
+ */
+static int
+scrypt_test_hash(void) {
+	uint8_t msg[SCRYPT_TEST_HASH_LEN];
+	scrypt_hash_digest hash, final;
+	scrypt_hash_state st;
+	size_t len;
+
+	/* msg[k] = k (truncated to 8 bits) */
+	for (len = 0; len < SCRYPT_TEST_HASH_LEN; len++) {
+		msg[len] = (uint8_t)len;
+	}
+
+	scrypt_hash_init(&st);
+	for (len = 0; len <= SCRYPT_TEST_HASH_LEN; len++) {
+		scrypt_hash(hash, msg, len);
+		scrypt_hash_update(&st, hash, sizeof(hash));
+	}
+	scrypt_hash_finish(&st, final);
+
+	return scrypt_verify(final, scrypt_test_hash_expected, SCRYPT_HASH_DIGEST_SIZE);
+}
+
--- a/algo/argon2/ar2/sj/scrypt-jane-hash_skein512.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-hash_skein512.h
@@ -0,0 +1,188 @@
+#define SCRYPT_HASH "Skein-512"
+#define SCRYPT_HASH_BLOCK_SIZE 64
+#define SCRYPT_HASH_DIGEST_SIZE 64
+
+typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
+
+/* Incremental Skein-512 hashing state. */
+typedef struct scrypt_hash_state_t {
+	/* X: eight chaining words; T: two tweak words (T[0] is advanced by the
+	   byte count passed to skein512_blocks) */
+	uint64_t X[8], T[2];
+	/* number of bytes currently held in buffer */
+	uint32_t leftover;
+	/* input not yet processed; at most one block */
+	uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
+} scrypt_hash_state;
+
+#include <stdio.h>
+
+/*
+ * Processes `blocks` consecutive 64-byte blocks from `in`, updating the
+ * chaining words S->X and the tweak S->T.
+ *
+ * S      hash state (X and T are read and written)
+ * in     input, blocks * SCRYPT_HASH_BLOCK_SIZE bytes
+ * blocks number of blocks to process
+ * add    amount added to the tweak counter T[0] for every block
+ */
+static void
+skein512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks, size_t add) {
+	uint64_t X[8], key[8], Xt[9+18], T[3+1];
+	size_t r;
+
+	while (blocks--) {
+		/* tweak for this block; T[2] is the xor of the two tweak words */
+		T[0] = S->T[0] + add;
+		T[1] = S->T[1];
+		T[2] = T[0] ^ T[1];
+		/* key[] = input words (little endian), Xt[] = current chaining words,
+		   X[] = their sum plus the tweak on words 5 and 6 */
+		key[0] = U8TO64_LE(in +  0); Xt[0] = S->X[0]; X[0] = key[0] + Xt[0];
+		key[1] = U8TO64_LE(in +  8); Xt[1] = S->X[1]; X[1] = key[1] + Xt[1];
+		key[2] = U8TO64_LE(in + 16); Xt[2] = S->X[2]; X[2] = key[2] + Xt[2];
+		key[3] = U8TO64_LE(in + 24); Xt[3] = S->X[3]; X[3] = key[3] + Xt[3];
+		key[4] = U8TO64_LE(in + 32); Xt[4] = S->X[4]; X[4] = key[4] + Xt[4];
+		key[5] = U8TO64_LE(in + 40); Xt[5] = S->X[5]; X[5] = key[5] + Xt[5] + T[0];
+		key[6] = U8TO64_LE(in + 48); Xt[6] = S->X[6]; X[6] = key[6] + Xt[6] + T[1];
+		key[7] = U8TO64_LE(in + 56); Xt[7] = S->X[7]; X[7] = key[7] + Xt[7];
+		/* ninth word: constant xor-ed with all eight chaining words */
+		Xt[8] = 0x1BD11BDAA9FC1A22ull ^ Xt[0] ^ Xt[1] ^ Xt[2] ^ Xt[3] ^ Xt[4] ^ Xt[5] ^ Xt[6] ^ Xt[7];
+		in += SCRYPT_HASH_BLOCK_SIZE;
+
+		/* repeat the 9-word sequence so Xt[r + k] below needs no modulo */
+		for (r = 0; r < 18; r++)
+			Xt[r + 9] = Xt[r + 0];
+
+		/* 9 iterations; each runs two groups of four mixing layers, and each
+		   group is followed by an injection of Xt words, tweak and counter */
+		for (r = 0; r < 18; r += 2) {
+			X[0] += X[1]; X[1] = ROTL64(X[1], 46) ^ X[0];
+			X[2] += X[3]; X[3] = ROTL64(X[3], 36) ^ X[2];
+			X[4] += X[5]; X[5] = ROTL64(X[5], 19) ^ X[4];
+			X[6] += X[7]; X[7] = ROTL64(X[7], 37) ^ X[6];
+			X[2] += X[1]; X[1] = ROTL64(X[1], 33) ^ X[2];
+			X[0] += X[3]; X[3] = ROTL64(X[3], 42) ^ X[0];
+			X[6] += X[5]; X[5] = ROTL64(X[5], 14) ^ X[6];
+			X[4] += X[7]; X[7] = ROTL64(X[7], 27) ^ X[4];
+			X[4] += X[1]; X[1] = ROTL64(X[1], 17) ^ X[4];
+			X[6] += X[3]; X[3] = ROTL64(X[3], 49) ^ X[6];
+			X[0] += X[5]; X[5] = ROTL64(X[5], 36) ^ X[0];
+			X[2] += X[7]; X[7] = ROTL64(X[7], 39) ^ X[2];
+			X[6] += X[1]; X[1] = ROTL64(X[1], 44) ^ X[6];
+			X[4] += X[3]; X[3] = ROTL64(X[3], 56) ^ X[4];
+			X[2] += X[5]; X[5] = ROTL64(X[5], 54) ^ X[2];
+			X[0] += X[7]; X[7] = ROTL64(X[7],  9) ^ X[0];
+
+			/* injection number r + 1 */
+			X[0] += Xt[r + 1];
+			X[1] += Xt[r + 2];
+			X[2] += Xt[r + 3];
+			X[3] += Xt[r + 4];
+			X[4] += Xt[r + 5];
+			X[5] += Xt[r + 6] + T[1];
+			X[6] += Xt[r + 7] + T[2];
+			X[7] += Xt[r + 8] + r + 1;
+
+			/* rotate the three tweak words by one position (T[3] is scratch) */
+			T[3] = T[0];
+			T[0] = T[1];
+			T[1] = T[2];
+			T[2] = T[3];
+
+			X[0] += X[1]; X[1] = ROTL64(X[1], 39) ^ X[0];
+			X[2] += X[3]; X[3] = ROTL64(X[3], 30) ^ X[2];
+			X[4] += X[5]; X[5] = ROTL64(X[5], 34) ^ X[4];
+			X[6] += X[7]; X[7] = ROTL64(X[7], 24) ^ X[6];
+			X[2] += X[1]; X[1] = ROTL64(X[1], 13) ^ X[2];
+			X[0] += X[3]; X[3] = ROTL64(X[3], 17) ^ X[0];
+			X[6] += X[5]; X[5] = ROTL64(X[5], 10) ^ X[6];
+			X[4] += X[7]; X[7] = ROTL64(X[7], 50) ^ X[4];
+			X[4] += X[1]; X[1] = ROTL64(X[1], 25) ^ X[4];
+			X[6] += X[3]; X[3] = ROTL64(X[3], 29) ^ X[6];
+			X[0] += X[5]; X[5] = ROTL64(X[5], 39) ^ X[0];
+			X[2] += X[7]; X[7] = ROTL64(X[7], 43) ^ X[2];
+			X[6] += X[1]; X[1] = ROTL64(X[1],  8) ^ X[6];
+			X[4] += X[3]; X[3] = ROTL64(X[3], 22) ^ X[4];
+			X[2] += X[5]; X[5] = ROTL64(X[5], 56) ^ X[2];
+			X[0] += X[7]; X[7] = ROTL64(X[7], 35) ^ X[0];
+
+			/* injection number r + 2 */
+			X[0] += Xt[r + 2];
+			X[1] += Xt[r + 3];
+			X[2] += Xt[r + 4];
+			X[3] += Xt[r + 5];
+			X[4] += Xt[r + 6];
+			X[5] += Xt[r + 7] + T[1];
+			X[6] += Xt[r + 8] + T[2];
+			X[7] += Xt[r + 9] + r + 2;
+
+			T[3] = T[0];
+			T[0] = T[1];
+			T[1] = T[2];
+			T[2] = T[3];
+		}
+
+		/* feed-forward: new chaining value = input words xor cipher output */
+		S->X[0] = key[0] ^ X[0];
+		S->X[1] = key[1] ^ X[1];
+		S->X[2] = key[2] ^ X[2];
+		S->X[3] = key[3] ^ X[3];
+		S->X[4] = key[4] ^ X[4];
+		S->X[5] = key[5] ^ X[5];
+		S->X[6] = key[6] ^ X[6];
+		S->X[7] = key[7] ^ X[7];
+
+		/* the tweak was rotated 18 times (a multiple of 3), so T[0] and T[1]
+		   hold this block's initial tweak again; bit 62 of T[1] is cleared
+		   (presumably Skein's "first block" flag - confirm against the spec) */
+		S->T[0] = T[0];
+		S->T[1] = T[1] & ~0x4000000000000000ull;
+	}
+}
+
+/*
+ * Resets S for a new message: loads the eight initial chaining words, sets
+ * the tweak to its starting value and marks the buffer as empty.
+ */
+static void
+scrypt_hash_init(scrypt_hash_state *S) {
+	static const uint64_t initial_chain[8] = {
+		0x4903ADFF749C51CEull, 0x0D95DE399746DF03ull,
+		0x8FD1934127C79BCEull, 0x9A255629FF352CB1ull,
+		0x5DB62599DF6CA7B0ull, 0xEABE394CA9D5C3F4ull,
+		0x991112C71A75B523ull, 0xAE18A40B660FCC33ull
+	};
+	size_t word;
+
+	for (word = 0; word < 8; word++) {
+		S->X[word] = initial_chain[word];
+	}
+
+	S->T[0] = 0x0000000000000000ull;
+	S->T[1] = 0x7000000000000000ull;
+	S->leftover = 0;
+}
+
+/*
+ * Absorbs inlen bytes from in. The last (up to) 64 bytes are always kept in
+ * S->buffer instead of being compressed, because the final block is handled
+ * separately by scrypt_hash_finish().
+ */
+static void
+scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
+	size_t fill, whole_blocks, whole_bytes;
+
+	/* only compress when strictly more than one block of data is available */
+	if ((S->leftover + inlen) > SCRYPT_HASH_BLOCK_SIZE) {
+		if (S->leftover) {
+			/* top up the buffered partial block and compress it */
+			fill = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
+			memcpy(S->buffer + S->leftover, in, fill);
+			in += fill;
+			inlen -= fill;
+			S->leftover = 0;
+			skein512_blocks(S, S->buffer, 1, SCRYPT_HASH_BLOCK_SIZE);
+		}
+
+		if (inlen > SCRYPT_HASH_BLOCK_SIZE) {
+			/* compress straight from the caller's memory, holding back the
+			   final 1..64 bytes */
+			whole_blocks = (inlen - 1) / SCRYPT_HASH_BLOCK_SIZE;
+			whole_bytes = whole_blocks * SCRYPT_HASH_BLOCK_SIZE;
+			skein512_blocks(S, in, whole_blocks, SCRYPT_HASH_BLOCK_SIZE);
+			inlen -= whole_bytes;
+			in += whole_bytes;
+		}
+	}
+
+	/* stash whatever remains for the next update/finish */
+	memcpy(S->buffer + S->leftover, in, inlen);
+	S->leftover += (int) inlen;
+}
+
+/*
+ * Finalizes the hash: compresses the zero-padded buffered data with the top
+ * tweak bit set, runs one more block over an all-zero buffer with the tweak
+ * reset to 0 / 0xff00000000000000, and writes the eight chaining words to
+ * hash as 64 little-endian bytes.
+ */
+static void
+scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
+	size_t word;
+
+	/* last message block: pad with zeros, count only the real bytes */
+	memset(S->buffer + S->leftover, 0, SCRYPT_HASH_BLOCK_SIZE - S->leftover);
+	S->T[1] |= 0x8000000000000000ull;
+	skein512_blocks(S, S->buffer, 1, S->leftover);
+
+	/* output block */
+	memset(S->buffer, 0, SCRYPT_HASH_BLOCK_SIZE);
+	S->T[0] = 0;
+	S->T[1] = 0xff00000000000000ull;
+	skein512_blocks(S, S->buffer, 1, 8);
+
+	for (word = 0; word < 8; word++) {
+		U64TO8_LE(&hash[8 * word], S->X[word]);
+	}
+}
+
+
+/* Digest that scrypt_test_hash() must reproduce for the Skein-512 build. */
+static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
+	0x4d,0x52,0x29,0xff,0x10,0xbc,0xd2,0x62,0xd1,0x61,0x83,0xc8,0xe6,0xf0,0x83,0xc4,
+	0x9f,0xf5,0x6a,0x42,0x75,0x2a,0x26,0x4e,0xf0,0x28,0x72,0x28,0x47,0xe8,0x23,0xdf,
+	0x1e,0x64,0xf1,0x51,0x38,0x35,0x9d,0xc2,0x83,0xfc,0x35,0x4e,0xc0,0x52,0x5f,0x41,
+	0x6a,0x0b,0x7d,0xf5,0xce,0x98,0xde,0x6f,0x36,0xd8,0x51,0x15,0x78,0x78,0x93,0x67,
+};
--- a/algo/argon2/ar2/sj/scrypt-jane-mix_salsa64-avx.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-mix_salsa64-avx.h
@@ -0,0 +1,367 @@
+/* x64 */
+#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
+
+#define SCRYPT_SALSA64_AVX
+
+/*
+ * x86-64 AVX assembly ChunkMix for Salsa64/8 (128-byte blocks).
+ *
+ * Registers as used below: rdi = Bout, rsi = Bin, rdx = Bxor (may be NULL),
+ * ecx = r. NOTE(review): this matches the System V AMD64 argument order; the
+ * win64 remark below suggests asm_naked_fn remaps arguments on Windows -
+ * confirm in the macro definition.
+ *
+ * The running block X lives in xmm0-xmm7. For each of the 2*r input blocks
+ * it is xor-ed with Bin[i] (and Bxor[i] when Bxor != NULL), saved on the
+ * stack, run through 8 Salsa64 rounds, added to the saved copy and stored to
+ * Bout: blocks with even i go to the first half of Bout, odd i to the second.
+ */
+asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
+asm_naked_fn(scrypt_ChunkMix_avx)
+	/* frame: 64-byte aligned stack with 128 bytes of scratch for one block */
+	a1(push rbp)
+	a2(mov rbp, rsp)
+	a2(and rsp, ~63)
+	a2(sub rsp, 128)
+	a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
+	/* rcx = 2*r*128 = chunk size in bytes; r9 = offset of the last block */
+	a2(shl rcx,7)
+	a2(lea r9,[rcx-128])
+	a2(lea rax,[rsi+r9])
+	a2(lea r9,[rdx+r9])
+	/* ZF = (Bxor == NULL); the vmovdqa loads below leave the flags untouched */
+	a2(and rdx, rdx)
+	/* X = last block of Bin */
+	a2(vmovdqa xmm0,[rax+0])
+	a2(vmovdqa xmm1,[rax+16])
+	a2(vmovdqa xmm2,[rax+32])
+	a2(vmovdqa xmm3,[rax+48])
+	a2(vmovdqa xmm4,[rax+64])
+	a2(vmovdqa xmm5,[rax+80])
+	a2(vmovdqa xmm6,[rax+96])
+	a2(vmovdqa xmm7,[rax+112])
+	aj(jz scrypt_ChunkMix_avx_no_xor1)
+	/* X ^= last block of Bxor */
+	a3(vpxor xmm0,xmm0,[r9+0])
+	a3(vpxor xmm1,xmm1,[r9+16])
+	a3(vpxor xmm2,xmm2,[r9+32])
+	a3(vpxor xmm3,xmm3,[r9+48])
+	a3(vpxor xmm4,xmm4,[r9+64])
+	a3(vpxor xmm5,xmm5,[r9+80])
+	a3(vpxor xmm6,xmm6,[r9+96])
+	a3(vpxor xmm7,xmm7,[r9+112])
+	a1(scrypt_ChunkMix_avx_no_xor1:)
+	/* r9 = byte offset of the current input block; r8 alternates between 0
+	   and the chunk size to select the output half */
+	a2(xor r9,r9)
+	a2(xor r8,r8)
+	a1(scrypt_ChunkMix_avx_loop:)
+		/* re-test Bxor; the vpxor instructions do not modify the flags */
+		a2(and rdx, rdx)
+		/* X ^= Bin[i] */
+		a3(vpxor xmm0,xmm0,[rsi+r9+0])
+		a3(vpxor xmm1,xmm1,[rsi+r9+16])
+		a3(vpxor xmm2,xmm2,[rsi+r9+32])
+		a3(vpxor xmm3,xmm3,[rsi+r9+48])
+		a3(vpxor xmm4,xmm4,[rsi+r9+64])
+		a3(vpxor xmm5,xmm5,[rsi+r9+80])
+		a3(vpxor xmm6,xmm6,[rsi+r9+96])
+		a3(vpxor xmm7,xmm7,[rsi+r9+112])
+		aj(jz scrypt_ChunkMix_avx_no_xor2)
+		/* X ^= Bxor[i] */
+		a3(vpxor xmm0,xmm0,[rdx+r9+0])
+		a3(vpxor xmm1,xmm1,[rdx+r9+16])
+		a3(vpxor xmm2,xmm2,[rdx+r9+32])
+		a3(vpxor xmm3,xmm3,[rdx+r9+48])
+		a3(vpxor xmm4,xmm4,[rdx+r9+64])
+		a3(vpxor xmm5,xmm5,[rdx+r9+80])
+		a3(vpxor xmm6,xmm6,[rdx+r9+96])
+		a3(vpxor xmm7,xmm7,[rdx+r9+112])
+		a1(scrypt_ChunkMix_avx_no_xor2:)
+		/* keep a copy of the pre-round state for the final addition */
+		a2(vmovdqa [rsp+0],xmm0)
+		a2(vmovdqa [rsp+16],xmm1)
+		a2(vmovdqa [rsp+32],xmm2)
+		a2(vmovdqa [rsp+48],xmm3)
+		a2(vmovdqa [rsp+64],xmm4)
+		a2(vmovdqa [rsp+80],xmm5)
+		a2(vmovdqa [rsp+96],xmm6)
+		a2(vmovdqa [rsp+112],xmm7)
+		/* 8 rounds, two per loop iteration */
+		a2(mov rax,8)
+		a1(scrypt_salsa64_avx_loop: )
+			/* rotations: vpshufd 0xb1 swaps the 32-bit halves of each qword
+			   (rotate by 32); the vpsrlq/vpsllq pairs 51/13 and 25/39 form
+			   rotate-left by 13 and by 39 */
+			a3(vpaddq xmm8, xmm0, xmm2)
+			a3(vpaddq xmm9, xmm1, xmm3)
+			a3(vpshufd xmm8, xmm8, 0xb1)
+			a3(vpshufd xmm9, xmm9, 0xb1)
+			a3(vpxor xmm6, xmm6, xmm8)
+			a3(vpxor xmm7, xmm7, xmm9)
+			a3(vpaddq xmm10, xmm0, xmm6)
+			a3(vpaddq xmm11, xmm1, xmm7)
+			a3(vpsrlq xmm8, xmm10, 51)
+			a3(vpsrlq xmm9, xmm11, 51)
+			a3(vpsllq xmm10, xmm10, 13)
+			a3(vpsllq xmm11, xmm11, 13)
+			a3(vpxor xmm4, xmm4, xmm8)
+			a3(vpxor xmm5, xmm5, xmm9)
+			a3(vpxor xmm4, xmm4, xmm10)
+			a3(vpxor xmm5, xmm5, xmm11)
+			a3(vpaddq xmm8, xmm6, xmm4)
+			a3(vpaddq xmm9, xmm7, xmm5)
+			a3(vpsrlq xmm10, xmm8, 25)
+			a3(vpsrlq xmm11, xmm9, 25)
+			a3(vpsllq xmm8, xmm8, 39)
+			a3(vpsllq xmm9, xmm9, 39)
+			a3(vpxor xmm2, xmm2, xmm10)
+			a3(vpxor xmm3, xmm3, xmm11)
+			a3(vpxor xmm2, xmm2, xmm8)
+			a3(vpxor xmm3, xmm3, xmm9)
+			a3(vpaddq xmm10, xmm4, xmm2)
+			a3(vpaddq xmm11, xmm5, xmm3)
+			a3(vpshufd xmm10, xmm10, 0xb1)
+			a3(vpshufd xmm11, xmm11, 0xb1)
+			a3(vpxor xmm0, xmm0, xmm10)
+			a3(vpxor xmm1, xmm1, xmm11)
+			/* re-pair the 64-bit halves of xmm2/3 and xmm6/7 for the next round */
+			a2(vmovdqa xmm8, xmm2)
+			a2(vmovdqa xmm9, xmm3)
+			a4(vpalignr xmm2, xmm6, xmm7, 8)
+			a4(vpalignr xmm3, xmm7, xmm6, 8)
+			a4(vpalignr xmm6, xmm9, xmm8, 8)
+			a4(vpalignr xmm7, xmm8, xmm9, 8)
+			/* second round of the pair (xmm4 and xmm5 swap roles) */
+			a3(vpaddq xmm10, xmm0, xmm2)
+			a3(vpaddq xmm11, xmm1, xmm3)
+			a3(vpshufd xmm10, xmm10, 0xb1)
+			a3(vpshufd xmm11, xmm11, 0xb1)
+			a3(vpxor xmm6, xmm6, xmm10)
+			a3(vpxor xmm7, xmm7, xmm11)
+			a3(vpaddq xmm8, xmm0, xmm6)
+			a3(vpaddq xmm9, xmm1, xmm7)
+			a3(vpsrlq xmm10, xmm8, 51)
+			a3(vpsrlq xmm11, xmm9, 51)
+			a3(vpsllq xmm8, xmm8, 13)
+			a3(vpsllq xmm9, xmm9, 13)
+			a3(vpxor xmm5, xmm5, xmm10)
+			a3(vpxor xmm4, xmm4, xmm11)
+			a3(vpxor xmm5, xmm5, xmm8)
+			a3(vpxor xmm4, xmm4, xmm9)
+			a3(vpaddq xmm10, xmm6, xmm5)
+			a3(vpaddq xmm11, xmm7, xmm4)
+			a3(vpsrlq xmm8, xmm10, 25)
+			a3(vpsrlq xmm9, xmm11, 25)
+			a3(vpsllq xmm10, xmm10, 39)
+			a3(vpsllq xmm11, xmm11, 39)
+			a3(vpxor xmm2, xmm2, xmm8)
+			a3(vpxor xmm3, xmm3, xmm9)
+			a3(vpxor xmm2, xmm2, xmm10)
+			a3(vpxor xmm3, xmm3, xmm11)
+			a3(vpaddq xmm8, xmm5, xmm2)
+			a3(vpaddq xmm9, xmm4, xmm3)
+			a3(vpshufd xmm8, xmm8, 0xb1)
+			a3(vpshufd xmm9, xmm9, 0xb1)
+			a3(vpxor xmm0, xmm0, xmm8)
+			a3(vpxor xmm1, xmm1, xmm9)
+			a2(vmovdqa xmm10, xmm2)
+			a2(vmovdqa xmm11, xmm3)
+			a4(vpalignr xmm2, xmm6, xmm7, 8)
+			a4(vpalignr xmm3, xmm7, xmm6, 8)
+			a4(vpalignr xmm6, xmm11, xmm10, 8)
+			a4(vpalignr xmm7, xmm10, xmm11, 8)
+			a2(sub rax, 2)
+			aj(ja scrypt_salsa64_avx_loop)
+		/* X += saved pre-round state */
+		a3(vpaddq xmm0,xmm0,[rsp+0])
+		a3(vpaddq xmm1,xmm1,[rsp+16])
+		a3(vpaddq xmm2,xmm2,[rsp+32])
+		a3(vpaddq xmm3,xmm3,[rsp+48])
+		a3(vpaddq xmm4,xmm4,[rsp+64])
+		a3(vpaddq xmm5,xmm5,[rsp+80])
+		a3(vpaddq xmm6,xmm6,[rsp+96])
+		a3(vpaddq xmm7,xmm7,[rsp+112])
+		/* output address = Bout + (((r8 + r9) & ~0xff) >> 1), i.e. block
+		   (i / 2) of the half selected by r8; r8 is toggled and r9 advanced
+		   for the next iteration, and the cmp result is consumed by the jne
+		   after the stores (vmovdqa does not change the flags) */
+		a2(lea rax,[r8+r9])
+		a2(xor r8,rcx)
+		a2(and rax,~0xff)
+		a2(add r9,128)
+		a2(shr rax,1)
+		a2(add rax, rdi)
+		a2(cmp r9,rcx)
+		a2(vmovdqa [rax+0],xmm0)
+		a2(vmovdqa [rax+16],xmm1)
+		a2(vmovdqa [rax+32],xmm2)
+		a2(vmovdqa [rax+48],xmm3)
+		a2(vmovdqa [rax+64],xmm4)
+		a2(vmovdqa [rax+80],xmm5)
+		a2(vmovdqa [rax+96],xmm6)
+		a2(vmovdqa [rax+112],xmm7)
+		aj(jne scrypt_ChunkMix_avx_loop)
+	/* restore the caller's stack pointer and frame */
+	a2(mov rsp, rbp)
+	a1(pop rbp)
+	a1(ret)
+asm_naked_fn_end(scrypt_ChunkMix_avx)
+
+#endif
+
+
+/* intrinsic */
+#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
+
+#define SCRYPT_SALSA64_AVX
+
+/*
+ * Intrinsics ChunkMix for Salsa64/8.
+ *
+ * Bout  output chunk, 2*r blocks
+ * Bin   input chunk, 2*r blocks
+ * Bxor  optional second input xor-ed into Bin block by block; may be NULL
+ * r     half the number of blocks per chunk
+ *
+ * A block is accessed as eight xmmi values. The running block X (x0..x7)
+ * starts as the last input block; for every input block it is xor-ed with
+ * that block, passed through 8 Salsa64 rounds, added to its pre-round value
+ * and stored to Bout, even-indexed results in the first r blocks and
+ * odd-indexed results in the last r blocks.
+ */
+static void asm_calling_convention
+scrypt_ChunkMix_avx(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
+	uint32_t i, blocksPerChunk = r * 2, half = 0;
+	xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
+	size_t rounds;
+
+	/* 1: X = B_{2r - 1} */
+	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
+	x0 = xmmp[0];
+	x1 = xmmp[1];
+	x2 = xmmp[2];
+	x3 = xmmp[3];
+	x4 = xmmp[4];
+	x5 = xmmp[5];
+	x6 = xmmp[6];
+	x7 = xmmp[7];
+
+	if (Bxor) {
+		xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
+		x0 = _mm_xor_si128(x0, xmmp[0]);
+		x1 = _mm_xor_si128(x1, xmmp[1]);
+		x2 = _mm_xor_si128(x2, xmmp[2]);
+		x3 = _mm_xor_si128(x3, xmmp[3]);
+		x4 = _mm_xor_si128(x4, xmmp[4]);
+		x5 = _mm_xor_si128(x5, xmmp[5]);
+		x6 = _mm_xor_si128(x6, xmmp[6]);
+		x7 = _mm_xor_si128(x7, xmmp[7]);
+	}
+
+	/* 2: for i = 0 to 2r - 1 do */
+	/* half alternates between 0 and r: the output-half offset for block i */
+	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
+		/* 3: X = H(X ^ B_i) */
+		xmmp = (xmmi *)scrypt_block(Bin, i);
+		x0 = _mm_xor_si128(x0, xmmp[0]);
+		x1 = _mm_xor_si128(x1, xmmp[1]);
+		x2 = _mm_xor_si128(x2, xmmp[2]);
+		x3 = _mm_xor_si128(x3, xmmp[3]);
+		x4 = _mm_xor_si128(x4, xmmp[4]);
+		x5 = _mm_xor_si128(x5, xmmp[5]);
+		x6 = _mm_xor_si128(x6, xmmp[6]);
+		x7 = _mm_xor_si128(x7, xmmp[7]);
+
+		if (Bxor) {
+			xmmp = (xmmi *)scrypt_block(Bxor, i);
+			x0 = _mm_xor_si128(x0, xmmp[0]);
+			x1 = _mm_xor_si128(x1, xmmp[1]);
+			x2 = _mm_xor_si128(x2, xmmp[2]);
+			x3 = _mm_xor_si128(x3, xmmp[3]);
+			x4 = _mm_xor_si128(x4, xmmp[4]);
+			x5 = _mm_xor_si128(x5, xmmp[5]);
+			x6 = _mm_xor_si128(x6, xmmp[6]);
+			x7 = _mm_xor_si128(x7, xmmp[7]);
+		}
+
+		/* remember the pre-round state for the addition after the rounds */
+		t0 = x0;
+		t1 = x1;
+		t2 = x2;
+		t3 = x3;
+		t4 = x4;
+		t5 = x5;
+		t6 = x6;
+		t7 = x7;
+
+		/* 8 rounds, two per iteration. Rotations: the 32-bit shuffle swaps
+		   the halves of each 64-bit lane (rotate by 32); the srli/slli pairs
+		   form rotate-left by 13 and by 39 */
+		for (rounds = 8; rounds; rounds -= 2) {
+			z0 = _mm_add_epi64(x0, x2);
+			z1 = _mm_add_epi64(x1, x3);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x6 = _mm_xor_si128(x6, z0);
+			x7 = _mm_xor_si128(x7, z1);
+
+			z0 = _mm_add_epi64(x6, x0);
+			z1 = _mm_add_epi64(x7, x1);
+			z2 = _mm_srli_epi64(z0, 64-13);
+			z3 = _mm_srli_epi64(z1, 64-13);
+			z0 = _mm_slli_epi64(z0, 13);
+			z1 = _mm_slli_epi64(z1, 13);
+			x4 = _mm_xor_si128(x4, z2);
+			x5 = _mm_xor_si128(x5, z3);
+			x4 = _mm_xor_si128(x4, z0);
+			x5 = _mm_xor_si128(x5, z1);
+
+			z0 = _mm_add_epi64(x4, x6);
+			z1 = _mm_add_epi64(x5, x7);
+			z2 = _mm_srli_epi64(z0, 64-39);
+			z3 = _mm_srli_epi64(z1, 64-39);
+			z0 = _mm_slli_epi64(z0, 39);
+			z1 = _mm_slli_epi64(z1, 39);
+			x2 = _mm_xor_si128(x2, z2);
+			x3 = _mm_xor_si128(x3, z3);
+			x2 = _mm_xor_si128(x2, z0);
+			x3 = _mm_xor_si128(x3, z1);
+
+			z0 = _mm_add_epi64(x2, x4);
+			z1 = _mm_add_epi64(x3, x5);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x0 = _mm_xor_si128(x0, z0);
+			x1 = _mm_xor_si128(x1, z1);
+
+			/* re-pair the 64-bit halves of x2/x3 and x6/x7 for the next round */
+			z0 = x2;
+			z1 = x3;
+			x2 = _mm_alignr_epi8(x6, x7, 8);
+			x3 = _mm_alignr_epi8(x7, x6, 8);
+			x6 = _mm_alignr_epi8(z1, z0, 8);
+			x7 = _mm_alignr_epi8(z0, z1, 8);
+
+			/* second round of the pair (x4 and x5 swap roles) */
+			z0 = _mm_add_epi64(x0, x2);
+			z1 = _mm_add_epi64(x1, x3);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x6 = _mm_xor_si128(x6, z0);
+			x7 = _mm_xor_si128(x7, z1);
+
+			z0 = _mm_add_epi64(x6, x0);
+			z1 = _mm_add_epi64(x7, x1);
+			z2 = _mm_srli_epi64(z0, 64-13);
+			z3 = _mm_srli_epi64(z1, 64-13);
+			z0 = _mm_slli_epi64(z0, 13);
+			z1 = _mm_slli_epi64(z1, 13);
+			x5 = _mm_xor_si128(x5, z2);
+			x4 = _mm_xor_si128(x4, z3);
+			x5 = _mm_xor_si128(x5, z0);
+			x4 = _mm_xor_si128(x4, z1);
+
+			z0 = _mm_add_epi64(x5, x6);
+			z1 = _mm_add_epi64(x4, x7);
+			z2 = _mm_srli_epi64(z0, 64-39);
+			z3 = _mm_srli_epi64(z1, 64-39);
+			z0 = _mm_slli_epi64(z0, 39);
+			z1 = _mm_slli_epi64(z1, 39);
+			x2 = _mm_xor_si128(x2, z2);
+			x3 = _mm_xor_si128(x3, z3);
+			x2 = _mm_xor_si128(x2, z0);
+			x3 = _mm_xor_si128(x3, z1);
+
+			z0 = _mm_add_epi64(x2, x5);
+			z1 = _mm_add_epi64(x3, x4);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x0 = _mm_xor_si128(x0, z0);
+			x1 = _mm_xor_si128(x1, z1);
+
+			z0 = x2;
+			z1 = x3;
+			x2 = _mm_alignr_epi8(x6, x7, 8);
+			x3 = _mm_alignr_epi8(x7, x6, 8);
+			x6 = _mm_alignr_epi8(z1, z0, 8);
+			x7 = _mm_alignr_epi8(z0, z1, 8);
+		}
+
+		/* X += pre-round state */
+		x0 = _mm_add_epi64(x0, t0);
+		x1 = _mm_add_epi64(x1, t1);
+		x2 = _mm_add_epi64(x2, t2);
+		x3 = _mm_add_epi64(x3, t3);
+		x4 = _mm_add_epi64(x4, t4);
+		x5 = _mm_add_epi64(x5, t5);
+		x6 = _mm_add_epi64(x6, t6);
+		x7 = _mm_add_epi64(x7, t7);
+
+		/* 4: Y_i = X */
+		/* 6: B'[0..r-1] = Y_even */
+		/* 6: B'[r..2r-1] = Y_odd */
+		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
+		xmmp[0] = x0;
+		xmmp[1] = x1;
+		xmmp[2] = x2;
+		xmmp[3] = x3;
+		xmmp[4] = x4;
+		xmmp[5] = x5;
+		xmmp[6] = x6;
+		xmmp[7] = x7;
+	}
+}
+
+#endif
+
+/*
+ * If either AVX ChunkMix above was compiled, record its name in SCRYPT_MIX
+ * and define SCRYPT_SALSA64_INCLUDED, the macro that the Salsa64 guards
+ * test when SCRYPT_CHOOSE_COMPILETIME is defined.
+ */
+#if defined(SCRYPT_SALSA64_AVX)
+	/* uses salsa64_core_tangle_sse2 */
+
+	#undef SCRYPT_MIX
+	#define SCRYPT_MIX "Salsa64/8-AVX"
+	#undef SCRYPT_SALSA64_INCLUDED
+	#define SCRYPT_SALSA64_INCLUDED
+#endif
--- a/algo/argon2/ar2/sj/scrypt-jane-mix_salsa64-avx2.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-mix_salsa64-avx2.h
@@ -0,0 +1,221 @@
+/* x64 */
+#if defined(X86_64ASM_AVX2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
+
+#define SCRYPT_SALSA64_AVX2
+/* x64 asm ChunkMix (Salsa64/8, AVX2). X starts as the last 128-byte block of Bin (xored with the same block of Bxor when Bxor is non-NULL); every block i is mixed as X = H(X ^ B_i) and stored to Bout block (i / 2) + (i odd ? r : 0). */
+asm_naked_fn_proto(void, scrypt_ChunkMix_avx2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
+asm_naked_fn(scrypt_ChunkMix_avx2) /* the body reads rdi, rsi, rdx and ecx as Bout, Bin, Bxor and r */
+	a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
+	a2(shl rcx,7) /* rcx = 2 * r * 128 = chunk size in bytes */
+	a2(lea r9,[rcx-128]) /* byte offset of the last 128-byte block */
+	a2(lea rax,[rsi+r9]) /* rax = address of the last block of Bin */
+	a2(lea r9,[rdx+r9]) /* r9 = address of the last block of Bxor (dereferenced only when Bxor != NULL) */
+	a2(and rdx, rdx) /* ZF = (Bxor == NULL); the vector loads below do not modify flags */
+	a2(vmovdqa ymm0,[rax+0])
+	a2(vmovdqa ymm1,[rax+32])
+	a2(vmovdqa ymm2,[rax+64])
+	a2(vmovdqa ymm3,[rax+96])
+	aj(jz scrypt_ChunkMix_avx2_no_xor1) /* no Bxor: skip the extra xor */
+	a3(vpxor ymm0,ymm0,[r9+0])
+	a3(vpxor ymm1,ymm1,[r9+32])
+	a3(vpxor ymm2,ymm2,[r9+64])
+	a3(vpxor ymm3,ymm3,[r9+96])
+	a1(scrypt_ChunkMix_avx2_no_xor1:)
+	a2(xor r9,r9) /* r9 = byte offset of the current input block */
+	a2(xor r8,r8) /* r8 toggles between 0 and rcx to pick the output half */
+	a1(scrypt_ChunkMix_avx2_loop:)
+		a2(and rdx, rdx) /* ZF = (Bxor == NULL), consumed by the jz after the xors */
+		a3(vpxor ymm0,ymm0,[rsi+r9+0])
+		a3(vpxor ymm1,ymm1,[rsi+r9+32])
+		a3(vpxor ymm2,ymm2,[rsi+r9+64])
+		a3(vpxor ymm3,ymm3,[rsi+r9+96])
+		aj(jz scrypt_ChunkMix_avx2_no_xor2)
+		a3(vpxor ymm0,ymm0,[rdx+r9+0])
+		a3(vpxor ymm1,ymm1,[rdx+r9+32])
+		a3(vpxor ymm2,ymm2,[rdx+r9+64])
+		a3(vpxor ymm3,ymm3,[rdx+r9+96])
+		a1(scrypt_ChunkMix_avx2_no_xor2:)
+		a2(vmovdqa ymm6,ymm0) /* ymm6..ymm9 keep the round input for the feed-forward add */
+		a2(vmovdqa ymm7,ymm1)
+		a2(vmovdqa ymm8,ymm2)
+		a2(vmovdqa ymm9,ymm3)
+		a2(mov rax,4) /* 4 passes of two rounds each = 8 rounds */
+		a1(scrypt_salsa64_avx2_loop: )
+			a3(vpaddq ymm4, ymm1, ymm0) /* ymm3 ^= rotl32(ymm0 + ymm1) */
+			a3(vpshufd ymm4, ymm4, 0xb1) /* swap the 32-bit halves of each qword = rotate by 32 */
+			a3(vpxor ymm3, ymm3, ymm4)
+			a3(vpaddq ymm4, ymm0, ymm3) /* ymm2 ^= rotl13(ymm3 + ymm0), built from a right shift by 51 and a left shift by 13 */
+			a3(vpsrlq ymm5, ymm4, 51)
+			a3(vpxor ymm2, ymm2, ymm5)
+			a3(vpsllq ymm4, ymm4, 13)
+			a3(vpxor ymm2, ymm2, ymm4)
+			a3(vpaddq ymm4, ymm3, ymm2) /* ymm1 ^= rotl39(ymm2 + ymm3) */
+			a3(vpsrlq ymm5, ymm4, 25)
+			a3(vpxor ymm1, ymm1, ymm5)
+			a3(vpsllq ymm4, ymm4, 39)
+			a3(vpxor ymm1, ymm1, ymm4)
+			a3(vpaddq ymm4, ymm2, ymm1) /* ymm0 ^= rotl32(ymm1 + ymm2); the sum is taken before any lane permute */
+			a3(vpshufd ymm4, ymm4, 0xb1)
+			a3(vpermq ymm1, ymm1, 0x39) /* lane permutes for the second round; row 2 continues in ymm10 */
+			a3(vpermq ymm10, ymm2, 0x4e)
+			a3(vpxor ymm0, ymm0, ymm4)
+			a3(vpermq ymm3, ymm3, 0x93)
+			a3(vpaddq ymm4, ymm3, ymm0) /* second round: ymm1 ^= rotl32(ymm0 + ymm3) */
+			a3(vpshufd ymm4, ymm4, 0xb1)
+			a3(vpxor ymm1, ymm1, ymm4)
+			a3(vpaddq ymm4, ymm0, ymm1) /* ymm10 ^= rotl13(ymm1 + ymm0) */
+			a3(vpsrlq ymm5, ymm4, 51)
+			a3(vpxor ymm10, ymm10, ymm5)
+			a3(vpsllq ymm4, ymm4, 13)
+			a3(vpxor ymm10, ymm10, ymm4)
+			a3(vpaddq ymm4, ymm1, ymm10) /* ymm3 ^= rotl39(ymm10 + ymm1) */
+			a3(vpsrlq ymm5, ymm4, 25)
+			a3(vpxor ymm3, ymm3, ymm5)
+			a3(vpsllq ymm4, ymm4, 39)
+			a3(vpermq ymm1, ymm1, 0x93) /* ymm1 is no longer read in this round, so it is permuted back early */
+			a3(vpxor ymm3, ymm3, ymm4)
+			a3(vpermq ymm2, ymm10, 0x4e) /* row 2 returns to ymm2 with its lanes permuted back */
+			a3(vpaddq ymm4, ymm10, ymm3) /* ymm0 ^= rotl32(ymm3 + ymm10), using the unpermuted values */
+			a3(vpshufd ymm4, ymm4, 0xb1)
+			a3(vpermq ymm3, ymm3, 0x39)
+			a3(vpxor ymm0, ymm0, ymm4)
+			a1(dec rax)
+			aj(jnz scrypt_salsa64_avx2_loop)
+		a3(vpaddq ymm0,ymm0,ymm6) /* feed-forward: add the saved round input */
+		a3(vpaddq ymm1,ymm1,ymm7)
+		a3(vpaddq ymm2,ymm2,ymm8)
+		a3(vpaddq ymm3,ymm3,ymm9)
+		a2(lea rax,[r8+r9]) /* rax..: byte offset of output block (i / 2) + half, computed as ((r8 + r9) & ~0xff) / 2 */
+		a2(xor r8,rcx) /* flip the output half for the next block */
+		a2(and rax,~0xff)
+		a2(add r9,128) /* advance to the next input block */
+		a2(shr rax,1)
+		a2(add rax, rdi) /* rax = destination address inside Bout */
+		a2(cmp r9,rcx) /* flags consumed by the jne below; the vector stores do not modify them */
+		a2(vmovdqa [rax+0],ymm0)
+		a2(vmovdqa [rax+32],ymm1)
+		a2(vmovdqa [rax+64],ymm2)
+		a2(vmovdqa [rax+96],ymm3)
+		aj(jne scrypt_ChunkMix_avx2_loop)
+	a1(vzeroupper) /* zero the upper ymm halves before returning */
+	a1(ret)
+asm_naked_fn_end(scrypt_ChunkMix_avx2)
+
+#endif
+
+
+/* intrinsic */
+#if defined(X86_INTRINSIC_AVX2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
+
+#define SCRYPT_SALSA64_AVX2
+/* Intrinsic ChunkMix (Salsa64/8, AVX2). X = last block of Bin (^ Bxor if non-NULL); for each block i, X = H(X ^ B_i) is written to Bout block (i / 2) + half. Both half-rounds finish all four add/rotate/xor steps before permuting lanes, as the asm variant does. */
+static void asm_calling_convention
+scrypt_ChunkMix_avx2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
+	uint32_t i, blocksPerChunk = r * 2, half = 0;
+	ymmi *ymmp,y0,y1,y2,y3,t0,t1,t2,t3,z0,z1;
+	size_t rounds;
+
+	/* 1: X = B_{2r - 1} */
+	ymmp = (ymmi *)scrypt_block(Bin, blocksPerChunk - 1);
+	y0 = ymmp[0];
+	y1 = ymmp[1];
+	y2 = ymmp[2];
+	y3 = ymmp[3];
+
+	if (Bxor) { /* optional second input, xored into X block by block */
+		ymmp = (ymmi *)scrypt_block(Bxor, blocksPerChunk - 1);
+		y0 = _mm256_xor_si256(y0, ymmp[0]);
+		y1 = _mm256_xor_si256(y1, ymmp[1]);
+		y2 = _mm256_xor_si256(y2, ymmp[2]);
+		y3 = _mm256_xor_si256(y3, ymmp[3]);
+	}
+
+	/* 2: for i = 0 to 2r - 1 do */
+	for (i = 0; i < blocksPerChunk; i++, half ^= r) { /* half alternates 0, r, 0, r, ... */
+		/* 3: X = H(X ^ B_i) */
+		ymmp = (ymmi *)scrypt_block(Bin, i);
+		y0 = _mm256_xor_si256(y0, ymmp[0]);
+		y1 = _mm256_xor_si256(y1, ymmp[1]);
+		y2 = _mm256_xor_si256(y2, ymmp[2]);
+		y3 = _mm256_xor_si256(y3, ymmp[3]);
+
+		if (Bxor) {
+			ymmp = (ymmi *)scrypt_block(Bxor, i);
+			y0 = _mm256_xor_si256(y0, ymmp[0]);
+			y1 = _mm256_xor_si256(y1, ymmp[1]);
+			y2 = _mm256_xor_si256(y2, ymmp[2]);
+			y3 = _mm256_xor_si256(y3, ymmp[3]);
+		}
+
+		t0 = y0; /* keep the round input for the feed-forward add below */
+		t1 = y1;
+		t2 = y2;
+		t3 = y3;
+
+		for (rounds = 8; rounds; rounds -= 2) { /* two rounds per pass */
+			z0 = _mm256_add_epi64(y0, y1); /* y3 ^= rotl32(y0 + y1) */
+			z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); /* swap 32-bit halves of each qword = rotate by 32 */
+			y3 = _mm256_xor_si256(y3, z0);
+			z0 = _mm256_add_epi64(y3, y0); /* y2 ^= rotl13(y3 + y0) */
+			z1 = _mm256_srli_epi64(z0, 64-13);
+			y2 = _mm256_xor_si256(y2, z1);
+			z0 = _mm256_slli_epi64(z0, 13);
+			y2 = _mm256_xor_si256(y2, z0);
+			z0 = _mm256_add_epi64(y2, y3); /* y1 ^= rotl39(y2 + y3) */
+			z1 = _mm256_srli_epi64(z0, 64-39);
+			y1 = _mm256_xor_si256(y1, z1);
+			z0 = _mm256_slli_epi64(z0, 39);
+			y1 = _mm256_xor_si256(y1, z0);
+			z0 = _mm256_add_epi64(y1, y2); /* y0 ^= rotl32(y1 + y2): must read y1/y2 before they are lane-permuted */
+			z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			y0 = _mm256_xor_si256(y0, z0);
+			y1 = _mm256_permute4x64_epi64(y1, _MM_SHUFFLE(0,3,2,1)); /* lane permutes for the second round */
+			y2 = _mm256_permute4x64_epi64(y2, _MM_SHUFFLE(1,0,3,2));
+			y3 = _mm256_permute4x64_epi64(y3, _MM_SHUFFLE(2,1,0,3));
+			z0 = _mm256_add_epi64(y0, y3); /* second round: y1 ^= rotl32(y0 + y3) */
+			z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			y1 = _mm256_xor_si256(y1, z0);
+			z0 = _mm256_add_epi64(y1, y0); /* y2 ^= rotl13(y1 + y0) */
+			z1 = _mm256_srli_epi64(z0, 64-13);
+			y2 = _mm256_xor_si256(y2, z1);
+			z0 = _mm256_slli_epi64(z0, 13);
+			y2 = _mm256_xor_si256(y2, z0);
+			z0 = _mm256_add_epi64(y2, y1); /* y3 ^= rotl39(y2 + y1) */
+			z1 = _mm256_srli_epi64(z0, 64-39);
+			y3 = _mm256_xor_si256(y3, z1);
+			z0 = _mm256_slli_epi64(z0, 39);
+			y3 = _mm256_xor_si256(y3, z0);
+			z0 = _mm256_add_epi64(y3, y2); /* y0 ^= rotl32(y3 + y2) */
+			z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			y0 = _mm256_xor_si256(y0, z0);
+			y1 = _mm256_permute4x64_epi64(y1, _MM_SHUFFLE(2,1,0,3)); /* undo the lane permutes */
+			y2 = _mm256_permute4x64_epi64(y2, _MM_SHUFFLE(1,0,3,2));
+			y3 = _mm256_permute4x64_epi64(y3, _MM_SHUFFLE(0,3,2,1));
+		}
+
+		y0 = _mm256_add_epi64(y0, t0); /* feed-forward */
+		y1 = _mm256_add_epi64(y1, t1);
+		y2 = _mm256_add_epi64(y2, t2);
+		y3 = _mm256_add_epi64(y3, t3);
+
+		/* 4: Y_i = X */
+		/* 6: B'[0..r-1] = Y_even */
+		/* 6: B'[r..2r-1] = Y_odd */
+		ymmp = (ymmi *)scrypt_block(Bout, (i / 2) + half);
+		ymmp[0] = y0;
+		ymmp[1] = y1;
+		ymmp[2] = y2;
+		ymmp[3] = y3;
+	}
+}
+
+#endif
+
+#if defined(SCRYPT_SALSA64_AVX2)
+	/* uses salsa64_core_tangle_sse2 */
+
+	#undef SCRYPT_MIX
+	#define SCRYPT_MIX "Salsa64/8-AVX2" /* name string for this mix variant; replaces any earlier SCRYPT_MIX */
+	#undef SCRYPT_SALSA64_INCLUDED
+	#define SCRYPT_SALSA64_INCLUDED /* checked by the variants' #if guards when SCRYPT_CHOOSE_COMPILETIME is defined */
+#endif
--- a/algo/argon2/ar2/sj/scrypt-jane-mix_salsa64-sse2.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-mix_salsa64-sse2.h
@@ -0,0 +1,449 @@
+/* x64 */
+#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
+
+#define SCRYPT_SALSA64_SSE2
+/* x64 asm ChunkMix (Salsa64/8, SSE2). X starts as the last 128-byte block of Bin (xored with the same block of Bxor when Bxor is non-NULL); every block i is mixed as X = H(X ^ B_i) and stored to Bout block (i / 2) + (i odd ? r : 0). */
+asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
+asm_naked_fn(scrypt_ChunkMix_sse2) /* the body reads rdi, rsi, rdx and ecx as Bout, Bin, Bxor and r */
+	a1(push rbp)
+	a2(mov rbp, rsp) /* rbp remembers the caller's rsp; restored in the epilogue */
+	a2(and rsp, ~63) /* align the stack down to 64 bytes */
+	a2(sub rsp, 128) /* 128-byte scratch area holding the saved round input */
+	a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
+	a2(shl rcx,7) /* rcx = 2 * r * 128 = chunk size in bytes */
+	a2(lea r9,[rcx-128]) /* byte offset of the last 128-byte block */
+	a2(lea rax,[rsi+r9]) /* rax = address of the last block of Bin */
+	a2(lea r9,[rdx+r9]) /* r9 = address of the last block of Bxor (dereferenced only when Bxor != NULL) */
+	a2(and rdx, rdx) /* ZF = (Bxor == NULL); the vector loads below do not modify flags */
+	a2(movdqa xmm0,[rax+0])
+	a2(movdqa xmm1,[rax+16])
+	a2(movdqa xmm2,[rax+32])
+	a2(movdqa xmm3,[rax+48])
+	a2(movdqa xmm4,[rax+64])
+	a2(movdqa xmm5,[rax+80])
+	a2(movdqa xmm6,[rax+96])
+	a2(movdqa xmm7,[rax+112])
+	aj(jz scrypt_ChunkMix_sse2_no_xor1) /* no Bxor: skip the extra xor */
+	a2(pxor xmm0,[r9+0])
+	a2(pxor xmm1,[r9+16])
+	a2(pxor xmm2,[r9+32])
+	a2(pxor xmm3,[r9+48])
+	a2(pxor xmm4,[r9+64])
+	a2(pxor xmm5,[r9+80])
+	a2(pxor xmm6,[r9+96])
+	a2(pxor xmm7,[r9+112])
+	a1(scrypt_ChunkMix_sse2_no_xor1:)
+	a2(xor r9,r9) /* r9 = byte offset of the current input block */
+	a2(xor r8,r8) /* r8 toggles between 0 and rcx to pick the output half */
+	a1(scrypt_ChunkMix_sse2_loop:)
+		a2(and rdx, rdx) /* ZF = (Bxor == NULL), consumed by the jz after the xors */
+		a2(pxor xmm0,[rsi+r9+0])
+		a2(pxor xmm1,[rsi+r9+16])
+		a2(pxor xmm2,[rsi+r9+32])
+		a2(pxor xmm3,[rsi+r9+48])
+		a2(pxor xmm4,[rsi+r9+64])
+		a2(pxor xmm5,[rsi+r9+80])
+		a2(pxor xmm6,[rsi+r9+96])
+		a2(pxor xmm7,[rsi+r9+112])
+		aj(jz scrypt_ChunkMix_sse2_no_xor2)
+		a2(pxor xmm0,[rdx+r9+0])
+		a2(pxor xmm1,[rdx+r9+16])
+		a2(pxor xmm2,[rdx+r9+32])
+		a2(pxor xmm3,[rdx+r9+48])
+		a2(pxor xmm4,[rdx+r9+64])
+		a2(pxor xmm5,[rdx+r9+80])
+		a2(pxor xmm6,[rdx+r9+96])
+		a2(pxor xmm7,[rdx+r9+112])
+		a1(scrypt_ChunkMix_sse2_no_xor2:)
+		a2(movdqa [rsp+0],xmm0) /* save the round input on the stack for the feed-forward add */
+		a2(movdqa [rsp+16],xmm1)
+		a2(movdqa [rsp+32],xmm2)
+		a2(movdqa [rsp+48],xmm3)
+		a2(movdqa [rsp+64],xmm4)
+		a2(movdqa [rsp+80],xmm5)
+		a2(movdqa [rsp+96],xmm6)
+		a2(movdqa [rsp+112],xmm7)
+		a2(mov rax,8) /* round counter, reduced by 2 per pass */
+		a1(scrypt_salsa64_sse2_loop: )
+			a2(movdqa xmm8, xmm0) /* xmm6,xmm7 ^= rotl32(xmm0,xmm1 + xmm2,xmm3) */
+			a2(movdqa xmm9, xmm1)
+			a2(paddq xmm8, xmm2)
+			a2(paddq xmm9, xmm3)
+			a3(pshufd xmm8, xmm8, 0xb1) /* swap the 32-bit halves of each qword = rotate by 32 */
+			a3(pshufd xmm9, xmm9, 0xb1)
+			a2(pxor xmm6, xmm8)
+			a2(pxor xmm7, xmm9)
+			a2(movdqa xmm10, xmm0) /* xmm4,xmm5 ^= rotl13(xmm0,xmm1 + xmm6,xmm7) */
+			a2(movdqa xmm11, xmm1)
+			a2(paddq xmm10, xmm6)
+			a2(paddq xmm11, xmm7)
+			a2(movdqa xmm8, xmm10)
+			a2(movdqa xmm9, xmm11)
+			a2(psrlq xmm10, 51)
+			a2(psrlq xmm11, 51)
+			a2(psllq xmm8, 13)
+			a2(psllq xmm9, 13)
+			a2(pxor xmm4, xmm10)
+			a2(pxor xmm5, xmm11)
+			a2(pxor xmm4, xmm8)
+			a2(pxor xmm5, xmm9)
+			a2(movdqa xmm10, xmm6) /* xmm2,xmm3 ^= rotl39(xmm6,xmm7 + xmm4,xmm5) */
+			a2(movdqa xmm11, xmm7)
+			a2(paddq xmm10, xmm4)
+			a2(paddq xmm11, xmm5)
+			a2(movdqa xmm8, xmm10)
+			a2(movdqa xmm9, xmm11)
+			a2(psrlq xmm10, 25)
+			a2(psrlq xmm11, 25)
+			a2(psllq xmm8, 39)
+			a2(psllq xmm9, 39)
+			a2(pxor xmm2, xmm10)
+			a2(pxor xmm3, xmm11)
+			a2(pxor xmm2, xmm8)
+			a2(pxor xmm3, xmm9)
+			a2(movdqa xmm8, xmm4) /* xmm0,xmm1 ^= rotl32(xmm4,xmm5 + xmm2,xmm3) */
+			a2(movdqa xmm9, xmm5)
+			a2(paddq xmm8, xmm2)
+			a2(paddq xmm9, xmm3)
+			a3(pshufd xmm8, xmm8, 0xb1)
+			a3(pshufd xmm9, xmm9, 0xb1)
+			a2(pxor xmm0, xmm8)
+			a2(pxor xmm1, xmm9)
+			a2(movdqa xmm8, xmm2) /* re-pair qwords: xmm2,xmm3 are rebuilt from old xmm6,xmm7 and xmm6,xmm7 from old xmm2,xmm3 */
+			a2(movdqa xmm9, xmm3)
+			a2(movdqa xmm10, xmm6)
+			a2(movdqa xmm11, xmm7)
+			a2(movdqa xmm2, xmm7)
+			a2(movdqa xmm3, xmm6)
+			a2(punpcklqdq xmm10, xmm6)
+			a2(punpcklqdq xmm11, xmm7)
+			a2(movdqa xmm6, xmm8)
+			a2(movdqa xmm7, xmm9)
+			a2(punpcklqdq xmm9, xmm9)
+			a2(punpcklqdq xmm8, xmm8)
+			a2(punpckhqdq xmm2, xmm10)
+			a2(punpckhqdq xmm3, xmm11)
+			a2(punpckhqdq xmm6, xmm9)
+			a2(punpckhqdq xmm7, xmm8)
+			a2(sub rax, 2) /* sets the flags tested by ja at the loop bottom; nothing in between writes flags */
+			a2(movdqa xmm8, xmm0) /* second round: xmm6,xmm7 ^= rotl32(xmm0,xmm1 + xmm2,xmm3) */
+			a2(movdqa xmm9, xmm1)
+			a2(paddq xmm8, xmm2)
+			a2(paddq xmm9, xmm3)
+			a3(pshufd xmm8, xmm8, 0xb1)
+			a3(pshufd xmm9, xmm9, 0xb1)
+			a2(pxor xmm6, xmm8)
+			a2(pxor xmm7, xmm9)
+			a2(movdqa xmm10, xmm0) /* xmm5,xmm4 ^= rotl13(xmm0,xmm1 + xmm6,xmm7): xmm4 and xmm5 trade roles in this round */
+			a2(movdqa xmm11, xmm1)
+			a2(paddq xmm10, xmm6)
+			a2(paddq xmm11, xmm7)
+			a2(movdqa xmm8, xmm10)
+			a2(movdqa xmm9, xmm11)
+			a2(psrlq xmm10, 51)
+			a2(psrlq xmm11, 51)
+			a2(psllq xmm8, 13)
+			a2(psllq xmm9, 13)
+			a2(pxor xmm5, xmm10)
+			a2(pxor xmm4, xmm11)
+			a2(pxor xmm5, xmm8)
+			a2(pxor xmm4, xmm9)
+			a2(movdqa xmm10, xmm6) /* xmm2,xmm3 ^= rotl39(xmm6,xmm7 + xmm5,xmm4) */
+			a2(movdqa xmm11, xmm7)
+			a2(paddq xmm10, xmm5)
+			a2(paddq xmm11, xmm4)
+			a2(movdqa xmm8, xmm10)
+			a2(movdqa xmm9, xmm11)
+			a2(psrlq xmm10, 25)
+			a2(psrlq xmm11, 25)
+			a2(psllq xmm8, 39)
+			a2(psllq xmm9, 39)
+			a2(pxor xmm2, xmm10)
+			a2(pxor xmm3, xmm11)
+			a2(pxor xmm2, xmm8)
+			a2(pxor xmm3, xmm9)
+			a2(movdqa xmm8, xmm5) /* xmm0,xmm1 ^= rotl32(xmm5,xmm4 + xmm2,xmm3) */
+			a2(movdqa xmm9, xmm4)
+			a2(paddq xmm8, xmm2)
+			a2(paddq xmm9, xmm3)
+			a3(pshufd xmm8, xmm8, 0xb1)
+			a3(pshufd xmm9, xmm9, 0xb1)
+			a2(pxor xmm0, xmm8)
+			a2(pxor xmm1, xmm9)
+			a2(movdqa xmm8, xmm2) /* same qword re-pairing as after the first round */
+			a2(movdqa xmm9, xmm3)
+			a2(movdqa xmm10, xmm6)
+			a2(movdqa xmm11, xmm7)
+			a2(movdqa xmm2, xmm7)
+			a2(movdqa xmm3, xmm6)
+			a2(punpcklqdq xmm10, xmm6)
+			a2(punpcklqdq xmm11, xmm7)
+			a2(movdqa xmm6, xmm8)
+			a2(movdqa xmm7, xmm9)
+			a2(punpcklqdq xmm9, xmm9)
+			a2(punpcklqdq xmm8, xmm8)
+			a2(punpckhqdq xmm2, xmm10)
+			a2(punpckhqdq xmm3, xmm11)
+			a2(punpckhqdq xmm6, xmm9)
+			a2(punpckhqdq xmm7, xmm8)
+			aj(ja scrypt_salsa64_sse2_loop)
+		a2(paddq xmm0,[rsp+0]) /* feed-forward: add the saved round input */
+		a2(paddq xmm1,[rsp+16])
+		a2(paddq xmm2,[rsp+32])
+		a2(paddq xmm3,[rsp+48])
+		a2(paddq xmm4,[rsp+64])
+		a2(paddq xmm5,[rsp+80])
+		a2(paddq xmm6,[rsp+96])
+		a2(paddq xmm7,[rsp+112])
+		a2(lea rax,[r8+r9]) /* rax..: byte offset of output block (i / 2) + half, computed as ((r8 + r9) & ~0xff) / 2 */
+		a2(xor r8,rcx) /* flip the output half for the next block */
+		a2(and rax,~0xff)
+		a2(add r9,128) /* advance to the next input block */
+		a2(shr rax,1)
+		a2(add rax, rdi) /* rax = destination address inside Bout */
+		a2(cmp r9,rcx) /* flags consumed by the jne below; the vector stores do not modify them */
+		a2(movdqa [rax+0],xmm0)
+		a2(movdqa [rax+16],xmm1)
+		a2(movdqa [rax+32],xmm2)
+		a2(movdqa [rax+48],xmm3)
+		a2(movdqa [rax+64],xmm4)
+		a2(movdqa [rax+80],xmm5)
+		a2(movdqa [rax+96],xmm6)
+		a2(movdqa [rax+112],xmm7)
+		aj(jne scrypt_ChunkMix_sse2_loop)
+	a2(mov rsp, rbp) /* drop the aligned scratch area */
+	a1(pop rbp)
+	a1(ret)
+asm_naked_fn_end(scrypt_ChunkMix_sse2)
+
+#endif
+
+
+/* intrinsic */
+#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
+
+#define SCRYPT_SALSA64_SSE2
+/* Intrinsic ChunkMix (Salsa64/8, SSE2). X = last block of Bin (^ Bxor if non-NULL); for each block i, X = H(X ^ B_i) is written to Bout block (i / 2) + half. The eight xmm values are handled as four register pairs (x0,x1) (x2,x3) (x4,x5) (x6,x7). */
+static void asm_calling_convention
+scrypt_ChunkMix_sse2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
+	uint32_t i, blocksPerChunk = r * 2, half = 0;
+	xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
+	size_t rounds;
+
+	/* 1: X = B_{2r - 1} */
+	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
+	x0 = xmmp[0];
+	x1 = xmmp[1];
+	x2 = xmmp[2];
+	x3 = xmmp[3];
+	x4 = xmmp[4];
+	x5 = xmmp[5];
+	x6 = xmmp[6];
+	x7 = xmmp[7];
+
+	if (Bxor) { /* optional second input, xored into X block by block */
+		xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
+		x0 = _mm_xor_si128(x0, xmmp[0]);
+		x1 = _mm_xor_si128(x1, xmmp[1]);
+		x2 = _mm_xor_si128(x2, xmmp[2]);
+		x3 = _mm_xor_si128(x3, xmmp[3]);
+		x4 = _mm_xor_si128(x4, xmmp[4]);
+		x5 = _mm_xor_si128(x5, xmmp[5]);
+		x6 = _mm_xor_si128(x6, xmmp[6]);
+		x7 = _mm_xor_si128(x7, xmmp[7]);
+	}
+
+	/* 2: for i = 0 to 2r - 1 do */
+	for (i = 0; i < blocksPerChunk; i++, half ^= r) { /* half alternates 0, r, 0, r, ... */
+		/* 3: X = H(X ^ B_i) */
+		xmmp = (xmmi *)scrypt_block(Bin, i);
+		x0 = _mm_xor_si128(x0, xmmp[0]);
+		x1 = _mm_xor_si128(x1, xmmp[1]);
+		x2 = _mm_xor_si128(x2, xmmp[2]);
+		x3 = _mm_xor_si128(x3, xmmp[3]);
+		x4 = _mm_xor_si128(x4, xmmp[4]);
+		x5 = _mm_xor_si128(x5, xmmp[5]);
+		x6 = _mm_xor_si128(x6, xmmp[6]);
+		x7 = _mm_xor_si128(x7, xmmp[7]);
+
+		if (Bxor) {
+			xmmp = (xmmi *)scrypt_block(Bxor, i);
+			x0 = _mm_xor_si128(x0, xmmp[0]);
+			x1 = _mm_xor_si128(x1, xmmp[1]);
+			x2 = _mm_xor_si128(x2, xmmp[2]);
+			x3 = _mm_xor_si128(x3, xmmp[3]);
+			x4 = _mm_xor_si128(x4, xmmp[4]);
+			x5 = _mm_xor_si128(x5, xmmp[5]);
+			x6 = _mm_xor_si128(x6, xmmp[6]);
+			x7 = _mm_xor_si128(x7, xmmp[7]);
+		}
+
+		t0 = x0; /* keep the round input for the feed-forward add below */
+		t1 = x1;
+		t2 = x2;
+		t3 = x3;
+		t4 = x4;
+		t5 = x5;
+		t6 = x6;
+		t7 = x7;
+
+		for (rounds = 8; rounds; rounds -= 2) { /* two rounds per pass */
+			z0 = _mm_add_epi64(x0, x2); /* x6,x7 ^= rotl32(x0,x1 + x2,x3) */
+			z1 = _mm_add_epi64(x1, x3);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); /* swap 32-bit halves of each qword = rotate by 32 */
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x6 = _mm_xor_si128(x6, z0);
+			x7 = _mm_xor_si128(x7, z1);
+
+			z0 = _mm_add_epi64(x6, x0); /* x4,x5 ^= rotl13(x6,x7 + x0,x1) */
+			z1 = _mm_add_epi64(x7, x1);
+			z2 = _mm_srli_epi64(z0, 64-13);
+			z3 = _mm_srli_epi64(z1, 64-13);
+			z0 = _mm_slli_epi64(z0, 13);
+			z1 = _mm_slli_epi64(z1, 13);
+			x4 = _mm_xor_si128(x4, z2);
+			x5 = _mm_xor_si128(x5, z3);
+			x4 = _mm_xor_si128(x4, z0);
+			x5 = _mm_xor_si128(x5, z1);
+
+			z0 = _mm_add_epi64(x4, x6); /* x2,x3 ^= rotl39(x4,x5 + x6,x7) */
+			z1 = _mm_add_epi64(x5, x7);
+			z2 = _mm_srli_epi64(z0, 64-39);
+			z3 = _mm_srli_epi64(z1, 64-39);
+			z0 = _mm_slli_epi64(z0, 39);
+			z1 = _mm_slli_epi64(z1, 39);
+			x2 = _mm_xor_si128(x2, z2);
+			x3 = _mm_xor_si128(x3, z3);
+			x2 = _mm_xor_si128(x2, z0);
+			x3 = _mm_xor_si128(x3, z1);
+
+			z0 = _mm_add_epi64(x2, x4); /* x0,x1 ^= rotl32(x2,x3 + x4,x5) */
+			z1 = _mm_add_epi64(x3, x5);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x0 = _mm_xor_si128(x0, z0);
+			x1 = _mm_xor_si128(x1, z1);
+
+			z0 = x4; /* permute for the next round: swap x4 with x5, re-pair the qwords of x2,x3 and x6,x7 */
+			z1 = x5;
+			z2 = x2;
+			z3 = x3;
+			x4 = z1;
+			x5 = z0;
+			x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6));
+			x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7));
+			x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3));
+			x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2));
+
+			z0 = _mm_add_epi64(x0, x2); /* second round: same four steps on the permuted registers */
+			z1 = _mm_add_epi64(x1, x3);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x6 = _mm_xor_si128(x6, z0);
+			x7 = _mm_xor_si128(x7, z1);
+
+			z0 = _mm_add_epi64(x6, x0);
+			z1 = _mm_add_epi64(x7, x1);
+			z2 = _mm_srli_epi64(z0, 64-13);
+			z3 = _mm_srli_epi64(z1, 64-13);
+			z0 = _mm_slli_epi64(z0, 13);
+			z1 = _mm_slli_epi64(z1, 13);
+			x4 = _mm_xor_si128(x4, z2);
+			x5 = _mm_xor_si128(x5, z3);
+			x4 = _mm_xor_si128(x4, z0);
+			x5 = _mm_xor_si128(x5, z1);
+
+			z0 = _mm_add_epi64(x4, x6);
+			z1 = _mm_add_epi64(x5, x7);
+			z2 = _mm_srli_epi64(z0, 64-39);
+			z3 = _mm_srli_epi64(z1, 64-39);
+			z0 = _mm_slli_epi64(z0, 39);
+			z1 = _mm_slli_epi64(z1, 39);
+			x2 = _mm_xor_si128(x2, z2);
+			x3 = _mm_xor_si128(x3, z3);
+			x2 = _mm_xor_si128(x2, z0);
+			x3 = _mm_xor_si128(x3, z1);
+
+			z0 = _mm_add_epi64(x2, x4);
+			z1 = _mm_add_epi64(x3, x5);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x0 = _mm_xor_si128(x0, z0);
+			x1 = _mm_xor_si128(x1, z1);
+
+			z0 = x4; /* same permutation again; the x4/x5 swap returns them to their original registers */
+			z1 = x5;
+			z2 = x2;
+			z3 = x3;
+			x4 = z1;
+			x5 = z0;
+			x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6));
+			x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7));
+			x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3));
+			x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2));
+		}
+
+		x0 = _mm_add_epi64(x0, t0); /* feed-forward */
+		x1 = _mm_add_epi64(x1, t1);
+		x2 = _mm_add_epi64(x2, t2);
+		x3 = _mm_add_epi64(x3, t3);
+		x4 = _mm_add_epi64(x4, t4);
+		x5 = _mm_add_epi64(x5, t5);
+		x6 = _mm_add_epi64(x6, t6);
+		x7 = _mm_add_epi64(x7, t7);
+
+		/* 4: Y_i = X */
+		/* 6: B'[0..r-1] = Y_even */
+		/* 6: B'[r..2r-1] = Y_odd */
+		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
+		xmmp[0] = x0;
+		xmmp[1] = x1;
+		xmmp[2] = x2;
+		xmmp[3] = x3;
+		xmmp[4] = x4;
+		xmmp[5] = x5;
+		xmmp[6] = x6;
+		xmmp[7] = x7;
+	}
+}
+
+#endif
+
+#if defined(SCRYPT_SALSA64_SSE2)
+	#undef SCRYPT_MIX
+	#define SCRYPT_MIX "Salsa64/8-SSE2" /* name string for this mix variant; replaces any earlier SCRYPT_MIX */
+	#undef SCRYPT_SALSA64_INCLUDED
+	#define SCRYPT_SALSA64_INCLUDED /* checked by the variants' #if guards when SCRYPT_CHOOSE_COMPILETIME is defined */
+#endif
+
+/* ssse3/avx/avx2 use this as well */
+#if defined(SCRYPT_SALSA64_INCLUDED)
+	/*
+		Default layout:
+		 0  1  2  3
+		 4  5  6  7
+		 8  9 10 11
+		12 13 14 15
+
+		SSE2 layout:
+		 0  5 10 15
+		12  1  6 11
+		 8 13  2  7
+		 4  9 14  3
+	*/
+
+	/* Rearranges `count` consecutive 16-word blocks in place between the default and the SSE2 word layout; the swaps are disjoint, so a second call restores the input. */
+	static void asm_calling_convention
+	salsa64_core_tangle_sse2(uint64_t *blocks, size_t count) {
+		/* disjoint index pairs exchanged inside each 16-word block */
+		static const unsigned char pairs[6][2] = { {1,5}, {2,10}, {3,15}, {4,12}, {7,11}, {9,13} };
+		size_t blk, p;
+		for (blk = 0; blk < count; blk++, blocks += 16) {
+			for (p = 0; p < 6; p++) {
+				uint64_t held = blocks[pairs[p][0]];
+				blocks[pairs[p][0]] = blocks[pairs[p][1]];
+				blocks[pairs[p][1]] = held;
+			}
+		}
+	}
+#endif
--- a/algo/argon2/ar2/sj/scrypt-jane-mix_salsa64-ssse3.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-mix_salsa64-ssse3.h
@@ -0,0 +1,399 @@
+/* x64 */
+#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
+
+#define SCRYPT_SALSA64_SSSE3
+/* x64 asm ChunkMix (Salsa64/8, SSSE3). X starts as the last 128-byte block of Bin (xored with the same block of Bxor when Bxor is non-NULL); every block i is mixed as X = H(X ^ B_i) and stored to Bout block (i / 2) + (i odd ? r : 0). */
+asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
+asm_naked_fn(scrypt_ChunkMix_ssse3) /* the body reads rdi, rsi, rdx and ecx as Bout, Bin, Bxor and r */
+	a1(push rbp)
+	a2(mov rbp, rsp) /* rbp remembers the caller's rsp; restored in the epilogue */
+	a2(and rsp, ~63) /* align the stack down to 64 bytes */
+	a2(sub rsp, 128) /* 128-byte scratch area holding the saved round input */
+	a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
+	a2(shl rcx,7) /* rcx = 2 * r * 128 = chunk size in bytes */
+	a2(lea r9,[rcx-128]) /* byte offset of the last 128-byte block */
+	a2(lea rax,[rsi+r9]) /* rax = address of the last block of Bin */
+	a2(lea r9,[rdx+r9]) /* r9 = address of the last block of Bxor (dereferenced only when Bxor != NULL) */
+	a2(and rdx, rdx) /* ZF = (Bxor == NULL); the vector loads below do not modify flags */
+	a2(movdqa xmm0,[rax+0])
+	a2(movdqa xmm1,[rax+16])
+	a2(movdqa xmm2,[rax+32])
+	a2(movdqa xmm3,[rax+48])
+	a2(movdqa xmm4,[rax+64])
+	a2(movdqa xmm5,[rax+80])
+	a2(movdqa xmm6,[rax+96])
+	a2(movdqa xmm7,[rax+112])
+	aj(jz scrypt_ChunkMix_ssse3_no_xor1) /* no Bxor: skip the extra xor */
+	a2(pxor xmm0,[r9+0])
+	a2(pxor xmm1,[r9+16])
+	a2(pxor xmm2,[r9+32])
+	a2(pxor xmm3,[r9+48])
+	a2(pxor xmm4,[r9+64])
+	a2(pxor xmm5,[r9+80])
+	a2(pxor xmm6,[r9+96])
+	a2(pxor xmm7,[r9+112])
+	a1(scrypt_ChunkMix_ssse3_no_xor1:)
+	a2(xor r9,r9) /* r9 = byte offset of the current input block */
+	a2(xor r8,r8) /* r8 toggles between 0 and rcx to pick the output half */
+	a1(scrypt_ChunkMix_ssse3_loop:)
+		a2(and rdx, rdx) /* ZF = (Bxor == NULL), consumed by the jz after the xors */
+		a2(pxor xmm0,[rsi+r9+0])
+		a2(pxor xmm1,[rsi+r9+16])
+		a2(pxor xmm2,[rsi+r9+32])
+		a2(pxor xmm3,[rsi+r9+48])
+		a2(pxor xmm4,[rsi+r9+64])
+		a2(pxor xmm5,[rsi+r9+80])
+		a2(pxor xmm6,[rsi+r9+96])
+		a2(pxor xmm7,[rsi+r9+112])
+		aj(jz scrypt_ChunkMix_ssse3_no_xor2)
+		a2(pxor xmm0,[rdx+r9+0])
+		a2(pxor xmm1,[rdx+r9+16])
+		a2(pxor xmm2,[rdx+r9+32])
+		a2(pxor xmm3,[rdx+r9+48])
+		a2(pxor xmm4,[rdx+r9+64])
+		a2(pxor xmm5,[rdx+r9+80])
+		a2(pxor xmm6,[rdx+r9+96])
+		a2(pxor xmm7,[rdx+r9+112])
+		a1(scrypt_ChunkMix_ssse3_no_xor2:)
+		a2(movdqa [rsp+0],xmm0) /* save the round input on the stack for the feed-forward add */
+		a2(movdqa [rsp+16],xmm1)
+		a2(movdqa [rsp+32],xmm2)
+		a2(movdqa [rsp+48],xmm3)
+		a2(movdqa [rsp+64],xmm4)
+		a2(movdqa [rsp+80],xmm5)
+		a2(movdqa [rsp+96],xmm6)
+		a2(movdqa [rsp+112],xmm7)
+		a2(mov rax,8) /* round counter, reduced by 2 per pass */
+		a1(scrypt_salsa64_ssse3_loop: )
+			a2(movdqa xmm8, xmm0) /* xmm6,xmm7 ^= rotl32(xmm0,xmm1 + xmm2,xmm3) */
+			a2(movdqa xmm9, xmm1)
+			a2(paddq xmm8, xmm2)
+			a2(paddq xmm9, xmm3)
+			a3(pshufd xmm8, xmm8, 0xb1) /* swap the 32-bit halves of each qword = rotate by 32 */
+			a3(pshufd xmm9, xmm9, 0xb1)
+			a2(pxor xmm6, xmm8)
+			a2(pxor xmm7, xmm9)
+			a2(movdqa xmm10, xmm0) /* xmm4,xmm5 ^= rotl13(xmm0,xmm1 + xmm6,xmm7) */
+			a2(movdqa xmm11, xmm1)
+			a2(paddq xmm10, xmm6)
+			a2(paddq xmm11, xmm7)
+			a2(movdqa xmm8, xmm10)
+			a2(movdqa xmm9, xmm11)
+			a2(psrlq xmm10, 51)
+			a2(psrlq xmm11, 51)
+			a2(psllq xmm8, 13)
+			a2(psllq xmm9, 13)
+			a2(pxor xmm4, xmm10)
+			a2(pxor xmm5, xmm11)
+			a2(pxor xmm4, xmm8)
+			a2(pxor xmm5, xmm9)
+			a2(movdqa xmm10, xmm6) /* xmm2,xmm3 ^= rotl39(xmm6,xmm7 + xmm4,xmm5) */
+			a2(movdqa xmm11, xmm7)
+			a2(paddq xmm10, xmm4)
+			a2(paddq xmm11, xmm5)
+			a2(movdqa xmm8, xmm10)
+			a2(movdqa xmm9, xmm11)
+			a2(psrlq xmm10, 25)
+			a2(psrlq xmm11, 25)
+			a2(psllq xmm8, 39)
+			a2(psllq xmm9, 39)
+			a2(pxor xmm2, xmm10)
+			a2(pxor xmm3, xmm11)
+			a2(pxor xmm2, xmm8)
+			a2(pxor xmm3, xmm9)
+			a2(movdqa xmm8, xmm4) /* xmm0,xmm1 ^= rotl32(xmm4,xmm5 + xmm2,xmm3) */
+			a2(movdqa xmm9, xmm5)
+			a2(paddq xmm8, xmm2)
+			a2(paddq xmm9, xmm3)
+			a3(pshufd xmm8, xmm8, 0xb1)
+			a3(pshufd xmm9, xmm9, 0xb1)
+			a2(pxor xmm0, xmm8)
+			a2(pxor xmm1, xmm9)
+			a2(movdqa xmm10, xmm2) /* re-pair qwords with palignr: xmm2,xmm3 are rebuilt from old xmm6,xmm7 and xmm6,xmm7 from old xmm2,xmm3 */
+			a2(movdqa xmm11, xmm3)
+			a2(movdqa xmm2, xmm6)
+			a2(movdqa xmm3, xmm7)
+			a3(palignr xmm2, xmm7, 8)
+			a3(palignr xmm3, xmm6, 8)
+			a2(movdqa xmm6, xmm11)
+			a2(movdqa xmm7, xmm10)
+			a3(palignr xmm6, xmm10, 8)
+			a3(palignr xmm7, xmm11, 8)
+			a2(sub rax, 2) /* sets the flags tested by ja at the loop bottom; nothing in between writes flags */
+			a2(movdqa xmm8, xmm0) /* second round: xmm6,xmm7 ^= rotl32(xmm0,xmm1 + xmm2,xmm3) */
+			a2(movdqa xmm9, xmm1)
+			a2(paddq xmm8, xmm2)
+			a2(paddq xmm9, xmm3)
+			a3(pshufd xmm8, xmm8, 0xb1)
+			a3(pshufd xmm9, xmm9, 0xb1)
+			a2(pxor xmm6, xmm8)
+			a2(pxor xmm7, xmm9)
+			a2(movdqa xmm10, xmm0) /* xmm5,xmm4 ^= rotl13(xmm0,xmm1 + xmm6,xmm7): xmm4 and xmm5 trade roles in this round */
+			a2(movdqa xmm11, xmm1)
+			a2(paddq xmm10, xmm6)
+			a2(paddq xmm11, xmm7)
+			a2(movdqa xmm8, xmm10)
+			a2(movdqa xmm9, xmm11)
+			a2(psrlq xmm10, 51)
+			a2(psrlq xmm11, 51)
+			a2(psllq xmm8, 13)
+			a2(psllq xmm9, 13)
+			a2(pxor xmm5, xmm10)
+			a2(pxor xmm4, xmm11)
+			a2(pxor xmm5, xmm8)
+			a2(pxor xmm4, xmm9)
+			a2(movdqa xmm10, xmm6) /* xmm2,xmm3 ^= rotl39(xmm6,xmm7 + xmm5,xmm4) */
+			a2(movdqa xmm11, xmm7)
+			a2(paddq xmm10, xmm5)
+			a2(paddq xmm11, xmm4)
+			a2(movdqa xmm8, xmm10)
+			a2(movdqa xmm9, xmm11)
+			a2(psrlq xmm10, 25)
+			a2(psrlq xmm11, 25)
+			a2(psllq xmm8, 39)
+			a2(psllq xmm9, 39)
+			a2(pxor xmm2, xmm10)
+			a2(pxor xmm3, xmm11)
+			a2(pxor xmm2, xmm8)
+			a2(pxor xmm3, xmm9)
+			a2(movdqa xmm8, xmm5) /* xmm0,xmm1 ^= rotl32(xmm5,xmm4 + xmm2,xmm3) */
+			a2(movdqa xmm9, xmm4)
+			a2(paddq xmm8, xmm2)
+			a2(paddq xmm9, xmm3)
+			a3(pshufd xmm8, xmm8, 0xb1)
+			a3(pshufd xmm9, xmm9, 0xb1)
+			a2(pxor xmm0, xmm8)
+			a2(pxor xmm1, xmm9)
+			a2(movdqa xmm10, xmm2) /* same qword re-pairing as after the first round */
+			a2(movdqa xmm11, xmm3)
+			a2(movdqa xmm2, xmm6)
+			a2(movdqa xmm3, xmm7)
+			a3(palignr xmm2, xmm7, 8)
+			a3(palignr xmm3, xmm6, 8)
+			a2(movdqa xmm6, xmm11)
+			a2(movdqa xmm7, xmm10)
+			a3(palignr xmm6, xmm10, 8)
+			a3(palignr xmm7, xmm11, 8)
+			aj(ja scrypt_salsa64_ssse3_loop)
+		a2(paddq xmm0,[rsp+0]) /* feed-forward: add the saved round input */
+		a2(paddq xmm1,[rsp+16])
+		a2(paddq xmm2,[rsp+32])
+		a2(paddq xmm3,[rsp+48])
+		a2(paddq xmm4,[rsp+64])
+		a2(paddq xmm5,[rsp+80])
+		a2(paddq xmm6,[rsp+96])
+		a2(paddq xmm7,[rsp+112])
+		a2(lea rax,[r8+r9]) /* rax..: byte offset of output block (i / 2) + half, computed as ((r8 + r9) & ~0xff) / 2 */
+		a2(xor r8,rcx) /* flip the output half for the next block */
+		a2(and rax,~0xff)
+		a2(add r9,128) /* advance to the next input block */
+		a2(shr rax,1)
+		a2(add rax, rdi) /* rax = destination address inside Bout */
+		a2(cmp r9,rcx) /* flags consumed by the jne below; the vector stores do not modify them */
+		a2(movdqa [rax+0],xmm0)
+		a2(movdqa [rax+16],xmm1)
+		a2(movdqa [rax+32],xmm2)
+		a2(movdqa [rax+48],xmm3)
+		a2(movdqa [rax+64],xmm4)
+		a2(movdqa [rax+80],xmm5)
+		a2(movdqa [rax+96],xmm6)
+		a2(movdqa [rax+112],xmm7)
+		aj(jne scrypt_ChunkMix_ssse3_loop)
+	a2(mov rsp, rbp) /* drop the aligned scratch area */
+	a1(pop rbp)
+	a1(ret)
+asm_naked_fn_end(scrypt_ChunkMix_ssse3)
+
+#endif
+
+
+/* intrinsic */
+#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
+
+#define SCRYPT_SALSA64_SSSE3
+/* Intrinsic ChunkMix (Salsa64/8, SSSE3). X = last block of Bin (^ Bxor if non-NULL); for each block i, X = H(X ^ B_i) is written to Bout block (i / 2) + half. The eight xmm values are handled as four register pairs (x0,x1) (x2,x3) (x4,x5) (x6,x7). */
+static void asm_calling_convention
+scrypt_ChunkMix_ssse3(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
+	uint32_t i, blocksPerChunk = r * 2, half = 0;
+	xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
+	size_t rounds;
+
+	/* 1: X = B_{2r - 1} */
+	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
+	x0 = xmmp[0];
+	x1 = xmmp[1];
+	x2 = xmmp[2];
+	x3 = xmmp[3];
+	x4 = xmmp[4];
+	x5 = xmmp[5];
+	x6 = xmmp[6];
+	x7 = xmmp[7];
+
+	if (Bxor) { /* optional second input, xored into X block by block */
+		xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
+		x0 = _mm_xor_si128(x0, xmmp[0]);
+		x1 = _mm_xor_si128(x1, xmmp[1]);
+		x2 = _mm_xor_si128(x2, xmmp[2]);
+		x3 = _mm_xor_si128(x3, xmmp[3]);
+		x4 = _mm_xor_si128(x4, xmmp[4]);
+		x5 = _mm_xor_si128(x5, xmmp[5]);
+		x6 = _mm_xor_si128(x6, xmmp[6]);
+		x7 = _mm_xor_si128(x7, xmmp[7]);
+	}
+
+	/* 2: for i = 0 to 2r - 1 do */
+	for (i = 0; i < blocksPerChunk; i++, half ^= r) { /* half alternates 0, r, 0, r, ... */
+		/* 3: X = H(X ^ B_i) */
+		xmmp = (xmmi *)scrypt_block(Bin, i);
+		x0 = _mm_xor_si128(x0, xmmp[0]);
+		x1 = _mm_xor_si128(x1, xmmp[1]);
+		x2 = _mm_xor_si128(x2, xmmp[2]);
+		x3 = _mm_xor_si128(x3, xmmp[3]);
+		x4 = _mm_xor_si128(x4, xmmp[4]);
+		x5 = _mm_xor_si128(x5, xmmp[5]);
+		x6 = _mm_xor_si128(x6, xmmp[6]);
+		x7 = _mm_xor_si128(x7, xmmp[7]);
+
+		if (Bxor) {
+			xmmp = (xmmi *)scrypt_block(Bxor, i);
+			x0 = _mm_xor_si128(x0, xmmp[0]);
+			x1 = _mm_xor_si128(x1, xmmp[1]);
+			x2 = _mm_xor_si128(x2, xmmp[2]);
+			x3 = _mm_xor_si128(x3, xmmp[3]);
+			x4 = _mm_xor_si128(x4, xmmp[4]);
+			x5 = _mm_xor_si128(x5, xmmp[5]);
+			x6 = _mm_xor_si128(x6, xmmp[6]);
+			x7 = _mm_xor_si128(x7, xmmp[7]);
+		}
+
+		t0 = x0; /* keep the round input for the feed-forward add below */
+		t1 = x1;
+		t2 = x2;
+		t3 = x3;
+		t4 = x4;
+		t5 = x5;
+		t6 = x6;
+		t7 = x7;
+
+		for (rounds = 8; rounds; rounds -= 2) { /* two rounds per pass */
+			z0 = _mm_add_epi64(x0, x2); /* x6,x7 ^= rotl32(x0,x1 + x2,x3) */
+			z1 = _mm_add_epi64(x1, x3);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); /* swap 32-bit halves of each qword = rotate by 32 */
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x6 = _mm_xor_si128(x6, z0);
+			x7 = _mm_xor_si128(x7, z1);
+
+			z0 = _mm_add_epi64(x6, x0); /* x4,x5 ^= rotl13(x6,x7 + x0,x1) */
+			z1 = _mm_add_epi64(x7, x1);
+			z2 = _mm_srli_epi64(z0, 64-13);
+			z3 = _mm_srli_epi64(z1, 64-13);
+			z0 = _mm_slli_epi64(z0, 13);
+			z1 = _mm_slli_epi64(z1, 13);
+			x4 = _mm_xor_si128(x4, z2);
+			x5 = _mm_xor_si128(x5, z3);
+			x4 = _mm_xor_si128(x4, z0);
+			x5 = _mm_xor_si128(x5, z1);
+
+			z0 = _mm_add_epi64(x4, x6); /* x2,x3 ^= rotl39(x4,x5 + x6,x7) */
+			z1 = _mm_add_epi64(x5, x7);
+			z2 = _mm_srli_epi64(z0, 64-39);
+			z3 = _mm_srli_epi64(z1, 64-39);
+			z0 = _mm_slli_epi64(z0, 39);
+			z1 = _mm_slli_epi64(z1, 39);
+			x2 = _mm_xor_si128(x2, z2);
+			x3 = _mm_xor_si128(x3, z3);
+			x2 = _mm_xor_si128(x2, z0);
+			x3 = _mm_xor_si128(x3, z1);
+
+			z0 = _mm_add_epi64(x2, x4); /* x0,x1 ^= rotl32(x2,x3 + x4,x5) */
+			z1 = _mm_add_epi64(x3, x5);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x0 = _mm_xor_si128(x0, z0);
+			x1 = _mm_xor_si128(x1, z1);
+
+			z0 = x2; /* re-pair qwords: x2,x3 are rebuilt from old x6,x7 and x6,x7 from old x2,x3 */
+			z1 = x3;
+			x2 = _mm_alignr_epi8(x6, x7, 8);
+			x3 = _mm_alignr_epi8(x7, x6, 8);
+			x6 = _mm_alignr_epi8(z1, z0, 8);
+			x7 = _mm_alignr_epi8(z0, z1, 8);
+
+			z0 = _mm_add_epi64(x0, x2); /* second round: same steps, but x4 and x5 trade roles instead of being swapped */
+			z1 = _mm_add_epi64(x1, x3);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x6 = _mm_xor_si128(x6, z0);
+			x7 = _mm_xor_si128(x7, z1);
+
+			z0 = _mm_add_epi64(x6, x0); /* x5,x4 ^= rotl13(x6,x7 + x0,x1) */
+			z1 = _mm_add_epi64(x7, x1);
+			z2 = _mm_srli_epi64(z0, 64-13);
+			z3 = _mm_srli_epi64(z1, 64-13);
+			z0 = _mm_slli_epi64(z0, 13);
+			z1 = _mm_slli_epi64(z1, 13);
+			x5 = _mm_xor_si128(x5, z2);
+			x4 = _mm_xor_si128(x4, z3);
+			x5 = _mm_xor_si128(x5, z0);
+			x4 = _mm_xor_si128(x4, z1);
+
+			z0 = _mm_add_epi64(x5, x6); /* x2,x3 ^= rotl39(x5,x4 + x6,x7) */
+			z1 = _mm_add_epi64(x4, x7);
+			z2 = _mm_srli_epi64(z0, 64-39);
+			z3 = _mm_srli_epi64(z1, 64-39);
+			z0 = _mm_slli_epi64(z0, 39);
+			z1 = _mm_slli_epi64(z1, 39);
+			x2 = _mm_xor_si128(x2, z2);
+			x3 = _mm_xor_si128(x3, z3);
+			x2 = _mm_xor_si128(x2, z0);
+			x3 = _mm_xor_si128(x3, z1);
+
+			z0 = _mm_add_epi64(x2, x5); /* x0,x1 ^= rotl32(x2,x3 + x5,x4) */
+			z1 = _mm_add_epi64(x3, x4);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x0 = _mm_xor_si128(x0, z0);
+			x1 = _mm_xor_si128(x1, z1);
+
+			z0 = x2; /* same qword re-pairing as after the first round */
+			z1 = x3;
+			x2 = _mm_alignr_epi8(x6, x7, 8);
+			x3 = _mm_alignr_epi8(x7, x6, 8);
+			x6 = _mm_alignr_epi8(z1, z0, 8);
+			x7 = _mm_alignr_epi8(z0, z1, 8);
+		}
+
+		x0 = _mm_add_epi64(x0, t0); /* feed-forward */
+		x1 = _mm_add_epi64(x1, t1);
+		x2 = _mm_add_epi64(x2, t2);
+		x3 = _mm_add_epi64(x3, t3);
+		x4 = _mm_add_epi64(x4, t4);
+		x5 = _mm_add_epi64(x5, t5);
+		x6 = _mm_add_epi64(x6, t6);
+		x7 = _mm_add_epi64(x7, t7);
+
+		/* 4: Y_i = X */
+		/* 6: B'[0..r-1] = Y_even */
+		/* 6: B'[r..2r-1] = Y_odd */
+		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
+		xmmp[0] = x0;
+		xmmp[1] = x1;
+		xmmp[2] = x2;
+		xmmp[3] = x3;
+		xmmp[4] = x4;
+		xmmp[5] = x5;
+		xmmp[6] = x6;
+		xmmp[7] = x7;
+	}
+}
+
+#endif
+
+#if defined(SCRYPT_SALSA64_SSSE3)
+	/* uses salsa64_core_tangle_sse2 */
+
+	#undef SCRYPT_MIX
+	#define SCRYPT_MIX "Salsa64/8-SSSE3" /* name string for this mix variant; replaces any earlier SCRYPT_MIX */
+	#undef SCRYPT_SALSA64_INCLUDED
+	#define SCRYPT_SALSA64_INCLUDED /* checked by the variants' #if guards when SCRYPT_CHOOSE_COMPILETIME is defined */
+#endif
--- a/algo/argon2/ar2/sj/scrypt-jane-mix_salsa64-xop.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-mix_salsa64-xop.h
@@ -0,0 +1,335 @@
+/* x64 */
+#if defined(X86_64ASM_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
+
+#define SCRYPT_SALSA64_XOP
+
+/*
+	Hand-written x86-64 ChunkMix with a Salsa64/8 core on 128-byte blocks, using
+	VEX-encoded integer ops and the XOP vprotq rotate.
+
+	Register roles as read by the code below: rdi = Bout, rsi = Bin, rdx = Bxor
+	(NULL means "no xor input"), ecx = r. Vector registers xmm0-xmm11 are used.
+
+	Same algorithm as the intrinsic scrypt_ChunkMix_xop in this header: X starts as the
+	last block, every block i is xored in, mixed for 8 rounds, the pre-mix value is added
+	back, and the result is stored to Bout at block index (i / 2) + (i odd ? r : 0).
+*/
+asm_naked_fn_proto(void, scrypt_ChunkMix_xop)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
+asm_naked_fn(scrypt_ChunkMix_xop)
+	/* frame: keep the old rsp in rbp, then carve a 64-byte aligned 128-byte scratch block */
+	a1(push rbp)
+	a2(mov rbp, rsp)
+	a2(and rsp, ~63)
+	a2(sub rsp, 128)
+	/* rcx = r * 2 * 128 = chunk size in bytes */
+	a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
+	a2(shl rcx,7)
+	/* rax -> last block of Bin, r9 -> last block of Bxor */
+	a2(lea r9,[rcx-128])
+	a2(lea rax,[rsi+r9])
+	a2(lea r9,[rdx+r9])
+	/* ZF = (Bxor == NULL); the vector loads below leave the flags untouched for the jz */
+	a2(and rdx, rdx)
+	/* 1: X = B_{2r - 1} */
+	a2(vmovdqa xmm0,[rax+0])
+	a2(vmovdqa xmm1,[rax+16])
+	a2(vmovdqa xmm2,[rax+32])
+	a2(vmovdqa xmm3,[rax+48])
+	a2(vmovdqa xmm4,[rax+64])
+	a2(vmovdqa xmm5,[rax+80])
+	a2(vmovdqa xmm6,[rax+96])
+	a2(vmovdqa xmm7,[rax+112])
+	aj(jz scrypt_ChunkMix_xop_no_xor1)
+	/* X ^= last block of Bxor */
+	a3(vpxor xmm0,xmm0,[r9+0])
+	a3(vpxor xmm1,xmm1,[r9+16])
+	a3(vpxor xmm2,xmm2,[r9+32])
+	a3(vpxor xmm3,xmm3,[r9+48])
+	a3(vpxor xmm4,xmm4,[r9+64])
+	a3(vpxor xmm5,xmm5,[r9+80])
+	a3(vpxor xmm6,xmm6,[r9+96])
+	a3(vpxor xmm7,xmm7,[r9+112])
+	a1(scrypt_ChunkMix_xop_no_xor1:)
+	/* r9 = byte offset of block i (starts at 0), r8 = "half" toggle (0 or chunk bytes) */
+	a2(xor r9,r9)
+	a2(xor r8,r8)
+	/* 2: for i = 0 to 2r - 1 do */
+	a1(scrypt_ChunkMix_xop_loop:)
+		/* re-test Bxor; vpxor does not modify flags, so ZF survives until the jz below */
+		a2(and rdx, rdx)
+		/* 3: X ^= Bin[i] */
+		a3(vpxor xmm0,xmm0,[rsi+r9+0])
+		a3(vpxor xmm1,xmm1,[rsi+r9+16])
+		a3(vpxor xmm2,xmm2,[rsi+r9+32])
+		a3(vpxor xmm3,xmm3,[rsi+r9+48])
+		a3(vpxor xmm4,xmm4,[rsi+r9+64])
+		a3(vpxor xmm5,xmm5,[rsi+r9+80])
+		a3(vpxor xmm6,xmm6,[rsi+r9+96])
+		a3(vpxor xmm7,xmm7,[rsi+r9+112])
+		aj(jz scrypt_ChunkMix_xop_no_xor2)
+		/* X ^= Bxor[i] */
+		a3(vpxor xmm0,xmm0,[rdx+r9+0])
+		a3(vpxor xmm1,xmm1,[rdx+r9+16])
+		a3(vpxor xmm2,xmm2,[rdx+r9+32])
+		a3(vpxor xmm3,xmm3,[rdx+r9+48])
+		a3(vpxor xmm4,xmm4,[rdx+r9+64])
+		a3(vpxor xmm5,xmm5,[rdx+r9+80])
+		a3(vpxor xmm6,xmm6,[rdx+r9+96])
+		a3(vpxor xmm7,xmm7,[rdx+r9+112])
+		a1(scrypt_ChunkMix_xop_no_xor2:)
+		/* save the pre-mix state in the stack scratch block for the feed-forward add */
+		a2(vmovdqa [rsp+0],xmm0)
+		a2(vmovdqa [rsp+16],xmm1)
+		a2(vmovdqa [rsp+32],xmm2)
+		a2(vmovdqa [rsp+48],xmm3)
+		a2(vmovdqa [rsp+64],xmm4)
+		a2(vmovdqa [rsp+80],xmm5)
+		a2(vmovdqa [rsp+96],xmm6)
+		a2(vmovdqa [rsp+112],xmm7)
+		/* rax = round counter: 8 rounds, two per loop iteration */
+		a2(mov rax,8)
+		a1(scrypt_salsa64_xop_loop: )
+			/* first round of the pair; vpshufd 0xb1 swaps the 32-bit halves of each
+			   64-bit lane (a rotate by 32), vprotq rotates each 64-bit lane left */
+			a3(vpaddq xmm8, xmm0, xmm2)
+			a3(vpaddq xmm9, xmm1, xmm3)
+			a3(vpshufd xmm8, xmm8, 0xb1)
+			a3(vpshufd xmm9, xmm9, 0xb1)
+			a3(vpxor xmm6, xmm6, xmm8)
+			a3(vpxor xmm7, xmm7, xmm9)
+			a3(vpaddq xmm10, xmm0, xmm6)
+			a3(vpaddq xmm11, xmm1, xmm7)
+			a3(vprotq xmm10, xmm10, 13)
+			a3(vprotq xmm11, xmm11, 13)
+			a3(vpxor xmm4, xmm4, xmm10)
+			a3(vpxor xmm5, xmm5, xmm11)
+			a3(vpaddq xmm8, xmm6, xmm4)
+			a3(vpaddq xmm9, xmm7, xmm5)
+			a3(vprotq xmm8, xmm8, 39)
+			a3(vprotq xmm9, xmm9, 39)
+			a3(vpxor xmm2, xmm2, xmm8)
+			a3(vpxor xmm3, xmm3, xmm9)
+			a3(vpaddq xmm10, xmm4, xmm2)
+			a3(vpaddq xmm11, xmm5, xmm3)
+			a3(vpshufd xmm10, xmm10, 0xb1)
+			a3(vpshufd xmm11, xmm11, 0xb1)
+			a3(vpxor xmm0, xmm0, xmm10)
+			a3(vpxor xmm1, xmm1, xmm11)
+			/* re-pair the 64-bit lanes of (xmm2,xmm3) and (xmm6,xmm7) for the next round */
+			a2(vmovdqa xmm8, xmm2)
+			a2(vmovdqa xmm9, xmm3)
+			a4(vpalignr xmm2, xmm6, xmm7, 8)
+			a4(vpalignr xmm3, xmm7, xmm6, 8)
+			a4(vpalignr xmm6, xmm9, xmm8, 8)
+			a4(vpalignr xmm7, xmm8, xmm9, 8)
+			/* second round of the pair; note xmm4/xmm5 swap roles relative to the first */
+			a3(vpaddq xmm10, xmm0, xmm2)
+			a3(vpaddq xmm11, xmm1, xmm3)
+			a3(vpshufd xmm10, xmm10, 0xb1)
+			a3(vpshufd xmm11, xmm11, 0xb1)
+			a3(vpxor xmm6, xmm6, xmm10)
+			a3(vpxor xmm7, xmm7, xmm11)
+			a3(vpaddq xmm8, xmm0, xmm6)
+			a3(vpaddq xmm9, xmm1, xmm7)
+			a3(vprotq xmm8, xmm8, 13)
+			a3(vprotq xmm9, xmm9, 13)
+			a3(vpxor xmm5, xmm5, xmm8)
+			a3(vpxor xmm4, xmm4, xmm9)
+			a3(vpaddq xmm10, xmm6, xmm5)
+			a3(vpaddq xmm11, xmm7, xmm4)
+			a3(vprotq xmm10, xmm10, 39)
+			a3(vprotq xmm11, xmm11, 39)
+			a3(vpxor xmm2, xmm2, xmm10)
+			a3(vpxor xmm3, xmm3, xmm11)
+			a3(vpaddq xmm8, xmm5, xmm2)
+			a3(vpaddq xmm9, xmm4, xmm3)
+			a3(vpshufd xmm8, xmm8, 0xb1)
+			a3(vpshufd xmm9, xmm9, 0xb1)
+			a3(vpxor xmm0, xmm0, xmm8)
+			a3(vpxor xmm1, xmm1, xmm9)
+			a2(vmovdqa xmm10, xmm2)
+			a2(vmovdqa xmm11, xmm3)
+			a4(vpalignr xmm2, xmm6, xmm7, 8)
+			a4(vpalignr xmm3, xmm7, xmm6, 8)
+			a4(vpalignr xmm6, xmm11, xmm10, 8)
+			a4(vpalignr xmm7, xmm10, xmm11, 8)
+			/* 8 -> 6 -> 4 -> 2 -> 0: four passes */
+			a2(sub rax, 2)
+			aj(ja scrypt_salsa64_xop_loop)
+		/* feed-forward: X += saved pre-mix state */
+		a3(vpaddq xmm0,xmm0,[rsp+0])
+		a3(vpaddq xmm1,xmm1,[rsp+16])
+		a3(vpaddq xmm2,xmm2,[rsp+32])
+		a3(vpaddq xmm3,xmm3,[rsp+48])
+		a3(vpaddq xmm4,xmm4,[rsp+64])
+		a3(vpaddq xmm5,xmm5,[rsp+80])
+		a3(vpaddq xmm6,xmm6,[rsp+96])
+		a3(vpaddq xmm7,xmm7,[rsp+112])
+		/* destination = Bout + (((r8 + r9) & ~0xff) >> 1):
+		   the r9 part gives (i / 2) * 128, the r8 part gives r * 128 on odd i.
+		   r8 is toggled between 0 and the chunk size for the next iteration. */
+		a2(lea rax,[r8+r9])
+		a2(xor r8,rcx)
+		a2(and rax,~0xff)
+		a2(add r9,128)
+		a2(shr rax,1)
+		a2(add rax, rdi)
+		/* loop test is issued before the stores; vmovdqa leaves the flags intact for jne */
+		a2(cmp r9,rcx)
+		a2(vmovdqa [rax+0],xmm0)
+		a2(vmovdqa [rax+16],xmm1)
+		a2(vmovdqa [rax+32],xmm2)
+		a2(vmovdqa [rax+48],xmm3)
+		a2(vmovdqa [rax+64],xmm4)
+		a2(vmovdqa [rax+80],xmm5)
+		a2(vmovdqa [rax+96],xmm6)
+		a2(vmovdqa [rax+112],xmm7)
+		aj(jne scrypt_ChunkMix_xop_loop)
+	/* drop the aligned scratch area and restore the caller's frame */
+	a2(mov rsp, rbp)
+	a1(pop rbp)
+	a1(ret)
+asm_naked_fn_end(scrypt_ChunkMix_xop)
+
+#endif
+
+
+/* intrinsic */
+#if defined(X86_INTRINSIC_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
+
+#define SCRYPT_SALSA64_XOP
+
+/*
+	ChunkMix with a Salsa64/8 core, written with SSE2/SSSE3/XOP intrinsics.
+
+	Bout: receives 2*r mixed blocks, even-indexed results first, then odd-indexed.
+	Bin:  2*r input blocks.
+	Bxor: optional second input xored block-wise into Bin; skipped when NULL.
+	r:    block-pair count; the chunk holds 2*r blocks.
+
+	Each block is accessed as eight xmmi values (128 bytes) through scrypt_block().
+	The buffers are dereferenced as xmmi pointers, so they presumably must be
+	16-byte aligned -- confirm against the allocator used by the caller.
+*/
+static void asm_calling_convention
+scrypt_ChunkMix_xop(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
+	/* half alternates 0, r, 0, r, ... and selects the lower/upper half of Bout */
+	uint32_t i, blocksPerChunk = r * 2, half = 0;
+	/* x0..x7: working state, t0..t7: pre-mix copy for the feed-forward, z0/z1: temporaries */
+	xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1;
+	size_t rounds;
+
+	/* 1: X = B_{2r - 1} */
+	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
+	x0 = xmmp[0];
+	x1 = xmmp[1];
+	x2 = xmmp[2];
+	x3 = xmmp[3];
+	x4 = xmmp[4];
+	x5 = xmmp[5];
+	x6 = xmmp[6];
+	x7 = xmmp[7];
+
+	if (Bxor) {
+		/* X ^= last block of Bxor */
+		xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
+		x0 = _mm_xor_si128(x0, xmmp[0]);
+		x1 = _mm_xor_si128(x1, xmmp[1]);
+		x2 = _mm_xor_si128(x2, xmmp[2]);
+		x3 = _mm_xor_si128(x3, xmmp[3]);
+		x4 = _mm_xor_si128(x4, xmmp[4]);
+		x5 = _mm_xor_si128(x5, xmmp[5]);
+		x6 = _mm_xor_si128(x6, xmmp[6]);
+		x7 = _mm_xor_si128(x7, xmmp[7]);
+	}
+
+	/* 2: for i = 0 to 2r - 1 do */
+	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
+		/* 3: X = H(X ^ B_i) */
+		xmmp = (xmmi *)scrypt_block(Bin, i);
+		x0 = _mm_xor_si128(x0, xmmp[0]);
+		x1 = _mm_xor_si128(x1, xmmp[1]);
+		x2 = _mm_xor_si128(x2, xmmp[2]);
+		x3 = _mm_xor_si128(x3, xmmp[3]);
+		x4 = _mm_xor_si128(x4, xmmp[4]);
+		x5 = _mm_xor_si128(x5, xmmp[5]);
+		x6 = _mm_xor_si128(x6, xmmp[6]);
+		x7 = _mm_xor_si128(x7, xmmp[7]);
+
+		if (Bxor) {
+			xmmp = (xmmi *)scrypt_block(Bxor, i);
+			x0 = _mm_xor_si128(x0, xmmp[0]);
+			x1 = _mm_xor_si128(x1, xmmp[1]);
+			x2 = _mm_xor_si128(x2, xmmp[2]);
+			x3 = _mm_xor_si128(x3, xmmp[3]);
+			x4 = _mm_xor_si128(x4, xmmp[4]);
+			x5 = _mm_xor_si128(x5, xmmp[5]);
+			x6 = _mm_xor_si128(x6, xmmp[6]);
+			x7 = _mm_xor_si128(x7, xmmp[7]);
+		}
+
+		/* remember the input of H for the final addition */
+		t0 = x0;
+		t1 = x1;
+		t2 = x2;
+		t3 = x3;
+		t4 = x4;
+		t5 = x5;
+		t6 = x6;
+		t7 = x7;
+
+		/* 8 rounds, two per iteration */
+		for (rounds = 8; rounds; rounds -= 2) {
+			/* first round of the pair. _MM_SHUFFLE(2,3,0,1) swaps the 32-bit halves of
+			   each 64-bit lane, i.e. a rotate by 32; _mm_roti_epi64 rotates by 13 / 39 */
+			z0 = _mm_add_epi64(x0, x2);
+			z1 = _mm_add_epi64(x1, x3);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x6 = _mm_xor_si128(x6, z0);
+			x7 = _mm_xor_si128(x7, z1);
+
+			z0 = _mm_add_epi64(x6, x0);
+			z1 = _mm_add_epi64(x7, x1);
+			z0 = _mm_roti_epi64(z0, 13);
+			z1 = _mm_roti_epi64(z1, 13);
+			x4 = _mm_xor_si128(x4, z0);
+			x5 = _mm_xor_si128(x5, z1);
+
+			z0 = _mm_add_epi64(x4, x6);
+			z1 = _mm_add_epi64(x5, x7);
+			z0 = _mm_roti_epi64(z0, 39);
+			z1 = _mm_roti_epi64(z1, 39);
+			x2 = _mm_xor_si128(x2, z0);
+			x3 = _mm_xor_si128(x3, z1);
+
+			z0 = _mm_add_epi64(x2, x4);
+			z1 = _mm_add_epi64(x3, x5);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x0 = _mm_xor_si128(x0, z0);
+			x1 = _mm_xor_si128(x1, z1);
+
+			/* re-pair the 64-bit lanes of (x2,x3) and (x6,x7) for the next round */
+			z0 = x2;
+			z1 = x3;
+			x2 = _mm_alignr_epi8(x6, x7, 8);
+			x3 = _mm_alignr_epi8(x7, x6, 8);
+			x6 = _mm_alignr_epi8(z1, z0, 8);
+			x7 = _mm_alignr_epi8(z0, z1, 8);
+
+			/* second round of the pair; x4 and x5 swap roles relative to the first */
+			z0 = _mm_add_epi64(x0, x2);
+			z1 = _mm_add_epi64(x1, x3);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x6 = _mm_xor_si128(x6, z0);
+			x7 = _mm_xor_si128(x7, z1);
+
+			z0 = _mm_add_epi64(x6, x0);
+			z1 = _mm_add_epi64(x7, x1);
+			z0 = _mm_roti_epi64(z0, 13);
+			z1 = _mm_roti_epi64(z1, 13);
+			x5 = _mm_xor_si128(x5, z0);
+			x4 = _mm_xor_si128(x4, z1);
+
+			z0 = _mm_add_epi64(x5, x6);
+			z1 = _mm_add_epi64(x4, x7);
+			z0 = _mm_roti_epi64(z0, 39);
+			z1 = _mm_roti_epi64(z1, 39);
+			x2 = _mm_xor_si128(x2, z0);
+			x3 = _mm_xor_si128(x3, z1);
+
+			z0 = _mm_add_epi64(x2, x5);
+			z1 = _mm_add_epi64(x3, x4);
+			z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
+			z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
+			x0 = _mm_xor_si128(x0, z0);
+			x1 = _mm_xor_si128(x1, z1);
+
+			z0 = x2;
+			z1 = x3;
+			x2 = _mm_alignr_epi8(x6, x7, 8);
+			x3 = _mm_alignr_epi8(x7, x6, 8);
+			x6 = _mm_alignr_epi8(z1, z0, 8);
+			x7 = _mm_alignr_epi8(z0, z1, 8);
+		}
+
+		/* feed-forward: X += input of H (64-bit lane-wise) */
+		x0 = _mm_add_epi64(x0, t0);
+		x1 = _mm_add_epi64(x1, t1);
+		x2 = _mm_add_epi64(x2, t2);
+		x3 = _mm_add_epi64(x3, t3);
+		x4 = _mm_add_epi64(x4, t4);
+		x5 = _mm_add_epi64(x5, t5);
+		x6 = _mm_add_epi64(x6, t6);
+		x7 = _mm_add_epi64(x7, t7);
+
+		/* 4: Y_i = X */
+		/* 6: B'[0..r-1] = Y_even */
+		/* 6: B'[r..2r-1] = Y_odd */
+		/* half is 0 for even i and r for odd i, so odd results land in the upper half */
+		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
+		xmmp[0] = x0;
+		xmmp[1] = x1;
+		xmmp[2] = x2;
+		xmmp[3] = x3;
+		xmmp[4] = x4;
+		xmmp[5] = x5;
+		xmmp[6] = x6;
+		xmmp[7] = x7;
+	}
+}
+
+#endif
+
+#if defined(SCRYPT_SALSA64_XOP)
+	/* uses salsa64_core_tangle_sse2 */
+
+	/* One of the XOP ChunkMix variants in this header was compiled (each defines
+	   SCRYPT_SALSA64_XOP): relabel the active mix and mark Salsa64 as provided. */
+	#undef SCRYPT_MIX
+	#define SCRYPT_MIX "Salsa64/8-XOP"
+	#undef SCRYPT_SALSA64_INCLUDED
+	#define SCRYPT_SALSA64_INCLUDED
+#endif
--- a/algo/argon2/ar2/sj/scrypt-jane-mix_salsa64.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-mix_salsa64.h
@@ -0,0 +1,41 @@
+#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)
+
+#undef SCRYPT_MIX
+#define SCRYPT_MIX "Salsa64/8 Ref"
+
+#undef SCRYPT_SALSA64_INCLUDED
+#define SCRYPT_SALSA64_INCLUDED
+#define SCRYPT_SALSA64_BASIC
+
+/*
+	Portable reference Salsa64/8 core.
+
+	Runs 8 rounds (four column/row round pairs) over a working copy of the sixteen
+	64-bit words in state, then adds the mixed copy back into state in place.
+*/
+static void
+salsa64_core_basic(uint64_t state[16]) {
+	const size_t rounds = 8;
+	uint64_t x[16], sum;
+	size_t k, pass;
+
+	/* work on a private copy so the original words are still available for the final add */
+	for (k = 0; k < 16; k++)
+		x[k] = state[k];
+
+	/* one quarter-round: four add / rotate / xor steps with rotations 32, 13, 39, 32 */
+	#define QUARTER(a,b,c,d) \
+		do { \
+			sum = x[a] + x[d]; sum = ROTL64(sum, 32); x[b] ^= sum; \
+			sum = x[b] + x[a]; sum = ROTL64(sum, 13); x[c] ^= sum; \
+			sum = x[c] + x[b]; sum = ROTL64(sum, 39); x[d] ^= sum; \
+			sum = x[d] + x[c]; sum = ROTL64(sum, 32); x[a] ^= sum; \
+		} while (0)
+
+	for (pass = 0; pass < rounds / 2; pass++) {
+		/* first round of the pair */
+		QUARTER( 0, 4, 8,12);
+		QUARTER( 5, 9,13, 1);
+		QUARTER(10,14, 2, 6);
+		QUARTER(15, 3, 7,11);
+		/* second round of the pair */
+		QUARTER( 0, 1, 2, 3);
+		QUARTER( 5, 6, 7, 4);
+		QUARTER(10,11, 8, 9);
+		QUARTER(15,12,13,14);
+	}
+
+	#undef QUARTER
+
+	/* feed-forward */
+	for (k = 0; k < 16; k++)
+		state[k] += x[k];
+}
+
+#endif
+
--- a/algo/argon2/ar2/sj/scrypt-jane-pbkdf2.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-pbkdf2.h
@@ -0,0 +1,112 @@
+/* HMAC context: two running hash states keyed by scrypt_hmac_init */
+typedef struct scrypt_hmac_state_t {
+	scrypt_hash_state inner; /* absorbs (key ^ 0x36), then the message */
+	scrypt_hash_state outer; /* absorbs (key ^ 0x5c), then the inner digest */
+} scrypt_hmac_state;
+
+
+/* One-shot hash: writes the digest of the mlen bytes at m into hash. */
+static void
+scrypt_hash(scrypt_hash_digest hash, const uint8_t *m, size_t mlen) {
+	scrypt_hash_state ctx;
+
+	scrypt_hash_init(&ctx);
+	scrypt_hash_update(&ctx, m, mlen);
+	scrypt_hash_finish(&ctx, hash);
+}
+
+/* hmac */
+/*
+	Starts an HMAC computation keyed with key[0..keylen).
+
+	On return st->inner has absorbed (K ^ 0x36...) and st->outer has absorbed
+	(K ^ 0x5c...), where K is the key zero-padded to one hash block, or the digest of
+	the key (zero-padded) when the key is longer than a block. The padded key copy is
+	wiped before returning.
+*/
+static void
+scrypt_hmac_init(scrypt_hmac_state *st, const uint8_t *key, size_t keylen) {
+	uint8_t block[SCRYPT_HASH_BLOCK_SIZE] = {0};
+	uint8_t *p;
+
+	scrypt_hash_init(&st->inner);
+	scrypt_hash_init(&st->outer);
+
+	if (keylen > SCRYPT_HASH_BLOCK_SIZE) {
+		/* oversized key: substitute its digest */
+		scrypt_hash(block, key, keylen);
+	} else {
+		/* key fits in one block: use it verbatim, the tail stays zero */
+		memcpy(block, key, keylen);
+	}
+
+	/* h(inner || ...) with inner = key ^ 0x36 */
+	for (p = block; p != block + SCRYPT_HASH_BLOCK_SIZE; p++)
+		*p ^= 0x36;
+	scrypt_hash_update(&st->inner, block, SCRYPT_HASH_BLOCK_SIZE);
+
+	/* h(outer || ...) with outer = key ^ 0x5c; xoring with (0x5c ^ 0x36) converts in place */
+	for (p = block; p != block + SCRYPT_HASH_BLOCK_SIZE; p++)
+		*p ^= (0x5c ^ 0x36);
+	scrypt_hash_update(&st->outer, block, SCRYPT_HASH_BLOCK_SIZE);
+
+	scrypt_ensure_zero(block, sizeof(block));
+}
+
+/* Feeds mlen message bytes from m into the HMAC; only the inner hash sees message data. */
+static void
+scrypt_hmac_update(scrypt_hmac_state *st, const uint8_t *m, size_t mlen) {
+	scrypt_hash_state *inner_ctx = &st->inner;
+
+	/* h(inner || m...) */
+	scrypt_hash_update(inner_ctx, m, mlen);
+}
+
+/*
+	Completes the HMAC and writes the tag to mac.
+
+	st is consumed: it is wiped before returning and must be re-initialised (or
+	overwritten with a saved copy) before it is used again. The intermediate inner
+	digest is key-dependent, so it is wiped as well instead of being left on the stack.
+*/
+static void
+scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac) {
+	/* h(inner || m) */
+	scrypt_hash_digest innerhash;
+	scrypt_hash_finish(&st->inner, innerhash);
+
+	/* h(outer || h(inner || m)) */
+	scrypt_hash_update(&st->outer, innerhash, sizeof(innerhash));
+	scrypt_hash_finish(&st->outer, mac);
+
+	scrypt_ensure_zero(innerhash, sizeof(innerhash));
+	scrypt_ensure_zero(st, sizeof(*st));
+}
+
+/*
+	PBKDF2 built on scrypt_hmac_*.
+
+	password/password_len: HMAC key.
+	salt/salt_len:         salt; the big-endian block index is appended per output block.
+	N:                     iteration count. N == 0 is treated like N == 1 (a single HMAC
+	                       per block) rather than wrapping to ~2^64 iterations.
+	out/bytes:             destination buffer and number of bytes to produce.
+
+	Exactly `bytes` bytes are written; a final partial block is truncated and `out` is
+	never advanced past the end of the destination.
+*/
+static void
+scrypt_pbkdf2(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint64_t N, uint8_t *out, size_t bytes) {
+	scrypt_hmac_state hmac_pw, hmac_pw_salt, work;
+	scrypt_hash_digest ti, u;
+	uint8_t be[4];
+	uint32_t i, j, blocks;
+	uint64_t c;
+	size_t chunk;
+
+	/* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */
+
+	/* hmac(password, ...) */
+	scrypt_hmac_init(&hmac_pw, password, password_len);
+
+	/* hmac(password, salt...) */
+	hmac_pw_salt = hmac_pw;
+	scrypt_hmac_update(&hmac_pw_salt, salt, salt_len);
+
+	blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE;
+	for (i = 1; i <= blocks; i++) {
+		/* U1 = hmac(password, salt || be(i)) */
+		U32TO8_BE(be, i);
+		/* scrypt_hmac_finish wipes its state, so always work on a copy of the keyed prefix */
+		work = hmac_pw_salt;
+		scrypt_hmac_update(&work, be, 4);
+		scrypt_hmac_finish(&work, ti);
+		memcpy(u, ti, sizeof(u));
+
+		/* T[i] = U1 ^ U2 ^ U3... ; counting from 1 avoids the N - 1 wrap-around for N == 0 */
+		for (c = 1; c < N; c++) {
+			/* UX = hmac(password, U{X-1}) */
+			work = hmac_pw;
+			scrypt_hmac_update(&work, u, SCRYPT_HASH_DIGEST_SIZE);
+			scrypt_hmac_finish(&work, u);
+
+			/* T[i] ^= UX */
+			for (j = 0; j < sizeof(u); j++)
+				ti[j] ^= u[j];
+		}
+
+		/* copy a full digest, or whatever is left for the last block */
+		chunk = (bytes > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : bytes;
+		memcpy(out, ti, chunk);
+		out += chunk;
+		bytes -= chunk;
+	}
+
+	scrypt_ensure_zero(ti, sizeof(ti));
+	scrypt_ensure_zero(u, sizeof(u));
+	scrypt_ensure_zero(&hmac_pw, sizeof(hmac_pw));
+	scrypt_ensure_zero(&hmac_pw_salt, sizeof(hmac_pw_salt));
+}
--- a/algo/argon2/ar2/sj/scrypt-jane-portable-x86.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-portable-x86.h
@@ -0,0 +1,463 @@
+/*
+	Feature-macro selection. Three families are derived from the CPU_* and COMPILER_*
+	macros:
+	  X86ASM_*      32-bit inline assembly paths (MSVC or GCC)
+	  X86_64ASM_*   64-bit inline assembly paths (GCC only)
+	  X86_INTRINSIC_* compiler-intrinsic paths
+	Each ISA level is gated on a minimum compiler version.
+*/
+#if defined(CPU_X86) && (defined(COMPILER_MSVC) || defined(COMPILER_GCC))
+	#define X86ASM
+
+	/* gcc 2.95 royally screws up stack alignments on variables */
+	#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS6PP)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 30000)))
+		#define X86ASM_SSE
+		#define X86ASM_SSE2
+	#endif
+	#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2005)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40102)))
+		#define X86ASM_SSSE3
+	#endif
+	#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2010SP1)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40400)))
+		#define X86ASM_AVX
+		#define X86ASM_XOP
+	#endif
+	#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2012)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40700)))
+		#define X86ASM_AVX2
+	#endif
+#endif
+
+/* 64-bit inline assembly is only enabled for GCC-compatible compilers */
+#if defined(CPU_X86_64) && defined(COMPILER_GCC)
+	#define X86_64ASM
+	#define X86_64ASM_SSE2
+	#if (COMPILER_GCC >= 40102)
+		#define X86_64ASM_SSSE3
+	#endif
+	#if (COMPILER_GCC >= 40400)
+		#define X86_64ASM_AVX
+		#define X86_64ASM_XOP
+	#endif
+	#if (COMPILER_GCC >= 40700)
+		#define X86_64ASM_AVX2
+	#endif
+#endif
+
+/* MSVC: intrinsics are used on x64 always, and on x86 only when forced */
+#if defined(COMPILER_MSVC) && (defined(CPU_X86_FORCE_INTRINSICS) || defined(CPU_X86_64))
+	#define X86_INTRINSIC
+	#if defined(CPU_X86_64) || defined(X86ASM_SSE)
+		#define X86_INTRINSIC_SSE
+	#endif
+	#if defined(CPU_X86_64) || defined(X86ASM_SSE2)
+		#define X86_INTRINSIC_SSE2
+	#endif
+	#if (COMPILER_MSVC >= COMPILER_MSVC_VS2005)
+		#define X86_INTRINSIC_SSSE3
+	#endif
+	#if (COMPILER_MSVC >= COMPILER_MSVC_VS2010SP1)
+		#define X86_INTRINSIC_AVX
+		#define X86_INTRINSIC_XOP
+	#endif
+	#if (COMPILER_MSVC >= COMPILER_MSVC_VS2012)
+		#define X86_INTRINSIC_AVX2
+	#endif
+#endif
+
+/* GCC: intrinsics only when forced, and only for ISAs enabled on the command line
+   (the compiler's own __SSE2__, __AVX__, ... macros) */
+#if defined(COMPILER_GCC) && defined(CPU_X86_FORCE_INTRINSICS)
+	#define X86_INTRINSIC
+	#if defined(__SSE__)
+		#define X86_INTRINSIC_SSE
+	#endif
+	#if defined(__SSE2__)
+		#define X86_INTRINSIC_SSE2
+	#endif
+	#if defined(__SSSE3__)
+		#define X86_INTRINSIC_SSSE3
+	#endif
+	#if defined(__AVX__)
+		#define X86_INTRINSIC_AVX
+	#endif
+	#if defined(__XOP__)
+		#define X86_INTRINSIC_XOP
+	#endif
+	#if defined(__AVX2__)
+		#define X86_INTRINSIC_AVX2
+	#endif
+#endif
+
+/* only use simd on windows (or SSE2 on gcc)! */
+/* Pull in the intrinsic headers for every enabled level and declare the short vector
+   type aliases (qmm, xmm, xmmd, xmmi, ymmi) used by the intrinsic code paths. */
+#if defined(CPU_X86_FORCE_INTRINSICS) || defined(X86_INTRINSIC)
+	#if defined(X86_INTRINSIC_SSE)
+		#include <mmintrin.h>
+		#include <xmmintrin.h>
+		typedef __m64 qmm;
+		typedef __m128 xmm;
+		typedef __m128d xmmd;
+	#endif
+	#if defined(X86_INTRINSIC_SSE2)
+		#include <emmintrin.h>
+		typedef __m128i xmmi;
+	#endif
+	#if defined(X86_INTRINSIC_SSSE3)
+		#include <tmmintrin.h>
+	#endif
+	#if defined(X86_INTRINSIC_AVX)
+		#include <immintrin.h>
+	#endif
+	#if defined(X86_INTRINSIC_XOP)
+		/* the XOP intrinsics come from a different header on MSVC than on GCC */
+		#if defined(COMPILER_MSVC)
+			#include <intrin.h>
+		#else
+			#include <x86intrin.h>
+		#endif
+	#endif
+	#if defined(X86_INTRINSIC_AVX2)
+		typedef __m256i ymmi;
+	#endif
+#endif
+
+/*
+	16-byte constant containers viewed as 16 x u8, 4 x u32 or 2 x u64.
+	With SSE2 intrinsics the second member is an xmmi so a constant can be used
+	directly as a vector; otherwise a plain integer array of the same size is used.
+*/
+#if defined(X86_INTRINSIC_SSE2)
+	typedef union packedelem8_t {
+		uint8_t u[16];
+		xmmi v;
+	} packedelem8;
+
+	typedef union packedelem32_t {
+		uint32_t u[4];
+		xmmi v;
+	} packedelem32;
+
+	typedef union packedelem64_t {
+		uint64_t u[2];
+		xmmi v;
+	} packedelem64;
+#else
+	typedef union packedelem8_t {
+		uint8_t u[16];
+		uint32_t dw[4];
+	} packedelem8;
+
+	typedef union packedelem32_t {
+		uint32_t u[4];
+		uint8_t b[16];
+	} packedelem32;
+
+	typedef union packedelem64_t {
+		uint64_t u[2];
+		uint8_t b[16];
+	} packedelem64;
+#endif
+
+#if defined(X86_INTRINSIC_SSSE3)
+	/* byte-index tables; judging by the names these are pshufb masks that rotate each
+	   32-bit lane left by 16 and by 8 bits -- confirm at the use sites */
+	static const packedelem8 ALIGN(16) ssse3_rotl16_32bit      = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}};
+	static const packedelem8 ALIGN(16) ssse3_rotl8_32bit       = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}};
+#endif
+
+/*
+	x86 inline asm for gcc/msvc. usage:
+
+	asm_naked_fn_proto(return_type, name) (type parm1, type parm2..)
+	asm_naked_fn(name)
+		a1(..)
+		a2(.., ..)
+		a3(.., .., ..)
+		64bit OR 0 parameters: a1(ret)
+		32bit AND n parameters: aret(4n), eg aret(16) for 4 parameters
+	asm_naked_fn_end(name)
+*/
+
+#if defined(X86ASM) || defined(X86_64ASM)
+
+/*
+	a1..a4 emit one instruction with 0..3 comma-separated operands, aj emits a jump.
+	MSVC: each becomes an __asm statement inside a naked function body.
+	GCC:  each becomes a string literal; asm_naked_fn opens a file-scope __asm__(...)
+	      in Intel syntax and asm_naked_fn_end closes it, switching back to AT&T.
+*/
+#if defined(COMPILER_MSVC)
+	#pragma warning(disable : 4731) /* frame pointer modified by inline assembly */
+	#define a1(x) __asm {x}
+	#define a2(x, y) __asm {x, y}
+	#define a3(x, y, z) __asm {x, y, z}
+	#define a4(x, y, z, w) __asm {x, y, z, w}
+	#define aj(x) __asm {x}
+	#define asm_align8 a1(ALIGN 8)
+	#define asm_align16 a1(ALIGN 16)
+
+	#define asm_calling_convention STDCALL
+	#define aret(n) a1(ret n)
+	#define asm_naked_fn_proto(type, fn) static NAKED type asm_calling_convention fn
+	#define asm_naked_fn(fn) {
+	#define asm_naked_fn_end(fn) }
+#elif defined(COMPILER_GCC)
+	/* stringify the operands and join them with ", "; every instruction ends in ";\n" */
+	#define GNU_AS1(x) #x ";\n"
+	#define GNU_AS2(x, y) #x ", " #y ";\n"
+	#define GNU_AS3(x, y, z) #x ", " #y ", " #z ";\n"
+	#define GNU_AS4(x, y, z, w) #x ", " #y ", " #z ", " #w ";\n"
+	/* emit the label both with and without a leading underscore */
+	#define GNU_ASFN(x) "\n_" #x ":\n" #x ":\n"
+	/* jumps are emitted in AT&T mode, then Intel syntax is switched back on */
+	#define GNU_ASJ(x) ".att_syntax prefix\n" #x "\n.intel_syntax noprefix\n"
+
+	#define a1(x) GNU_AS1(x)
+	#define a2(x, y) GNU_AS2(x, y)
+	#define a3(x, y, z) GNU_AS3(x, y, z)
+	#define a4(x, y, z, w) GNU_AS4(x, y, z, w)
+	#define aj(x) GNU_ASJ(x)
+	#define asm_align8 ".p2align 3,,7"
+	#define asm_align16 ".p2align 4,,15"
+
+	#if defined(OS_WINDOWS)
+		#define asm_calling_convention CDECL
+		#define aret(n) a1(ret)
+
+		#if defined(X86_64ASM)
+			/*
+				Win64 thunk placed in front of the function body: saves xmm6-xmm12, rdi
+				and rsi in a 136-byte stack area, moves the first four arguments from
+				rcx, rdx, r8, r9 into rdi, rsi, rdx, rcx (the registers the asm bodies
+				read), calls the body at local label 1, then restores and returns.
+			*/
+			#define asm_naked_fn(fn) ; __asm__ ( \
+				".text\n"                        \
+				asm_align16 GNU_ASFN(fn)         \
+				"subq $136, %rsp;"               \
+			 	"movdqa %xmm6, 0(%rsp);"         \
+				"movdqa %xmm7, 16(%rsp);"        \
+			 	"movdqa %xmm8, 32(%rsp);"        \
+				"movdqa %xmm9, 48(%rsp);"        \
+			 	"movdqa %xmm10, 64(%rsp);"       \
+				"movdqa %xmm11, 80(%rsp);"       \
+				"movdqa %xmm12, 96(%rsp);"       \
+				"movq %rdi, 112(%rsp);"          \
+				"movq %rsi, 120(%rsp);"          \
+				"movq %rcx, %rdi;"               \
+				"movq %rdx, %rsi;"               \
+				"movq %r8, %rdx;"                \
+				"movq %r9, %rcx;"                \
+				"call 1f;"                       \
+				"movdqa 0(%rsp), %xmm6;"         \
+				"movdqa 16(%rsp), %xmm7;"        \
+				"movdqa 32(%rsp), %xmm8;"        \
+				"movdqa 48(%rsp), %xmm9;"        \
+				"movdqa 64(%rsp), %xmm10;"       \
+				"movdqa 80(%rsp), %xmm11;"       \
+				"movdqa 96(%rsp), %xmm12;"       \
+				"movq 112(%rsp), %rdi;"          \
+				"movq 120(%rsp), %rsi;"          \
+				"addq $136, %rsp;"               \
+				"ret;"                           \
+				".intel_syntax noprefix;"        \
+				".p2align 4,,15;"                \
+				"1:;"
+		#else
+			#define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn)
+		#endif
+	#else
+		#define asm_calling_convention STDCALL
+		#define aret(n) a1(ret n)
+		#define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn)
+	#endif
+
+	/* the prototype is a plain extern declaration; the body is the file-scope asm above */
+	#define asm_naked_fn_proto(type, fn) extern type asm_calling_convention fn
+	#define asm_naked_fn_end(fn) ".att_syntax prefix;\n" );
+
+	/* helpers for ordinary (in-function) extended asm written in Intel syntax:
+	   asm_gcc() ... asm_gcc_parms() : outputs : inputs : clobbers asm_gcc_end() */
+	#define asm_gcc() __asm__ __volatile__(".intel_syntax noprefix;\n"
+	#define asm_gcc_parms() ".att_syntax prefix;"
+	#define asm_gcc_trashed() __asm__ __volatile__("" :::
+	#define asm_gcc_end() );
+#else
+	/* deliberate compile error: asm was requested on an unsupported compiler */
+	need x86 asm
+#endif
+
+#endif /* X86ASM || X86_64ASM */
+
+
+#if defined(CPU_X86) || defined(CPU_X86_64)
+
+/* Capability bits returned (OR-ed together) by detect_cpu; one bit per ISA extension. */
+typedef enum cpu_flags_x86_t {
+	cpu_mmx    = 0x001,
+	cpu_sse    = 0x002,
+	cpu_sse2   = 0x004,
+	cpu_sse3   = 0x008,
+	cpu_ssse3  = 0x010,
+	cpu_sse4_1 = 0x020,
+	cpu_sse4_2 = 0x040,
+	cpu_avx    = 0x080,
+	cpu_xop    = 0x100,
+	cpu_avx2   = 0x200
+} cpu_flags_x86;
+
+/* CPU vendor identifiers; cpu_nobody (0) is the "not recognised" value. */
+typedef enum cpu_vendors_x86_t {
+	cpu_nobody = 0,
+	cpu_intel  = 1,
+	cpu_amd    = 2
+} cpu_vendors_x86;
+
+/* CPUID result registers, laid out in the order get_cpuid stores them (offsets 0, 4, 8, 12). */
+typedef struct x86_regs_t {
+	uint32_t eax;
+	uint32_t ebx;
+	uint32_t ecx;
+	uint32_t edx;
+} x86_regs;
+
+#if defined(X86ASM)
+/*
+	32-bit only: returns 1 when bit 21 of EFLAGS (mask 0x200000) can be toggled,
+	0 otherwise. detect_cpu uses the result to decide whether CPUID may be executed.
+	The original EFLAGS value is restored before returning.
+*/
+asm_naked_fn_proto(int, has_cpuid)(void)
+asm_naked_fn(has_cpuid)
+	/* eax = ecx = current EFLAGS */
+	a1(pushfd)
+	a1(pop eax)
+	a2(mov ecx, eax)
+	/* try to flip bit 21 and read EFLAGS back */
+	a2(xor eax, 0x200000)
+	a1(push eax)
+	a1(popfd)
+	a1(pushfd)
+	a1(pop eax)
+	/* eax = 1 if bit 21 differs from the saved value, i.e. the flip stuck */
+	a2(xor eax, ecx)
+	a2(shr eax, 21)
+	a2(and eax, 1)
+	/* restore the caller's EFLAGS */
+	a1(push ecx)
+	a1(popfd)
+	a1(ret)
+asm_naked_fn_end(has_cpuid)
+#endif /* X86ASM */
+
+
+/*
+	Executes CPUID for leaf `flags` and stores eax, ebx, ecx, edx into *regs.
+
+	GCC path: ecx is zeroed before the instruction (sub-leaf 0) and the bx register is
+	saved and restored by hand around it instead of being listed as a clobber. The asm
+	writes through `regs`, which is only passed as an input register, so "memory" is
+	listed as a clobber to tell the compiler that *regs changes.
+*/
+static void NOINLINE
+get_cpuid(x86_regs *regs, uint32_t flags) {
+#if defined(COMPILER_MSVC)
+	__cpuid((int *)regs, (int)flags);
+#else
+	#if defined(CPU_X86_64)
+		#define cpuid_bx rbx
+	#else
+		#define cpuid_bx ebx
+	#endif
+
+	asm_gcc()
+		a1(push cpuid_bx)
+		a2(xor ecx, ecx)
+		a1(cpuid)
+		a2(mov [%1 + 0], eax)
+		a2(mov [%1 + 4], ebx)
+		a2(mov [%1 + 8], ecx)
+		a2(mov [%1 + 12], edx)
+		a1(pop cpuid_bx)
+		asm_gcc_parms() : "+a"(flags) : "S"(regs)  : "%ecx", "%edx", "cc", "memory"
+	asm_gcc_end()
+#endif
+}
+
+#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
+/*
+	Executes XGETBV with ecx = flags and returns the 64-bit result (edx:eax).
+	detect_cpu only calls this after CPUID.1:ECX bit 27 is seen set.
+*/
+static uint64_t NOINLINE
+get_xgetbv(uint32_t flags) {
+#if defined(COMPILER_MSVC)
+	return _xgetbv(flags);
+#else
+	uint32_t lo, hi;
+	asm_gcc()
+		a1(xgetbv)
+		asm_gcc_parms() : "+c"(flags), "=a" (lo), "=d" (hi)
+	asm_gcc_end()
+	/* combine the two 32-bit halves */
+	return ((uint64_t)lo | ((uint64_t)hi << 32));
+#endif
+}
+#endif // AVX support
+
+#if defined(SCRYPT_TEST_SPEED)
+/* Benchmark builds only: detect_cpu ANDs its result with this mask, so clearing bits
+   hides the corresponding CPU features. Defaults to all ones (nothing hidden). */
+size_t cpu_detect_mask = (size_t)-1;
+#endif
+
+/*
+	Queries CPUID (and XGETBV where available) and returns the set of supported
+	extensions as an OR of cpu_flags_x86 bits; 0 means none were detected.
+
+	CPUID leaf 1 bits used: ECX 0 = SSE3, 9 = SSSE3, 19 = SSE4.1, 20 = SSE4.2,
+	27 = OSXSAVE, 28 = AVX; EDX 23 = MMX, 25 = SSE, 26 = SSE2.
+	AVX is only reported when the OS has enabled both XMM and YMM state in XCR0
+	(bits 1 and 2). AVX2 (leaf 7, EBX bit 5) and XOP (leaf 0x80000001, ECX bit 11)
+	are only probed when AVX is usable.
+*/
+static size_t
+detect_cpu(void) {
+	//union { uint8_t s[12]; uint32_t i[3]; } vendor_string;
+	//cpu_vendors_x86 vendor = cpu_nobody;
+	x86_regs regs;
+	uint32_t max_level, max_ext_level;
+	size_t cpu_flags = 0;
+#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
+	uint64_t xgetbv_flags;
+#endif
+
+#if defined(CPU_X86)
+	/* 32-bit CPUs may predate CPUID entirely */
+	if (!has_cpuid())
+		return cpu_flags;
+#endif
+
+	/* leaf 0: eax = highest supported standard leaf */
+	get_cpuid(&regs, 0);
+	max_level = regs.eax;
+#if 0
+	vendor_string.i[0] = regs.ebx;
+	vendor_string.i[1] = regs.edx;
+	vendor_string.i[2] = regs.ecx;
+
+	if (scrypt_verify(vendor_string.s, (const uint8_t *)"GenuineIntel", 12))
+		vendor = cpu_intel;
+	else if (scrypt_verify(vendor_string.s, (const uint8_t *)"AuthenticAMD", 12))
+		vendor = cpu_amd;
+#endif
+	if (max_level & 0x00000500) {
+		/* "Intel P5 pre-B0" */
+		cpu_flags |= cpu_mmx;
+		return cpu_flags;
+	}
+
+	if (max_level < 1)
+		return cpu_flags;
+
+	get_cpuid(&regs, 1);
+#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
+	/* xsave/xrestore */
+	if (regs.ecx & (1 << 27)) {
+		xgetbv_flags = get_xgetbv(0);
+		/* both the SSE (bit 1) and AVX (bit 2) state components must be OS-enabled */
+		if ((regs.ecx & (1 << 28)) && ((xgetbv_flags & 0x6) == 0x6)) cpu_flags |= cpu_avx;
+	}
+#endif
+	if (regs.ecx & (1 << 20)) cpu_flags |= cpu_sse4_2;
+	if (regs.ecx & (1 << 19)) cpu_flags |= cpu_sse4_1;
+	if (regs.ecx & (1 <<  9)) cpu_flags |= cpu_ssse3;
+	if (regs.ecx & (1      )) cpu_flags |= cpu_sse3;
+	if (regs.edx & (1 << 26)) cpu_flags |= cpu_sse2;
+	if (regs.edx & (1 << 25)) cpu_flags |= cpu_sse;
+	if (regs.edx & (1 << 23)) cpu_flags |= cpu_mmx;
+
+	if (cpu_flags & cpu_avx) {
+		if (max_level >= 7) {
+			get_cpuid(&regs, 7);
+			if (regs.ebx & (1 << 5)) cpu_flags |= cpu_avx2;
+		}
+
+		/* extended leaves: 0x80000000 reports the highest one available */
+		get_cpuid(&regs, 0x80000000);
+		max_ext_level = regs.eax;
+		if (max_ext_level >= 0x80000001) {
+			get_cpuid(&regs, 0x80000001);
+			if (regs.ecx & (1 << 11)) cpu_flags |= cpu_xop;
+		}
+	}
+
+
+#if defined(SCRYPT_TEST_SPEED)
+	/* benchmark builds can hide features to time the slower code paths */
+	cpu_flags &= cpu_detect_mask;
+#endif
+
+	return cpu_flags;
+}
+
+#if defined(SCRYPT_TEST_SPEED)
+/* Returns the display name of the most capable extension set in flag, or "Basic" when
+   none of the known bits is set. Checks run from newest to oldest. */
+static const char *
+get_top_cpuflag_desc(size_t flag) {
+	if (flag & cpu_avx2)
+		return "AVX2";
+	if (flag & cpu_xop)
+		return "XOP";
+	if (flag & cpu_avx)
+		return "AVX";
+	if (flag & cpu_sse4_2)
+		return "SSE4.2";
+	if (flag & cpu_sse4_1)
+		return "SSE4.1";
+	if (flag & cpu_ssse3)
+		return "SSSE3";
+	if (flag & cpu_sse2)
+		return "SSE2";
+	if (flag & cpu_sse)
+		return "SSE";
+	if (flag & cpu_mmx)
+		return "MMX";
+	return "Basic";
+}
+#endif
+
+/* enable the highest system-wide option */
+/* With compile-time selection, drop every asm/intrinsic level the compiler was not told
+   to target (its __AVX2__, __XOP__, ... macro is absent), so only code for the build's
+   own instruction set remains enabled. */
+#if defined(SCRYPT_CHOOSE_COMPILETIME)
+	#if !defined(__AVX2__)
+		#undef X86_64ASM_AVX2
+		#undef X86ASM_AVX2
+		#undef X86_INTRINSIC_AVX2
+	#endif
+	#if !defined(__XOP__)
+		#undef X86_64ASM_XOP
+		#undef X86ASM_XOP
+		#undef X86_INTRINSIC_XOP
+	#endif
+	#if !defined(__AVX__)
+		#undef X86_64ASM_AVX
+		#undef X86ASM_AVX
+		#undef X86_INTRINSIC_AVX
+	#endif
+	#if !defined(__SSSE3__)
+		#undef X86_64ASM_SSSE3
+		#undef X86ASM_SSSE3
+		#undef X86_INTRINSIC_SSSE3
+	#endif
+	#if !defined(__SSE2__)
+		#undef X86_64ASM_SSE2
+		#undef X86ASM_SSE2
+		#undef X86_INTRINSIC_SSE2
+	#endif
+#endif
+
+#endif /* defined(CPU_X86) || defined(CPU_X86_64) */
--- a/algo/argon2/ar2/sj/scrypt-jane-portable.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-portable.h
@@ -0,0 +1,310 @@
+/* determine os */
+/* Defines exactly one of OS_WINDOWS, OS_SOLARIS or OS_NIX (the fallback), pulls in the
+   system headers needed on that platform, and refines OS_NIX into OS_LINUX or OS_BSD
+   (the latter further into OS_OSX / OS_MAC / OS_OPENBSD). */
+#if defined(_WIN32)	|| defined(_WIN64) || defined(__TOS_WIN__) || defined(__WINDOWS__)
+	#include <windows.h>
+	#include <wincrypt.h>
+	#define OS_WINDOWS
+#elif defined(sun) || defined(__sun) || defined(__SVR4) || defined(__svr4__)
+	#include <sys/mman.h>
+	#include <sys/time.h>
+	#include <fcntl.h>
+
+	#define OS_SOLARIS
+#else
+	#include <sys/mman.h>
+	#include <sys/time.h>
+	#include <sys/param.h> /* need this to define BSD */
+	#include <unistd.h>
+	#include <fcntl.h>
+
+	#define OS_NIX
+	#if defined(__linux__)
+		#include <endian.h>
+		#define OS_LINUX
+	#elif defined(BSD)
+		#define OS_BSD
+
+		/* logical AND: both Apple macros must be defined */
+		#if defined(MACOS_X) || (defined(__APPLE__) && defined(__MACH__))
+			#define OS_OSX
+		#elif defined(macintosh) || defined(Macintosh)
+			#define OS_MAC
+		#elif defined(__OpenBSD__)
+			#define OS_OPENBSD
+		#endif
+	#endif
+#endif
+
+
+/* determine compiler */
+/* MSVC: defines COMPILER_MSVC as a 9-digit version number comparable with the
+   COMPILER_MSVC_* constants, supplies the fixed-width integer typedefs, and maps the
+   portability macros (ROTL/ROTR, NOINLINE, INLINE, calling conventions, ALIGN). */
+#if defined(_MSC_VER)
+	#define COMPILER_MSVC_VS6       120000000
+	#define COMPILER_MSVC_VS6PP     121000000
+	#define COMPILER_MSVC_VS2002    130000000
+	#define COMPILER_MSVC_VS2003    131000000
+	#define COMPILER_MSVC_VS2005    140050727
+	#define COMPILER_MSVC_VS2008    150000000
+	#define COMPILER_MSVC_VS2008SP1 150030729
+	#define COMPILER_MSVC_VS2010    160000000
+	#define COMPILER_MSVC_VS2010SP1 160040219
+	#define COMPILER_MSVC_VS2012RC  170000000
+	#define COMPILER_MSVC_VS2012    170050727
+
+	/* shorter _MSC_FULL_VER values are scaled by 10 to line up with the constants above */
+	#if _MSC_FULL_VER > 100000000
+		#define COMPILER_MSVC (_MSC_FULL_VER)
+	#else
+		#define COMPILER_MSVC (_MSC_FULL_VER * 10)
+	#endif
+
+	/* _MSC_VER 1200 with _mm_free available is treated as the "VS6PP" level */
+	#if ((_MSC_VER == 1200) && defined(_mm_free))
+		#undef COMPILER_MSVC
+		#define COMPILER_MSVC COMPILER_MSVC_VS6PP
+	#endif
+
+	#pragma warning(disable : 4127) /* conditional expression is constant */
+	#pragma warning(disable : 4100) /* unreferenced formal parameter */
+
+	#ifndef _CRT_SECURE_NO_WARNINGS
+	#define _CRT_SECURE_NO_WARNINGS
+	#endif
+
+	#include <float.h>
+	#include <stdlib.h> /* _rotl */
+	#include <intrin.h>
+
+	typedef unsigned char uint8_t;
+	typedef unsigned short uint16_t;
+	typedef unsigned int uint32_t;
+	typedef signed int int32_t;
+	typedef unsigned __int64 uint64_t;
+	typedef signed __int64 int64_t;
+
+	#define ROTL32(a,b) _rotl(a,b)
+	#define ROTR32(a,b) _rotr(a,b)
+	#define ROTL64(a,b) _rotl64(a,b)
+	#define ROTR64(a,b) _rotr64(a,b)
+	#undef NOINLINE
+	#define NOINLINE __declspec(noinline)
+	/* NORETURN expands to nothing on MSVC */
+	#undef NORETURN
+	#define NORETURN
+	#undef INLINE
+	#define INLINE __forceinline
+	#undef FASTCALL
+	#define FASTCALL __fastcall
+	#undef CDECL
+	#define CDECL __cdecl
+	#undef STDCALL
+	#define STDCALL __stdcall
+	#undef NAKED
+	#define NAKED __declspec(naked)
+	#define ALIGN(n) __declspec(align(n))
+#endif
+#if defined(__ICC)
+	#define COMPILER_INTEL
+#endif
+/* GCC-compatible compilers: COMPILER_GCC is major * 10000 + minor * 100 + patchlevel,
+   so e.g. 4.7.0 is 40700. Attribute macros are gated on the version that introduced them. */
+#if defined(__GNUC__)
+	#if (__GNUC__ >= 3)
+		#define COMPILER_GCC_PATCHLEVEL __GNUC_PATCHLEVEL__
+	#else
+		#define COMPILER_GCC_PATCHLEVEL 0
+	#endif
+	#define COMPILER_GCC (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + COMPILER_GCC_PATCHLEVEL)
+	/* Rotates. The count argument is parenthesized everywhere so an expression such as
+	   ROTL32(x, n + 1) expands correctly. The count must be in 1..width-1: a count of 0
+	   would shift by the full width, which is undefined. Both arguments are evaluated twice. */
+	#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
+	#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - (b))))
+	#define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - (b))))
+	#define ROTR64(a,b) (((a) >> (b)) | ((a) << (64 - (b))))
+	#undef NOINLINE
+	#if (COMPILER_GCC >= 30000)
+		#define NOINLINE __attribute__((noinline))
+	#else
+		#define NOINLINE
+	#endif
+	#undef NORETURN
+	#if (COMPILER_GCC >= 30000)
+		#define NORETURN __attribute__((noreturn))
+	#else
+		#define NORETURN
+	#endif
+	#undef INLINE
+	#if (COMPILER_GCC >= 30000)
+		#define INLINE __attribute__((always_inline))
+	#else
+		#define INLINE inline
+	#endif
+	#undef FASTCALL
+	#if (COMPILER_GCC >= 30400)
+		#define FASTCALL __attribute__((fastcall))
+	#else
+		#define FASTCALL
+	#endif
+	#undef CDECL
+	#define CDECL __attribute__((cdecl))
+	#undef STDCALL
+	#define STDCALL __attribute__((stdcall))
+	#define ALIGN(n) __attribute__((aligned(n)))
+	#include <stdint.h>
+#endif
+#if defined(__MINGW32__) || defined(__MINGW64__)
+	#define COMPILER_MINGW
+#endif
+#if defined(__PATHCC__)
+	#define COMPILER_PATHCC
+#endif
+
+/* OPTIONAL_INLINE is defined (empty) immediately before the test below, so the first
+   branch is always taken and OPTIONAL_INLINE ends up as INLINE. Removing the first
+   #define would leave it empty instead. */
+#define OPTIONAL_INLINE
+#if defined(OPTIONAL_INLINE)
+	#undef OPTIONAL_INLINE
+	#define OPTIONAL_INLINE INLINE
+#else
+	#define OPTIONAL_INLINE
+#endif
+
+/* decoration for functions declared with CRYPTO_FN: never inlined, stdcall where STDCALL is non-empty */
+#define CRYPTO_FN NOINLINE STDCALL
+
+/* determine cpu */
+/* x86 family: CPU_X86_64 for 64-bit, otherwise CPU_X86 set to 500 / 400 / 300 by
+   generation (i586+ / i486 / i386), or CPU_IA64 for Itanium. */
+#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__ ) || defined(_M_X64)
+	#define CPU_X86_64
+#elif defined(__i586__) || defined(__i686__) || (defined(_M_IX86) && (_M_IX86 >= 500))
+	#define CPU_X86 500
+#elif defined(__i486__) || (defined(_M_IX86) && (_M_IX86 >= 400))
+	#define CPU_X86 400
+#elif defined(__i386__) || (defined(_M_IX86) && (_M_IX86 >= 300)) || defined(__X86__) || defined(_X86_) || defined(__I86__)
+	#define CPU_X86 300
+#elif defined(__ia64__) || defined(_IA64) || defined(__IA64__) || defined(_M_IA64) || defined(__ia64)
+	#define CPU_IA64
+#endif
+
+#if defined(__sparc__) || defined(__sparc) || defined(__sparcv9)
+	#define CPU_SPARC
+	#if defined(__sparcv9)
+		#define CPU_SPARC64
+	#endif
+#endif
+
+/* On 64-bit targets the calling-convention decorations are blanked out. */
+#if defined(CPU_X86_64) || defined(CPU_IA64) || defined(CPU_SPARC64) || defined(__64BIT__) || defined(__LP64__) || defined(_LP64) || (defined(_MIPS_SZLONG) && (_MIPS_SZLONG == 64))
+	#define CPU_64BITS
+	#undef FASTCALL
+	#define FASTCALL
+	#undef CDECL
+	#define CDECL
+	#undef STDCALL
+	#define STDCALL
+#endif
+
+#if defined(powerpc) || defined(__PPC__) || defined(__ppc__) || defined(_ARCH_PPC) || defined(__powerpc__) || defined(__powerpc) || defined(POWERPC) || defined(_M_PPC)
+	#define CPU_PPC
+	#if defined(_ARCH_PWR7)
+		#define CPU_POWER7
+	#elif defined(__64BIT__)
+		#define CPU_PPC64
+	#else
+		#define CPU_PPC32
+	#endif
+#endif
+
+#if defined(__hppa__) || defined(__hppa)
+	#define CPU_HPPA
+#endif
+
+#if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA)
+	#define CPU_ALPHA
+#endif
+
+/* endian */
+
+#if ((defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && (__BYTE_ORDER == __LITTLE_ENDIAN)) || \
+	 (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN)) || \
+	 (defined(CPU_X86) || defined(CPU_X86_64)) || \
+	 (defined(vax) || defined(MIPSEL) || defined(_MIPSEL)))
+#define CPU_LE
+#elif ((defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && (__BYTE_ORDER == __BIG_ENDIAN)) || \
+	   (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)) || \
+	   (defined(CPU_SPARC) || defined(CPU_PPC) || defined(mc68000) || defined(sel)) || defined(_MIPSEB))
+#define CPU_BE
+#else
+	/* unknown endian! */
+#endif
+
+
+#define U8TO32_BE(p)                                            \
+	(((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) |  \
+	 ((uint32_t)((p)[2]) <<  8) | ((uint32_t)((p)[3])      ))
+
+#define U8TO32_LE(p)                                            \
+	(((uint32_t)((p)[0])      ) | ((uint32_t)((p)[1]) <<  8) |  \
+	 ((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24))
+
+#define U32TO8_BE(p, v)                                           \
+	(p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \
+	(p)[2] = (uint8_t)((v) >>  8); (p)[3] = (uint8_t)((v)      );
+
+#define U32TO8_LE(p, v)                                           \
+	(p)[0] = (uint8_t)((v)      ); (p)[1] = (uint8_t)((v) >>  8); \
+	(p)[2] = (uint8_t)((v) >> 16); (p)[3] = (uint8_t)((v) >> 24);
+
+#define U8TO64_BE(p)                                                  \
+	(((uint64_t)U8TO32_BE(p) << 32) | (uint64_t)U8TO32_BE((p) + 4))
+
+#define U8TO64_LE(p)                                                  \
+	(((uint64_t)U8TO32_LE(p)) | ((uint64_t)U8TO32_LE((p) + 4) << 32))
+
+#define U64TO8_BE(p, v)                        \
+	U32TO8_BE((p),     (uint32_t)((v) >> 32)); \
+	U32TO8_BE((p) + 4, (uint32_t)((v)      ));
+
+#define U64TO8_LE(p, v)                        \
+	U32TO8_LE((p),     (uint32_t)((v)      )); \
+	U32TO8_LE((p) + 4, (uint32_t)((v) >> 32));
+
+#define U32_SWAP(v) {                                             \
+	(v) = (((v) << 8) & 0xFF00FF00 ) | (((v) >> 8) & 0xFF00FF );  \
+    (v) = ((v) << 16) | ((v) >> 16);                              \
+}
+
+#define U64_SWAP(v) {                                                                       \
+	(v) = (((v) <<  8) & 0xFF00FF00FF00FF00ull ) | (((v) >>  8) & 0x00FF00FF00FF00FFull );  \
+	(v) = (((v) << 16) & 0xFFFF0000FFFF0000ull ) | (((v) >> 16) & 0x0000FFFF0000FFFFull );  \
+    (v) = ((v) << 32) | ((v) >> 32);                                                        \
+}
+
+/* compares x[0..len) with y[0..len) without an early exit: returns 1 when equal, 0 otherwise */
+static int scrypt_verify(const uint8_t *x, const uint8_t *y, size_t len) {
+	uint32_t acc = 0;
+	for (; len != 0; len--)
+		acc |= (uint32_t)(*x++ ^ *y++);
+	return (int)(((acc - 1) >> 8) & 1); /* acc == 0 wraps to 0xFFFFFFFF, any 1..255 leaves bit 8 clear */
+}
+
+/*
+	Zero len bytes starting at p using stores the compiler may not elide
+	(volatile asm / volatile pointer). len == 0 is a no-op.
+*/
+static void
+scrypt_ensure_zero(void *p, size_t len) {
+#if ((defined(CPU_X86) || defined(CPU_X86_64)) && defined(COMPILER_MSVC))
+		__stosb((unsigned char *)p, 0, len);
+#elif ((defined(CPU_X86) || defined(CPU_X86_64)) && defined(COMPILER_GCC))
+	/*
+		"rep stosb" advances (e/r)di and counts (e/r)cx down to zero, so both
+		are declared as read-write operands instead of being saved with
+		push/pop: pushing inside inline asm overwrites the x86-64 red zone
+		and input-only operands must not be modified by the asm body.
+	*/
+	__asm__ __volatile__(
+		"rep stosb;\n"
+		: "+D"(p), "+c"(len)
+		: "a"(0)
+		: "cc", "memory"
+	);
+#else
+	/* volatile stores keep the loop from being dropped as a dead write */
+	volatile uint8_t *b = (volatile uint8_t *)p;
+	size_t i;
+	for (i = 0; i < len; i++)
+		b[i] = 0;
+#endif
+}
+
+#include "scrypt-jane-portable-x86.h"
+
+#if !defined(asm_calling_convention)
+#define asm_calling_convention
+#endif
--- a/algo/argon2/ar2/sj/scrypt-jane-romix-basic.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-romix-basic.h
@@ -0,0 +1,74 @@
+#if !defined(SCRYPT_CHOOSE_COMPILETIME)
+/* function type returned by scrypt_getROMix, used with cpu detection */
+typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r);
+#endif
+
+/* romix pre/post no-op: leaves the blocks untouched, both arguments are ignored */
+static void asm_calling_convention
+scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) {
+	(void)blocks; (void)nblocks;
+}
+
+/* romix pre/post endian conversion: byte-swaps every word of nblocks blocks in place unless the host is little-endian */
+static void asm_calling_convention
+scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) {
+#if !defined(CPU_LE)
+	static const union { uint8_t b[2]; uint16_t w; } endian_test = {{1,0}}; /* bytes {1,0} read back as 0x100 only when the first byte is the high one */
+	size_t i;
+	if (endian_test.w == 0x100) {
+		nblocks *= SCRYPT_BLOCK_WORDS; /* from here on nblocks counts words, not blocks */
+		for (i = 0; i < nblocks; i++) {
+			SCRYPT_WORD_ENDIAN_SWAP(blocks[i]);
+		}
+	}
+#else
+	(void)blocks; (void)nblocks; /* CPU_LE known at compile time: nothing to convert */
+#endif
+}
+
+/* chunkmix test function */
+typedef void (asm_calling_convention *chunkmixfn)(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r);
+typedef void (asm_calling_convention *blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks);
+/* self-test of one chunkmix: mixes a fixed pattern and returns scrypt_verify() of its last 16 output bytes against expected (1 on match) */
+static int
+scrypt_test_mix_instance(chunkmixfn mixfn, blockfixfn prefn, blockfixfn postfn, const uint8_t expected[16]) {
+	/* r = 2, (2 * r) = 4 blocks in a chunk, 4 * SCRYPT_BLOCK_WORDS total */
+	const uint32_t r = 2, blocks = 2 * r, words = blocks * SCRYPT_BLOCK_WORDS;
+#if (defined(X86ASM_AVX2) || defined(X86_64ASM_AVX2) || defined(X86_INTRINSIC_AVX2))
+	scrypt_mix_word_t ALIGN(32) chunk[2][4 * SCRYPT_BLOCK_WORDS], v;
+#else
+	scrypt_mix_word_t ALIGN(16) chunk[2][4 * SCRYPT_BLOCK_WORDS], v;
+#endif
+	uint8_t final[16];
+	size_t i;
+
+	for (i = 0; i < words; i++) { /* input pattern: word i is built from its own index */
+		v = (scrypt_mix_word_t)i;
+		v = (v << 8) | v;
+		v = (v << 16) | v;
+		chunk[0][i] = v;
+	}
+
+	prefn(chunk[0], blocks); /* chunk[0] is the input, chunk[1] receives the mixed output */
+	mixfn(chunk[1], chunk[0], NULL, r);
+	postfn(chunk[1], blocks);
+
+	/* grab the last 16 bytes of the final block */
+	for (i = 0; i < 16; i += sizeof(scrypt_mix_word_t)) { /* NOTE(review): scrypt_ChunkMix_basic writes only 2 blocks and ignores r, so this block may be unwritten for it - confirm */
+		SCRYPT_WORDTO8_LE(final + i, chunk[1][words - (16 / sizeof(scrypt_mix_word_t)) + (i / sizeof(scrypt_mix_word_t))]);
+	}
+
+	return scrypt_verify(expected, final, 16);
+}
+
+/* address of the i-th item in base, each item spanning len words */
+static scrypt_mix_word_t *
+scrypt_item(scrypt_mix_word_t *base, scrypt_mix_word_t i, scrypt_mix_word_t len) {
+	return &base[i * len];
+}
+
+/* address of the i-th block in base, each block spanning SCRYPT_BLOCK_WORDS words */
+static scrypt_mix_word_t *
+scrypt_block(scrypt_mix_word_t *base, scrypt_mix_word_t i) {
+	return &base[i * SCRYPT_BLOCK_WORDS];
+}
--- a/algo/argon2/ar2/sj/scrypt-jane-romix-template.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-romix-template.h
@@ -0,0 +1,122 @@
+#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX)
+
+#if defined(SCRYPT_CHOOSE_COMPILETIME)
+#undef SCRYPT_ROMIX_FN
+#define SCRYPT_ROMIX_FN scrypt_ROMix
+#endif
+
+#undef SCRYPT_HAVE_ROMIX
+#define SCRYPT_HAVE_ROMIX
+
+#if !defined(SCRYPT_CHUNKMIX_FN)
+
+#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_basic
+
+/*
+	Bout = ChunkMix(Bin), or ChunkMix(Bin ^ Bxor) when Bxor is non-NULL
+
+	the chunk is fixed at 2 blocks (r = 1); the r argument is not read
+*/
+static void asm_calling_convention
+SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r) {
+#if (defined(X86ASM_AVX2) || defined(X86_64ASM_AVX2) || defined(X86_INTRINSIC_AVX2))
+	scrypt_mix_word_t ALIGN(32) X[SCRYPT_BLOCK_WORDS], *block;
+#else
+	scrypt_mix_word_t ALIGN(16) X[SCRYPT_BLOCK_WORDS], *block;
+#endif
+	uint32_t i, j, blocksPerChunk = /*r * 2*/2, half = 0; /* half toggles 0/1 to pick the output half */
+
+	/* 1: X = B_{2r - 1} */
+	block = scrypt_block(Bin, blocksPerChunk - 1);
+	for (i = 0; i < SCRYPT_BLOCK_WORDS; i++)
+		X[i] = block[i];
+
+	if (Bxor) { /* optional second input is XORed in block by block */
+		block = scrypt_block(Bxor, blocksPerChunk - 1);
+		for (i = 0; i < SCRYPT_BLOCK_WORDS; i++)
+			X[i] ^= block[i];
+	}
+
+	/* 2: for i = 0 to 2r - 1 do */
+	for (i = 0; i < blocksPerChunk; i++, half ^= /*r*/1) {
+		/* 3: X = H(X ^ B_i) */
+		block = scrypt_block(Bin, i);
+		for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
+			X[j] ^= block[j];
+
+		if (Bxor) {
+			block = scrypt_block(Bxor, i);
+			for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
+				X[j] ^= block[j];
+		}
+		SCRYPT_MIX_FN(X); /* the mix function transforms X in place */
+
+		/* 4: Y_i = X */
+		/* 6: B'[0..r-1] = Y_even */
+		/* 6: B'[r..2r-1] = Y_odd */
+		block = scrypt_block(Bout, (i / 2) + half); /* i = 0 -> block 0, i = 1 -> block 1 */
+		for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
+			block[j] = X[j];
+	}
+}
+#endif
+
+/*
+	X = ROMix(X), specialised to N = 512 rounds and r = 1 (2 blocks per chunk)
+
+	X: chunk to mix, overwritten with the result
+	Y: scratch chunk
+	V[512]: array of chunks that is filled, then indexed by the mixed data
+	N, r: kept for the scrypt_ROMixfn signature; neither argument is read,
+	      the values 511/512 and 1 are hard-coded below
+*/
+
+static void NOINLINE FASTCALL
+SCRYPT_ROMIX_FN(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N, uint32_t r) {
+	uint32_t i, j, chunkWords = (uint32_t)(SCRYPT_BLOCK_WORDS * 2);
+	scrypt_mix_word_t *block = V;
+
+	SCRYPT_ROMIX_TANGLE_FN(X, 2); /* pre-processing hook over the 2 blocks of X */
+
+	/* 1: X = B */
+	/* implicit */
+
+	/* 2: for i = 0 to N - 1 do */
+	memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t)); /* V_0 = X */
+	for (i = 0; i < /*N - 1*/511; i++, block += chunkWords) {
+		/* 3: V_i = X */
+		/* 4: X = H(X) */
+		SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, /*r*/1); /* V_{i+1} = H(V_i), written straight into V */
+	}
+	SCRYPT_CHUNKMIX_FN(X, block, NULL, 1); /* block now points at V_511: X = H(V_511) */
+
+	/* 6: for i = 0 to N - 1 do */
+	for (i = 0; i < /*N*/512; i += 2) { /* two rounds per iteration, alternating X -> Y -> X */
+		/* 7: j = Integerify(X) % N */
+		j = X[chunkWords - SCRYPT_BLOCK_WORDS] & /*(N - 1)*/511;
+
+		/* 8: X = H(Y ^ V_j) */
+		SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), 1);
+
+		/* 7: j = Integerify(Y) % N */
+		j = Y[chunkWords - SCRYPT_BLOCK_WORDS] & /*(N - 1)*/511;
+
+		/* 8: X = H(Y ^ V_j) */
+		SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), 1);
+	}
+
+	/* 10: B' = X */
+	/* implicit */
+
+	SCRYPT_ROMIX_UNTANGLE_FN(X, 2); /* post-processing hook over the 2 blocks of X */
+}
+
+#endif /* !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) */
+
+
+#undef SCRYPT_CHUNKMIX_FN
+#undef SCRYPT_ROMIX_FN
+#undef SCRYPT_MIX_FN
+#undef SCRYPT_ROMIX_TANGLE_FN
+#undef SCRYPT_ROMIX_UNTANGLE_FN
+
--- a/algo/argon2/ar2/sj/scrypt-jane-romix.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-romix.h
@@ -0,0 +1,23 @@
+#if defined(SCRYPT_SALSA64)
+#include "scrypt-jane-salsa64.h"
+#else
+	#define SCRYPT_MIX_BASE "ERROR"
+	typedef uint32_t scrypt_mix_word_t;
+	#define SCRYPT_WORDTO8_LE U32TO8_LE
+	#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP
+	#define SCRYPT_BLOCK_BYTES 64
+	#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))
+	#if !defined(SCRYPT_CHOOSE_COMPILETIME)
+		static void FASTCALL scrypt_ROMix_error(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r) {}
+		static scrypt_ROMixfn scrypt_getROMix(void) { return scrypt_ROMix_error; }
+	#else
+		static void FASTCALL scrypt_ROMix(scrypt_mix_word_t *X, scrypt_mix_word_t *Y, scrypt_mix_word_t *V, uint32_t N, uint32_t r) {}
+	#endif
+	static int scrypt_test_mix(void) { return 0; }
+	#error must define a mix function!
+#endif
+
+#if !defined(SCRYPT_CHOOSE_COMPILETIME)
+#undef SCRYPT_MIX
+#define SCRYPT_MIX SCRYPT_MIX_BASE
+#endif
--- a/algo/argon2/ar2/sj/scrypt-jane-salsa64.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-salsa64.h
@@ -0,0 +1,183 @@
+#define SCRYPT_MIX_BASE "Salsa64/8"
+
+typedef uint64_t scrypt_mix_word_t;
+
+#define SCRYPT_WORDTO8_LE U64TO8_LE
+#define SCRYPT_WORD_ENDIAN_SWAP U64_SWAP
+
+#define SCRYPT_BLOCK_BYTES 128
+#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))
+
+/* must have these here in case block bytes is ever != 64 */
+#include "scrypt-jane-romix-basic.h"
+
+#include "scrypt-jane-mix_salsa64-avx2.h"
+#include "scrypt-jane-mix_salsa64-xop.h"
+#include "scrypt-jane-mix_salsa64-avx.h"
+#include "scrypt-jane-mix_salsa64-ssse3.h"
+#include "scrypt-jane-mix_salsa64-sse2.h"
+#include "scrypt-jane-mix_salsa64.h"
+
+#if defined(SCRYPT_SALSA64_AVX2)
+	#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx2
+	#define SCRYPT_ROMIX_FN scrypt_ROMix_avx2
+	#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
+	#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
+	#include "scrypt-jane-romix-template.h"
+#endif
+
+#if defined(SCRYPT_SALSA64_XOP)
+	#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_xop
+	#define SCRYPT_ROMIX_FN scrypt_ROMix_xop
+	#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
+	#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
+	#include "scrypt-jane-romix-template.h"
+#endif
+
+#if defined(SCRYPT_SALSA64_AVX)
+	#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx
+	#define SCRYPT_ROMIX_FN scrypt_ROMix_avx
+	#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
+	#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
+	#include "scrypt-jane-romix-template.h"
+#endif
+
+#if defined(SCRYPT_SALSA64_SSSE3)
+	#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_ssse3
+	#define SCRYPT_ROMIX_FN scrypt_ROMix_ssse3
+	#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
+	#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
+	#include "scrypt-jane-romix-template.h"
+#endif
+
+#if defined(SCRYPT_SALSA64_SSE2)
+	#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2
+	#define SCRYPT_ROMIX_FN scrypt_ROMix_sse2
+	#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
+	#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
+	#include "scrypt-jane-romix-template.h"
+#endif
+
+/* cpu agnostic */
+#define SCRYPT_ROMIX_FN scrypt_ROMix_basic
+#define SCRYPT_MIX_FN salsa64_core_basic
+#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian
+#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian
+#include "scrypt-jane-romix-template.h"
+/* runtime dispatch: returns the first compiled-in ROMix whose cpu flag detect_cpu() reports, else the basic one */
+#if !defined(SCRYPT_CHOOSE_COMPILETIME)
+static scrypt_ROMixfn
+scrypt_getROMix(void) {
+	size_t cpuflags = detect_cpu();
+
+	/* each enabled section contributes one "if ... else" link; the chain ends at the final return */
+#if defined(SCRYPT_SALSA64_AVX2)
+	if (cpuflags & cpu_avx2)
+		return scrypt_ROMix_avx2;
+	else
+#endif
+
+#if defined(SCRYPT_SALSA64_XOP)
+	if (cpuflags & cpu_xop)
+		return scrypt_ROMix_xop;
+	else
+#endif
+
+#if defined(SCRYPT_SALSA64_AVX)
+	if (cpuflags & cpu_avx)
+		return scrypt_ROMix_avx;
+	else
+#endif
+
+#if defined(SCRYPT_SALSA64_SSSE3)
+	if (cpuflags & cpu_ssse3)
+		return scrypt_ROMix_ssse3;
+	else
+#endif
+
+#if defined(SCRYPT_SALSA64_SSE2)
+	if (cpuflags & cpu_sse2)
+		return scrypt_ROMix_sse2;
+	else
+#endif
+
+	return scrypt_ROMix_basic;
+}
+#endif
+
+/* returns the OR of the cpu_* flags that are both compiled in and reported by detect_cpu() (0 when none) */
+#if defined(SCRYPT_TEST_SPEED)
+static size_t
+available_implementations(void) {
+	size_t cpuflags = detect_cpu();
+	size_t flags = 0;
+
+#if defined(SCRYPT_SALSA64_AVX2)
+	if (cpuflags & cpu_avx2)
+		flags |= cpu_avx2;
+#endif
+
+#if defined(SCRYPT_SALSA64_XOP)
+	if (cpuflags & cpu_xop)
+		flags |= cpu_xop;
+#endif
+
+#if defined(SCRYPT_SALSA64_AVX)
+	if (cpuflags & cpu_avx)
+		flags |= cpu_avx;
+#endif
+
+#if defined(SCRYPT_SALSA64_SSSE3)
+	if (cpuflags & cpu_ssse3)
+		flags |= cpu_ssse3;
+#endif
+
+#if defined(SCRYPT_SALSA64_SSE2)
+	if (cpuflags & cpu_sse2)
+		flags |= cpu_sse2;
+#endif
+
+	return flags;
+}
+#endif
+/* runs scrypt_test_mix_instance on every compiled-in, cpu-supported chunkmix; returns 1 only if all of them match */
+static int
+scrypt_test_mix(void) {
+	static const uint8_t expected[16] = {
+		0xf8,0x92,0x9b,0xf8,0xcc,0x1d,0xce,0x2e,0x13,0x82,0xac,0x96,0xb2,0x6c,0xee,0x2c,
+	};
+
+	int ret = 1; /* stays 1 when no section below is compiled in or supported */
+	size_t cpuflags = detect_cpu();
+
+#if defined(SCRYPT_SALSA64_AVX2)
+	if (cpuflags & cpu_avx2)
+		ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx2, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
+#endif
+
+#if defined(SCRYPT_SALSA64_XOP)
+	if (cpuflags & cpu_xop)
+		ret &= scrypt_test_mix_instance(scrypt_ChunkMix_xop, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
+#endif
+
+#if defined(SCRYPT_SALSA64_AVX)
+	if (cpuflags & cpu_avx)
+		ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
+#endif
+
+#if defined(SCRYPT_SALSA64_SSSE3)
+	if (cpuflags & cpu_ssse3)
+		ret &= scrypt_test_mix_instance(scrypt_ChunkMix_ssse3, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
+#endif
+
+#if defined(SCRYPT_SALSA64_SSE2)
+	if (cpuflags & cpu_sse2)
+		ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
+#endif
+
+#if defined(SCRYPT_SALSA64_BASIC)
+	ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected); /* no cpu flag needed for the portable version */
+#endif
+
+	return ret;
+}
+
--- a/algo/argon2/ar2/sj/scrypt-jane-test-vectors.h
+++ b/algo/argon2/ar2/sj/scrypt-jane-test-vectors.h
@@ -0,0 +1,28 @@
+typedef struct scrypt_test_setting_t {
+	const char *pw, *salt;
+	uint8_t Nfactor, rfactor, pfactor;
+} scrypt_test_setting;
+
+static const scrypt_test_setting post_settings[] = {
+	{"", "", 3, 0, 0},
+	{"password", "NaCl", 9, 3, 4},
+	{0, 0, 0, 0, 0}
+};
+
+#if defined(SCRYPT_SKEIN512)
+	#if defined(SCRYPT_SALSA64)
+		static const uint8_t post_vectors[][64] = {
+			{0xd2,0xad,0x32,0x05,0xee,0x80,0xe3,0x44,0x70,0xc6,0x34,0xde,0x05,0xb6,0xcf,0x60,
+			 0x89,0x98,0x70,0xc0,0xb8,0xf5,0x54,0xf1,0xa6,0xb2,0xc8,0x76,0x34,0xec,0xc4,0x59,
+			 0x8e,0x64,0x42,0xd0,0xa9,0xed,0xe7,0x19,0xb2,0x8a,0x11,0xc6,0xa6,0xbf,0xa7,0xa9,
+			 0x4e,0x44,0x32,0x7e,0x12,0x91,0x9d,0xfe,0x52,0x48,0xa8,0x27,0xb3,0xfc,0xb1,0x89},
+			{0xd6,0x67,0xd2,0x3e,0x30,0x1e,0x9d,0xe2,0x55,0x68,0x17,0x3d,0x2b,0x75,0x5a,0xe5,
+			 0x04,0xfb,0x3d,0x0e,0x86,0xe0,0xaa,0x1d,0xd4,0x72,0xda,0xb0,0x79,0x41,0xb7,0x99,
+			 0x68,0xe5,0xd9,0x55,0x79,0x7d,0xc3,0xd1,0xa6,0x56,0xc1,0xbe,0x0b,0x6c,0x62,0x23,
+			 0x66,0x67,0x91,0x47,0x99,0x13,0x6b,0xe3,0xda,0x59,0x55,0x18,0x67,0x8f,0x2e,0x3b}
+		};
+	#endif
+#else
+	static const uint8_t post_vectors[][64] = {{0}};
+#endif
+
--- a/algo/argon2/argon2a.c
+++ b/algo/argon2/argon2a.c
@@ -0,0 +1,92 @@
+#include "miner.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <openssl/sha.h>
+#include "ar2/argon2.h"
+#include "ar2/cores.h"
+#include "ar2/ar2-scrypt-jane.h"
+#include "algo-gate-api.h"
+
+#define T_COSTS 2
+#define M_COSTS 16
+#define MASK 8
+#define ZERO 0
+
+/* Points an argon2_context at out/in/salt (pwdlen 0, no allocator callbacks) and runs argon2_core with type.
+   static: a plain C99 `inline` definition emits no external symbol, so a call that is not inlined would not link. */
+static inline void argon_call(void *out, void *in, void *salt, int type)
+{
+	argon2_context context;
+	context.out = (uint8_t *)out;
+	context.pwd = (uint8_t *)in;
+	context.salt = (uint8_t*)salt;
+	context.pwdlen = 0;
+	context.allocate_cbk = NULL;
+	context.free_cbk = NULL;
+	argon2_core(&context, type);
+}
+
+/* hash chain over an 80-byte input: my_scrypt -> argon_call -> my_scrypt, result written to output */
+void argon2hash(void *output, const void *input)
+{
+	uint32_t _ALIGN(64) hashA[8], hashB[8];
+	const unsigned char *header = (const unsigned char *)input;
+	int type;
+
+	my_scrypt(header, 80, header, 80, (unsigned char *)hashA);
+	/* type is 1 when the MASK bit of the first scrypt word is clear, 0 otherwise */
+	type = (hashA[0] & MASK) == ZERO;
+	argon_call(hashB, hashA, hashA, type);
+	my_scrypt((const unsigned char *)hashB, 32, (const unsigned char *)hashB, 32,
+		(unsigned char *)output);
+}
+
+/* Hashes nonces from work->data[19] until max_nonce or a restart request. Returns 1 with the winning
+   nonce stored in work->data[19] when a hash meets work->target, else 0. *hashes_done gets the hash count. */
+int scanhash_argon2(int thr_id, struct work* work, uint32_t max_nonce, uint64_t *hashes_done)
+{
+	uint32_t _ALIGN(64) endiandata[20];
+	uint32_t _ALIGN(64) hash[8];
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	const uint32_t first_nonce = pdata[19];
+	const uint32_t Htarg = ptarget[7];
+	uint32_t nonce = first_nonce;
+
+        swab32_array( endiandata, pdata, 20 );
+
+	do {
+		be32enc(&endiandata[19], nonce);
+		argon2hash(hash, endiandata);
+		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
+			pdata[19] = nonce;
+			*hashes_done = nonce - first_nonce + 1; /* first_nonce..nonce inclusive were hashed */
+			work_set_target_ratio(work, hash);
+			return 1;
+		}
+		nonce++;
+	} while (nonce < max_nonce && !work_restart[thr_id].restart);
+
+	pdata[19] = nonce;
+	*hashes_done = pdata[19] - first_nonce + 1;
+	return 0;
+}
+/* installed as gate->get_max64; NOTE(review): presumably the per-scan nonce budget - confirm against the gate's caller */
+int64_t argon2_get_max64 ()
+{
+  return 0x1ffLL;
+}
+/* installs the argon2 handlers and optimization flags into the algo gate; always returns true */
+bool register_argon2_algo( algo_gate_t* gate )
+{
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->scanhash        = (void*)&scanhash_argon2;
+  gate->hash            = (void*)&argon2hash;
+  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
+  gate->set_target      = (void*)&scrypt_set_target;
+  gate->get_max64       = (void*)&argon2_get_max64;
+  return true;
+}
+
--- a/algo/axiom.c
+++ b/algo/axiom.c
@@ -0,0 +1,88 @@
+#include "miner.h"
+#include "algo-gate-api.h"
+
+#include <string.h>
+#include <stdint.h>
+
+#include "algo/shabal/sph_shabal.h"
+
+static __thread uint32_t _ALIGN(128) M[65536][8];
+/* axiom hash of an 80-byte input: builds a 65536-entry shabal256 chain in the per-thread table M, remixes it, outputs M[N-1] (32 bytes) */
+void axiomhash(void *output, const void *input)
+{
+	sph_shabal256_context ctx;
+	const int N = 65536;
+
+	sph_shabal256_init(&ctx);
+	sph_shabal256(&ctx, input, 80);
+	sph_shabal256_close(&ctx, M[0]); /* M[0] = shabal256(input) */
+
+	for(int i = 1; i < N; i++) { /* M[i] = shabal256(M[i-1]) */
+		sph_shabal256_init(&ctx);
+		sph_shabal256(&ctx, M[i-1], 32);
+		sph_shabal256_close(&ctx, M[i]);
+	}
+
+	for(int b = 0; b < N; b++) /* second pass: M[b] = shabal256(M[p] || M[j]) */
+	{
+		const int p = b > 0 ? b - 1 : 0xFFFF; /* previous entry, wrapping to the last one for b == 0 */
+		const int q = M[p][0] % 0xFFFF;
+		const int j = (b + q) % N; /* data-dependent partner entry */
+
+		sph_shabal256_init(&ctx);
+#if 0
+		sph_shabal256(&ctx, M[p], 32);
+		sph_shabal256(&ctx, M[j], 32);
+#else
+		uint8_t _ALIGN(128) hash[64];
+		memcpy(hash, M[p], 32);
+		memcpy(&hash[32], M[j], 32);
+		sph_shabal256(&ctx, hash, 64);
+#endif
+		sph_shabal256_close(&ctx, M[b]);
+	}
+	memcpy(output, M[N-1], 32);
+}
+/* Hashes nonces from work->data[19] until max_nonce or a restart; returns true with the nonce in work->data[19] on a hit, else 0. */
+int scanhash_axiom(int thr_id, struct work *work,
+	uint32_t max_nonce, uint64_t *hashes_done)
+{
+        uint32_t *pdata = work->data;
+        uint32_t *ptarget = work->target;
+	uint32_t _ALIGN(128) hash64[8];
+	uint32_t _ALIGN(128) endiandata[20];
+
+	const uint32_t Htarg = ptarget[7];
+	const uint32_t first_nonce = pdata[19];
+
+	uint32_t n = first_nonce;
+
+        for (int k = 0; k < 19; k++)
+                be32enc(&endiandata[k], pdata[k]);
+
+	do {
+		be32enc(&endiandata[19], n);
+		axiomhash(hash64, endiandata);
+		/* <= : a hash whose top word equals the target's can still pass fulltest */
+		if (hash64[7] <= Htarg && fulltest(hash64, ptarget)) {
+			*hashes_done = n - first_nonce + 1;
+			pdata[19] = n;
+			return true;
+		}
+		n++;
+	} while (n < max_nonce && !work_restart[thr_id].restart);
+
+	*hashes_done = n - first_nonce + 1;
+	pdata[19] = n;
+
+	return 0;
+}
+/* installs the axiom handlers into the algo gate; hash and hash_alt both use axiomhash; always returns true */
+bool register_axiom_algo( algo_gate_t* gate )
+{
+    gate->scanhash  = (void*)&scanhash_axiom;
+    gate->hash      = (void*)&axiomhash;
+    gate->hash_alt  = (void*)&axiomhash;
+    gate->get_max64 = (void*)&get_max64_0x40LL;
+    return true;
+}
--- a/algo/blake/.dirstamp
+++ b/algo/blake/.dirstamp
--- a/algo/blake/blake.c
+++ b/algo/blake/blake.c
@@ -0,0 +1,108 @@
+#include "miner.h"
+#include "algo-gate-api.h"
+#include "sph_blake.h"
+
+#include <string.h>
+#include <stdint.h>
+#include <memory.h>
+
+/* Move init out of loop, so init once externally,
+ * and then use one single memcpy */
+static __thread sph_blake256_context blake_mid;
+static __thread bool ctx_midstate_done = false;
+/* resets the per-thread midstate context and marks it as initialised */
+static void init_blake_hash(void)
+{
+	sph_blake256_init(&blake_mid);
+	ctx_midstate_done = true;
+}
+
+/* blake256 over an 80-byte input. The context after the first 64 bytes is cached in
+   blake_mid while ctx_midstate_done is set, and every call resumes from a copy of it.
+   Writes 32 bytes to state. */
+void blakehash(void *state, const void *input)
+{
+	sph_blake256_context ctx;
+	uint8_t digest[64];
+	const uint8_t *tail = (const uint8_t *)input + 64;
+
+	/* flag cleared (scanhash_blake does this): absorb the leading 64 bytes into the midstate */
+	if (!ctx_midstate_done) {
+		init_blake_hash();
+		sph_blake256(&blake_mid, input, 64);
+	}
+
+	/* resume from a private copy of the midstate over the remaining 16 bytes */
+	memcpy(&ctx, &blake_mid, sizeof(blake_mid));
+	sph_blake256(&ctx, tail, 16);
+	sph_blake256_close(&ctx, digest);
+
+	memcpy(state, digest, 32);
+}
+/* Hashes nonces from work->data[19] until max_nonce or a restart; returns true on a hit (work->data[19] holds the nonce), else 0. */
+int scanhash_blake( int thr_id, struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done )
+{
+        uint32_t *pdata = work->data;
+        uint32_t *ptarget = work->target;
+	const uint32_t first_nonce = pdata[19];
+	uint32_t HTarget = ptarget[7];
+	uint32_t _ALIGN(32) hash64[8];
+	uint32_t _ALIGN(32) endiandata[20];
+	uint32_t n = first_nonce;
+
+	ctx_midstate_done = false; /* forces blakehash to rebuild the midstate for this work */
+
+	if (opt_benchmark)
+		HTarget = 0x7f;
+
+	// we need big endian data...
+        swab32_array( endiandata, pdata, 20 );
+
+#ifdef DEBUG_ALGO
+	applog(LOG_DEBUG,"[%d] Target=%08x %08x", thr_id, ptarget[6], ptarget[7]);
+#endif
+
+	do {
+		be32enc(&endiandata[19], n);
+		blakehash(hash64, endiandata);
+#ifndef DEBUG_ALGO
+		if (hash64[7] <= HTarget && fulltest(hash64, ptarget)) {
+			*hashes_done = n - first_nonce + 1; /* pdata[19] already equals n here */
+			return true;
+		}
+#else
+		if (!(n % 0x1000) && !thr_id) printf(".");
+		if (hash64[7] == 0) {
+			printf("[%d]",thr_id);
+			if (fulltest(hash64, ptarget)) {
+				*hashes_done = n - first_nonce + 1;
+				return true;
+			}
+		}
+#endif
+		n++; pdata[19] = n; /* keeps work->data[19] in step with the next nonce to try */
+
+	} while (n < max_nonce && !work_restart[thr_id].restart);
+
+	*hashes_done = n - first_nonce + 1;
+	pdata[19] = n;
+	return 0;
+}
+
+// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
+int64_t blake_get_max64 ()
+{
+  return 0x7ffffLL; /* installed as gate->get_max64; NOTE(review): presumably the per-scan nonce budget - confirm */
+}
+/* installs the blake handlers into the algo gate; hash and hash_alt both use blakehash; always returns true */
+bool register_blake_algo( algo_gate_t* gate )
+{
+  gate->scanhash  = (void*)&scanhash_blake;
+  gate->hash      = (void*)&blakehash;
+  gate->hash_alt  = (void*)&blakehash;
+  gate->get_max64 = (void*)&blake_get_max64;
+  return true;
+}
+
+
--- a/algo/blake/blake2.c
+++ b/algo/blake/blake2.c
@@ -0,0 +1,85 @@
+#include "miner.h"
+#include "algo-gate-api.h"
+
+#include <string.h>
+#include <stdint.h>
+
+#include "crypto/blake2s.h"
+
+static __thread blake2s_state s_midstate;
+static __thread blake2s_state s_ctx;
+#define MIDLEN 76
+
+/* one-shot blake2s over an 80-byte input; copies the first 32 digest bytes to output */
+void blake2s_hash(void *output, const void *input)
+{
+	blake2s_state st;
+	unsigned char _ALIGN(64) digest[BLAKE2S_OUTBYTES];
+
+	blake2s_init(&st, BLAKE2S_OUTBYTES);
+	blake2s_update(&st, input, 80);
+	blake2s_final(&st, digest, BLAKE2S_OUTBYTES);
+	memcpy(output, digest, 32);
+}
+/* finishes a blake2s hash from the saved midstate: absorbs input bytes MIDLEN..79 and writes the digest to output */
+static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
+{
+	s_ctx.buflen = MIDLEN;
+	memcpy(&s_ctx, &s_midstate, 32 + 16 + MIDLEN); /* NOTE(review): presumably h (32) + t/f (16) + the MIDLEN buffered bytes - confirm against blake2s_state layout */
+	blake2s_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
+	blake2s_final(&s_ctx, (uint8_t*) output, BLAKE2S_OUTBYTES);
+}
+/* Hashes nonces from work->data[19] until max_nonce or a restart; returns true with the nonce in work->data[19] on a hit, else 0. */
+int scanhash_blake2s(int thr_id, struct work *work,
+	uint32_t max_nonce, uint64_t *hashes_done)
+{
+        uint32_t *pdata = work->data;
+        uint32_t *ptarget = work->target;
+
+	uint32_t _ALIGN(64) hash64[8];
+	uint32_t _ALIGN(64) endiandata[20];
+
+	const uint32_t Htarg = ptarget[7];
+	const uint32_t first_nonce = pdata[19];
+
+	uint32_t n = first_nonce;
+
+        swab32_array( endiandata, pdata, 20 );
+
+	// midstate: absorb the first MIDLEN bytes once, blake2s_hash_end resumes from it
+	blake2s_init(&s_midstate, BLAKE2S_OUTBYTES);
+	blake2s_update(&s_midstate, (uint8_t*) endiandata, MIDLEN);
+	memcpy(&s_ctx, &s_midstate, sizeof(blake2s_state));
+
+	do {
+		be32enc(&endiandata[19], n);
+		blake2s_hash_end(hash64, endiandata);
+		/* <= : a hash whose top word equals the target's can still pass fulltest */
+		if (hash64[7] <= Htarg && fulltest(hash64, ptarget)) {
+			*hashes_done = n - first_nonce + 1;
+			pdata[19] = n;
+			return true;
+		}
+		n++;
+	} while (n < max_nonce && !work_restart[thr_id].restart);
+
+	*hashes_done = n - first_nonce + 1;
+	pdata[19] = n;
+
+	return 0;
+}
+
+// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
+int64_t blake2s_get_max64 ()
+{
+   return 0x7ffffLL; /* installed as gate->get_max64; NOTE(review): presumably the per-scan nonce budget - confirm */
+}
+/* installs the blake2s handlers into the algo gate; always returns true */
+bool register_blake2s_algo( algo_gate_t* gate )
+{
+  gate->scanhash  = (void*)&scanhash_blake2s;
+  gate->hash      = (void*)&blake2s_hash;
+  gate->get_max64 = (void*)&blake2s_get_max64;
+  return true;
+}
+
--- a/algo/blake/blakecoin.c
+++ b/algo/blake/blakecoin.c
@@ -0,0 +1,129 @@
+#include "miner.h"
+#include "algo-gate-api.h"
+#define BLAKE32_ROUNDS 8
+#include "sph_blake.h"
+
+void blakecoin_init(void *cc);
+void blakecoin(void *cc, const void *data, size_t len);
+void blakecoin_close(void *cc, void *dst);
+
+#include <string.h>
+#include <stdint.h>
+#include <memory.h>
+#include <openssl/sha.h>
+
+/* Move init out of loop, so init once externally, and then use one single memcpy.
+ * Per-thread: scanhash_blakecoin (one call per thr_id) clears the flag and the midstate is rebuilt, so sharing it would race. */
+static __thread sph_blake256_context blake_mid;
+static __thread bool ctx_midstate_done = false;
+/* resets the cached midstate context via blakecoin_init and marks it as initialised */
+static void init_blake_hash(void)
+{
+	blakecoin_init(&blake_mid);
+	ctx_midstate_done = true;
+}
+/* blakecoin hash over an 80-byte input, resuming from a cached context of the
+   first 64 bytes; writes 32 bytes to state */
+void blakecoinhash(void *state, const void *input)
+{
+	sph_blake256_context ctx;
+	uint8_t digest[64];
+	const uint8_t *tail = (const uint8_t *)input + 64;
+
+	/* flag cleared (scanhash_blakecoin does this): rebuild the midstate from the leading 64 bytes */
+	if (!ctx_midstate_done) {
+		init_blake_hash();
+		blakecoin(&blake_mid, input, 64);
+	}
+
+	/* continue a private copy of the midstate over the last 16 bytes */
+	memcpy(&ctx, &blake_mid, sizeof(blake_mid));
+	blakecoin(&ctx, tail, 16);
+	blakecoin_close(&ctx, digest);
+
+	memcpy(state, digest, 32);
+}
+/* Hashes nonces from work->data[19] until max_nonce or a restart; returns true on a hit (work->data[19] holds the nonce), else 0. */
+int scanhash_blakecoin(int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done)
+{
+        uint32_t *pdata = work->data;
+        uint32_t *ptarget = work->target;
+	const uint32_t first_nonce = pdata[19];
+	uint32_t HTarget = ptarget[7];
+
+	uint32_t _ALIGN(32) hash64[8];
+	uint32_t _ALIGN(32) endiandata[20];
+
+	uint32_t n = first_nonce;
+
+	ctx_midstate_done = false; /* forces blakecoinhash to rebuild the midstate for this work */
+
+	if (opt_benchmark)
+		HTarget = 0x7f;
+
+	// we need big endian data...
+//        be32enc_array( endiandata, pdata, 19 );
+        for (int kk=0; kk < 19; kk++) 
+                be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]);
+
+
+#ifdef DEBUG_ALGO
+	applog(LOG_DEBUG,"[%d] Target=%08x %08x", thr_id, ptarget[6], ptarget[7]);
+#endif
+
+	do {
+		be32enc(&endiandata[19], n);
+		blakecoinhash(hash64, endiandata);
+#ifndef DEBUG_ALGO
+		if (hash64[7] <= HTarget && fulltest(hash64, ptarget)) {
+			*hashes_done = n - first_nonce + 1; /* pdata[19] already equals n here */
+			return true;
+		}
+#else
+		if (!(n % 0x1000) && !thr_id) printf(".");
+		if (hash64[7] == 0) {
+			printf("[%d]",thr_id);
+			if (fulltest(hash64, ptarget)) {
+				*hashes_done = n - first_nonce + 1;
+				return true;
+			}
+		}
+#endif
+		n++; pdata[19] = n; /* keeps work->data[19] in step with the next nonce to try */
+
+	} while (n < max_nonce && !work_restart[thr_id].restart);
+
+	*hashes_done = n - first_nonce + 1;
+	pdata[19] = n;
+	return 0;
+}
+/* merkle root = single SHA256 of the stratum job's coinbase, written to merkle_root (SHA256 emits 32 bytes) */
+void blakecoin_gen_merkle_root ( char* merkle_root, struct stratum_ctx* sctx )
+{
+ SHA256( sctx->job.coinbase, sctx->job.coinbase_size, (unsigned char*) merkle_root ); /* SHA256 takes size_t n and unsigned char *md */
+}
+
+// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
+int64_t blakecoin_get_max64 ()
+{
+  return 0x7ffffLL; /* installed as gate->get_max64; NOTE(review): presumably the per-scan nonce budget - confirm */
+}
+
+// vanilla leaves gen_merkle_root untouched (gate default), otherwise identical to blakecoin; always returns true
+bool register_vanilla_algo( algo_gate_t* gate )
+{
+    gate->scanhash = (void*)&scanhash_blakecoin;
+    gate->hash     = (void*)&blakecoinhash;
+    gate->hash_alt = (void*)&blakecoinhash;
+    gate->get_max64 = (void*)&blakecoin_get_max64;
+    return true;
+}
+/* blakecoin = the vanilla registration plus SHA256_gen_merkle_root as merkle root generator; always returns true */
+bool register_blakecoin_algo( algo_gate_t* gate )
+{
+  register_vanilla_algo( gate );
+  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
+  return true;
+}
+
--- a/algo/blake/decred.c
+++ b/algo/blake/decred.c
@@ -0,0 +1,248 @@
+#include "miner.h"
+#include "algo-gate-api.h"
+#include "sph_blake.h"
+
+#include <string.h>
+#include <stdint.h>
+#include <memory.h>
+
+// Two-argument minimum / maximum, defined only if not already provided.
+// Every argument use is parenthesized so that arguments containing
+// lower-precedence operators (?:, |, &, ...) expand correctly.
+// Each argument is still evaluated twice: do not pass expressions with
+// side effects.
+#ifndef min
+#define min(a,b) ((a)>(b) ? (b) : (a))
+#endif
+#ifndef max 
+#define max(a,b) ((a)<(b) ? (b) : (a))
+#endif
+
+// Positions inside work->data, counted in 32-bit words (they are used as
+// array subscripts on work->data throughout this file).
+#define DECRED_NBITS_INDEX 29
+#define DECRED_NTIME_INDEX 34
+#define DECRED_NONCE_INDEX 35
+#define DECRED_XNONCE_INDEX 36
+// Installed as gate->work_data_size and gate->work_cmp_size respectively.
+// NOTE(review): presumably byte counts (192 bytes = 48 words) -- confirm
+// against the algo gate definition.
+#define DECRED_DATA_SIZE 192
+#define DECRED_WORK_COMPARE_SIZE 140
+
+// Per-thread cache used by decred_hash(): blake_mid holds the hash state
+// after the first 128 input bytes, ctx_midstate_done tells whether that
+// state is valid. scanhash_decred() clears the flag at the start of a scan.
+static __thread sph_blake256_context blake_mid;
+static __thread bool ctx_midstate_done = false;
+
+// Hashes the first 180 bytes of input into state using the sph_blake256
+// routines. The state after the first MIDSTATE_LEN bytes is computed once per
+// thread and reused until ctx_midstate_done is cleared, so only the trailing
+// bytes are processed on subsequent calls.
+// NOTE(review): a caller passing a different header while the flag is still
+// set gets the cached prefix of the earlier header -- confirm every caller
+// resets the flag or uses decred_hash_simple().
+void decred_hash(void *state, const void *input)
+{
+        #define MIDSTATE_LEN 128
+        const uint8_t *tail = (const uint8_t*) input + MIDSTATE_LEN;
+        sph_blake256_context work_ctx;
+
+        if (!ctx_midstate_done) {
+                // absorb the prefix once and remember the resulting state
+                sph_blake256_init(&blake_mid);
+                sph_blake256(&blake_mid, input, MIDSTATE_LEN);
+                ctx_midstate_done = true;
+        }
+
+        // continue from a private copy so the cached midstate stays intact
+        work_ctx = blake_mid;
+        sph_blake256(&work_ctx, tail, (180 - MIDSTATE_LEN));
+        sph_blake256_close(&work_ctx, state);
+}
+
+// One-shot variant of decred_hash(): hashes the first 180 bytes of input into
+// state with a fresh context and never touches the cached midstate.
+void decred_hash_simple(void *state, const void *input)
+{
+        sph_blake256_context one_shot;
+
+        sph_blake256_init(&one_shot);
+        sph_blake256(&one_shot, input, 180);
+        sph_blake256_close(&one_shot, state);
+}
+
+// Scans nonces for decred work, starting at the nonce stored in
+// work->data[DCR_NONCE_OFT32].
+//   thr_id      - index into work_restart[] for this thread
+//   work        - job; data[] holds the header, target[] the share target
+//   max_nonce   - scanning stops once the nonce reaches this value
+//   hashes_done - receives the hash count reported for this call
+// Returns 1 with the winning nonce stored in work->data when a hash passes
+// fulltest(), otherwise 0 with work->data holding the next nonce to try.
+int scanhash_decred(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
+{
+        #define DCR_NONCE_OFT32 35
+
+        uint32_t *pdata = work->data;
+        uint32_t *ptarget = work->target;
+        uint32_t _ALIGN(128) hash32[8];
+        uint32_t _ALIGN(128) endiandata[48];
+
+        const uint32_t first_nonce = pdata[DCR_NONCE_OFT32];
+        uint32_t HTarget = ptarget[7];
+        uint32_t n = first_nonce;
+
+        // benchmark runs use a fixed threshold instead of the job target
+        if (opt_benchmark)
+                HTarget = 0x7f;
+
+        // new scan: make decred_hash() rebuild its cached midstate
+        ctx_midstate_done = false;
+
+        // the 180 header bytes are hashed as stored, without byte swapping
+        memcpy(endiandata, pdata, 180);
+
+#ifdef DEBUG_ALGO
+        if (!thr_id) applog(LOG_DEBUG,"[%d] Target=%08x %08x", thr_id, ptarget[6], ptarget[7]);
+#endif
+
+        for (;;) {
+                endiandata[DCR_NONCE_OFT32] = n;
+                decred_hash(hash32, endiandata);
+
+                // cheap top-word check first, full comparison only on candidates
+                if (hash32[7] <= HTarget && fulltest(hash32, ptarget)) {
+                        work_set_target_ratio(work, hash32);
+                        *hashes_done = n - first_nonce + 1;
+#ifdef DEBUG_ALGO
+                        applog(LOG_BLUE, "Nonce : %08x %08x", n, swab32(n));
+                        applog_hash(ptarget);
+                        applog_compare_hash(hash32, ptarget);
+#endif
+                        pdata[DCR_NONCE_OFT32] = n;
+                        return 1;
+                }
+
+                n++;
+                if (n >= max_nonce || work_restart[thr_id].restart)
+                        break;
+        }
+
+        *hashes_done = n - first_nonce + 1;
+        pdata[DCR_NONCE_OFT32] = n;
+        return 0;
+}
+
+// Returns the address of the nonce word (index DECRED_NONCE_INDEX) inside
+// the given work data array.
+uint32_t *decred_get_nonceptr( uint32_t *work_data )
+{
+   uint32_t *nonce_word = work_data + DECRED_NONCE_INDEX;
+
+   return nonce_word;
+}
+
+// does decred need a custom stratum_get_g_work to fix nicehash
+//  bad extranonce2 size?
+// 
+// does decred need a custom init_nonce?
+// does it need to increment nonce, seems not because gen_work_now always
+// returns true
+
+// Derives the global net_diff from the nbits word of the work header
+// (work->data[DECRED_NBITS_INDEX]): the low 24 bits are used as the divisor
+// and the low byte of the byte-swapped word as the exponent, with the result
+// scaled by 256 per exponent step relative to 29.
+// NOTE(review): a zero divisor (bits == 0) is not guarded against; the
+// floating-point division then yields a non-finite value on IEEE platforms.
+void decred_calc_network_diff( struct work* work )
+{
+   // sample for diff 43.281 : 1c05ea29
+   // todo: endian reversed on longpoll could be zr5 specific...
+   uint32_t nbits = work->data[ DECRED_NBITS_INDEX ];
+   uint32_t bits = ( nbits & 0xffffff );
+   int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
+   int m;
+   net_diff = (double)0x0000ffff / (double)bits;
+
+   for ( m = shift; m < 29; m++ )
+       net_diff *= 256.0;
+   for ( m = 29; m < shift; m++ )
+       net_diff /= 256.0;
+   if ( shift == 28 )
+       net_diff *= 256.0; // testnet
+   if ( opt_debug_diff )
+       // shift is a signed 16-bit value; convert explicitly so the argument
+       // matches the %u conversion.
+       applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", net_diff,
+               (unsigned)shift, bits);
+}
+
+// Randomizes the extranonce word of the work, copies the block height out of
+// the header (work->data[32]) and, when longpoll is not in use and the height
+// moved past *net_blocks + 1, logs the new block and updates *net_blocks.
+//   work       - work item to update (data[DECRED_XNONCE_INDEX], height)
+//   net_blocks - in/out: last known network height
+void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
+{
+   // some random extradata to make the work unique
+   // (multiply as unsigned: rand()*4 overflows a signed int when rand()
+   //  returns more than INT_MAX/4)
+   work->data[ DECRED_XNONCE_INDEX ] = (uint32_t)rand() * 4u;
+   work->height = work->data[32];
+   if (!have_longpoll && work->height > *net_blocks + 1)
+   {
+      char netinfo[64] = { 0 };
+      if (opt_showdiff && net_diff > 0.)
+      {
+         // bounded formatting: %f of a large double can exceed 64 bytes
+         if (net_diff != work->targetdiff)
+            snprintf(netinfo, sizeof(netinfo), ", diff %.3f, target %.1f",
+                     net_diff, work->targetdiff);
+         else
+            snprintf(netinfo, sizeof(netinfo), ", diff %.3f", net_diff);
+      }
+      applog(LOG_BLUE, "%s block %d%s", algo_names[opt_algo], work->height,
+                      netinfo);
+      *net_blocks = work->height - 1;
+   }
+}
+
+// Formats a "mining.submit" JSON request for the given work into req
+// (at most JSON_BUF_LEN bytes). ntime and nonce are taken from work->data,
+// big-endian encoded and hex-printed; the extranonce string is the hex of
+// sctx->xnonce1_size bytes starting at work->data[DECRED_XNONCE_INDEX].
+// NOTE(review): the result of abin2hex() is used without a NULL check --
+// confirm it cannot fail.
+void decred_be_build_stratum_request( char *req, struct work *work,
+                                      struct stratum_ctx *sctx )
+{
+   uint32_t ntime_be, nonce_be;
+   char ntime_hex[9], nonce_hex[9];
+   unsigned char *xnonce2_hex;
+
+   // ntime: encode, then hex
+   be32enc( &ntime_be, work->data[ DECRED_NTIME_INDEX ] );
+   bin2hex( ntime_hex, (char*)(&ntime_be), sizeof(uint32_t) );
+
+   // nonce: encode, then hex
+   be32enc( &nonce_be, work->data[ DECRED_NONCE_INDEX ] );
+   bin2hex( nonce_hex, (char*)(&nonce_be), sizeof(uint32_t) );
+
+   // heap-allocated hex string, released below
+   xnonce2_hex = abin2hex( (char*)( &work->data[ DECRED_XNONCE_INDEX ] ),
+                           sctx->xnonce1_size );
+
+   snprintf( req, JSON_BUF_LEN,
+        "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
+         rpc_user, work->job_id, xnonce2_hex, ntime_hex, nonce_hex );
+   free(xnonce2_hex);
+}
+
+// data shared between gen_merkle_root and build_extraheader.
+// decred_gen_merkle_root() stores the coinbase bytes that follow the first 32
+// here (at most sizeof(decred_extraheader)) and records the byte count in
+// decred_headersize; decred_build_extraheader() copies them into work->data.
+// NOTE(review): plain globals with no locking visible here -- confirm both
+// functions run on the same thread or under an external lock.
+uint32_t decred_extraheader[32] = { 0 };
+int decred_headersize = 0;
+
+// Splits the stratum coinbase for decred: the first 32 bytes are copied to
+// merkle_root, the bytes after them (clamped to the capacity of
+// decred_extraheader) are stashed in decred_extraheader and their count in
+// decred_headersize.
+//   merkle_root - destination, must hold 32 bytes
+//   sctx        - stratum context providing job.coinbase / job.coinbase_size
+// NOTE(review): the first copy still assumes the coinbase holds at least 32
+// bytes -- confirm the stratum parser guarantees that.
+void decred_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
+{
+   // getwork over stratum, getwork merkle + header passed in coinb1
+   int extra = (int)sctx->job.coinbase_size - 32;
+
+   memcpy(merkle_root, sctx->job.coinbase, 32);
+
+   // Clamp in signed arithmetic: comparing a negative length against sizeof
+   // would convert it to a huge unsigned value and select the full buffer.
+   if ( extra < 0 )
+      extra = 0;
+   else if ( extra > (int)sizeof(decred_extraheader) )
+      extra = (int)sizeof(decred_extraheader);
+
+   decred_headersize = extra;
+   if ( extra > 0 )
+      memcpy( decred_extraheader, &sctx->job.coinbase[32], (size_t)extra );
+}
+
+// Finishes a decred work header in work->data:
+//   - byte-swaps words 1..8 (prevhash) and 9..16 (merkle),
+//   - copies decred_headersize bytes of decred_extraheader to word 17 onward,
+//   - copies sctx->xnonce1 (xnonce1_size bytes) to DECRED_XNONCE_INDEX,
+//     zeroes the following words up to index 44,
+//   - puts a random value in word 37 and publishes word 32 as
+//     sctx->bloc_height.
+// NOTE(review): neither decred_headersize nor xnonce1_size is bounded against
+// the size of work->data here -- confirm both are validated by the caller.
+void decred_build_extraheader( struct work* work, struct stratum_ctx* sctx )
+{
+   uint32_t* extradata = (uint32_t*) sctx->xnonce1;
+   int i;
+   for ( i = 0; i < 8; i++ ) // prevhash
+      work->data[1 + i] = swab32( work->data[1 + i] );
+   for ( i = 0; i < 8; i++ ) // merkle
+      work->data[9 + i] = swab32( work->data[9 + i] );
+   for ( i = 0; i < decred_headersize/4; i++ ) // header
+      work->data[17 + i] = decred_extraheader[i];
+   // extradata
+   for ( i = 0; i < sctx->xnonce1_size/4; i++ )
+      work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i];
+   for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ )
+      work->data[i] = 0;
+   // Unsigned arithmetic: (rand()*4) << 8 on a signed int overflows and may
+   // shift a negative value, both undefined; the resulting bits are the same.
+   work->data[37] = ( (uint32_t)rand() * 4u ) << 8;
+   sctx->bloc_height = work->data[32];
+   //applog_hex(work->data, 180);
+   //applog_hex(&work->data[36], 36);
+}
+
+// Makes the work unique for the calling thread.
+// Returns true when stratum is active and the work's job id differs from the
+// current stratum job (the work is left untouched); otherwise bumps the first
+// extranonce word, ORs thr_id into the second one and returns false.
+bool decred_prevent_dupes( struct work* work, struct stratum_ctx* stratum,
+                           int thr_id )
+{
+   const bool stale_job = have_stratum
+                          && strcmp(stratum->job.job_id, work->job_id);
+
+   // need to regen g_work..
+   if ( stale_job )
+      return true;
+
+   // extradata: prevent duplicates
+   work->data[ DECRED_XNONCE_INDEX     ] += 1;
+   work->data[ DECRED_XNONCE_INDEX + 1 ] |= thr_id;
+   return false;
+}
+
+// Installs the decred hooks, header word indexes and work sizes into the
+// algo gate, and clears the allow_mininginfo / have_gbt globals.
+// Always returns true.
+bool register_decred_algo( algo_gate_t* gate )
+{
+  // hashing
+  gate->optimizations         = SSE2_OPT;
+  gate->scanhash              = (void*)&scanhash_decred;
+  gate->hash                  = (void*)&decred_hash;
+  gate->hash_alt              = (void*)&decred_hash;
+  gate->get_max64             = (void*)&get_max64_0x3fffffLL;
+
+  // header layout
+  gate->nbits_index           = DECRED_NBITS_INDEX;
+  gate->ntime_index           = DECRED_NTIME_INDEX;
+  gate->nonce_index           = DECRED_NONCE_INDEX;
+  gate->work_data_size        = DECRED_DATA_SIZE;
+  gate->work_cmp_size         = DECRED_WORK_COMPARE_SIZE;
+  gate->get_nonceptr          = (void*)&decred_get_nonceptr;
+
+  // work construction and submission
+  gate->gen_merkle_root       = (void*)&decred_gen_merkle_root;
+  gate->build_extraheader     = (void*)&decred_build_extraheader;
+  gate->prevent_dupes         = (void*)&decred_prevent_dupes;
+  gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
+  gate->display_extra_data    = (void*)&decred_decode_extradata;
+
+  // miner-wide switches
+  allow_mininginfo            = false;
+  have_gbt                    = false;
+  return true;
+}
+
--- a/algo/blake/mod_blakecoin.c
+++ b/algo/blake/mod_blakecoin.c
@@ -0,0 +1,531 @@
+/* $Id: blake.c 252 2011-06-07 17:55:14Z tp $ */
+/*
+ * BLAKECOIN implementation. (Stripped to 256 bits only)
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ * @author   Tanguy Pruvot (cpuminer implementation)
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#include "sph_blake.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+/*
+ * Initial chaining value: blakecoin_init() passes this table to
+ * blake32_init(), which copies the eight words into the context's H[].
+ */
+static const sph_u32 IV256[8] = {
+	SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85),
+	SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A),
+	SPH_C32(0x510E527F), SPH_C32(0x9B05688C),
+	SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19)
+};
+
+/*
+ * Permutation table. Z<r><i> is the hex digit naming which message word
+ * (M0..MF) and which constant (CS0..CSF) is used at position i of round r;
+ * the Mx() and CSx() macros paste these digits onto the "M" / "CS" prefixes.
+ * Rows 0 through 9 are defined; row 0 below is the identity order.
+ */
+#define Z00   0
+#define Z01   1
+#define Z02   2
+#define Z03   3
+#define Z04   4
+#define Z05   5
+#define Z06   6
+#define Z07   7
+#define Z08   8
+#define Z09   9
+#define Z0A   A
+#define Z0B   B
+#define Z0C   C
+#define Z0D   D
+#define Z0E   E
+#define Z0F   F
+
+#define Z10   E
+#define Z11   A
+#define Z12   4
+#define Z13   8
+#define Z14   9
+#define Z15   F
+#define Z16   D
+#define Z17   6
+#define Z18   1
+#define Z19   C
+#define Z1A   0
+#define Z1B   2
+#define Z1C   B
+#define Z1D   7
+#define Z1E   5
+#define Z1F   3
+
+#define Z20   B
+#define Z21   8
+#define Z22   C
+#define Z23   0
+#define Z24   5
+#define Z25   2
+#define Z26   F
+#define Z27   D
+#define Z28   A
+#define Z29   E
+#define Z2A   3
+#define Z2B   6
+#define Z2C   7
+#define Z2D   1
+#define Z2E   9
+#define Z2F   4
+
+#define Z30   7
+#define Z31   9
+#define Z32   3
+#define Z33   1
+#define Z34   D
+#define Z35   C
+#define Z36   B
+#define Z37   E
+#define Z38   2
+#define Z39   6
+#define Z3A   5
+#define Z3B   A
+#define Z3C   4
+#define Z3D   0
+#define Z3E   F
+#define Z3F   8
+
+#define Z40   9
+#define Z41   0
+#define Z42   5
+#define Z43   7
+#define Z44   2
+#define Z45   4
+#define Z46   A
+#define Z47   F
+#define Z48   E
+#define Z49   1
+#define Z4A   B
+#define Z4B   C
+#define Z4C   6
+#define Z4D   8
+#define Z4E   3
+#define Z4F   D
+
+#define Z50   2
+#define Z51   C
+#define Z52   6
+#define Z53   A
+#define Z54   0
+#define Z55   B
+#define Z56   8
+#define Z57   3
+#define Z58   4
+#define Z59   D
+#define Z5A   7
+#define Z5B   5
+#define Z5C   F
+#define Z5D   E
+#define Z5E   1
+#define Z5F   9
+
+#define Z60   C
+#define Z61   5
+#define Z62   1
+#define Z63   F
+#define Z64   E
+#define Z65   D
+#define Z66   4
+#define Z67   A
+#define Z68   0
+#define Z69   7
+#define Z6A   6
+#define Z6B   3
+#define Z6C   9
+#define Z6D   2
+#define Z6E   8
+#define Z6F   B
+
+#define Z70   D
+#define Z71   B
+#define Z72   7
+#define Z73   E
+#define Z74   C
+#define Z75   1
+#define Z76   3
+#define Z77   9
+#define Z78   5
+#define Z79   0
+#define Z7A   F
+#define Z7B   4
+#define Z7C   8
+#define Z7D   6
+#define Z7E   2
+#define Z7F   A
+
+#define Z80   6
+#define Z81   F
+#define Z82   E
+#define Z83   9
+#define Z84   B
+#define Z85   3
+#define Z86   0
+#define Z87   8
+#define Z88   C
+#define Z89   2
+#define Z8A   D
+#define Z8B   7
+#define Z8C   1
+#define Z8D   4
+#define Z8E   A
+#define Z8F   5
+
+#define Z90   A
+#define Z91   2
+#define Z92   8
+#define Z93   4
+#define Z94   7
+#define Z95   6
+#define Z96   1
+#define Z97   5
+#define Z98   F
+#define Z99   B
+#define Z9A   9
+#define Z9B   E
+#define Z9C   3
+#define Z9D   C
+#define Z9E   D
+#define Z9F   0
+
+/*
+ * Mx(r, i) expands to the message-word variable selected by Z<r><i>
+ * (e.g. Z10 is E, so Mx(1, 0) becomes ME). The two extra levels force the
+ * Z macro to be expanded before the final token paste.
+ */
+#define Mx(r, i)    Mx_(Z ## r ## i)
+#define Mx_(n)      Mx__(n)
+#define Mx__(n)     M ## n
+
+/* CSx(r, i): same selection as Mx(), but yields the constant CS<n>. */
+#define CSx(r, i)   CSx_(Z ## r ## i)
+#define CSx_(n)     CSx__(n)
+#define CSx__(n)    CS ## n
+
+/*
+ * Sixteen 32-bit constants. GS() XORs them with message words (selected via
+ * CSx()), and COMPRESS32 XORs CS0..CS7 into the initial V8..VF.
+ */
+#define CS0   SPH_C32(0x243F6A88)
+#define CS1   SPH_C32(0x85A308D3)
+#define CS2   SPH_C32(0x13198A2E)
+#define CS3   SPH_C32(0x03707344)
+#define CS4   SPH_C32(0xA4093822)
+#define CS5   SPH_C32(0x299F31D0)
+#define CS6   SPH_C32(0x082EFA98)
+#define CS7   SPH_C32(0xEC4E6C89)
+#define CS8   SPH_C32(0x452821E6)
+#define CS9   SPH_C32(0x38D01377)
+#define CSA   SPH_C32(0xBE5466CF)
+#define CSB   SPH_C32(0x34E90C6C)
+#define CSC   SPH_C32(0xC0AC29B7)
+#define CSD   SPH_C32(0xC97C50DD)
+#define CSE   SPH_C32(0x3F84D5B5)
+#define CSF   SPH_C32(0xB5470917)
+
+/*
+ * 64-bit counterparts of CSx()/CS0..CSF, compiled only when SPH_64 is set.
+ * Nothing else in this file (which implements the 32-bit word variant only)
+ * references CBx or CB0..CBF.
+ */
+#if SPH_64
+
+#define CBx(r, i)   CBx_(Z ## r ## i)
+#define CBx_(n)     CBx__(n)
+#define CBx__(n)    CB ## n
+
+#define CB0   SPH_C64(0x243F6A8885A308D3)
+#define CB1   SPH_C64(0x13198A2E03707344)
+#define CB2   SPH_C64(0xA4093822299F31D0)
+#define CB3   SPH_C64(0x082EFA98EC4E6C89)
+#define CB4   SPH_C64(0x452821E638D01377)
+#define CB5   SPH_C64(0xBE5466CF34E90C6C)
+#define CB6   SPH_C64(0xC0AC29B7C97C50DD)
+#define CB7   SPH_C64(0x3F84D5B5B5470917)
+#define CB8   SPH_C64(0x9216D5D98979FB1B)
+#define CB9   SPH_C64(0xD1310BA698DFB5AC)
+#define CBA   SPH_C64(0x2FFD72DBD01ADFB7)
+#define CBB   SPH_C64(0xB8E1AFED6A267E96)
+#define CBC   SPH_C64(0xBA7C9045F12C7F99)
+#define CBD   SPH_C64(0x24A19947B3916CF7)
+#define CBE   SPH_C64(0x0801F2E2858EFC16)
+#define CBF   SPH_C64(0x636920D871574E69)
+
+#endif
+
+/*
+ * Mixing step on four state words a, b, c, d (updated in place).
+ * Two half-steps: each adds b and a message word XORed with the *other*
+ * position's constant (m0 ^ c1, then m1 ^ c0) into a, then rotates d and b
+ * right by 16/12 in the first half and 8/7 in the second.
+ * Arguments are evaluated several times; pass plain variables only.
+ */
+#define GS(m0, m1, c0, c1, a, b, c, d)   do { \
+		a = SPH_T32(a + b + (m0 ^ c1)); \
+		d = SPH_ROTR32(d ^ a, 16); \
+		c = SPH_T32(c + d); \
+		b = SPH_ROTR32(b ^ c, 12); \
+		a = SPH_T32(a + b + (m1 ^ c0)); \
+		d = SPH_ROTR32(d ^ a, 8); \
+		c = SPH_T32(c + d); \
+		b = SPH_ROTR32(b ^ c, 7); \
+	} while (0)
+
+/*
+ * One full round r over the 16 working words V0..VF: four GS() calls on the
+ * columns (V0,V4,V8,VC ...), then four on the diagonals (V0,V5,VA,VF ...).
+ * Message words and constants for each call are chosen by row r of the Z
+ * table. Requires M0..MF and V0..VF to be in scope (see COMPRESS32).
+ */
+#define ROUND_S(r)   do { \
+		GS(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
+		GS(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
+		GS(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
+		GS(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
+		GS(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
+		GS(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
+		GS(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
+		GS(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
+	} while (0)
+
+/*
+ * Declares the local copies of the hash state used by READ_STATE32,
+ * WRITE_STATE32 and COMPRESS32: chaining words H0..H7, salt words S0..S3
+ * and the two counter words T0, T1.
+ */
+#define DECL_STATE32 \
+	sph_u32 H0, H1, H2, H3, H4, H5, H6, H7; \
+	sph_u32 S0, S1, S2, S3, T0, T1;
+
+/* Loads H[], S[], T0 and T1 from the context into the DECL_STATE32 locals. */
+#define READ_STATE32(state)   do { \
+		H0 = (state)->H[0]; \
+		H1 = (state)->H[1]; \
+		H2 = (state)->H[2]; \
+		H3 = (state)->H[3]; \
+		H4 = (state)->H[4]; \
+		H5 = (state)->H[5]; \
+		H6 = (state)->H[6]; \
+		H7 = (state)->H[7]; \
+		S0 = (state)->S[0]; \
+		S1 = (state)->S[1]; \
+		S2 = (state)->S[2]; \
+		S3 = (state)->S[3]; \
+		T0 = (state)->T0; \
+		T1 = (state)->T1; \
+	} while (0)
+
+/* Stores the DECL_STATE32 locals back into the context (inverse of READ_STATE32). */
+#define WRITE_STATE32(state)   do { \
+		(state)->H[0] = H0; \
+		(state)->H[1] = H1; \
+		(state)->H[2] = H2; \
+		(state)->H[3] = H3; \
+		(state)->H[4] = H4; \
+		(state)->H[5] = H5; \
+		(state)->H[6] = H6; \
+		(state)->H[7] = H7; \
+		(state)->S[0] = S0; \
+		(state)->S[1] = S1; \
+		(state)->S[2] = S2; \
+		(state)->S[3] = S3; \
+		(state)->T0 = T0; \
+		(state)->T1 = T1; \
+	} while (0)
+
+/*
+ * Number of rounds applied by COMPRESS32. With the value 8 only
+ * ROUND_S(0)..ROUND_S(7) take effect; the six additional rounds inside
+ * COMPRESS32 are guarded by "BLAKE32_ROUNDS == 14", which is false here.
+ */
+#define BLAKE32_ROUNDS 8
+
+/*
+ * Compresses one 64-byte block. Expects the DECL_STATE32 locals and a byte
+ * pointer named buf in the enclosing scope.
+ *   1. V0..V7 start from H0..H7; V8..VB from the salt XOR CS0..CS3;
+ *      VC..VF from the counter words (T0 twice, T1 twice) XOR CS4..CS7.
+ *   2. M0..MF are read from buf as sixteen big-endian 32-bit words.
+ *   3. The rounds are applied.
+ *   4. H0..H7 are updated by XORing in the salt and both halves of V.
+ */
+#define COMPRESS32   do { \
+		sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \
+		sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \
+		sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \
+		sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \
+		V0 = H0; \
+		V1 = H1; \
+		V2 = H2; \
+		V3 = H3; \
+		V4 = H4; \
+		V5 = H5; \
+		V6 = H6; \
+		V7 = H7; \
+		V8 = S0 ^ CS0; \
+		V9 = S1 ^ CS1; \
+		VA = S2 ^ CS2; \
+		VB = S3 ^ CS3; \
+		VC = T0 ^ CS4; \
+		VD = T0 ^ CS5; \
+		VE = T1 ^ CS6; \
+		VF = T1 ^ CS7; \
+		M0 = sph_dec32be_aligned(buf +  0); \
+		M1 = sph_dec32be_aligned(buf +  4); \
+		M2 = sph_dec32be_aligned(buf +  8); \
+		M3 = sph_dec32be_aligned(buf + 12); \
+		M4 = sph_dec32be_aligned(buf + 16); \
+		M5 = sph_dec32be_aligned(buf + 20); \
+		M6 = sph_dec32be_aligned(buf + 24); \
+		M7 = sph_dec32be_aligned(buf + 28); \
+		M8 = sph_dec32be_aligned(buf + 32); \
+		M9 = sph_dec32be_aligned(buf + 36); \
+		MA = sph_dec32be_aligned(buf + 40); \
+		MB = sph_dec32be_aligned(buf + 44); \
+		MC = sph_dec32be_aligned(buf + 48); \
+		MD = sph_dec32be_aligned(buf + 52); \
+		ME = sph_dec32be_aligned(buf + 56); \
+		MF = sph_dec32be_aligned(buf + 60); \
+		ROUND_S(0); \
+		ROUND_S(1); \
+		ROUND_S(2); \
+		ROUND_S(3); \
+		ROUND_S(4); \
+		ROUND_S(5); \
+		ROUND_S(6); \
+		ROUND_S(7); \
+		if (BLAKE32_ROUNDS == 14) { \
+		ROUND_S(8); \
+		ROUND_S(9); \
+		ROUND_S(0); \
+		ROUND_S(1); \
+		ROUND_S(2); \
+		ROUND_S(3); \
+		} \
+		H0 ^= S0 ^ V0 ^ V8; \
+		H1 ^= S1 ^ V1 ^ V9; \
+		H2 ^= S2 ^ V2 ^ VA; \
+		H3 ^= S3 ^ V3 ^ VB; \
+		H4 ^= S0 ^ V4 ^ VC; \
+		H5 ^= S1 ^ V5 ^ VD; \
+		H6 ^= S2 ^ V6 ^ VE; \
+		H7 ^= S3 ^ V7 ^ VF; \
+	} while (0)
+
+
+/* All-zero salt handed to blake32_init() by blakecoin_init(). */
+static const sph_u32 salt_zero_small[4] = { 0, 0, 0, 0 };
+
+/*
+ * Reset a context: load the eight chaining words from iv and the four salt
+ * words from salt, clear both counter words and empty the block buffer.
+ */
+static void
+blake32_init(sph_blake_small_context *sc,
+	const sph_u32 *iv, const sph_u32 *salt)
+{
+	size_t i;
+
+	for (i = 0; i < 8; i ++)
+		sc->H[i] = iv[i];
+	for (i = 0; i < 4; i ++)
+		sc->S[i] = salt[i];
+	sc->T0 = 0;
+	sc->T1 = 0;
+	sc->ptr = 0;
+}
+
+/*
+ * Absorb len bytes of data into the context.
+ *
+ * Input is accumulated in sc->buf; every time the buffer fills up the
+ * counter is advanced by 512 and the block is compressed. Bytes that do
+ * not complete a block stay buffered (sc->ptr records how many).
+ */
+static void
+blake32(sph_blake_small_context *sc, const void *data, size_t len)
+{
+	unsigned char *buf;
+	size_t ptr;
+	DECL_STATE32
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+	/* Fast path: the new data does not fill the buffer, just append it. */
+	if (len < (sizeof sc->buf) - ptr) {
+		memcpy(buf + ptr, data, len);
+		ptr += len;
+		sc->ptr = ptr;
+		return;
+	}
+
+	/* Work on local copies of the state; COMPRESS32 uses these names. */
+	READ_STATE32(sc);
+	while (len > 0) {
+		size_t clen;
+
+		/* Copy as much as fits into the remaining buffer space. */
+		clen = (sizeof sc->buf) - ptr;
+		if (clen > len)
+			clen = len;
+		memcpy(buf + ptr, data, clen);
+		ptr += clen;
+		data = (const unsigned char *)data + clen;
+		len -= clen;
+		if (ptr == sizeof sc->buf) {
+			/* Add 512 to T0; a result below 512 means it wrapped, so carry into T1. */
+			if ((T0 = SPH_T32(T0 + 512)) < 512)
+				T1 = SPH_T32(T1 + 1);
+			COMPRESS32;
+			ptr = 0;
+		}
+	}
+	WRITE_STATE32(sc);
+	sc->ptr = ptr;
+}
+
+/*
+ * Finish the computation: append the n extra bits from ub (0 <= n <= 7),
+ * the padding and the total bit length, run the final compression(s) via
+ * blake32(), and write the first out_size_w32 chaining words to dst in
+ * big-endian order.
+ *
+ * blake32() adds 512 to the counter for every block it compresses, so the
+ * counter is biased beforehand to compensate for the padding bits.
+ * NOTE(review): the arithmetic below relies on SPH_T32 / sph_u32 wrapping
+ * modulo 2^32 -- confirm against sph_types.h.
+ */
+static void
+blake32_close(sph_blake_small_context *sc,
+	unsigned ub, unsigned n, void *dst, size_t out_size_w32)
+{
+	union {
+		unsigned char buf[64];
+		sph_u32 dummy;
+	} u;
+	size_t ptr, k;
+	unsigned bit_len;
+	unsigned z;
+	sph_u32 th, tl;
+	unsigned char *out;
+
+	ptr = sc->ptr;
+	/* Number of message bits pending in the last, partial block. */
+	bit_len = ((unsigned)ptr << 3) + n;
+	/* Keep the top n bits of ub and set the bit right after them. */
+	z = 0x80 >> n;
+	u.buf[ptr] = ((ub & -z) | z) & 0xFF;
+	/* Total length written into the final block: low word tl, high word th. */
+	tl = sc->T0 + bit_len;
+	th = sc->T1;
+	if (ptr == 0 && n == 0) {
+		/* No pending message bits: after blake32() adds 512 both words wrap to 0. */
+		sc->T0 = SPH_C32(0xFFFFFE00);
+		sc->T1 = SPH_C32(0xFFFFFFFF);
+	} else if (sc->T0 == 0) {
+		/* Borrow from T1 so that the later +512 leaves T0 == bit_len. */
+		sc->T0 = SPH_C32(0xFFFFFE00) + bit_len;
+		sc->T1 = SPH_T32(sc->T1 - 1);
+	} else {
+		/* The later +512 then advances T0 by bit_len only. */
+		sc->T0 -= 512 - bit_len;
+	}
+	if (bit_len <= 446) {
+		/* Padding and length fit in the current block. */
+		memset(u.buf + ptr + 1, 0, 55 - ptr);
+		if (out_size_w32 == 8)
+			u.buf[55] |= 1;
+		sph_enc32be_aligned(u.buf + 56, th);
+		sph_enc32be_aligned(u.buf + 60, tl);
+		blake32(sc, u.buf + ptr, 64 - ptr);
+	} else {
+		/* Not enough room: zero-fill this block, then emit a second one. */
+		memset(u.buf + ptr + 1, 0, 63 - ptr);
+		blake32(sc, u.buf + ptr, 64 - ptr);
+		/* Second block carries no message bits (same bias as the empty case). */
+		sc->T0 = SPH_C32(0xFFFFFE00);
+		sc->T1 = SPH_C32(0xFFFFFFFF);
+		memset(u.buf, 0, 56);
+		if (out_size_w32 == 8)
+			u.buf[55] = 1;
+		sph_enc32be_aligned(u.buf + 56, th);
+		sph_enc32be_aligned(u.buf + 60, tl);
+		blake32(sc, u.buf, 64);
+	}
+	out = dst;
+	for (k = 0; k < out_size_w32; k ++)
+		sph_enc32be(out + (k << 2), sc->H[k]);
+}
+
+/* Prepare a context (cc points to a sph_blake_small_context) with IV256 and a zero salt. */
+void
+blakecoin_init(void *cc)
+{
+	sph_blake_small_context *sc = cc;
+
+	blake32_init(sc, IV256, salt_zero_small);
+}
+
+/* Feed len bytes of data into the context cc. */
+void
+blakecoin(void *cc, const void *data, size_t len)
+{
+	sph_blake_small_context *sc = cc;
+
+	blake32(sc, data, len);
+}
+
+/*
+ * Append the n extra bits of ub, write the eight-word digest to dst and
+ * re-initialize the context so it can be reused.
+ */
+static void
+blakecoin_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	sph_blake_small_context *sc = cc;
+
+	blake32_close(sc, ub, n, dst, 8);
+	blakecoin_init(sc);
+}
+
+/* Finish the hash with no extra bits; the digest goes to dst and cc is reset. */
+void
+blakecoin_close(void *cc, void *dst)
+{
+	const unsigned no_bits = 0;
+
+	blakecoin_addbits_and_close(cc, no_bits, no_bits, dst);
+}
+
+#ifdef __cplusplus
+}
+#endif
--- a/algo/blake/pentablake.c
+++ b/algo/blake/pentablake.c
@@ -0,0 +1,121 @@
+#include "miner.h"
+#include "algo-gate-api.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "sph_blake.h"
+
+//#define DEBUG_ALGO
+
+// Five chained sph_blake512 passes: the first hashes the 80-byte input, each
+// of the next four hashes the previous 64-byte digest. The first 32 bytes of
+// the final digest are copied to output.
+extern void pentablakehash(void *output, const void *input)
+{
+	// one buffer, two 64-byte halves used alternately as source/destination
+	unsigned char _ALIGN(32) hash[128];
+	// same as uint32_t hashA[16], hashB[16];
+	#define hashB hash+64
+
+	sph_blake512_context     ctx_blake;
+	unsigned char *src = hash;
+	unsigned char *dst = hashB;
+
+	// pass 1: input -> first half
+	sph_blake512_init(&ctx_blake);
+	sph_blake512(&ctx_blake, input, 80);
+	sph_blake512_close(&ctx_blake, hash);
+
+	// passes 2..5: ping-pong between the halves; the last one lands in hash
+	for (int pass = 0; pass < 4; pass++) {
+		unsigned char *swap;
+
+		sph_blake512_init(&ctx_blake);
+		sph_blake512(&ctx_blake, src, 64);
+		sph_blake512_close(&ctx_blake, dst);
+
+		swap = src;
+		src = dst;
+		dst = swap;
+	}
+
+	memcpy(output, hash, 32);
+
+}
+
+// Scans nonces for pentablake work, starting at work->data[19].
+//   thr_id      - index into work_restart[] for this thread
+//   work        - job; data[] holds the 80-byte header, target[] the target
+//   max_nonce   - scanning stops once the nonce reaches this value
+//   hashes_done - receives the number of nonces hashed by this call
+// Returns true (1) when a hash passes fulltest(), with the winning nonce left
+// in work->data[19]; otherwise 0 with work->data[19] set to the last nonce
+// tried.
+//
+// A mask derived from the top target word is used as a cheap pre-filter
+// before fulltest(). Targets above the largest htmax entry previously matched
+// no table row, so nothing was hashed at all; they now fall back to mask 0
+// (no pre-filter), the same mask as the last row.
+int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce,
+      uint64_t *hashes_done)
+{
+        uint32_t *pdata = work->data;
+        uint32_t *ptarget = work->target;
+
+	uint32_t n = pdata[19] - 1;
+	const uint32_t first_nonce = pdata[19];
+	const uint32_t Htarg = ptarget[7];
+
+	uint32_t _ALIGN(32) hash64[8];
+	uint32_t _ALIGN(32) endiandata[32];
+
+	static const uint64_t htmax[] = {
+		0,
+		0xF,
+		0xFF,
+		0xFFF,
+		0xFFFF,
+		0x10000000
+	};
+	static const uint32_t masks[] = {
+		0xFFFFFFFF,
+		0xFFFFFFF0,
+		0xFFFFFF00,
+		0xFFFFF000,
+		0xFFFF0000,
+		0
+	};
+	uint32_t mask = 0;
+
+	// we need bigendian data...
+        swab32_array( endiandata, pdata, 20 );
+
+#ifdef DEBUG_ALGO
+	if (Htarg != 0)
+		printf("[%d] Htarg=%X\n", thr_id, Htarg);
+#endif
+	// see blake.c if else to understand the loop on htmax => mask
+	for (int m=0; m < 6; m++) {
+		if (Htarg <= htmax[m]) {
+			mask = masks[m];
+			break;
+		}
+	}
+
+	do {
+		pdata[19] = ++n;
+		be32enc(&endiandata[19], n);
+		pentablakehash(hash64, endiandata);
+#ifndef DEBUG_ALGO
+		if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
+			*hashes_done = n - first_nonce + 1;
+			return true;
+		}
+#else
+		if (!(n % 0x1000) && !thr_id) printf(".");
+		if (!(hash64[7] & mask)) {
+			printf("[%d]",thr_id);
+			if (fulltest(hash64, ptarget)) {
+				*hashes_done = n - first_nonce + 1;
+				return true;
+			}
+		}
+#endif
+	} while (n < max_nonce && !work_restart[thr_id].restart);
+
+	*hashes_done = n - first_nonce + 1;
+	pdata[19] = n;
+	return 0;
+} 
+
+// Installs the pentablake scanner, hash routine and get_max64 hook into the
+// algo gate. Always returns true.
+bool register_pentablake_algo( algo_gate_t* gate )
+{
+    gate->scanhash  = (void*)&scanhash_pentablake;
+    gate->hash      = (void*)&pentablakehash;
+    gate->get_max64 = (void*)&get_max64_0x3ffff;
+    return true;
+}
+
--- a/algo/blake/sph_blake.c
+++ b/algo/blake/sph_blake.c
--- a/algo/blake/sph_blake.h
+++ b/algo/blake/sph_blake.h
@@ -0,0 +1,327 @@
+/* $Id: sph_blake.h 252 2011-06-07 17:55:14Z tp $ */
+/**
+ * BLAKE interface. BLAKE is a family of functions which differ by their
+ * output size; this implementation defines BLAKE for output sizes 224,
+ * 256, 384 and 512 bits. This implementation conforms to the "third
+ * round" specification.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_blake.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_BLAKE_H__
+#define SPH_BLAKE_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "algo/sha3/sph_types.h"
+
+/**
+ * Output size (in bits) for BLAKE-224.
+ */
+#define SPH_SIZE_blake224   224
+
+/**
+ * Output size (in bits) for BLAKE-256.
+ */
+#define SPH_SIZE_blake256   256
+
+#if SPH_64
+
+/**
+ * Output size (in bits) for BLAKE-384.
+ */
+#define SPH_SIZE_blake384   384
+
+/**
+ * Output size (in bits) for BLAKE-512.
+ */
+#define SPH_SIZE_blake512   512
+
+#endif
+
+/**
+ * This structure is a context for BLAKE-224 and BLAKE-256 computations:
+ * it contains the intermediate values and some data from the last
+ * entered block. Once a BLAKE computation has been performed, the
+ * context can be reused for another computation.
+ *
+ * The contents of this structure are private. A running BLAKE
+ * computation can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[64];    /* first field, for alignment */
+	size_t ptr;               /* number of bytes currently held in buf */
+	sph_u32 H[8];             /* chaining value */
+	sph_u32 S[4];             /* salt */
+	sph_u32 T0, T1;           /* bit counter: T0 low word, T1 high word */
+#endif
+} sph_blake_small_context;
+
+/**
+ * This structure is a context for BLAKE-224 computations. It is
+ * identical to the common <code>sph_blake_small_context</code>.
+ */
+typedef sph_blake_small_context sph_blake224_context;
+
+/**
+ * This structure is a context for BLAKE-256 computations. It is
+ * identical to the common <code>sph_blake_small_context</code>.
+ */
+typedef sph_blake_small_context sph_blake256_context;
+
+#if SPH_64
+
+/**
+ * This structure is a context for BLAKE-384 and BLAKE-512 computations:
+ * it contains the intermediate values and some data from the last
+ * entered block. Once a BLAKE computation has been performed, the
+ * context can be reused for another computation.
+ *
+ * The contents of this structure are private. A running BLAKE
+ * computation can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+/* NOTE(review): fields presumably mirror sph_blake_small_context with 64-bit
+ * words and a 128-byte block buffer -- confirm against the implementation. */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[128];    /* first field, for alignment */
+	size_t ptr;
+	sph_u64 H[8];
+	sph_u64 S[4];
+	sph_u64 T0, T1;
+#endif
+} sph_blake_big_context;
+
+/**
+ * This structure is a context for BLAKE-384 computations. It is
+ * identical to the common <code>sph_blake_big_context</code>.
+ */
+typedef sph_blake_big_context sph_blake384_context;
+
+/**
+ * This structure is a context for BLAKE-512 computations. It is
+ * identical to the common <code>sph_blake_big_context</code>.
+ */
+typedef sph_blake_big_context sph_blake512_context;
+
+#endif
+
+/**
+ * Initialize a BLAKE-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the BLAKE-224 context (pointer to a
+ *             <code>sph_blake224_context</code>)
+ */
+void sph_blake224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BLAKE-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_blake224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BLAKE-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the BLAKE-224 context
+ * @param dst   the destination buffer
+ */
+void sph_blake224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the BLAKE-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_blake224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a BLAKE-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the BLAKE-256 context (pointer to a
+ *             <code>sph_blake256_context</code>)
+ */
+void sph_blake256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BLAKE-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_blake256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BLAKE-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the BLAKE-256 context
+ * @param dst   the destination buffer
+ */
+void sph_blake256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the BLAKE-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_blake256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#if SPH_64
+
+/**
+ * Initialize a BLAKE-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the BLAKE-384 context (pointer to a
+ *             <code>sph_blake384_context</code>)
+ */
+void sph_blake384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BLAKE-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_blake384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BLAKE-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the BLAKE-384 context
+ * @param dst   the destination buffer
+ */
+void sph_blake384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the BLAKE-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_blake384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a BLAKE-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the BLAKE-512 context (pointer to a
+ *             <code>sph_blake512_context</code>)
+ */
+void sph_blake512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BLAKE-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_blake512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BLAKE-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the BLAKE-512 context
+ * @param dst   the destination buffer
+ */
+void sph_blake512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the BLAKE-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_blake512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/blake/sse2/blake.c
+++ b/algo/blake/sse2/blake.c
@@ -0,0 +1,477 @@
+/* $Id: blake.c 252 2011-06-07 17:55:14Z tp $ */
+/*
+ * BLAKE implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#include "../sph_blake.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+/*
+ * Eight 64-bit constants. BLK_I (further down) assigns the same eight
+ * values to blkH0..blkH7 as literals; nothing in the visible part of this
+ * file reads this array.
+ */
+static const sph_u64 blkIV512[8] = {
+	SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
+	SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
+	SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
+	SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
+};
+/*
+ * Permutation table. Z<r><i> is the hexadecimal digit (0-F) selected for
+ * round r (0-9) at position i (0-F). The Mx/CSx/CBx macros paste that
+ * digit onto "M", "CS" or "CB" to name a message word or a constant, so
+ * every value here must remain a single bare digit token.
+ */
+#define Z00   0
+#define Z01   1
+#define Z02   2
+#define Z03   3
+#define Z04   4
+#define Z05   5
+#define Z06   6
+#define Z07   7
+#define Z08   8
+#define Z09   9
+#define Z0A   A
+#define Z0B   B
+#define Z0C   C
+#define Z0D   D
+#define Z0E   E
+#define Z0F   F
+
+#define Z10   E
+#define Z11   A
+#define Z12   4
+#define Z13   8
+#define Z14   9
+#define Z15   F
+#define Z16   D
+#define Z17   6
+#define Z18   1
+#define Z19   C
+#define Z1A   0
+#define Z1B   2
+#define Z1C   B
+#define Z1D   7
+#define Z1E   5
+#define Z1F   3
+
+#define Z20   B
+#define Z21   8
+#define Z22   C
+#define Z23   0
+#define Z24   5
+#define Z25   2
+#define Z26   F
+#define Z27   D
+#define Z28   A
+#define Z29   E
+#define Z2A   3
+#define Z2B   6
+#define Z2C   7
+#define Z2D   1
+#define Z2E   9
+#define Z2F   4
+
+#define Z30   7
+#define Z31   9
+#define Z32   3
+#define Z33   1
+#define Z34   D
+#define Z35   C
+#define Z36   B
+#define Z37   E
+#define Z38   2
+#define Z39   6
+#define Z3A   5
+#define Z3B   A
+#define Z3C   4
+#define Z3D   0
+#define Z3E   F
+#define Z3F   8
+
+#define Z40   9
+#define Z41   0
+#define Z42   5
+#define Z43   7
+#define Z44   2
+#define Z45   4
+#define Z46   A
+#define Z47   F
+#define Z48   E
+#define Z49   1
+#define Z4A   B
+#define Z4B   C
+#define Z4C   6
+#define Z4D   8
+#define Z4E   3
+#define Z4F   D
+
+#define Z50   2
+#define Z51   C
+#define Z52   6
+#define Z53   A
+#define Z54   0
+#define Z55   B
+#define Z56   8
+#define Z57   3
+#define Z58   4
+#define Z59   D
+#define Z5A   7
+#define Z5B   5
+#define Z5C   F
+#define Z5D   E
+#define Z5E   1
+#define Z5F   9
+
+#define Z60   C
+#define Z61   5
+#define Z62   1
+#define Z63   F
+#define Z64   E
+#define Z65   D
+#define Z66   4
+#define Z67   A
+#define Z68   0
+#define Z69   7
+#define Z6A   6
+#define Z6B   3
+#define Z6C   9
+#define Z6D   2
+#define Z6E   8
+#define Z6F   B
+
+#define Z70   D
+#define Z71   B
+#define Z72   7
+#define Z73   E
+#define Z74   C
+#define Z75   1
+#define Z76   3
+#define Z77   9
+#define Z78   5
+#define Z79   0
+#define Z7A   F
+#define Z7B   4
+#define Z7C   8
+#define Z7D   6
+#define Z7E   2
+#define Z7F   A
+
+#define Z80   6
+#define Z81   F
+#define Z82   E
+#define Z83   9
+#define Z84   B
+#define Z85   3
+#define Z86   0
+#define Z87   8
+#define Z88   C
+#define Z89   2
+#define Z8A   D
+#define Z8B   7
+#define Z8C   1
+#define Z8D   4
+#define Z8E   A
+#define Z8F   5
+
+#define Z90   A
+#define Z91   2
+#define Z92   8
+#define Z93   4
+#define Z94   7
+#define Z95   6
+#define Z96   1
+#define Z97   5
+#define Z98   F
+#define Z99   B
+#define Z9A   9
+#define Z9B   E
+#define Z9C   3
+#define Z9D   C
+#define Z9E   D
+#define Z9F   0
+/*
+ * Mx(r, i) expands to the message-word variable M<n> and CSx(r, i) to the
+ * 32-bit constant CS<n>, where n is the digit Z<r><i>. The intermediate
+ * Mx_/CSx_ level makes the preprocessor expand Z<r><i> to its digit
+ * before that digit is pasted onto the prefix.
+ */
+#define Mx(r, i)    Mx_(Z ## r ## i)
+#define Mx_(n)      Mx__(n)
+#define Mx__(n)     M ## n
+
+#define CSx(r, i)   CSx_(Z ## r ## i)
+#define CSx_(n)     CSx__(n)
+#define CSx__(n)    CS ## n
+/*
+ * CS0..CSF: the sixteen 32-bit constants that CSx() selects for the
+ * 32-bit round ROUND_S. Each 64-bit constant CB<k> (k = 0..7) defined
+ * later in this file is the concatenation CS<2k> : CS<2k+1>.
+ */
+#define CS0   SPH_C32(0x243F6A88)
+#define CS1   SPH_C32(0x85A308D3)
+#define CS2   SPH_C32(0x13198A2E)
+#define CS3   SPH_C32(0x03707344)
+#define CS4   SPH_C32(0xA4093822)
+#define CS5   SPH_C32(0x299F31D0)
+#define CS6   SPH_C32(0x082EFA98)
+#define CS7   SPH_C32(0xEC4E6C89)
+#define CS8   SPH_C32(0x452821E6)
+#define CS9   SPH_C32(0x38D01377)
+#define CSA   SPH_C32(0xBE5466CF)
+#define CSB   SPH_C32(0x34E90C6C)
+#define CSC   SPH_C32(0xC0AC29B7)
+#define CSD   SPH_C32(0xC97C50DD)
+#define CSE   SPH_C32(0x3F84D5B5)
+#define CSF   SPH_C32(0xB5470917)
+
+
+/*
+ * CBx(r, i) expands to the 64-bit constant CB<n>, where n is the digit
+ * Z<r><i>. CBx_ exists so that Z<r><i> is expanded before the paste.
+ */
+#define CBx(r, i)   CBx_(Z ## r ## i)
+#define CBx_(n)     CBx__(n)
+#define CBx__(n)    CB ## n
+/*
+ * CB0..CBF: the sixteen 64-bit constants that CBx() selects for ROUND_B.
+ * COMPRESS64 additionally XORs CB0..CB7 into V8..VF when it sets up the
+ * working state.
+ */
+#define CB0   SPH_C64(0x243F6A8885A308D3)
+#define CB1   SPH_C64(0x13198A2E03707344)
+#define CB2   SPH_C64(0xA4093822299F31D0)
+#define CB3   SPH_C64(0x082EFA98EC4E6C89)
+#define CB4   SPH_C64(0x452821E638D01377)
+#define CB5   SPH_C64(0xBE5466CF34E90C6C)
+#define CB6   SPH_C64(0xC0AC29B7C97C50DD)
+#define CB7   SPH_C64(0x3F84D5B5B5470917)
+#define CB8   SPH_C64(0x9216D5D98979FB1B)
+#define CB9   SPH_C64(0xD1310BA698DFB5AC)
+#define CBA   SPH_C64(0x2FFD72DBD01ADFB7)
+#define CBB   SPH_C64(0xB8E1AFED6A267E96)
+#define CBC   SPH_C64(0xBA7C9045F12C7F99)
+#define CBD   SPH_C64(0x24A19947B3916CF7)
+#define CBE   SPH_C64(0x0801F2E2858EFC16)
+#define CBF   SPH_C64(0x636920D871574E69)
+
+/*
+ * 32-bit mixing step. Updates the four lvalues a, b, c, d in place using
+ * additions, XORs and right rotations by 16, 12, 8 and 7 bits; m0 is
+ * combined with c1 and m1 with c0. Every argument is substituted
+ * textually and evaluated more than once, so pass plain variables and
+ * constants only.
+ */
+#define GS(m0, m1, c0, c1, a, b, c, d)   do { \
+		a = SPH_T32(a + b + (m0 ^ c1)); \
+		d = SPH_ROTR32(d ^ a, 16); \
+		c = SPH_T32(c + d); \
+		b = SPH_ROTR32(b ^ c, 12); \
+		a = SPH_T32(a + b + (m1 ^ c0)); \
+		d = SPH_ROTR32(d ^ a, 8); \
+		c = SPH_T32(c + d); \
+		b = SPH_ROTR32(b ^ c, 7); \
+	} while (0)
+/*
+ * One 32-bit round for round index r (a literal digit 0-9, because it is
+ * pasted into Z<r><i>). Applies GS to the four columns (V0,V4,V8,VC) ..
+ * (V3,V7,VB,VF) and then to the four diagonals. V0..VF and M0..MF must be
+ * in scope at the expansion site. COMPRESS64 in this file uses ROUND_B,
+ * not this macro.
+ */
+#define ROUND_S(r)   do { \
+		GS(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
+		GS(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
+		GS(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
+		GS(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
+		GS(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
+		GS(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
+		GS(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
+		GS(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
+	} while (0)
+
+
+/*
+ * 64-bit mixing step used by ROUND_B. Updates the four lvalues a, b, c, d
+ * in place using additions, XORs and right rotations by 32, 25, 16 and
+ * 11 bits; m0 is combined with c1 and m1 with c0. Every argument is
+ * substituted textually and evaluated more than once, so pass plain
+ * variables and constants only.
+ */
+#define GB(m0, m1, c0, c1, a, b, c, d)   do { \
+		a = SPH_T64(a + b + (m0 ^ c1)); \
+		d = SPH_ROTR64(d ^ a, 32); \
+		c = SPH_T64(c + d); \
+		b = SPH_ROTR64(b ^ c, 25); \
+		a = SPH_T64(a + b + (m1 ^ c0)); \
+		d = SPH_ROTR64(d ^ a, 16); \
+		c = SPH_T64(c + d); \
+		b = SPH_ROTR64(b ^ c, 11); \
+	} while (0)
+/*
+ * One 64-bit round for round index r (a literal digit 0-9, because it is
+ * pasted into Z<r><i>). Applies GB to the four columns (V0,V4,V8,VC) ..
+ * (V3,V7,VB,VF) and then to the four diagonals, taking message words via
+ * Mx() and constants via CBx(). V0..VF and M0..MF must be in scope at
+ * the expansion site.
+ */
+#define ROUND_B(r)   do { \
+		GB(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
+		GB(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
+		GB(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
+		GB(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \
+		GB(Mx(r, 8), Mx(r, 9), CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); \
+		GB(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
+		GB(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
+		GB(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
+	} while (0)
+
+
+/*
+ * Compress the 128-byte block at `buf` into the chaining value
+ * blkH0..blkH7.
+ *
+ * Names taken from the expansion site: buf (block bytes, read with
+ * sph_dec64be_aligned), blkH0..blkH7, blkS0..blkS3, hashctA and hashctB.
+ * hashctA is XORed into VC and VD, hashctB into VE and VF.
+ *
+ * Sixteen rounds are executed as rounds 0-5, 6-9 and 0-5 again; the
+ * loop-with-break form keeps the macro expansion small.
+ * The previously declared but never used local `r` has been removed.
+ */
+#define COMPRESS64   do { \
+		int b = 0; /* set once the first pass through the loop is done */ \
+		sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; \
+		sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; \
+		sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; \
+		sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; \
+		V0 = blkH0, \
+		V1 = blkH1, \
+		V2 = blkH2, \
+		V3 = blkH3, \
+		V4 = blkH4, \
+		V5 = blkH5, \
+		V6 = blkH6, \
+		V7 = blkH7; \
+		V8 = blkS0 ^ CB0, \
+		V9 = blkS1 ^ CB1, \
+		VA = blkS2 ^ CB2, \
+		VB = blkS3 ^ CB3, \
+		VC = hashctA ^ CB4, \
+		VD = hashctA ^ CB5, \
+		VE = hashctB ^ CB6, \
+		VF = hashctB ^ CB7; \
+		M0 = sph_dec64be_aligned(buf +   0), \
+		M1 = sph_dec64be_aligned(buf +   8), \
+		M2 = sph_dec64be_aligned(buf +  16), \
+		M3 = sph_dec64be_aligned(buf +  24), \
+		M4 = sph_dec64be_aligned(buf +  32), \
+		M5 = sph_dec64be_aligned(buf +  40), \
+		M6 = sph_dec64be_aligned(buf +  48), \
+		M7 = sph_dec64be_aligned(buf +  56), \
+		M8 = sph_dec64be_aligned(buf +  64), \
+		M9 = sph_dec64be_aligned(buf +  72), \
+		MA = sph_dec64be_aligned(buf +  80), \
+		MB = sph_dec64be_aligned(buf +  88), \
+		MC = sph_dec64be_aligned(buf +  96), \
+		MD = sph_dec64be_aligned(buf + 104), \
+		ME = sph_dec64be_aligned(buf + 112), \
+		MF = sph_dec64be_aligned(buf + 120); \
+		/* loop once and a half */ \
+		/* save some space */ \
+		for (;;) { \
+			ROUND_B(0); \
+			ROUND_B(1); \
+			ROUND_B(2); \
+			ROUND_B(3); \
+			ROUND_B(4); \
+			ROUND_B(5); \
+			if (b)  break; \
+			b = 1; \
+			ROUND_B(6); \
+			ROUND_B(7); \
+			ROUND_B(8); \
+			ROUND_B(9); \
+		}; \
+		blkH0 ^= blkS0 ^ V0 ^ V8, \
+		blkH1 ^= blkS1 ^ V1 ^ V9, \
+		blkH2 ^= blkS2 ^ V2 ^ VA, \
+		blkH3 ^= blkS3 ^ V3 ^ VB, \
+		blkH4 ^= blkS0 ^ V4 ^ VC, \
+		blkH5 ^= blkS1 ^ V5 ^ VD, \
+		blkH6 ^= blkS2 ^ V6 ^ VE, \
+		blkH7 ^= blkS3 ^ V7 ^ VF; \
+	} while (0)
+/*
+ * Declares the chaining value blkH0..blkH7 and the four words
+ * blkS0..blkS3 that COMPRESS64, BLK_I and BLK_C operate on. Expand it
+ * where local declarations are allowed. The expansion already ends with
+ * a semicolon.
+ *
+ * The last line no longer ends with a line-continuation backslash, so
+ * the macro cannot silently absorb whatever is placed on the next line.
+ */
+#define DECL_BLK \
+	sph_u64 blkH0; \
+	sph_u64 blkH1; \
+	sph_u64 blkH2; \
+	sph_u64 blkH3; \
+	sph_u64 blkH4; \
+	sph_u64 blkH5; \
+	sph_u64 blkH6; \
+	sph_u64 blkH7; \
+	sph_u64 blkS0; \
+	sph_u64 blkS1; \
+	sph_u64 blkS2; \
+	sph_u64 blkS3;
+
+/* load initial constants
+ *
+ * Sets blkH0..blkH7 to the same eight values that blkIV512 lists, clears
+ * blkS0..blkS3 and sets hashctB to SPH_T64(0 - 1) (presumably all ones;
+ * SPH_T64 is defined outside this file). BLK_C adds 1 to hashctB before
+ * it compresses. hashctB is not declared by DECL_BLK and must exist at
+ * the expansion site.
+ */
+#define BLK_I \
+do { \
+    blkH0 = SPH_C64(0x6A09E667F3BCC908); \
+    blkH1 = SPH_C64(0xBB67AE8584CAA73B); \
+    blkH2 = SPH_C64(0x3C6EF372FE94F82B); \
+    blkH3 = SPH_C64(0xA54FF53A5F1D36F1); \
+    blkH4 = SPH_C64(0x510E527FADE682D1); \
+    blkH5 = SPH_C64(0x9B05688C2B3E6C1F); \
+    blkH6 = SPH_C64(0x1F83D9ABFB41BD6B); \
+    blkH7 = SPH_C64(0x5BE0CD19137E2179); \
+    blkS0 = 0; \
+    blkS1 = 0; \
+    blkS2 = 0; \
+    blkS3 = 0; \
+    hashctB = SPH_T64(0- 1); \
+} while (0)
+
+/* copy in 80 for initial hash
+ *
+ * Copies 80 bytes from `input` to `hashbuf` and records hashptr = 80.
+ * hashctA is preset to 0xFFFFFFFFFFFFFC00 + 640 (i.e. 640 - 1024 modulo
+ * 2^64), so the "+ 1024" applied by BLK_C leaves 640 = 80 * 8.
+ * `input`, `hashbuf`, `hashctA` and `hashptr` come from the expansion
+ * site.
+ */
+#define BLK_W \
+do { \
+    memcpy(hashbuf, input, 80); \
+    hashctA = SPH_C64(0xFFFFFFFFFFFFFC00) + 80*8; \
+    hashptr = 80; \
+} while (0)
+
+/* copy in 64 for looped hash
+ *
+ * Copies 64 bytes from `hash` to `hashbuf` and records hashptr = 64.
+ * hashctA is preset to 0xFFFFFFFFFFFFFC00 + 512 (i.e. 512 - 1024 modulo
+ * 2^64), so the "+ 1024" applied by BLK_C leaves 512 = 64 * 8.
+ * `hash`, `hashbuf`, `hashctA` and `hashptr` come from the expansion
+ * site.
+ */
+#define BLK_U \
+do { \
+    memcpy(hashbuf, hash , 64); \
+    hashctA = SPH_C64(0xFFFFFFFFFFFFFC00) + 64*8; \
+    hashptr = 64; \
+} while (0)
+
+/* blake compress function */
+/* hash = blake512(loaded)
+ *
+ * Pads and compresses the message bytes previously staged in hashbuf
+ * (hashptr of them, see BLK_W / BLK_U), then writes the result.
+ *
+ * Padding is built in a scratch buffer: 0x80 at offset hashptr, zeros up
+ * to and including offset 111, the low bit of byte 111 set, a 64-bit
+ * zero at offset 112 and the bit length (hashptr * 8) big-endian at
+ * offset 120. Bytes hashptr..127 of that buffer are copied into hashbuf.
+ * hashctA is advanced by 1024 and hashctB by 1, COMPRESS64 runs on
+ * hashbuf, and blkH0..blkH7 are stored big-endian into the 64 bytes at
+ * `hash`.
+ *
+ * The layout requires hashptr <= 111; the two staging macros in this
+ * file set 80 and 64.
+ */
+#define BLK_C \
+do { \
+    \
+    union { \
+        unsigned char buf[128]; \
+        sph_u64 dummy; \
+    } u; \
+    size_t ptr; \
+    unsigned bit_len; \
+ \
+    ptr = hashptr; \
+    bit_len = ((unsigned)ptr << 3) + 0; \
+    u.buf[ptr] = ((0 & -(0x80)) | (0x80)) & 0xFF; /* evaluates to 0x80 */ \
+    memset(u.buf + ptr + 1, 0, 111 - ptr); \
+    u.buf[111] |= 1; \
+    sph_enc64be_aligned(u.buf + 112, 0); \
+    sph_enc64be_aligned(u.buf + 120, bit_len); \
+    do { \
+    const void *data = u.buf + ptr; \
+    unsigned char *buf; \
+    buf = hashbuf; /* COMPRESS64 reads the block through `buf` */ \
+    size_t clen; \
+    clen = (sizeof(char)*128) - hashptr; \
+    memcpy(buf + hashptr, data, clen); \
+    hashctA = SPH_T64(hashctA + 1024); \
+    hashctB = SPH_T64(hashctB + 1); \
+    COMPRESS64; \
+    } while (0); \
+    /* end blake64(sc, u.buf + ptr, 128 - ptr); */ \
+    sph_enc64be((unsigned char*)(hash) + (0 << 3), blkH0), \
+    sph_enc64be((unsigned char*)(hash) + (1 << 3), blkH1); \
+    sph_enc64be((unsigned char*)(hash) + (2 << 3), blkH2), \
+    sph_enc64be((unsigned char*)(hash) + (3 << 3), blkH3); \
+    sph_enc64be((unsigned char*)(hash) + (4 << 3), blkH4), \
+    sph_enc64be((unsigned char*)(hash) + (5 << 3), blkH5); \
+    sph_enc64be((unsigned char*)(hash) + (6 << 3), blkH6), \
+    sph_enc64be((unsigned char*)(hash) + (7 << 3), blkH7); \
+} while (0) 
+
+
+#ifdef __cplusplus
+}
+#endif
--- a/algo/blake/sse2/blake/sse41/api.h
+++ b/algo/blake/sse2/blake/sse41/api.h
@@ -0,0 +1,2 @@
+#define CRYPTO_BYTES 64 /* presumably the digest size in bytes (blake512_final in hash.c writes 64) -- TODO confirm */
+
--- a/algo/blake/sse2/blake/sse41/architectures
+++ b/algo/blake/sse2/blake/sse41/architectures
@@ -0,0 +1,2 @@
+amd64
+x86
--- a/algo/blake/sse2/blake/sse41/config.h
+++ b/algo/blake/sse2/blake/sse41/config.h
@@ -0,0 +1,8 @@
+#ifndef __BLAKE512_CONFIG_H__
+#define __BLAKE512_CONFIG_H__
+/*
+ * AVOID_BRANCHING: when defined, blake512_compress in hash.c applies the
+ * counter through a computed mask instead of an `if`.
+ * HAVE_XOP (left disabled): when defined, rounds.h implements BSWAP64
+ * with _mm_perm_epi8 and does not define its own _mm_roti_epi64.
+ */
+#define AVOID_BRANCHING 1
+//#define HAVE_XOP 1
+
+#endif
+
--- a/algo/blake/sse2/blake/sse41/hash.c
+++ b/algo/blake/sse2/blake/sse41/hash.c
@@ -0,0 +1,287 @@
+
+#include "hash.h"
+/*
+#ifndef NOT_SUPERCOP
+
+#include "crypto_hash.h"
+#include "crypto_uint64.h"
+#include "crypto_uint32.h"
+#include "crypto_uint8.h"
+
+typedef crypto_uint64 u64;
+typedef crypto_uint32 u32;
+typedef crypto_uint8 u8; 
+
+#else
+
+typedef unsigned long long u64; 
+typedef unsigned int u32; 
+typedef unsigned char u8; 
+
+#endif
+*/
+#define U8TO32(p) \
+  (((u32)((p)[0]) << 24) | ((u32)((p)[1]) << 16) | \
+   ((u32)((p)[2]) <<  8) | ((u32)((p)[3])      ))
+#define U8TO64(p) \
+  (((u64)U8TO32(p) << 32) | (u64)U8TO32((p) + 4))
+#define U32TO8(p, v) \
+    (p)[0] = (u8)((v) >> 24); (p)[1] = (u8)((v) >> 16); \
+    (p)[2] = (u8)((v) >>  8); (p)[3] = (u8)((v)      ); 
+#define U64TO8(p, v) \
+    U32TO8((p),     (u32)((v) >> 32));	\
+    U32TO8((p) + 4, (u32)((v)      )); 
+/*
+typedef struct  
+{ 
+	__m128i h[4];
+  u64 s[4], t[2];
+  u32 buflen, nullt;
+  u8 buf[128];
+} state __attribute__ ((aligned (64)));
+*/
+static const u8 padding[129] = /* one 0x80 byte followed by zero bytes;
+ * blake512_final passes `padding` (starts with 0x80) or `padding + 1`
+ * (zeros only) to blake512_update */
+{ 
+	0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+};
+/*
+ * Compress one 128-byte block into state->h.
+ *
+ * state     - hashing state; h, t and nullt are read, h is updated.
+ *             state->s is not used here.
+ * datablock - 128 bytes of input, read with unaligned loads.
+ *
+ * The block is loaded into m0..m7 (two 64-bit words per register) and
+ * each 64-bit lane is byte-swapped. Rows 1-2 of the working state come
+ * from state->h, rows 3-4 from fixed constants; the counter words t[0]
+ * and t[1] are XORed into row 4 only when state->nullt is zero. After
+ * ROUND(0)..ROUND(15) the rows are folded back into state->h by XOR.
+ * ROUND is not defined in the visible part of rounds.h; presumably it
+ * consumes t0..t7 and b0..b3 together with the LOAD_MSG_* macros.
+ *
+ * Always returns 0.
+ */
+static inline int blake512_compress( hashState_blake * state, const u8 * datablock ) 
+{
+
+  __m128i row1l,row1h;
+  __m128i row2l,row2h;
+  __m128i row3l,row3h;
+  __m128i row4l,row4h;
+
+  const __m128i r16 = _mm_setr_epi8(2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9); /* shuffle mask read by _mm_roti_epi64 (rounds.h) */
+  const __m128i u8to64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); /* shuffle mask read by BSWAP64 */
+
+  __m128i m0, m1, m2, m3, m4, m5, m6, m7;
+  __m128i t0, t1, t2, t3, t4, t5, t6, t7;
+  __m128i b0, b1, b2, b3;
+
+  m0 = _mm_loadu_si128((__m128i*)(datablock +   0));
+  m1 = _mm_loadu_si128((__m128i*)(datablock +  16));
+  m2 = _mm_loadu_si128((__m128i*)(datablock +  32));
+  m3 = _mm_loadu_si128((__m128i*)(datablock +  48));
+  m4 = _mm_loadu_si128((__m128i*)(datablock +  64));
+  m5 = _mm_loadu_si128((__m128i*)(datablock +  80));
+  m6 = _mm_loadu_si128((__m128i*)(datablock +  96));
+  m7 = _mm_loadu_si128((__m128i*)(datablock + 112));
+
+  m0 = BSWAP64(m0);
+  m1 = BSWAP64(m1);
+  m2 = BSWAP64(m2);
+  m3 = BSWAP64(m3);
+  m4 = BSWAP64(m4);
+  m5 = BSWAP64(m5);
+  m6 = BSWAP64(m6);
+  m7 = BSWAP64(m7);
+
+  row1l = state->h[0];
+  row1h = state->h[1];
+  row2l = state->h[2];
+  row2h = state->h[3];
+  row3l = _mm_set_epi64x(0x13198A2E03707344ULL, 0x243F6A8885A308D3ULL);
+  row3h = _mm_set_epi64x(0x082EFA98EC4E6C89ULL, 0xA4093822299F31D0ULL);
+
+  row4l = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x452821E638D01377ULL);
+  row4h = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xC0AC29B7C97C50DDULL);
+
+#ifdef AVOID_BRANCHING
+  do
+  {
+    const __m128i mask = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_set1_epi32(state->nullt)); /* all ones when nullt == 0, else all zero */
+    const __m128i xor1 = _mm_and_si128(_mm_set1_epi64x(state->t[0]), mask);
+    const __m128i xor2 = _mm_and_si128(_mm_set1_epi64x(state->t[1]), mask);
+    row4l = _mm_xor_si128(row4l, xor1);
+    row4h = _mm_xor_si128(row4h, xor2);
+  } while(0);
+#else
+  if(!state->nullt)
+  {
+  	row4l = _mm_xor_si128(row4l, _mm_set1_epi64x(state->t[0]));
+  	row4h = _mm_xor_si128(row4h, _mm_set1_epi64x(state->t[1]));
+  }
+#endif
+
+  ROUND( 0);
+  ROUND( 1);
+  ROUND( 2);
+  ROUND( 3);
+  ROUND( 4);
+  ROUND( 5);
+  ROUND( 6);
+  ROUND( 7);
+  ROUND( 8);
+  ROUND( 9);
+  ROUND(10);
+  ROUND(11);
+  ROUND(12);
+  ROUND(13);
+  ROUND(14);
+  ROUND(15);
+
+  row1l = _mm_xor_si128(row3l,row1l);
+  row1h = _mm_xor_si128(row3h,row1h);
+
+  state->h[0] = _mm_xor_si128(row1l, state->h[0]);
+  state->h[1] = _mm_xor_si128(row1h, state->h[1]);
+
+  row2l = _mm_xor_si128(row4l,row2l);
+  row2h = _mm_xor_si128(row4h,row2h);
+
+  state->h[2] = _mm_xor_si128(row2l, state->h[2]);
+  state->h[3] = _mm_xor_si128(row2h, state->h[3]);
+  
+  return 0;
+}
+/*
+ * Reset a hashing state: zero the whole structure, load the eight
+ * initial chaining words into S->h, then store databitlen in S->buflen.
+ *
+ * NOTE(review): blake512_update treats S->buflen as the number of bits
+ * already buffered in S->buf (left = buflen >> 3). A nonzero databitlen
+ * therefore makes the first update behave as if that many bits of the
+ * zeroed buffer were pending, and the u64 value is narrowed to the u32
+ * field. Confirm what callers pass here and whether this is intended.
+ */
+static inline void blake512_init( hashState_blake * S, u64 databitlen )
+{
+  memset(S, 0, sizeof(hashState_blake));
+  S->h[0] = _mm_set_epi64x(0xBB67AE8584CAA73BULL, 0x6A09E667F3BCC908ULL);
+  S->h[1] = _mm_set_epi64x(0xA54FF53A5F1D36F1ULL, 0x3C6EF372FE94F82BULL);
+  S->h[2] = _mm_set_epi64x(0x9B05688C2B3E6C1FULL, 0x510E527FADE682D1ULL);
+  S->h[3] = _mm_set_epi64x(0x5BE0CD19137E2179ULL, 0x1F83D9ABFB41BD6BULL);
+  S->buflen = databitlen;
+}
+
+
+/*
+ * Absorb `datalen` BITS of `data` into the hashing state.
+ *
+ * S       - hashing state; buf/buflen hold a partial block between calls
+ * data    - input bytes
+ * datalen - input length in bits
+ *
+ * Any bytes already buffered are completed to a full 128-byte block and
+ * compressed first, then whole blocks are compressed straight from
+ * `data`, and the remainder is kept in S->buf. S->t[0] is advanced by
+ * 1024 for every compressed block (no carry into t[1] is performed here;
+ * blake512_final depends on the exact t[0] arithmetic).
+ *
+ * Fixes relative to the previous version:
+ *  - The "enough input to fill the buffer" test compared the byte count
+ *    masked with 0x7F, so a call carrying 128 or more bytes while data
+ *    was buffered could skip the buffered bytes and later overwrite
+ *    them. The full byte count is compared now.
+ *  - A zero-length call no longer discards buffered data (buflen used to
+ *    be reset to 0 in that case).
+ */
+static void blake512_update( hashState_blake * S, const u8 * data, u64 datalen ) 
+{
+  int left = (S->buflen >> 3);   /* bytes currently buffered */
+  int fill = 128 - left;         /* bytes needed to complete the block */
+
+  if( left && ( (datalen >> 3) >= (u64) fill ) ) {
+    memcpy( S->buf + left, data, fill );
+    S->t[0] += 1024;
+    blake512_compress( S, S->buf );
+    data += fill;
+    datalen  -= ( (u64) fill << 3 );
+    left = 0;
+  }
+
+  /* Only reached with datalen >= 1024 when nothing is buffered: if bytes
+     are still pending here, the input was shorter than `fill` (< 128). */
+  while( datalen >= 1024 ) {
+    S->t[0] += 1024;
+    blake512_compress( S, data );
+    data += 128;
+    datalen  -= 1024;
+  }
+
+  /* Fewer than 1024 bits remain; keep them (whole bytes) for later. */
+  if( datalen > 0 ) {
+    memcpy( S->buf + left, data, ( datalen >> 3 ) & 0x7F );
+  }
+  S->buflen = (left << 3) + datalen;
+}
+/*
+ * Pad the message, run the last compression(s) and write the 64-byte
+ * digest.
+ *
+ * S      - hashing state filled by blake512_update
+ * digest - receives 64 bytes (unaligned stores)
+ *
+ * msglen holds the total bit length, t[1]:t[0] plus the buffered bits,
+ * as 16 big-endian bytes. Padding: with exactly 888 buffered bits a
+ * single 0x81 byte is appended; otherwise 0x80, zeros up to 888 bits and
+ * then 0x01, preceded by an extra block of padding when more than 888
+ * bits are buffered. msglen is appended last.
+ *
+ * Each padding update is paired with a subtraction of the same number of
+ * bits from t[0] -- apparently so that the 1024 added per compressed
+ * block by blake512_update ends up counting message bits only; confirm
+ * before reordering any of these statements.
+ * nullt is set when the final block holds no message bits; it makes
+ * blake512_compress leave the counter out.
+ */
+static inline void blake512_final( hashState_blake * S, u8 * digest ) 
+{
+
+  u8 msglen[16], zo=0x01,oo=0x81;
+  u64 lo=S->t[0] + S->buflen, hi = S->t[1];
+  if ( lo < S->buflen ) hi++;
+  U64TO8(  msglen + 0, hi );
+  U64TO8(  msglen + 8, lo );
+
+  if ( S->buflen == 888 ) /* one padding byte */
+  { 
+    S->t[0] -= 8; 
+    blake512_update( S, &oo, 8 );
+  }
+  else 
+  {
+    if ( S->buflen < 888 ) /* enough space to fill the block */
+    { 
+      if ( S->buflen == 0 ) S->nullt=1;
+      S->t[0] -= 888 - S->buflen;
+      blake512_update( S, padding, 888 - S->buflen );
+    }
+    else /* NOT enough space, need 2 compressions */ 
+    { 
+      S->t[0] -= 1024 - S->buflen; 
+      blake512_update( S, padding, 1024 - S->buflen );
+      S->t[0] -= 888;
+      blake512_update( S, padding+1, 888 );
+      S->nullt = 1;
+    }
+    blake512_update( S, &zo, 8 );
+    S->t[0] -= 8;
+  }
+  S->t[0] -= 128;
+  blake512_update( S, msglen, 128 );    
+
+  do
+  {
+    const __m128i u8to64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); /* shuffle mask read by BSWAP64 */
+    _mm_storeu_si128((__m128i*)(digest +  0), BSWAP64(S->h[0]));
+    _mm_storeu_si128((__m128i*)(digest + 16), BSWAP64(S->h[1]));
+    _mm_storeu_si128((__m128i*)(digest + 32), BSWAP64(S->h[2]));
+    _mm_storeu_si128((__m128i*)(digest + 48), BSWAP64(S->h[3]));
+  } while(0);
+}
+
+/*
+int crypto_hash( unsigned char *out, const unsigned char *in, unsigned long long inlen ) 
+{
+
+  hashState_blake S;
+  blake512_init( &S );
+  blake512_update( &S, in, inlen*8 );
+  blake512_final( &S, out );
+  return 0;
+}
+*/
+/*
+#ifdef NOT_SUPERCOP
+
+int main() 
+{
+
+  int i, v;
+  u8 data[144], digest[64];
+  u8 test1[]= {0x97, 0x96, 0x15, 0x87, 0xF6, 0xD9, 0x70, 0xFA, 0xBA, 0x6D, 0x24, 0x78, 0x04, 0x5D, 0xE6, 0xD1, 
+	       0xFA, 0xBD, 0x09, 0xB6, 0x1A, 0xE5, 0x09, 0x32, 0x05, 0x4D, 0x52, 0xBC, 0x29, 0xD3, 0x1B, 0xE4, 
+	       0xFF, 0x91, 0x02, 0xB9, 0xF6, 0x9E, 0x2B, 0xBD, 0xB8, 0x3B, 0xE1, 0x3D, 0x4B, 0x9C, 0x06, 0x09, 
+	       0x1E, 0x5F, 0xA0, 0xB4, 0x8B, 0xD0, 0x81, 0xB6, 0x34, 0x05, 0x8B, 0xE0, 0xEC, 0x49, 0xBE, 0xB3};
+  u8 test2[]= {0x31, 0x37, 0x17, 0xD6, 0x08, 0xE9, 0xCF, 0x75, 0x8D, 0xCB, 0x1E, 0xB0, 0xF0, 0xC3, 0xCF, 0x9F, 
+	       0xC1, 0x50, 0xB2, 0xD5, 0x00, 0xFB, 0x33, 0xF5, 0x1C, 0x52, 0xAF, 0xC9, 0x9D, 0x35, 0x8A, 0x2F, 
+	       0x13, 0x74, 0xB8, 0xA3, 0x8B, 0xBA, 0x79, 0x74, 0xE7, 0xF6, 0xEF, 0x79, 0xCA, 0xB1, 0x6F, 0x22, 
+	       0xCE, 0x1E, 0x64, 0x9D, 0x6E, 0x01, 0xAD, 0x95, 0x89, 0xC2, 0x13, 0x04, 0x5D, 0x54, 0x5D, 0xDE};
+
+  for(i=0; i<144; ++i) data[i]=0;  
+
+  crypto_hash( digest, data, 1 );    
+  v=0;
+  for(i=0; i<64; ++i) {
+    printf("%02X", digest[i]);
+    if ( digest[i] != test1[i]) v=1;
+  }
+  if (v) printf("\nerror\n");
+  else  printf("\nok\n");
+
+  for(i=0; i<144; ++i) data[i]=0;  
+
+  crypto_hash( digest, data, 144 );    
+  v=0;
+  for(i=0; i<64; ++i) {
+    printf("%02X", digest[i]);
+    if ( digest[i] != test2[i]) v=1;
+  }
+  if (v) printf("\nerror\n");
+  else printf("\nok\n");
+
+  return 0;
+}
+
+#endif
+
+*/
+
+
--- a/algo/blake/sse2/blake/sse41/hash.h
+++ b/algo/blake/sse2/blake/sse41/hash.h
@@ -0,0 +1,74 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <x86intrin.h>
+
+#include "config.h"
+#include "rounds.h"
+/*
+#ifndef NOT_SUPERCOP
+
+#include "crypto_hash.h"
+#include "crypto_uint64.h"
+#include "crypto_uint32.h"
+#include "crypto_uint8.h"
+
+typedef crypto_uint64 u64;
+typedef crypto_uint32 u32;
+typedef crypto_uint8 u8; 
+
+#else
+*/
+typedef unsigned long long u64; /* used for the counters and bit lengths */
+typedef unsigned int u32; 
+typedef unsigned char u8; 
+/*
+ * Hashing state shared by the routines in hash.c:
+ *   h      - chaining value, eight 64-bit words held in four vectors
+ *   s      - zeroed by blake512_init and not read by the hash.c routines
+ *            in this diff (presumably a salt -- confirm)
+ *   t      - t[0] / t[1]: counter XORed into the state by
+ *            blake512_compress
+ *   buflen - number of BITS currently held in buf
+ *   nullt  - nonzero makes blake512_compress leave the counter out
+ *   buf    - one 128-byte block of pending input
+ */
+typedef struct  
+{ 
+	__m128i h[4];
+  u64 s[4], t[2];
+  u32 buflen, nullt;
+  u8 buf[128];
+} hashState_blake __attribute__ ((aligned (64)));
+/*
+#endif
+
+#define U8TO32(p) \
+  (((u32)((p)[0]) << 24) | ((u32)((p)[1]) << 16) | \
+   ((u32)((p)[2]) <<  8) | ((u32)((p)[3])      ))
+#define U8TO64(p) \
+  (((u64)U8TO32(p) << 32) | (u64)U8TO32((p) + 4))
+#define U32TO8(p, v) \
+    (p)[0] = (u8)((v) >> 24); (p)[1] = (u8)((v) >> 16); \
+    (p)[2] = (u8)((v) >>  8); (p)[3] = (u8)((v)      ); 
+#define U64TO8(p, v) \
+    U32TO8((p),     (u32)((v) >> 32));	\
+    U32TO8((p) + 4, (u32)((v)      )); 
+*/
+
+/*
+static const u8 padding[129] =
+{ 
+	0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+};
+
+*/
+static inline void blake512_init( hashState_blake * S, u64 datalen ); /* hash.c names the second parameter databitlen and stores it in S->buflen */
+
+
+static void blake512_update( hashState_blake * S, const u8 * data, u64 datalen ) ; /* datalen is a bit count: hash.c shifts it right by 3 to get bytes */
+
+static inline void blake512_final( hashState_blake * S, u8 * digest ) ; /* writes 64 bytes to digest */
+
+
+int crypto_hash( unsigned char *out, const unsigned char *in, unsigned long long inlen ) ; /* declaration only: the definition in hash.c is commented out */
+
+
+
+
+
+
--- a/algo/blake/sse2/blake/sse41/implementors
+++ b/algo/blake/sse2/blake/sse41/implementors
@@ -0,0 +1,2 @@
+Jean-Philippe Aumasson
+Samuel Neves
--- a/algo/blake/sse2/blake/sse41/rounds.h
+++ b/algo/blake/sse2/blake/sse41/rounds.h
@@ -0,0 +1,871 @@
+
+#ifndef __BLAKE512_ROUNDS_H__
+#define __BLAKE512_ROUNDS_H__
+
+/*
+ * BSWAP64(x): reverse the bytes of each 64-bit lane of x. Both variants
+ * read a shuffle mask named `u8to64` from the expansion site.
+ *
+ * _mm_roti_epi64(x, c) (only defined here when HAVE_XOP is not): rotate
+ * each 64-bit lane of x right by -(c) bits, i.e. callers pass a negative
+ * count. Counts of 32 and 16 use shuffles (the 16 case reads a mask
+ * named `r16` from the expansion site); other counts use two shifts.
+ *
+ * The expansion is now fully parenthesised and every use of `c` is
+ * wrapped in parentheses, so the macro stays correct inside a larger
+ * expression and when `c` is itself an expression.
+ */
+#ifndef HAVE_XOP
+	#define BSWAP64(x) _mm_shuffle_epi8((x), u8to64)
+
+	#define _mm_roti_epi64(x, c) \
+	((-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1))  \
+	: (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
+		: _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c)))))
+#else
+	#define BSWAP64(x) _mm_perm_epi8((x),(x),u8to64)
+#endif
+
+/*
+ * LOAD_MSG_<r>_<k>(b0, b1), k = 1..4: the four operand loads of round r.
+ * Each macro writes two 128-bit vectors, b0 and b1; each is a pair of
+ * 64-bit message words picked out of m0..m7 (two consecutive words per
+ * register, as loaded by blake512_compress) XORed with a pair of 64-bit
+ * constants. t0..t3 are scratch variables of the expansion site.
+ * Constants whose leading hex digit is 0 are written with 15 digits,
+ * e.g. 0x82EFA98EC4E6C89 is 0x082EFA98EC4E6C89.
+ *
+ * Round 0 follows.
+ */
+#define LOAD_MSG_0_1(b0, b1) \
+do \
+{ \
+t0 = _mm_unpacklo_epi64(m0, m1); \
+t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x13198A2E03707344ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpacklo_epi64(m2, m3); \
+t3 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xBE5466CF34E90C6CULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_0_2(b0, b1) \
+do \
+{ \
+t0 = _mm_unpackhi_epi64(m0, m1); \
+t1 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0x243F6A8885A308D3ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpackhi_epi64(m2, m3); \
+t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x452821E638D01377ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_0_3(b0, b1) \
+do \
+{ \
+t0 = _mm_unpacklo_epi64(m4, m5); \
+t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0xD1310BA698DFB5ACULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpacklo_epi64(m6, m7); \
+t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x24A19947B3916CF7ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_0_4(b0, b1) \
+do \
+{ \
+t0 = _mm_unpackhi_epi64(m4, m5); \
+t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x9216D5D98979FB1BULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpackhi_epi64(m6, m7); \
+t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+/*
+ * Round 1 operand loads (LOAD_MSG_1_1 .. LOAD_MSG_1_4). Each macro
+ * writes b0 and b1 as (message-word pair) XOR (constant pair), reading
+ * m0..m7 and using t0..t3 of the expansion site as scratch.
+ */
+#define LOAD_MSG_1_1(b0, b1) \
+do \
+{ \
+t0 = _mm_unpacklo_epi64(m7, m2); \
+t1 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x2FFD72DBD01ADFB7ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpackhi_epi64(m4, m6); \
+t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x636920D871574E69ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_1_2(b0, b1) \
+do \
+{ \
+t0 = _mm_unpacklo_epi64(m5, m4); \
+t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0x801F2E2858EFC16ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_alignr_epi8(m3, m7, 8); \
+t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xD1310BA698DFB5ACULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_1_3(b0, b1) \
+do \
+{ \
+t0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
+t1 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0xBA7C9045F12C7F99ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpackhi_epi64(m5, m2); \
+t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_1_4(b0, b1) \
+do \
+{ \
+t0 = _mm_unpacklo_epi64(m6, m1); \
+t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x13198A2E03707344ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpackhi_epi64(m3, m1); \
+t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xB8E1AFED6A267E96ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+/*
+ * Round 2 operand loads (LOAD_MSG_2_1 .. LOAD_MSG_2_4). Each macro
+ * writes b0 and b1 as (message-word pair) XOR (constant pair), reading
+ * m0..m7 and using t0..t3 of the expansion site as scratch.
+ */
+#define LOAD_MSG_2_1(b0, b1) \
+do \
+{ \
+t0 = _mm_alignr_epi8(m6, m5, 8); \
+t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x9216D5D98979FB1BULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpackhi_epi64(m2, m7); \
+t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xA4093822299F31D0ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_2_2(b0, b1) \
+do \
+{ \
+t0 = _mm_unpacklo_epi64(m4, m0); \
+t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0xB8E1AFED6A267E96ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_blend_epi16(m1, m6, 0xF0); \
+t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_2_3(b0, b1) \
+do \
+{ \
+t0 = _mm_blend_epi16(m5, m1, 0xF0); \
+t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x801F2E2858EFC16ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpackhi_epi64(m3, m4); \
+t3 = _mm_set_epi64x(0x452821E638D01377ULL, 0x13198A2E03707344ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_2_4(b0, b1) \
+do \
+{ \
+t0 = _mm_unpacklo_epi64(m7, m3); \
+t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x2FFD72DBD01ADFB7ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_alignr_epi8(m2, m0, 8); \
+t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x3F84D5B5B5470917ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+/*
+ * Round 3 operand loads (LOAD_MSG_3_1 .. LOAD_MSG_3_4). Each macro
+ * writes b0 and b1 as (message-word pair) XOR (constant pair), reading
+ * m0..m7 and using t0..t3 of the expansion site as scratch.
+ */
+#define LOAD_MSG_3_1(b0, b1) \
+do \
+{ \
+t0 = _mm_unpackhi_epi64(m3, m1); \
+t1 = _mm_set_epi64x(0x13198A2E03707344ULL, 0xD1310BA698DFB5ACULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpackhi_epi64(m6, m5); \
+t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_3_2(b0, b1) \
+do \
+{ \
+t0 = _mm_unpackhi_epi64(m4, m0); \
+t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpacklo_epi64(m6, m7); \
+t3 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x24A19947B3916CF7ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_3_3(b0, b1) \
+do \
+{ \
+t0 = _mm_blend_epi16(m1, m2, 0xF0); \
+t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xC0AC29B7C97C50DDULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_blend_epi16(m2, m7, 0xF0); \
+t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_3_4(b0, b1) \
+do \
+{ \
+t0 = _mm_unpacklo_epi64(m3, m5); \
+t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xA4093822299F31D0ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpacklo_epi64(m0, m4); \
+t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+/*
+ * Round 4 operand loads (LOAD_MSG_4_1 .. LOAD_MSG_4_4). Each macro
+ * writes b0 and b1 as (message-word pair) XOR (constant pair), reading
+ * m0..m7 and using t0..t3 of the expansion site as scratch.
+ */
+#define LOAD_MSG_4_1(b0, b1) \
+do \
+{ \
+t0 = _mm_unpackhi_epi64(m4, m2); \
+t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x243F6A8885A308D3ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpacklo_epi64(m1, m5); \
+t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_4_2(b0, b1) \
+do \
+{ \
+t0 = _mm_blend_epi16(m0, m3, 0xF0); \
+t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xD1310BA698DFB5ACULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_blend_epi16(m2, m7, 0xF0); \
+t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xA4093822299F31D0ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_4_3(b0, b1) \
+do \
+{ \
+t0 = _mm_blend_epi16(m7, m5, 0xF0); \
+t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0x13198A2E03707344ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_blend_epi16(m3, m1, 0xF0); \
+t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0x9216D5D98979FB1BULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_4_4(b0, b1) \
+do \
+{ \
+t0 = _mm_alignr_epi8(m6, m0, 8); \
+t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x801F2E2858EFC16ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_blend_epi16(m4, m6, 0xF0); \
+t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xC0AC29B7C97C50DDULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+/*
+ * Round 5 operand loads (LOAD_MSG_5_1 .. LOAD_MSG_5_4). Each macro
+ * writes b0 and b1 as (message-word pair) XOR (constant pair), reading
+ * m0..m7 and using t0..t3 of the expansion site as scratch.
+ */
+#define LOAD_MSG_5_1(b0, b1) \
+do \
+{ \
+t0 = _mm_unpacklo_epi64(m1, m3); \
+t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xBA7C9045F12C7F99ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpacklo_epi64(m0, m4); \
+t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xB8E1AFED6A267E96ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_5_2(b0, b1) \
+do \
+{ \
+t0 = _mm_unpacklo_epi64(m6, m5); \
+t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0xA4093822299F31D0ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpackhi_epi64(m5, m1); \
+t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_5_3(b0, b1) \
+do \
+{ \
+t0 = _mm_blend_epi16(m2, m3, 0xF0); \
+t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x24A19947B3916CF7ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpackhi_epi64(m7, m0); \
+t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x801F2E2858EFC16ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_5_4(b0, b1) \
+do \
+{ \
+t0 = _mm_unpackhi_epi64(m6, m2); \
+t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x452821E638D01377ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_blend_epi16(m7, m4, 0xF0); \
+t3 = _mm_set_epi64x(0x13198A2E03707344ULL, 0x636920D871574E69ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+/*
+ * Round 6 operand loads (LOAD_MSG_6_1 .. LOAD_MSG_6_4). Each macro
+ * writes b0 and b1 as (message-word pair) XOR (constant pair), reading
+ * m0..m7 and using t0..t3 of the expansion site as scratch.
+ */
+#define LOAD_MSG_6_1(b0, b1) \
+do \
+{ \
+t0 = _mm_blend_epi16(m6, m0, 0xF0); \
+t1 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpacklo_epi64(m7, m2); \
+t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x24A19947B3916CF7ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_6_2(b0, b1) \
+do \
+{ \
+t0 = _mm_unpackhi_epi64(m2, m7); \
+t1 = _mm_set_epi64x(0x13198A2E03707344ULL, 0xBA7C9045F12C7F99ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_alignr_epi8(m5, m6, 8); \
+t3 = _mm_set_epi64x(0x452821E638D01377ULL, 0x801F2E2858EFC16ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_6_3(b0, b1) \
+do \
+{ \
+t0 = _mm_unpacklo_epi64(m0, m3); \
+t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \
+t3 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0xA4093822299F31D0ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_6_4(b0, b1) \
+do \
+{ \
+t0 = _mm_unpackhi_epi64(m3, m1); \
+t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x243F6A8885A308D3ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_blend_epi16(m1, m5, 0xF0); \
+t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0xD1310BA698DFB5ACULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_7_1(b0, b1) \
+do \
+{ \
+t0 = _mm_unpackhi_epi64(m6, m3); \
+t1 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xB8E1AFED6A267E96ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_blend_epi16(m6, m1, 0xF0); \
+t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x13198A2E03707344ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_7_2(b0, b1) \
+do \
+{ \
+t0 = _mm_alignr_epi8(m7, m5, 8); \
+t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x24A19947B3916CF7ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpackhi_epi64(m0, m4); \
+t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xBA7C9045F12C7F99ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_7_3(b0, b1) \
+do \
+{ \
+t0 = _mm_unpackhi_epi64(m2, m7); \
+t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0x243F6A8885A308D3ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpacklo_epi64(m4, m1); \
+t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xC0AC29B7C97C50DDULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_7_4(b0, b1) \
+do \
+{ \
+t0 = _mm_unpacklo_epi64(m0, m2); \
+t1 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpacklo_epi64(m3, m5); \
+t3 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0x9216D5D98979FB1BULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_8_1(b0, b1) \
+do \
+{ \
+t0 = _mm_unpacklo_epi64(m3, m7); \
+t1 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x636920D871574E69ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_alignr_epi8(m0, m5, 8); \
+t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x82EFA98EC4E6C89ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_8_2(b0, b1) \
+do \
+{ \
+t0 = _mm_unpackhi_epi64(m7, m4); \
+t1 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xC0AC29B7C97C50DDULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_alignr_epi8(m4, m1, 8); \
+t3 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0xB8E1AFED6A267E96ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_8_3(b0, b1) \
+do \
+{ \
+t0 = m6; \
+t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xA4093822299F31D0ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_alignr_epi8(m5, m0, 8); \
+t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x452821E638D01377ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_8_4(b0, b1) \
+do \
+{ \
+t0 = _mm_blend_epi16(m1, m3, 0xF0); \
+t1 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xBA7C9045F12C7F99ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = m2; \
+t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x13198A2E03707344ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_9_1(b0, b1) \
+do \
+{ \
+t0 = _mm_unpacklo_epi64(m5, m4); \
+t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0xA4093822299F31D0ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpackhi_epi64(m3, m0); \
+t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xC0AC29B7C97C50DDULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_9_2(b0, b1) \
+do \
+{ \
+t0 = _mm_unpacklo_epi64(m1, m2); \
+t1 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x2FFD72DBD01ADFB7ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_blend_epi16(m3, m2, 0xF0); \
+t3 = _mm_set_epi64x(0x13198A2E03707344ULL, 0x3F84D5B5B5470917ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_9_3(b0, b1) \
+do \
+{ \
+t0 = _mm_unpackhi_epi64(m7, m4); \
+t1 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xB8E1AFED6A267E96ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpackhi_epi64(m1, m6); \
+t3 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0xBA7C9045F12C7F99ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_9_4(b0, b1) \
+do \
+{ \
+t0 = _mm_alignr_epi8(m7, m5, 8); \
+t1 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x636920D871574E69ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpacklo_epi64(m6, m0); \
+t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0x82EFA98EC4E6C89ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_10_1(b0, b1) \
+do \
+{ \
+t0 = _mm_unpacklo_epi64(m0, m1); \
+t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x13198A2E03707344ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpacklo_epi64(m2, m3); \
+t3 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xBE5466CF34E90C6CULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_10_2(b0, b1) \
+do \
+{ \
+t0 = _mm_unpackhi_epi64(m0, m1); \
+t1 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0x243F6A8885A308D3ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpackhi_epi64(m2, m3); \
+t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x452821E638D01377ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_10_3(b0, b1) \
+do \
+{ \
+t0 = _mm_unpacklo_epi64(m4, m5); \
+t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0xD1310BA698DFB5ACULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpacklo_epi64(m6, m7); \
+t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x24A19947B3916CF7ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_10_4(b0, b1) \
+do \
+{ \
+t0 = _mm_unpackhi_epi64(m4, m5); \
+t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x9216D5D98979FB1BULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpackhi_epi64(m6, m7); \
+t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_11_1(b0, b1) \
+do \
+{ \
+t0 = _mm_unpacklo_epi64(m7, m2); \
+t1 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x2FFD72DBD01ADFB7ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpackhi_epi64(m4, m6); \
+t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x636920D871574E69ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_11_2(b0, b1) \
+do \
+{ \
+t0 = _mm_unpacklo_epi64(m5, m4); \
+t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0x801F2E2858EFC16ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_alignr_epi8(m3, m7, 8); \
+t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xD1310BA698DFB5ACULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_11_3(b0, b1) \
+do \
+{ \
+t0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
+t1 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0xBA7C9045F12C7F99ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpackhi_epi64(m5, m2); \
+t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_11_4(b0, b1) \
+do \
+{ \
+t0 = _mm_unpacklo_epi64(m6, m1); \
+t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x13198A2E03707344ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpackhi_epi64(m3, m1); \
+t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xB8E1AFED6A267E96ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_12_1(b0, b1) \
+do \
+{ \
+t0 = _mm_alignr_epi8(m6, m5, 8); \
+t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x9216D5D98979FB1BULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpackhi_epi64(m2, m7); \
+t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xA4093822299F31D0ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_12_2(b0, b1) \
+do \
+{ \
+t0 = _mm_unpacklo_epi64(m4, m0); \
+t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0xB8E1AFED6A267E96ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_blend_epi16(m1, m6, 0xF0); \
+t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_12_3(b0, b1) \
+do \
+{ \
+t0 = _mm_blend_epi16(m5, m1, 0xF0); \
+t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x801F2E2858EFC16ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpackhi_epi64(m3, m4); \
+t3 = _mm_set_epi64x(0x452821E638D01377ULL, 0x13198A2E03707344ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_12_4(b0, b1) \
+do \
+{ \
+t0 = _mm_unpacklo_epi64(m7, m3); \
+t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x2FFD72DBD01ADFB7ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_alignr_epi8(m2, m0, 8); \
+t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x3F84D5B5B5470917ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_13_1(b0, b1) \
+do \
+{ \
+t0 = _mm_unpackhi_epi64(m3, m1); \
+t1 = _mm_set_epi64x(0x13198A2E03707344ULL, 0xD1310BA698DFB5ACULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpackhi_epi64(m6, m5); \
+t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_13_2(b0, b1) \
+do \
+{ \
+t0 = _mm_unpackhi_epi64(m4, m0); \
+t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpacklo_epi64(m6, m7); \
+t3 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x24A19947B3916CF7ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_13_3(b0, b1) \
+do \
+{ \
+t0 = _mm_blend_epi16(m1, m2, 0xF0); \
+t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xC0AC29B7C97C50DDULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_blend_epi16(m2, m7, 0xF0); \
+t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_13_4(b0, b1) \
+do \
+{ \
+t0 = _mm_unpacklo_epi64(m3, m5); \
+t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xA4093822299F31D0ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpacklo_epi64(m0, m4); \
+t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_14_1(b0, b1) \
+do \
+{ \
+t0 = _mm_unpackhi_epi64(m4, m2); \
+t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x243F6A8885A308D3ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_unpacklo_epi64(m1, m5); \
+t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_14_2(b0, b1) \
+do \
+{ \
+t0 = _mm_blend_epi16(m0, m3, 0xF0); \
+t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xD1310BA698DFB5ACULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_blend_epi16(m2, m7, 0xF0); \
+t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xA4093822299F31D0ULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_14_3(b0, b1) \
+do \
+{ \
+t0 = _mm_blend_epi16(m7, m5, 0xF0); \
+t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0x13198A2E03707344ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_blend_epi16(m3, m1, 0xF0); \
+t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0x9216D5D98979FB1BULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+#define LOAD_MSG_14_4(b0, b1) \
+do \
+{ \
+t0 = _mm_alignr_epi8(m6, m0, 8); \
+t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x801F2E2858EFC16ULL); \
+b0 = _mm_xor_si128(t0, t1); \
+t2 = _mm_blend_epi16(m4, m6, 0xF0); \
+t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xC0AC29B7C97C50DDULL); \
+b1 = _mm_xor_si128(t2, t3); \
+} while(0) 
+
+
+/* Round 15 load 1 uses the same shuffles and constants as round 5 load 1, so it
+ * aliases LOAD_MSG_5_1 instead of repeating the expansion. */
+#define LOAD_MSG_15_1(b0, b1) LOAD_MSG_5_1(b0, b1)
+
+
+/* Round 15 load 2 uses the same shuffles and constants as round 5 load 2, so it
+ * aliases LOAD_MSG_5_2 instead of repeating the expansion. */
+#define LOAD_MSG_15_2(b0, b1) LOAD_MSG_5_2(b0, b1)
+
+
+/* Round 15 load 3 uses the same shuffles and constants as round 5 load 3, so it
+ * aliases LOAD_MSG_5_3 instead of repeating the expansion. */
+#define LOAD_MSG_15_3(b0, b1) LOAD_MSG_5_3(b0, b1)
+
+
+/* Round 15 load 4 uses the same shuffles and constants as round 5 load 4, so it
+ * aliases LOAD_MSG_5_4 instead of repeating the expansion. */
+#define LOAD_MSG_15_4(b0, b1) LOAD_MSG_5_4(b0, b1)
+
+
+
+
+
+/* G1: first half of the mixing step on rows a=row1, b=row2, c=row3, d=row4; the l and h registers are processed in parallel, b0/b1 are the message inputs. */
+#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
+  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
+  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
+  /* d ^= a */ \
+  row4l = _mm_xor_si128(row4l, row1l); \
+  row4h = _mm_xor_si128(row4h, row1h); \
+  /* rotate d; count passed as -32 (_mm_roti_epi64 is defined elsewhere; presumably negative = rotate right, confirm) */ \
+  row4l = _mm_roti_epi64(row4l, -32); \
+  row4h = _mm_roti_epi64(row4h, -32); \
+  /* c += d */ \
+  row3l = _mm_add_epi64(row3l, row4l); \
+  row3h = _mm_add_epi64(row3h, row4h); \
+  /* b ^= c */ \
+  row2l = _mm_xor_si128(row2l, row3l); \
+  row2h = _mm_xor_si128(row2h, row3h); \
+  /* rotate b; count passed as -25 */ \
+  row2l = _mm_roti_epi64(row2l, -25); \
+  row2h = _mm_roti_epi64(row2h, -25); \
+
+#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
+  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
+  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
+  /* G2 is the second half of the mixing step: same sequence as G1 (a += m + b above), but rotation counts -16 and -11. d ^= a */ \
+  row4l = _mm_xor_si128(row4l, row1l); \
+  row4h = _mm_xor_si128(row4h, row1h); \
+  /* rotate d; count passed as -16 (_mm_roti_epi64 is defined elsewhere) */ \
+  row4l = _mm_roti_epi64(row4l, -16); \
+  row4h = _mm_roti_epi64(row4h, -16); \
+  /* c += d */ \
+  row3l = _mm_add_epi64(row3l, row4l); \
+  row3h = _mm_add_epi64(row3h, row4h); \
+  /* b ^= c */ \
+  row2l = _mm_xor_si128(row2l, row3l); \
+  row2h = _mm_xor_si128(row2h, row3h); \
+  /* rotate b; count passed as -11 */ \
+  row2l = _mm_roti_epi64(row2l, -11); \
+  row2h = _mm_roti_epi64(row2h, -11); \
+
+/* DIAGONALIZE: rotates row2 by one 64-bit lane, row3 by two and row4 by three across each l/h pair; row1l/row1h are accepted but unused; clobbers t0 and t1. */
+#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+	t0 = _mm_alignr_epi8(row2h, row2l, 8); \
+	t1 = _mm_alignr_epi8(row2l, row2h, 8); \
+	row2l = t0; \
+	row2h = t1; \
+	/* row3: swap the two halves */ \
+	t0 = row3l; \
+	row3l = row3h; \
+	row3h = t0;    \
+	/* row4: crossed assignment (row4l takes t1, row4h takes t0) */ \
+	t0 = _mm_alignr_epi8(row4h, row4l, 8); \
+	t1 = _mm_alignr_epi8(row4l, row4h, 8); \
+	row4l = t1; \
+	row4h = t0; 
+/* UNDIAGONALIZE: inverse of DIAGONALIZE (the alignr operands are swapped for row2 and row4); row1l/row1h are unused; clobbers t0 and t1. */
+#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+	t0 = _mm_alignr_epi8(row2l, row2h, 8); \
+	t1 = _mm_alignr_epi8(row2h, row2l, 8); \
+	row2l = t0; \
+	row2h = t1; \
+	/* row3: swap the two halves back */ \
+	t0 = row3l; \
+	row3l = row3h; \
+	row3h = t0; \
+	/* row4: crossed assignment (row4l takes t1, row4h takes t0) */ \
+	t0 = _mm_alignr_epi8(row4l, row4h, 8); \
+	t1 = _mm_alignr_epi8(row4h, row4l, 8); \
+	row4l = t1; \
+	row4h = t0; 
+/* ROUND(r): loads 1,2 with G1,G2, then DIAGONALIZE, loads 3,4 with G1,G2, then UNDIAGONALIZE. Expands to several statements with no do/while(0) wrapper, so do not use it as an unbraced if/for body. */
+#define ROUND(r) \
+  LOAD_MSG_ ##r ##_1(b0, b1); \
+  G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+  LOAD_MSG_ ##r ##_2(b0, b1); \
+  G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+  DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+  LOAD_MSG_ ##r ##_3(b0, b1); \
+  G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+  LOAD_MSG_ ##r ##_4(b0, b1); \
+  G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+  UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
+
+#endif
+
--- a/algo/bmw/.dirstamp
+++ b/algo/bmw/.dirstamp
--- a/algo/bmw/bmw256.c
+++ b/algo/bmw/bmw256.c
@@ -0,0 +1,66 @@
+#include "miner.h"
+#include "algo-gate-api.h"
+
+#include <string.h>
+#include <stdint.h>
+
+#include "sph_bmw.h"
+
+void bmwhash(void *output, const void *input)
+{
+/*
+	NOTE: the whole implementation is commented out, so this function is
+	currently a no-op: it does not read input and leaves output unmodified.
+	The disabled code hashes 80 bytes of input with sph_bmw256 and copies
+	the first 32 bytes of the digest to output.
+
+ 	uint32_t hash[16];
+	sph_bmw256_context ctx;
+
+	sph_bmw256_init(&ctx);
+	sph_bmw256(&ctx, input, 80);
+	sph_bmw256_close(&ctx, hash);
+
+	memcpy(output, hash, 32);
+*/
+}
+
+/*
+ * Scan nonces starting at work->data[19] for a hash that satisfies
+ * work->target.
+ *
+ * thr_id      index into work_restart[]; the scan stops when that entry's
+ *             restart flag is set.
+ * work        work item; data[0..18] are hashed as-is, data[19] is the nonce
+ *             and is updated on return.
+ * max_nonce   exclusive upper bound for the scan.
+ * hashes_done receives the number of nonces actually tested.
+ *
+ * Returns 1 when a matching nonce was found (stored in data[19]), 0 otherwise
+ * (data[19] then holds the next untested nonce).
+ */
+int scanhash_bmw(int thr_id, struct work *work,
+	uint32_t max_nonce, uint64_t *hashes_done)
+{
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+
+	uint32_t _ALIGN(64) hash64[8];
+	uint32_t _ALIGN(64) endiandata[20];
+
+	const uint32_t Htarg = ptarget[7];
+	const uint32_t first_nonce = pdata[19];
+	uint32_t n = first_nonce;
+
+	/* Words 0..18 do not depend on the nonce: convert them once. */
+	for (int k = 0; k < 19; k++)
+		be32enc(&endiandata[k], pdata[k]);
+
+	do {
+		be32enc(&endiandata[19], n);
+		bmwhash(hash64, endiandata);
+		if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
+			/* Nonces first_nonce..n inclusive have been tested. */
+			*hashes_done = n - first_nonce + 1;
+			pdata[19] = n;
+			return 1;
+		}
+		n++;
+	} while (n < max_nonce && !work_restart[thr_id].restart);
+
+	/* n already points past the last tested nonce here, so the count is
+	 * n - first_nonce; adding 1 would over-report by one hash. */
+	*hashes_done = n - first_nonce;
+	pdata[19] = n;
+
+	return 0;
+}
+
+/*
+ * Registration hook for bmw256. The algorithm is currently disabled: the hook
+ * calls algo_not_implemented() and returns false without touching gate.
+ *
+ * To enable it, set gate->scanhash = (void*)&scanhash_bmw and
+ * gate->hash = (void*)&bmwhash here and return true.
+ */
+bool register_bmw256_algo( algo_gate_t* gate )
+{
+    (void)gate;   /* unused while the algorithm is disabled */
+    algo_not_implemented();
+    return false;
+}
+
--- a/algo/bmw/sph_bmw.c
+++ b/algo/bmw/sph_bmw.c
@@ -0,0 +1,965 @@
+/* $Id: bmw.c 227 2010-06-16 17:28:38Z tp $ */
+/*
+ * BMW implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include "sph_bmw.h"
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BMW
+#define SPH_SMALL_FOOTPRINT_BMW   1
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+/* IV224: sixteen 32-bit words holding the byte values 0x00..0x3F, four per word, most significant byte first (presumably the 224-bit variant's initial state). */
+static const sph_u32 IV224[] = {
+	SPH_C32(0x00010203), SPH_C32(0x04050607),
+	SPH_C32(0x08090A0B), SPH_C32(0x0C0D0E0F),
+	SPH_C32(0x10111213), SPH_C32(0x14151617),
+	SPH_C32(0x18191A1B), SPH_C32(0x1C1D1E1F),
+	SPH_C32(0x20212223), SPH_C32(0x24252627),
+	SPH_C32(0x28292A2B), SPH_C32(0x2C2D2E2F),
+	SPH_C32(0x30313233), SPH_C32(0x34353637),
+	SPH_C32(0x38393A3B), SPH_C32(0x3C3D3E3F)
+};
+/* IV256: sixteen 32-bit words holding the byte values 0x40..0x7F, four per word, most significant byte first (presumably the 256-bit variant's initial state). */
+static const sph_u32 IV256[] = {
+	SPH_C32(0x40414243), SPH_C32(0x44454647),
+	SPH_C32(0x48494A4B), SPH_C32(0x4C4D4E4F),
+	SPH_C32(0x50515253), SPH_C32(0x54555657),
+	SPH_C32(0x58595A5B), SPH_C32(0x5C5D5E5F),
+	SPH_C32(0x60616263), SPH_C32(0x64656667),
+	SPH_C32(0x68696A6B), SPH_C32(0x6C6D6E6F),
+	SPH_C32(0x70717273), SPH_C32(0x74757677),
+	SPH_C32(0x78797A7B), SPH_C32(0x7C7D7E7F)
+};
+
+#if SPH_64
+/* IV384: sixteen 64-bit words holding the byte values 0x00..0x7F, eight per word, most significant byte first; only compiled when SPH_64 is set. */
+static const sph_u64 IV384[] = {
+	SPH_C64(0x0001020304050607), SPH_C64(0x08090A0B0C0D0E0F),
+	SPH_C64(0x1011121314151617), SPH_C64(0x18191A1B1C1D1E1F),
+	SPH_C64(0x2021222324252627), SPH_C64(0x28292A2B2C2D2E2F),
+	SPH_C64(0x3031323334353637), SPH_C64(0x38393A3B3C3D3E3F),
+	SPH_C64(0x4041424344454647), SPH_C64(0x48494A4B4C4D4E4F),
+	SPH_C64(0x5051525354555657), SPH_C64(0x58595A5B5C5D5E5F),
+	SPH_C64(0x6061626364656667), SPH_C64(0x68696A6B6C6D6E6F),
+	SPH_C64(0x7071727374757677), SPH_C64(0x78797A7B7C7D7E7F)
+};
+/* IV512: sixteen 64-bit words holding the byte values 0x80..0xFF, eight per word, most significant byte first; only compiled when SPH_64 is set. */
+static const sph_u64 IV512[] = {
+	SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F),
+	SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F),
+	SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF),
+	SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF),
+	SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF),
+	SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF),
+	SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF),
+	SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF)
+};
+
+#endif
+/* XCAT(x, y): token-paste x and y after both have been macro-expanded (two-level ## idiom via XCAT_). */
+#define XCAT(x, y)    XCAT_(x, y)
+#define XCAT_(x, y)   x ## y
+/* LPAR: a deferred '(' so the expandNs_ and expandNb_ helpers can splice the I16_xx and M16_xx lists into separate arguments before the _inner macro is invoked. */
+#define LPAR   (
+/* I16_n: the sixteen consecutive indices n-16 .. n-1; they become the i0..i15 arguments of the expand inner macros. */
+#define I16_16    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
+#define I16_17    1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16
+#define I16_18    2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17
+#define I16_19    3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18
+#define I16_20    4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19
+#define I16_21    5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20
+#define I16_22    6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+#define I16_23    7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22
+#define I16_24    8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23
+#define I16_25    9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24
+#define I16_26   10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
+#define I16_27   11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26
+#define I16_28   12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
+#define I16_29   13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28
+#define I16_30   14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29
+#define I16_31   15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
+/* M16_n with j = n-16: j, j+1, (j+3)&15, ((j+3)&15)+1, (j+7)&15, (j+10)&15, ((j+10)&15)+1. In add_elt_s/add_elt_b the 1st, 3rd and 6th are word indices, the 2nd, 4th and 7th their rotation counts, the 5th the hf index. */
+#define M16_16    0,  1,  3,  4,  7, 10, 11
+#define M16_17    1,  2,  4,  5,  8, 11, 12
+#define M16_18    2,  3,  5,  6,  9, 12, 13
+#define M16_19    3,  4,  6,  7, 10, 13, 14
+#define M16_20    4,  5,  7,  8, 11, 14, 15
+#define M16_21    5,  6,  8,  9, 12, 15, 16
+#define M16_22    6,  7,  9, 10, 13,  0,  1
+#define M16_23    7,  8, 10, 11, 14,  1,  2
+#define M16_24    8,  9, 11, 12, 15,  2,  3
+#define M16_25    9, 10, 12, 13,  0,  3,  4
+#define M16_26   10, 11, 13, 14,  1,  4,  5
+#define M16_27   11, 12, 14, 15,  2,  5,  6
+#define M16_28   12, 13, 15, 16,  3,  6,  7
+#define M16_29   13, 14,  0,  1,  4,  7,  8
+#define M16_30   14, 15,  1,  2,  5,  8,  9
+#define M16_31   15, 16,  2,  3,  6,  9, 10
+/* 32-bit word functions: ss0..ss3 XOR a right shift, a left shift (wrapped in SPH_T32) and two left rotations of x; ss4/ss5 are x ^ (x >> 1) and x ^ (x >> 2); rs1..rs7 are plain left rotations. The argument x is evaluated several times. */
+#define ss0(x)    (((x) >> 1) ^ SPH_T32((x) << 3) \
+                  ^ SPH_ROTL32(x,  4) ^ SPH_ROTL32(x, 19))
+#define ss1(x)    (((x) >> 1) ^ SPH_T32((x) << 2) \
+                  ^ SPH_ROTL32(x,  8) ^ SPH_ROTL32(x, 23))
+#define ss2(x)    (((x) >> 2) ^ SPH_T32((x) << 1) \
+                  ^ SPH_ROTL32(x, 12) ^ SPH_ROTL32(x, 25))
+#define ss3(x)    (((x) >> 2) ^ SPH_T32((x) << 2) \
+                  ^ SPH_ROTL32(x, 15) ^ SPH_ROTL32(x, 29))
+#define ss4(x)    (((x) >> 1) ^ (x))
+#define ss5(x)    (((x) >> 2) ^ (x))
+#define rs1(x)    SPH_ROTL32(x,  3)
+#define rs2(x)    SPH_ROTL32(x,  7)
+#define rs3(x)    SPH_ROTL32(x, 13)
+#define rs4(x)    SPH_ROTL32(x, 16)
+#define rs5(x)    SPH_ROTL32(x, 19)
+#define rs6(x)    SPH_ROTL32(x, 23)
+#define rs7(x)    SPH_ROTL32(x, 27)
+/* Ks(j): j multiplied by 0x05555555, passed through SPH_T32 (presumably truncation to 32 bits; defined in the sph headers). */
+#define Ks(j)   SPH_T32((sph_u32)(j) * SPH_C32(0x05555555))
+/* add_elt_s: (rotl(mf(j0m), j1m) + rotl(mf(j3m), j4m) - rotl(mf(j10m), j11m) + Ks(j16)) ^ hf(j7m); the j arguments come from an M16_xx list. */
+#define add_elt_s(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \
+	(SPH_T32(SPH_ROTL32(mf(j0m), j1m) + SPH_ROTL32(mf(j3m), j4m) \
+		- SPH_ROTL32(mf(j10m), j11m) + Ks(j16)) ^ hf(j7m))
+/* expand1s_inner: sum of ss1, ss2, ss3, ss0 (repeating in that order) applied to qf(i0)..qf(i15), plus add_elt_s. */
+#define expand1s_inner(qf, mf, hf, i16, \
+		i0, i1, i2, i3, i4, i5, i6, i7, i8, \
+		i9, i10, i11, i12, i13, i14, i15, \
+		i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
+	SPH_T32(ss1(qf(i0)) + ss2(qf(i1)) + ss3(qf(i2)) + ss0(qf(i3)) \
+		+ ss1(qf(i4)) + ss2(qf(i5)) + ss3(qf(i6)) + ss0(qf(i7)) \
+		+ ss1(qf(i8)) + ss2(qf(i9)) + ss3(qf(i10)) + ss0(qf(i11)) \
+		+ ss1(qf(i12)) + ss2(qf(i13)) + ss3(qf(i14)) + ss0(qf(i15)) \
+		+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
+/* expand1s(qf, mf, hf, i16): i16 must be a literal 16..31; it selects the matching I16_ and M16_ lists, which expand1s_ feeds to expand1s_inner through LPAR. */
+#define expand1s(qf, mf, hf, i16) \
+	expand1s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
+#define expand1s_(qf, mf, hf, i16, ix, iy) \
+	expand1s_inner LPAR qf, mf, hf, i16, ix, iy)
+/* expand2s_inner: qf(i0), qf(i2), .. qf(i12) added as-is; qf(i1), qf(i3), .. qf(i13) through rs1..rs7; qf(i14) through ss4 and qf(i15) through ss5; plus add_elt_s. */
+#define expand2s_inner(qf, mf, hf, i16, \
+		i0, i1, i2, i3, i4, i5, i6, i7, i8, \
+		i9, i10, i11, i12, i13, i14, i15, \
+		i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
+	SPH_T32(qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \
+		+ qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) \
+		+ qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \
+		+ qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \
+		+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
+/* expand2s(qf, mf, hf, i16): same dispatch as expand1s, targeting expand2s_inner. */
+#define expand2s(qf, mf, hf, i16) \
+	expand2s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
+#define expand2s_(qf, mf, hf, i16, ix, iy) \
+	expand2s_inner LPAR qf, mf, hf, i16, ix, iy)
+
+#if SPH_64
+/* 64-bit counterparts of ss0..ss5 and rs1..rs7: same structure, with SPH_T64/SPH_ROTL64 and different rotation counts. The argument x is evaluated several times. */
+#define sb0(x)    (((x) >> 1) ^ SPH_T64((x) << 3) \
+                  ^ SPH_ROTL64(x,  4) ^ SPH_ROTL64(x, 37))
+#define sb1(x)    (((x) >> 1) ^ SPH_T64((x) << 2) \
+                  ^ SPH_ROTL64(x, 13) ^ SPH_ROTL64(x, 43))
+#define sb2(x)    (((x) >> 2) ^ SPH_T64((x) << 1) \
+                  ^ SPH_ROTL64(x, 19) ^ SPH_ROTL64(x, 53))
+#define sb3(x)    (((x) >> 2) ^ SPH_T64((x) << 2) \
+                  ^ SPH_ROTL64(x, 28) ^ SPH_ROTL64(x, 59))
+#define sb4(x)    (((x) >> 1) ^ (x))
+#define sb5(x)    (((x) >> 2) ^ (x))
+#define rb1(x)    SPH_ROTL64(x,  5)
+#define rb2(x)    SPH_ROTL64(x, 11)
+#define rb3(x)    SPH_ROTL64(x, 27)
+#define rb4(x)    SPH_ROTL64(x, 32)
+#define rb5(x)    SPH_ROTL64(x, 37)
+#define rb6(x)    SPH_ROTL64(x, 43)
+#define rb7(x)    SPH_ROTL64(x, 53)
+/* Kb(j): j multiplied by 0x0555555555555555, passed through SPH_T64 (presumably truncation to 64 bits; defined in the sph headers). */
+#define Kb(j)   SPH_T64((sph_u64)(j) * SPH_C64(0x0555555555555555))
+
+#if SPH_SMALL_FOOTPRINT_BMW
+/* Small-footprint variant: Kb(16)..Kb(31) stored in a table so add_elt_b can take a run-time index j = i - 16. */
+static const sph_u64 Kb_tab[] = {
+	Kb(16), Kb(17), Kb(18), Kb(19), Kb(20), Kb(21), Kb(22), Kb(23),
+	Kb(24), Kb(25), Kb(26), Kb(27), Kb(28), Kb(29), Kb(30), Kb(31)
+};
+/* rol_off: word ((j + off) & 15) of mf, rotated left by that index plus one. */
+#define rol_off(mf, j, off) \
+	SPH_ROTL64(mf(((j) + (off)) & 15), (((j) + (off)) & 15) + 1)
+/* add_elt_b (indexed form): offsets 0, 3 and 10 for the rotated words, Kb_tab[j] for the constant, hf((j + 7) & 15) for the XOR. */
+#define add_elt_b(mf, hf, j) \
+	(SPH_T64(rol_off(mf, j, 0) + rol_off(mf, j, 3) \
+		- rol_off(mf, j, 10) + Kb_tab[j]) ^ hf(((j) + 7) & 15))
+/* expand1b (indexed form): sb1, sb2, sb3, sb0 repeating over qf(i-16)..qf(i-1), plus add_elt_b at i-16; i is an expression, not a pasted literal. */
+#define expand1b(qf, mf, hf, i) \
+	SPH_T64(sb1(qf((i) - 16)) + sb2(qf((i) - 15)) \
+		+ sb3(qf((i) - 14)) + sb0(qf((i) - 13)) \
+		+ sb1(qf((i) - 12)) + sb2(qf((i) - 11)) \
+		+ sb3(qf((i) - 10)) + sb0(qf((i) - 9)) \
+		+ sb1(qf((i) - 8)) + sb2(qf((i) - 7)) \
+		+ sb3(qf((i) - 6)) + sb0(qf((i) - 5)) \
+		+ sb1(qf((i) - 4)) + sb2(qf((i) - 3)) \
+		+ sb3(qf((i) - 2)) + sb0(qf((i) - 1)) \
+		+ add_elt_b(mf, hf, (i) - 16))
+/* expand2b (indexed form): alternating plain and rb1..rb7 terms, then sb4 and sb5 on the last two, plus add_elt_b at i-16. */
+#define expand2b(qf, mf, hf, i) \
+	SPH_T64(qf((i) - 16) + rb1(qf((i) - 15)) \
+		+ qf((i) - 14) + rb2(qf((i) - 13)) \
+		+ qf((i) - 12) + rb3(qf((i) - 11)) \
+		+ qf((i) - 10) + rb4(qf((i) - 9)) \
+		+ qf((i) - 8) + rb5(qf((i) - 7)) \
+		+ qf((i) - 6) + rb6(qf((i) - 5)) \
+		+ qf((i) - 4) + rb7(qf((i) - 3)) \
+		+ sb4(qf((i) - 2)) + sb5(qf((i) - 1)) \
+		+ add_elt_b(mf, hf, (i) - 16))
+
+#else
+/* Unrolled variant. add_elt_b: (rotl(mf(j0m), j1m) + rotl(mf(j3m), j4m) - rotl(mf(j10m), j11m) + Kb(j16)) ^ hf(j7m); the j arguments come from an M16_xx list. */
+#define add_elt_b(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \
+	(SPH_T64(SPH_ROTL64(mf(j0m), j1m) + SPH_ROTL64(mf(j3m), j4m) \
+		- SPH_ROTL64(mf(j10m), j11m) + Kb(j16)) ^ hf(j7m))
+/* expand1b_inner: sum of sb1, sb2, sb3, sb0 (repeating in that order) applied to qf(i0)..qf(i15), plus add_elt_b. */
+#define expand1b_inner(qf, mf, hf, i16, \
+		i0, i1, i2, i3, i4, i5, i6, i7, i8, \
+		i9, i10, i11, i12, i13, i14, i15, \
+		i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
+	SPH_T64(sb1(qf(i0)) + sb2(qf(i1)) + sb3(qf(i2)) + sb0(qf(i3)) \
+		+ sb1(qf(i4)) + sb2(qf(i5)) + sb3(qf(i6)) + sb0(qf(i7)) \
+		+ sb1(qf(i8)) + sb2(qf(i9)) + sb3(qf(i10)) + sb0(qf(i11)) \
+		+ sb1(qf(i12)) + sb2(qf(i13)) + sb3(qf(i14)) + sb0(qf(i15)) \
+		+ add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
+/* expand1b(qf, mf, hf, i16): i16 must be a literal 16..31; it selects the matching I16_ and M16_ lists, which expand1b_ feeds to expand1b_inner through LPAR. */
+#define expand1b(qf, mf, hf, i16) \
+	expand1b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
+#define expand1b_(qf, mf, hf, i16, ix, iy) \
+	expand1b_inner LPAR qf, mf, hf, i16, ix, iy)
+/* expand2b_inner: qf(i0), qf(i2), .. qf(i12) added as-is; qf(i1), qf(i3), .. qf(i13) through rb1..rb7; qf(i14) through sb4 and qf(i15) through sb5; plus add_elt_b. */
+#define expand2b_inner(qf, mf, hf, i16, \
+		i0, i1, i2, i3, i4, i5, i6, i7, i8, \
+		i9, i10, i11, i12, i13, i14, i15, \
+		i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
+	SPH_T64(qf(i0) + rb1(qf(i1)) + qf(i2) + rb2(qf(i3)) \
+		+ qf(i4) + rb3(qf(i5)) + qf(i6) + rb4(qf(i7)) \
+		+ qf(i8) + rb5(qf(i9)) + qf(i10) + rb6(qf(i11)) \
+		+ qf(i12) + rb7(qf(i13)) + sb4(qf(i14)) + sb5(qf(i15)) \
+		+ add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
+/* expand2b(qf, mf, hf, i16): same dispatch as expand1b, targeting expand2b_inner. */
+#define expand2b(qf, mf, hf, i16) \
+	expand2b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
+#define expand2b_(qf, mf, hf, i16, ix, iy) \
+	expand2b_inner LPAR qf, mf, hf, i16, ix, iy)
+
+#endif
+
+#endif
+/* MAKE_W: combines five (M(i) ^ H(i)) terms with the given +/- operators and wraps the result in tt; M() and H() must be defined where the macro is expanded. */
+#define MAKE_W(tt, i0, op01, i1, op12, i2, op23, i3, op34, i4) \
+	tt((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \
+	op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4)))
+/* Ws0..Ws15: the sixteen 32-bit W expressions (wrapped in SPH_T32), each a fixed choice of five indices and signs. */
+#define Ws0    MAKE_W(SPH_T32,  5, -,  7, +, 10, +, 13, +, 14)
+#define Ws1    MAKE_W(SPH_T32,  6, -,  8, +, 11, +, 14, -, 15)
+#define Ws2    MAKE_W(SPH_T32,  0, +,  7, +,  9, -, 12, +, 15)
+#define Ws3    MAKE_W(SPH_T32,  0, -,  1, +,  8, -, 10, +, 13)
+#define Ws4    MAKE_W(SPH_T32,  1, +,  2, +,  9, -, 11, -, 14)
+#define Ws5    MAKE_W(SPH_T32,  3, -,  2, +, 10, -, 12, +, 15)
+#define Ws6    MAKE_W(SPH_T32,  4, -,  0, -,  3, -, 11, +, 13)
+#define Ws7    MAKE_W(SPH_T32,  1, -,  4, -,  5, -, 12, -, 14)
+#define Ws8    MAKE_W(SPH_T32,  2, -,  5, -,  6, +, 13, -, 15)
+#define Ws9    MAKE_W(SPH_T32,  0, -,  3, +,  6, -,  7, +, 14)
+#define Ws10   MAKE_W(SPH_T32,  8, -,  1, -,  4, -,  7, +, 15)
+#define Ws11   MAKE_W(SPH_T32,  8, -,  0, -,  2, -,  5, +,  9)
+#define Ws12   MAKE_W(SPH_T32,  1, +,  3, -,  6, -,  9, +, 10)
+#define Ws13   MAKE_W(SPH_T32,  2, +,  4, +,  7, +, 10, +, 11)
+#define Ws14   MAKE_W(SPH_T32,  3, -,  5, +,  8, -, 11, -, 12)
+#define Ws15   MAKE_W(SPH_T32, 12, -,  4, -,  6, -,  9, +, 13)
+
+#if SPH_SMALL_FOOTPRINT_BMW
+/* MAKE_Qas (small footprint): stores Ws0..Ws15 in a local array, then fills qt[0..14] in a loop of stride 5 (ss0..ss4 repeat every five entries) and qt[15] separately; qt[i] adds H(i+1), with qt[15] using H(0). */
+#define MAKE_Qas   do { \
+		unsigned u; \
+		sph_u32 Ws[16]; \
+		Ws[ 0] = Ws0; \
+		Ws[ 1] = Ws1; \
+		Ws[ 2] = Ws2; \
+		Ws[ 3] = Ws3; \
+		Ws[ 4] = Ws4; \
+		Ws[ 5] = Ws5; \
+		Ws[ 6] = Ws6; \
+		Ws[ 7] = Ws7; \
+		Ws[ 8] = Ws8; \
+		Ws[ 9] = Ws9; \
+		Ws[10] = Ws10; \
+		Ws[11] = Ws11; \
+		Ws[12] = Ws12; \
+		Ws[13] = Ws13; \
+		Ws[14] = Ws14; \
+		Ws[15] = Ws15; \
+		for (u = 0; u < 15; u += 5) { \
+			qt[u + 0] = SPH_T32(ss0(Ws[u + 0]) + H(u + 1)); \
+			qt[u + 1] = SPH_T32(ss1(Ws[u + 1]) + H(u + 2)); \
+			qt[u + 2] = SPH_T32(ss2(Ws[u + 2]) + H(u + 3)); \
+			qt[u + 3] = SPH_T32(ss3(Ws[u + 3]) + H(u + 4)); \
+			qt[u + 4] = SPH_T32(ss4(Ws[u + 4]) + H(u + 5)); \
+		} \
+		qt[15] = SPH_T32(ss0(Ws[15]) + H(0)); \
+	} while (0)
+/* MAKE_Qbs: fills qt[16..31]; entries 16 and 17 use expand1s, entries 18..31 use expand2s. Each entry reads the sixteen qt values before it, so the order matters. */
+#define MAKE_Qbs   do { \
+		qt[16] = expand1s(Qs, M, H, 16); \
+		qt[17] = expand1s(Qs, M, H, 17); \
+		qt[18] = expand2s(Qs, M, H, 18); \
+		qt[19] = expand2s(Qs, M, H, 19); \
+		qt[20] = expand2s(Qs, M, H, 20); \
+		qt[21] = expand2s(Qs, M, H, 21); \
+		qt[22] = expand2s(Qs, M, H, 22); \
+		qt[23] = expand2s(Qs, M, H, 23); \
+		qt[24] = expand2s(Qs, M, H, 24); \
+		qt[25] = expand2s(Qs, M, H, 25); \
+		qt[26] = expand2s(Qs, M, H, 26); \
+		qt[27] = expand2s(Qs, M, H, 27); \
+		qt[28] = expand2s(Qs, M, H, 28); \
+		qt[29] = expand2s(Qs, M, H, 29); \
+		qt[30] = expand2s(Qs, M, H, 30); \
+		qt[31] = expand2s(Qs, M, H, 31); \
+	} while (0)
+
+#else
+
+/*
+ * Fully unrolled first half of the 32-bit quad-pipe: fill qt[0..15].
+ * Entry i is SPH_T32(ss<i mod 5>(Ws<i>) + H((i + 1) mod 16)); the last
+ * line uses H(0) because the H() index wraps around.
+ * Expects qt[] and the H() accessor to be in scope where it is expanded.
+ */
+#define MAKE_Qas   do { \
+		qt[ 0] = SPH_T32(ss0(Ws0 ) + H( 1)); \
+		qt[ 1] = SPH_T32(ss1(Ws1 ) + H( 2)); \
+		qt[ 2] = SPH_T32(ss2(Ws2 ) + H( 3)); \
+		qt[ 3] = SPH_T32(ss3(Ws3 ) + H( 4)); \
+		qt[ 4] = SPH_T32(ss4(Ws4 ) + H( 5)); \
+		qt[ 5] = SPH_T32(ss0(Ws5 ) + H( 6)); \
+		qt[ 6] = SPH_T32(ss1(Ws6 ) + H( 7)); \
+		qt[ 7] = SPH_T32(ss2(Ws7 ) + H( 8)); \
+		qt[ 8] = SPH_T32(ss3(Ws8 ) + H( 9)); \
+		qt[ 9] = SPH_T32(ss4(Ws9 ) + H(10)); \
+		qt[10] = SPH_T32(ss0(Ws10) + H(11)); \
+		qt[11] = SPH_T32(ss1(Ws11) + H(12)); \
+		qt[12] = SPH_T32(ss2(Ws12) + H(13)); \
+		qt[13] = SPH_T32(ss3(Ws13) + H(14)); \
+		qt[14] = SPH_T32(ss4(Ws14) + H(15)); \
+		qt[15] = SPH_T32(ss0(Ws15) + H( 0)); \
+	} while (0)
+
+/*
+ * Second half of the 32-bit quad-pipe: fill qt[16..31].
+ * Entries 16 and 17 use expand1s, entries 18..31 use expand2s; every
+ * call is given the Qs, M and H accessors plus a literal index.
+ */
+#define MAKE_Qbs   do { \
+		qt[16] = expand1s(Qs, M, H, 16); \
+		qt[17] = expand1s(Qs, M, H, 17); \
+		qt[18] = expand2s(Qs, M, H, 18); \
+		qt[19] = expand2s(Qs, M, H, 19); \
+		qt[20] = expand2s(Qs, M, H, 20); \
+		qt[21] = expand2s(Qs, M, H, 21); \
+		qt[22] = expand2s(Qs, M, H, 22); \
+		qt[23] = expand2s(Qs, M, H, 23); \
+		qt[24] = expand2s(Qs, M, H, 24); \
+		qt[25] = expand2s(Qs, M, H, 25); \
+		qt[26] = expand2s(Qs, M, H, 26); \
+		qt[27] = expand2s(Qs, M, H, 27); \
+		qt[28] = expand2s(Qs, M, H, 28); \
+		qt[29] = expand2s(Qs, M, H, 29); \
+		qt[30] = expand2s(Qs, M, H, 30); \
+		qt[31] = expand2s(Qs, M, H, 31); \
+	} while (0)
+
+#endif
+
+/*
+ * Populate the whole 32-entry qt[] array for the 32-bit variant:
+ * MAKE_Qas writes qt[0..15], then MAKE_Qbs writes qt[16..31].
+ */
+#define MAKE_Qs   do { \
+		MAKE_Qas; \
+		MAKE_Qbs; \
+	} while (0)
+
+/* Accessor for entry j of qt[]; passed to the expand and FOLD macros. */
+#define Qs(j)   (qt[j])
+
+#if SPH_64
+
+/*
+ * The sixteen W terms for the 64-bit variant. Each line hands MAKE_W
+ * (defined outside this excerpt) the SPH_T64 truncation macro followed
+ * by five indices separated by four +/- operators.
+ */
+#define Wb0    MAKE_W(SPH_T64,  5, -,  7, +, 10, +, 13, +, 14)
+#define Wb1    MAKE_W(SPH_T64,  6, -,  8, +, 11, +, 14, -, 15)
+#define Wb2    MAKE_W(SPH_T64,  0, +,  7, +,  9, -, 12, +, 15)
+#define Wb3    MAKE_W(SPH_T64,  0, -,  1, +,  8, -, 10, +, 13)
+#define Wb4    MAKE_W(SPH_T64,  1, +,  2, +,  9, -, 11, -, 14)
+#define Wb5    MAKE_W(SPH_T64,  3, -,  2, +, 10, -, 12, +, 15)
+#define Wb6    MAKE_W(SPH_T64,  4, -,  0, -,  3, -, 11, +, 13)
+#define Wb7    MAKE_W(SPH_T64,  1, -,  4, -,  5, -, 12, -, 14)
+#define Wb8    MAKE_W(SPH_T64,  2, -,  5, -,  6, +, 13, -, 15)
+#define Wb9    MAKE_W(SPH_T64,  0, -,  3, +,  6, -,  7, +, 14)
+#define Wb10   MAKE_W(SPH_T64,  8, -,  1, -,  4, -,  7, +, 15)
+#define Wb11   MAKE_W(SPH_T64,  8, -,  0, -,  2, -,  5, +,  9)
+#define Wb12   MAKE_W(SPH_T64,  1, +,  3, -,  6, -,  9, +, 10)
+#define Wb13   MAKE_W(SPH_T64,  2, +,  4, +,  7, +, 10, +, 11)
+#define Wb14   MAKE_W(SPH_T64,  3, -,  5, +,  8, -, 11, -, 12)
+#define Wb15   MAKE_W(SPH_T64, 12, -,  4, -,  6, -,  9, +, 13)
+
+#if SPH_SMALL_FOOTPRINT_BMW
+
+/*
+ * Small-footprint form of the first half of the 64-bit quad-pipe.
+ * The sixteen Wb* terms are stored in a local array so that a loop can
+ * walk them: entry i of qt[] receives
+ *     SPH_T64(sb<i mod 5>(Wb[i]) + H((i + 1) mod 16)).
+ * The loop handles indices 0..14 in three groups of five; index 15 is
+ * written separately because its H() index wraps around to 0.
+ * Expects qt[] and the H() accessor to be in scope where it is expanded.
+ */
+#define MAKE_Qab   do { \
+		unsigned u; \
+		sph_u64 Wb[16]; \
+		Wb[ 0] = Wb0; \
+		Wb[ 1] = Wb1; \
+		Wb[ 2] = Wb2; \
+		Wb[ 3] = Wb3; \
+		Wb[ 4] = Wb4; \
+		Wb[ 5] = Wb5; \
+		Wb[ 6] = Wb6; \
+		Wb[ 7] = Wb7; \
+		Wb[ 8] = Wb8; \
+		Wb[ 9] = Wb9; \
+		Wb[10] = Wb10; \
+		Wb[11] = Wb11; \
+		Wb[12] = Wb12; \
+		Wb[13] = Wb13; \
+		Wb[14] = Wb14; \
+		Wb[15] = Wb15; \
+		for (u = 0; u < 15; u += 5) { \
+			qt[u + 0] = SPH_T64(sb0(Wb[u + 0]) + H(u + 1)); \
+			qt[u + 1] = SPH_T64(sb1(Wb[u + 1]) + H(u + 2)); \
+			qt[u + 2] = SPH_T64(sb2(Wb[u + 2]) + H(u + 3)); \
+			qt[u + 3] = SPH_T64(sb3(Wb[u + 3]) + H(u + 4)); \
+			qt[u + 4] = SPH_T64(sb4(Wb[u + 4]) + H(u + 5)); \
+		} \
+		qt[15] = SPH_T64(sb0(Wb[15]) + H(0)); \
+	} while (0)
+
+/*
+ * Small-footprint form of the second half of the 64-bit quad-pipe:
+ * qt[16] and qt[17] come from expand1b, qt[18..31] from expand2b, with
+ * the index supplied by the loop variable u.
+ */
+#define MAKE_Qbb   do { \
+		unsigned u; \
+		for (u = 16; u < 18; u ++) \
+			qt[u] = expand1b(Qb, M, H, u); \
+		for (u = 18; u < 32; u ++) \
+			qt[u] = expand2b(Qb, M, H, u); \
+	} while (0)
+
+#else
+
+/*
+ * Fully unrolled first half of the 64-bit quad-pipe: fill qt[0..15].
+ * Entry i is SPH_T64(sb<i mod 5>(Wb<i>) + H((i + 1) mod 16)); the last
+ * line uses H(0) because the H() index wraps around.
+ * Expects qt[] and the H() accessor to be in scope where it is expanded.
+ */
+#define MAKE_Qab   do { \
+		qt[ 0] = SPH_T64(sb0(Wb0 ) + H( 1)); \
+		qt[ 1] = SPH_T64(sb1(Wb1 ) + H( 2)); \
+		qt[ 2] = SPH_T64(sb2(Wb2 ) + H( 3)); \
+		qt[ 3] = SPH_T64(sb3(Wb3 ) + H( 4)); \
+		qt[ 4] = SPH_T64(sb4(Wb4 ) + H( 5)); \
+		qt[ 5] = SPH_T64(sb0(Wb5 ) + H( 6)); \
+		qt[ 6] = SPH_T64(sb1(Wb6 ) + H( 7)); \
+		qt[ 7] = SPH_T64(sb2(Wb7 ) + H( 8)); \
+		qt[ 8] = SPH_T64(sb3(Wb8 ) + H( 9)); \
+		qt[ 9] = SPH_T64(sb4(Wb9 ) + H(10)); \
+		qt[10] = SPH_T64(sb0(Wb10) + H(11)); \
+		qt[11] = SPH_T64(sb1(Wb11) + H(12)); \
+		qt[12] = SPH_T64(sb2(Wb12) + H(13)); \
+		qt[13] = SPH_T64(sb3(Wb13) + H(14)); \
+		qt[14] = SPH_T64(sb4(Wb14) + H(15)); \
+		qt[15] = SPH_T64(sb0(Wb15) + H( 0)); \
+	} while (0)
+
+/*
+ * Fully unrolled second half of the 64-bit quad-pipe: fill qt[16..31].
+ * Entries 16 and 17 use expand1b, entries 18..31 use expand2b; every
+ * call is given the Qb, M and H accessors plus a literal index.
+ */
+#define MAKE_Qbb   do { \
+		qt[16] = expand1b(Qb, M, H, 16); \
+		qt[17] = expand1b(Qb, M, H, 17); \
+		qt[18] = expand2b(Qb, M, H, 18); \
+		qt[19] = expand2b(Qb, M, H, 19); \
+		qt[20] = expand2b(Qb, M, H, 20); \
+		qt[21] = expand2b(Qb, M, H, 21); \
+		qt[22] = expand2b(Qb, M, H, 22); \
+		qt[23] = expand2b(Qb, M, H, 23); \
+		qt[24] = expand2b(Qb, M, H, 24); \
+		qt[25] = expand2b(Qb, M, H, 25); \
+		qt[26] = expand2b(Qb, M, H, 26); \
+		qt[27] = expand2b(Qb, M, H, 27); \
+		qt[28] = expand2b(Qb, M, H, 28); \
+		qt[29] = expand2b(Qb, M, H, 29); \
+		qt[30] = expand2b(Qb, M, H, 30); \
+		qt[31] = expand2b(Qb, M, H, 31); \
+	} while (0)
+
+#endif
+
+/*
+ * Populate the whole 32-entry qt[] array for the 64-bit variant:
+ * MAKE_Qab writes qt[0..15], then MAKE_Qbb writes qt[16..31].
+ */
+#define MAKE_Qb   do { \
+		MAKE_Qab; \
+		MAKE_Qbb; \
+	} while (0)
+
+/* Accessor for entry j of qt[]; passed to the expand and FOLD macros. */
+#define Qb(j)   (qt[j])
+
+#endif
+
+/*
+ * Body of the compression function, generic over the word size.
+ *
+ *   type  word type used for qt[], xl and xh
+ *   mkQ   statement that fills qt[0..31] (MAKE_Qs or MAKE_Qb)
+ *   tt    truncation macro applied to every output word
+ *   rol   left-rotation macro
+ *   mf    accessor for message word j
+ *   qf    accessor for qt[j]
+ *   dhf   accessor (lvalue) for output word j
+ *
+ * After mkQ, xl is the XOR of qf(16)..qf(23) and xh is xl XORed with
+ * qf(24)..qf(31). Output words 0..7 are then computed from xh, xl, the
+ * message and qt[]; output words 8..15 additionally rotate output words
+ * 4,5,6,7,0,1,2,3 (in that order), so the first eight assignments must
+ * stay ahead of the last eight.
+ */
+#define FOLD(type, mkQ, tt, rol, mf, qf, dhf)   do { \
+		type qt[32], xl, xh; \
+		mkQ; \
+		xl = qf(16) ^ qf(17) ^ qf(18) ^ qf(19) \
+			^ qf(20) ^ qf(21) ^ qf(22) ^ qf(23); \
+		xh = xl ^ qf(24) ^ qf(25) ^ qf(26) ^ qf(27) \
+			^ qf(28) ^ qf(29) ^ qf(30) ^ qf(31); \
+		dhf( 0) = tt(((xh <<  5) ^ (qf(16) >>  5) ^ mf( 0)) \
+			+ (xl ^ qf(24) ^ qf( 0))); \
+		dhf( 1) = tt(((xh >>  7) ^ (qf(17) <<  8) ^ mf( 1)) \
+			+ (xl ^ qf(25) ^ qf( 1))); \
+		dhf( 2) = tt(((xh >>  5) ^ (qf(18) <<  5) ^ mf( 2)) \
+			+ (xl ^ qf(26) ^ qf( 2))); \
+		dhf( 3) = tt(((xh >>  1) ^ (qf(19) <<  5) ^ mf( 3)) \
+			+ (xl ^ qf(27) ^ qf( 3))); \
+		dhf( 4) = tt(((xh >>  3) ^ (qf(20) <<  0) ^ mf( 4)) \
+			+ (xl ^ qf(28) ^ qf( 4))); \
+		dhf( 5) = tt(((xh <<  6) ^ (qf(21) >>  6) ^ mf( 5)) \
+			+ (xl ^ qf(29) ^ qf( 5))); \
+		dhf( 6) = tt(((xh >>  4) ^ (qf(22) <<  6) ^ mf( 6)) \
+			+ (xl ^ qf(30) ^ qf( 6))); \
+		dhf( 7) = tt(((xh >> 11) ^ (qf(23) <<  2) ^ mf( 7)) \
+			+ (xl ^ qf(31) ^ qf( 7))); \
+		dhf( 8) = tt(rol(dhf(4),  9) + (xh ^ qf(24) ^ mf( 8)) \
+			+ ((xl << 8) ^ qf(23) ^ qf( 8))); \
+		dhf( 9) = tt(rol(dhf(5), 10) + (xh ^ qf(25) ^ mf( 9)) \
+			+ ((xl >> 6) ^ qf(16) ^ qf( 9))); \
+		dhf(10) = tt(rol(dhf(6), 11) + (xh ^ qf(26) ^ mf(10)) \
+			+ ((xl << 6) ^ qf(17) ^ qf(10))); \
+		dhf(11) = tt(rol(dhf(7), 12) + (xh ^ qf(27) ^ mf(11)) \
+			+ ((xl << 4) ^ qf(18) ^ qf(11))); \
+		dhf(12) = tt(rol(dhf(0), 13) + (xh ^ qf(28) ^ mf(12)) \
+			+ ((xl >> 3) ^ qf(19) ^ qf(12))); \
+		dhf(13) = tt(rol(dhf(1), 14) + (xh ^ qf(29) ^ mf(13)) \
+			+ ((xl >> 4) ^ qf(20) ^ qf(13))); \
+		dhf(14) = tt(rol(dhf(2), 15) + (xh ^ qf(30) ^ mf(14)) \
+			+ ((xl >> 7) ^ qf(21) ^ qf(14))); \
+		dhf(15) = tt(rol(dhf(3), 16) + (xh ^ qf(31) ^ mf(15)) \
+			+ ((xl >> 2) ^ qf(22) ^ qf(15))); \
+	} while (0)
+
+/* 32-bit instantiation of FOLD; needs M(), H() and dH() defined by the user. */
+#define FOLDs   FOLD(sph_u32, MAKE_Qs, SPH_T32, SPH_ROTL32, M, Qs, dH)
+
+#if SPH_64
+
+/* 64-bit instantiation of FOLD; needs M(), H() and dH() defined by the user. */
+#define FOLDb   FOLD(sph_u64, MAKE_Qb, SPH_T64, SPH_ROTL64, M, Qb, dH)
+
+#endif
+
+/*
+ * Apply the 32-bit compression function (FOLDs) to one 64-byte block.
+ *
+ *   data  the block; its words are read with sph_dec32le_aligned()
+ *   h     input chaining value (16 words), only read
+ *   dh    output chaining value (16 words)
+ */
+static void
+compress_small(const unsigned char *data, const sph_u32 h[16], sph_u32 dh[16])
+{
+#if SPH_LITTLE_FAST
+	/* Decode each message word on demand, directly from the block. */
+#define M(x)    sph_dec32le_aligned(data + 4 * (x))
+#else
+	/* Decode the sixteen message words once into a local array. */
+	sph_u32 mv[16];
+	size_t idx;
+
+	for (idx = 0; idx < 16; idx ++)
+		mv[idx] = sph_dec32le_aligned(data + 4 * idx);
+#define M(x)    (mv[x])
+#endif
+#define H(x)    (h[x])
+#define dH(x)   (dh[x])
+
+	FOLDs;
+
+#undef M
+#undef H
+#undef dH
+}
+
+/*
+ * Fixed chaining value for the extra compression performed at the end of
+ * bmw32_close(): it is passed as the "h" argument of compress_small()
+ * while the encoded state from the last data block serves as the message.
+ */
+static const sph_u32 final_s[16] = {
+	SPH_C32(0xaaaaaaa0), SPH_C32(0xaaaaaaa1), SPH_C32(0xaaaaaaa2),
+	SPH_C32(0xaaaaaaa3), SPH_C32(0xaaaaaaa4), SPH_C32(0xaaaaaaa5),
+	SPH_C32(0xaaaaaaa6), SPH_C32(0xaaaaaaa7), SPH_C32(0xaaaaaaa8),
+	SPH_C32(0xaaaaaaa9), SPH_C32(0xaaaaaaaa), SPH_C32(0xaaaaaaab),
+	SPH_C32(0xaaaaaaac), SPH_C32(0xaaaaaaad), SPH_C32(0xaaaaaaae),
+	SPH_C32(0xaaaaaaaf)
+};
+
+/*
+ * Reset a small (32-bit word) context: zero the message bit counter,
+ * mark the block buffer as empty and load the chaining value from iv
+ * (sizeof sc->H bytes are copied).
+ */
+static void
+bmw32_init(sph_bmw_small_context *sc, const sph_u32 *iv)
+{
+#if SPH_64
+	sc->bit_count = 0;
+#else
+	sc->bit_count_low = 0;
+	sc->bit_count_high = 0;
+#endif
+	sc->ptr = 0;
+	memcpy(sc->H, iv, sizeof sc->H);
+}
+
+/*
+ * Absorb len bytes from data into a small (32-bit word) context.
+ *
+ * Input is staged in sc->buf; each time the buffer fills up it is run
+ * through compress_small(). The chaining value ping-pongs between sc->H
+ * and a stack array so no copy is needed per block; if the live value
+ * ends up in the stack array it is copied back to sc->H before return.
+ */
+static void
+bmw32(sph_bmw_small_context *sc, const void *data, size_t len)
+{
+	const unsigned char *src = data;
+	unsigned char *block = sc->buf;
+	size_t fill = sc->ptr;
+	sph_u32 scratch[16];
+	sph_u32 *cur = sc->H;
+	sph_u32 *spare = scratch;
+
+	/* Add the new input to the running message length, in bits. */
+#if SPH_64
+	sc->bit_count += (sph_u64)len << 3;
+#else
+	{
+		sph_u32 prev_low = sc->bit_count_low;
+
+		sc->bit_count_low = SPH_T32(prev_low + ((sph_u32)len << 3));
+		if (sc->bit_count_low < prev_low)
+			sc->bit_count_high ++;
+		sc->bit_count_high += len >> 29;
+	}
+#endif
+
+	while (len != 0) {
+		size_t room = (sizeof sc->buf) - fill;
+		size_t take = (len < room) ? len : room;
+		sph_u32 *swap;
+
+		memcpy(block + fill, src, take);
+		src += take;
+		len -= take;
+		fill += take;
+		if (fill != sizeof sc->buf)
+			continue;
+
+		/* Buffer is full: compress it and exchange the two states. */
+		compress_small(block, cur, spare);
+		swap = cur;
+		cur = spare;
+		spare = swap;
+		fill = 0;
+	}
+
+	sc->ptr = fill;
+	if (cur != sc->H)
+		memcpy(sc->H, cur, sizeof sc->H);
+}
+
+/*
+ * Finish a small (32-bit word) computation and write the digest.
+ *
+ *   sc            context holding the buffered tail and chaining value
+ *   ub, n         n extra bits (taken from the top of ub) to append
+ *   dst           output buffer, receives 4 * out_size_w32 bytes
+ *   out_size_w32  digest length in 32-bit words
+ *
+ * sc->buf is used as scratch space; sc->H, sc->ptr and the bit counter
+ * are not written by this function.
+ */
+static void
+bmw32_close(sph_bmw_small_context *sc, unsigned ub, unsigned n,
+	void *dst, size_t out_size_w32)
+{
+	const size_t len_off = (sizeof sc->buf) - 8;
+	unsigned char *block = sc->buf;
+	unsigned char *out = dst;
+	size_t fill = sc->ptr;
+	size_t i;
+	unsigned marker = 0x80 >> n;
+	sph_u32 first[16], second[16];
+	sph_u32 *chain = sc->H;
+
+	/* Keep the top n bits of ub, set the next bit, clear the rest. */
+	block[fill ++] = ((ub & -marker) | marker) & 0xFF;
+
+	/* Not enough room for the 8-byte length: flush one more block. */
+	if (fill > len_off) {
+		memset(block + fill, 0, (sizeof sc->buf) - fill);
+		compress_small(block, chain, first);
+		fill = 0;
+		chain = first;
+	}
+
+	/* Zero padding, then the message bit length in the last 8 bytes. */
+	memset(block + fill, 0, len_off - fill);
+#if SPH_64
+	sph_enc64le_aligned(block + len_off,
+		SPH_T64(sc->bit_count + n));
+#else
+	sph_enc32le_aligned(block + len_off,
+		sc->bit_count_low + n);
+	sph_enc32le_aligned(block + len_off + 4,
+		SPH_T32(sc->bit_count_high));
+#endif
+	compress_small(block, chain, second);
+
+	/* Extra round: the state becomes the message, final_s the input. */
+	for (i = 0; i < 16; i ++)
+		sph_enc32le_aligned(block + 4 * i, second[i]);
+	compress_small(block, final_s, first);
+
+	/* The digest is the last out_size_w32 words of the result. */
+	for (i = 0; i < out_size_w32; i ++)
+		sph_enc32le(out + 4 * i, first[16 - out_size_w32 + i]);
+}
+
+#if SPH_64
+
+/*
+ * Apply the 64-bit compression function (FOLDb) to one 128-byte block.
+ *
+ *   data  the block; its words are read with sph_dec64le_aligned()
+ *   h     input chaining value (16 words), only read
+ *   dh    output chaining value (16 words)
+ */
+static void
+compress_big(const unsigned char *data, const sph_u64 h[16], sph_u64 dh[16])
+{
+#if SPH_LITTLE_FAST
+	/* Decode each message word on demand, directly from the block. */
+#define M(x)    sph_dec64le_aligned(data + 8 * (x))
+#else
+	/* Decode the sixteen message words once into a local array. */
+	sph_u64 mv[16];
+	size_t idx;
+
+	for (idx = 0; idx < 16; idx ++)
+		mv[idx] = sph_dec64le_aligned(data + 8 * idx);
+#define M(x)    (mv[x])
+#endif
+#define H(x)    (h[x])
+#define dH(x)   (dh[x])
+
+	FOLDb;
+
+#undef M
+#undef H
+#undef dH
+}
+
+/*
+ * Fixed chaining value for the extra compression performed at the end of
+ * bmw64_close(): it is passed as the "h" argument of compress_big()
+ * while the encoded state from the last data block serves as the message.
+ */
+static const sph_u64 final_b[16] = {
+	SPH_C64(0xaaaaaaaaaaaaaaa0), SPH_C64(0xaaaaaaaaaaaaaaa1),
+	SPH_C64(0xaaaaaaaaaaaaaaa2), SPH_C64(0xaaaaaaaaaaaaaaa3),
+	SPH_C64(0xaaaaaaaaaaaaaaa4), SPH_C64(0xaaaaaaaaaaaaaaa5),
+	SPH_C64(0xaaaaaaaaaaaaaaa6), SPH_C64(0xaaaaaaaaaaaaaaa7),
+	SPH_C64(0xaaaaaaaaaaaaaaa8), SPH_C64(0xaaaaaaaaaaaaaaa9),
+	SPH_C64(0xaaaaaaaaaaaaaaaa), SPH_C64(0xaaaaaaaaaaaaaaab),
+	SPH_C64(0xaaaaaaaaaaaaaaac), SPH_C64(0xaaaaaaaaaaaaaaad),
+	SPH_C64(0xaaaaaaaaaaaaaaae), SPH_C64(0xaaaaaaaaaaaaaaaf)
+};
+
+/*
+ * Reset a big (64-bit word) context: zero the message bit counter,
+ * mark the block buffer as empty and load the chaining value from iv
+ * (sizeof sc->H bytes are copied).
+ */
+static void
+bmw64_init(sph_bmw_big_context *sc, const sph_u64 *iv)
+{
+	sc->bit_count = 0;
+	sc->ptr = 0;
+	memcpy(sc->H, iv, sizeof sc->H);
+}
+
+/*
+ * Absorb len bytes from data into a big (64-bit word) context.
+ *
+ * Input is staged in sc->buf; each time the buffer fills up it is run
+ * through compress_big(). The chaining value ping-pongs between sc->H
+ * and a stack array so no copy is needed per block; if the live value
+ * ends up in the stack array it is copied back to sc->H before return.
+ */
+static void
+bmw64(sph_bmw_big_context *sc, const void *data, size_t len)
+{
+	const unsigned char *src = data;
+	unsigned char *block = sc->buf;
+	size_t fill = sc->ptr;
+	sph_u64 scratch[16];
+	sph_u64 *cur = sc->H;
+	sph_u64 *spare = scratch;
+
+	/* Add the new input to the running message length, in bits. */
+	sc->bit_count += (sph_u64)len << 3;
+
+	while (len != 0) {
+		size_t room = (sizeof sc->buf) - fill;
+		size_t take = (len < room) ? len : room;
+		sph_u64 *swap;
+
+		memcpy(block + fill, src, take);
+		src += take;
+		len -= take;
+		fill += take;
+		if (fill != sizeof sc->buf)
+			continue;
+
+		/* Buffer is full: compress it and exchange the two states. */
+		compress_big(block, cur, spare);
+		swap = cur;
+		cur = spare;
+		spare = swap;
+		fill = 0;
+	}
+
+	sc->ptr = fill;
+	if (cur != sc->H)
+		memcpy(sc->H, cur, sizeof sc->H);
+}
+
+/*
+ * Finish a big (64-bit word) computation and write the digest.
+ *
+ *   sc            context holding the buffered tail and chaining value
+ *   ub, n         n extra bits (taken from the top of ub) to append
+ *   dst           output buffer, receives 8 * out_size_w64 bytes
+ *   out_size_w64  digest length in 64-bit words
+ *
+ * sc->buf is used as scratch space; sc->H, sc->ptr and the bit counter
+ * are not written by this function.
+ */
+static void
+bmw64_close(sph_bmw_big_context *sc, unsigned ub, unsigned n,
+	void *dst, size_t out_size_w64)
+{
+	const size_t len_off = (sizeof sc->buf) - 8;
+	unsigned char *block = sc->buf;
+	unsigned char *out = dst;
+	size_t fill = sc->ptr;
+	size_t i;
+	unsigned marker = 0x80 >> n;
+	sph_u64 first[16], second[16];
+	sph_u64 *chain = sc->H;
+
+	/* Keep the top n bits of ub, set the next bit, clear the rest. */
+	block[fill ++] = ((ub & -marker) | marker) & 0xFF;
+
+	/* Not enough room for the 8-byte length: flush one more block. */
+	if (fill > len_off) {
+		memset(block + fill, 0, (sizeof sc->buf) - fill);
+		compress_big(block, chain, first);
+		fill = 0;
+		chain = first;
+	}
+
+	/* Zero padding, then the message bit length in the last 8 bytes. */
+	memset(block + fill, 0, len_off - fill);
+	sph_enc64le_aligned(block + len_off,
+		SPH_T64(sc->bit_count + n));
+	compress_big(block, chain, second);
+
+	/* Extra round: the state becomes the message, final_b the input. */
+	for (i = 0; i < 16; i ++)
+		sph_enc64le_aligned(block + 8 * i, second[i]);
+	compress_big(block, final_b, first);
+
+	/* The digest is the last out_size_w64 words of the result. */
+	for (i = 0; i < out_size_w64; i ++)
+		sph_enc64le(out + 8 * i, first[16 - out_size_w64 + i]);
+}
+
+#endif
+
+/* BMW-224: start a new computation with the IV224 table (see sph_bmw.h). */
+void
+sph_bmw224_init(void *cc)
+{
+	sph_bmw_small_context *sc = cc;
+
+	bmw32_init(sc, IV224);
+}
+
+/* BMW-224: feed len bytes of data into the computation (see sph_bmw.h). */
+void
+sph_bmw224(void *cc, const void *data, size_t len)
+{
+	sph_bmw_small_context *sc = cc;
+
+	bmw32(sc, data, len);
+}
+
+/* BMW-224: finish with no extra bits and write the digest (see sph_bmw.h). */
+void
+sph_bmw224_close(void *cc, void *dst)
+{
+	/* ub = 0, n = 0: the message ends on a byte boundary. */
+	sph_bmw224_addbits_and_close(cc, 0U, 0U, dst);
+}
+
+/*
+ * BMW-224: append n extra bits from ub, finish, and write the 7-word
+ * (28-byte) digest to dst. See sph_bmw.h.
+ */
+void
+sph_bmw224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	bmw32_close(cc, ub, n, dst, 7);
+	/*
+	 * The context is not re-initialized here (the sph_bmw224_init()
+	 * call is disabled); callers must call sph_bmw224_init() again
+	 * before reusing cc.
+	 */
+}
+
+/* BMW-256: start a new computation with the IV256 table (see sph_bmw.h). */
+void
+sph_bmw256_init(void *cc)
+{
+	sph_bmw_small_context *sc = cc;
+
+	bmw32_init(sc, IV256);
+}
+
+/* BMW-256: feed len bytes of data into the computation (see sph_bmw.h). */
+void
+sph_bmw256(void *cc, const void *data, size_t len)
+{
+	sph_bmw_small_context *sc = cc;
+
+	bmw32(sc, data, len);
+}
+
+/* BMW-256: finish with no extra bits and write the digest (see sph_bmw.h). */
+void
+sph_bmw256_close(void *cc, void *dst)
+{
+	/* ub = 0, n = 0: the message ends on a byte boundary. */
+	sph_bmw256_addbits_and_close(cc, 0U, 0U, dst);
+}
+
+/*
+ * BMW-256: append n extra bits from ub, finish, and write the 8-word
+ * (32-byte) digest to dst. See sph_bmw.h.
+ */
+void
+sph_bmw256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	bmw32_close(cc, ub, n, dst, 8);
+	/*
+	 * The context is not re-initialized here (the sph_bmw256_init()
+	 * call is disabled); callers must call sph_bmw256_init() again
+	 * before reusing cc.
+	 */
+}
+
+#if SPH_64
+
+/* BMW-384: start a new computation with the IV384 table (see sph_bmw.h). */
+void
+sph_bmw384_init(void *cc)
+{
+	sph_bmw_big_context *sc = cc;
+
+	bmw64_init(sc, IV384);
+}
+
+/* BMW-384: feed len bytes of data into the computation (see sph_bmw.h). */
+void
+sph_bmw384(void *cc, const void *data, size_t len)
+{
+	sph_bmw_big_context *sc = cc;
+
+	bmw64(sc, data, len);
+}
+
+/* BMW-384: finish with no extra bits and write the digest (see sph_bmw.h). */
+void
+sph_bmw384_close(void *cc, void *dst)
+{
+	/* ub = 0, n = 0: the message ends on a byte boundary. */
+	sph_bmw384_addbits_and_close(cc, 0U, 0U, dst);
+}
+
+/*
+ * BMW-384: append n extra bits from ub, finish, and write the 6-word
+ * (48-byte) digest to dst. See sph_bmw.h.
+ */
+void
+sph_bmw384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	bmw64_close(cc, ub, n, dst, 6);
+	/*
+	 * The context is not re-initialized here (the sph_bmw384_init()
+	 * call is disabled); callers must call sph_bmw384_init() again
+	 * before reusing cc.
+	 */
+}
+
+/* BMW-512: start a new computation with the IV512 table (see sph_bmw.h). */
+void
+sph_bmw512_init(void *cc)
+{
+	sph_bmw_big_context *sc = cc;
+
+	bmw64_init(sc, IV512);
+}
+
+/* BMW-512: feed len bytes of data into the computation (see sph_bmw.h). */
+void
+sph_bmw512(void *cc, const void *data, size_t len)
+{
+	sph_bmw_big_context *sc = cc;
+
+	bmw64(sc, data, len);
+}
+
+/* BMW-512: finish with no extra bits and write the digest (see sph_bmw.h). */
+void
+sph_bmw512_close(void *cc, void *dst)
+{
+	/* ub = 0, n = 0: the message ends on a byte boundary. */
+	sph_bmw512_addbits_and_close(cc, 0U, 0U, dst);
+}
+
+/*
+ * BMW-512: append n extra bits from ub, finish, and write the 8-word
+ * (64-byte) digest to dst. See sph_bmw.h.
+ */
+void
+sph_bmw512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	bmw64_close(cc, ub, n, dst, 8);
+	/*
+	 * The context is not re-initialized here (the sph_bmw512_init()
+	 * call is disabled); callers must call sph_bmw512_init() again
+	 * before reusing cc.
+	 */
+}
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
--- a/algo/bmw/sph_bmw.h
+++ b/algo/bmw/sph_bmw.h
@@ -0,0 +1,328 @@
+/* $Id: sph_bmw.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * BMW interface. BMW (aka "Blue Midnight Wish") is a family of
+ * functions which differ by their output size; this implementation
+ * defines BMW for output sizes 224, 256, 384 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_bmw.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_BMW_H__
+#define SPH_BMW_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "algo/sha3/sph_types.h"
+
+/**
+ * Output size (in bits) for BMW-224.
+ */
+#define SPH_SIZE_bmw224   224
+
+/**
+ * Output size (in bits) for BMW-256.
+ */
+#define SPH_SIZE_bmw256   256
+
+#if SPH_64
+
+/**
+ * Output size (in bits) for BMW-384.
+ */
+#define SPH_SIZE_bmw384   384
+
+/**
+ * Output size (in bits) for BMW-512.
+ */
+#define SPH_SIZE_bmw512   512
+
+#endif
+
+/**
+ * This structure is a context for BMW-224 and BMW-256 computations:
+ * it contains the intermediate values and some data from the last
+ * entered block. Once a BMW computation has been performed, the
+ * context can be reused for another computation (NOTE(review): the
+ * close functions in the accompanying implementation have their
+ * re-initialization call disabled, so call the matching init function
+ * again first).
+ *
+ * The contents of this structure are private. A running BMW
+ * computation can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[64];    /* first field, for alignment */
+	/* number of input bytes currently held in buf */
+	size_t ptr;
+	/* current chaining value */
+	sph_u32 H[16];
+	/* total input length so far, in bits */
+#if SPH_64
+	sph_u64 bit_count;
+#else
+	sph_u32 bit_count_high, bit_count_low;
+#endif
+#endif
+} sph_bmw_small_context;
+
+/**
+ * This structure is a context for BMW-224 computations. It is
+ * identical to the common <code>sph_bmw_small_context</code>.
+ */
+typedef sph_bmw_small_context sph_bmw224_context;
+
+/**
+ * This structure is a context for BMW-256 computations. It is
+ * identical to the common <code>sph_bmw_small_context</code>.
+ */
+typedef sph_bmw_small_context sph_bmw256_context;
+
+#if SPH_64
+
+/**
+ * This structure is a context for BMW-384 and BMW-512 computations:
+ * it contains the intermediate values and some data from the last
+ * entered block. Once a BMW computation has been performed, the
+ * context can be reused for another computation (NOTE(review): the
+ * close functions in the accompanying implementation have their
+ * re-initialization call disabled, so call the matching init function
+ * again first).
+ *
+ * The contents of this structure are private. A running BMW
+ * computation can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[128];    /* first field, for alignment */
+	/* number of input bytes currently held in buf */
+	size_t ptr;
+	/* current chaining value */
+	sph_u64 H[16];
+	/* total input length so far, in bits */
+	sph_u64 bit_count;
+#endif
+} sph_bmw_big_context;
+
+/**
+ * This structure is a context for BMW-384 computations. It is
+ * identical to the common <code>sph_bmw_big_context</code>.
+ */
+typedef sph_bmw_big_context sph_bmw384_context;
+
+/**
+ * This structure is a context for BMW-512 computations. It is
+ * identical to the common <code>sph_bmw_big_context</code>.
+ */
+typedef sph_bmw_big_context sph_bmw512_context;
+
+#endif
+
+/**
+ * Initialize a BMW-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the BMW-224 context (pointer to a
+ *             <code>sph_bmw224_context</code>)
+ */
+void sph_bmw224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BMW-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_bmw224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BMW-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). NOTE: the accompanying
+ * implementation has its re-initialization call disabled, so the
+ * context is NOT reset; call <code>sph_bmw224_init()</code> again
+ * before reusing it.
+ *
+ * @param cc    the BMW-224 context
+ * @param dst   the destination buffer
+ */
+void sph_bmw224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). NOTE: the accompanying implementation has its
+ * re-initialization call disabled, so the context is NOT reset; call
+ * <code>sph_bmw224_init()</code> again before reusing it.
+ *
+ * @param cc    the BMW-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_bmw224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a BMW-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the BMW-256 context (pointer to a
+ *             <code>sph_bmw256_context</code>)
+ */
+void sph_bmw256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BMW-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_bmw256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BMW-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). NOTE: the accompanying
+ * implementation has its re-initialization call disabled, so the
+ * context is NOT reset; call <code>sph_bmw256_init()</code> again
+ * before reusing it.
+ *
+ * @param cc    the BMW-256 context
+ * @param dst   the destination buffer
+ */
+void sph_bmw256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). NOTE: the accompanying implementation has its
+ * re-initialization call disabled, so the context is NOT reset; call
+ * <code>sph_bmw256_init()</code> again before reusing it.
+ *
+ * @param cc    the BMW-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_bmw256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#if SPH_64
+
+/**
+ * Initialize a BMW-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the BMW-384 context (pointer to a
+ *             <code>sph_bmw384_context</code>)
+ */
+void sph_bmw384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BMW-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_bmw384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BMW-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). NOTE: the accompanying
+ * implementation has its re-initialization call disabled, so the
+ * context is NOT reset; call <code>sph_bmw384_init()</code> again
+ * before reusing it.
+ *
+ * @param cc    the BMW-384 context
+ * @param dst   the destination buffer
+ */
+void sph_bmw384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). NOTE: the accompanying implementation has its
+ * re-initialization call disabled, so the context is NOT reset; call
+ * <code>sph_bmw384_init()</code> again before reusing it.
+ *
+ * @param cc    the BMW-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_bmw384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a BMW-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the BMW-512 context (pointer to a
+ *             <code>sph_bmw512_context</code>)
+ */
+void sph_bmw512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BMW-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_bmw512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BMW-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). NOTE: the accompanying
+ * implementation has its re-initialization call disabled, so the
+ * context is NOT reset; call <code>sph_bmw512_init()</code> again
+ * before reusing it.
+ *
+ * @param cc    the BMW-512 context
+ * @param dst   the destination buffer
+ */
+void sph_bmw512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). NOTE: the accompanying implementation has its
+ * re-initialization call disabled, so the context is NOT reset; call
+ * <code>sph_bmw512_init()</code> again before reusing it.
+ *
+ * @param cc    the BMW-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_bmw512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/bmw/sse2/bmw.c
+++ b/algo/bmw/sse2/bmw.c
@@ -0,0 +1,517 @@
+/* $Id: bmw.c 227 2010-06-16 17:28:38Z tp $ */
+/*
+ * BMW implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include "../sph_bmw.h"
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+/*
+ * Initial chaining value that BMW_I copies into bmwH: sixteen 64-bit
+ * words whose bytes, read most-significant first, run consecutively
+ * from 0x80 up to 0xFF.
+ */
+static const sph_u64 bmwIV512[] = {
+	SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F),
+	SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F),
+	SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF),
+	SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF),
+	SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF),
+	SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF),
+	SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF),
+	SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF)
+};
+
+/*
+ * Preprocessor plumbing.
+ *
+ * XCAT pastes two tokens after macro-expanding them (XCAT_ performs the
+ * raw ##). No use of XCAT is visible in this part of the file.
+ *
+ * LPAR expands to an opening parenthesis. The expand1s_/expand2s_/
+ * expand1b_/expand2b_ macros write "xxx_inner LPAR args)" instead of
+ * "xxx_inner(args)"; this appears intended to delay the invocation of
+ * the inner macro until the I16_n / M16_n index lists have been expanded
+ * into individual arguments.
+ */
+#define XCAT(x, y)    XCAT_(x, y)
+#define XCAT_(x, y)   x ## y
+
+#define LPAR   (
+
+/*
+ * Index lists consumed by the expand1x/expand2x macros; the list is
+ * selected by pasting the expanded-word number n (16..31) onto the
+ * I16_ / M16_ prefix.
+ *
+ * I16_<n> holds the sixteen preceding q-word indices, n-16 .. n-1.
+ *
+ * M16_<n> holds seven values derived from j = n-16, in the parameter
+ * order (j0m, j1m, j3m, j4m, j7m, j10m, j11m) of add_elt_s/add_elt_b:
+ *   j0m  = j               j1m  = j0m + 1
+ *   j3m  = (j +  3) & 15   j4m  = j3m + 1
+ *   j7m  = (j +  7) & 15
+ *   j10m = (j + 10) & 15   j11m = j10m + 1
+ * The "+ 1" entries are used as rotation counts and are not reduced
+ * modulo 16, so they may be 16.
+ */
+#define I16_16    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
+#define I16_17    1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16
+#define I16_18    2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17
+#define I16_19    3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18
+#define I16_20    4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19
+#define I16_21    5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20
+#define I16_22    6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+#define I16_23    7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22
+#define I16_24    8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23
+#define I16_25    9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24
+#define I16_26   10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
+#define I16_27   11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26
+#define I16_28   12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
+#define I16_29   13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28
+#define I16_30   14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29
+#define I16_31   15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
+
+#define M16_16    0,  1,  3,  4,  7, 10, 11
+#define M16_17    1,  2,  4,  5,  8, 11, 12
+#define M16_18    2,  3,  5,  6,  9, 12, 13
+#define M16_19    3,  4,  6,  7, 10, 13, 14
+#define M16_20    4,  5,  7,  8, 11, 14, 15
+#define M16_21    5,  6,  8,  9, 12, 15, 16
+#define M16_22    6,  7,  9, 10, 13,  0,  1
+#define M16_23    7,  8, 10, 11, 14,  1,  2
+#define M16_24    8,  9, 11, 12, 15,  2,  3
+#define M16_25    9, 10, 12, 13,  0,  3,  4
+#define M16_26   10, 11, 13, 14,  1,  4,  5
+#define M16_27   11, 12, 14, 15,  2,  5,  6
+#define M16_28   12, 13, 15, 16,  3,  6,  7
+#define M16_29   13, 14,  0,  1,  4,  7,  8
+#define M16_30   14, 15,  1,  2,  5,  8,  9
+#define M16_31   15, 16,  2,  3,  6,  9, 10
+
+/*
+ * 32-bit ("s") building blocks.
+ *
+ * ss0..ss3: XOR of a right shift, a truncated left shift and two left
+ *           rotations of x.
+ * ss4, ss5: x XORed with itself shifted right by 1 or 2.
+ * rs1..rs7: plain left rotations of x.
+ * Note that x is evaluated several times by the ss macros, so the
+ * argument must be free of side effects.
+ */
+#define ss0(x)    (((x) >> 1) ^ SPH_T32((x) << 3) \
+                  ^ SPH_ROTL32(x,  4) ^ SPH_ROTL32(x, 19))
+#define ss1(x)    (((x) >> 1) ^ SPH_T32((x) << 2) \
+                  ^ SPH_ROTL32(x,  8) ^ SPH_ROTL32(x, 23))
+#define ss2(x)    (((x) >> 2) ^ SPH_T32((x) << 1) \
+                  ^ SPH_ROTL32(x, 12) ^ SPH_ROTL32(x, 25))
+#define ss3(x)    (((x) >> 2) ^ SPH_T32((x) << 2) \
+                  ^ SPH_ROTL32(x, 15) ^ SPH_ROTL32(x, 29))
+#define ss4(x)    (((x) >> 1) ^ (x))
+#define ss5(x)    (((x) >> 2) ^ (x))
+#define rs1(x)    SPH_ROTL32(x,  3)
+#define rs2(x)    SPH_ROTL32(x,  7)
+#define rs3(x)    SPH_ROTL32(x, 13)
+#define rs4(x)    SPH_ROTL32(x, 16)
+#define rs5(x)    SPH_ROTL32(x, 19)
+#define rs6(x)    SPH_ROTL32(x, 23)
+#define rs7(x)    SPH_ROTL32(x, 27)
+
+/* Round constant for expanded word j: j * 0x05555555, truncated to 32 bits. */
+#define Ks(j)   SPH_T32((sph_u32)(j) * SPH_C32(0x05555555))
+
+/*
+ * Message/chaining contribution to one expanded word:
+ *   (rotl(mf(j0m), j1m) + rotl(mf(j3m), j4m) - rotl(mf(j10m), j11m)
+ *    + Ks(j16)) ^ hf(j7m)
+ * mf and hf are accessor macros supplied by the caller; the seven index
+ * arguments come from an M16_<n> list.
+ */
+#define add_elt_s(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \
+	(SPH_T32(SPH_ROTL32(mf(j0m), j1m) + SPH_ROTL32(mf(j3m), j4m) \
+		- SPH_ROTL32(mf(j10m), j11m) + Ks(j16)) ^ hf(j7m))
+
+/*
+ * First expansion form: sum of ss1/ss2/ss3/ss0 (repeating in that order)
+ * applied to the sixteen words qf(i0)..qf(i15), plus add_elt_s.
+ */
+#define expand1s_inner(qf, mf, hf, i16, \
+		i0, i1, i2, i3, i4, i5, i6, i7, i8, \
+		i9, i10, i11, i12, i13, i14, i15, \
+		i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
+	SPH_T32(ss1(qf(i0)) + ss2(qf(i1)) + ss3(qf(i2)) + ss0(qf(i3)) \
+		+ ss1(qf(i4)) + ss2(qf(i5)) + ss3(qf(i6)) + ss0(qf(i7)) \
+		+ ss1(qf(i8)) + ss2(qf(i9)) + ss3(qf(i10)) + ss0(qf(i11)) \
+		+ ss1(qf(i12)) + ss2(qf(i13)) + ss3(qf(i14)) + ss0(qf(i15)) \
+		+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
+
+/*
+ * expand1s(qf, mf, hf, n): user-facing wrapper; n must be a literal
+ * 16..31 because it is pasted onto I16_ and M16_ to pick the index lists.
+ */
+#define expand1s(qf, mf, hf, i16) \
+	expand1s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
+#define expand1s_(qf, mf, hf, i16, ix, iy) \
+	expand1s_inner LPAR qf, mf, hf, i16, ix, iy)
+
+/*
+ * Second expansion form: even-positioned words are added unchanged,
+ * odd-positioned words go through rs1..rs7, and the last two words go
+ * through ss4 and ss5; add_elt_s is added as in the first form.
+ */
+#define expand2s_inner(qf, mf, hf, i16, \
+		i0, i1, i2, i3, i4, i5, i6, i7, i8, \
+		i9, i10, i11, i12, i13, i14, i15, \
+		i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
+	SPH_T32(qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \
+		+ qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) \
+		+ qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \
+		+ qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \
+		+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
+
+/* expand2s(qf, mf, hf, n): wrapper with the same literal-n rule as expand1s. */
+#define expand2s(qf, mf, hf, i16) \
+	expand2s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
+#define expand2s_(qf, mf, hf, i16, ix, iy) \
+	expand2s_inner LPAR qf, mf, hf, i16, ix, iy)
+
+/*
+ * 64-bit ("b") building blocks, the counterparts of the ss/rs/Ks/
+ * add_elt_s/expand1s/expand2s family with 64-bit rotation counts and
+ * truncation.
+ *
+ * NOTE(review): everything in this section is only defined when SPH_64
+ * is true, whereas MAKE_Qab/MAKE_Qbb/FOLDb and compress_big further down
+ * use these macros without a matching guard.
+ */
+#if SPH_64
+
+#define sb0(x)    (((x) >> 1) ^ SPH_T64((x) << 3) \
+                  ^ SPH_ROTL64(x,  4) ^ SPH_ROTL64(x, 37))
+#define sb1(x)    (((x) >> 1) ^ SPH_T64((x) << 2) \
+                  ^ SPH_ROTL64(x, 13) ^ SPH_ROTL64(x, 43))
+#define sb2(x)    (((x) >> 2) ^ SPH_T64((x) << 1) \
+                  ^ SPH_ROTL64(x, 19) ^ SPH_ROTL64(x, 53))
+#define sb3(x)    (((x) >> 2) ^ SPH_T64((x) << 2) \
+                  ^ SPH_ROTL64(x, 28) ^ SPH_ROTL64(x, 59))
+#define sb4(x)    (((x) >> 1) ^ (x))
+#define sb5(x)    (((x) >> 2) ^ (x))
+#define rb1(x)    SPH_ROTL64(x,  5)
+#define rb2(x)    SPH_ROTL64(x, 11)
+#define rb3(x)    SPH_ROTL64(x, 27)
+#define rb4(x)    SPH_ROTL64(x, 32)
+#define rb5(x)    SPH_ROTL64(x, 37)
+#define rb6(x)    SPH_ROTL64(x, 43)
+#define rb7(x)    SPH_ROTL64(x, 53)
+
+/* Round constant for expanded word j: j * 0x0555555555555555 (mod 2^64). */
+#define Kb(j)   SPH_T64((sph_u64)(j) * SPH_C64(0x0555555555555555))
+
+/*
+ * Disabled alternative: computes the indices arithmetically and reads
+ * the constants from a table instead of using the I16_/M16_ lists.
+ */
+#if 0
+
+static const sph_u64 Kb_tab[] = {
+	Kb(16), Kb(17), Kb(18), Kb(19), Kb(20), Kb(21), Kb(22), Kb(23),
+	Kb(24), Kb(25), Kb(26), Kb(27), Kb(28), Kb(29), Kb(30), Kb(31)
+};
+
+#define rol_off(mf, j, off) \
+	SPH_ROTL64(mf(((j) + (off)) & 15), (((j) + (off)) & 15) + 1)
+
+#define add_elt_b(mf, hf, j) \
+	(SPH_T64(rol_off(mf, j, 0) + rol_off(mf, j, 3) \
+		- rol_off(mf, j, 10) + Kb_tab[j]) ^ hf(((j) + 7) & 15))
+
+#define expand1b(qf, mf, hf, i) \
+	SPH_T64(sb1(qf((i) - 16)) + sb2(qf((i) - 15)) \
+		+ sb3(qf((i) - 14)) + sb0(qf((i) - 13)) \
+		+ sb1(qf((i) - 12)) + sb2(qf((i) - 11)) \
+		+ sb3(qf((i) - 10)) + sb0(qf((i) - 9)) \
+		+ sb1(qf((i) - 8)) + sb2(qf((i) - 7)) \
+		+ sb3(qf((i) - 6)) + sb0(qf((i) - 5)) \
+		+ sb1(qf((i) - 4)) + sb2(qf((i) - 3)) \
+		+ sb3(qf((i) - 2)) + sb0(qf((i) - 1)) \
+		+ add_elt_b(mf, hf, (i) - 16))
+
+#define expand2b(qf, mf, hf, i) \
+	SPH_T64(qf((i) - 16) + rb1(qf((i) - 15)) \
+		+ qf((i) - 14) + rb2(qf((i) - 13)) \
+		+ qf((i) - 12) + rb3(qf((i) - 11)) \
+		+ qf((i) - 10) + rb4(qf((i) - 9)) \
+		+ qf((i) - 8) + rb5(qf((i) - 7)) \
+		+ qf((i) - 6) + rb6(qf((i) - 5)) \
+		+ qf((i) - 4) + rb7(qf((i) - 3)) \
+		+ sb4(qf((i) - 2)) + sb5(qf((i) - 1)) \
+		+ add_elt_b(mf, hf, (i) - 16))
+
+#else
+
+/*
+ * Active variant: all indices and rotation counts are literal values
+ * taken from the I16_<n> / M16_<n> lists.
+ *
+ * add_elt_b computes
+ *   (rotl(mf(j0m), j1m) + rotl(mf(j3m), j4m) - rotl(mf(j10m), j11m)
+ *    + Kb(j16)) ^ hf(j7m)
+ */
+#define add_elt_b(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \
+	(SPH_T64(SPH_ROTL64(mf(j0m), j1m) + SPH_ROTL64(mf(j3m), j4m) \
+		- SPH_ROTL64(mf(j10m), j11m) + Kb(j16)) ^ hf(j7m))
+
+/* First expansion form: sb1/sb2/sb3/sb0 cycle over sixteen words. */
+#define expand1b_inner(qf, mf, hf, i16, \
+		i0, i1, i2, i3, i4, i5, i6, i7, i8, \
+		i9, i10, i11, i12, i13, i14, i15, \
+		i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
+	SPH_T64(sb1(qf(i0)) + sb2(qf(i1)) + sb3(qf(i2)) + sb0(qf(i3)) \
+		+ sb1(qf(i4)) + sb2(qf(i5)) + sb3(qf(i6)) + sb0(qf(i7)) \
+		+ sb1(qf(i8)) + sb2(qf(i9)) + sb3(qf(i10)) + sb0(qf(i11)) \
+		+ sb1(qf(i12)) + sb2(qf(i13)) + sb3(qf(i14)) + sb0(qf(i15)) \
+		+ add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
+
+/* i16 must be a literal 16..31: it is pasted onto I16_ and M16_. */
+#define expand1b(qf, mf, hf, i16) \
+	expand1b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
+#define expand1b_(qf, mf, hf, i16, ix, iy) \
+	expand1b_inner LPAR qf, mf, hf, i16, ix, iy)
+
+/*
+ * Second expansion form: even-positioned words unchanged, odd-positioned
+ * words rotated by rb1..rb7, last two words through sb4 and sb5.
+ */
+#define expand2b_inner(qf, mf, hf, i16, \
+		i0, i1, i2, i3, i4, i5, i6, i7, i8, \
+		i9, i10, i11, i12, i13, i14, i15, \
+		i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
+	SPH_T64(qf(i0) + rb1(qf(i1)) + qf(i2) + rb2(qf(i3)) \
+		+ qf(i4) + rb3(qf(i5)) + qf(i6) + rb4(qf(i7)) \
+		+ qf(i8) + rb5(qf(i9)) + qf(i10) + rb6(qf(i11)) \
+		+ qf(i12) + rb7(qf(i13)) + sb4(qf(i14)) + sb5(qf(i15)) \
+		+ add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
+
+#define expand2b(qf, mf, hf, i16) \
+	expand2b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
+#define expand2b_(qf, mf, hf, i16, ix, iy) \
+	expand2b_inner LPAR qf, mf, hf, i16, ix, iy)
+
+#endif
+
+#endif
+
+/*
+ * MAKE_W combines five terms (M(i) ^ H(i)) with the given +/- operators
+ * and truncates the result with tt. M and H are not defined here: they
+ * must be defined as accessor macros at the point where the W/Q macros
+ * are finally expanded.
+ */
+#define MAKE_W(tt, i0, op01, i1, op12, i2, op23, i3, op34, i4) \
+	tt((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \
+	op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4)))
+
+/* Ws0..Ws15: the sixteen 32-bit W values (fixed index/sign patterns). */
+#define Ws0    MAKE_W(SPH_T32,  5, -,  7, +, 10, +, 13, +, 14)
+#define Ws1    MAKE_W(SPH_T32,  6, -,  8, +, 11, +, 14, -, 15)
+#define Ws2    MAKE_W(SPH_T32,  0, +,  7, +,  9, -, 12, +, 15)
+#define Ws3    MAKE_W(SPH_T32,  0, -,  1, +,  8, -, 10, +, 13)
+#define Ws4    MAKE_W(SPH_T32,  1, +,  2, +,  9, -, 11, -, 14)
+#define Ws5    MAKE_W(SPH_T32,  3, -,  2, +, 10, -, 12, +, 15)
+#define Ws6    MAKE_W(SPH_T32,  4, -,  0, -,  3, -, 11, +, 13)
+#define Ws7    MAKE_W(SPH_T32,  1, -,  4, -,  5, -, 12, -, 14)
+#define Ws8    MAKE_W(SPH_T32,  2, -,  5, -,  6, +, 13, -, 15)
+#define Ws9    MAKE_W(SPH_T32,  0, -,  3, +,  6, -,  7, +, 14)
+#define Ws10   MAKE_W(SPH_T32,  8, -,  1, -,  4, -,  7, +, 15)
+#define Ws11   MAKE_W(SPH_T32,  8, -,  0, -,  2, -,  5, +,  9)
+#define Ws12   MAKE_W(SPH_T32,  1, +,  3, -,  6, -,  9, +, 10)
+#define Ws13   MAKE_W(SPH_T32,  2, +,  4, +,  7, +, 10, +, 11)
+#define Ws14   MAKE_W(SPH_T32,  3, -,  5, +,  8, -, 11, -, 12)
+#define Ws15   MAKE_W(SPH_T32, 12, -,  4, -,  6, -,  9, +, 13)
+
+/*
+ * Fill qt[0..15]: qt[i] = ss<i mod 5>(Ws<i>) + H((i + 1) mod 16).
+ * Expects a 32-bit array qt[] in scope (FOLD declares it).
+ */
+#define MAKE_Qas   do { \
+		qt[ 0] = SPH_T32(ss0(Ws0 ) + H( 1)); \
+		qt[ 1] = SPH_T32(ss1(Ws1 ) + H( 2)); \
+		qt[ 2] = SPH_T32(ss2(Ws2 ) + H( 3)); \
+		qt[ 3] = SPH_T32(ss3(Ws3 ) + H( 4)); \
+		qt[ 4] = SPH_T32(ss4(Ws4 ) + H( 5)); \
+		qt[ 5] = SPH_T32(ss0(Ws5 ) + H( 6)); \
+		qt[ 6] = SPH_T32(ss1(Ws6 ) + H( 7)); \
+		qt[ 7] = SPH_T32(ss2(Ws7 ) + H( 8)); \
+		qt[ 8] = SPH_T32(ss3(Ws8 ) + H( 9)); \
+		qt[ 9] = SPH_T32(ss4(Ws9 ) + H(10)); \
+		qt[10] = SPH_T32(ss0(Ws10) + H(11)); \
+		qt[11] = SPH_T32(ss1(Ws11) + H(12)); \
+		qt[12] = SPH_T32(ss2(Ws12) + H(13)); \
+		qt[13] = SPH_T32(ss3(Ws13) + H(14)); \
+		qt[14] = SPH_T32(ss4(Ws14) + H(15)); \
+		qt[15] = SPH_T32(ss0(Ws15) + H( 0)); \
+	} while (0)
+
+/*
+ * Fill qt[16..31] in increasing order (each entry reads the sixteen
+ * entries before it): the first two use expand1s, the rest expand2s.
+ */
+#define MAKE_Qbs   do { \
+		qt[16] = expand1s(Qs, M, H, 16); \
+		qt[17] = expand1s(Qs, M, H, 17); \
+		qt[18] = expand2s(Qs, M, H, 18); \
+		qt[19] = expand2s(Qs, M, H, 19); \
+		qt[20] = expand2s(Qs, M, H, 20); \
+		qt[21] = expand2s(Qs, M, H, 21); \
+		qt[22] = expand2s(Qs, M, H, 22); \
+		qt[23] = expand2s(Qs, M, H, 23); \
+		qt[24] = expand2s(Qs, M, H, 24); \
+		qt[25] = expand2s(Qs, M, H, 25); \
+		qt[26] = expand2s(Qs, M, H, 26); \
+		qt[27] = expand2s(Qs, M, H, 27); \
+		qt[28] = expand2s(Qs, M, H, 28); \
+		qt[29] = expand2s(Qs, M, H, 29); \
+		qt[30] = expand2s(Qs, M, H, 30); \
+		qt[31] = expand2s(Qs, M, H, 31); \
+	} while (0)
+
+/* Compute all 32 q-words of the 32-bit variant. */
+#define MAKE_Qs   do { \
+		MAKE_Qas; \
+		MAKE_Qbs; \
+	} while (0)
+
+/* Accessor for the q-word array, passed as the qf argument. */
+#define Qs(j)   (qt[j])
+
+/*
+ * Wb0..Wb15: the sixteen 64-bit W values. The index/sign patterns are
+ * the same as for Ws0..Ws15; only the truncation macro differs.
+ */
+#define Wb0    MAKE_W(SPH_T64,  5, -,  7, +, 10, +, 13, +, 14)
+#define Wb1    MAKE_W(SPH_T64,  6, -,  8, +, 11, +, 14, -, 15)
+#define Wb2    MAKE_W(SPH_T64,  0, +,  7, +,  9, -, 12, +, 15)
+#define Wb3    MAKE_W(SPH_T64,  0, -,  1, +,  8, -, 10, +, 13)
+#define Wb4    MAKE_W(SPH_T64,  1, +,  2, +,  9, -, 11, -, 14)
+#define Wb5    MAKE_W(SPH_T64,  3, -,  2, +, 10, -, 12, +, 15)
+#define Wb6    MAKE_W(SPH_T64,  4, -,  0, -,  3, -, 11, +, 13)
+#define Wb7    MAKE_W(SPH_T64,  1, -,  4, -,  5, -, 12, -, 14)
+#define Wb8    MAKE_W(SPH_T64,  2, -,  5, -,  6, +, 13, -, 15)
+#define Wb9    MAKE_W(SPH_T64,  0, -,  3, +,  6, -,  7, +, 14)
+#define Wb10   MAKE_W(SPH_T64,  8, -,  1, -,  4, -,  7, +, 15)
+#define Wb11   MAKE_W(SPH_T64,  8, -,  0, -,  2, -,  5, +,  9)
+#define Wb12   MAKE_W(SPH_T64,  1, +,  3, -,  6, -,  9, +, 10)
+#define Wb13   MAKE_W(SPH_T64,  2, +,  4, +,  7, +, 10, +, 11)
+#define Wb14   MAKE_W(SPH_T64,  3, -,  5, +,  8, -, 11, -, 12)
+#define Wb15   MAKE_W(SPH_T64, 12, -,  4, -,  6, -,  9, +, 13)
+
+/*
+ * Fill qt[0..15]: qt[i] = sb<i mod 5>(Wb<i>) + H((i + 1) mod 16).
+ * Expects a 64-bit array qt[] in scope (FOLD declares it) and the M/H
+ * accessor macros defined at the expansion site.
+ */
+#define MAKE_Qab   do { \
+		qt[ 0] = SPH_T64(sb0(Wb0 ) + H( 1)); \
+		qt[ 1] = SPH_T64(sb1(Wb1 ) + H( 2)); \
+		qt[ 2] = SPH_T64(sb2(Wb2 ) + H( 3)); \
+		qt[ 3] = SPH_T64(sb3(Wb3 ) + H( 4)); \
+		qt[ 4] = SPH_T64(sb4(Wb4 ) + H( 5)); \
+		qt[ 5] = SPH_T64(sb0(Wb5 ) + H( 6)); \
+		qt[ 6] = SPH_T64(sb1(Wb6 ) + H( 7)); \
+		qt[ 7] = SPH_T64(sb2(Wb7 ) + H( 8)); \
+		qt[ 8] = SPH_T64(sb3(Wb8 ) + H( 9)); \
+		qt[ 9] = SPH_T64(sb4(Wb9 ) + H(10)); \
+		qt[10] = SPH_T64(sb0(Wb10) + H(11)); \
+		qt[11] = SPH_T64(sb1(Wb11) + H(12)); \
+		qt[12] = SPH_T64(sb2(Wb12) + H(13)); \
+		qt[13] = SPH_T64(sb3(Wb13) + H(14)); \
+		qt[14] = SPH_T64(sb4(Wb14) + H(15)); \
+		qt[15] = SPH_T64(sb0(Wb15) + H( 0)); \
+	} while (0)
+
+/*
+ * Fill qt[16..31] in increasing order (each entry reads the sixteen
+ * entries before it): the first two use expand1b, the rest expand2b.
+ */
+#define MAKE_Qbb   do { \
+		qt[16] = expand1b(Qb, M, H, 16); \
+		qt[17] = expand1b(Qb, M, H, 17); \
+		qt[18] = expand2b(Qb, M, H, 18); \
+		qt[19] = expand2b(Qb, M, H, 19); \
+		qt[20] = expand2b(Qb, M, H, 20); \
+		qt[21] = expand2b(Qb, M, H, 21); \
+		qt[22] = expand2b(Qb, M, H, 22); \
+		qt[23] = expand2b(Qb, M, H, 23); \
+		qt[24] = expand2b(Qb, M, H, 24); \
+		qt[25] = expand2b(Qb, M, H, 25); \
+		qt[26] = expand2b(Qb, M, H, 26); \
+		qt[27] = expand2b(Qb, M, H, 27); \
+		qt[28] = expand2b(Qb, M, H, 28); \
+		qt[29] = expand2b(Qb, M, H, 29); \
+		qt[30] = expand2b(Qb, M, H, 30); \
+		qt[31] = expand2b(Qb, M, H, 31); \
+	} while (0)
+
+/* Compute all 32 q-words of the 64-bit variant. */
+#define MAKE_Qb   do { \
+		MAKE_Qab; \
+		MAKE_Qbb; \
+	} while (0)
+
+/* Accessor for the q-word array, passed as the qf argument. */
+#define Qb(j)   (qt[j])
+
+/*
+ * FOLD: one full compression step, generic over the word size.
+ *
+ *   type  word type used for the local q array and accumulators
+ *   mkQ   statement that fills qt[0..31] (MAKE_Qs or MAKE_Qb)
+ *   tt    truncation macro, rol  rotate-left macro
+ *   mf    message accessor, qf  q-word accessor, dhf  output accessor
+ *
+ * xl is the XOR of q[16..23]; xh is xl XORed with q[24..31].
+ * Outputs dhf(0)..dhf(7) are computed first; dhf(8)..dhf(15) then read
+ * the already-written dhf(4..7) and dhf(0..3) through rol, so the
+ * statement order below must be kept.
+ *
+ * mkQ additionally relies on the M and H macros being defined where
+ * FOLD is expanded.
+ */
+#define FOLD(type, mkQ, tt, rol, mf, qf, dhf)   do { \
+		type qt[32], xl, xh; \
+		mkQ; \
+		xl = qf(16) ^ qf(17) ^ qf(18) ^ qf(19) \
+			^ qf(20) ^ qf(21) ^ qf(22) ^ qf(23); \
+		xh = xl ^ qf(24) ^ qf(25) ^ qf(26) ^ qf(27) \
+			^ qf(28) ^ qf(29) ^ qf(30) ^ qf(31); \
+		dhf( 0) = tt(((xh <<  5) ^ (qf(16) >>  5) ^ mf( 0)) \
+			+ (xl ^ qf(24) ^ qf( 0))); \
+		dhf( 1) = tt(((xh >>  7) ^ (qf(17) <<  8) ^ mf( 1)) \
+			+ (xl ^ qf(25) ^ qf( 1))); \
+		dhf( 2) = tt(((xh >>  5) ^ (qf(18) <<  5) ^ mf( 2)) \
+			+ (xl ^ qf(26) ^ qf( 2))); \
+		dhf( 3) = tt(((xh >>  1) ^ (qf(19) <<  5) ^ mf( 3)) \
+			+ (xl ^ qf(27) ^ qf( 3))); \
+		dhf( 4) = tt(((xh >>  3) ^ (qf(20) <<  0) ^ mf( 4)) \
+			+ (xl ^ qf(28) ^ qf( 4))); \
+		dhf( 5) = tt(((xh <<  6) ^ (qf(21) >>  6) ^ mf( 5)) \
+			+ (xl ^ qf(29) ^ qf( 5))); \
+		dhf( 6) = tt(((xh >>  4) ^ (qf(22) <<  6) ^ mf( 6)) \
+			+ (xl ^ qf(30) ^ qf( 6))); \
+		dhf( 7) = tt(((xh >> 11) ^ (qf(23) <<  2) ^ mf( 7)) \
+			+ (xl ^ qf(31) ^ qf( 7))); \
+		dhf( 8) = tt(rol(dhf(4),  9) + (xh ^ qf(24) ^ mf( 8)) \
+			+ ((xl << 8) ^ qf(23) ^ qf( 8))); \
+		dhf( 9) = tt(rol(dhf(5), 10) + (xh ^ qf(25) ^ mf( 9)) \
+			+ ((xl >> 6) ^ qf(16) ^ qf( 9))); \
+		dhf(10) = tt(rol(dhf(6), 11) + (xh ^ qf(26) ^ mf(10)) \
+			+ ((xl << 6) ^ qf(17) ^ qf(10))); \
+		dhf(11) = tt(rol(dhf(7), 12) + (xh ^ qf(27) ^ mf(11)) \
+			+ ((xl << 4) ^ qf(18) ^ qf(11))); \
+		dhf(12) = tt(rol(dhf(0), 13) + (xh ^ qf(28) ^ mf(12)) \
+			+ ((xl >> 3) ^ qf(19) ^ qf(12))); \
+		dhf(13) = tt(rol(dhf(1), 14) + (xh ^ qf(29) ^ mf(13)) \
+			+ ((xl >> 4) ^ qf(20) ^ qf(13))); \
+		dhf(14) = tt(rol(dhf(2), 15) + (xh ^ qf(30) ^ mf(14)) \
+			+ ((xl >> 7) ^ qf(21) ^ qf(14))); \
+		dhf(15) = tt(rol(dhf(3), 16) + (xh ^ qf(31) ^ mf(15)) \
+			+ ((xl >> 2) ^ qf(22) ^ qf(15))); \
+	} while (0)
+
+/* 32-bit instantiation; requires M, H and dH at the expansion site. */
+#define FOLDs   FOLD(sph_u32, MAKE_Qs, SPH_T32, SPH_ROTL32, M, Qs, dH)
+
+/* 64-bit instantiation; requires M, H and dH at the expansion site. */
+#define FOLDb   FOLD(sph_u64, MAKE_Qb, SPH_T64, SPH_ROTL64, M, Qb, dH)
+
+/*
+ * DECL_BMW: declares the 16-word chaining state bmwH in the enclosing
+ * scope. The trailing backslash continues the macro onto the following
+ * line, which must therefore stay empty.
+ */
+#define DECL_BMW \
+    sph_u64 bmwH[16]; \
+
+/*
+ * BMW_I: load the initial constants into bmwH and reset the buffer
+ * position (hashptr) and the bit counter (hashctA). hashptr and hashctA
+ * are not declared in this file; they must exist where the macro is
+ * expanded.
+ */
+#define BMW_I \
+do { \
+    memcpy(bmwH, bmwIV512, sizeof bmwH); \
+    hashptr = 0; \
+    hashctA = 0; \
+} while (0) 
+
+/*
+ * BMW_U: stage exactly 64 bytes from `hash` as the message: adds
+ * 64 * 8 bits to hashctA, copies the bytes to the start of `hashbuf`
+ * and sets hashptr to 64. No compression happens here. `hash` and
+ * `hashbuf` must exist where the macro is expanded.
+ */
+#define BMW_U \
+do { \
+    const void *data = hash; \
+    size_t len = 64; \
+    unsigned char *buf; \
+    \
+    hashctA += (sph_u64)len << 3; \
+    buf = hashbuf; \
+    memcpy(buf, data, 64); \
+    hashptr = 64; \
+} while (0)  
+
+
+/*
+ * BMW_C: finalize the BMW-512 computation over the data staged in
+ * hashbuf and store the 64-byte digest in `hash`, i.e.
+ * hash = bmw512(loaded data).
+ *
+ * Steps:
+ *   1. Padding: write 0x80 at hashbuf[hashptr], zero the bytes up to
+ *      offset 119 and store the bit count hashctA as a little-endian
+ *      64-bit value at offset 120.
+ *   2. First FOLDb: chaining value bmwH, result in h2.
+ *   3. h2 is re-encoded (little-endian) into hashbuf as a new message
+ *      block and a second FOLDb runs with the constant chaining value
+ *      final_b, result in h1.
+ *   4. The last out_size_w64 (8) words of h1 are written little-endian
+ *      to `hash`.
+ *
+ * Names required at the expansion site: hash, hashbuf, hashptr, hashctA,
+ * bmwH, and the accessor macros M, H and dH used by FOLDb. They are
+ * presumably defined as in compress_big (M reads `data`, H reads `h`,
+ * dH writes `dh`) -- confirm at the caller.
+ *
+ * Assumes hashptr <= 119 on entry (BMW_U leaves it at 64); a larger
+ * value would make the memset length wrap around.
+ *
+ * `h` is a pointer to const because it is pointed at the constant table
+ * final_b for the second pass and the chaining value is only read.
+ */
+#define BMW_C \
+do { \
+    void *dst = hash; \
+    size_t out_size_w64 = 8; \
+    unsigned char *data; \
+    sph_u64 *dh; \
+    unsigned char *out; \
+    size_t ptr, u, v; \
+    unsigned z; \
+    sph_u64 h1[16], h2[16]; \
+    const sph_u64 *h; \
+    data = hashbuf; \
+    ptr = hashptr; \
+    /* padding byte: a single 1 bit followed by zeros (0x80) */ \
+    z = 0x80 >> 0; \
+    data[ptr ++] = ((0 & -z) | z) & 0xFF; \
+    memset(data + ptr, 0, (sizeof(char)*128) - 8 - ptr); \
+    sph_enc64le_aligned(data + (sizeof(char)*128) - 8, \
+    SPH_T64(hashctA + 0)); \
+    /* The loop body holds a single inline copy of FOLDb that is */ \
+    /* executed twice: FOLDb reads the message through data, the */ \
+    /* chaining value through h, and writes its result through dh. */ \
+        h = bmwH; \
+        dh = h2; \
+    for (;;) { \
+        FOLDb; \
+        /* dh == h1 only on the second pass: finished */ \
+        if (dh == h1) break; \
+        /* feed the first result back in as the message block */ \
+        for (u = 0; u < 16; u ++) \
+        sph_enc64le_aligned(data + 8 * u, h2[u]); \
+        dh = h1; \
+        h = final_b; \
+    } \
+    /* emit the trailing out_size_w64 words of the final state */ \
+    out = dst; \
+    for (u = 0, v = 16 - out_size_w64; u < out_size_w64; u ++, v ++) \
+    sph_enc64le(out + 8 * u, h1[v]); \
+} while (0)
+
+/*
+ * Run one 64-bit compression step (FOLDb).
+ *
+ * @param data   message block; sixteen 64-bit words are read from byte
+ *               offsets 0..127 with sph_dec64le_aligned (so presumably
+ *               the buffer must be suitably aligned -- confirm)
+ * @param h      input chaining value (16 words, read only)
+ * @param dh     output chaining value (16 words, fully overwritten)
+ *
+ * M, H and dH are defined only for the duration of this function so
+ * that FOLDb resolves to the parameters above.
+ * NOTE(review): no caller of this static function is visible in this
+ * part of the file.
+ */
+static void
+compress_big(const unsigned char *data, const sph_u64 h[16], sph_u64 dh[16])
+{
+
+#define M(x)    sph_dec64le_aligned(data + 8 * (x))
+#define H(x)    (h[x])
+#define dH(x)   (dh[x])
+
+	FOLDb;
+
+#undef M
+#undef H
+#undef dH
+}
+
+/*
+ * Constant chaining value used by BMW_C for its second (final)
+ * compression pass: word i equals 0xaaaaaaaaaaaaaaa0 + i.
+ */
+static const sph_u64 final_b[16] = {
+	SPH_C64(0xaaaaaaaaaaaaaaa0), SPH_C64(0xaaaaaaaaaaaaaaa1),
+	SPH_C64(0xaaaaaaaaaaaaaaa2), SPH_C64(0xaaaaaaaaaaaaaaa3),
+	SPH_C64(0xaaaaaaaaaaaaaaa4), SPH_C64(0xaaaaaaaaaaaaaaa5),
+	SPH_C64(0xaaaaaaaaaaaaaaa6), SPH_C64(0xaaaaaaaaaaaaaaa7),
+	SPH_C64(0xaaaaaaaaaaaaaaa8), SPH_C64(0xaaaaaaaaaaaaaaa9),
+	SPH_C64(0xaaaaaaaaaaaaaaaa), SPH_C64(0xaaaaaaaaaaaaaaab),
+	SPH_C64(0xaaaaaaaaaaaaaaac), SPH_C64(0xaaaaaaaaaaaaaaad),
+	SPH_C64(0xaaaaaaaaaaaaaaae), SPH_C64(0xaaaaaaaaaaaaaaaf)
+};
+
+
+#ifdef __cplusplus
+}
+#endif
--- a/algo/bmw/sse2/bmw.c.new
+++ b/algo/bmw/sse2/bmw.c.new
@@ -0,0 +1,525 @@
+/* $Id: bmw.c 227 2010-06-16 17:28:38Z tp $ */
+/*
+ * BMW implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include "../sph_bmw.h"
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+/*
+ * Sixteen 64-bit constants whose bytes, read most-significant first,
+ * run consecutively from 0x80 up to 0xFF. Judging by the name this is
+ * the BMW-512 initial chaining value; its use lies outside the visible
+ * part of this file -- confirm.
+ */
+static const sph_u64 bmwIV512[] = {
+	SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F),
+	SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F),
+	SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF),
+	SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF),
+	SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF),
+	SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF),
+	SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF),
+	SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF)
+};
+
+/*
+ * XCAT pastes two tokens after macro-expanding them (XCAT_ performs the
+ * raw ##). LPAR expands to an opening parenthesis; the expandNx_ helper
+ * macros write "xxx_inner LPAR args)", which appears intended to delay
+ * the inner invocation until the I16_n / M16_n lists have been expanded
+ * into individual arguments.
+ */
+#define XCAT(x, y)    XCAT_(x, y)
+#define XCAT_(x, y)   x ## y
+
+#define LPAR   (
+
+/*
+ * Index lists selected by pasting the expanded-word number n (16..31)
+ * onto the I16_ / M16_ prefix.
+ *
+ * I16_<n>: the sixteen preceding q-word indices, n-16 .. n-1.
+ * M16_<n>: with j = n-16, the values
+ *   j, j+1, (j+3)&15, ((j+3)&15)+1, (j+7)&15, (j+10)&15, ((j+10)&15)+1
+ * matching the (j0m, j1m, j3m, j4m, j7m, j10m, j11m) parameters of
+ * add_elt_s/add_elt_b. The "+1" entries are rotation counts and may
+ * be 16.
+ */
+#define I16_16    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
+#define I16_17    1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16
+#define I16_18    2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17
+#define I16_19    3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18
+#define I16_20    4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19
+#define I16_21    5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20
+#define I16_22    6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+#define I16_23    7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22
+#define I16_24    8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23
+#define I16_25    9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24
+#define I16_26   10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
+#define I16_27   11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26
+#define I16_28   12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
+#define I16_29   13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28
+#define I16_30   14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29
+#define I16_31   15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
+
+#define M16_16    0,  1,  3,  4,  7, 10, 11
+#define M16_17    1,  2,  4,  5,  8, 11, 12
+#define M16_18    2,  3,  5,  6,  9, 12, 13
+#define M16_19    3,  4,  6,  7, 10, 13, 14
+#define M16_20    4,  5,  7,  8, 11, 14, 15
+#define M16_21    5,  6,  8,  9, 12, 15, 16
+#define M16_22    6,  7,  9, 10, 13,  0,  1
+#define M16_23    7,  8, 10, 11, 14,  1,  2
+#define M16_24    8,  9, 11, 12, 15,  2,  3
+#define M16_25    9, 10, 12, 13,  0,  3,  4
+#define M16_26   10, 11, 13, 14,  1,  4,  5
+#define M16_27   11, 12, 14, 15,  2,  5,  6
+#define M16_28   12, 13, 15, 16,  3,  6,  7
+#define M16_29   13, 14,  0,  1,  4,  7,  8
+#define M16_30   14, 15,  1,  2,  5,  8,  9
+#define M16_31   15, 16,  2,  3,  6,  9, 10
+
+/*
+ * 32-bit ("s") building blocks.
+ * ss0..ss3 XOR a right shift, a truncated left shift and two left
+ * rotations of x; ss4/ss5 XOR x with itself shifted right by 1 or 2;
+ * rs1..rs7 are plain left rotations. The ss macros evaluate x more
+ * than once, so arguments must be free of side effects.
+ */
+#define ss0(x)    (((x) >> 1) ^ SPH_T32((x) << 3) \
+                  ^ SPH_ROTL32(x,  4) ^ SPH_ROTL32(x, 19))
+#define ss1(x)    (((x) >> 1) ^ SPH_T32((x) << 2) \
+                  ^ SPH_ROTL32(x,  8) ^ SPH_ROTL32(x, 23))
+#define ss2(x)    (((x) >> 2) ^ SPH_T32((x) << 1) \
+                  ^ SPH_ROTL32(x, 12) ^ SPH_ROTL32(x, 25))
+#define ss3(x)    (((x) >> 2) ^ SPH_T32((x) << 2) \
+                  ^ SPH_ROTL32(x, 15) ^ SPH_ROTL32(x, 29))
+#define ss4(x)    (((x) >> 1) ^ (x))
+#define ss5(x)    (((x) >> 2) ^ (x))
+#define rs1(x)    SPH_ROTL32(x,  3)
+#define rs2(x)    SPH_ROTL32(x,  7)
+#define rs3(x)    SPH_ROTL32(x, 13)
+#define rs4(x)    SPH_ROTL32(x, 16)
+#define rs5(x)    SPH_ROTL32(x, 19)
+#define rs6(x)    SPH_ROTL32(x, 23)
+#define rs7(x)    SPH_ROTL32(x, 27)
+
+/* Round constant for expanded word j: j * 0x05555555, truncated to 32 bits. */
+#define Ks(j)   SPH_T32((sph_u32)(j) * SPH_C32(0x05555555))
+
+/*
+ * (rotl(mf(j0m), j1m) + rotl(mf(j3m), j4m) - rotl(mf(j10m), j11m)
+ *  + Ks(j16)) ^ hf(j7m), with mf/hf supplied by the caller and the
+ * index arguments taken from an M16_<n> list.
+ */
+#define add_elt_s(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \
+	(SPH_T32(SPH_ROTL32(mf(j0m), j1m) + SPH_ROTL32(mf(j3m), j4m) \
+		- SPH_ROTL32(mf(j10m), j11m) + Ks(j16)) ^ hf(j7m))
+
+/* First expansion form: ss1/ss2/ss3/ss0 cycle over sixteen words. */
+#define expand1s_inner(qf, mf, hf, i16, \
+		i0, i1, i2, i3, i4, i5, i6, i7, i8, \
+		i9, i10, i11, i12, i13, i14, i15, \
+		i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
+	SPH_T32(ss1(qf(i0)) + ss2(qf(i1)) + ss3(qf(i2)) + ss0(qf(i3)) \
+		+ ss1(qf(i4)) + ss2(qf(i5)) + ss3(qf(i6)) + ss0(qf(i7)) \
+		+ ss1(qf(i8)) + ss2(qf(i9)) + ss3(qf(i10)) + ss0(qf(i11)) \
+		+ ss1(qf(i12)) + ss2(qf(i13)) + ss3(qf(i14)) + ss0(qf(i15)) \
+		+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
+
+/* i16 must be a literal 16..31: it is pasted onto I16_ and M16_. */
+#define expand1s(qf, mf, hf, i16) \
+	expand1s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
+#define expand1s_(qf, mf, hf, i16, ix, iy) \
+	expand1s_inner LPAR qf, mf, hf, i16, ix, iy)
+
+/*
+ * Second expansion form: even-positioned words unchanged, odd-positioned
+ * words rotated by rs1..rs7, last two words through ss4 and ss5.
+ */
+#define expand2s_inner(qf, mf, hf, i16, \
+		i0, i1, i2, i3, i4, i5, i6, i7, i8, \
+		i9, i10, i11, i12, i13, i14, i15, \
+		i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
+	SPH_T32(qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \
+		+ qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) \
+		+ qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \
+		+ qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \
+		+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
+
+#define expand2s(qf, mf, hf, i16) \
+	expand2s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
+#define expand2s_(qf, mf, hf, i16, ix, iy) \
+	expand2s_inner LPAR qf, mf, hf, i16, ix, iy)
+
+/*
+ * 64-bit ("b") building blocks, the counterparts of the 32-bit
+ * ss/rs/Ks/add_elt_s/expand1s/expand2s family. The whole section is
+ * compiled only when SPH_64 is true.
+ */
+#if SPH_64
+
+#define sb0(x)    (((x) >> 1) ^ SPH_T64((x) << 3) \
+                  ^ SPH_ROTL64(x,  4) ^ SPH_ROTL64(x, 37))
+#define sb1(x)    (((x) >> 1) ^ SPH_T64((x) << 2) \
+                  ^ SPH_ROTL64(x, 13) ^ SPH_ROTL64(x, 43))
+#define sb2(x)    (((x) >> 2) ^ SPH_T64((x) << 1) \
+                  ^ SPH_ROTL64(x, 19) ^ SPH_ROTL64(x, 53))
+#define sb3(x)    (((x) >> 2) ^ SPH_T64((x) << 2) \
+                  ^ SPH_ROTL64(x, 28) ^ SPH_ROTL64(x, 59))
+#define sb4(x)    (((x) >> 1) ^ (x))
+#define sb5(x)    (((x) >> 2) ^ (x))
+#define rb1(x)    SPH_ROTL64(x,  5)
+#define rb2(x)    SPH_ROTL64(x, 11)
+#define rb3(x)    SPH_ROTL64(x, 27)
+#define rb4(x)    SPH_ROTL64(x, 32)
+#define rb5(x)    SPH_ROTL64(x, 37)
+#define rb6(x)    SPH_ROTL64(x, 43)
+#define rb7(x)    SPH_ROTL64(x, 53)
+
+/* Round constant for expanded word j: j * 0x0555555555555555 (mod 2^64). */
+#define Kb(j)   SPH_T64((sph_u64)(j) * SPH_C64(0x0555555555555555))
+
+/*
+ * Disabled alternative: computes the indices arithmetically and reads
+ * the constants from a table instead of using the I16_/M16_ lists.
+ */
+#if 0
+
+static const sph_u64 Kb_tab[] = {
+	Kb(16), Kb(17), Kb(18), Kb(19), Kb(20), Kb(21), Kb(22), Kb(23),
+	Kb(24), Kb(25), Kb(26), Kb(27), Kb(28), Kb(29), Kb(30), Kb(31)
+};
+
+#define rol_off(mf, j, off) \
+	SPH_ROTL64(mf(((j) + (off)) & 15), (((j) + (off)) & 15) + 1)
+
+#define add_elt_b(mf, hf, j) \
+	(SPH_T64(rol_off(mf, j, 0) + rol_off(mf, j, 3) \
+		- rol_off(mf, j, 10) + Kb_tab[j]) ^ hf(((j) + 7) & 15))
+
+#define expand1b(qf, mf, hf, i) \
+	SPH_T64(sb1(qf((i) - 16)) + sb2(qf((i) - 15)) \
+		+ sb3(qf((i) - 14)) + sb0(qf((i) - 13)) \
+		+ sb1(qf((i) - 12)) + sb2(qf((i) - 11)) \
+		+ sb3(qf((i) - 10)) + sb0(qf((i) - 9)) \
+		+ sb1(qf((i) - 8)) + sb2(qf((i) - 7)) \
+		+ sb3(qf((i) - 6)) + sb0(qf((i) - 5)) \
+		+ sb1(qf((i) - 4)) + sb2(qf((i) - 3)) \
+		+ sb3(qf((i) - 2)) + sb0(qf((i) - 1)) \
+		+ add_elt_b(mf, hf, (i) - 16))
+
+#define expand2b(qf, mf, hf, i) \
+	SPH_T64(qf((i) - 16) + rb1(qf((i) - 15)) \
+		+ qf((i) - 14) + rb2(qf((i) - 13)) \
+		+ qf((i) - 12) + rb3(qf((i) - 11)) \
+		+ qf((i) - 10) + rb4(qf((i) - 9)) \
+		+ qf((i) - 8) + rb5(qf((i) - 7)) \
+		+ qf((i) - 6) + rb6(qf((i) - 5)) \
+		+ qf((i) - 4) + rb7(qf((i) - 3)) \
+		+ sb4(qf((i) - 2)) + sb5(qf((i) - 1)) \
+		+ add_elt_b(mf, hf, (i) - 16))
+
+#else
+
+/*
+ * Active variant: all indices and rotation counts are literal values
+ * taken from the I16_<n> / M16_<n> lists.
+ */
+#define add_elt_b(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \
+	(SPH_T64(SPH_ROTL64(mf(j0m), j1m) + SPH_ROTL64(mf(j3m), j4m) \
+		- SPH_ROTL64(mf(j10m), j11m) + Kb(j16)) ^ hf(j7m))
+
+/* First expansion form: sb1/sb2/sb3/sb0 cycle over sixteen words. */
+#define expand1b_inner(qf, mf, hf, i16, \
+		i0, i1, i2, i3, i4, i5, i6, i7, i8, \
+		i9, i10, i11, i12, i13, i14, i15, \
+		i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
+	SPH_T64(sb1(qf(i0)) + sb2(qf(i1)) + sb3(qf(i2)) + sb0(qf(i3)) \
+		+ sb1(qf(i4)) + sb2(qf(i5)) + sb3(qf(i6)) + sb0(qf(i7)) \
+		+ sb1(qf(i8)) + sb2(qf(i9)) + sb3(qf(i10)) + sb0(qf(i11)) \
+		+ sb1(qf(i12)) + sb2(qf(i13)) + sb3(qf(i14)) + sb0(qf(i15)) \
+		+ add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
+
+/* i16 must be a literal 16..31: it is pasted onto I16_ and M16_. */
+#define expand1b(qf, mf, hf, i16) \
+	expand1b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
+#define expand1b_(qf, mf, hf, i16, ix, iy) \
+	expand1b_inner LPAR qf, mf, hf, i16, ix, iy)
+
+/*
+ * Second expansion form: even-positioned words unchanged, odd-positioned
+ * words rotated by rb1..rb7, last two words through sb4 and sb5.
+ */
+#define expand2b_inner(qf, mf, hf, i16, \
+		i0, i1, i2, i3, i4, i5, i6, i7, i8, \
+		i9, i10, i11, i12, i13, i14, i15, \
+		i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
+	SPH_T64(qf(i0) + rb1(qf(i1)) + qf(i2) + rb2(qf(i3)) \
+		+ qf(i4) + rb3(qf(i5)) + qf(i6) + rb4(qf(i7)) \
+		+ qf(i8) + rb5(qf(i9)) + qf(i10) + rb6(qf(i11)) \
+		+ qf(i12) + rb7(qf(i13)) + sb4(qf(i14)) + sb5(qf(i15)) \
+		+ add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
+
+#define expand2b(qf, mf, hf, i16) \
+	expand2b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
+#define expand2b_(qf, mf, hf, i16, ix, iy) \
+	expand2b_inner LPAR qf, mf, hf, i16, ix, iy)
+
+#endif
+
+#endif
+
+#define MAKE_W(tt, i0, op01, i1, op12, i2, op23, i3, op34, i4) \
+	tt((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \
+	op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4)))
+
+#define Ws0    MAKE_W(SPH_T32,  5, -,  7, +, 10, +, 13, +, 14)
+#define Ws1    MAKE_W(SPH_T32,  6, -,  8, +, 11, +, 14, -, 15)
+#define Ws2    MAKE_W(SPH_T32,  0, +,  7, +,  9, -, 12, +, 15)
+#define Ws3    MAKE_W(SPH_T32,  0, -,  1, +,  8, -, 10, +, 13)
+#define Ws4    MAKE_W(SPH_T32,  1, +,  2, +,  9, -, 11, -, 14)
+#define Ws5    MAKE_W(SPH_T32,  3, -,  2, +, 10, -, 12, +, 15)
+#define Ws6    MAKE_W(SPH_T32,  4, -,  0, -,  3, -, 11, +, 13)
+#define Ws7    MAKE_W(SPH_T32,  1, -,  4, -,  5, -, 12, -, 14)
+#define Ws8    MAKE_W(SPH_T32,  2, -,  5, -,  6, +, 13, -, 15)
+#define Ws9    MAKE_W(SPH_T32,  0, -,  3, +,  6, -,  7, +, 14)
+#define Ws10   MAKE_W(SPH_T32,  8, -,  1, -,  4, -,  7, +, 15)
+#define Ws11   MAKE_W(SPH_T32,  8, -,  0, -,  2, -,  5, +,  9)
+#define Ws12   MAKE_W(SPH_T32,  1, +,  3, -,  6, -,  9, +, 10)
+#define Ws13   MAKE_W(SPH_T32,  2, +,  4, +,  7, +, 10, +, 11)
+#define Ws14   MAKE_W(SPH_T32,  3, -,  5, +,  8, -, 11, -, 12)
+#define Ws15   MAKE_W(SPH_T32, 12, -,  4, -,  6, -,  9, +, 13)
+
+/*
+ * MAKE_Qas: fills qt[0..15] for the SPH_T32 variant. Entry i is
+ * SPH_T32(ss<i mod 5>(Ws<i>) + H((i + 1) mod 16)). Expects an array qt and
+ * the macros H/M (used through Ws*) to be in scope at the expansion site.
+ */
+#define MAKE_Qas   do { \
+		qt[ 0] = SPH_T32(ss0(Ws0 ) + H( 1)); \
+		qt[ 1] = SPH_T32(ss1(Ws1 ) + H( 2)); \
+		qt[ 2] = SPH_T32(ss2(Ws2 ) + H( 3)); \
+		qt[ 3] = SPH_T32(ss3(Ws3 ) + H( 4)); \
+		qt[ 4] = SPH_T32(ss4(Ws4 ) + H( 5)); \
+		qt[ 5] = SPH_T32(ss0(Ws5 ) + H( 6)); \
+		qt[ 6] = SPH_T32(ss1(Ws6 ) + H( 7)); \
+		qt[ 7] = SPH_T32(ss2(Ws7 ) + H( 8)); \
+		qt[ 8] = SPH_T32(ss3(Ws8 ) + H( 9)); \
+		qt[ 9] = SPH_T32(ss4(Ws9 ) + H(10)); \
+		qt[10] = SPH_T32(ss0(Ws10) + H(11)); \
+		qt[11] = SPH_T32(ss1(Ws11) + H(12)); \
+		qt[12] = SPH_T32(ss2(Ws12) + H(13)); \
+		qt[13] = SPH_T32(ss3(Ws13) + H(14)); \
+		qt[14] = SPH_T32(ss4(Ws14) + H(15)); \
+		qt[15] = SPH_T32(ss0(Ws15) + H( 0)); \
+	} while (0)
+
+/*
+ * MAKE_Qbs: fills qt[16..31] for the SPH_T32 variant. Entries 16 and 17 use
+ * expand1s, entries 18..31 use expand2s; both receive the accessor macro
+ * names Qs, M and H plus the index being produced.
+ */
+#define MAKE_Qbs   do { \
+		qt[16] = expand1s(Qs, M, H, 16); \
+		qt[17] = expand1s(Qs, M, H, 17); \
+		qt[18] = expand2s(Qs, M, H, 18); \
+		qt[19] = expand2s(Qs, M, H, 19); \
+		qt[20] = expand2s(Qs, M, H, 20); \
+		qt[21] = expand2s(Qs, M, H, 21); \
+		qt[22] = expand2s(Qs, M, H, 22); \
+		qt[23] = expand2s(Qs, M, H, 23); \
+		qt[24] = expand2s(Qs, M, H, 24); \
+		qt[25] = expand2s(Qs, M, H, 25); \
+		qt[26] = expand2s(Qs, M, H, 26); \
+		qt[27] = expand2s(Qs, M, H, 27); \
+		qt[28] = expand2s(Qs, M, H, 28); \
+		qt[29] = expand2s(Qs, M, H, 29); \
+		qt[30] = expand2s(Qs, M, H, 30); \
+		qt[31] = expand2s(Qs, M, H, 31); \
+	} while (0)
+
+/* MAKE_Qs: computes all 32 qt[] entries (SPH_T32 variant), first half then second. */
+#define MAKE_Qs   do { \
+		MAKE_Qas; \
+		MAKE_Qbs; \
+	} while (0)
+
+/* Qs(j): accessor for entry j of the local qt[] array (SPH_T32 variant). */
+#define Qs(j)   (qt[j])
+
+/*
+ * Wb0..Wb15: the sixteen W terms of the SPH_T64 ("b") variant. Same index
+ * and sign table as Ws0..Ws15, wrapped in SPH_T64 instead of SPH_T32.
+ */
+#define Wb0    MAKE_W(SPH_T64,  5, -,  7, +, 10, +, 13, +, 14)
+#define Wb1    MAKE_W(SPH_T64,  6, -,  8, +, 11, +, 14, -, 15)
+#define Wb2    MAKE_W(SPH_T64,  0, +,  7, +,  9, -, 12, +, 15)
+#define Wb3    MAKE_W(SPH_T64,  0, -,  1, +,  8, -, 10, +, 13)
+#define Wb4    MAKE_W(SPH_T64,  1, +,  2, +,  9, -, 11, -, 14)
+#define Wb5    MAKE_W(SPH_T64,  3, -,  2, +, 10, -, 12, +, 15)
+#define Wb6    MAKE_W(SPH_T64,  4, -,  0, -,  3, -, 11, +, 13)
+#define Wb7    MAKE_W(SPH_T64,  1, -,  4, -,  5, -, 12, -, 14)
+#define Wb8    MAKE_W(SPH_T64,  2, -,  5, -,  6, +, 13, -, 15)
+#define Wb9    MAKE_W(SPH_T64,  0, -,  3, +,  6, -,  7, +, 14)
+#define Wb10   MAKE_W(SPH_T64,  8, -,  1, -,  4, -,  7, +, 15)
+#define Wb11   MAKE_W(SPH_T64,  8, -,  0, -,  2, -,  5, +,  9)
+#define Wb12   MAKE_W(SPH_T64,  1, +,  3, -,  6, -,  9, +, 10)
+#define Wb13   MAKE_W(SPH_T64,  2, +,  4, +,  7, +, 10, +, 11)
+#define Wb14   MAKE_W(SPH_T64,  3, -,  5, +,  8, -, 11, -, 12)
+#define Wb15   MAKE_W(SPH_T64, 12, -,  4, -,  6, -,  9, +, 13)
+
+/*
+ * MAKE_Qab: fills qt[0..15] for the SPH_T64 variant. Entry i is
+ * SPH_T64(sb<i mod 5>(Wb<i>) + H((i + 1) mod 16)). Expects an array qt and
+ * the macros H/M (used through Wb*) to be in scope at the expansion site.
+ */
+#define MAKE_Qab   do { \
+		qt[ 0] = SPH_T64(sb0(Wb0 ) + H( 1)); \
+		qt[ 1] = SPH_T64(sb1(Wb1 ) + H( 2)); \
+		qt[ 2] = SPH_T64(sb2(Wb2 ) + H( 3)); \
+		qt[ 3] = SPH_T64(sb3(Wb3 ) + H( 4)); \
+		qt[ 4] = SPH_T64(sb4(Wb4 ) + H( 5)); \
+		qt[ 5] = SPH_T64(sb0(Wb5 ) + H( 6)); \
+		qt[ 6] = SPH_T64(sb1(Wb6 ) + H( 7)); \
+		qt[ 7] = SPH_T64(sb2(Wb7 ) + H( 8)); \
+		qt[ 8] = SPH_T64(sb3(Wb8 ) + H( 9)); \
+		qt[ 9] = SPH_T64(sb4(Wb9 ) + H(10)); \
+		qt[10] = SPH_T64(sb0(Wb10) + H(11)); \
+		qt[11] = SPH_T64(sb1(Wb11) + H(12)); \
+		qt[12] = SPH_T64(sb2(Wb12) + H(13)); \
+		qt[13] = SPH_T64(sb3(Wb13) + H(14)); \
+		qt[14] = SPH_T64(sb4(Wb14) + H(15)); \
+		qt[15] = SPH_T64(sb0(Wb15) + H( 0)); \
+	} while (0)
+
+/*
+ * MAKE_Qbb: fills qt[16..31] for the SPH_T64 variant. Entries 16 and 17 use
+ * expand1b, entries 18..31 use expand2b; both receive the accessor macro
+ * names Qb, M and H plus the index being produced.
+ */
+#define MAKE_Qbb   do { \
+		qt[16] = expand1b(Qb, M, H, 16); \
+		qt[17] = expand1b(Qb, M, H, 17); \
+		qt[18] = expand2b(Qb, M, H, 18); \
+		qt[19] = expand2b(Qb, M, H, 19); \
+		qt[20] = expand2b(Qb, M, H, 20); \
+		qt[21] = expand2b(Qb, M, H, 21); \
+		qt[22] = expand2b(Qb, M, H, 22); \
+		qt[23] = expand2b(Qb, M, H, 23); \
+		qt[24] = expand2b(Qb, M, H, 24); \
+		qt[25] = expand2b(Qb, M, H, 25); \
+		qt[26] = expand2b(Qb, M, H, 26); \
+		qt[27] = expand2b(Qb, M, H, 27); \
+		qt[28] = expand2b(Qb, M, H, 28); \
+		qt[29] = expand2b(Qb, M, H, 29); \
+		qt[30] = expand2b(Qb, M, H, 30); \
+		qt[31] = expand2b(Qb, M, H, 31); \
+	} while (0)
+
+/* MAKE_Qb: computes all 32 qt[] entries (SPH_T64 variant), first half then second. */
+#define MAKE_Qb   do { \
+		MAKE_Qab; \
+		MAKE_Qbb; \
+	} while (0)
+
+/* Qb(j): accessor for entry j of the local qt[] array (SPH_T64 variant). */
+#define Qb(j)   (qt[j])
+
+/*
+ * FOLD(type, mkQ, tt, rol, mf, qf, dhf)
+ *
+ *   type  word type used for the local qt[32], xl and xh
+ *   mkQ   statement that fills qt[0..31] (MAKE_Qs or MAKE_Qb)
+ *   tt    wrapper applied to every stored result (SPH_T32 / SPH_T64)
+ *   rol   rotate-left macro taking (value, count)
+ *   mf    accessor for message word i
+ *   qf    accessor for qt[i]
+ *   dhf   lvalue accessor for output word i
+ *
+ * xl is the XOR of qf(16)..qf(23); xh additionally XORs in qf(24)..qf(31).
+ * Output words 0..7 are then derived from shifted xh/qf values, and words
+ * 8..15 additionally rotate the already-written words 4..7 and 0..3, so the
+ * assignment order below must not be changed.
+ */
+#define FOLD(type, mkQ, tt, rol, mf, qf, dhf)   do { \
+		type qt[32], xl, xh; \
+		mkQ; \
+		xl = qf(16) ^ qf(17) ^ qf(18) ^ qf(19) \
+			^ qf(20) ^ qf(21) ^ qf(22) ^ qf(23); \
+		xh = xl ^ qf(24) ^ qf(25) ^ qf(26) ^ qf(27) \
+			^ qf(28) ^ qf(29) ^ qf(30) ^ qf(31); \
+		dhf( 0) = tt(((xh <<  5) ^ (qf(16) >>  5) ^ mf( 0)) \
+			+ (xl ^ qf(24) ^ qf( 0))); \
+		dhf( 1) = tt(((xh >>  7) ^ (qf(17) <<  8) ^ mf( 1)) \
+			+ (xl ^ qf(25) ^ qf( 1))); \
+		dhf( 2) = tt(((xh >>  5) ^ (qf(18) <<  5) ^ mf( 2)) \
+			+ (xl ^ qf(26) ^ qf( 2))); \
+		dhf( 3) = tt(((xh >>  1) ^ (qf(19) <<  5) ^ mf( 3)) \
+			+ (xl ^ qf(27) ^ qf( 3))); \
+		dhf( 4) = tt(((xh >>  3) ^ (qf(20) <<  0) ^ mf( 4)) \
+			+ (xl ^ qf(28) ^ qf( 4))); \
+		dhf( 5) = tt(((xh <<  6) ^ (qf(21) >>  6) ^ mf( 5)) \
+			+ (xl ^ qf(29) ^ qf( 5))); \
+		dhf( 6) = tt(((xh >>  4) ^ (qf(22) <<  6) ^ mf( 6)) \
+			+ (xl ^ qf(30) ^ qf( 6))); \
+		dhf( 7) = tt(((xh >> 11) ^ (qf(23) <<  2) ^ mf( 7)) \
+			+ (xl ^ qf(31) ^ qf( 7))); \
+		dhf( 8) = tt(rol(dhf(4),  9) + (xh ^ qf(24) ^ mf( 8)) \
+			+ ((xl << 8) ^ qf(23) ^ qf( 8))); \
+		dhf( 9) = tt(rol(dhf(5), 10) + (xh ^ qf(25) ^ mf( 9)) \
+			+ ((xl >> 6) ^ qf(16) ^ qf( 9))); \
+		dhf(10) = tt(rol(dhf(6), 11) + (xh ^ qf(26) ^ mf(10)) \
+			+ ((xl << 6) ^ qf(17) ^ qf(10))); \
+		dhf(11) = tt(rol(dhf(7), 12) + (xh ^ qf(27) ^ mf(11)) \
+			+ ((xl << 4) ^ qf(18) ^ qf(11))); \
+		dhf(12) = tt(rol(dhf(0), 13) + (xh ^ qf(28) ^ mf(12)) \
+			+ ((xl >> 3) ^ qf(19) ^ qf(12))); \
+		dhf(13) = tt(rol(dhf(1), 14) + (xh ^ qf(29) ^ mf(13)) \
+			+ ((xl >> 4) ^ qf(20) ^ qf(13))); \
+		dhf(14) = tt(rol(dhf(2), 15) + (xh ^ qf(30) ^ mf(14)) \
+			+ ((xl >> 7) ^ qf(21) ^ qf(14))); \
+		dhf(15) = tt(rol(dhf(3), 16) + (xh ^ qf(31) ^ mf(15)) \
+			+ ((xl >> 2) ^ qf(22) ^ qf(15))); \
+	} while (0)
+
+/* FOLDs: FOLD instantiated for sph_u32 words; needs M, H and dH in scope. */
+#define FOLDs   FOLD(sph_u32, MAKE_Qs, SPH_T32, SPH_ROTL32, M, Qs, dH)
+
+/* FOLDb: FOLD instantiated for sph_u64 words; needs M, H and dH in scope. */
+#define FOLDb   FOLD(sph_u64, MAKE_Qb, SPH_T64, SPH_ROTL64, M, Qb, dH)
+
+/*
+ * DECL_BMW: declares the sixteen 64-bit chaining words (bmwH) used by the
+ * BMW_I / BMW_C macros. The final line deliberately carries no
+ * continuation backslash, so the macro cannot absorb whatever line follows.
+ */
+#define DECL_BMW \
+    sph_u64 bmwH[16];
+
+/*
+ * BMW_I: reset the BMW state. Clears the buffer fill pointer and the bit
+ * counter, then copies the bmwIV512 initial values into bmwH.
+ * Expects bmwH, hashptr and hashctA to be declared at the expansion site.
+ */
+#define BMW_I \
+do { \
+    hashctA = 0; \
+    hashptr = 0; \
+    memcpy(bmwH, bmwIV512, sizeof(bmwH)); \
+} while (0)
+
+/*
+ * BMW_U: load one 64-byte input (the caller's `hash`) into `hashbuf`.
+ * Adds 64 * 8 bits to the running bit counter hashctA and marks 64 bytes
+ * of the buffer as filled.
+ */
+#define BMW_U \
+do { \
+    hashctA += (sph_u64)64 << 3; \
+    memcpy(hashbuf, hash, 64); \
+    hashptr = 64; \
+} while (0)
+
+
+/* bmw512 hash loaded */
+/* hash = bmw512(loaded) */
+/*
+ * BMW_C: finish the hash over the bytes currently held in hashbuf and write
+ * the 64-byte result back into `hash`.
+ *
+ * Steps, as coded below: append the 0x80 padding byte, zero-fill up to byte
+ * 120 of the 128-byte block, store the bit counter hashctA in the last
+ * 8 bytes, run FOLDb once with h = bmwH producing h2, re-encode h2 into the
+ * buffer, run FOLDb a second time with h = final_b producing h1, and emit
+ * h1[8..15] little-endian.
+ *
+ * FOLDb refers to M(), H() and dH(); they are not defined by this macro and
+ * must be in scope at the expansion site (presumably mapped onto data, h and
+ * dh as the inline comment suggests -- TODO confirm). out_size_w64 and v are
+ * only referenced by the disabled generic output loop at the end.
+ */
+#define BMW_C \
+do { \
+    void *dst = hash; \
+    size_t out_size_w64 = 8; \
+    unsigned char *data; \
+    sph_u64 *dh; \
+    unsigned char *out; \
+    size_t ptr, u, v; \
+    unsigned z; \
+    sph_u64 h1[16], h2[16], *h; \
+    data = hashbuf; \
+    ptr = hashptr; \
+    z = 0x80 >> 0; \
+    data[ptr ++] = ((0 & -z) | z) & 0xFF; \
+    memset(data + ptr, 0, (sizeof(char)*128) - 8 - ptr); \
+    sph_enc64le_aligned(data + (sizeof(char)*128) - 8, \
+    SPH_T64(hashctA + 0)); \
+    /* for break loop */ \
+    /* one copy of inline FOLD */ \
+    /* FOLD uses, */ \
+    /* uint64 *h, data */ \
+    /* uint64 dh, state */ \
+        h = bmwH; \
+        dh = h2; \
+    for (;;) { \
+        FOLDb; \
+        /* dh gets changed for 2nd run */ \
+        if (dh == h1) break; \
+        for (u = 0; u < 16; u ++) \
+        sph_enc64le_aligned(data + 8 * u, h2[u]); \
+        dh = h1; \
+        h = final_b; \
+    } \
+    /* end wrapped for break loop */ \
+    out = dst; \
+    sph_enc64le(out,      h1[8]); \
+    sph_enc64le(out +  8, h1[9]); \
+    sph_enc64le(out + 16, h1[10]); \
+    sph_enc64le(out + 24, h1[11]); \
+    sph_enc64le(out + 32, h1[12]); \
+    sph_enc64le(out + 40, h1[13]); \
+    sph_enc64le(out + 48, h1[14]); \
+    sph_enc64le(out + 56, h1[15]); \
+/*    for (u = 0, v = 16 - out_size_w64; u < out_size_w64; u ++, v ++)*/ \
+/*    sph_enc64le(out + 8 * u, h1[v]);*/ \
+} while (0) 
+
+/*
+ * One application of the 64-bit FOLD step.
+ *
+ *   data  input block, read as little-endian 64-bit words via M()
+ *   h     sixteen input chaining words, read via H()
+ *   dh    sixteen output words, written via dH()
+ *
+ * The three accessor macros exist only for the duration of this function.
+ */
+static void
+compress_big(const unsigned char *data, const sph_u64 h[16], sph_u64 dh[16])
+{
+#define dH(x)   (dh[x])
+#define H(x)    (h[x])
+#define M(x)    sph_dec64le_aligned(data + 8 * (x))
+
+	FOLDb;
+
+#undef dH
+#undef H
+#undef M
+}
+
+/*
+ * Constant chaining words used as `h` for the second FOLDb pass in BMW_C:
+ * sixteen 64-bit values 0xaaaaaaaaaaaaaaa0 .. 0xaaaaaaaaaaaaaaaf.
+ */
+static const sph_u64 final_b[16] = {
+	SPH_C64(0xaaaaaaaaaaaaaaa0), SPH_C64(0xaaaaaaaaaaaaaaa1),
+	SPH_C64(0xaaaaaaaaaaaaaaa2), SPH_C64(0xaaaaaaaaaaaaaaa3),
+	SPH_C64(0xaaaaaaaaaaaaaaa4), SPH_C64(0xaaaaaaaaaaaaaaa5),
+	SPH_C64(0xaaaaaaaaaaaaaaa6), SPH_C64(0xaaaaaaaaaaaaaaa7),
+	SPH_C64(0xaaaaaaaaaaaaaaa8), SPH_C64(0xaaaaaaaaaaaaaaa9),
+	SPH_C64(0xaaaaaaaaaaaaaaaa), SPH_C64(0xaaaaaaaaaaaaaaab),
+	SPH_C64(0xaaaaaaaaaaaaaaac), SPH_C64(0xaaaaaaaaaaaaaaad),
+	SPH_C64(0xaaaaaaaaaaaaaaae), SPH_C64(0xaaaaaaaaaaaaaaaf)
+};
+
+
+#ifdef __cplusplus
+}
+#endif
--- a/algo/bmw/sse2/sph_bmw.h
+++ b/algo/bmw/sse2/sph_bmw.h
@@ -0,0 +1,61 @@
+/* $Id: sph_bmw.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * BMW interface. BMW (aka "Blue Midnight Wish") is a family of
+ * functions which differ by their output size (224, 256, 384 and
+ * 512 bits). This header, as trimmed here, only declares the state
+ * used by the 512-bit variant.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_bmw.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_BMW_H__
+#define SPH_BMW_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "sph_types.h"
+
+#define SPH_SIZE_bmw512   512
+
+/**
+ * Context for the "big" (64-bit word) BMW computation. In this header it
+ * holds only the sixteen 64-bit chaining words.
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	sph_u64 bmwH[16];
+#endif
+} sph_bmw_big_context;
+
+/** The BMW-512 context is the big context under another name. */
+typedef sph_bmw_big_context sph_bmw512_context;
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/cryptonight/.dirstamp
+++ b/algo/cryptonight/.dirstamp
--- a/algo/cryptonight/cryptolight.c
+++ b/algo/cryptonight/cryptolight.c
@@ -0,0 +1,365 @@
+// Copyright (c) 2012-2013 The Cryptonote developers
+// Distributed under the MIT/X11 software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+#include "miner.h"
+#include "algo-gate-api.h"
+
+#if defined(__arm__) || defined(_MSC_VER)
+#ifndef NOASM
+#define NOASM
+#endif
+#endif
+
+#include "crypto/oaes_lib.h"
+#include "crypto/c_keccak.h"
+#include "crypto/c_groestl.h"
+#include "crypto/c_blake256.h"
+#include "crypto/c_jh.h"
+#include "crypto/c_skein.h"
+#include "crypto/int-util.h"
+#include "crypto/hash-ops.h"
+
+#if USE_INT128
+
+#if __GNUC__ == 4 && __GNUC_MINOR__ >= 4 && __GNUC_MINOR__ < 6
+typedef unsigned int uint128_t __attribute__ ((__mode__ (TI)));
+#elif defined (_MSC_VER)
+/* only for mingw64 on windows */
+#undef  USE_INT128
+#define USE_INT128 (0)
+#else
+typedef __uint128_t uint128_t;
+#endif
+
+#endif
+
+/* Build-time selector: non-zero picks the smaller "light" parameters below. */
+#define LITE 1
+#if LITE /* cryptonight-light */
+/* Size in bytes of the long_state working buffer (1 MiB). */
+#define MEMORY (1 << 20)
+/* Step count; the main loops run ITER / 4 passes of four steps each. */
+#define ITER   (1 << 19)
+#else
+#define MEMORY (1 << 21) /* 2 MiB */
+#define ITER   (1 << 20)
+#endif
+
+#define AES_BLOCK_SIZE  16
+#define AES_KEY_SIZE    32 /*16*/
+/* Number of AES blocks held in the text buffer (8 * 16 = 128 bytes). */
+#define INIT_SIZE_BLK   8
+#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE)
+
+/*
+ * Two views of the same storage, packed to byte alignment:
+ *   hs          the hash state filled by hash_process()
+ *   k / init    a byte view of it: 64 bytes of key material followed by
+ *               INIT_SIZE_BYTE bytes used to seed the text buffer
+ */
+#pragma pack(push, 1)
+union cn_slow_hash_state {
+	union hash_state hs;
+	struct {
+		uint8_t k[64];
+		uint8_t init[INIT_SIZE_BYTE];
+	};
+};
+#pragma pack(pop)
+
+/* Final-hash adapter: blake256 over `len` bytes of input, digest written to output. */
+static void do_blake_hash(const void* input, size_t len, char* output) {
+	uint8_t *digest = (uint8_t*)output;
+
+	blake256_hash(digest, input, len);
+}
+
+/* Final-hash adapter: groestl takes the input length in bits, hence len * 8. */
+static void do_groestl_hash(const void* input, size_t len, char* output) {
+	uint8_t *digest = (uint8_t*)output;
+
+	groestl(input, len * 8, digest);
+}
+
+/*
+ * Final-hash adapter: JH with a HASH_SIZE-byte digest. Both the digest size
+ * and the input length are passed in bits. Failure is only caught by assert.
+ */
+static void do_jh_hash(const void* input, size_t len, char* output) {
+	uint8_t *digest = (uint8_t*)output;
+	int status = jh_hash(HASH_SIZE * 8, input, 8 * len, digest);
+
+	assert(likely(SUCCESS == status));
+}
+
+/*
+ * Final-hash adapter: Skein with a HASH_SIZE-byte digest. Both the digest
+ * size and the input length are passed in bits. Failure is only caught by
+ * assert.
+ */
+static void do_skein_hash(const void* input, size_t len, char* output) {
+	uint8_t *digest = (uint8_t*)output;
+	int status = skein_hash(8 * HASH_SIZE, input, 8 * len, digest);
+
+	assert(likely(SKEIN_SUCCESS == status));
+}
+
+extern int aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
+extern int aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey);
+#if !defined(_MSC_VER) && !defined(NOASM)
+extern int fast_aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
+extern int fast_aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey);
+#else
+#define fast_aesb_single_round     aesb_single_round
+#define fast_aesb_pseudo_round_mut aesb_pseudo_round_mut
+#endif
+
+#if defined(NOASM) || !defined(__x86_64__)
+/*
+ * Portable 64 x 64 -> 128 bit multiplication.
+ *
+ * Returns the low 64 bits of multiplier * multiplicand and stores the high
+ * 64 bits through product_hi.
+ */
+static uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi) {
+	/*
+	 * Schoolbook multiplication on 32-bit halves:
+	 *   x = x_hi * 2^32 + x_lo,  y = y_hi * 2^32 + y_lo
+	 *   x * y = x_hi*y_hi * 2^64 + (x_hi*y_lo + x_lo*y_hi) * 2^32 + x_lo*y_lo
+	 */
+	const uint64_t x_hi = hi_dword(multiplier);
+	const uint64_t x_lo = lo_dword(multiplier);
+	const uint64_t y_hi = hi_dword(multiplicand);
+	const uint64_t y_lo = lo_dword(multiplicand);
+
+	const uint64_t high = x_hi * y_hi;
+	const uint64_t cross_a = x_hi * y_lo;
+	const uint64_t low = x_lo * y_lo;
+
+	/* Sum of both cross terms; a wrap-around here adds 2^32 to the high word. */
+	const uint64_t cross = cross_a + x_lo * y_hi;
+	const uint64_t cross_wrapped = (cross < cross_a) ? 1 : 0;
+
+	/* Low word, with its own carry into the high word. */
+	const uint64_t result_lo = low + (cross << 32);
+	const uint64_t lo_wrapped = (result_lo < low) ? 1 : 0;
+
+	*product_hi = high + (cross >> 32) + (cross_wrapped << 32) + lo_wrapped;
+	assert(high <= *product_hi);
+
+	return result_lo;
+}
+#else
+extern uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi);
+#endif
+
+/*
+ * Final-hash dispatch table. The hash routines in this file index it with
+ * the low two bits of the first state byte (state.hs.b[0] & 3).
+ */
+static void (* const extra_hashes[4])(const void *, size_t, char *) = {
+		do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash
+};
+
+
+/*
+ * Turns the first 32 bits at `a` into a byte offset into long_state: the
+ * value is masked so that it is a multiple of 16 and stays below the buffer
+ * size selected by LITE.
+ */
+static inline size_t e2i(const uint8_t* a) {
+#if LITE
+	const uint32_t offset_mask = 0xFFFF0;
+#else
+	const uint32_t offset_mask = 0x1FFFF0;
+#endif
+	return ((uint32_t *)a)[0] & offset_mask;
+}
+
+/*
+ * One "multiply" step on 16-byte blocks viewed as two 64-bit words:
+ *   (hi, lo) = a[0] * dst[0]           128-bit product
+ *   hi += c[0], lo += c[1]
+ *   c   = dst ^ (hi, lo)
+ *   dst = (hi, lo)
+ */
+static inline void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
+	uint64_t *acc = (uint64_t*) c;
+	uint64_t *mem = (uint64_t*) dst;
+	uint64_t hi;
+	uint64_t lo;
+
+	lo = mul128(((uint64_t*) a)[0], mem[0], &hi) + acc[1];
+	hi += acc[0];
+
+	acc[0] = mem[0] ^ hi;
+	acc[1] = mem[1] ^ lo;
+	mem[0] = hi;
+	mem[1] = lo;
+}
+
+/* XORs the 16-byte block at b into the 16-byte block at a (a ^= b). */
+static inline void xor_blocks(uint8_t* a, const uint8_t* b) {
+#if USE_INT128
+	uint128_t *dst128 = (uint128_t*) a;
+
+	*dst128 ^= *((uint128_t*) b);
+#else
+	uint64_t *dst64 = (uint64_t*) a;
+	uint64_t *src64 = (uint64_t*) b;
+
+	dst64[0] ^= src64[0];
+	dst64[1] ^= src64[1];
+#endif
+}
+
+/* Writes the XOR of the 16-byte blocks at a and b to dst (dst = a ^ b). */
+static inline void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
+#if USE_INT128
+	uint128_t *out128 = (uint128_t*) dst;
+
+	*out128 = *((uint128_t*) a) ^ *((uint128_t*) b);
+#else
+	uint64_t *lhs = (uint64_t*) a;
+	uint64_t *rhs = (uint64_t*) b;
+	uint64_t *out64 = (uint64_t*) dst;
+
+	out64[0] = lhs[0] ^ rhs[0];
+	out64[1] = lhs[1] ^ rhs[1];
+#endif
+}
+
+/* Working storage for one hash computation; MEMORY bytes plus a few blocks. */
+struct cryptonight_ctx {
+	/* large working buffer addressed in 16-byte blocks via e2i() */
+	uint8_t _ALIGN(16) long_state[MEMORY];
+	/* hash state, also viewed as k[64] + init[] */
+	union cn_slow_hash_state state;
+	/* eight AES blocks pushed through the pseudo rounds */
+	uint8_t _ALIGN(16) text[INIT_SIZE_BYTE];
+	/* 16-byte working registers of the main loop */
+	uint8_t _ALIGN(16) a[AES_BLOCK_SIZE];
+	uint8_t _ALIGN(16) b[AES_BLOCK_SIZE];
+	uint8_t _ALIGN(16) c[AES_BLOCK_SIZE];
+	/* allocated and released inside each hash call */
+	oaes_ctx* aes_ctx;
+};
+
+/*
+ * Software-AES hash over a caller-provided context.
+ *
+ *   output  receives the digest written by the selected final hash
+ *   input   data to hash
+ *   len     ignored: the length is forced to 76 bytes below
+ *   ctx     working storage; aes_ctx is allocated and freed in here
+ */
+static void cryptolight_hash_ctx(void* output, const void* input, int len, struct cryptonight_ctx* ctx)
+{
+	size_t pos, blk, idx;
+
+	len = 76;
+	hash_process(&ctx->state.hs, (const uint8_t*) input, len);
+	ctx->aes_ctx = (oaes_ctx*) oaes_alloc();
+	memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
+
+	/* Phase 1: repeatedly encrypt the text blocks and lay them out in long_state. */
+	oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE);
+	for (pos = 0; likely(pos < MEMORY); pos += INIT_SIZE_BYTE) {
+		for (blk = 0; blk < INIT_SIZE_BLK; ++blk) {
+			aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * blk], ctx->aes_ctx->key->exp_data);
+		}
+		memcpy(&ctx->long_state[pos], ctx->text, INIT_SIZE_BYTE);
+	}
+
+	xor_blocks_dst(&ctx->state.k[0], &ctx->state.k[32], ctx->a);
+	xor_blocks_dst(&ctx->state.k[16], &ctx->state.k[48], ctx->b);
+
+	/*
+	 * Phase 2: each pass performs four dependent steps (AES, MUL, AES, MUL);
+	 * every step reads the block addressed by the previous result, writes it
+	 * back and derives the next address from it.
+	 */
+	for (pos = 0; likely(pos < ITER / 4); ++pos) {
+		/* step 1: AES round keyed by a, result in c */
+		idx = e2i(ctx->a);
+		aesb_single_round(&ctx->long_state[idx], ctx->c, ctx->a);
+		xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[idx]);
+		/* step 2: multiply, addressed by c */
+		mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)]);
+		/* step 3: AES round keyed by a, result in b */
+		idx = e2i(ctx->a);
+		aesb_single_round(&ctx->long_state[idx], ctx->b, ctx->a);
+		xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[idx]);
+		/* step 4: multiply, addressed by b */
+		mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)]);
+	}
+
+	/* Phase 3: fold long_state back into the text blocks under the second key. */
+	memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
+	oaes_key_import_data(ctx->aes_ctx, &ctx->state.hs.b[32], AES_KEY_SIZE);
+	for (pos = 0; likely(pos < MEMORY); pos += INIT_SIZE_BYTE) {
+		for (blk = 0; blk < INIT_SIZE_BLK; ++blk) {
+			xor_blocks(&ctx->text[blk * AES_BLOCK_SIZE], &ctx->long_state[pos + blk * AES_BLOCK_SIZE]);
+			aesb_pseudo_round_mut(&ctx->text[blk * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
+		}
+	}
+
+	/* Finish: permute the state and run the final hash selected by its first byte. */
+	memcpy(ctx->state.init, ctx->text, INIT_SIZE_BYTE);
+	hash_permutation(&ctx->state.hs);
+	extra_hashes[ctx->state.hs.b[0] & 3](&ctx->state, 200, output);
+	oaes_free((OAES_CTX **) &ctx->aes_ctx);
+}
+
+/*
+ * One-shot hash: allocates a temporary context, hashes `input` into
+ * `output` and releases the context again.
+ *
+ * The context is larger than MEMORY bytes, so the allocation can fail. In
+ * that case no hashing is done and the HASH_SIZE output bytes are set to
+ * 0xFF, so the caller never reads an uninitialised digest.
+ */
+void cryptolight_hash(void* output, const void* input, int len) {
+	struct cryptonight_ctx *ctx = malloc(sizeof *ctx);
+
+	if (ctx == NULL) {
+		memset(output, 0xFF, HASH_SIZE);
+		return;
+	}
+	cryptolight_hash_ctx(output, input, len, ctx);
+	free(ctx);
+}
+
+/*
+ * Same computation as the software path, but using the fast_aesb_* round
+ * functions (which fall back to the plain ones when assembly is disabled).
+ *
+ *   output  receives the digest written by the selected final hash
+ *   input   data to hash
+ *   len     number of input bytes passed to hash_process()
+ *   ctx     working storage; aes_ctx is allocated and freed in here
+ */
+static void cryptolight_hash_ctx_aes_ni(void* output, const void* input,
+                       int len, struct cryptonight_ctx* ctx)
+{
+	size_t pos, blk, idx;
+
+	hash_process(&ctx->state.hs, (const uint8_t*)input, len);
+	ctx->aes_ctx = (oaes_ctx*) oaes_alloc();
+	memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
+
+	/* Phase 1: repeatedly encrypt the text blocks and lay them out in long_state. */
+	oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE);
+	for (pos = 0; likely(pos < MEMORY); pos += INIT_SIZE_BYTE) {
+		for (blk = 0; blk < INIT_SIZE_BLK; ++blk) {
+			fast_aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * blk], ctx->aes_ctx->key->exp_data);
+		}
+		memcpy(&ctx->long_state[pos], ctx->text, INIT_SIZE_BYTE);
+	}
+
+	xor_blocks_dst(&ctx->state.k[0], &ctx->state.k[32], ctx->a);
+	xor_blocks_dst(&ctx->state.k[16], &ctx->state.k[48], ctx->b);
+
+	/*
+	 * Phase 2: each pass performs four dependent steps (AES, MUL, AES, MUL);
+	 * every step reads the block addressed by the previous result, writes it
+	 * back and derives the next address from it.
+	 */
+	for (pos = 0; likely(pos < ITER / 4); ++pos) {
+		/* step 1: AES round keyed by a, result in c */
+		idx = e2i(ctx->a);
+		fast_aesb_single_round(&ctx->long_state[idx], ctx->c, ctx->a);
+		xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[idx]);
+		/* step 2: multiply, addressed by c */
+		mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)]);
+		/* step 3: AES round keyed by a, result in b */
+		idx = e2i(ctx->a);
+		fast_aesb_single_round(&ctx->long_state[idx], ctx->b, ctx->a);
+		xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[idx]);
+		/* step 4: multiply, addressed by b */
+		mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)]);
+	}
+
+	/* Phase 3: fold long_state back into the text blocks under the second key. */
+	memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
+	oaes_key_import_data(ctx->aes_ctx, &ctx->state.hs.b[32], AES_KEY_SIZE);
+	for (pos = 0; likely(pos < MEMORY); pos += INIT_SIZE_BYTE) {
+		for (blk = 0; blk < INIT_SIZE_BLK; ++blk) {
+			xor_blocks(&ctx->text[blk * AES_BLOCK_SIZE], &ctx->long_state[pos + blk * AES_BLOCK_SIZE]);
+			fast_aesb_pseudo_round_mut(&ctx->text[blk * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
+		}
+	}
+
+	/* Finish: permute the state and run the final hash selected by its first byte. */
+	memcpy(ctx->state.init, ctx->text, INIT_SIZE_BYTE);
+	hash_permutation(&ctx->state.hs);
+	extra_hashes[ctx->state.hs.b[0] & 3](&ctx->state, 200, output);
+	oaes_free((OAES_CTX **) &ctx->aes_ctx);
+}
+
+/*
+ * Nonce scan loop for cryptolight.
+ *
+ * Starting at the nonce stored at byte offset 39 of work->data, hashes the
+ * first 76 bytes for successive nonces until hash[7] drops below
+ * work->target[7], the nonce passes max_nonce, or a restart is requested
+ * for this thread.
+ *
+ *   thr_id       index into work_restart[]
+ *   work         provides data (modified in place: nonce) and target
+ *   max_nonce    last nonce to try
+ *   hashes_done  receives the number of nonces tried
+ *
+ * Returns non-zero when a hash below the target was found (the nonce is
+ * left in work->data), 0 otherwise. Also returns 0 with *hashes_done == 0
+ * when the working context cannot be allocated.
+ */
+int scanhash_cryptolight(int thr_id, struct work *work,
+		uint32_t max_nonce, uint64_t *hashes_done)
+{
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	/* NOTE(review): the nonce sits at an odd byte offset, so this is an
+	 * unaligned 32-bit access; kept as in the original. */
+	uint32_t *nonceptr = (uint32_t*) (((char*)pdata) + 39);
+	uint32_t n = *nonceptr - 1;
+	const uint32_t first_nonce = n + 1;
+	uint32_t _ALIGN(32) hash[HASH_SIZE / 4];
+
+	struct cryptonight_ctx *ctx = malloc(sizeof *ctx);
+	if (ctx == NULL) {
+		/* Nothing was hashed; report that instead of dereferencing NULL. */
+		*hashes_done = 0;
+		return 0;
+	}
+
+	do {
+		*nonceptr = ++n;
+#ifndef NO_AES_NI
+		cryptolight_hash_ctx_aes_ni(hash, pdata, 76, ctx);
+#else
+		cryptolight_hash_ctx(hash, pdata, 76, ctx);
+#endif
+		if (unlikely(hash[7] < ptarget[7])) {
+			*hashes_done = n - first_nonce + 1;
+			free(ctx);
+			return 1;
+		}
+	} while (likely((n <= max_nonce && !work_restart[thr_id].restart)));
+
+	free(ctx);
+	*hashes_done = n - first_nonce + 1;
+	return 0;
+}
+
+/*
+ * Installs the cryptolight callbacks into the algorithm gate: JSON-RPC2
+ * handling, the SSE2/AES optimisation flags, the scan and hash entry points
+ * and the max64 getter. Always returns true.
+ */
+bool register_cryptolight_algo( algo_gate_t* gate )
+{
+  register_json_rpc2( gate );
+  gate->optimizations = SSE2_OPT | AES_OPT;
+  gate->scanhash  = (void*)&scanhash_cryptolight;
+  gate->hash      = (void*)&cryptolight_hash;
+  gate->hash_suw  = (void*)&cryptolight_hash; 
+  gate->get_max64 = (void*)&get_max64_0x40LL;
+  return true;
+}
+
+
--- a/algo/cryptonight/cryptonight-aesni.c
+++ b/algo/cryptonight/cryptonight-aesni.c
@@ -0,0 +1,244 @@
+#include <x86intrin.h>
+#include <memory.h>
+#include "cryptonight.h"
+#include "miner.h"
+#include "crypto/c_keccak.h"
+
+void aesni_parallel_noxor(uint8_t *long_state, uint8_t *text, uint8_t *ExpandedKey);
+void aesni_parallel_xor(uint8_t *text, uint8_t *ExpandedKey, uint8_t *long_state);
+void that_fucking_loop(uint8_t a[16], uint8_t b[16], uint8_t *long_state);
+
+/*
+ * Key-schedule helper. Broadcasts the top 32-bit lane of *tmp2 to all lanes,
+ * XORs *tmp1 with itself shifted left by 4, 8 and 12 bytes, and finally XORs
+ * the broadcast value in. Both *tmp1 and *tmp2 are updated.
+ */
+static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
+{
+	__m128i shifted;
+	int step;
+
+	*tmp2 = _mm_shuffle_epi32(*tmp2, 0xFF);
+
+	shifted = *tmp1;
+	for (step = 0; step < 3; ++step) {
+		shifted = _mm_slli_si128(shifted, 0x04);
+		*tmp1 = _mm_xor_si128(*tmp1, shifted);
+	}
+
+	*tmp1 = _mm_xor_si128(*tmp1, *tmp2);
+}
+
+/*
+ * Key-schedule helper for the second half of each key pair. Takes the
+ * key-generation assist of *tmp1 (round constant 0), broadcasts its lane 2,
+ * XORs *tmp3 with itself shifted left by 4, 8 and 12 bytes, and finally XORs
+ * the broadcast value in. Only *tmp3 is modified. Compiles to an empty
+ * function when NO_AES_NI is defined.
+ */
+static inline void ExpandAESKey256_sub2(__m128i *tmp1, __m128i *tmp3)
+{
+#ifndef NO_AES_NI
+	__m128i assist, shifted;
+	int step;
+
+	assist = _mm_aeskeygenassist_si128(*tmp1, 0x00);
+	assist = _mm_shuffle_epi32(assist, 0xAA);
+
+	shifted = *tmp3;
+	for (step = 0; step < 3; ++step) {
+		shifted = _mm_slli_si128(shifted, 0x04);
+		*tmp3 = _mm_xor_si128(*tmp3, shifted);
+	}
+
+	*tmp3 = _mm_xor_si128(*tmp3, assist);
+#endif
+}
+
+// Special thanks to Intel for helping me
+// with ExpandAESKey256() and its subroutines
+/*
+ * Expands the 32-byte key at the start of keybuf in place into fifteen
+ * 16-byte round keys (slots 0..14, 240 bytes). keybuf is accessed with
+ * aligned 128-bit loads and stores, so it must be 16-byte aligned.
+ * Compiles to an empty function when NO_AES_NI is defined.
+ */
+static inline void ExpandAESKey256(char *keybuf)
+{
+#ifndef NO_AES_NI
+	__m128i *round_keys = (__m128i *)keybuf;
+	__m128i even, assist, odd;
+
+/* Derives round keys idx and idx + 1 using round constant rcon. */
+#define CN_EXPAND_STEP(idx, rcon) do { \
+		assist = _mm_aeskeygenassist_si128(odd, rcon); \
+		ExpandAESKey256_sub1(&even, &assist); \
+		round_keys[(idx)] = even; \
+		ExpandAESKey256_sub2(&even, &odd); \
+		round_keys[(idx) + 1] = odd; \
+	} while (0)
+
+	even = _mm_load_si128(&round_keys[0]);
+	odd = _mm_load_si128(&round_keys[1]);
+
+	CN_EXPAND_STEP(2, 0x01);
+	CN_EXPAND_STEP(4, 0x02);
+	CN_EXPAND_STEP(6, 0x04);
+	CN_EXPAND_STEP(8, 0x08);
+	CN_EXPAND_STEP(10, 0x10);
+	CN_EXPAND_STEP(12, 0x20);
+
+	/* The last step only yields the even key (slot 14). */
+	assist = _mm_aeskeygenassist_si128(odd, 0x40);
+	ExpandAESKey256_sub1(&even, &assist);
+	round_keys[14] = even;
+
+#undef CN_EXPAND_STEP
+#endif
+}
+
+/*
+ * Working storage for cryptonight_hash_aes(). The large buffer, the text
+ * blocks and the a/b/c registers are 16-byte aligned because they are
+ * accessed through __m128i pointers.
+ */
+typedef struct 
+{
+    uint8_t long_state[MEMORY] __attribute((aligned(16)));
+    union cn_slow_hash_state state;
+    uint8_t text[INIT_SIZE_BYTE] __attribute((aligned(16)));
+    uint64_t a[AES_BLOCK_SIZE >> 3] __attribute__((aligned(16)));
+    uint64_t b[AES_BLOCK_SIZE >> 3] __attribute__((aligned(16)));
+    uint8_t c[AES_BLOCK_SIZE] __attribute__((aligned(16)));
+//    oaes_ctx* aes_ctx;
+} cryptonight_ctx;
+
+/* One context per thread; cryptonight_hash_aes() works on it directly. */
+static __thread cryptonight_ctx ctx;
+
+/*
+ * CryptoNight hash using AES-NI intrinsics, working on the thread-local ctx.
+ *
+ *   output  receives the digest written by the selected final hash
+ *   input   data to hash; exactly 76 bytes are read
+ *   len     unused, the input length is fixed at 76
+ *
+ * Compiles to an empty function when NO_AES_NI is defined.
+ */
+void cryptonight_hash_aes( void *restrict output, const void *input, int len )
+{
+#ifndef NO_AES_NI
+    /*
+     * ExpandAESKey256() and the loops below access this buffer through
+     * aligned 128-bit loads and stores, so it must be 16-byte aligned.
+     */
+    uint8_t ExpandedKey[256] __attribute__((aligned(16)));
+    __m128i *longoutput, *expkey, *xmminput;
+    size_t i, j;
+
+    (void)len;
+
+    keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 );
+
+    memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE);
+    memcpy(ExpandedKey, ctx.state.hs.b, AES_KEY_SIZE);
+    ExpandAESKey256((char *)ExpandedKey);
+
+    longoutput = (__m128i *)ctx.long_state;
+    expkey = (__m128i *)ExpandedKey;
+    xmminput = (__m128i *)ctx.text;
+
+    /* Phase 1: ten AES rounds over the eight text blocks, stored to long_state. */
+    for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE)
+    {
+        for (j = 0; j < 10; j++)
+        {
+            xmminput[0] = _mm_aesenc_si128(xmminput[0], expkey[j]);
+            xmminput[1] = _mm_aesenc_si128(xmminput[1], expkey[j]);
+            xmminput[2] = _mm_aesenc_si128(xmminput[2], expkey[j]);
+            xmminput[3] = _mm_aesenc_si128(xmminput[3], expkey[j]);
+            xmminput[4] = _mm_aesenc_si128(xmminput[4], expkey[j]);
+            xmminput[5] = _mm_aesenc_si128(xmminput[5], expkey[j]);
+            xmminput[6] = _mm_aesenc_si128(xmminput[6], expkey[j]);
+            xmminput[7] = _mm_aesenc_si128(xmminput[7], expkey[j]);
+        }
+        _mm_store_si128(&(longoutput[(i >> 4)]), xmminput[0]);
+        _mm_store_si128(&(longoutput[(i >> 4) + 1]), xmminput[1]);
+        _mm_store_si128(&(longoutput[(i >> 4) + 2]), xmminput[2]);
+        _mm_store_si128(&(longoutput[(i >> 4) + 3]), xmminput[3]);
+        _mm_store_si128(&(longoutput[(i >> 4) + 4]), xmminput[4]);
+        _mm_store_si128(&(longoutput[(i >> 4) + 5]), xmminput[5]);
+        _mm_store_si128(&(longoutput[(i >> 4) + 6]), xmminput[6]);
+        _mm_store_si128(&(longoutput[(i >> 4) + 7]), xmminput[7]);
+    }
+
+    ctx.a[0] = ((uint64_t *)ctx.state.k)[0] ^ ((uint64_t *)ctx.state.k)[4];
+    ctx.b[0] = ((uint64_t *)ctx.state.k)[2] ^ ((uint64_t *)ctx.state.k)[6];
+    ctx.a[1] = ((uint64_t *)ctx.state.k)[1] ^ ((uint64_t *)ctx.state.k)[5];
+    ctx.b[1] = ((uint64_t *)ctx.state.k)[3] ^ ((uint64_t *)ctx.state.k)[7];
+
+    __m128i b_x = _mm_load_si128((__m128i *)ctx.b);
+    uint64_t a[2] __attribute__((aligned(16)));
+    a[0] = ctx.a[0];
+    a[1] = ctx.a[1];
+
+    /* Phase 2: 0x80000 passes of one AES round plus one 64x64 multiply. */
+    for(i = 0; __builtin_expect(i < 0x80000, 1); i++)
+    {
+        __m128i c_x = _mm_load_si128((__m128i *)&ctx.long_state[a[0] & 0x1FFFF0]);
+        __m128i a_x = _mm_load_si128((__m128i *)a);
+        /* target of an aligned 128-bit store below */
+        uint64_t c[2] __attribute__((aligned(16)));
+        c_x = _mm_aesenc_si128(c_x, a_x);
+
+        _mm_store_si128((__m128i *)c, c_x);
+        __builtin_prefetch(&ctx.long_state[c[0] & 0x1FFFF0], 0, 1);
+
+        b_x = _mm_xor_si128(b_x, c_x);
+        _mm_store_si128((__m128i *)&ctx.long_state[a[0] & 0x1FFFF0], b_x);
+
+        uint64_t *nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0];
+        uint64_t b[2];
+        b[0] = nextblock[0];
+        b[1] = nextblock[1];
+
+        {
+          uint64_t hi, lo;
+          // hi,lo = 64bit x 64bit multiply of c[0] and b[0]
+
+          __asm__("mulq %3\n\t"
+                  : "=d" (hi),
+                "=a" (lo)
+                  : "%a" (c[0]),
+                "rm" (b[0])
+                  : "cc" );
+
+          a[0] += hi;
+          a[1] += lo;
+        }
+        uint64_t *dst = (uint64_t*)&ctx.long_state[c[0] & 0x1FFFF0];
+        dst[0] = a[0];
+        dst[1] = a[1];
+
+        a[0] ^= b[0];
+        a[1] ^= b[1];
+        b_x = c_x;
+        __builtin_prefetch(&ctx.long_state[a[0] & 0x1FFFF0], 0, 3);
+    }
+
+    memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE);
+    memcpy(ExpandedKey, &ctx.state.hs.b[32], AES_KEY_SIZE);
+    ExpandAESKey256((char *)ExpandedKey);
+
+    /* Phase 3: XOR long_state into the text blocks, ten AES rounds per chunk. */
+    for (i = 0; __builtin_expect(i < MEMORY, 1); i += INIT_SIZE_BYTE)
+    {
+        xmminput[0] = _mm_xor_si128(longoutput[(i >> 4)], xmminput[0]);
+        xmminput[1] = _mm_xor_si128(longoutput[(i >> 4) + 1], xmminput[1]);
+        xmminput[2] = _mm_xor_si128(longoutput[(i >> 4) + 2], xmminput[2]);
+        xmminput[3] = _mm_xor_si128(longoutput[(i >> 4) + 3], xmminput[3]);
+        xmminput[4] = _mm_xor_si128(longoutput[(i >> 4) + 4], xmminput[4]);
+        xmminput[5] = _mm_xor_si128(longoutput[(i >> 4) + 5], xmminput[5]);
+        xmminput[6] = _mm_xor_si128(longoutput[(i >> 4) + 6], xmminput[6]);
+        xmminput[7] = _mm_xor_si128(longoutput[(i >> 4) + 7], xmminput[7]);
+
+        for(j = 0; j < 10; j++)
+        {
+            xmminput[0] = _mm_aesenc_si128(xmminput[0], expkey[j]);
+            xmminput[1] = _mm_aesenc_si128(xmminput[1], expkey[j]);
+            xmminput[2] = _mm_aesenc_si128(xmminput[2], expkey[j]);
+            xmminput[3] = _mm_aesenc_si128(xmminput[3], expkey[j]);
+            xmminput[4] = _mm_aesenc_si128(xmminput[4], expkey[j]);
+            xmminput[5] = _mm_aesenc_si128(xmminput[5], expkey[j]);
+            xmminput[6] = _mm_aesenc_si128(xmminput[6], expkey[j]);
+            xmminput[7] = _mm_aesenc_si128(xmminput[7], expkey[j]);
+        }
+    }
+
+    /* Finish: permute the state and run the final hash selected by its first byte. */
+    memcpy(ctx.state.init, ctx.text, INIT_SIZE_BYTE);
+    keccakf( (uint64_t*)&ctx.state.hs.w, 24 );
+
+    extra_hashes[ctx.state.hs.b[0] & 3](&ctx.state, 200, output);
+#endif
+}
--- a/algo/cryptonight/cryptonight-common.c
+++ b/algo/cryptonight/cryptonight-common.c
@@ -0,0 +1,110 @@
+// Copyright (c) 2012-2013 The Cryptonote developers
+// Distributed under the MIT/X11 software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+// Modified for CPUminer by Lucas Jones
+
+#include "cpuminer-config.h"
+//#include "miner.h"
+#include "algo-gate-api.h"
+
+#ifndef NO_AES_NI
+  #include "algo/groestl/aes_ni/hash-groestl256.h"
+#endif
+
+#include "crypto/c_groestl.h"
+#include "crypto/c_blake256.h"
+#include "crypto/c_jh.h"
+#include "crypto/c_skein.h"
+#include "cryptonight.h"
+
+/*
+#if defined __unix__ && (!defined __APPLE__)
+#include <sys/mman.h>
+#elif defined _WIN32
+#include <windows.h>
+#endif
+*/
+
+/* Final-hash wrapper: forwards to blake256_hash(), which takes the digest buffer first. */
+void do_blake_hash(const void* input, size_t len, char* output)
+{
+    uint8_t *digest = (uint8_t*)output;
+
+    blake256_hash(digest, input, len);
+}
+
+/*
+ * Final-hash wrapper for Groestl. Both back ends receive len * 8 as the
+ * length argument (presumably a bit count): the init/update/final calls from
+ * hash-groestl256.h by default, or groestl() when NO_AES_NI is defined.
+ */
+void do_groestl_hash(const void* input, size_t len, char* output)
+{
+    const size_t scaled_len = len * 8;
+
+#ifndef NO_AES_NI
+    hashState_groestl256 state;
+
+    init_groestl256( &state );
+    update_groestl256( &state, input, scaled_len );
+    final_groestl256( &state, output );
+#else
+    groestl(input, scaled_len, (uint8_t*)output);
+#endif
+}
+
+/* Final-hash wrapper: calls jh_hash() with a first argument of 32 * 8 and the input length scaled by 8. */
+void do_jh_hash(const void* input, size_t len, char* output)
+{
+    uint8_t *digest = (uint8_t*)output;
+
+    jh_hash(8 * 32, input, len * 8, digest);
+}
+
+/* Final-hash wrapper: calls skein_hash() with a first argument of 8 * 32 and the input length scaled by 8. */
+void do_skein_hash(const void* input, size_t len, char* output)
+{
+    uint8_t *digest = (uint8_t*)output;
+
+    skein_hash(32 * 8, input, len * 8, digest);
+}
+
+/* Table of the four final-hash wrappers, selected by index 0..3. */
+void (* const extra_hashes[4])( const void *, size_t, char *) = {
+    do_blake_hash,
+    do_groestl_hash,
+    do_jh_hash,
+    do_skein_hash
+};
+
+/*
+ * Hash `len` bytes of `input` into `output`. Dispatches at compile time:
+ * cryptonight_hash_aes() by default, cryptonight_hash_ctx() when NO_AES_NI
+ * is defined.
+ */
+void cryptonight_hash( void *restrict output, const void *input, int len )
+{
+#ifndef NO_AES_NI
+  cryptonight_hash_aes( output, input, len );
+#else
+  cryptonight_hash_ctx ( output, input, len );
+#endif
+}
+
+/*
+ * Same dispatch as cryptonight_hash(), but with the input length fixed at
+ * 76 bytes (the length scanhash_cryptonight() also passes).
+ */
+void cryptonight_hash_suw( void *restrict output, const void *input )
+{
+#ifndef NO_AES_NI
+  cryptonight_hash_aes( output, input, 76 );
+#else
+  cryptonight_hash_ctx ( output, input, 76 );
+#endif
+}
+
+/*
+ * Scan nonces until a hash beats the target or the range is used up.
+ *
+ *   thr_id      - index into work_restart[]; a set restart flag ends the scan
+ *   work        - work->data is hashed (76 bytes) with the 32-bit nonce
+ *                 rewritten in place at byte offset 39; work->target[7] is
+ *                 the threshold compared against hash[7]
+ *   max_nonce   - last nonce this call may try
+ *   hashes_done - receives the number of nonces hashed by this call
+ *
+ * Returns nonzero when hash[7] < target[7] (the winning nonce is left in
+ * work->data), 0 otherwise.
+ */
+int scanhash_cryptonight( int thr_id, struct work *work, uint32_t max_nonce,
+                   uint64_t *hashes_done )
+{
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+
+    /* NOTE(review): offset 39 is not 4-byte aligned; this access relies on
+     * the target tolerating unaligned 32-bit loads and stores. */
+    uint32_t *nonceptr = (uint32_t*) (((char*)pdata) + 39);
+    uint32_t n = *nonceptr - 1;
+    const uint32_t first_nonce = n + 1;
+    const uint32_t Htarg = ptarget[7];
+    uint32_t hash[32 / 4] __attribute__((aligned(32)));
+    do
+    {
+       *nonceptr = ++n;
+       cryptonight_hash( hash, pdata, 76 );
+       if (unlikely( hash[7] < Htarg ))
+       {
+           *hashes_done = n - first_nonce + 1;
+           return true;
+       }
+    /* Strict '<': n was already incremented above, so '<=' would hash
+     * max_nonce + 1 as well and could never stop on the bound when
+     * max_nonce is UINT32_MAX. */
+    } while (likely((n < max_nonce && !work_restart[thr_id].restart)));
+    
+    *hashes_done = n - first_nonce + 1;
+    return 0;
+}
+
+/*
+ * Fill in the algo gate for CryptoNight: calls register_json_rpc2(), sets
+ * the optimisation flags and installs the scanhash / hash / hash_suw /
+ * get_max64 entry points. Always returns true.
+ */
+bool register_cryptonight_algo( algo_gate_t* gate )
+{
+  register_json_rpc2( gate );
+  gate->optimizations = SSE2_OPT | AES_OPT;
+  gate->scanhash         = (void*)&scanhash_cryptonight;
+  gate->hash             = (void*)&cryptonight_hash;
+  gate->hash_suw         = (void*)&cryptonight_hash_suw;  
+  gate->get_max64        = (void*)&get_max64_0x40LL;
+  return true;
+}
+
--- a/algo/cryptonight/cryptonight.c
+++ b/algo/cryptonight/cryptonight.c
@@ -0,0 +1,242 @@
+// Copyright (c) 2012-2013 The Cryptonote developers
+// Distributed under the MIT/X11 software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+// Modified for CPUminer by Lucas Jones
+
+#include "miner.h"
+
+#if defined(__arm__) || defined(_MSC_VER)
+#ifndef NOASM
+#define NOASM
+#endif
+#endif
+
+#include "crypto/oaes_lib.h"
+#include "crypto/c_keccak.h"
+#include "crypto/c_groestl.h"
+#include "crypto/c_blake256.h"
+#include "crypto/c_jh.h"
+#include "crypto/c_skein.h"
+#include "crypto/int-util.h"
+#include "crypto/hash-ops.h"
+//#include "cryptonight.h"
+
+#if USE_INT128
+
+#if __GNUC__ == 4 && __GNUC_MINOR__ >= 4 && __GNUC_MINOR__ < 6
+typedef unsigned int uint128_t __attribute__ ((__mode__ (TI)));
+#elif defined (_MSC_VER)
+/* only for mingw64 on windows */
+#undef  USE_INT128
+#define USE_INT128 (0)
+#else
+typedef __uint128_t uint128_t;
+#endif
+
+#endif
+
+#define LITE 0
+#if LITE /* cryptonight-light */
+#define MEMORY (1 << 20)
+#define ITER   (1 << 19)
+#else
+#define MEMORY (1 << 21) /* 2 MiB */
+#define ITER   (1 << 20)
+#endif
+
+#define AES_BLOCK_SIZE  16
+#define AES_KEY_SIZE    32 /*16*/
+#define INIT_SIZE_BLK   8
+#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE)
+
+/*
+ * The hash state seen two ways: as the raw `hs` state, or as a 64-byte key
+ * area `k` followed by the INIT_SIZE_BYTE-byte `init` block. Declared with
+ * 1-byte packing.
+ */
+#pragma pack(push, 1)
+union cn_slow_hash_state {
+	union hash_state hs;
+	struct {
+		uint8_t k[64];
+		uint8_t init[INIT_SIZE_BYTE];
+	};
+};
+#pragma pack(pop)
+
+/* Final-hash wrapper: forwards to blake256_hash(), which takes the digest buffer first. */
+static void do_blake_hash(const void* input, size_t len, char* output) {
+	uint8_t *digest = (uint8_t*)output;
+
+	blake256_hash(digest, input, len);
+}
+
+/* Final-hash wrapper: calls groestl() with the input length scaled by 8. */
+static void do_groestl_hash(const void* input, size_t len, char* output) {
+	uint8_t *digest = (uint8_t*)output;
+
+	groestl(input, len * 8, digest);
+}
+
+/* Final-hash wrapper around jh_hash(); asserts that the call reports SUCCESS. */
+static void do_jh_hash(const void* input, size_t len, char* output) {
+	int status;
+
+	status = jh_hash(HASH_SIZE * 8, input, 8 * len, (uint8_t*)output);
+	assert(likely(status == SUCCESS));
+}
+
+/* Final-hash wrapper around skein_hash(); asserts that the call reports SKEIN_SUCCESS. */
+static void do_skein_hash(const void* input, size_t len, char* output) {
+	int status;
+
+	status = skein_hash(8 * HASH_SIZE, input, 8 * len, (uint8_t*)output);
+	assert(likely(status == SKEIN_SUCCESS));
+}
+
+/*
+ * AES helpers implemented outside this file. The fast_ variants are separate
+ * externals unless _MSC_VER or NOASM is defined, in which case the fast_
+ * names are plain aliases for the aesb_ functions.
+ */
+extern int aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
+extern int aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey);
+#if !defined(_MSC_VER) && !defined(NOASM)
+extern int fast_aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
+extern int fast_aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey);
+#else
+#define fast_aesb_single_round     aesb_single_round
+#define fast_aesb_pseudo_round_mut aesb_pseudo_round_mut
+#endif
+
+
+#if defined(NOASM) || !defined(__x86_64__)
+/*
+ * 64 x 64 -> 128 bit multiply built from four 32 x 32 partial products.
+ * Returns the low 64 bits of the product and stores the high 64 bits in
+ * *product_hi.
+ */
+static uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi) {
+	/* Split both operands: x = x_hi * 2^32 + x_lo. */
+	const uint64_t m_hi = hi_dword(multiplier);
+	const uint64_t m_lo = lo_dword(multiplier);
+	const uint64_t n_hi = hi_dword(multiplicand);
+	const uint64_t n_lo = lo_dword(multiplicand);
+
+	/* product = hh * 2^64 + (hl + lh) * 2^32 + ll */
+	const uint64_t hh = m_hi * n_hi;
+	const uint64_t hl = m_hi * n_lo;
+	const uint64_t lh = m_lo * n_hi;
+	const uint64_t ll = m_lo * n_lo;
+
+	/* The middle sum can overflow 64 bits; remember the lost bit. */
+	const uint64_t mid = hl + lh;
+	const uint64_t mid_carry = (mid < hl) ? 1 : 0;
+
+	/* Low half, plus the carry it pushes into the high half. */
+	const uint64_t low = ll + (mid << 32);
+	const uint64_t low_carry = (low < ll) ? 1 : 0;
+
+	*product_hi = hh + (mid >> 32) + (mid_carry << 32) + low_carry;
+	assert(hh <= *product_hi);
+
+	return low;
+}
+#else
+extern uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi);
+#endif
+
+/* Final-hash table; cryptonight_hash_ctx() indexes it with (state byte 0) & 3. */
+static void (* const extra_hashes[4])(const void *, size_t, char *) = {
+	do_blake_hash,
+	do_groestl_hash,
+	do_jh_hash,
+	do_skein_hash
+};
+
+/*
+ * Turn the first four bytes of a block into a byte offset into long_state:
+ * the native-endian 32-bit word, masked so the result is a multiple of 16
+ * below MEMORY (0x1FFFF0 for the 2 MiB build, 0xFFFF0 when LITE is set).
+ */
+static inline size_t e2i(const uint8_t* a) {
+	uint32_t word;
+
+	/* memcpy instead of *(uint32_t *)a: same bytes, but no aliasing or
+	 * alignment undefined behaviour on a uint8_t buffer. */
+	memcpy(&word, a, sizeof word);
+#if !LITE
+	return word & 0x1FFFF0;
+#else
+	return word & 0xFFFF0;
+#endif
+}
+
+/*
+ * Multiply-add step of the main loop. Multiplies the first 64-bit words of
+ * `a` and `dst`, adds the 128-bit product to `c` (high half into word 0, low
+ * half into word 1), then stores that sum in `dst` and leaves (old dst XOR
+ * sum) in `c`.
+ */
+static inline void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
+	const uint64_t *a64 = (const uint64_t*) a;
+	uint64_t *c64 = (uint64_t*) c;
+	uint64_t *d64 = (uint64_t*) dst;
+	uint64_t hi;
+	uint64_t lo = mul128(a64[0], d64[0], &hi) + c64[1];
+
+	hi += c64[0];
+
+	c64[0] = d64[0] ^ hi;
+	c64[1] = d64[1] ^ lo;
+	d64[0] = hi;
+	d64[1] = lo;
+}
+
+/* XOR the 16-byte block at `b` into the 16-byte block at `a`. */
+static inline void xor_blocks(uint8_t* a, const uint8_t* b) {
+#if !USE_INT128
+	uint64_t *acc = (uint64_t*) a;
+	const uint64_t *src = (const uint64_t*) b;
+
+	acc[0] ^= src[0];
+	acc[1] ^= src[1];
+#else
+	*((uint128_t*) a) ^= *((uint128_t*) b);
+#endif
+}
+
+/* Write the XOR of the 16-byte blocks at `a` and `b` to `dst`. */
+static inline void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
+#if !USE_INT128
+	const uint64_t *lhs = (const uint64_t*) a;
+	const uint64_t *rhs = (const uint64_t*) b;
+	uint64_t *out = (uint64_t*) dst;
+
+	out[0] = lhs[0] ^ rhs[0];
+	out[1] = lhs[1] ^ rhs[1];
+#else
+	*((uint128_t*) dst) = *((uint128_t*) a) ^ *((uint128_t*) b);
+#endif
+}
+
+/*
+ * Working storage for cryptonight_hash_ctx().
+ *   long_state - the MEMORY-byte scratchpad
+ *   state      - hash state filled by hash_process()
+ *   text       - INIT_SIZE_BYTE bytes (eight AES blocks) used to fill the
+ *                scratchpad and later to fold it back
+ *   a, b, c    - 16-byte working blocks of the main loop
+ *   aes_ctx    - oaes context, allocated and freed inside each hash call
+ */
+typedef struct {
+	uint8_t _ALIGN(16) long_state[MEMORY];
+	union cn_slow_hash_state state;
+	uint8_t _ALIGN(16) text[INIT_SIZE_BYTE];
+	uint8_t _ALIGN(16) a[AES_BLOCK_SIZE];
+	uint8_t _ALIGN(16) b[AES_BLOCK_SIZE];
+	uint8_t _ALIGN(16) c[AES_BLOCK_SIZE];
+	oaes_ctx* aes_ctx;
+} cryptonight_ctx;
+
+/* One context per thread (__thread); it embeds the MEMORY-byte scratchpad. */
+static __thread cryptonight_ctx ctx;
+
+/*
+ * Hash `len` bytes of `input` and write the result of the selected final
+ * hash to `output`. Works on the thread-local `ctx`. Stages, in order:
+ *   1. hash_process() fills ctx.state from the input.
+ *   2. long_state is filled INIT_SIZE_BYTE bytes at a time by repeatedly
+ *      running aesb_pseudo_round_mut() over the eight blocks of ctx.text,
+ *      keyed from state bytes 0..31.
+ *   3. ITER / 4 passes of the main loop, each doing two AES-round steps and
+ *      two multiply-add steps on scratchpad blocks addressed through e2i().
+ *   4. long_state is folded back into ctx.text (XOR, then pseudo-round),
+ *      keyed from state bytes 32..63.
+ *   5. hash_permutation() runs on the state and extra_hashes[state byte 0 & 3]
+ *      produces the output from the 200-byte state.
+ */
+void cryptonight_hash_ctx(void* output, const void* input, int len)
+{
+	hash_process(&ctx.state.hs, (const uint8_t*) input, len);
+	/* NOTE(review): the result of oaes_alloc() is not checked before
+	 * ctx.aes_ctx->key is dereferenced below. */
+	ctx.aes_ctx = (oaes_ctx*) oaes_alloc();
+	size_t i, j;
+	memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE);
+
+	/* Stage 2: expand ctx.text into the scratchpad. */
+	oaes_key_import_data(ctx.aes_ctx, ctx.state.hs.b, AES_KEY_SIZE);
+	for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
+		aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 0], ctx.aes_ctx->key->exp_data);
+		aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 1], ctx.aes_ctx->key->exp_data);
+		aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 2], ctx.aes_ctx->key->exp_data);
+		aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 3], ctx.aes_ctx->key->exp_data);
+		aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 4], ctx.aes_ctx->key->exp_data);
+		aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 5], ctx.aes_ctx->key->exp_data);
+		aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 6], ctx.aes_ctx->key->exp_data);
+		aesb_pseudo_round_mut(&ctx.text[AES_BLOCK_SIZE * 7], ctx.aes_ctx->key->exp_data);
+		memcpy(&ctx.long_state[i], ctx.text, INIT_SIZE_BYTE);
+	}
+
+	/* a = k[0..15] ^ k[32..47], b = k[16..31] ^ k[48..63] */
+	xor_blocks_dst(&ctx.state.k[0], &ctx.state.k[32], ctx.a);
+	xor_blocks_dst(&ctx.state.k[16], &ctx.state.k[48], ctx.b);
+
+	/* Stage 3: the loop body is unrolled four steps deep, with b and c
+	 * swapping roles between the first and second pair of steps. */
+	for (i = 0; likely(i < ITER / 4); ++i) {
+		/* Dependency chain: address -> read value ------+
+		 * written value <-+ hard function (AES or MUL) <+
+		 * next address  <-+
+		 */
+		/* Iteration 1 */
+		j = e2i(ctx.a);
+		aesb_single_round(&ctx.long_state[j], ctx.c, ctx.a);
+		xor_blocks_dst(ctx.c, ctx.b, &ctx.long_state[j]);
+		/* Iteration 2 */
+		mul_sum_xor_dst(ctx.c, ctx.a, &ctx.long_state[e2i(ctx.c)]);
+		/* Iteration 3 */
+		j = e2i(ctx.a);
+		aesb_single_round(&ctx.long_state[j], ctx.b, ctx.a);
+		xor_blocks_dst(ctx.b, ctx.c, &ctx.long_state[j]);
+		/* Iteration 4 */
+		mul_sum_xor_dst(ctx.b, ctx.a, &ctx.long_state[e2i(ctx.b)]);
+	}
+
+	/* Stage 4: fold the scratchpad back into ctx.text under the second key. */
+	memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE);
+	oaes_key_import_data(ctx.aes_ctx, &ctx.state.hs.b[32], AES_KEY_SIZE);
+	for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
+		xor_blocks(&ctx.text[0 * AES_BLOCK_SIZE], &ctx.long_state[i + 0 * AES_BLOCK_SIZE]);
+		aesb_pseudo_round_mut(&ctx.text[0 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
+		xor_blocks(&ctx.text[1 * AES_BLOCK_SIZE], &ctx.long_state[i + 1 * AES_BLOCK_SIZE]);
+		aesb_pseudo_round_mut(&ctx.text[1 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
+		xor_blocks(&ctx.text[2 * AES_BLOCK_SIZE], &ctx.long_state[i + 2 * AES_BLOCK_SIZE]);
+		aesb_pseudo_round_mut(&ctx.text[2 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
+		xor_blocks(&ctx.text[3 * AES_BLOCK_SIZE], &ctx.long_state[i + 3 * AES_BLOCK_SIZE]);
+		aesb_pseudo_round_mut(&ctx.text[3 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
+		xor_blocks(&ctx.text[4 * AES_BLOCK_SIZE], &ctx.long_state[i + 4 * AES_BLOCK_SIZE]);
+		aesb_pseudo_round_mut(&ctx.text[4 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
+		xor_blocks(&ctx.text[5 * AES_BLOCK_SIZE], &ctx.long_state[i + 5 * AES_BLOCK_SIZE]);
+		aesb_pseudo_round_mut(&ctx.text[5 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
+		xor_blocks(&ctx.text[6 * AES_BLOCK_SIZE], &ctx.long_state[i + 6 * AES_BLOCK_SIZE]);
+		aesb_pseudo_round_mut(&ctx.text[6 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
+		xor_blocks(&ctx.text[7 * AES_BLOCK_SIZE], &ctx.long_state[i + 7 * AES_BLOCK_SIZE]);
+		aesb_pseudo_round_mut(&ctx.text[7 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
+	}
+	/* Stage 5: write the text back into the state, permute, pick the final hash. */
+	memcpy(ctx.state.init, ctx.text, INIT_SIZE_BYTE);
+	hash_permutation(&ctx.state.hs);
+	/*memcpy(hash, &state, 32);*/
+	extra_hashes[ctx.state.hs.b[0] & 3](&ctx.state, 200, output);
+	oaes_free((OAES_CTX **) &ctx.aes_ctx);
+}
+
--- a/algo/cryptonight/cryptonight.h
+++ b/algo/cryptonight/cryptonight.h
@@ -0,0 +1,47 @@
+#ifndef __CRYPTONIGHT_H_INCLUDED
+#define __CRYPTONIGHT_H_INCLUDED
+
+#include <stddef.h>
+#include "crypto/oaes_lib.h"
+#include "miner.h"
+
+#define MEMORY         (1 << 21) /* 2 MiB */
+#define ITER           (1 << 20)
+#define AES_BLOCK_SIZE  16
+#define AES_KEY_SIZE    32 /*16*/
+#define INIT_SIZE_BLK   8
+#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE)	// 128
+
+
+/* 200-byte hash state, addressable as bytes (b) or as 25 64-bit words (w). */
+#pragma pack(push, 1)
+union hash_state {
+  uint8_t b[200];
+  uint64_t w[25];
+};
+#pragma pack(pop)
+
+/*
+ * The hash state seen two ways: as the raw `hs` state, or as a 64-byte key
+ * area `k` followed by the INIT_SIZE_BYTE-byte `init` block.
+ */
+#pragma pack(push, 1)
+union cn_slow_hash_state {
+    union hash_state hs;
+    struct {
+        uint8_t k[64];
+        uint8_t init[INIT_SIZE_BYTE];
+    };
+};
+#pragma pack(pop)
+
+/* Final-hash wrappers; all share the (input, length in bytes, output) signature stored in extra_hashes[]. */
+void do_blake_hash(const void* input, size_t len, char* output);
+void do_groestl_hash(const void* input, size_t len, char* output);
+void do_jh_hash(const void* input, size_t len, char* output);
+void do_skein_hash(const void* input, size_t len, char* output);
+/* Hash `len` bytes of `input` into `output`. */
+void cryptonight_hash_ctx(void* output, const void* input, int len);
+/* Permutation over the 25-word state; presumably Keccak-f run for `rounds` rounds (defined outside this header). */
+void keccakf(uint64_t st[25], int rounds);
+/* Table of the four final-hash wrappers; callers index it with a value 0..3. */
+extern void (* const extra_hashes[4])(const void *, size_t, char *);
+
+/* Nonce scanner; see the definition for parameter details. */
+int scanhash_cryptonight( int thr_id, struct work *work, uint32_t max_nonce,
+                           uint64_t *hashes_done );
+
+/* Alternative to cryptonight_hash_ctx(), used when NO_AES_NI is not defined. */
+void cryptonight_hash_aes( void *restrict output, const void *input, int len );
+
+#endif
+
--- a/algo/cubehash/.dirstamp
+++ b/algo/cubehash/.dirstamp
--- a/algo/cubehash/sph_cubehash.c
+++ b/algo/cubehash/sph_cubehash.c
@@ -0,0 +1,723 @@
+/* $Id: cubehash.c 227 2010-06-16 17:28:38Z tp $ */
+/*
+ * CubeHash implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#include "sph_cubehash.h"
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_CUBEHASH
+#define SPH_SMALL_FOOTPRINT_CUBEHASH   1
+#endif
+
+/*
+ * Some tests were conducted on an Intel Core2 Q6600 (32-bit and 64-bit
+ * mode), a PowerPC G3, and a MIPS-compatible CPU (Broadcom BCM3302).
+ * It appears that the optimal settings are:
+ *  -- full unroll, no state copy on the "big" systems (x86, PowerPC)
+ *  -- unroll to 4 or 8, state copy on the "small" system (MIPS)
+ */
+
+/*
+ * Defaults for the two tuning knobs, overridable from the build:
+ *   SPH_CUBEHASH_UNROLL - 2, 4 or 8 select a looped SIXTEEN_ROUNDS; any
+ *                         other value (0 here) selects the fully unrolled one
+ *   SPH_CUBEHASH_NOCOPY - nonzero makes x0..xv aliases for sc->state[];
+ *                         zero copies the state into local variables
+ * NOTE(review): these defaults pick NOCOPY=1 for the small-footprint build
+ * and NOCOPY=0 otherwise, which reads as the opposite of the comment above
+ * ("no state copy" on big systems) -- confirm which is intended.
+ */
+#if SPH_SMALL_FOOTPRINT_CUBEHASH
+
+#if !defined SPH_CUBEHASH_UNROLL
+#define SPH_CUBEHASH_UNROLL   4
+#endif
+#if !defined SPH_CUBEHASH_NOCOPY
+#define SPH_CUBEHASH_NOCOPY   1
+#endif
+
+#else
+
+#if !defined SPH_CUBEHASH_UNROLL
+#define SPH_CUBEHASH_UNROLL   0
+#endif
+#if !defined SPH_CUBEHASH_NOCOPY
+#define SPH_CUBEHASH_NOCOPY   0
+#endif
+
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+/* Initial 32-word state loaded by sph_cubehash224_init(). */
+static const sph_u32 IV224[] = {
+	SPH_C32(0xB0FC8217), SPH_C32(0x1BEE1A90), SPH_C32(0x829E1A22),
+	SPH_C32(0x6362C342), SPH_C32(0x24D91C30), SPH_C32(0x03A7AA24),
+	SPH_C32(0xA63721C8), SPH_C32(0x85B0E2EF), SPH_C32(0xF35D13F3),
+	SPH_C32(0x41DA807D), SPH_C32(0x21A70CA6), SPH_C32(0x1F4E9774),
+	SPH_C32(0xB3E1C932), SPH_C32(0xEB0A79A8), SPH_C32(0xCDDAAA66),
+	SPH_C32(0xE2F6ECAA), SPH_C32(0x0A713362), SPH_C32(0xAA3080E0),
+	SPH_C32(0xD8F23A32), SPH_C32(0xCEF15E28), SPH_C32(0xDB086314),
+	SPH_C32(0x7F709DF7), SPH_C32(0xACD228A4), SPH_C32(0x704D6ECE),
+	SPH_C32(0xAA3EC95F), SPH_C32(0xE387C214), SPH_C32(0x3A6445FF),
+	SPH_C32(0x9CAB81C3), SPH_C32(0xC73D4B98), SPH_C32(0xD277AEBE),
+	SPH_C32(0xFD20151C), SPH_C32(0x00CB573E)
+};
+
+/* Initial 32-word state loaded by sph_cubehash256_init(). */
+static const sph_u32 IV256[] = {
+	SPH_C32(0xEA2BD4B4), SPH_C32(0xCCD6F29F), SPH_C32(0x63117E71),
+	SPH_C32(0x35481EAE), SPH_C32(0x22512D5B), SPH_C32(0xE5D94E63),
+	SPH_C32(0x7E624131), SPH_C32(0xF4CC12BE), SPH_C32(0xC2D0B696),
+	SPH_C32(0x42AF2070), SPH_C32(0xD0720C35), SPH_C32(0x3361DA8C),
+	SPH_C32(0x28CCECA4), SPH_C32(0x8EF8AD83), SPH_C32(0x4680AC00),
+	SPH_C32(0x40E5FBAB), SPH_C32(0xD89041C3), SPH_C32(0x6107FBD5),
+	SPH_C32(0x6C859D41), SPH_C32(0xF0B26679), SPH_C32(0x09392549),
+	SPH_C32(0x5FA25603), SPH_C32(0x65C892FD), SPH_C32(0x93CB6285),
+	SPH_C32(0x2AF2B5AE), SPH_C32(0x9E4B4E60), SPH_C32(0x774ABFDD),
+	SPH_C32(0x85254725), SPH_C32(0x15815AEB), SPH_C32(0x4AB6AAD6),
+	SPH_C32(0x9CDAF8AF), SPH_C32(0xD6032C0A)
+};
+
+/* Initial 32-word state loaded by sph_cubehash384_init(). */
+static const sph_u32 IV384[] = {
+	SPH_C32(0xE623087E), SPH_C32(0x04C00C87), SPH_C32(0x5EF46453),
+	SPH_C32(0x69524B13), SPH_C32(0x1A05C7A9), SPH_C32(0x3528DF88),
+	SPH_C32(0x6BDD01B5), SPH_C32(0x5057B792), SPH_C32(0x6AA7A922),
+	SPH_C32(0x649C7EEE), SPH_C32(0xF426309F), SPH_C32(0xCB629052),
+	SPH_C32(0xFC8E20ED), SPH_C32(0xB3482BAB), SPH_C32(0xF89E5E7E),
+	SPH_C32(0xD83D4DE4), SPH_C32(0x44BFC10D), SPH_C32(0x5FC1E63D),
+	SPH_C32(0x2104E6CB), SPH_C32(0x17958F7F), SPH_C32(0xDBEAEF70),
+	SPH_C32(0xB4B97E1E), SPH_C32(0x32C195F6), SPH_C32(0x6184A8E4),
+	SPH_C32(0x796C2543), SPH_C32(0x23DE176D), SPH_C32(0xD33BBAEC),
+	SPH_C32(0x0C12E5D2), SPH_C32(0x4EB95A7B), SPH_C32(0x2D18BA01),
+	SPH_C32(0x04EE475F), SPH_C32(0x1FC5F22E)
+};
+
+/* Initial 32-word state loaded by sph_cubehash512_init(). */
+static const sph_u32 IV512[] = {
+	SPH_C32(0x2AEA2A61), SPH_C32(0x50F494D4), SPH_C32(0x2D538B8B),
+	SPH_C32(0x4167D83E), SPH_C32(0x3FEE2313), SPH_C32(0xC701CF8C),
+	SPH_C32(0xCC39968E), SPH_C32(0x50AC5695), SPH_C32(0x4D42C787),
+	SPH_C32(0xA647A8B3), SPH_C32(0x97CF0BEF), SPH_C32(0x825B4537),
+	SPH_C32(0xEEF864D2), SPH_C32(0xF22090C4), SPH_C32(0xD0E5CD33),
+	SPH_C32(0xA23911AE), SPH_C32(0xFCD398D9), SPH_C32(0x148FE485),
+	SPH_C32(0x1B017BEF), SPH_C32(0xB6444532), SPH_C32(0x6A536159),
+	SPH_C32(0x2FF5781C), SPH_C32(0x91FA7934), SPH_C32(0x0DBADEA9),
+	SPH_C32(0xD65C8A2B), SPH_C32(0xA5A70E75), SPH_C32(0xB1C62456),
+	SPH_C32(0xBC796576), SPH_C32(0x1921C8F7), SPH_C32(0xE7989AF1),
+	SPH_C32(0x7795D246), SPH_C32(0xD43E3B44)
+};
+
+#define T32      SPH_T32
+#define ROTL32   SPH_ROTL32
+
+#if SPH_CUBEHASH_NOCOPY
+
+/*
+ * No-copy variant: the 32 state words x0..xv are macros that expand to
+ * sc->state[] directly, so the expanding function must name its context
+ * pointer `sc`. The declare / read / write macros expand to nothing.
+ */
+#define DECL_STATE
+#define READ_STATE(cc)
+#define WRITE_STATE(cc)
+
+#define x0   ((sc)->state[ 0])
+#define x1   ((sc)->state[ 1])
+#define x2   ((sc)->state[ 2])
+#define x3   ((sc)->state[ 3])
+#define x4   ((sc)->state[ 4])
+#define x5   ((sc)->state[ 5])
+#define x6   ((sc)->state[ 6])
+#define x7   ((sc)->state[ 7])
+#define x8   ((sc)->state[ 8])
+#define x9   ((sc)->state[ 9])
+#define xa   ((sc)->state[10])
+#define xb   ((sc)->state[11])
+#define xc   ((sc)->state[12])
+#define xd   ((sc)->state[13])
+#define xe   ((sc)->state[14])
+#define xf   ((sc)->state[15])
+#define xg   ((sc)->state[16])
+#define xh   ((sc)->state[17])
+#define xi   ((sc)->state[18])
+#define xj   ((sc)->state[19])
+#define xk   ((sc)->state[20])
+#define xl   ((sc)->state[21])
+#define xm   ((sc)->state[22])
+#define xn   ((sc)->state[23])
+#define xo   ((sc)->state[24])
+#define xp   ((sc)->state[25])
+#define xq   ((sc)->state[26])
+#define xr   ((sc)->state[27])
+#define xs   ((sc)->state[28])
+#define xt   ((sc)->state[29])
+#define xu   ((sc)->state[30])
+#define xv   ((sc)->state[31])
+
+#else
+
+/* Copy variant: declares the 32 state words x0..xv as local sph_u32 variables. */
+#define DECL_STATE \
+	sph_u32 x0, x1, x2, x3, x4, x5, x6, x7; \
+	sph_u32 x8, x9, xa, xb, xc, xd, xe, xf; \
+	sph_u32 xg, xh, xi, xj, xk, xl, xm, xn; \
+	sph_u32 xo, xp, xq, xr, xs, xt, xu, xv;
+
+/* Copy variant: loads (cc)->state[0..31] into the locals x0..xv. */
+#define READ_STATE(cc)   do { \
+		x0 = (cc)->state[ 0]; \
+		x1 = (cc)->state[ 1]; \
+		x2 = (cc)->state[ 2]; \
+		x3 = (cc)->state[ 3]; \
+		x4 = (cc)->state[ 4]; \
+		x5 = (cc)->state[ 5]; \
+		x6 = (cc)->state[ 6]; \
+		x7 = (cc)->state[ 7]; \
+		x8 = (cc)->state[ 8]; \
+		x9 = (cc)->state[ 9]; \
+		xa = (cc)->state[10]; \
+		xb = (cc)->state[11]; \
+		xc = (cc)->state[12]; \
+		xd = (cc)->state[13]; \
+		xe = (cc)->state[14]; \
+		xf = (cc)->state[15]; \
+		xg = (cc)->state[16]; \
+		xh = (cc)->state[17]; \
+		xi = (cc)->state[18]; \
+		xj = (cc)->state[19]; \
+		xk = (cc)->state[20]; \
+		xl = (cc)->state[21]; \
+		xm = (cc)->state[22]; \
+		xn = (cc)->state[23]; \
+		xo = (cc)->state[24]; \
+		xp = (cc)->state[25]; \
+		xq = (cc)->state[26]; \
+		xr = (cc)->state[27]; \
+		xs = (cc)->state[28]; \
+		xt = (cc)->state[29]; \
+		xu = (cc)->state[30]; \
+		xv = (cc)->state[31]; \
+	} while (0)
+
+/* Copy variant: stores the locals x0..xv back into (cc)->state[0..31]. */
+#define WRITE_STATE(cc)   do { \
+		(cc)->state[ 0] = x0; \
+		(cc)->state[ 1] = x1; \
+		(cc)->state[ 2] = x2; \
+		(cc)->state[ 3] = x3; \
+		(cc)->state[ 4] = x4; \
+		(cc)->state[ 5] = x5; \
+		(cc)->state[ 6] = x6; \
+		(cc)->state[ 7] = x7; \
+		(cc)->state[ 8] = x8; \
+		(cc)->state[ 9] = x9; \
+		(cc)->state[10] = xa; \
+		(cc)->state[11] = xb; \
+		(cc)->state[12] = xc; \
+		(cc)->state[13] = xd; \
+		(cc)->state[14] = xe; \
+		(cc)->state[15] = xf; \
+		(cc)->state[16] = xg; \
+		(cc)->state[17] = xh; \
+		(cc)->state[18] = xi; \
+		(cc)->state[19] = xj; \
+		(cc)->state[20] = xk; \
+		(cc)->state[21] = xl; \
+		(cc)->state[22] = xm; \
+		(cc)->state[23] = xn; \
+		(cc)->state[24] = xo; \
+		(cc)->state[25] = xp; \
+		(cc)->state[26] = xq; \
+		(cc)->state[27] = xr; \
+		(cc)->state[28] = xs; \
+		(cc)->state[29] = xt; \
+		(cc)->state[30] = xu; \
+		(cc)->state[31] = xv; \
+	} while (0)
+
+#endif
+
+/*
+ * XORs one 32-byte block into x0..x7. Reads eight 32-bit words through
+ * sph_dec32le_aligned() from `buf`, which must be a local of the expanding
+ * function.
+ */
+#define INPUT_BLOCK   do { \
+		x0 ^= sph_dec32le_aligned(buf +  0); \
+		x1 ^= sph_dec32le_aligned(buf +  4); \
+		x2 ^= sph_dec32le_aligned(buf +  8); \
+		x3 ^= sph_dec32le_aligned(buf + 12); \
+		x4 ^= sph_dec32le_aligned(buf + 16); \
+		x5 ^= sph_dec32le_aligned(buf + 20); \
+		x6 ^= sph_dec32le_aligned(buf + 24); \
+		x7 ^= sph_dec32le_aligned(buf + 28); \
+	} while (0)
+
+/*
+ * One round, starting from the unpermuted word arrangement. Four phases:
+ *   1. add x0..xf into xg..xv (through T32) and rotate each of x0..xf left by 7
+ *   2. XOR xg..xv into x0..xf with the two 8-word halves exchanged
+ *   3. add x0..xf into xg..xv again, this time paired in swapped positions,
+ *      and rotate each of x0..xf left by 11
+ *   4. XOR xg..xv into x0..xf a second time, with groups of four exchanged
+ * No words are physically moved; the swaps show up only in which variables
+ * are paired, so ROUND_ODD must follow to restore the original arrangement.
+ */
+#define ROUND_EVEN   do { \
+		xg = T32(x0 + xg); \
+		x0 = ROTL32(x0, 7); \
+		xh = T32(x1 + xh); \
+		x1 = ROTL32(x1, 7); \
+		xi = T32(x2 + xi); \
+		x2 = ROTL32(x2, 7); \
+		xj = T32(x3 + xj); \
+		x3 = ROTL32(x3, 7); \
+		xk = T32(x4 + xk); \
+		x4 = ROTL32(x4, 7); \
+		xl = T32(x5 + xl); \
+		x5 = ROTL32(x5, 7); \
+		xm = T32(x6 + xm); \
+		x6 = ROTL32(x6, 7); \
+		xn = T32(x7 + xn); \
+		x7 = ROTL32(x7, 7); \
+		xo = T32(x8 + xo); \
+		x8 = ROTL32(x8, 7); \
+		xp = T32(x9 + xp); \
+		x9 = ROTL32(x9, 7); \
+		xq = T32(xa + xq); \
+		xa = ROTL32(xa, 7); \
+		xr = T32(xb + xr); \
+		xb = ROTL32(xb, 7); \
+		xs = T32(xc + xs); \
+		xc = ROTL32(xc, 7); \
+		xt = T32(xd + xt); \
+		xd = ROTL32(xd, 7); \
+		xu = T32(xe + xu); \
+		xe = ROTL32(xe, 7); \
+		xv = T32(xf + xv); \
+		xf = ROTL32(xf, 7); \
+		x8 ^= xg; \
+		x9 ^= xh; \
+		xa ^= xi; \
+		xb ^= xj; \
+		xc ^= xk; \
+		xd ^= xl; \
+		xe ^= xm; \
+		xf ^= xn; \
+		x0 ^= xo; \
+		x1 ^= xp; \
+		x2 ^= xq; \
+		x3 ^= xr; \
+		x4 ^= xs; \
+		x5 ^= xt; \
+		x6 ^= xu; \
+		x7 ^= xv; \
+		xi = T32(x8 + xi); \
+		x8 = ROTL32(x8, 11); \
+		xj = T32(x9 + xj); \
+		x9 = ROTL32(x9, 11); \
+		xg = T32(xa + xg); \
+		xa = ROTL32(xa, 11); \
+		xh = T32(xb + xh); \
+		xb = ROTL32(xb, 11); \
+		xm = T32(xc + xm); \
+		xc = ROTL32(xc, 11); \
+		xn = T32(xd + xn); \
+		xd = ROTL32(xd, 11); \
+		xk = T32(xe + xk); \
+		xe = ROTL32(xe, 11); \
+		xl = T32(xf + xl); \
+		xf = ROTL32(xf, 11); \
+		xq = T32(x0 + xq); \
+		x0 = ROTL32(x0, 11); \
+		xr = T32(x1 + xr); \
+		x1 = ROTL32(x1, 11); \
+		xo = T32(x2 + xo); \
+		x2 = ROTL32(x2, 11); \
+		xp = T32(x3 + xp); \
+		x3 = ROTL32(x3, 11); \
+		xu = T32(x4 + xu); \
+		x4 = ROTL32(x4, 11); \
+		xv = T32(x5 + xv); \
+		x5 = ROTL32(x5, 11); \
+		xs = T32(x6 + xs); \
+		x6 = ROTL32(x6, 11); \
+		xt = T32(x7 + xt); \
+		x7 = ROTL32(x7, 11); \
+		xc ^= xi; \
+		xd ^= xj; \
+		xe ^= xg; \
+		xf ^= xh; \
+		x8 ^= xm; \
+		x9 ^= xn; \
+		xa ^= xk; \
+		xb ^= xl; \
+		x4 ^= xq; \
+		x5 ^= xr; \
+		x6 ^= xo; \
+		x7 ^= xp; \
+		x0 ^= xu; \
+		x1 ^= xv; \
+		x2 ^= xs; \
+		x3 ^= xt; \
+	} while (0)
+
+/*
+ * One round, starting from the word arrangement ROUND_EVEN leaves behind.
+ * Same four phases as ROUND_EVEN (add + rotate 7, XOR, add + rotate 11,
+ * XOR) but with the variable pairings adjusted for that arrangement; the
+ * last XOR pairs x0..xf with xg..xv in groups of two, and per the comment
+ * below the word swaps cancel over an even number of rounds.
+ */
+#define ROUND_ODD   do { \
+		xj = T32(xc + xj); \
+		xc = ROTL32(xc, 7); \
+		xi = T32(xd + xi); \
+		xd = ROTL32(xd, 7); \
+		xh = T32(xe + xh); \
+		xe = ROTL32(xe, 7); \
+		xg = T32(xf + xg); \
+		xf = ROTL32(xf, 7); \
+		xn = T32(x8 + xn); \
+		x8 = ROTL32(x8, 7); \
+		xm = T32(x9 + xm); \
+		x9 = ROTL32(x9, 7); \
+		xl = T32(xa + xl); \
+		xa = ROTL32(xa, 7); \
+		xk = T32(xb + xk); \
+		xb = ROTL32(xb, 7); \
+		xr = T32(x4 + xr); \
+		x4 = ROTL32(x4, 7); \
+		xq = T32(x5 + xq); \
+		x5 = ROTL32(x5, 7); \
+		xp = T32(x6 + xp); \
+		x6 = ROTL32(x6, 7); \
+		xo = T32(x7 + xo); \
+		x7 = ROTL32(x7, 7); \
+		xv = T32(x0 + xv); \
+		x0 = ROTL32(x0, 7); \
+		xu = T32(x1 + xu); \
+		x1 = ROTL32(x1, 7); \
+		xt = T32(x2 + xt); \
+		x2 = ROTL32(x2, 7); \
+		xs = T32(x3 + xs); \
+		x3 = ROTL32(x3, 7); \
+		x4 ^= xj; \
+		x5 ^= xi; \
+		x6 ^= xh; \
+		x7 ^= xg; \
+		x0 ^= xn; \
+		x1 ^= xm; \
+		x2 ^= xl; \
+		x3 ^= xk; \
+		xc ^= xr; \
+		xd ^= xq; \
+		xe ^= xp; \
+		xf ^= xo; \
+		x8 ^= xv; \
+		x9 ^= xu; \
+		xa ^= xt; \
+		xb ^= xs; \
+		xh = T32(x4 + xh); \
+		x4 = ROTL32(x4, 11); \
+		xg = T32(x5 + xg); \
+		x5 = ROTL32(x5, 11); \
+		xj = T32(x6 + xj); \
+		x6 = ROTL32(x6, 11); \
+		xi = T32(x7 + xi); \
+		x7 = ROTL32(x7, 11); \
+		xl = T32(x0 + xl); \
+		x0 = ROTL32(x0, 11); \
+		xk = T32(x1 + xk); \
+		x1 = ROTL32(x1, 11); \
+		xn = T32(x2 + xn); \
+		x2 = ROTL32(x2, 11); \
+		xm = T32(x3 + xm); \
+		x3 = ROTL32(x3, 11); \
+		xp = T32(xc + xp); \
+		xc = ROTL32(xc, 11); \
+		xo = T32(xd + xo); \
+		xd = ROTL32(xd, 11); \
+		xr = T32(xe + xr); \
+		xe = ROTL32(xe, 11); \
+		xq = T32(xf + xq); \
+		xf = ROTL32(xf, 11); \
+		xt = T32(x8 + xt); \
+		x8 = ROTL32(x8, 11); \
+		xs = T32(x9 + xs); \
+		x9 = ROTL32(x9, 11); \
+		xv = T32(xa + xv); \
+		xa = ROTL32(xa, 11); \
+		xu = T32(xb + xu); \
+		xb = ROTL32(xb, 11); \
+		x0 ^= xh; \
+		x1 ^= xg; \
+		x2 ^= xj; \
+		x3 ^= xi; \
+		x4 ^= xl; \
+		x5 ^= xk; \
+		x6 ^= xn; \
+		x7 ^= xm; \
+		x8 ^= xp; \
+		x9 ^= xo; \
+		xa ^= xr; \
+		xb ^= xq; \
+		xc ^= xt; \
+		xd ^= xs; \
+		xe ^= xv; \
+		xf ^= xu; \
+	} while (0)
+
+/*
+ * There is no need to unroll all 16 rounds. The word-swapping permutation
+ * is an involution, so we need to unroll an even number of rounds. On
+ * "big" systems, unrolling 4 rounds yields about 97% of the speed
+ * achieved with full unrolling; and it keeps the code more compact
+ * for small architectures.
+ */
+
+/*
+ * SIXTEEN_ROUNDS: eight ROUND_EVEN / ROUND_ODD pairs. SPH_CUBEHASH_UNROLL
+ * of 2, 4 or 8 puts that many rounds in a loop body (8, 4 or 2 iterations);
+ * any other value emits all sixteen rounds inline. The looped forms declare
+ * their own counter `j` inside the do-block.
+ */
+#if SPH_CUBEHASH_UNROLL == 2
+
+#define SIXTEEN_ROUNDS   do { \
+		int j; \
+		for (j = 0; j < 8; j ++) { \
+			ROUND_EVEN; \
+			ROUND_ODD; \
+		} \
+	} while (0)
+
+#elif SPH_CUBEHASH_UNROLL == 4
+
+#define SIXTEEN_ROUNDS   do { \
+		int j; \
+		for (j = 0; j < 4; j ++) { \
+			ROUND_EVEN; \
+			ROUND_ODD; \
+			ROUND_EVEN; \
+			ROUND_ODD; \
+		} \
+	} while (0)
+
+#elif SPH_CUBEHASH_UNROLL == 8
+
+#define SIXTEEN_ROUNDS   do { \
+		int j; \
+		for (j = 0; j < 2; j ++) { \
+			ROUND_EVEN; \
+			ROUND_ODD; \
+			ROUND_EVEN; \
+			ROUND_ODD; \
+			ROUND_EVEN; \
+			ROUND_ODD; \
+			ROUND_EVEN; \
+			ROUND_ODD; \
+		} \
+	} while (0)
+
+#else
+
+#define SIXTEEN_ROUNDS   do { \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+	} while (0)
+
+#endif
+
+/* Reset a context: empty input buffer, state loaded from the given IV table. */
+static void
+cubehash_init(sph_cubehash_context *sc, const sph_u32 *iv)
+{
+	sc->ptr = 0;
+	memcpy(sc->state, iv, sizeof sc->state);
+}
+
+/*
+ * Absorb `len` bytes from `data`. Bytes accumulate in sc->buf; each time the
+ * buffer fills, the block is XORed into the state and sixteen rounds run.
+ * Leftover bytes stay buffered, with their count kept in sc->ptr.
+ */
+static void
+cubehash_core(sph_cubehash_context *sc, const void *data, size_t len)
+{
+	const unsigned char *src = data;
+	unsigned char *buf = sc->buf;
+	size_t fill = sc->ptr;
+	DECL_STATE
+
+	/* Not enough to complete a block: stash the bytes and leave. */
+	if (len < (sizeof sc->buf) - fill) {
+		memcpy(buf + fill, src, len);
+		sc->ptr = fill + len;
+		return;
+	}
+
+	READ_STATE(sc);
+	while (len > 0) {
+		size_t room = (sizeof sc->buf) - fill;
+		size_t chunk = (room > len) ? len : room;
+
+		memcpy(buf + fill, src, chunk);
+		fill += chunk;
+		src += chunk;
+		len -= chunk;
+		if (fill == sizeof sc->buf) {
+			INPUT_BLOCK;
+			SIXTEEN_ROUNDS;
+			fill = 0;
+		}
+	}
+	WRITE_STATE(sc);
+	sc->ptr = fill;
+}
+
+/*
+ * Pad the last block, run the closing rounds and write the digest.
+ *   ub, n        - extra input bits: the top n bits of ub are kept and a
+ *                  single 1 bit is placed right below them; the rest of the
+ *                  buffer is zero-filled
+ *   dst          - receives out_size_w32 state words via sph_enc32le()
+ * After the padded block and its sixteen rounds, 1 is XORed into the last
+ * state word, followed by ten more groups of sixteen rounds.
+ */
+static void
+cubehash_close(sph_cubehash_context *sc, unsigned ub, unsigned n,
+	void *dst, size_t out_size_w32)
+{
+	unsigned char *buf = sc->buf;
+	unsigned char *out = dst;
+	size_t fill = sc->ptr;
+	unsigned pad_bit = 0x80 >> n;
+	unsigned w;
+	int pass;
+	DECL_STATE
+
+	buf[fill] = ((ub & -pad_bit) | pad_bit) & 0xFF;
+	fill ++;
+	memset(buf + fill, 0, (sizeof sc->buf) - fill);
+	READ_STATE(sc);
+	INPUT_BLOCK;
+	for (pass = 0; pass < 11; pass ++) {
+		SIXTEEN_ROUNDS;
+		if (pass == 0)
+			xv ^= SPH_C32(1);
+	}
+	WRITE_STATE(sc);
+	for (w = 0; w < out_size_w32; w ++)
+		sph_enc32le(out + (w << 2), sc->state[w]);
+}
+
+/* CubeHash-224 init (see sph_cubehash.h): load IV224 and empty the buffer. */
+void
+sph_cubehash224_init(void *cc)
+{
+	sph_cubehash_context *sc = cc;
+
+	cubehash_init(sc, IV224);
+}
+
+/* CubeHash-224 update (see sph_cubehash.h): absorb len bytes from data. */
+void
+sph_cubehash224(void *cc, const void *data, size_t len)
+{
+	sph_cubehash_context *sc = cc;
+
+	cubehash_core(sc, data, len);
+}
+
+/* CubeHash-224 close (see sph_cubehash.h): finish with no extra bits. */
+void
+sph_cubehash224_close(void *cc, void *dst)
+{
+	const unsigned no_bits = 0;
+
+	sph_cubehash224_addbits_and_close(cc, no_bits, no_bits, dst);
+}
+
+/*
+ * CubeHash-224 close with n extra bits from ub (see sph_cubehash.h): write
+ * 7 output words to dst, then re-arm the context with IV224.
+ */
+void
+sph_cubehash224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	sph_cubehash_context *sc = cc;
+
+	cubehash_close(sc, ub, n, dst, 7);
+	cubehash_init(sc, IV224);
+}
+
+/* CubeHash-256 init (see sph_cubehash.h): load IV256 and empty the buffer. */
+void
+sph_cubehash256_init(void *cc)
+{
+	sph_cubehash_context *sc = cc;
+
+	cubehash_init(sc, IV256);
+}
+
+/* CubeHash-256 update (see sph_cubehash.h): absorb len bytes from data. */
+void
+sph_cubehash256(void *cc, const void *data, size_t len)
+{
+	sph_cubehash_context *sc = cc;
+
+	cubehash_core(sc, data, len);
+}
+
+/* CubeHash-256 close (see sph_cubehash.h): finish with no extra bits. */
+void
+sph_cubehash256_close(void *cc, void *dst)
+{
+	const unsigned no_bits = 0;
+
+	sph_cubehash256_addbits_and_close(cc, no_bits, no_bits, dst);
+}
+
+/*
+ * CubeHash-256 close with n extra bits from ub (see sph_cubehash.h): write
+ * 8 output words to dst, then re-arm the context with IV256.
+ */
+void
+sph_cubehash256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	sph_cubehash_context *sc = cc;
+
+	cubehash_close(sc, ub, n, dst, 8);
+	cubehash_init(sc, IV256);
+}
+
+/* CubeHash-384 init (see sph_cubehash.h): load IV384 and empty the buffer. */
+void
+sph_cubehash384_init(void *cc)
+{
+	sph_cubehash_context *sc = cc;
+
+	cubehash_init(sc, IV384);
+}
+
+/* CubeHash-384 update (see sph_cubehash.h): absorb len bytes from data. */
+void
+sph_cubehash384(void *cc, const void *data, size_t len)
+{
+	sph_cubehash_context *sc = cc;
+
+	cubehash_core(sc, data, len);
+}
+
+/* CubeHash-384 close (see sph_cubehash.h): finish with no extra bits. */
+void
+sph_cubehash384_close(void *cc, void *dst)
+{
+	const unsigned no_bits = 0;
+
+	sph_cubehash384_addbits_and_close(cc, no_bits, no_bits, dst);
+}
+
+/*
+ * CubeHash-384 close with n extra bits from ub (see sph_cubehash.h): write
+ * 12 output words to dst, then re-arm the context with IV384.
+ */
+void
+sph_cubehash384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	sph_cubehash_context *sc = cc;
+
+	cubehash_close(sc, ub, n, dst, 12);
+	cubehash_init(sc, IV384);
+}
+
+/* CubeHash-512 init (see sph_cubehash.h): load IV512 and empty the buffer. */
+void
+sph_cubehash512_init(void *cc)
+{
+	sph_cubehash_context *sc = cc;
+
+	cubehash_init(sc, IV512);
+}
+
+/* CubeHash-512 update (see sph_cubehash.h): absorb len bytes from data. */
+void
+sph_cubehash512(void *cc, const void *data, size_t len)
+{
+	sph_cubehash_context *sc = cc;
+
+	cubehash_core(sc, data, len);
+}
+
+/* CubeHash-512 close (see sph_cubehash.h): finish with no extra bits. */
+void
+sph_cubehash512_close(void *cc, void *dst)
+{
+	const unsigned no_bits = 0;
+
+	sph_cubehash512_addbits_and_close(cc, no_bits, no_bits, dst);
+}
+
+/*
+ * CubeHash-512 close with n extra bits from ub (see sph_cubehash.h): write
+ * 16 output words to dst, then re-arm the context with IV512.
+ */
+void
+sph_cubehash512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	sph_cubehash_context *sc = cc;
+
+	cubehash_close(sc, ub, n, dst, 16);
+	cubehash_init(sc, IV512);
+}
+#ifdef __cplusplus
+}
+#endif
--- a/algo/cubehash/sph_cubehash.h
+++ b/algo/cubehash/sph_cubehash.h
@@ -0,0 +1,292 @@
+/* $Id: sph_cubehash.h 180 2010-05-08 02:29:25Z tp $ */
+/**
+ * CubeHash interface. CubeHash is a family of functions which differ by
+ * their output size; this implementation defines CubeHash for output
+ * sizes 224, 256, 384 and 512 bits, with the "standard parameters"
+ * (CubeHash16/32 with the CubeHash specification notations).
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_cubehash.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_CUBEHASH_H__
+#define SPH_CUBEHASH_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "algo/sha3/sph_types.h"
+
+/**
+ * Output size (in bits) for CubeHash-224.
+ */
+#define SPH_SIZE_cubehash224   224
+
+/**
+ * Output size (in bits) for CubeHash-256.
+ */
+#define SPH_SIZE_cubehash256   256
+
+/**
+ * Output size (in bits) for CubeHash-384.
+ */
+#define SPH_SIZE_cubehash384   384
+
+/**
+ * Output size (in bits) for CubeHash-512.
+ */
+#define SPH_SIZE_cubehash512   512
+
+/**
+ * This structure is a context for CubeHash computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * a CubeHash computation has been performed, the context can be reused for
+ * another computation.
+ *
+ * The contents of this structure are private. A running CubeHash computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[32];    /* first field, for alignment; presumably the data kept from the last entered block */
+	size_t ptr;               /* presumably the fill level of buf -- confirm in sph_cubehash.c */
+	sph_u32 state[32];        /* 32 words of sph_u32; presumably the intermediate values */
+#endif
+} sph_cubehash_context;
+
+/**
+ * Type for a CubeHash-224 context (identical to the common context).
+ */
+typedef sph_cubehash_context sph_cubehash224_context;
+
+/**
+ * Type for a CubeHash-256 context (identical to the common context).
+ */
+typedef sph_cubehash_context sph_cubehash256_context;
+
+/**
+ * Type for a CubeHash-384 context (identical to the common context).
+ */
+typedef sph_cubehash_context sph_cubehash384_context;
+
+/**
+ * Type for a CubeHash-512 context (identical to the common context).
+ */
+typedef sph_cubehash_context sph_cubehash512_context;
+
+/**
+ * Initialize a CubeHash-224 context. This process performs no memory
+ * allocation.
+ *
+ * @param cc   the CubeHash-224 context (pointer to a
+ *             <code>sph_cubehash224_context</code>)
+ */
+void sph_cubehash224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the CubeHash-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_cubehash224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current CubeHash-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the CubeHash-224 context
+ * @param dst   the destination buffer
+ */
+void sph_cubehash224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the CubeHash-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_cubehash224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a CubeHash-256 context. This process performs no memory
+ * allocation.
+ *
+ * @param cc   the CubeHash-256 context (pointer to a
+ *             <code>sph_cubehash256_context</code>)
+ */
+void sph_cubehash256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the CubeHash-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_cubehash256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current CubeHash-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the CubeHash-256 context
+ * @param dst   the destination buffer
+ */
+void sph_cubehash256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the CubeHash-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_cubehash256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a CubeHash-384 context. This process performs no memory
+ * allocation.
+ *
+ * @param cc   the CubeHash-384 context (pointer to a
+ *             <code>sph_cubehash384_context</code>)
+ */
+void sph_cubehash384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the CubeHash-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_cubehash384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current CubeHash-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the CubeHash-384 context
+ * @param dst   the destination buffer
+ */
+void sph_cubehash384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the CubeHash-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_cubehash384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a CubeHash-512 context. This process performs no memory
+ * allocation.
+ *
+ * @param cc   the CubeHash-512 context (pointer to a
+ *             <code>sph_cubehash512_context</code>)
+ */
+void sph_cubehash512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the CubeHash-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_cubehash512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current CubeHash-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the CubeHash-512 context
+ * @param dst   the destination buffer
+ */
+void sph_cubehash512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the CubeHash-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_cubehash512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/cubehash/sse2/.dirstamp
+++ b/algo/cubehash/sse2/.dirstamp
--- a/algo/cubehash/sse2/cubehash_sse2.c
+++ b/algo/cubehash/sse2/cubehash_sse2.c
@@ -0,0 +1,268 @@
+/* CubeHash 16/32 is recommended for SHA-3 "normal", 16/1 for "formal" */
+#define CUBEHASH_ROUNDS	16
+#define CUBEHASH_BLOCKBYTES 32
+#define OPTIMIZE_SSE2
+#if defined(OPTIMIZE_SSE2)
+#include <emmintrin.h>
+#endif
+#ifdef __AVX2__
+#include <immintrin.h>
+#endif
+#include "cubehash_sse2.h"
+#include "algo/sha3/sha3-defs.h"
+
+//enum { SUCCESS = 0, FAIL = 1, BAD_HASHBITLEN = 2 };
+
+//#if defined(OPTIMIZE_SSE2)
+/*
+ * Apply sp->rounds rounds of the CubeHash round function to the state in sp->x.
+ *
+ * Exactly one of three code paths is compiled: AVX2 (state in four 256-bit
+ * registers), SSE2 (eight 128-bit registers) or portable scalar code.  The
+ * vector paths use aligned loads/stores, so sp->x must be aligned accordingly.
+ */
+static void transform( cubehashParam *sp )
+{
+    int r;
+    const int rounds = sp->rounds;
+
+#ifdef __AVX2__
+
+    __m256i x0, x1, x2, x3, y0, y1;
+    /* sp->x is declared as __m128i[8]; the casts view it as four __m256i. */
+    x0 = _mm256_load_si256( (const __m256i *)( 0 + sp->x ) );
+    x1 = _mm256_load_si256( (const __m256i *)( 2 + sp->x ) );
+    x2 = _mm256_load_si256( (const __m256i *)( 4 + sp->x ) );
+    x3 = _mm256_load_si256( (const __m256i *)( 6 + sp->x ) );
+
+    for ( r = 0; r < rounds; ++r )
+    {
+        x2 = _mm256_add_epi32( x0, x2 );
+        x3 = _mm256_add_epi32( x1, x3 );
+        y0 = x1;    /* exchange the two 256-bit halves of the low state */
+        y1 = x0;
+        x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 7 ),
+                               _mm256_srli_epi32( y0, 25 ) );
+        x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 7 ),
+                               _mm256_srli_epi32( y1, 25 ) );
+        x0 = _mm256_xor_si256( x0, x2 );
+        x1 = _mm256_xor_si256( x1, x3 );
+        x2 = _mm256_shuffle_epi32( x2, 0x4e );
+        x3 = _mm256_shuffle_epi32( x3, 0x4e );
+        x2 = _mm256_add_epi32( x0, x2 );
+        x3 = _mm256_add_epi32( x1, x3 );
+        y0 = _mm256_permute2f128_si256( x0, x0, 1 );    /* swap 128-bit lanes */
+        y1 = _mm256_permute2f128_si256( x1, x1, 1 );
+        x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 11 ),
+                               _mm256_srli_epi32( y0, 21 ) );
+        x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 11 ),
+                               _mm256_srli_epi32( y1, 21 ) );
+        x0 = _mm256_xor_si256( x0, x2 );
+        x1 = _mm256_xor_si256( x1, x3 );
+        x2 = _mm256_shuffle_epi32( x2, 0xb1 );
+        x3 = _mm256_shuffle_epi32( x3, 0xb1 );
+    }
+
+    _mm256_store_si256( (__m256i *)( 0 + sp->x ), x0 );
+    _mm256_store_si256( (__m256i *)( 2 + sp->x ), x1 );
+    _mm256_store_si256( (__m256i *)( 4 + sp->x ), x2 );
+    _mm256_store_si256( (__m256i *)( 6 + sp->x ), x3 );
+
+#elif defined OPTIMIZE_SSE2
+
+    __m128i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3;
+
+    x0 = _mm_load_si128(0 + sp->x);
+    x1 = _mm_load_si128(1 + sp->x);
+    x2 = _mm_load_si128(2 + sp->x);
+    x3 = _mm_load_si128(3 + sp->x);
+    x4 = _mm_load_si128(4 + sp->x);
+    x5 = _mm_load_si128(5 + sp->x);
+    x6 = _mm_load_si128(6 + sp->x);
+    x7 = _mm_load_si128(7 + sp->x);
+
+    for (r = 0; r < rounds; ++r) {
+	x4 = _mm_add_epi32(x0, x4);
+	x5 = _mm_add_epi32(x1, x5);
+	x6 = _mm_add_epi32(x2, x6);
+	x7 = _mm_add_epi32(x3, x7);
+	y0 = x2;	/* low-half words i and i ^ 8 trade places */
+	y1 = x3;
+	y2 = x0;
+	y3 = x1;
+	x0 = _mm_xor_si128(_mm_slli_epi32(y0, 7), _mm_srli_epi32(y0, 25));
+	x1 = _mm_xor_si128(_mm_slli_epi32(y1, 7), _mm_srli_epi32(y1, 25));
+	x2 = _mm_xor_si128(_mm_slli_epi32(y2, 7), _mm_srli_epi32(y2, 25));
+	x3 = _mm_xor_si128(_mm_slli_epi32(y3, 7), _mm_srli_epi32(y3, 25));
+	x0 = _mm_xor_si128(x0, x4);
+	x1 = _mm_xor_si128(x1, x5);
+	x2 = _mm_xor_si128(x2, x6);
+	x3 = _mm_xor_si128(x3, x7);
+	x4 = _mm_shuffle_epi32(x4, 0x4e);
+	x5 = _mm_shuffle_epi32(x5, 0x4e);
+	x6 = _mm_shuffle_epi32(x6, 0x4e);
+	x7 = _mm_shuffle_epi32(x7, 0x4e);
+	x4 = _mm_add_epi32(x0, x4);
+	x5 = _mm_add_epi32(x1, x5);
+	x6 = _mm_add_epi32(x2, x6);
+	x7 = _mm_add_epi32(x3, x7);
+	y0 = x1;	/* low-half words i and i ^ 4 trade places */
+	y1 = x0;
+	y2 = x3;
+	y3 = x2;
+	x0 = _mm_xor_si128(_mm_slli_epi32(y0, 11), _mm_srli_epi32(y0, 21));
+	x1 = _mm_xor_si128(_mm_slli_epi32(y1, 11), _mm_srli_epi32(y1, 21));
+	x2 = _mm_xor_si128(_mm_slli_epi32(y2, 11), _mm_srli_epi32(y2, 21));
+	x3 = _mm_xor_si128(_mm_slli_epi32(y3, 11), _mm_srli_epi32(y3, 21));
+	x0 = _mm_xor_si128(x0, x4);
+	x1 = _mm_xor_si128(x1, x5);
+	x2 = _mm_xor_si128(x2, x6);
+	x3 = _mm_xor_si128(x3, x7);
+	x4 = _mm_shuffle_epi32(x4, 0xb1);
+	x5 = _mm_shuffle_epi32(x5, 0xb1);
+	x6 = _mm_shuffle_epi32(x6, 0xb1);
+	x7 = _mm_shuffle_epi32(x7, 0xb1);
+    }
+
+    _mm_store_si128(0 + sp->x, x0);
+    _mm_store_si128(1 + sp->x, x1);
+    _mm_store_si128(2 + sp->x, x2);
+    _mm_store_si128(3 + sp->x, x3);
+    _mm_store_si128(4 + sp->x, x4);
+    _mm_store_si128(5 + sp->x, x5);
+    _mm_store_si128(6 + sp->x, x6);
+    _mm_store_si128(7 + sp->x, x7);
+
+#else	/* OPTIMIZE_SSE2 */
+// This code is probably not used; sph is used instead for unoptimized mining.
+
+#define ROTATE(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
+
+    uint32_t y[16];
+    int i;
+
+    for (r = 0; r < rounds; ++r) {
+
+	for (i = 0; i < 16; ++i) sp->x[i + 16] += sp->x[i];
+	for (i = 0; i < 16; ++i) y[i ^ 8] = sp->x[i];	/* fills y before the rotate reads it */
+	for (i = 0; i < 16; ++i) sp->x[i] = ROTATE(y[i],7);
+
+	for (i = 0; i < 16; ++i) sp->x[i] ^= sp->x[i + 16];
+
+	for (i = 0; i < 16; ++i) y[i ^ 2] = sp->x[i + 16];
+
+	for (i = 0; i < 16; ++i) sp->x[i + 16] = y[i];
+
+	for (i = 0; i < 16; ++i) sp->x[i + 16] += sp->x[i];
+
+	for (i = 0; i < 16; ++i) y[i ^ 4] = sp->x[i];
+
+	for (i = 0; i < 16; ++i) sp->x[i] = ROTATE(y[i],11);
+
+	for (i = 0; i < 16; ++i) sp->x[i] ^= sp->x[i + 16];
+
+	for (i = 0; i < 16; ++i) y[i ^ 1] = sp->x[i + 16];
+
+	for (i = 0; i < 16; ++i) sp->x[i + 16] = y[i];
+
+    }
+#endif
+}  // transform
+/* Validate the parameters, seed the state with them and run 10 transforms. */
+int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
+{
+    int k;
+
+    /* The digest length must be a whole number of bytes, 8..512 bits. */
+    if (hashbitlen < 8 || hashbitlen > 512 || hashbitlen % 8 != 0)
+        return BAD_HASHBITLEN;
+
+    /* Out-of-range tuning parameters are replaced by the defaults. */
+    if (!(rounds > 0 && rounds <= 32)) rounds = CUBEHASH_ROUNDS;
+    if (!(blockbytes > 0 && blockbytes < 256)) blockbytes = CUBEHASH_BLOCKBYTES;
+
+    sp->hashbitlen = hashbitlen;
+    sp->rounds = rounds;
+    sp->blockbytes = blockbytes;
+    sp->pos = 0;
+#if defined(OPTIMIZE_SSE2)
+    sp->x[0] = _mm_set_epi32(0, rounds, blockbytes, hashbitlen / 8);  /* last argument is lane 0 */
+    for (k = 1; k < 8; ++k) sp->x[k] = _mm_set_epi32(0, 0, 0, 0);
+#else
+    sp->x[0] = hashbitlen / 8;
+    sp->x[1] = blockbytes;
+    sp->x[2] = rounds;
+    for (k = 3; k < 32; ++k) sp->x[k] = 0;
+#endif
+    for (k = 0; k < 10; ++k) transform(sp);
+    return SUCCESS;
+}
+/* Re-run cubehashInit() with the hashbitlen, rounds and blockbytes already stored in sp. */
+int
+cubehashReset(cubehashParam *sp)
+{
+    return cubehashInit(sp, sp->hashbitlen, sp->rounds, sp->blockbytes);
+}
+/*
+ * Absorb size whole bytes from data into the state.
+ *
+ * Every byte is XORed into the state at the current block position
+ * (sp->pos, counted in bits).  When a block of sp->blockbytes bytes is
+ * complete, the state is transformed and the position restarts at 0.
+ * Always returns SUCCESS.
+ */
+int cubehashUpdate(cubehashParam *sp, const byte *data, size_t size)
+{
+    const int blockbits = 8 * sp->blockbytes;
+    size_t n;
+
+    /*
+     * size counts whole bytes, so sp->pos stays a multiple of 8 and there
+     * is never a partial trailing byte to absorb.
+     */
+    for (n = 0; n < size; ++n) {
+#if defined(OPTIMIZE_SSE2)
+	((unsigned char *) sp->x)[sp->pos / 8] ^= data[n];
+#else
+	uint32_t u = data[n];
+
+	u <<= 8 * ((sp->pos / 8) % 4);	/* byte lane within the 32-bit word */
+	sp->x[sp->pos / 32] ^= u;
+#endif
+	sp->pos += 8;
+	if (sp->pos == blockbits) {
+	    /* Block complete: mix the state and start the next block. */
+	    transform(sp);
+	    sp->pos = 0;
+	}
+    }
+
+    return SUCCESS;
+}
+/* Pad and finalize the state, then copy the first hashbitlen/8 state bytes to digest. */
+int cubehashDigest(cubehashParam *sp, byte *digest)
+{
+    const int outbytes = sp->hashbitlen / 8;
+    int k;
+#if defined(OPTIMIZE_SSE2)
+    unsigned char *state = (unsigned char *) sp->x;
+
+    /* Padding: flip the bit that follows the last absorbed bit, then mix. */
+    state[sp->pos / 8] ^= (128 >> (sp->pos % 8));
+    transform(sp);
+    /* Finalization: XOR 1 into the highest 32-bit lane of x[7]. */
+    sp->x[7] = _mm_xor_si128(sp->x[7], _mm_set_epi32(1, 0, 0, 0));
+    for (k = 0; k < 10; ++k) transform(sp);
+    for (k = 0; k < outbytes; ++k) digest[k] = state[k];
+#else
+    uint32_t u = (128 >> (sp->pos % 8));
+    u <<= 8 * ((sp->pos / 8) % 4);
+    sp->x[sp->pos / 32] ^= u;
+    transform(sp);
+    sp->x[31] ^= 1;
+    for (k = 0; k < 10; ++k) transform(sp);
+    for (k = 0; k < outbytes; ++k) digest[k] = sp->x[k / 4] >> (8 * (k % 4));
+#endif
+
+    return SUCCESS;
+}
--- a/algo/cubehash/sse2/cubehash_sse2.c.broke
+++ b/algo/cubehash/sse2/cubehash_sse2.c.broke
@@ -0,0 +1,292 @@
+/* CubeHash 16/32 is recommended for SHA-3 "normal", 16/1 for "formal" */
+#define CUBEHASH_ROUNDS	16
+#define CUBEHASH_BLOCKBYTES 32
+#define OPTIMIZE_SSE2
+#if defined(OPTIMIZE_SSE2)
+#include <emmintrin.h>
+#endif
+#ifdef __AVX2__
+#include <immintrin.h>
+#endif
+#include "cubehash_sse2.h"
+#include "algo/sha3/sha3-defs.h"
+
+//enum { SUCCESS = 0, FAIL = 1, BAD_HASHBITLEN = 2 };
+
+//#if defined(OPTIMIZE_SSE2)
+/*
+ * Apply sp->rounds rounds of the CubeHash round function to the state in sp->x.
+ *
+ * Exactly one of three code paths is compiled: AVX2 (state in four 256-bit
+ * registers, unaligned loads/stores), SSE2 (eight 128-bit registers, aligned
+ * loads/stores) or portable scalar code.
+ */
+static inline void transform( cubehashParam *sp )
+{
+    int r;
+
+#ifdef __AVX2__
+
+    __m256i x0, x1, x2, x3, y0, y1;
+    /* sp->x is declared as __m128i[8]; the casts view it as four __m256i. */
+    x0 = _mm256_loadu_si256( (const __m256i *)( 0 + sp->x ) );
+    x1 = _mm256_loadu_si256( (const __m256i *)( 2 + sp->x ) );
+    x2 = _mm256_loadu_si256( (const __m256i *)( 4 + sp->x ) );
+    x3 = _mm256_loadu_si256( (const __m256i *)( 6 + sp->x ) );
+
+    for ( r = 0; r < sp->rounds; ++r )
+    {
+        x2 = _mm256_add_epi32( x0, x2 );
+        x3 = _mm256_add_epi32( x1, x3 );
+        y0 = x1;    /* exchange the two 256-bit halves of the low state */
+        y1 = x0;
+        x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 7 ),
+                               _mm256_srli_epi32( y0, 25 ) );
+        x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 7 ),
+                               _mm256_srli_epi32( y1, 25 ) );
+        x0 = _mm256_xor_si256( x0, x2 );
+        x1 = _mm256_xor_si256( x1, x3 );
+        x2 = _mm256_shuffle_epi32( x2, 0x4e );
+        x3 = _mm256_shuffle_epi32( x3, 0x4e );
+        x2 = _mm256_add_epi32( x0, x2 );
+        x3 = _mm256_add_epi32( x1, x3 );
+        y0 = _mm256_permute2f128_si256( x0, x0, 1 );    /* swap 128-bit lanes */
+        y1 = _mm256_permute2f128_si256( x1, x1, 1 );
+        x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 11 ),
+                               _mm256_srli_epi32( y0, 21 ) );
+        x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 11 ),
+                               _mm256_srli_epi32( y1, 21 ) );
+        x0 = _mm256_xor_si256( x0, x2 );
+        x1 = _mm256_xor_si256( x1, x3 );
+        x2 = _mm256_shuffle_epi32( x2, 0xb1 );
+        x3 = _mm256_shuffle_epi32( x3, 0xb1 );
+    }
+
+    _mm256_storeu_si256( (__m256i *)( 0 + sp->x ), x0 );
+    _mm256_storeu_si256( (__m256i *)( 2 + sp->x ), x1 );
+    _mm256_storeu_si256( (__m256i *)( 4 + sp->x ), x2 );
+    _mm256_storeu_si256( (__m256i *)( 6 + sp->x ), x3 );
+
+#elif defined OPTIMIZE_SSE2
+
+    __m128i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3;
+
+    x0 = _mm_load_si128(0 + sp->x);
+    x1 = _mm_load_si128(1 + sp->x);
+    x2 = _mm_load_si128(2 + sp->x);
+    x3 = _mm_load_si128(3 + sp->x);
+    x4 = _mm_load_si128(4 + sp->x);
+    x5 = _mm_load_si128(5 + sp->x);
+    x6 = _mm_load_si128(6 + sp->x);
+    x7 = _mm_load_si128(7 + sp->x);
+
+    for (r = 0; r < sp->rounds; ++r) {
+	x4 = _mm_add_epi32(x0, x4);
+	x5 = _mm_add_epi32(x1, x5);
+	x6 = _mm_add_epi32(x2, x6);
+	x7 = _mm_add_epi32(x3, x7);
+	y0 = x2;	/* low-half words i and i ^ 8 trade places */
+	y1 = x3;
+	y2 = x0;
+	y3 = x1;
+	x0 = _mm_xor_si128(_mm_slli_epi32(y0, 7), _mm_srli_epi32(y0, 25));
+	x1 = _mm_xor_si128(_mm_slli_epi32(y1, 7), _mm_srli_epi32(y1, 25));
+	x2 = _mm_xor_si128(_mm_slli_epi32(y2, 7), _mm_srli_epi32(y2, 25));
+	x3 = _mm_xor_si128(_mm_slli_epi32(y3, 7), _mm_srli_epi32(y3, 25));
+	x0 = _mm_xor_si128(x0, x4);
+	x1 = _mm_xor_si128(x1, x5);
+	x2 = _mm_xor_si128(x2, x6);
+	x3 = _mm_xor_si128(x3, x7);
+	x4 = _mm_shuffle_epi32(x4, 0x4e);
+	x5 = _mm_shuffle_epi32(x5, 0x4e);
+	x6 = _mm_shuffle_epi32(x6, 0x4e);
+	x7 = _mm_shuffle_epi32(x7, 0x4e);
+	x4 = _mm_add_epi32(x0, x4);
+	x5 = _mm_add_epi32(x1, x5);
+	x6 = _mm_add_epi32(x2, x6);
+	x7 = _mm_add_epi32(x3, x7);
+	y0 = x1;	/* low-half words i and i ^ 4 trade places */
+	y1 = x0;
+	y2 = x3;
+	y3 = x2;
+	x0 = _mm_xor_si128(_mm_slli_epi32(y0, 11), _mm_srli_epi32(y0, 21));
+	x1 = _mm_xor_si128(_mm_slli_epi32(y1, 11), _mm_srli_epi32(y1, 21));
+	x2 = _mm_xor_si128(_mm_slli_epi32(y2, 11), _mm_srli_epi32(y2, 21));
+	x3 = _mm_xor_si128(_mm_slli_epi32(y3, 11), _mm_srli_epi32(y3, 21));
+	x0 = _mm_xor_si128(x0, x4);
+	x1 = _mm_xor_si128(x1, x5);
+	x2 = _mm_xor_si128(x2, x6);
+	x3 = _mm_xor_si128(x3, x7);
+	x4 = _mm_shuffle_epi32(x4, 0xb1);
+	x5 = _mm_shuffle_epi32(x5, 0xb1);
+	x6 = _mm_shuffle_epi32(x6, 0xb1);
+	x7 = _mm_shuffle_epi32(x7, 0xb1);
+    }
+
+    _mm_store_si128(0 + sp->x, x0);
+    _mm_store_si128(1 + sp->x, x1);
+    _mm_store_si128(2 + sp->x, x2);
+    _mm_store_si128(3 + sp->x, x3);
+    _mm_store_si128(4 + sp->x, x4);
+    _mm_store_si128(5 + sp->x, x5);
+    _mm_store_si128(6 + sp->x, x6);
+    _mm_store_si128(7 + sp->x, x7);
+
+#else	/* OPTIMIZE_SSE2 */
+// This code is probably not used; sph is used instead for unoptimized mining.
+
+#define ROTATE(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
+
+    uint32_t y[16];
+    int i;
+
+    for (r = 0; r < sp->rounds; ++r) {
+
+	for (i = 0; i < 16; ++i) sp->x[i + 16] += sp->x[i];
+	for (i = 0; i < 16; ++i) y[i ^ 8] = sp->x[i];	/* fills y before the rotate reads it */
+	for (i = 0; i < 16; ++i) sp->x[i] = ROTATE(y[i],7);
+
+	for (i = 0; i < 16; ++i) sp->x[i] ^= sp->x[i + 16];
+
+	for (i = 0; i < 16; ++i) y[i ^ 2] = sp->x[i + 16];
+
+	for (i = 0; i < 16; ++i) sp->x[i + 16] = y[i];
+
+	for (i = 0; i < 16; ++i) sp->x[i + 16] += sp->x[i];
+
+	for (i = 0; i < 16; ++i) y[i ^ 4] = sp->x[i];
+
+	for (i = 0; i < 16; ++i) sp->x[i] = ROTATE(y[i],11);
+
+	for (i = 0; i < 16; ++i) sp->x[i] ^= sp->x[i + 16];
+
+	for (i = 0; i < 16; ++i) y[i ^ 1] = sp->x[i + 16];
+
+	for (i = 0; i < 16; ++i) sp->x[i + 16] = y[i];
+
+    }
+#endif
+}  // transform
+/* Validate the parameters, seed the state with them and run 10 transforms. */
+int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
+{
+    int i;
+    /* Reject digest lengths outside 8..512 bits or not a multiple of 8. */
+    if (hashbitlen < 8) return BAD_HASHBITLEN;
+    if (hashbitlen > 512) return BAD_HASHBITLEN;
+    if (hashbitlen != 8 * (hashbitlen / 8)) return BAD_HASHBITLEN;
+
+    /* Sanity checks: out-of-range rounds/blockbytes fall back to the defaults */
+    if (rounds <= 0 || rounds > 32) rounds = CUBEHASH_ROUNDS;
+    if (blockbytes <= 0 || blockbytes >= 256) blockbytes = CUBEHASH_BLOCKBYTES;
+    /* NOTE(review): cubehash_sse2.h declares x as __m128i[8]; the __m256i values assigned in the AVX2 branch do not match that element type -- confirm. */
+    sp->hashbitlen = hashbitlen;
+    sp->rounds = rounds;
+    sp->blockbytes = blockbytes;
+#if defined __AVX2__
+    for (i = 0; i < 4; ++i) sp->x[i] = _mm256_set_epi64x( 0, 0, 0, 0 );
+// experiment ("try swapping"): parameters go in the upper four 32-bit lanes; the other ordering is kept commented out below
+    sp->x[0] = _mm256_set_epi32( 0, sp->rounds, sp->blockbytes, hashbitlen / 8,
+                                 0, 0, 0, 0);
+//    sp->x[0] = _mm256_set_epi32( 0, 0, 0, 0, 
+//                                 0, sp->rounds, sp->blockbytes, hashbitlen / 8 );
+#elif defined(OPTIMIZE_SSE2)
+    for (i = 0; i < 8; ++i) sp->x[i] = _mm_set_epi32(0, 0, 0, 0);
+    sp->x[0] = _mm_set_epi32(0, sp->rounds, sp->blockbytes, hashbitlen / 8);
+#else
+    for (i = 0; i < 32; ++i) sp->x[i] = 0;
+    sp->x[0] = hashbitlen / 8;
+    sp->x[1] = sp->blockbytes;
+    sp->x[2] = sp->rounds;
+#endif
+    for (i = 0; i < 10; ++i) transform(sp);
+    sp->pos = 0;
+    return SUCCESS;
+}
+/* Re-run cubehashInit() with the hashbitlen, rounds and blockbytes already stored in sp. */
+int
+cubehashReset(cubehashParam *sp)
+{
+    return cubehashInit(sp, sp->hashbitlen, sp->rounds, sp->blockbytes);
+}
+/* Absorb size bytes from data into the state, transforming after every full block; returns SUCCESS. */
+int cubehashUpdate(cubehashParam *sp, const byte *data, size_t size)
+{
+    uint64_t databitlen = 8 * size;
+
+    /* caller promises us that previous data had integral number of bytes */
+    /* so sp->pos is a multiple of 8 */
+    /* One input byte is XORed into the state per iteration. */
+    while (databitlen >= 8) {
+#if defined __AVX2__
+        ((unsigned char *) sp->x)[sp->pos / 8] ^= *data;
+#elif defined(OPTIMIZE_SSE2)
+	((unsigned char *) sp->x)[sp->pos / 8] ^= *data;
+#else
+	uint32_t u = *data;
+	u <<= 8 * ((sp->pos / 8) % 4);
+	sp->x[sp->pos / 32] ^= u;
+#endif
+	data += 1;
+	databitlen -= 8;
+	sp->pos += 8;	/* pos counts bits */
+	if (sp->pos == 8 * sp->blockbytes) {	/* block complete */
+	    transform(sp);
+	    sp->pos = 0;
+	}
+    }
+    if (databitlen > 0) {	/* NOTE(review): databitlen starts as 8 * size, so this looks unreachable -- confirm */
+#if defined __AVX2__
+        ((unsigned char *) sp->x)[sp->pos / 8] ^= *data;
+#elif defined(OPTIMIZE_SSE2)
+	((unsigned char *) sp->x)[sp->pos / 8] ^= *data;
+#else
+	uint32_t u = *data;
+	u <<= 8 * ((sp->pos / 8) % 4);
+	sp->x[sp->pos / 32] ^= u;
+#endif
+	sp->pos += databitlen;
+    }
+    return SUCCESS;
+}
+/* Pad and finalize the state, then copy the first hashbitlen/8 state bytes to digest. */
+int cubehashDigest(cubehashParam *sp, byte *digest)
+{
+    int i;
+#if defined __AVX2__
+    ((unsigned char *) sp->x)[sp->pos / 8] ^= (128 >> (sp->pos % 8));	/* padding bit after the last absorbed bit */
+    __m128i t;
+    transform(sp);
+// experiment ("try control 0"): work on 128-bit half 0; the half-1 variants are kept commented out
+//    t = _mm256_extracti128_si256( sp->x[7], 1 );
+    t = _mm256_extracti128_si256( sp->x[7], 0 );	/* NOTE(review): x[7] is a __m128i per cubehash_sse2.h, not a __m256i -- confirm */
+    t = _mm_xor_si128( t, _mm_set_epi32(1, 0, 0, 0) );
+//     _mm256_inserti128_si256( sp->x[7], t, 1 );
+     _mm256_inserti128_si256( sp->x[7], t, 0 );
+    /* NOTE(review): the value returned by _mm256_inserti128_si256() is discarded, so the XOR into t never reaches sp->x. */
+    for (i = 0; i < 10; ++i) transform(sp);
+    for (i = 0; i < sp->hashbitlen / 8; ++i)
+        digest[i] = ((unsigned char *) sp->x)[i];
+
+#elif defined(OPTIMIZE_SSE2)
+    ((unsigned char *) sp->x)[sp->pos / 8] ^= (128 >> (sp->pos % 8));	/* padding bit after the last absorbed bit */
+    transform(sp);
+    sp->x[7] = _mm_xor_si128(sp->x[7], _mm_set_epi32(1, 0, 0, 0));	/* XOR 1 into the highest 32-bit lane of x[7] */
+    for (i = 0; i < 10; ++i) transform(sp);
+    for (i = 0; i < sp->hashbitlen / 8; ++i)
+	digest[i] = ((unsigned char *) sp->x)[i];
+#else
+    uint32_t u;
+
+    u = (128 >> (sp->pos % 8));
+    u <<= 8 * ((sp->pos / 8) % 4);
+    sp->x[sp->pos / 32] ^= u;
+    transform(sp);
+    sp->x[31] ^= 1;
+    for (i = 0; i < 10; ++i) transform(sp);
+    for (i = 0; i < sp->hashbitlen / 8; ++i)
+	digest[i] = sp->x[i / 4] >> (8 * (i % 4));	/* emit each word least-significant byte first */
+#endif
+
+    return SUCCESS;
+}
--- a/algo/cubehash/sse2/cubehash_sse2.h
+++ b/algo/cubehash/sse2/cubehash_sse2.h
@@ -0,0 +1,64 @@
+#ifndef CUBEHASH_SSE2_H__
+#define CUBEHASH_SSE2_H__
+
+#include "compat.h"
+#include <stdint.h>
+#include "algo/sha3/sha3-defs.h"
+//#include <beecrypt/beecrypt.h>
+
+//#if defined(__SSE2__)
+#define	OPTIMIZE_SSE2
+//#endif
+
+#if defined(OPTIMIZE_SSE2)
+#include <emmintrin.h>
+#endif
+
+/*!\brief Holds all the parameters necessary for the CUBEHASH algorithm.
+ * \ingroup HASH_cubehash_m
+ */
+
+struct _cubehashParam
+/* All fields are filled in by cubehashInit(). */
+{
+    int hashbitlen;	/* digest length in bits; cubehashInit() accepts multiples of 8 in 8..512 */
+    int rounds;		/* rounds applied per transform() call */
+    int blockbytes;	/* input bytes absorbed between transforms */
+    int pos;		/* number of bits read into x from current block */
+#if defined(OPTIMIZE_SSE2)
+    __m128i _ALIGN(256) x[8];	/* state as eight 128-bit vectors; _ALIGN presumably comes from compat.h -- confirm */
+#else
+    uint32_t x[32];	/* state as 32 32-bit words */
+#endif
+};
+
+//#ifndef __cplusplus
+typedef struct _cubehashParam cubehashParam;
+//#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\var cubehash256
+ * \brief Holds the full API description of the CUBEHASH algorithm.
+ */
+//extern BEECRYPTAPI const hashFunction cubehash256;
+
+//BEECRYPTAPI
+int cubehashInit(cubehashParam* sp, int hashbitlen, int rounds, int blockbytes);
+
+//BEECRYPTAPI
+int cubehashReset(cubehashParam* sp);
+
+//BEECRYPTAPI
+int cubehashUpdate(cubehashParam* sp, const byte *data, size_t size);
+
+//BEECRYPTAPI
+int cubehashDigest(cubehashParam* sp, byte *digest);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CUBEHASH_SSE2_H__ */
--- a/algo/drop.c
+++ b/algo/drop.c
@@ -0,0 +1,264 @@
+/**
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2015 kernels10, tpruvot
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     drop.c
+ * @author   kernels10 <kernels10@gmail.com.com>
+ * @author   tpruvot <tpruvot@github>
+ */
+
+#define POK_BOOL_MASK 0x00008000
+#define POK_DATA_MASK 0xFFFF0000
+ 
+#include "miner.h"
+#include "algo-gate-api.h"
+
+#include <string.h>
+
+#include "algo/blake/sph_blake.h"
+#include "algo/groestl/sph_groestl.h"
+#include "algo/jh/sph_jh.h"
+#include "algo/keccak/sph_keccak.h"
+#include "algo/skein/sph_skein.h"
+#include "algo/cubehash/sph_cubehash.h"
+#include "algo/echo/sph_echo.h"
+#include "algo/fugue//sph_fugue.h"
+#include "algo/luffa/sph_luffa.h"
+#include "algo/simd/sph_simd.h"
+#include "algo/shavite/sph_shavite.h"
+
+/*
+ * Shift a 512-bit value, held as sixteen 32-bit words with word 0 least
+ * significant, towards the higher words by `shift` bits. Bits leaving the
+ * top of one word enter the bottom of the next; bits leaving word 15 are lost.
+ *
+ * input  - 16 source words
+ * output - 16 destination words (callers pass a separate buffer)
+ * shift  - bit count; 0 copies the input, otherwise expected in 1..31
+ */
+static void shiftr_lp(const uint32_t *input, uint32_t *output, unsigned int shift)
+{
+	/* Handled separately: the carry term below would shift by 32 for shift == 0. */
+	if (shift == 0) {
+		memcpy(output, input, 64);
+		return;
+	}
+
+	const unsigned int carry_shift = 32 - shift;
+	int word = 0;
+
+	memset(output, 0, 64);
+	while (word < 15) {
+		output[word + 1] |= (input[word] >> carry_shift);
+		output[word] |= (input[word] << shift);
+		++word;
+	}
+
+	/* The top word has no successor to carry into. */
+	output[15] |= (input[15] << shift);
+}
+
+/*
+ * Hash one 64-byte block with the 512-bit sph function selected by `id`.
+ *
+ * input  - 64 bytes to hash
+ * output - receives the 64-byte digest
+ * id     - selector 0..9: keccak, blake, groestl, skein, luffa, echo,
+ *          shavite, fugue, simd, cubehash; any other value leaves
+ *          `output` untouched
+ *
+ * The dispatch used to be wrapped in a block comment, which made this
+ * function a no-op and left the caller's state unchanged on every step.
+ */
+static void switchHash(const void *input, void *output, int id)
+{
+	sph_keccak512_context ctx_keccak;
+	sph_blake512_context ctx_blake;
+	sph_groestl512_context ctx_groestl;
+	sph_skein512_context ctx_skein;
+	sph_luffa512_context ctx_luffa;
+	sph_echo512_context ctx_echo;
+	sph_simd512_context ctx_simd;
+	sph_cubehash512_context ctx_cubehash;
+	sph_fugue512_context ctx_fugue;
+	sph_shavite512_context ctx_shavite;
+
+	switch(id) {
+	case 0:
+		sph_keccak512_init(&ctx_keccak);
+		sph_keccak512(&ctx_keccak, input, 64);
+		sph_keccak512_close(&ctx_keccak, output);
+		break;
+	case 1:
+		sph_blake512_init(&ctx_blake);
+		sph_blake512(&ctx_blake, input, 64);
+		sph_blake512_close(&ctx_blake, output);
+		break;
+	case 2:
+		sph_groestl512_init(&ctx_groestl);
+		sph_groestl512(&ctx_groestl, input, 64);
+		sph_groestl512_close(&ctx_groestl, output);
+		break;
+	case 3:
+		sph_skein512_init(&ctx_skein);
+		sph_skein512(&ctx_skein, input, 64);
+		sph_skein512_close(&ctx_skein, output);
+		break;
+	case 4:
+		sph_luffa512_init(&ctx_luffa);
+		sph_luffa512(&ctx_luffa, input, 64);
+		sph_luffa512_close(&ctx_luffa, output);
+		break;
+	case 5:
+		sph_echo512_init(&ctx_echo);
+		sph_echo512(&ctx_echo, input, 64);
+		sph_echo512_close(&ctx_echo, output);
+		break;
+	case 6:
+		sph_shavite512_init(&ctx_shavite);
+		sph_shavite512(&ctx_shavite, input, 64);
+		sph_shavite512_close(&ctx_shavite, output);
+		break;
+	case 7:
+		sph_fugue512_init(&ctx_fugue);
+		sph_fugue512(&ctx_fugue, input, 64);
+		sph_fugue512_close(&ctx_fugue, output);
+		break;
+	case 8:
+		sph_simd512_init(&ctx_simd);
+		sph_simd512(&ctx_simd, input, 64);
+		sph_simd512_close(&ctx_simd, output);
+		break;
+	case 9:
+		sph_cubehash512_init(&ctx_cubehash);
+		sph_cubehash512(&ctx_cubehash, input, 64);
+		sph_cubehash512_close(&ctx_cubehash, output);
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+ * Run one pass over all ten switchHash selectors for loop position `pos`.
+ * The pass starts at selector pos % 10 and wraps around; before every call
+ * hashA is shifted by (pos & 3) bits into hashB, and switchHash is given
+ * hashB as input and hashA as output.
+ */
+static void droplp_rotate(uint32_t *hashA, uint32_t *hashB, unsigned int pos)
+{
+	const int first = pos % 10;
+	const unsigned int shift = pos & 3;
+
+	for (int k = 0; k < 10; k++) {
+		shiftr_lp(hashA, hashB, shift);
+		switchHash((const void*)hashB, (void*)hashA, (first + k) % 10);
+	}
+}
+
+/*
+ * Drop hash of an 80-byte block header.
+ *
+ * state - receives the first 32 bytes of the final 64-byte working hash
+ * input - 80 bytes of header data
+ *
+ * The header is hashed with JH-512; the first word of that digest, modulo
+ * 31, picks a start position. Positions are then visited in steps of 9,
+ * first from the start position up to 30, then from 0 up to (but not
+ * including) the start position.
+ */
+void droplp_hash(void *state, const void *input)
+{
+	uint32_t _ALIGN(64) hash[2][16];
+	uint32_t *hashA = hash[0];
+	uint32_t *hashB = hash[1];
+	sph_jh512_context ctx_jh;
+	unsigned int pos;
+
+	sph_jh512_init(&ctx_jh);
+	sph_jh512(&ctx_jh, input, 80);
+	sph_jh512_close(&ctx_jh, (void*)(hashA));
+
+	/* Fixed once from the JH digest; later changes to hashA do not move it. */
+	const unsigned int startPosition = hashA[0] % 31;
+
+	for (pos = startPosition; pos < 31; pos += 9)
+		droplp_rotate(hashA, hashB, pos);
+	for (pos = 0; pos < startPosition; pos += 9)
+		droplp_rotate(hashA, hashB, pos);
+
+	memcpy(state, hashA, 32);
+}
+
+/*
+ * Hash a header and fold the proof-of-knowledge bits into its first word.
+ *
+ * output  - receives the 32-byte hash
+ * pdata   - header words; pdata[0] is overwritten (first with `version`,
+ *           then with the PoK-tagged version if that differs)
+ * version - version word with the POK_DATA_MASK bits already cleared
+ *
+ * The header is hashed with pdata[0] = version. The POK_DATA_MASK bits of the
+ * first hash word are OR-ed into the version; if that changes pdata[0], the
+ * header is hashed a second time with the updated word.
+ */
+static void droplp_hash_pok(void *output, uint32_t *pdata, const uint32_t version)
+{
+	uint32_t _ALIGN(64) digest[8];
+
+	pdata[0] = version;
+	droplp_hash(digest, pdata);
+
+	const uint32_t tagged = version | (digest[0] & POK_DATA_MASK);
+
+	if (tagged != pdata[0]) {
+		pdata[0] = tagged;
+		droplp_hash(digest, pdata);
+	}
+
+	memcpy(output, digest, sizeof digest);
+}
+
+/*
+ * Scan nonces for a hash that meets the work target.
+ *
+ * thr_id      - index into work_restart[] for this thread
+ * work        - work item; data[0] (PoK word) and data[19] (nonce) are
+ *               modified in place, and target[7] is overwritten when
+ *               opt_benchmark is set
+ * max_nonce   - scanning stops once the nonce reaches this value
+ * hashes_done - receives the number of nonces actually hashed
+ *
+ * Returns 1 when a nonce passing fulltest() is found (it is left in
+ * data[19]), 0 when the range is exhausted or a restart was requested.
+ */
+int scanhash_drop(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
+{
+	uint32_t _ALIGN(64) hash[16];
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	const uint32_t version = pdata[0] & (~POK_DATA_MASK);
+	const uint32_t first_nonce = pdata[19];
+	uint32_t nonce = first_nonce;
+
+	if (opt_benchmark)
+		ptarget[7] = 0x07ff;
+
+	const uint32_t htarg = ptarget[7];
+
+	do {
+		/* droplp_hash_pok also rewrites pdata[0] with the PoK-tagged version. */
+		pdata[19] = nonce;
+		droplp_hash_pok(hash, pdata, version);
+
+		if (hash[7] <= htarg && fulltest(hash, ptarget)) {
+			/* first_nonce..nonce inclusive have been hashed. */
+			*hashes_done = nonce - first_nonce + 1;
+			if (opt_debug)
+				applog(LOG_INFO, "found nonce %x", nonce);
+			return 1;
+		}
+		nonce++;
+
+	} while (nonce < max_nonce && !work_restart[thr_id].restart);
+
+	/* nonce is now the first value NOT hashed, so no "+ 1" here. */
+	pdata[19] = nonce;
+	*hashes_done = nonce - first_nonce;
+	return 0;
+}
+
+/*
+ * Decide whether this thread takes a fresh copy of the global work or just
+ * advances its nonce.
+ *
+ * work          - the thread's current work item
+ * g_work        - the latest global work item
+ * thr_id        - thread index, used to pick this thread's nonce slice
+ * end_nonce_ptr - in: current end of the thread's nonce range; out: new end
+ *                 when fresh work is taken
+ * clean_job     - true forces new work to be taken if the data differs
+ *
+ * The comparison skips data[0] because that word carries the PoK bits and
+ * is rewritten while hashing.
+ */
+void drop_get_new_work( struct work* work, struct work* g_work, int thr_id,
+                        uint32_t* end_nonce_ptr, bool clean_job )
+{
+   // ignore POK in first word
+   const int wkcmp_sz = 72;  // (19-1) * sizeof(uint32_t)
+   uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
+   if ( memcmp( &work->data[1], &g_work->data[1], wkcmp_sz )
+       && ( clean_job || ( *nonceptr >= *end_nonce_ptr ) ) )
+   {
+      work_free( work );
+      work_copy( work, g_work );
+      *nonceptr = ( 0xffffffffU / opt_n_threads ) * thr_id;
+      if ( opt_randomize )
+         // Multiply as unsigned: rand() * 4 in int overflows (undefined
+         // behaviour) whenever rand() exceeds INT_MAX / 4.
+         *nonceptr += ( (uint32_t)rand() * 4u ) / opt_n_threads;
+      *end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20;
+   }
+   else
+       ++(*nonceptr);
+}
+
+/* Set the work target from the job difficulty, scaled down by
+ * 65536 * opt_diff_factor. */
+void drop_set_target( struct work* work, double job_diff )
+{
+ const double divisor = 65536.0 * opt_diff_factor;
+
+ work_set_target( work, job_diff / divisor );
+}
+ 
+/* Log the first header word when its PoK flag bit (POK_BOOL_MASK) is set. */
+void drop_display_pok( struct work* work )
+{
+      if ( work->data[0] & POK_BOOL_MASK )
+        applog(LOG_BLUE, "POK received: %08x", work->data[0] );
+}
+
+// Need to fix POK offset problems like zr5
+/*
+ * Install the Drop handlers into the algo gate. Always returns true.
+ *
+ * NOTE(review): the three hash slots receive droplp_hash_pok, whose
+ * signature is (void*, uint32_t*, uint32_t); the void* casts hide whether
+ * that matches what the gate expects -- confirm against the gate's
+ * function-pointer types.
+ * NOTE(review): set_target is wired to scrypt_set_target although this file
+ * defines drop_set_target -- confirm the two are equivalent.
+ */
+bool register_drop_algo( algo_gate_t* gate )
+{
+    algo_not_tested();
+    gate->scanhash              = (void*)&scanhash_drop;
+    gate->hash                  = (void*)&droplp_hash_pok;
+    gate->hash_alt              = (void*)&droplp_hash_pok;
+    gate->hash_suw              = (void*)&droplp_hash_pok;
+    gate->get_new_work          = (void*)&drop_get_new_work;
+    gate->set_target            = (void*)&scrypt_set_target;
+    gate->build_stratum_request = (void*)&std_be_build_stratum_request;
+    gate->set_work_data_endian  = (void*)&swab_work_data;
+    gate->display_extra_data    = (void*)&drop_display_pok;
+    gate->work_data_size        = 80;
+    gate->work_cmp_size         = 72;
+    return true;
+}
+
--- a/algo/echo/.dirstamp
+++ b/algo/echo/.dirstamp
--- a/algo/echo/aes_ni/.dirstamp
+++ b/algo/echo/aes_ni/.dirstamp
--- a/algo/echo/aes_ni/api.h
+++ b/algo/echo/aes_ni/api.h
@@ -0,0 +1,2 @@
+#define CRYPTO_BYTES 64
+#define CRYPTO_VERSION "1.208"
--- a/algo/echo/aes_ni/architectures
+++ b/algo/echo/aes_ni/architectures
@@ -0,0 +1,2 @@
+amd64
+x86
--- a/algo/echo/aes_ni/hash.c
+++ b/algo/echo/aes_ni/hash.c
@@ -0,0 +1,623 @@
+/*
+ * file        : echo_vperm.c
+ * version     : 1.0.208
+ * date        : 14.12.2010
+ * 
+ * - vperm and aes_ni implementations of hash function ECHO
+ * - implements NIST hash api
+ * - assumes that message length is a multiple of 8 bits
+ * - _ECHO_VPERM_ must be defined if compiling with ../main.c
+ * - define NO_AES_NI to build the vperm version instead of the aes_ni version
+ *
+ * Cagdas Calik
+ * ccalik@metu.edu.tr
+ * Institute of Applied Mathematics, Middle East Technical University, Turkey.
+ *
+ */
+
+#include <memory.h>
+#include "miner.h"
+#include "hash_api.h"
+#include "vperm.h"
+
+#ifndef NO_AES_NI
+#include <wmmintrin.h>
+#else
+#include <tmmintrin.h>
+#endif
+
+
+/* Lookup tables for the vperm (NO_AES_NI) code path. _k_s0F is the
+ * low-nibble byte mask used by TRANSFORM; _k_ipt and _k_opt are the tables
+ * Compress passes to TRANSFORM on the way in and out. The remaining _k_*
+ * tables are presumably consumed by the AES_ROUND_VPERM macros in vperm.h --
+ * confirm there. */
+MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
+MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
+MYALIGN const unsigned int _k_opt[] = {0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121, 0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1};
+MYALIGN const unsigned int _k_inv[] = {0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309, 0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C};
+MYALIGN const unsigned int _k_sb1[] = {0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E, 0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1};
+MYALIGN const unsigned int _k_sb2[] = {0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955, 0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8};
+MYALIGN const unsigned int _k_sb3[] = {0xC0211A00, 0x53E17249, 0xA8B2DA89, 0xFB68933B, 0xF0030A00, 0x5FF35C55, 0xA6ACFAA5, 0xF956AF09};
+MYALIGN const unsigned int _k_sb4[] = {0x3FD64100, 0xE1E937A0, 0x49087E9F, 0xA876DE97, 0xC393EA00, 0x3D50AED7, 0x876D2914, 0xBA44FE79};
+MYALIGN const unsigned int _k_sb5[] = {0xF4867F00, 0x5072D62F, 0x5D228BDB, 0x0DA9A4F9, 0x3971C900, 0x0B487AC2, 0x8A43F0FB, 0x81B332B8};
+MYALIGN const unsigned int _k_sb7[] = {0xFFF75B00, 0xB20845E9, 0xE1BAA416, 0x531E4DAC, 0x3390E000, 0x62A3F282, 0x21C1D3B1, 0x43125170};
+MYALIGN const unsigned int _k_sbo[] = {0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A, 0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1};
+MYALIGN const unsigned int _k_h63[] = {0x63636363, 0x63636363, 0x63636363, 0x63636363};
+MYALIGN const unsigned int _k_hc6[] = {0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6};
+MYALIGN const unsigned int _k_h5b[] = {0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b};
+MYALIGN const unsigned int _k_h4e[] = {0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e};
+MYALIGN const unsigned int _k_h0e[] = {0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e};
+MYALIGN const unsigned int _k_h15[] = {0x15151515, 0x15151515, 0x15151515, 0x15151515};
+MYALIGN const unsigned int _k_aesmix1[] = {0x0f0a0500, 0x030e0904, 0x07020d08, 0x0b06010c};
+MYALIGN const unsigned int _k_aesmix2[] = {0x000f0a05, 0x04030e09, 0x0807020d, 0x0c0b0601};
+MYALIGN const unsigned int _k_aesmix3[] = {0x05000f0a, 0x0904030e, 0x0d080702, 0x010c0b06};
+MYALIGN const unsigned int _k_aesmix4[] = {0x0a05000f, 0x0e090403, 0x020d0807, 0x06010c0b};
+
+
+/* Constants used by the round macros in this file:
+ *   const1   - added to k1 after every ECHO_SUBBYTES / ECHO_SUB_AND_MIX step
+ *              (only the lowest 32-bit lane is non-zero)
+ *   mul2mask - shuffle table in ECHO_MIXBYTES: index 0 yields 0x00, index 1
+ *              yields 0x1b
+ *   lsbmask  - keeps bit 0 of every byte in ECHO_MIXBYTES
+ *   zero     - all-zero key for the second AES round in ECHO_SUBBYTES and
+ *              the reset value for state words in the vperm round loop
+ *   mul2ipt  - table passed to TRANSFORM inside ECHO_SUB_AND_MIX
+ * invshiftrows is not referenced by the code in this file's round macros. */
+MYALIGN const unsigned int 	const1[]		= {0x00000001, 0x00000000, 0x00000000, 0x00000000};
+MYALIGN const unsigned int	mul2mask[]		= {0x00001b00, 0x00000000, 0x00000000, 0x00000000};
+MYALIGN const unsigned int	lsbmask[]		= {0x01010101, 0x01010101, 0x01010101, 0x01010101};
+MYALIGN const unsigned int	invshiftrows[]	= {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
+MYALIGN const unsigned int	zero[]			= {0x00000000, 0x00000000, 0x00000000, 0x00000000};
+MYALIGN const unsigned int	mul2ipt[]		= {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234};
+
+
+//#include "crypto_hash.h"
+
+ /*
+  * Byte-oriented wrapper around hash_echo() for a 512-bit digest.
+  * inlen is a byte count and is converted to bits for hash_echo().
+  * Returns 0 when hash_echo() reports SUCCESS, -1 otherwise.
+  */
+ int crypto_hash(
+   unsigned char *out,
+   const unsigned char *in,
+   unsigned long long inlen
+ )
+ {
+	 const HashReturn rc = hash_echo(512, in, inlen * 8, out);
+
+	 return (rc == SUCCESS) ? 0 : -1;
+ }
+
+/*
+int main()
+{
+	return 0;
+}
+*/
+
+#if 0
+void DumpState(__m128i *ps)
+{
+	int i, j, k;
+	unsigned int ucol;
+
+	for(j = 0; j < 4; j++)
+	{
+		for(i = 0; i < 4; i++)
+		{
+			printf("row %d,col %d : ", i, j);
+			for(k = 0; k < 4; k++)
+			{
+				ucol = *((int*)ps + 16 * i + 4 * j + k);
+				printf("%02x%02x%02x%02x ", (ucol >> 0) & 0xff, (ucol >> 8) & 0xff, (ucol >> 16) & 0xff, (ucol >> 24) & 0xff);
+			}
+
+			printf("\n");
+		}
+	}
+
+	printf("\n");
+}
+#endif
+
+
+
+
+/*
+ * ECHO_SUBBYTES(state, i, j): apply two AES rounds to state[i][j], the first
+ * keyed with k1 and the second with an all-zero key, then add const1 to k1.
+ * The AES-NI build uses _mm_aesenc_si128; the NO_AES_NI build uses
+ * AES_ROUND_VPERM with an explicit XOR of k1 in between.
+ *
+ * These macros expand to several statements without a do/while wrapper and
+ * read/write k1 (plus t1..t4, s1..s3 and ktemp in the vperm build) from the
+ * enclosing scope, so they must be used as full statements inside braces.
+ */
+#ifndef NO_AES_NI
+#define ECHO_SUBBYTES(state, i, j) \
+				state[i][j] = _mm_aesenc_si128(state[i][j], k1);\
+				state[i][j] = _mm_aesenc_si128(state[i][j], M128(zero));\
+				k1 = _mm_add_epi32(k1, M128(const1))
+#else
+#define ECHO_SUBBYTES(state, i, j) \
+				AES_ROUND_VPERM(state[i][j], t1, t2, t3, t4, s1, s2, s3);\
+				state[i][j] = _mm_xor_si128(state[i][j], k1);\
+				AES_ROUND_VPERM(state[i][j], t1, t2, t3, t4, s1, s2, s3);\
+				k1 = _mm_add_epi32(k1, M128(const1))
+
+/*
+ * ECHO_SUB_AND_MIX(state, i, j, state2, c, r1, r2, r3, r4): vperm-only fused
+ * step. Runs the two-round substitution on state[i][j] (k1 is first passed
+ * through TRANSFORM with _k_ipt), then XORs the result into column c of
+ * state2: s2 (the mul2ipt transform of the result) into row r1, the result
+ * itself into rows r2 and r3, and s3 = result ^ s2 into row r4. state2 must
+ * have been zeroed beforehand because the macro only accumulates into it.
+ */
+#define ECHO_SUB_AND_MIX(state, i, j, state2, c, r1, r2, r3, r4) \
+				AES_ROUND_VPERM_CORE(state[i][j], t1, t2, t3, t4, s1, s2, s3);\
+				ktemp = k1;\
+				TRANSFORM(ktemp, _k_ipt, t1, t4);\
+				state[i][j] = _mm_xor_si128(state[i][j], ktemp);\
+				AES_ROUND_VPERM_CORE(state[i][j], t1, t2, t3, t4, s1, s2, s3);\
+				k1 = _mm_add_epi32(k1, M128(const1));\
+				s1 = state[i][j];\
+				s2 = s1;\
+				TRANSFORM(s2, mul2ipt, t1, t2);\
+				s3 = _mm_xor_si128(s1, s2);\
+				state2[r1][c] = _mm_xor_si128(state2[r1][c], s2);\
+				state2[r2][c] = _mm_xor_si128(state2[r2][c], s1);\
+				state2[r3][c] = _mm_xor_si128(state2[r3][c], s1);\
+				state2[r4][c] = _mm_xor_si128(state2[r4][c], s3)
+
+
+
+#endif
+
+
+/*
+ * ECHO_MIXBYTES(state1, state2, j, t1, t2, s2): compute column j of state2
+ * from state1. Row r of the input is taken from column (j + r) & 3, so the
+ * column rotation is folded into the indexing. With a0..a3 denoting those
+ * four inputs, the outputs are
+ *   state2[0][j] = 2*a0 ^ 3*a1 ^   a2 ^   a3
+ *   state2[1][j] =   a0 ^ 2*a1 ^ 3*a2 ^   a3
+ *   state2[2][j] =   a0 ^   a1 ^ 2*a2 ^ 3*a3
+ *   state2[3][j] = 3*a0 ^   a1 ^   a2 ^ 2*a3
+ * Doubling is done per byte: _mm_add_epi8 doubles, the shift/mask pair
+ * extracts each byte's former top bit, and the shuffle through mul2mask
+ * turns that bit into 0x1b, which is XOR-ed in. t1, t2 and s2 are scratch.
+ */
+#define ECHO_MIXBYTES(state1, state2, j, t1, t2, s2) \
+				s2 = _mm_add_epi8(state1[0][j], state1[0][j]);\
+				t1 = _mm_srli_epi16(state1[0][j], 7);\
+				t1 = _mm_and_si128(t1, M128(lsbmask));\
+				t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
+				s2 = _mm_xor_si128(s2, t2);\
+				state2[0][j] = s2;\
+				state2[1][j] = state1[0][j];\
+				state2[2][j] = state1[0][j];\
+				state2[3][j] = _mm_xor_si128(s2, state1[0][j]);\
+				s2 = _mm_add_epi8(state1[1][(j + 1) & 3], state1[1][(j + 1) & 3]);\
+				t1 = _mm_srli_epi16(state1[1][(j + 1) & 3], 7);\
+				t1 = _mm_and_si128(t1, M128(lsbmask));\
+				t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
+				s2 = _mm_xor_si128(s2, t2);\
+				state2[0][j] = _mm_xor_si128(state2[0][j], _mm_xor_si128(s2, state1[1][(j + 1) & 3]));\
+				state2[1][j] = _mm_xor_si128(state2[1][j], s2);\
+				state2[2][j] = _mm_xor_si128(state2[2][j], state1[1][(j + 1) & 3]);\
+				state2[3][j] = _mm_xor_si128(state2[3][j], state1[1][(j + 1) & 3]);\
+				s2 = _mm_add_epi8(state1[2][(j + 2) & 3], state1[2][(j + 2) & 3]);\
+				t1 = _mm_srli_epi16(state1[2][(j + 2) & 3], 7);\
+				t1 = _mm_and_si128(t1, M128(lsbmask));\
+				t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
+				s2 = _mm_xor_si128(s2, t2);\
+				state2[0][j] = _mm_xor_si128(state2[0][j], state1[2][(j + 2) & 3]);\
+				state2[1][j] = _mm_xor_si128(state2[1][j], _mm_xor_si128(s2, state1[2][(j + 2) & 3]));\
+				state2[2][j] = _mm_xor_si128(state2[2][j], s2);\
+				state2[3][j] = _mm_xor_si128(state2[3][j], state1[2][(j + 2) & 3]);\
+				s2 = _mm_add_epi8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\
+				t1 = _mm_srli_epi16(state1[3][(j + 3) & 3], 7);\
+				t1 = _mm_and_si128(t1, M128(lsbmask));\
+				t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
+				s2 = _mm_xor_si128(s2, t2);\
+				state2[0][j] = _mm_xor_si128(state2[0][j], state1[3][(j + 3) & 3]);\
+				state2[1][j] = _mm_xor_si128(state2[1][j], state1[3][(j + 3) & 3]);\
+				state2[2][j] = _mm_xor_si128(state2[2][j], _mm_xor_si128(s2, state1[3][(j + 3) & 3]));\
+				state2[3][j] = _mm_xor_si128(state2[3][j], s2)
+
+
+/*
+ * ECHO_ROUND_UNROLL2: two consecutive rounds, fully unrolled. The first
+ * round substitutes all sixteen words of _state (column by column, which
+ * fixes the order in which k1 is incremented) and mixes them into _state2;
+ * the second round does the same from _state2 back into _state. Expects
+ * _state, _state2, k1, t1, t2 and s2 in the enclosing scope.
+ */
+#define ECHO_ROUND_UNROLL2 \
+			ECHO_SUBBYTES(_state, 0, 0);\
+			ECHO_SUBBYTES(_state, 1, 0);\
+			ECHO_SUBBYTES(_state, 2, 0);\
+			ECHO_SUBBYTES(_state, 3, 0);\
+			ECHO_SUBBYTES(_state, 0, 1);\
+			ECHO_SUBBYTES(_state, 1, 1);\
+			ECHO_SUBBYTES(_state, 2, 1);\
+			ECHO_SUBBYTES(_state, 3, 1);\
+			ECHO_SUBBYTES(_state, 0, 2);\
+			ECHO_SUBBYTES(_state, 1, 2);\
+			ECHO_SUBBYTES(_state, 2, 2);\
+			ECHO_SUBBYTES(_state, 3, 2);\
+			ECHO_SUBBYTES(_state, 0, 3);\
+			ECHO_SUBBYTES(_state, 1, 3);\
+			ECHO_SUBBYTES(_state, 2, 3);\
+			ECHO_SUBBYTES(_state, 3, 3);\
+			ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
+			ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
+			ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
+			ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
+			ECHO_SUBBYTES(_state2, 0, 0);\
+			ECHO_SUBBYTES(_state2, 1, 0);\
+			ECHO_SUBBYTES(_state2, 2, 0);\
+			ECHO_SUBBYTES(_state2, 3, 0);\
+			ECHO_SUBBYTES(_state2, 0, 1);\
+			ECHO_SUBBYTES(_state2, 1, 1);\
+			ECHO_SUBBYTES(_state2, 2, 1);\
+			ECHO_SUBBYTES(_state2, 3, 1);\
+			ECHO_SUBBYTES(_state2, 0, 2);\
+			ECHO_SUBBYTES(_state2, 1, 2);\
+			ECHO_SUBBYTES(_state2, 2, 2);\
+			ECHO_SUBBYTES(_state2, 3, 2);\
+			ECHO_SUBBYTES(_state2, 0, 3);\
+			ECHO_SUBBYTES(_state2, 1, 3);\
+			ECHO_SUBBYTES(_state2, 2, 3);\
+			ECHO_SUBBYTES(_state2, 3, 3);\
+			ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
+			ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
+			ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
+			ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
+
+
+
+/*
+ * SAVESTATE(dst, src): copy all sixteen words of the 4x4 state src into dst.
+ * Expands to sixteen separate assignments, so use it as a full statement.
+ */
+#define SAVESTATE(dst, src)\
+		dst[0][0] = src[0][0];\
+		dst[0][1] = src[0][1];\
+		dst[0][2] = src[0][2];\
+		dst[0][3] = src[0][3];\
+		dst[1][0] = src[1][0];\
+		dst[1][1] = src[1][1];\
+		dst[1][2] = src[1][2];\
+		dst[1][3] = src[1][3];\
+		dst[2][0] = src[2][0];\
+		dst[2][1] = src[2][1];\
+		dst[2][2] = src[2][2];\
+		dst[2][3] = src[2][3];\
+		dst[3][0] = src[3][0];\
+		dst[3][1] = src[3][1];\
+		dst[3][2] = src[3][2];\
+		dst[3][3] = src[3][3]
+
+
+/*
+ * Absorb uBlockCount consecutive message blocks into the chaining value.
+ *
+ * ctx         - hash context; ctx->k is advanced by ctx->const1536 per block
+ *               and ctx->state is overwritten with the final working state
+ * pmsg        - message data, uBlockCount * ctx->uBlockLength bytes
+ *               (read with unaligned loads)
+ * uBlockCount - number of whole blocks to process
+ *
+ * The first ctx->uHashSize / 256 columns of the 4x4 state hold the chaining
+ * value; the remaining columns are filled with the message block.
+ */
+void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
+{
+	unsigned int r, b, i, j;
+	__m128i t1, t2, t3, t4, s1, s2, s3, k1, ktemp;
+	__m128i _state[4][4], _state2[4][4], _statebackup[4][4]; 
+
+
+	// load the chaining value columns
+	for(i = 0; i < 4; i++)
+		for(j = 0; j < ctx->uHashSize / 256; j++)
+			_state[i][j] = ctx->state[i][j];
+
+
+#ifdef NO_AES_NI
+	// transform cv
+	for(i = 0; i < 4; i++)
+		for(j = 0; j < ctx->uHashSize / 256; j++)
+		{
+			TRANSFORM(_state[i][j], _k_ipt, t1, t2);
+		}
+#endif
+
+	for(b = 0; b < uBlockCount; b++)
+	{
+		// advance the counter by one block's worth of bits
+		ctx->k = _mm_add_epi64(ctx->k, ctx->const1536);
+
+		// load message
+		for(j = ctx->uHashSize / 256; j < 4; j++)
+		{
+			for(i = 0; i < 4; i++)
+			{
+				_state[i][j] = _mm_loadu_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i);
+
+#ifdef NO_AES_NI
+				// transform message
+				TRANSFORM(_state[i][j], _k_ipt, t1, t2);
+#endif
+			}
+		}
+
+		// save state
+		SAVESTATE(_statebackup, _state);
+
+
+		// k1 is the per-word key; the round macros increment it after every word
+		k1 = ctx->k;
+
+#ifndef NO_AES_NI
+		// each iteration performs two rounds
+		for(r = 0; r < ctx->uRounds / 2; r++)
+		{
+			ECHO_ROUND_UNROLL2;
+		}
+
+#else
+		// each iteration performs two rounds; the target state is zeroed first
+		// because ECHO_SUB_AND_MIX only XOR-accumulates into it
+		for(r = 0; r < ctx->uRounds / 2; r++)
+		{
+			_state2[0][0] = M128(zero); _state2[1][0] = M128(zero); _state2[2][0] = M128(zero); _state2[3][0] = M128(zero);
+			_state2[0][1] = M128(zero); _state2[1][1] = M128(zero); _state2[2][1] = M128(zero); _state2[3][1] = M128(zero);
+			_state2[0][2] = M128(zero); _state2[1][2] = M128(zero); _state2[2][2] = M128(zero); _state2[3][2] = M128(zero);
+			_state2[0][3] = M128(zero); _state2[1][3] = M128(zero); _state2[2][3] = M128(zero); _state2[3][3] = M128(zero);																			
+
+			ECHO_SUB_AND_MIX(_state, 0, 0, _state2, 0, 0, 1, 2, 3);
+			ECHO_SUB_AND_MIX(_state, 1, 0, _state2, 3, 1, 2, 3, 0);
+			ECHO_SUB_AND_MIX(_state, 2, 0, _state2, 2, 2, 3, 0, 1);
+			ECHO_SUB_AND_MIX(_state, 3, 0, _state2, 1, 3, 0, 1, 2);
+			ECHO_SUB_AND_MIX(_state, 0, 1, _state2, 1, 0, 1, 2, 3);
+			ECHO_SUB_AND_MIX(_state, 1, 1, _state2, 0, 1, 2, 3, 0);
+			ECHO_SUB_AND_MIX(_state, 2, 1, _state2, 3, 2, 3, 0, 1);
+			ECHO_SUB_AND_MIX(_state, 3, 1, _state2, 2, 3, 0, 1, 2);
+			ECHO_SUB_AND_MIX(_state, 0, 2, _state2, 2, 0, 1, 2, 3);
+			ECHO_SUB_AND_MIX(_state, 1, 2, _state2, 1, 1, 2, 3, 0);
+			ECHO_SUB_AND_MIX(_state, 2, 2, _state2, 0, 2, 3, 0, 1);
+			ECHO_SUB_AND_MIX(_state, 3, 2, _state2, 3, 3, 0, 1, 2);
+			ECHO_SUB_AND_MIX(_state, 0, 3, _state2, 3, 0, 1, 2, 3);
+			ECHO_SUB_AND_MIX(_state, 1, 3, _state2, 2, 1, 2, 3, 0);
+			ECHO_SUB_AND_MIX(_state, 2, 3, _state2, 1, 2, 3, 0, 1);
+			ECHO_SUB_AND_MIX(_state, 3, 3, _state2, 0, 3, 0, 1, 2);
+
+			_state[0][0] = M128(zero); _state[1][0] = M128(zero); _state[2][0] = M128(zero); _state[3][0] = M128(zero);
+			_state[0][1] = M128(zero); _state[1][1] = M128(zero); _state[2][1] = M128(zero); _state[3][1] = M128(zero);
+			_state[0][2] = M128(zero); _state[1][2] = M128(zero); _state[2][2] = M128(zero); _state[3][2] = M128(zero);
+			_state[0][3] = M128(zero); _state[1][3] = M128(zero); _state[2][3] = M128(zero); _state[3][3] = M128(zero);																			
+
+			ECHO_SUB_AND_MIX(_state2, 0, 0, _state, 0, 0, 1, 2, 3);
+			ECHO_SUB_AND_MIX(_state2, 1, 0, _state, 3, 1, 2, 3, 0);
+			ECHO_SUB_AND_MIX(_state2, 2, 0, _state, 2, 2, 3, 0, 1);
+			ECHO_SUB_AND_MIX(_state2, 3, 0, _state, 1, 3, 0, 1, 2);
+			ECHO_SUB_AND_MIX(_state2, 0, 1, _state, 1, 0, 1, 2, 3);
+			ECHO_SUB_AND_MIX(_state2, 1, 1, _state, 0, 1, 2, 3, 0);
+			ECHO_SUB_AND_MIX(_state2, 2, 1, _state, 3, 2, 3, 0, 1);
+			ECHO_SUB_AND_MIX(_state2, 3, 1, _state, 2, 3, 0, 1, 2);
+			ECHO_SUB_AND_MIX(_state2, 0, 2, _state, 2, 0, 1, 2, 3);
+			ECHO_SUB_AND_MIX(_state2, 1, 2, _state, 1, 1, 2, 3, 0);
+			ECHO_SUB_AND_MIX(_state2, 2, 2, _state, 0, 2, 3, 0, 1);
+			ECHO_SUB_AND_MIX(_state2, 3, 2, _state, 3, 3, 0, 1, 2);
+			ECHO_SUB_AND_MIX(_state2, 0, 3, _state, 3, 0, 1, 2, 3);
+			ECHO_SUB_AND_MIX(_state2, 1, 3, _state, 2, 1, 2, 3, 0);
+			ECHO_SUB_AND_MIX(_state2, 2, 3, _state, 1, 2, 3, 0, 1);
+			ECHO_SUB_AND_MIX(_state2, 3, 3, _state, 0, 3, 0, 1, 2);
+
+		}
+#endif
+
+		
+		// feed-forward: fold the round output and the saved input into the
+		// chaining columns (one column for 256-bit, two for 512-bit)
+		if(ctx->uHashSize == 256)
+		{
+			for(i = 0; i < 4; i++)
+			{
+				_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]);
+				_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
+				_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]);
+
+				_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
+				_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]);
+				_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
+				_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][3]);
+			}
+		}
+		else
+		{
+			for(i = 0; i < 4; i++)
+			{
+				_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
+				_state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]);
+
+				_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
+				_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
+
+				_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]);
+				_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]);
+			}
+		}
+
+		pmsg += ctx->uBlockLength;
+	}
+
+#ifdef NO_AES_NI
+	// transform state
+	for(i = 0; i < 4; i++)
+		for(j = 0; j < 4; j++)
+		{
+			TRANSFORM(_state[i][j], _k_opt, t1, t2);
+		}
+#endif
+
+		// write back all sixteen words, message columns included
+		SAVESTATE(ctx->state, _state);
+
+}
+
+
+
+/*
+ * Initialise an ECHO context for a 256- or 512-bit digest.
+ *
+ * ctx       - context to initialise
+ * nHashSize - digest size in bits; only 256 and 512 are accepted
+ *
+ * Returns SUCCESS, or BAD_HASHBITLEN for any other size (the counter and
+ * buffer fields have already been cleared at that point).
+ *
+ *   size  block bytes  rounds  bits per block (const1536)
+ *   256   192          8       0x600
+ *   512   128          10      0x400
+ */
+HashReturn init_echo(hashState_echo *ctx, int nHashSize)
+{
+	int i, j;
+
+	/* Zero the counter directly; XOR-ing ctx->k with itself would read an
+	 * uninitialised value when the context lives on the stack. */
+	ctx->k = _mm_setzero_si128();
+	ctx->processed_bits = 0;
+	ctx->uBufferBytes = 0;
+
+	switch(nHashSize)
+	{
+		case 256:
+			ctx->uHashSize = 256;
+			ctx->uBlockLength = 192;
+			ctx->uRounds = 8;
+			ctx->hashsize = _mm_set_epi32(0, 0, 0, 0x00000100);
+			ctx->const1536 = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000600);
+			break;
+
+		case 512:
+			ctx->uHashSize = 512;
+			ctx->uBlockLength = 128;
+			ctx->uRounds = 10;
+			ctx->hashsize = _mm_set_epi32(0, 0, 0, 0x00000200);
+			ctx->const1536 = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000400);
+			break;
+
+		default:
+			return BAD_HASHBITLEN;
+	}
+
+	/* Chaining columns start out holding the hash size; the message
+	 * columns start out zero. */
+	for(i = 0; i < 4; i++)
+		for(j = 0; j < nHashSize / 256; j++)
+			ctx->state[i][j] = ctx->hashsize;
+
+	for(i = 0; i < 4; i++)
+		for(j = nHashSize / 256; j < 4; j++)
+			ctx->state[i][j] = _mm_setzero_si128();
+
+	return SUCCESS;
+}
+
+/*
+ * Absorb message data into the context.
+ *
+ * state      - context set up by init_echo()
+ * data       - message bytes
+ * databitlen - message length in bits; only whole bytes are consumed
+ *              (databitlen / 8), and the byte count is held in an
+ *              unsigned int
+ *
+ * Whole blocks are compressed immediately; any trailing partial block is
+ * kept in state->buffer for the next call or for final_echo().
+ * Always returns SUCCESS.
+ */
+HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen)
+{
+	unsigned int uByteLength, uBlockCount, uRemainingBytes;
+
+	uByteLength = (unsigned int)(databitlen / 8);
+
+	/* uBufferBytes is always below uBlockLength, so the subtraction cannot
+	 * wrap, whereas uBufferBytes + uByteLength could. */
+	if(uByteLength >= state->uBlockLength - state->uBufferBytes)
+	{
+		if(state->uBufferBytes != 0)
+		{
+			// Fill the buffer
+			memcpy(state->buffer + state->uBufferBytes, data, state->uBlockLength - state->uBufferBytes);
+
+			// Process buffer
+			Compress(state, state->buffer, 1);
+			state->processed_bits += (DataLength)state->uBlockLength * 8;
+
+			data += state->uBlockLength - state->uBufferBytes;
+			uByteLength -= state->uBlockLength - state->uBufferBytes;
+		}
+
+		// buffer now does not contain any unprocessed bytes
+
+		uBlockCount = uByteLength / state->uBlockLength;
+		uRemainingBytes = uByteLength % state->uBlockLength;
+
+		if(uBlockCount > 0)
+		{
+			Compress(state, data, uBlockCount);
+
+			/* Widen before multiplying: in unsigned int the bit count
+			 * wraps once the input reaches 2^32 bits (512 MiB). */
+			state->processed_bits += (DataLength)uBlockCount * state->uBlockLength * 8;
+			data += uBlockCount * state->uBlockLength;
+		}
+
+		if(uRemainingBytes > 0)
+		{
+			memcpy(state->buffer, data, uRemainingBytes);
+		}
+
+		state->uBufferBytes = uRemainingBytes;
+	}
+	else
+	{
+		memcpy(state->buffer + state->uBufferBytes, data, uByteLength);
+		state->uBufferBytes += uByteLength;
+	}
+
+	return SUCCESS;
+}
+
+/*
+ * Write the padding trailer into the last 18 bytes of state->buffer: the
+ * hash size as an unsigned short, then processed_bits, then a zero
+ * DataLength. The values are stored in host byte order via memcpy, so no
+ * alignment or aliasing assumptions are made about the byte buffer.
+ */
+static void echo_store_trailer(hashState_echo *state)
+{
+	BitSequence *tail = state->buffer + state->uBlockLength - 18;
+	const unsigned short hashsize = (unsigned short)state->uHashSize;
+	const DataLength zero_hi = 0;
+
+	memcpy(tail, &hashsize, sizeof hashsize);
+	memcpy(tail + 2, &state->processed_bits, sizeof state->processed_bits);
+	memcpy(tail + 10, &zero_hi, sizeof zero_hi);
+}
+
+/*
+ * Pad the buffered data, compress the final block(s) and emit the digest.
+ *
+ * state   - context that has received all message data
+ * hashval - receives uHashSize / 8 bytes (32 or 64), written with
+ *           unaligned stores
+ *
+ * Padding is a 0x80 byte, zeros, and the 18-byte trailer. When fewer than
+ * 18 bytes remain after the 0x80 byte, the trailer goes into an additional
+ * block. A block that carries no message bits is compressed with the
+ * counter at zero. Always returns SUCCESS.
+ */
+HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
+{
+	__m128i remainingbits;
+
+	// Add remaining bytes in the buffer
+	state->processed_bits += state->uBufferBytes * 8;
+
+	remainingbits = _mm_set_epi32(0, 0, 0, state->uBufferBytes * 8);
+
+	// Pad with 0x80
+	state->buffer[state->uBufferBytes++] = 0x80;
+
+	// Enough buffer space for padding in this block?
+	if((state->uBlockLength - state->uBufferBytes) >= 18)
+	{
+		// Pad with zeros
+		memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18));
+
+		// Hash size and processed bits
+		echo_store_trailer(state);
+
+		// Compress() adds const1536 back, so subtract it here to land on
+		// the intended counter value.
+		if(state->uBufferBytes == 1)
+		{
+			// Only the 0x80 byte is buffered: no message bits in this block
+			state->k = _mm_setzero_si128();
+			state->k = _mm_sub_epi64(state->k, state->const1536);
+		}
+		else
+		{
+			state->k = _mm_add_epi64(state->k, remainingbits);
+			state->k = _mm_sub_epi64(state->k, state->const1536);
+		}
+
+		// Compress
+		Compress(state, state->buffer, 1);
+	}
+	else
+	{
+		// Fill with zero and compress
+		memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - state->uBufferBytes);
+		state->k = _mm_add_epi64(state->k, remainingbits);
+		state->k = _mm_sub_epi64(state->k, state->const1536);
+		Compress(state, state->buffer, 1);
+
+		// Last block
+		memset(state->buffer, 0, state->uBlockLength - 18);
+
+		// Hash size and processed bits
+		echo_store_trailer(state);
+
+		// Compress the last block, which carries no message bits
+		state->k = _mm_setzero_si128();
+		state->k = _mm_sub_epi64(state->k, state->const1536);
+		Compress(state, state->buffer, 1);
+	}
+
+	// Store the hash value
+	_mm_storeu_si128((__m128i*)hashval + 0, state->state[0][0]);
+	_mm_storeu_si128((__m128i*)hashval + 1, state->state[1][0]);
+
+	if(state->uHashSize == 512)
+	{
+		_mm_storeu_si128((__m128i*)hashval + 2, state->state[2][0]);
+		_mm_storeu_si128((__m128i*)hashval + 3, state->state[3][0]);
+	}
+
+	return SUCCESS;
+}
+
+
+
+/*
+ * One-shot ECHO hash: init, update and final on a local context.
+ *
+ * hashbitlen - digest size in bits, passed to init_echo()
+ * data       - message bytes
+ * databitlen - message length in bits
+ * hashval    - receives the digest
+ *
+ * Returns the status of the first stage that does not report SUCCESS, or
+ * SUCCESS when all three stages succeed.
+ */
+HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
+{
+	hashState_echo ctx;
+	HashReturn rc;
+
+	rc = init_echo(&ctx, hashbitlen);
+
+	if(rc == SUCCESS)
+		rc = update_echo(&ctx, data, databitlen);
+
+	if(rc == SUCCESS)
+		rc = final_echo(&ctx, hashval);
+
+	return rc;
+}
+
+
--- a/algo/echo/aes_ni/hash_api.h
+++ b/algo/echo/aes_ni/hash_api.h
@@ -0,0 +1,58 @@
+/*
+ * file        : hash_api.h
+ * version     : 1.0.208
+ * date        : 14.12.2010
+ * 
+ * ECHO vperm implementation Hash API
+ *
+ * Cagdas Calik
+ * ccalik@metu.edu.tr
+ * Institute of Applied Mathematics, Middle East Technical University, Turkey.
+ *
+ */
+
+
+#ifndef HASH_API_H
+#define HASH_API_H
+
+#ifndef NO_AES_NI
+#define HASH_IMPL_STR	"ECHO-aesni"
+#else
+#define HASH_IMPL_STR	"ECHO-vperm"
+#endif
+
+
+#include "algo/sha3/sha3_common.h"
+
+#include <emmintrin.h>
+
+
+/* Running state of one ECHO hash computation. */
+typedef struct
+{
+	__m128i			state[4][4];	/* 4x4 working state; the first uHashSize / 256 columns hold the chaining value */
+	__m128i			k;				/* counter, advanced by const1536 for every compressed block */
+	__m128i			hashsize;		/* hash size in bits in the lowest lane; initial value of the chaining columns */
+	__m128i			const1536;		/* bits per block in the lowest lane: 0x600 for 256-bit, 0x400 for 512-bit */
+
+	unsigned int	uRounds;		/* 8 for 256-bit, 10 for 512-bit */
+	unsigned int	uHashSize;		/* digest size in bits: 256 or 512 */
+	unsigned int	uBlockLength;	/* message block size in bytes: 192 or 128 */
+	unsigned int	uBufferBytes;	/* bytes currently held in buffer */
+	DataLength		processed_bits;	/* message bits absorbed so far */
+	BitSequence		buffer[192];	/* partial block awaiting compression; sized for the larger block length */
+
+} hashState_echo;
+
+/* Initialise *state for a 256- or 512-bit digest; other sizes yield BAD_HASHBITLEN. */
+HashReturn init_echo(hashState_echo *state, int hashbitlen);
+
+/* NOTE(review): declared here, but hash.c does not appear to define it --
+ * confirm before calling. */
+HashReturn reinit_echo(hashState_echo *state);
+
+/* Absorb databitlen / 8 bytes of data; whole blocks are compressed, the rest is buffered. */
+HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen);
+
+/* Pad, compress the final block(s) and write the digest (32 or 64 bytes) to hashval. */
+HashReturn final_echo(hashState_echo *state, BitSequence *hashval);
+
+/* One-shot convenience wrapper: init_echo + update_echo + final_echo on a local context. */
+HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval);
+
+
+#endif // HASH_API_H
+
--- a/algo/echo/aes_ni/implementors
+++ b/algo/echo/aes_ni/implementors
@@ -0,0 +1 @@
+Çağdaş Çalık
--- a/algo/echo/aes_ni/vperm.h
+++ b/algo/echo/aes_ni/vperm.h
@@ -0,0 +1,119 @@
+/*
+ * file        : vperm.h
+ * version     : 1.0.208
+ * date        : 14.12.2010
+ * 
+ * vperm implementation of AES s-box 
+ *
+ * Credits: Adapted from Mike Hamburg's AES implementation, http://crypto.stanford.edu/vpaes/
+ *
+ * Cagdas Calik
+ * ccalik@metu.edu.tr
+ * Institute of Applied Mathematics, Middle East Technical University, Turkey.
+ *
+ */
+
+#ifndef VPERM_H
+#define VPERM_H
+
+#include "algo/sha3/sha3_common.h"
+#include <tmmintrin.h>
+
+/*
+extern const unsigned int _k_s0F[];
+extern const unsigned int _k_ipt[];
+extern const unsigned int _k_opt[];
+extern const unsigned int _k_inv[];
+extern const unsigned int _k_sb1[];
+extern const unsigned int _k_sb2[];
+extern const unsigned int _k_sb3[];
+extern const unsigned int _k_sb4[];
+extern const unsigned int _k_sb5[];
+extern const unsigned int _k_sb7[];
+extern const unsigned int _k_sbo[];
+extern const unsigned int _k_h63[];
+extern const unsigned int _k_hc6[];
+extern const unsigned int _k_h5b[];
+extern const unsigned int _k_h4e[];
+extern const unsigned int _k_h0e[];
+extern const unsigned int _k_h15[];
+extern const unsigned int _k_aesmix1[];
+extern const unsigned int _k_aesmix2[];
+extern const unsigned int _k_aesmix3[];
+extern const unsigned int _k_aesmix4[];
+*/
+
+/* TRANSFORM(x, table, t1, t2): byte-wise two-table transform of x, in place.
+ * x is masked with _k_s0F, and with the complement of _k_s0F followed by a
+ * 4-bit right shift; the two results are used as pshufb indices into
+ * table[0] and table[1] respectively (table is read as an array of __m128i)
+ * and the two lookups are XORed into x. Presumably _k_s0F holds 0x0F in every
+ * byte, making this a low-nibble / high-nibble split -- confirm against its
+ * definition. t1 is clobbered; t2 is accepted but never used. The macro
+ * expands to several statements, evaluates x more than once and has no
+ * trailing semicolon, so it must not be the body of an unbraced if/else/loop.
+ */
+// input: x, table; output: x
+#define TRANSFORM(x, table, t1, t2)\
+	t1 = _mm_andnot_si128(M128(_k_s0F), x);\
+	t1 = _mm_srli_epi32(t1, 4);\
+	x  = _mm_and_si128(x, M128(_k_s0F));\
+	t1 = _mm_shuffle_epi8(*((__m128i*)table + 1), t1);\
+	x  = _mm_shuffle_epi8(*((__m128i*)table + 0), x);\
+	x  = _mm_xor_si128(x, t1)
+
+// Disabled variant: compiled erroneously with the 32-bit MSC compiler.
+//	t2 = _mm_shuffle_epi8(table[0], x);
+//	x  = _mm_shuffle_epi8(table[1], t1);
+//	x  = _mm_xor_si128(x, t2)
+
+
+/*
+ * SUBSTITUTE_VPERM_CORE(x, t1, t2, t3, t4): shared core of the vperm S-box.
+ *
+ * input:  x  (expected to have been passed through TRANSFORM(x, _k_ipt, ...)
+ *             first, as SUBSTITUTE_VPERM and AES_ROUND_VPERM do)
+ * output: t2, t3 (index vectors consumed by VPERM_LOOKUP)
+ * clobbers: x, t1, t4
+ *
+ * x is split with _k_s0F into a masked part and a complement part shifted
+ * right by 4; both halves are then combined through pshufb lookups into the
+ * two 128-bit rows of _k_inv. Statement order matters: every line reads
+ * values written by the lines above it.
+ *
+ * The expansion deliberately still ends in ';' so existing call sites expand
+ * exactly as before. The final line no longer ends in a line-continuation
+ * backslash: that backslash spliced the line after the macro into its body,
+ * which was only harmless because that line happened to be blank.
+ */
+#define SUBSTITUTE_VPERM_CORE(x, t1, t2, t3, t4)\
+	t1 = _mm_andnot_si128(M128(_k_s0F), x);\
+	t1 = _mm_srli_epi32(t1, 4);\
+	x  = _mm_and_si128(x, M128(_k_s0F));\
+	t2 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 1), x);\
+	x  = _mm_xor_si128(x, t1);\
+	t3 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), t1);\
+	t3 = _mm_xor_si128(t3, t2);\
+	t4 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), x);\
+	t4 = _mm_xor_si128(t4, t2);\
+	t2 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), t3);\
+	t2 = _mm_xor_si128(t2, x);\
+	t3 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), t4);\
+	t3 = _mm_xor_si128(t3, t1);
+
+
+/* VPERM_LOOKUP(x1, x2, table, y, t): y = pshufb(table[1], x2) ^ pshufb(table[0], x1),
+ * with table read as an array of two __m128i rows. t is a scratch register
+ * and is clobbered. Expands to several statements with no trailing semicolon.
+ */
+// input: x1, x2, table; output: y
+#define VPERM_LOOKUP(x1, x2, table, y, t)\
+	t = _mm_shuffle_epi8(*((__m128i*)table + 0), x1);\
+	y = _mm_shuffle_epi8(*((__m128i*)table + 1), x2);\
+	y = _mm_xor_si128(y, t)
+
+
+/* SUBSTITUTE_VPERM(x, t1, t2, t3, t4): full substitution step on x, in place.
+ * Applies TRANSFORM with _k_ipt, runs SUBSTITUTE_VPERM_CORE, maps the
+ * resulting t2/t3 through the _k_sbo table pair with VPERM_LOOKUP, and
+ * finally XORs in _k_h63. t1..t4 are scratch registers and are clobbered.
+ * NOTE(review): presumably this yields the AES S-box of each byte -- confirm
+ * against the table definitions.
+ */
+// input: x; output: x
+#define SUBSTITUTE_VPERM(x, t1, t2, t3, t4)  \
+	TRANSFORM(x, _k_ipt, t1, t2);\
+	SUBSTITUTE_VPERM_CORE(x, t1, t2, t3, t4);\
+	VPERM_LOOKUP(t2, t3, _k_sbo, x, t1);\
+	x = _mm_xor_si128(x, M128(_k_h63))
+
+
+/* AES_ROUND_VPERM_CORE(x, t1, t2, t3, t4, s1, s2, s3): round body operating on
+ * an x that has already been through TRANSFORM(x, _k_ipt, ...) (see
+ * AES_ROUND_VPERM). Runs SUBSTITUTE_VPERM_CORE, looks t2/t3 up in the _k_sb1
+ * and _k_sb2 table pairs to get s1 and s2, sets s3 = s1 ^ s2, then builds x as
+ * the XOR of s2, s3, s1 and s1 again, byte-shuffled by _k_aesmix1.._k_aesmix4
+ * respectively, plus the constant _k_h5b. t1..t4 and s1..s3 are clobbered.
+ * NOTE(review): presumably the shuffles implement the AES row/column mixing --
+ * confirm against the _k_aesmix* definitions.
+ */
+// input: x; output: x
+#define AES_ROUND_VPERM_CORE(x, t1, t2, t3, t4, s1, s2, s3) \
+	SUBSTITUTE_VPERM_CORE(x, t1, t2, t3, t4);\
+	VPERM_LOOKUP(t2, t3, _k_sb1, s1, t1);\
+	VPERM_LOOKUP(t2, t3, _k_sb2, s2, t1);\
+	s3 = _mm_xor_si128(s1, s2);\
+	x = _mm_shuffle_epi8(s2, M128(_k_aesmix1));\
+	x = _mm_xor_si128(x, _mm_shuffle_epi8(s3, M128(_k_aesmix2)));\
+	x = _mm_xor_si128(x, _mm_shuffle_epi8(s1, M128(_k_aesmix3)));\
+	x = _mm_xor_si128(x, _mm_shuffle_epi8(s1, M128(_k_aesmix4)));\
+	x = _mm_xor_si128(x, M128(_k_h5b))
+
+
+/* AES_ROUND_VPERM(x, t1, t2, t3, t4, s1, s2, s3): self-contained round on x,
+ * in place: TRANSFORM with _k_ipt on the way in, AES_ROUND_VPERM_CORE, then
+ * TRANSFORM with _k_opt on the way out. All t* and s* arguments are scratch
+ * registers and are clobbered. Expands to several statements with no
+ * trailing semicolon.
+ */
+// input: x; output: x
+#define AES_ROUND_VPERM(x, t1, t2, t3, t4, s1, s2, s3) \
+	TRANSFORM(x, _k_ipt, t1, t2);\
+	AES_ROUND_VPERM_CORE(x, t1, t2, t3, t4, s1, s2, s3);\
+	TRANSFORM(x, _k_opt, t1, t2)
+
+#endif // VPERM_H
+
--- a/algo/echo/sph_echo.c
+++ b/algo/echo/sph_echo.c
--- a/algo/echo/sph_echo.h
+++ b/algo/echo/sph_echo.h
@@ -0,0 +1,320 @@
+/* $Id: sph_echo.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * ECHO interface. ECHO is a family of functions which differ by
+ * their output size; this implementation defines ECHO for output
+ * sizes 224, 256, 384 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_echo.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_ECHO_H__
+#define SPH_ECHO_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "algo/sha3/sph_types.h"
+
+/**
+ * Output size (in bits) for ECHO-224.
+ */
+#define SPH_SIZE_echo224   224
+
+/**
+ * Output size (in bits) for ECHO-256.
+ */
+#define SPH_SIZE_echo256   256
+
+/**
+ * Output size (in bits) for ECHO-384.
+ */
+#define SPH_SIZE_echo384   384
+
+/**
+ * Output size (in bits) for ECHO-512.
+ */
+#define SPH_SIZE_echo512   512
+
+/**
+ * This structure is a context for ECHO computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * an ECHO computation has been performed, the context can be reused for
+ * another computation. This specific structure is used for ECHO-224
+ * and ECHO-256.
+ *
+ * The contents of this structure are private. A running ECHO computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[192];    /* first field, for alignment; presumably the pending input block -- confirm in sph_echo.c */
+	size_t ptr;                /* presumably the number of bytes currently held in buf -- confirm */
+	union {                    /* same storage viewed as 4x4 sph_u32 or, when SPH_64 is set, 4x2 sph_u64 */
+		sph_u32 Vs[4][4];
+#if SPH_64
+		sph_u64 Vb[4][2];
+#endif
+	} u;
+	sph_u32 C0, C1, C2, C3;    /* NOTE(review): looks like a counter split into four 32-bit words -- confirm */
+#endif
+} sph_echo_small_context;
+
+/**
+ * This structure is a context for ECHO computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * an ECHO computation has been performed, the context can be reused for
+ * another computation. This specific structure is used for ECHO-384
+ * and ECHO-512.
+ *
+ * The contents of this structure are private. A running ECHO computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[128];    /* first field, for alignment; presumably the pending input block -- confirm in sph_echo.c */
+	size_t ptr;                /* presumably the number of bytes currently held in buf -- confirm */
+	union {                    /* same storage viewed as 8x4 sph_u32 or, when SPH_64 is set, 8x2 sph_u64 */
+		sph_u32 Vs[8][4];
+#if SPH_64
+		sph_u64 Vb[8][2];
+#endif
+	} u;
+	sph_u32 C0, C1, C2, C3;    /* NOTE(review): looks like a counter split into four 32-bit words -- confirm */
+#endif
+} sph_echo_big_context;
+
+/**
+ * Type for a ECHO-224 context (identical to the common "small" context).
+ */
+typedef sph_echo_small_context sph_echo224_context;
+
+/**
+ * Type for a ECHO-256 context (identical to the common "small" context).
+ */
+typedef sph_echo_small_context sph_echo256_context;
+
+/**
+ * Type for a ECHO-384 context (identical to the common "big" context).
+ */
+typedef sph_echo_big_context sph_echo384_context;
+
+/**
+ * Type for a ECHO-512 context (identical to the common "big" context).
+ */
+typedef sph_echo_big_context sph_echo512_context;
+
+/**
+ * Initialize an ECHO-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the ECHO-224 context (pointer to a
+ *             <code>sph_echo224_context</code>)
+ */
+void sph_echo224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the ECHO-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_echo224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current ECHO-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the ECHO-224 context
+ * @param dst   the destination buffer
+ */
+void sph_echo224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the ECHO-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_echo224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize an ECHO-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the ECHO-256 context (pointer to a
+ *             <code>sph_echo256_context</code>)
+ */
+void sph_echo256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the ECHO-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_echo256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current ECHO-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the ECHO-256 context
+ * @param dst   the destination buffer
+ */
+void sph_echo256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the ECHO-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_echo256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize an ECHO-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the ECHO-384 context (pointer to a
+ *             <code>sph_echo384_context</code>)
+ */
+void sph_echo384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the ECHO-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_echo384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current ECHO-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the ECHO-384 context
+ * @param dst   the destination buffer
+ */
+void sph_echo384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the ECHO-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_echo384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize an ECHO-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the ECHO-512 context (pointer to a
+ *             <code>sph_echo512_context</code>)
+ */
+void sph_echo512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the ECHO-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_echo512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current ECHO-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the ECHO-512 context
+ * @param dst   the destination buffer
+ */
+void sph_echo512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the ECHO-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_echo512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+	
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/echo/sse2/echo.c
+++ b/algo/echo/sse2/echo.c
--- a/algo/echo/sse2/sph_echo.h
+++ b/algo/echo/sse2/sph_echo.h
@@ -0,0 +1,320 @@
+/* $Id: sph_echo.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * ECHO interface. ECHO is a family of functions which differ by
+ * their output size; this implementation defines ECHO for output
+ * sizes 224, 256, 384 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_echo.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_ECHO_H__
+#define SPH_ECHO_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "algo/sha3/sph_types.h"
+
+/**
+ * Output size (in bits) for ECHO-224.
+ */
+#define SPH_SIZE_echo224   224
+
+/**
+ * Output size (in bits) for ECHO-256.
+ */
+#define SPH_SIZE_echo256   256
+
+/**
+ * Output size (in bits) for ECHO-384.
+ */
+#define SPH_SIZE_echo384   384
+
+/**
+ * Output size (in bits) for ECHO-512.
+ */
+#define SPH_SIZE_echo512   512
+
+/**
+ * This structure is a context for ECHO computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * an ECHO computation has been performed, the context can be reused for
+ * another computation. This specific structure is used for ECHO-224
+ * and ECHO-256.
+ *
+ * The contents of this structure are private. A running ECHO computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[192];    /* first field, for alignment; presumably the pending input block -- confirm in echo.c */
+	size_t ptr;                /* presumably the number of bytes currently held in buf -- confirm */
+	union {                    /* same storage viewed as 4x4 sph_u32 or, when SPH_64 is set, 4x2 sph_u64 */
+		sph_u32 Vs[4][4];
+#if SPH_64
+		sph_u64 Vb[4][2];
+#endif
+	} u;
+	sph_u32 C0, C1, C2, C3;    /* NOTE(review): looks like a counter split into four 32-bit words -- confirm */
+#endif
+} sph_echo_small_context;
+
+/**
+ * This structure is a context for ECHO computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * an ECHO computation has been performed, the context can be reused for
+ * another computation. This specific structure is used for ECHO-384
+ * and ECHO-512.
+ *
+ * The contents of this structure are private. A running ECHO computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[128];    /* first field, for alignment; presumably the pending input block -- confirm in echo.c */
+	size_t ptr;                /* presumably the number of bytes currently held in buf -- confirm */
+	union {                    /* same storage viewed as 8x4 sph_u32 or, when SPH_64 is set, 8x2 sph_u64 */
+		sph_u32 Vs[8][4];
+#if SPH_64
+		sph_u64 Vb[8][2];
+#endif
+	} u;
+	sph_u32 C0, C1, C2, C3;    /* NOTE(review): looks like a counter split into four 32-bit words -- confirm */
+#endif
+} sph_echo_big_context;
+
+/**
+ * Type for a ECHO-224 context (identical to the common "small" context).
+ */
+typedef sph_echo_small_context sph_echo224_context;
+
+/**
+ * Type for a ECHO-256 context (identical to the common "small" context).
+ */
+typedef sph_echo_small_context sph_echo256_context;
+
+/**
+ * Type for a ECHO-384 context (identical to the common "big" context).
+ */
+typedef sph_echo_big_context sph_echo384_context;
+
+/**
+ * Type for a ECHO-512 context (identical to the common "big" context).
+ */
+typedef sph_echo_big_context sph_echo512_context;
+
+/**
+ * Initialize an ECHO-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the ECHO-224 context (pointer to a
+ *             <code>sph_echo224_context</code>)
+ */
+void sph_echo224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the ECHO-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_echo224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current ECHO-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the ECHO-224 context
+ * @param dst   the destination buffer
+ */
+void sph_echo224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the ECHO-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_echo224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize an ECHO-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the ECHO-256 context (pointer to a
+ *             <code>sph_echo256_context</code>)
+ */
+void sph_echo256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the ECHO-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_echo256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current ECHO-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the ECHO-256 context
+ * @param dst   the destination buffer
+ */
+void sph_echo256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the ECHO-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_echo256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize an ECHO-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the ECHO-384 context (pointer to a
+ *             <code>sph_echo384_context</code>)
+ */
+void sph_echo384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the ECHO-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_echo384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current ECHO-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the ECHO-384 context
+ * @param dst   the destination buffer
+ */
+void sph_echo384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the ECHO-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_echo384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize an ECHO-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the ECHO-512 context (pointer to a
+ *             <code>sph_echo512_context</code>)
+ */
+void sph_echo512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the ECHO-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_echo512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current ECHO-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the ECHO-512 context
+ * @param dst   the destination buffer
+ */
+void sph_echo512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the ECHO-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_echo512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+	
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/fresh.c
+++ b/algo/fresh.c
@@ -0,0 +1,139 @@
+#include "miner.h"
+#include "algo-gate-api.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "algo/shavite/sph_shavite.h"
+#include "algo/simd/sph_simd.h"
+#include "algo/echo/sph_echo.h"
+
+//#define DEBUG_ALGO
+
+/**
+ * Fresh hash: a fixed chain of five 512-bit hashes.
+ *
+ *   shavite512(input, len) -> simd512 -> shavite512 -> simd512 -> echo512
+ *
+ * Every stage after the first consumes the 64-byte digest of the previous
+ * one. The first 32 bytes of the final ECHO-512 digest are copied to output.
+ *
+ * @param output  destination buffer; exactly 32 bytes are written
+ * @param input   message to hash
+ * @param len     length of input in bytes
+ */
+extern void freshhash(void* output, const void* input, uint32_t len)
+{
+	/*
+	 * Two 64-byte digest slots that the stages ping-pong between. These are
+	 * plain local pointers rather than function-local #defines, so nothing
+	 * leaks into the rest of the translation unit.
+	 */
+	unsigned char hash[128];
+	unsigned char *const hashA = hash;
+	unsigned char *const hashB = hash + 64;
+
+	sph_shavite512_context ctx_shavite;
+	sph_simd512_context ctx_simd;
+	sph_echo512_context ctx_echo;
+
+	sph_shavite512_init(&ctx_shavite);
+	sph_shavite512(&ctx_shavite, input, len);
+	sph_shavite512_close(&ctx_shavite, hashA);
+
+	sph_simd512_init(&ctx_simd);
+	sph_simd512(&ctx_simd, hashA, 64);
+	sph_simd512_close(&ctx_simd, hashB);
+
+	sph_shavite512_init(&ctx_shavite);
+	sph_shavite512(&ctx_shavite, hashB, 64);
+	sph_shavite512_close(&ctx_shavite, hashA);
+
+	sph_simd512_init(&ctx_simd);
+	sph_simd512(&ctx_simd, hashA, 64);
+	sph_simd512_close(&ctx_simd, hashB);
+
+	sph_echo512_init(&ctx_echo);
+	sph_echo512(&ctx_echo, hashB, 64);
+	sph_echo512_close(&ctx_echo, hashA);
+
+	/* Only the leading 256 bits of the final digest are returned. */
+	memcpy(output, hashA, 32);
+}
+
+/**
+ * Scan a nonce range for a Fresh hash that satisfies the work target.
+ *
+ * Words 0..18 of work->data are re-encoded once with be32enc(); word 19 (the
+ * nonce) is incremented, encoded and hashed on every iteration until a hash
+ * passes fulltest(), the nonce reaches max_nonce, or a restart is flagged for
+ * this thread in work_restart[].
+ *
+ * @param thr_id       index of the calling thread (selects its work_restart[] slot)
+ * @param work         work item; data[19] is left at the last nonce tried
+ * @param max_nonce    scanning stops once the nonce reaches this value
+ * @param hashes_done  receives the number of nonces tried in this call
+ * @return non-zero when a nonce passing fulltest() was found, 0 otherwise
+ */
+int scanhash_fresh(int thr_id, struct work *work,
+				uint32_t max_nonce, uint64_t *hashes_done)
+{
+	/*
+	 * Cheap pre-filter: when the top target word is at most htmax[i], a hash
+	 * can only pass if hash64[7] has no bit set under masks[i], so the full
+	 * comparison is skipped for every other candidate.
+	 */
+	static const uint64_t htmax[] = {
+		0,
+		0xF,
+		0xFF,
+		0xFFF,
+		0xFFFF,
+		0x10000000
+	};
+	static const uint32_t masks[] = {
+		0xFFFFFFFF,
+		0xFFFFFFF0,
+		0xFFFFFF00,
+		0xFFFFF000,
+		0xFFFF0000,
+		0
+	};
+
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	const uint32_t len = 80;
+	const uint32_t first_nonce = pdata[19];
+	const uint32_t Htarg = ptarget[7];
+	uint32_t n = first_nonce - 1;
+#ifdef _MSC_VER
+	uint32_t __declspec(align(32)) hash64[8];
+#else
+	uint32_t hash64[8] __attribute__((aligned(32)));
+#endif
+	uint32_t endiandata[32];
+	/*
+	 * Mask 0 means "run fulltest() on every hash". It is also the fallback
+	 * for targets above the last htmax entry; previously such targets matched
+	 * no table row, so no nonce was hashed at all and pdata[19] was moved
+	 * back to first_nonce - 1.
+	 */
+	uint32_t mask = 0;
+
+	// we need bigendian data...
+	for (int k = 0; k < 19; k++)
+		be32enc(&endiandata[k], pdata[k]);
+
+#ifdef DEBUG_ALGO
+	if (Htarg != 0)
+		printf("[%d] Htarg=%X\n", thr_id, Htarg);
+#endif
+
+	/* Pick the tightest pre-filter the target allows. */
+	for (size_t m = 0; m < sizeof(htmax) / sizeof(htmax[0]); m++) {
+		if (Htarg <= htmax[m]) {
+			mask = masks[m];
+			break;
+		}
+	}
+
+	do {
+		pdata[19] = ++n;
+		be32enc(&endiandata[19], n);
+		freshhash(hash64, endiandata, len);
+#ifdef DEBUG_ALGO
+		if (!(n % 0x1000) && !thr_id) printf(".");
+		if (!(hash64[7] & mask)) printf("[%d]",thr_id);
+#endif
+		if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
+			*hashes_done = n - first_nonce + 1;
+			return 1;
+		}
+	} while (n < max_nonce && !work_restart[thr_id].restart);
+
+	*hashes_done = n - first_nonce + 1;
+	pdata[19] = n;
+	return 0;
+}
+
+/*
+ * Target hook for Fresh: the job difficulty is divided by
+ * 256 * opt_diff_factor before being handed to work_set_target().
+ */
+void fresh_set_target( struct work* work, double job_diff )
+{
+	const double divisor = 256.0 * opt_diff_factor;
+
+	work_set_target( work, job_diff / divisor );
+}
+
+
+/**
+ * Install the Fresh callbacks into the algorithm gate.
+ *
+ * Calls algo_not_tested() first, then points the gate's scanhash, hash,
+ * hash_alt, set_target and get_max64 hooks at the Fresh implementations
+ * (hash and hash_alt both use freshhash).
+ *
+ * @param gate  gate structure to fill in
+ * @return always true
+ */
+bool register_fresh_algo( algo_gate_t* gate )
+{
+    algo_not_tested();
+    gate->scanhash   = (void*)&scanhash_fresh;
+    gate->hash       = (void*)&freshhash;
+    gate->hash_alt   = (void*)&freshhash;
+    gate->set_target = (void*)&fresh_set_target;
+    gate->get_max64  = (void*)&get_max64_0x3ffff;
+    return true;
+}
+
--- a/algo/fugue/.dirstamp
+++ b/algo/fugue/.dirstamp
--- a/algo/fugue/sph_fugue.c
+++ b/algo/fugue/sph_fugue.c
--- a/algo/fugue/sph_fugue.h
+++ b/algo/fugue/sph_fugue.h
@@ -0,0 +1,81 @@
+#ifndef SPH_FUGUE_H__
+#define SPH_FUGUE_H__
+
+#include <stddef.h>
+#include "algo/sha3/sph_types.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#define SPH_SIZE_fugue224   224
+
+#define SPH_SIZE_fugue256   256
+
+#define SPH_SIZE_fugue384   384
+
+#define SPH_SIZE_fugue512   512
+
+typedef struct {	/* Fugue context; the 224/256/384/512 context types below are all aliases of it */
+#ifndef DOXYGEN_IGNORE
+	sph_u32 partial;	/* presumably the input word currently being assembled -- confirm in sph_fugue.c */
+	unsigned partial_len;	/* presumably how much of 'partial' is filled (units to be confirmed) */
+	unsigned round_shift;	/* NOTE(review): purpose not visible from this header -- confirm */
+	sph_u32 S[36];	/* 36-word array; presumably the Fugue state -- confirm */
+#if SPH_64
+	sph_u64 bit_count;	/* presumably the running message length in bits, as one 64-bit value */
+#else
+	sph_u32 bit_count_high, bit_count_low;	/* same quantity split into two 32-bit halves when SPH_64 is not set */
+#endif
+#endif
+} sph_fugue_context;
+
+typedef sph_fugue_context sph_fugue224_context;
+
+typedef sph_fugue_context sph_fugue256_context;
+
+typedef sph_fugue_context sph_fugue384_context;
+
+typedef sph_fugue_context sph_fugue512_context;
+/* Fugue-224 entry points; cc points to a sph_fugue224_context.
+ * NOTE(review): these declarations ship without documentation. They appear
+ * to follow the init / update / close / addbits_and_close pattern used by
+ * the other sph_* headers (digest written to dst) -- confirm in sph_fugue.c.
+ */
+void sph_fugue224_init(void *cc);
+
+void sph_fugue224(void *cc, const void *data, size_t len);
+
+void sph_fugue224_close(void *cc, void *dst);
+
+void sph_fugue224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+/* Fugue-256 entry points; cc points to a sph_fugue256_context.
+ * NOTE(review): same assumed init / update / close / addbits_and_close
+ * pattern as the Fugue-224 functions -- confirm in sph_fugue.c.
+ */
+void sph_fugue256_init(void *cc);
+
+void sph_fugue256(void *cc, const void *data, size_t len);
+
+void sph_fugue256_close(void *cc, void *dst);
+
+void sph_fugue256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+/* Fugue-384 entry points; cc points to a sph_fugue384_context.
+ * NOTE(review): same assumed init / update / close / addbits_and_close
+ * pattern as the Fugue-224 functions -- confirm in sph_fugue.c.
+ */
+void sph_fugue384_init(void *cc);
+
+void sph_fugue384(void *cc, const void *data, size_t len);
+
+void sph_fugue384_close(void *cc, void *dst);
+
+void sph_fugue384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+/* Fugue-512 entry points; cc points to a sph_fugue512_context.
+ * NOTE(review): same assumed init / update / close / addbits_and_close
+ * pattern as the Fugue-224 functions -- confirm in sph_fugue.c.
+ */
+void sph_fugue512_init(void *cc);
+
+void sph_fugue512(void *cc, const void *data, size_t len);
+
+void sph_fugue512_close(void *cc, void *dst);
+
+void sph_fugue512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/gost/.dirstamp
+++ b/algo/gost/.dirstamp
--- a/algo/gost/sph_gost.c
+++ b/algo/gost/sph_gost.c
--- a/algo/gost/sph_gost.h
+++ b/algo/gost/sph_gost.h
@@ -0,0 +1,185 @@
+/* $Id: sph_gost.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * GOST interface. This is the interface for GOST R 12 with the
+ * recommended parameters for SHA-3, with output lengths 256
+ * and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_gost.h
+ * @author   Mish <mish@btchouse.com>
+ */
+
+#ifndef SPH_GOST_H__
+#define SPH_GOST_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "algo/sha3/sph_types.h"
+
+/**
+ * Output size (in bits) for GOST-256.
+ */
+#define SPH_SIZE_gost256   256
+
+/**
+ * Output size (in bits) for GOST-512.
+ */
+#define SPH_SIZE_gost512   512
+
+/**
+ * This structure is a context for GOST computations: it contains the
+ * intermediate values and some data from the last entered block. Once a
+ * GOST computation has been performed, the context can be reused for
+ * another computation.
+ *
+ * The contents of this structure are private. A running GOST computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+
+/**
+ * This structure is a context for GOST-256 computations.
+ */
+
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[32];    /* first field, for alignment */
+	size_t ptr;               /* presumably the current fill offset into buf -- TODO confirm in sph_gost.c */
+	sph_u32 V[3][8];          /* 3 rows of 8 sph_u32 words; per-row roles are defined in sph_gost.c (not visible here) */
+#endif
+} sph_gost256_context;
+
+/**
+ * This structure is a context for GOST-512 computations.
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[64];    /* first field, for alignment */
+	size_t ptr;               /* presumably the current fill offset into buf -- TODO confirm in sph_gost.c */
+	sph_u32 V[5][8];          /* 5 rows of 8 sph_u32 words; per-row roles are defined in sph_gost.c (not visible here) */
+#endif
+} sph_gost512_context;
+
+
+/**
+ * Initialize a GOST-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the GOST-256 context (pointer to a
+ *             <code>sph_gost256_context</code>)
+ */
+void sph_gost256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the GOST-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_gost256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current GOST-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the GOST-256 context
+ * @param dst   the destination buffer
+ */
+void sph_gost256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the GOST-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_gost256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a GOST-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the GOST-512 context (pointer to a
+ *             <code>sph_gost512_context</code>)
+ */
+void sph_gost512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the GOST-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_gost512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current GOST-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the GOST-512 context
+ * @param dst   the destination buffer
+ */
+void sph_gost512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the GOST-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_gost512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/groestl/.dirstamp
+++ b/algo/groestl/.dirstamp
--- a/algo/groestl/aes_ni/.dirstamp
+++ b/algo/groestl/aes_ni/.dirstamp
--- a/algo/groestl/aes_ni/README
+++ b/algo/groestl/aes_ni/README
@@ -0,0 +1,14 @@
+This package contains an implementation of the Groestl-512 hash
+function optimized for the Intel AES instructions.
+
+Authors are Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
+
+There are no known present or future claims by a copyright holder that
+the distribution of this software infringes the copyright. In
+particular, the author of the software is not making such claims and
+does not intend to make such claims.
+
+Moreover, there are no known present or future claims by a patent
+holder that the use of this software infringes the patent. In
+particular, the author of the software is not making such claims and
+does not intend to make such claims.
--- a/algo/groestl/aes_ni/api.h
+++ b/algo/groestl/aes_ni/api.h
@@ -0,0 +1,2 @@
+#define CRYPTO_BYTES 64
+#define CRYPTO_VERSION "2.2"
--- a/Show More
+++ b/Show More