mirror of https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00

v23.5

Makefile.am (25 changes)
@@ -22,19 +22,13 @@ cpuminer_SOURCES = \
sysinfos.c \
algo-gate-api.c\
malloc-huge.c \
algo/argon2/argon2a/argon2a.c \
algo/argon2/argon2a/ar2/argon2.c \
algo/argon2/argon2a/ar2/opt.c \
algo/argon2/argon2a/ar2/cores.c \
algo/argon2/argon2a/ar2/ar2-scrypt-jane.c \
algo/argon2/argon2a/ar2/blake2b.c \
algo/argon2/argon2d/argon2d-gate.c \
algo/argon2/argon2d/blake2/blake2b.c \
algo/argon2/argon2d/argon2d/argon2.c \
algo/argon2/argon2d/argon2d/core.c \
algo/argon2/argon2d/argon2d/opt.c \
algo/argon2/argon2d/argon2d/argon2d_thread.c \
algo/argon2/argon2d/argon2d/encoding.c \
algo/argon2d/argon2d-gate.c \
algo/argon2d/blake2/blake2b.c \
algo/argon2d/argon2d/argon2.c \
algo/argon2d/argon2d/core.c \
algo/argon2d/argon2d/opt.c \
algo/argon2d/argon2d/argon2d_thread.c \
algo/argon2d/argon2d/encoding.c \
algo/blake/sph_blake.c \
algo/blake/blake256-hash.c \
algo/blake/blake512-hash.c \
@@ -63,6 +57,7 @@ cpuminer_SOURCES = \
algo/bmw/bmw512-4way.c \
algo/cubehash/cubehash_sse2.c\
algo/cubehash/cube-hash-2way.c \
algo/cubehash/sph_cubehash.c \
algo/echo/sph_echo.c \
algo/echo/echo-hash-4way.c \
algo/echo/aes_ni/hash.c\
@@ -104,6 +99,7 @@ cpuminer_SOURCES = \
algo/lanehash/lane.c \
algo/luffa/luffa_for_sse2.c \
algo/luffa/luffa-hash-2way.c \
algo/luffa/sph_luffa.c \
algo/lyra2/lyra2.c \
algo/lyra2/sponge.c \
algo/lyra2/sponge-2way.c \
@@ -114,13 +110,11 @@ cpuminer_SOURCES = \
algo/lyra2/lyra2rev3.c \
algo/lyra2/lyra2rev3-4way.c \
algo/lyra2/lyra2re.c \
algo/lyra2/lyra2z.c \
algo/lyra2/lyra2z-4way.c \
algo/lyra2/lyra2z330.c \
algo/lyra2/lyra2h.c \
algo/lyra2/lyra2h-4way.c \
algo/lyra2/allium-4way.c \
algo/lyra2/allium.c \
algo/lyra2/phi2-4way.c \
algo/lyra2/phi2.c \
algo/m7m/m7m.c \
@@ -179,6 +173,7 @@ cpuminer_SOURCES = \
algo/shavite/shavite.c \
algo/simd/nist.c \
algo/simd/vector.c \
algo/simd/sph_simd.c \
algo/simd/simd-hash-2way.c \
algo/skein/sph_skein.c \
algo/skein/skein-hash-4way.c \
@@ -33,6 +33,14 @@ supported.

64 bit Linux or Windows operating system. Apple, Android and Raspberry Pi
are not supported. FreeBSD YMMV.

ARM requirements (Beta):

CPU: Armv8 and NEON; SHA2 & AES are optional.
OS: Linux distribution built for AArch64.
Packages: source code only.

See wiki for details.

Reporting bugs
--------------

@@ -65,6 +73,26 @@ If not what makes it happen or not happen?

Change Log
----------

v23.5

New version numbering drops the leading 3: the major version is now the calendar year and the minor version identifies planned releases during the year.

BETA: 64 bit ARM support
- ARM 64 bit CPUs are now supported with source code for Linux. Windows is not supported.
- NEON, AES & SHA2 are supported.
- This is the first public release and is early Beta quality.
- Some algorithms do not work on ARM or work at reduced performance.
  See wiki for details: https://github.com/JayDDee/cpuminer-opt/wiki/Support-for-AArch64.

- CPU architecture and OS detection and logging now support ARM features.
- New 2way parallel hash for ARM also helps x86_64 CPUs without AVX2 on supported algorithms.
- Enhanced startup feature logs to support ARM.
- Removed startup logs for incompatible CPU/SW architectures.
- Added CPU architecture & OS type to the RPC user agent string.
- Added share reject controls: a warning log is displayed at a 10% reject rate and the miner exits with an error log at 50% (see the sketch after this list).
- Removed the argon2 algorithm.
- New CLI option "--bell" adds an ASCII bell code to the output string of error, warning, & rejected share logs. The option is disabled by default.
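The reject thresholds above imply a check along the following lines. This is only an illustrative sketch and is not taken from the cpuminer-opt source; check_reject_rate, shares_submitted and shares_rejected are hypothetical names.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical reject-rate guard: warn at 10% rejects, exit at 50%. */
static void check_reject_rate( unsigned shares_submitted, unsigned shares_rejected )
{
   if ( shares_submitted == 0 ) return;
   double rate = (double)shares_rejected / (double)shares_submitted;
   if ( rate >= 0.50 )
   {
      fprintf( stderr, "Error: %.0f%% of shares rejected, exiting\n", rate * 100.0 );
      exit( 1 );
   }
   if ( rate >= 0.10 )
      fprintf( stderr, "Warning: %.0f%% of shares rejected\n", rate * 100.0 );
}

int main( void )
{
   check_reject_rate( 100, 12 );   /* prints a warning: 12% rejected */
   return 0;
}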
v3.23.4

Source code only.
@@ -295,7 +295,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
{
     case ALGO_ALLIUM:       rc = register_allium_algo       ( gate ); break;
     case ALGO_ANIME:        rc = register_anime_algo        ( gate ); break;
     case ALGO_ARGON2:       rc = register_argon2_algo       ( gate ); break;
     case ALGO_ARGON2D250:   rc = register_argon2d_crds_algo ( gate ); break;
     case ALGO_ARGON2D500:   rc = register_argon2d_dyn_algo  ( gate ); break;
     case ALGO_ARGON2D4096:  rc = register_argon2d4096_algo  ( gate ); break;
@@ -89,15 +89,18 @@
typedef uint32_t set_t;

#define EMPTY_SET     0
#define SSE2_OPT      1      // Core2, NEON
#define AES_OPT       2
#define SSE42_OPT     4
#define AVX_OPT       8      // Sandybridge
#define AVX2_OPT      0x10   // Haswell, Zen1
#define SHA_OPT       0x20   // Zen1, Icelake. NEON
#define AVX512_OPT    0x40   // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW])
#define VAES_OPT      0x80   // Icelake, Zen3
#define SHA512_OPT    0x100  // Lunar Lake, Arrow Lake, NEON
#define SSE2_OPT      1        // parity with NEON
#define SSSE3_OPT     1 << 1   // Intel Core2
#define SSE41_OPT     1 << 2
#define SSE42_OPT     1 << 3
#define AVX_OPT       1 << 4   // Intel Sandybridge
#define AVX2_OPT      1 << 5   // Intel Haswell, AMD Zen1
#define AVX512_OPT    1 << 6   // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW])
#define AES_OPT       1 << 7   // Intel Westmere, AArch64
#define VAES_OPT      1 << 8   // Icelake, Zen3
#define SHA_OPT       1 << 9   // Zen1, Icelake, AArch64
#define SHA512_OPT    1 << 10  // AArch64
#define NEON_OPT      1 << 11  // AArch64

// AVX10 does not have explicit algo features:
// AVX10_512 is compatible with AVX512 + VAES
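Since each of the new values is a single-bit mask, a CPU's detected capabilities and an algorithm's required features can each be packed into one set_t and compared with plain bitwise operations. The sketch below is illustrative only and is not taken from cpuminer-opt: have_all_features, cpu_caps and required are hypothetical names, and only a few of the flags are reproduced.

#include <stdint.h>
#include <stdbool.h>

typedef uint32_t set_t;

#define AVX2_OPT   (1 << 5)
#define VAES_OPT   (1 << 8)
#define SHA_OPT    (1 << 9)
#define NEON_OPT   (1 << 11)

/* True when every feature bit in 'required' is also set in 'cpu_caps'. */
static bool have_all_features( set_t cpu_caps, set_t required )
{
   return ( cpu_caps & required ) == required;
}

int main( void )
{
   set_t cpu_caps = AVX2_OPT | VAES_OPT | SHA_OPT;  /* as detected at startup */
   set_t required = AVX2_OPT | SHA_OPT;             /* what an algorithm needs */
   return have_all_features( cpu_caps, required ) ? 0 : 1;
}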
@@ -1,249 +0,0 @@
|
||||
/*
|
||||
scrypt-jane by Andrew M, https://github.com/floodyberry/scrypt-jane
|
||||
|
||||
Public Domain or MIT License, whichever is easier
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#if defined( _WINDOWS )
|
||||
#if !defined( QT_GUI )
|
||||
extern "C" {
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include "ar2-scrypt-jane.h"
|
||||
|
||||
#include "sj/scrypt-jane-portable.h"
|
||||
#include "sj/scrypt-jane-hash.h"
|
||||
#include "sj/scrypt-jane-romix.h"
|
||||
#include "sj/scrypt-jane-test-vectors.h"
|
||||
|
||||
#define scrypt_maxNfactor 30 /* (1 << (30 + 1)) = ~2 billion */
|
||||
#if (SCRYPT_BLOCK_BYTES == 64)
|
||||
#define scrypt_r_32kb 8 /* (1 << 8) = 256 * 2 blocks in a chunk * 64 bytes = Max of 32kb in a chunk */
|
||||
#elif (SCRYPT_BLOCK_BYTES == 128)
|
||||
#define scrypt_r_32kb 7 /* (1 << 7) = 128 * 2 blocks in a chunk * 128 bytes = Max of 32kb in a chunk */
|
||||
#elif (SCRYPT_BLOCK_BYTES == 256)
|
||||
#define scrypt_r_32kb 6 /* (1 << 6) = 64 * 2 blocks in a chunk * 256 bytes = Max of 32kb in a chunk */
|
||||
#elif (SCRYPT_BLOCK_BYTES == 512)
|
||||
#define scrypt_r_32kb 5 /* (1 << 5) = 32 * 2 blocks in a chunk * 512 bytes = Max of 32kb in a chunk */
|
||||
#endif
|
||||
#define scrypt_maxrfactor scrypt_r_32kb /* 32kb */
|
||||
#define scrypt_maxpfactor 25 /* (1 << 25) = ~33 million */
|
||||
|
||||
#include <stdio.h>
|
||||
//#include <malloc.h>
|
||||
|
||||
static void NORETURN
|
||||
scrypt_fatal_error_default(const char *msg) {
|
||||
fprintf(stderr, "%s\n", msg);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
static scrypt_fatal_errorfn scrypt_fatal_error = scrypt_fatal_error_default;
|
||||
|
||||
void scrypt_set_fatal_error(scrypt_fatal_errorfn fn) {
|
||||
scrypt_fatal_error = fn;
|
||||
}
|
||||
|
||||
static int scrypt_power_on_self_test(void)
|
||||
{
|
||||
const scrypt_test_setting *t;
|
||||
uint8_t test_digest[64];
|
||||
uint32_t i;
|
||||
int res = 7, scrypt_valid;
|
||||
|
||||
if (!scrypt_test_mix()) {
|
||||
#if !defined(SCRYPT_TEST)
|
||||
scrypt_fatal_error("scrypt: mix function power-on-self-test failed");
|
||||
#endif
|
||||
res &= ~1;
|
||||
}
|
||||
|
||||
if (!scrypt_test_hash()) {
|
||||
#if !defined(SCRYPT_TEST)
|
||||
scrypt_fatal_error("scrypt: hash function power-on-self-test failed");
|
||||
#endif
|
||||
res &= ~2;
|
||||
}
|
||||
|
||||
for (i = 0, scrypt_valid = 1; post_settings[i].pw; i++) {
|
||||
t = post_settings + i;
|
||||
scrypt((uint8_t *)t->pw, strlen(t->pw), (uint8_t *)t->salt, strlen(t->salt), t->Nfactor, t->rfactor, t->pfactor, test_digest, sizeof(test_digest));
|
||||
scrypt_valid &= scrypt_verify(post_vectors[i], test_digest, sizeof(test_digest));
|
||||
}
|
||||
|
||||
if (!scrypt_valid) {
|
||||
#if !defined(SCRYPT_TEST)
|
||||
scrypt_fatal_error("scrypt: scrypt power-on-self-test failed");
|
||||
#endif
|
||||
res &= ~4;
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct scrypt_aligned_alloc_t {
|
||||
uint8_t *mem, *ptr;
|
||||
} scrypt_aligned_alloc;
|
||||
|
||||
#ifdef SCRYPT_TEST_SPEED
|
||||
|
||||
static uint8_t *mem_base = (uint8_t *)0;
|
||||
static size_t mem_bump = 0;
|
||||
|
||||
/* allocations are assumed to be multiples of 64 bytes and total allocations not to exceed ~1.01gb */
|
||||
static scrypt_aligned_alloc scrypt_alloc(uint64_t size)
|
||||
{
|
||||
scrypt_aligned_alloc aa;
|
||||
if (!mem_base) {
|
||||
mem_base = (uint8_t *)malloc((1024 * 1024 * 1024) + (1024 * 1024) + (SCRYPT_BLOCK_BYTES - 1));
|
||||
if (!mem_base)
|
||||
scrypt_fatal_error("scrypt: out of memory");
|
||||
mem_base = (uint8_t *)(((size_t)mem_base + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1));
|
||||
}
|
||||
aa.mem = mem_base + mem_bump;
|
||||
aa.ptr = aa.mem;
|
||||
mem_bump += (size_t)size;
|
||||
return aa;
|
||||
}
|
||||
|
||||
static void scrypt_free(scrypt_aligned_alloc *aa) {
|
||||
mem_bump = 0;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static scrypt_aligned_alloc scrypt_alloc(uint64_t size)
|
||||
{
|
||||
static const size_t max_alloc = (size_t)-1;
|
||||
scrypt_aligned_alloc aa;
|
||||
size += (SCRYPT_BLOCK_BYTES - 1);
|
||||
if (size > max_alloc)
|
||||
scrypt_fatal_error("scrypt: not enough address space on this CPU to allocate required memory");
|
||||
aa.mem = (uint8_t *)malloc((size_t)size);
|
||||
aa.ptr = (uint8_t *)(((size_t)aa.mem + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1));
|
||||
if (!aa.mem)
|
||||
scrypt_fatal_error("scrypt: out of memory");
|
||||
return aa;
|
||||
}
|
||||
|
||||
static void scrypt_free(scrypt_aligned_alloc *aa)
|
||||
{
|
||||
free(aa->mem);
|
||||
}
|
||||
|
||||
#endif /* SCRYPT_TEST_SPEED */
|
||||
|
||||
|
||||
void scrypt(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len,
|
||||
uint8_t Nfactor, uint8_t rfactor, uint8_t pfactor, uint8_t *out, size_t bytes)
|
||||
{
|
||||
scrypt_aligned_alloc YX, V;
|
||||
uint8_t *X, *Y;
|
||||
uint32_t N, r, p, chunk_bytes, i;
|
||||
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix();
|
||||
#endif
|
||||
|
||||
#if !defined(SCRYPT_TEST)
|
||||
static int power_on_self_test = 0;
|
||||
if (!power_on_self_test) {
|
||||
power_on_self_test = 1;
|
||||
if (!scrypt_power_on_self_test())
|
||||
scrypt_fatal_error("scrypt: power on self test failed");
|
||||
}
|
||||
#endif
|
||||
|
||||
if (Nfactor > scrypt_maxNfactor)
|
||||
scrypt_fatal_error("scrypt: N out of range");
|
||||
if (rfactor > scrypt_maxrfactor)
|
||||
scrypt_fatal_error("scrypt: r out of range");
|
||||
if (pfactor > scrypt_maxpfactor)
|
||||
scrypt_fatal_error("scrypt: p out of range");
|
||||
|
||||
N = (1 << (Nfactor + 1));
|
||||
r = (1 << rfactor);
|
||||
p = (1 << pfactor);
|
||||
|
||||
chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2;
|
||||
V = scrypt_alloc((uint64_t)N * chunk_bytes);
|
||||
YX = scrypt_alloc((p + 1) * chunk_bytes);
|
||||
|
||||
/* 1: X = PBKDF2(password, salt) */
|
||||
Y = YX.ptr;
|
||||
X = Y + chunk_bytes;
|
||||
scrypt_pbkdf2(password, password_len, salt, salt_len, 1, X, chunk_bytes * p);
|
||||
|
||||
/* 2: X = ROMix(X) */
|
||||
for (i = 0; i < p; i++)
|
||||
scrypt_ROMix((scrypt_mix_word_t *)(X + (chunk_bytes * i)), (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V.ptr, N, r);
|
||||
|
||||
/* 3: Out = PBKDF2(password, X) */
|
||||
scrypt_pbkdf2(password, password_len, X, chunk_bytes * p, 1, out, bytes);
|
||||
|
||||
scrypt_ensure_zero(YX.ptr, (p + 1) * chunk_bytes);
|
||||
|
||||
scrypt_free(&V);
|
||||
scrypt_free(&YX);
|
||||
}
|
||||
|
||||
#define Nfactor 8
|
||||
#define rfactor 0
|
||||
#define pfactor 0
|
||||
#if (SCRYPT_BLOCK_BYTES == 64)
|
||||
#define chunk_bytes 128
|
||||
#elif (SCRYPT_BLOCK_BYTES == 128)
|
||||
#define chunk_bytes 256
|
||||
#elif (SCRYPT_BLOCK_BYTES == 256)
|
||||
#define chunk_bytes 512
|
||||
#elif (SCRYPT_BLOCK_BYTES == 512)
|
||||
#define chunk_bytes 1024
|
||||
#endif
|
||||
|
||||
void my_scrypt(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t *out)
|
||||
{
|
||||
scrypt_aligned_alloc YX, V;
|
||||
uint8_t *X, *Y;
|
||||
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix();
|
||||
#endif
|
||||
|
||||
/*
|
||||
#if !defined(SCRYPT_TEST)
|
||||
static int power_on_self_test = 0;
|
||||
if (!power_on_self_test) {
|
||||
power_on_self_test = 1;
|
||||
if (!scrypt_power_on_self_test())
|
||||
scrypt_fatal_error("scrypt: power on self test failed");
|
||||
}
|
||||
#endif
|
||||
*/
|
||||
V = scrypt_alloc((uint64_t)512 * chunk_bytes);
|
||||
YX = scrypt_alloc(2 * chunk_bytes);
|
||||
|
||||
/* 1: X = PBKDF2(password, salt) */
|
||||
Y = YX.ptr;
|
||||
X = Y + chunk_bytes;
|
||||
scrypt_pbkdf2(password, password_len, salt, salt_len, 1, X, chunk_bytes);
|
||||
|
||||
/* 2: X = ROMix(X) */
|
||||
scrypt_ROMix((scrypt_mix_word_t *)X, (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V.ptr, 512, 1);
|
||||
|
||||
/* 3: Out = PBKDF2(password, X) */
|
||||
scrypt_pbkdf2(password, password_len, X, chunk_bytes, 1, out, 32);
|
||||
|
||||
scrypt_ensure_zero(YX.ptr, 2 * chunk_bytes);
|
||||
|
||||
scrypt_free(&V);
|
||||
scrypt_free(&YX);
|
||||
}
|
||||
|
||||
#if defined( _WINDOWS )
|
||||
#if !defined( QT_GUI )
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
#endif
|
||||
@@ -1,35 +0,0 @@
#ifndef AR2_SCRYPT_JANE_H
#define AR2_SCRYPT_JANE_H

#ifdef _MSC_VER
#undef SCRYPT_CHOOSE_COMPILETIME
#endif
//#define SCRYPT_TEST
#define SCRYPT_SKEIN512
#define SCRYPT_SALSA64

/*
Nfactor: Increases CPU & Memory Hardness
N = (1 << (Nfactor + 1)): How many times to mix a chunk and how many temporary chunks are used

rfactor: Increases Memory Hardness
r = (1 << rfactor): How large a chunk is

pfactor: Increases CPU Hardness
p = (1 << pfactor): Number of times to mix the main chunk

A block is the basic mixing unit (salsa/chacha block = 64 bytes)
A chunk is (2 * r) blocks

~Memory used = (N + 2) * ((2 * r) * block size)
*/

#include <stdlib.h>
#include <stdint.h>

typedef void (*scrypt_fatal_errorfn)(const char *msg);
void scrypt_set_fatal_error(scrypt_fatal_errorfn fn);

void scrypt(const unsigned char *password, size_t password_len, const unsigned char *salt, size_t salt_len, unsigned char Nfactor, unsigned char rfactor, unsigned char pfactor, unsigned char *out, size_t bytes);
void my_scrypt(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t *out);
#endif /* AR2_SCRYPT_JANE_H */
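As a worked example of the memory formula in the comment above, using the fixed constants from ar2-scrypt-jane.c in this same commit (Nfactor = 8, rfactor = 0, 64-byte blocks): N = 1 << (8 + 1) = 512, r = 1, a chunk is 2 * 1 * 64 = 128 bytes, so memory used is roughly (512 + 2) * 128 = 65,792 bytes, matching the 512 * chunk_bytes plus 2 * chunk_bytes allocations in my_scrypt(). The caller below is a minimal illustrative sketch, not code from the repository; it only assumes the my_scrypt() prototype declared above, which always writes a 32-byte digest.

#include <stdint.h>
#include <string.h>
#include "ar2-scrypt-jane.h"

int main( void )
{
   const uint8_t pwd[]  = "password";
   const uint8_t salt[] = "salt";
   uint8_t out[32];   /* my_scrypt always produces 32 output bytes */

   my_scrypt( pwd, strlen( (const char*)pwd ),
              salt, strlen( (const char*)salt ), out );
   return 0;
}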
@@ -1,284 +0,0 @@
|
||||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <limits.h>
|
||||
|
||||
#include "argon2.h"
|
||||
#include "cores.h"
|
||||
|
||||
/* Error messages */
|
||||
static const char *Argon2_ErrorMessage[] = {
|
||||
/*{ARGON2_OK, */ "OK",
|
||||
/*},
|
||||
|
||||
{ARGON2_OUTPUT_PTR_NULL, */ "Output pointer is NULL",
|
||||
/*},
|
||||
|
||||
{ARGON2_OUTPUT_TOO_SHORT, */ "Output is too short",
|
||||
/*},
|
||||
{ARGON2_OUTPUT_TOO_LONG, */ "Output is too long",
|
||||
/*},
|
||||
|
||||
{ARGON2_PWD_TOO_SHORT, */ "Password is too short",
|
||||
/*},
|
||||
{ARGON2_PWD_TOO_LONG, */ "Password is too long",
|
||||
/*},
|
||||
|
||||
{ARGON2_SALT_TOO_SHORT, */ "Salt is too short",
|
||||
/*},
|
||||
{ARGON2_SALT_TOO_LONG, */ "Salt is too long",
|
||||
/*},
|
||||
|
||||
{ARGON2_AD_TOO_SHORT, */ "Associated data is too short",
|
||||
/*},
|
||||
{ARGON2_AD_TOO_LONG, */ "Associated date is too long",
|
||||
/*},
|
||||
|
||||
{ARGON2_SECRET_TOO_SHORT, */ "Secret is too short",
|
||||
/*},
|
||||
{ARGON2_SECRET_TOO_LONG, */ "Secret is too long",
|
||||
/*},
|
||||
|
||||
{ARGON2_TIME_TOO_SMALL, */ "Time cost is too small",
|
||||
/*},
|
||||
{ARGON2_TIME_TOO_LARGE, */ "Time cost is too large",
|
||||
/*},
|
||||
|
||||
{ARGON2_MEMORY_TOO_LITTLE, */ "Memory cost is too small",
|
||||
/*},
|
||||
{ARGON2_MEMORY_TOO_MUCH, */ "Memory cost is too large",
|
||||
/*},
|
||||
|
||||
{ARGON2_LANES_TOO_FEW, */ "Too few lanes",
|
||||
/*},
|
||||
{ARGON2_LANES_TOO_MANY, */ "Too many lanes",
|
||||
/*},
|
||||
|
||||
{ARGON2_PWD_PTR_MISMATCH, */ "Password pointer is NULL, but password length is not 0",
|
||||
/*},
|
||||
{ARGON2_SALT_PTR_MISMATCH, */ "Salt pointer is NULL, but salt length is not 0",
|
||||
/*},
|
||||
{ARGON2_SECRET_PTR_MISMATCH, */ "Secret pointer is NULL, but secret length is not 0",
|
||||
/*},
|
||||
{ARGON2_AD_PTR_MISMATCH, */ "Associated data pointer is NULL, but ad length is not 0",
|
||||
/*},
|
||||
|
||||
{ARGON2_MEMORY_ALLOCATION_ERROR, */ "Memory allocation error",
|
||||
/*},
|
||||
|
||||
{ARGON2_FREE_MEMORY_CBK_NULL, */ "The free memory callback is NULL",
|
||||
/*},
|
||||
{ARGON2_ALLOCATE_MEMORY_CBK_NULL, */ "The allocate memory callback is NULL",
|
||||
/*},
|
||||
|
||||
{ARGON2_INCORRECT_PARAMETER, */ "Argon2_Context context is NULL",
|
||||
/*},
|
||||
{ARGON2_INCORRECT_TYPE, */ "There is no such version of Argon2",
|
||||
/*},
|
||||
|
||||
{ARGON2_OUT_PTR_MISMATCH, */ "Output pointer mismatch",
|
||||
/*},
|
||||
|
||||
{ARGON2_THREADS_TOO_FEW, */ "Not enough threads",
|
||||
/*},
|
||||
{ARGON2_THREADS_TOO_MANY, */ "Too many threads",
|
||||
/*},
|
||||
{ARGON2_MISSING_ARGS, */ "Missing arguments", /*},*/
|
||||
};
|
||||
|
||||
int argon2d(argon2_context *context) { return ar2_argon2_core(context, Argon2_d); }
|
||||
|
||||
int argon2i(argon2_context *context) { return ar2_argon2_core(context, Argon2_i); }
|
||||
|
||||
int ar2_verify_d(argon2_context *context, const char *hash)
|
||||
{
|
||||
int result;
|
||||
/*if (0 == context->outlen || NULL == hash) {
|
||||
return ARGON2_OUT_PTR_MISMATCH;
|
||||
}*/
|
||||
|
||||
result = ar2_argon2_core(context, Argon2_d);
|
||||
|
||||
if (ARGON2_OK != result) {
|
||||
return result;
|
||||
}
|
||||
|
||||
return 0 == memcmp(hash, context->out, 32);
|
||||
}
|
||||
|
||||
const char *error_message(int error_code)
|
||||
{
|
||||
enum {
|
||||
/* Make sure---at compile time---that the enum size matches the array
|
||||
size */
|
||||
ERROR_STRING_CHECK =
|
||||
1 /
|
||||
!!((sizeof(Argon2_ErrorMessage) / sizeof(Argon2_ErrorMessage[0])) ==
|
||||
ARGON2_ERROR_CODES_LENGTH)
|
||||
};
|
||||
if (error_code < ARGON2_ERROR_CODES_LENGTH) {
|
||||
return Argon2_ErrorMessage[(argon2_error_codes)error_code];
|
||||
}
|
||||
return "Unknown error code.";
|
||||
}
|
||||
|
||||
/* encoding/decoding helpers */
|
||||
|
||||
/*
|
||||
* Some macros for constant-time comparisons. These work over values in
|
||||
* the 0..255 range. Returned value is 0x00 on "false", 0xFF on "true".
|
||||
*/
|
||||
#define EQ(x, y) ((((0U - ((unsigned)(x) ^ (unsigned)(y))) >> 8) & 0xFF) ^ 0xFF)
|
||||
#define GT(x, y) ((((unsigned)(y) - (unsigned)(x)) >> 8) & 0xFF)
|
||||
#define GE(x, y) (GT(y, x) ^ 0xFF)
|
||||
#define LT(x, y) GT(y, x)
|
||||
#define LE(x, y) GE(y, x)
|
||||
|
||||
/*
|
||||
* Convert value x (0..63) to corresponding Base64 character.
|
||||
*/
|
||||
static int b64_byte_to_char(unsigned x) {
|
||||
//static inline int b64_byte_to_char(unsigned x) {
|
||||
return (LT(x, 26) & (x + 'A')) |
|
||||
(GE(x, 26) & LT(x, 52) & (x + ('a' - 26))) |
|
||||
(GE(x, 52) & LT(x, 62) & (x + ('0' - 52))) | (EQ(x, 62) & '+') |
|
||||
(EQ(x, 63) & '/');
|
||||
}
|
||||
|
||||
/*
|
||||
* Convert some bytes to Base64. 'dst_len' is the length (in characters)
|
||||
* of the output buffer 'dst'; if that buffer is not large enough to
|
||||
* receive the result (including the terminating 0), then (size_t)-1
|
||||
* is returned. Otherwise, the zero-terminated Base64 string is written
|
||||
* in the buffer, and the output length (counted WITHOUT the terminating
|
||||
* zero) is returned.
|
||||
*/
|
||||
static size_t to_base64(char *dst, size_t dst_len, const void *src)
|
||||
{
|
||||
size_t olen;
|
||||
const unsigned char *buf;
|
||||
unsigned acc, acc_len;
|
||||
|
||||
olen = 43;
|
||||
/*switch (32 % 3) {
|
||||
case 2:
|
||||
olen++;*/
|
||||
/* fall through */
|
||||
/*case 1:
|
||||
olen += 2;
|
||||
break;
|
||||
}*/
|
||||
if (dst_len <= olen) {
|
||||
return (size_t)-1;
|
||||
}
|
||||
acc = 0;
|
||||
acc_len = 0;
|
||||
buf = (const unsigned char *)src;
|
||||
size_t src_len = 32;
|
||||
while (src_len-- > 0) {
|
||||
acc = (acc << 8) + (*buf++);
|
||||
acc_len += 8;
|
||||
while (acc_len >= 6) {
|
||||
acc_len -= 6;
|
||||
*dst++ = b64_byte_to_char((acc >> acc_len) & 0x3F);
|
||||
}
|
||||
}
|
||||
if (acc_len > 0) {
|
||||
*dst++ = b64_byte_to_char((acc << (6 - acc_len)) & 0x3F);
|
||||
}
|
||||
*dst++ = 0;
|
||||
return olen;
|
||||
}
|
||||
|
||||
/* ==================================================================== */
|
||||
/*
|
||||
* Code specific to Argon2i.
|
||||
*
|
||||
* The code below applies the following format:
|
||||
*
|
||||
* $argon2i$m=<num>,t=<num>,p=<num>[,keyid=<bin>][,data=<bin>][$<bin>[$<bin>]]
|
||||
*
|
||||
* where <num> is a decimal integer (positive, fits in an 'unsigned long')
|
||||
* and <bin> is Base64-encoded data (no '=' padding characters, no newline
|
||||
* or whitespace). The "keyid" is a binary identifier for a key (up to 8
|
||||
* bytes); "data" is associated data (up to 32 bytes). When the 'keyid'
|
||||
* (resp. the 'data') is empty, then it is ommitted from the output.
|
||||
*
|
||||
* The last two binary chunks (encoded in Base64) are, in that order,
|
||||
* the salt and the output. Both are optional, but you cannot have an
|
||||
* output without a salt. The binary salt length is between 8 and 48 bytes.
|
||||
* The output length is always exactly 32 bytes.
|
||||
*/
|
||||
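/* Illustration (not from the repository): with the hard-coded costs used by ar2_encode_string() below (m=16, t=2, p=1), and with to_base64() always encoding exactly 32 bytes into 43 Base64 characters, the produced string has the shape
   $argon2i$m=16,t=2,p=1$<43-char salt>$<43-char hash>
   where the two placeholders stand for the Base64 encodings of the salt and of the output. */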
|
||||
int ar2_encode_string(char *dst, size_t dst_len, argon2_context *ctx)
|
||||
{
|
||||
#define SS(str) \
|
||||
do { \
|
||||
size_t pp_len = strlen(str); \
|
||||
if (pp_len >= dst_len) { \
|
||||
return 0; \
|
||||
} \
|
||||
memcpy(dst, str, pp_len + 1); \
|
||||
dst += pp_len; \
|
||||
dst_len -= pp_len; \
|
||||
} while (0)
|
||||
|
||||
#define SX(x) \
|
||||
do { \
|
||||
char tmp[30]; \
|
||||
sprintf(tmp, "%lu", (unsigned long)(x)); \
|
||||
SS(tmp); \
|
||||
} while (0);
|
||||
|
||||
#define SB(buf) \
|
||||
do { \
|
||||
size_t sb_len = to_base64(dst, dst_len, buf); \
|
||||
if (sb_len == (size_t)-1) { \
|
||||
return 0; \
|
||||
} \
|
||||
dst += sb_len; \
|
||||
dst_len -= sb_len; \
|
||||
} while (0);
|
||||
|
||||
SS("$argon2i$m=");
|
||||
SX(16);
|
||||
SS(",t=");
|
||||
SX(2);
|
||||
SS(",p=");
|
||||
SX(1);
|
||||
|
||||
/*if (ctx->adlen > 0) {
|
||||
SS(",data=");
|
||||
SB(ctx->ad, ctx->adlen);
|
||||
}*/
|
||||
|
||||
/*if (ctx->saltlen == 0)
|
||||
return 1;*/
|
||||
|
||||
SS("$");
|
||||
SB(ctx->salt);
|
||||
|
||||
/*if (ctx->outlen32 == 0)
|
||||
return 1;*/
|
||||
|
||||
SS("$");
|
||||
SB(ctx->out);
|
||||
return 1;
|
||||
|
||||
#undef SS
|
||||
#undef SX
|
||||
#undef SB
|
||||
}
|
||||
@@ -1,292 +0,0 @@
|
||||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
#ifndef ARGON2_H
|
||||
#define ARGON2_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <limits.h>
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*************************Argon2 input parameter
|
||||
* restrictions**************************************************/
|
||||
|
||||
/* Minimum and maximum number of lanes (degree of parallelism) */
|
||||
#define ARGON2_MIN_LANES UINT32_C(1)
|
||||
#define ARGON2_MAX_LANES UINT32_C(0xFFFFFF)
|
||||
|
||||
/* Minimum and maximum number of threads */
|
||||
#define ARGON2_MIN_THREADS UINT32_C(1)
|
||||
#define ARGON2_MAX_THREADS UINT32_C(0xFFFFFF)
|
||||
|
||||
/* Number of synchronization points between lanes per pass */
|
||||
#define ARGON2_SYNC_POINTS UINT32_C(4)
|
||||
|
||||
/* Minimum and maximum digest size in bytes */
|
||||
#define ARGON2_MIN_OUTLEN UINT32_C(4)
|
||||
#define ARGON2_MAX_OUTLEN UINT32_C(0xFFFFFFFF)
|
||||
|
||||
/* Minimum and maximum number of memory blocks (each of BLOCK_SIZE bytes) */
|
||||
#define ARGON2_MIN_MEMORY (2 * ARGON2_SYNC_POINTS) /* 2 blocks per slice */
|
||||
|
||||
#define ARGON2_MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
/* Max memory size is half the addressing space, topping at 2^32 blocks (4 TB)
|
||||
*/
|
||||
#define ARGON2_MAX_MEMORY_BITS \
|
||||
ARGON2_MIN(UINT32_C(32), (sizeof(void *) * CHAR_BIT - 10 - 1))
|
||||
#define ARGON2_MAX_MEMORY \
|
||||
ARGON2_MIN(UINT32_C(0xFFFFFFFF), UINT64_C(1) << ARGON2_MAX_MEMORY_BITS)
|
||||
|
||||
/* Minimum and maximum number of passes */
|
||||
#define ARGON2_MIN_TIME UINT32_C(1)
|
||||
#define ARGON2_MAX_TIME UINT32_C(0xFFFFFFFF)
|
||||
|
||||
/* Minimum and maximum password length in bytes */
|
||||
#define ARGON2_MIN_PWD_LENGTH UINT32_C(0)
|
||||
#define ARGON2_MAX_PWD_LENGTH UINT32_C(0xFFFFFFFF)
|
||||
|
||||
/* Minimum and maximum associated data length in bytes */
|
||||
#define ARGON2_MIN_AD_LENGTH UINT32_C(0)
|
||||
#define ARGON2_MAX_AD_LENGTH UINT32_C(0xFFFFFFFF)
|
||||
|
||||
/* Minimum and maximum salt length in bytes */
|
||||
#define ARGON2_MIN_SALT_LENGTH UINT32_C(8)
|
||||
#define ARGON2_MAX_SALT_LENGTH UINT32_C(0xFFFFFFFF)
|
||||
|
||||
/* Minimum and maximum key length in bytes */
|
||||
#define ARGON2_MIN_SECRET UINT32_C(0)
|
||||
#define ARGON2_MAX_SECRET UINT32_C(0xFFFFFFFF)
|
||||
|
||||
#define ARGON2_FLAG_CLEAR_PASSWORD (UINT32_C(1) << 0)
|
||||
#define ARGON2_FLAG_CLEAR_SECRET (UINT32_C(1) << 1)
|
||||
#define ARGON2_FLAG_CLEAR_MEMORY (UINT32_C(1) << 2)
|
||||
#define ARGON2_DEFAULT_FLAGS \
|
||||
(ARGON2_FLAG_CLEAR_PASSWORD | ARGON2_FLAG_CLEAR_MEMORY)
|
||||
|
||||
/* Error codes */
|
||||
typedef enum Argon2_ErrorCodes {
|
||||
ARGON2_OK = 0,
|
||||
|
||||
ARGON2_OUTPUT_PTR_NULL = 1,
|
||||
|
||||
ARGON2_OUTPUT_TOO_SHORT = 2,
|
||||
ARGON2_OUTPUT_TOO_LONG = 3,
|
||||
|
||||
ARGON2_PWD_TOO_SHORT = 4,
|
||||
ARGON2_PWD_TOO_LONG = 5,
|
||||
|
||||
ARGON2_SALT_TOO_SHORT = 6,
|
||||
ARGON2_SALT_TOO_LONG = 7,
|
||||
|
||||
ARGON2_AD_TOO_SHORT = 8,
|
||||
ARGON2_AD_TOO_LONG = 9,
|
||||
|
||||
ARGON2_SECRET_TOO_SHORT = 10,
|
||||
ARGON2_SECRET_TOO_LONG = 11,
|
||||
|
||||
ARGON2_TIME_TOO_SMALL = 12,
|
||||
ARGON2_TIME_TOO_LARGE = 13,
|
||||
|
||||
ARGON2_MEMORY_TOO_LITTLE = 14,
|
||||
ARGON2_MEMORY_TOO_MUCH = 15,
|
||||
|
||||
ARGON2_LANES_TOO_FEW = 16,
|
||||
ARGON2_LANES_TOO_MANY = 17,
|
||||
|
||||
ARGON2_PWD_PTR_MISMATCH = 18, /* NULL ptr with non-zero length */
|
||||
ARGON2_SALT_PTR_MISMATCH = 19, /* NULL ptr with non-zero length */
|
||||
ARGON2_SECRET_PTR_MISMATCH = 20, /* NULL ptr with non-zero length */
|
||||
ARGON2_AD_PTR_MISMATCH = 21, /* NULL ptr with non-zero length */
|
||||
|
||||
ARGON2_MEMORY_ALLOCATION_ERROR = 22,
|
||||
|
||||
ARGON2_FREE_MEMORY_CBK_NULL = 23,
|
||||
ARGON2_ALLOCATE_MEMORY_CBK_NULL = 24,
|
||||
|
||||
ARGON2_INCORRECT_PARAMETER = 25,
|
||||
ARGON2_INCORRECT_TYPE = 26,
|
||||
|
||||
ARGON2_OUT_PTR_MISMATCH = 27,
|
||||
|
||||
ARGON2_THREADS_TOO_FEW = 28,
|
||||
ARGON2_THREADS_TOO_MANY = 29,
|
||||
|
||||
ARGON2_MISSING_ARGS = 30,
|
||||
|
||||
ARGON2_ERROR_CODES_LENGTH /* Do NOT remove; Do NOT add error codes after
|
||||
this
|
||||
error code */
|
||||
} argon2_error_codes;
|
||||
|
||||
/* Memory allocator types --- for external allocation */
|
||||
typedef int (*allocate_fptr)(uint8_t **memory, size_t bytes_to_allocate);
|
||||
typedef void (*deallocate_fptr)(uint8_t *memory, size_t bytes_to_allocate);
|
||||
|
||||
/* Argon2 external data structures */
|
||||
|
||||
/*
|
||||
*****Context: structure to hold Argon2 inputs:
|
||||
* output array and its length,
|
||||
* password and its length,
|
||||
* salt and its length,
|
||||
* secret and its length,
|
||||
* associated data and its length,
|
||||
* number of passes, amount of used memory (in KBytes, can be rounded up a bit)
|
||||
* number of parallel threads that will be run.
|
||||
* All the parameters above affect the output hash value.
|
||||
* Additionally, two function pointers can be provided to allocate and
|
||||
deallocate the memory (if NULL, memory will be allocated internally).
|
||||
* Also, three flags indicate whether to erase password, secret as soon as they
|
||||
are pre-hashed (and thus not needed anymore), and the entire memory
|
||||
****************************
|
||||
Simplest situation: you have output array out[8], password is stored in
|
||||
pwd[32], salt is stored in salt[16], you do not have keys nor associated data.
|
||||
You need to spend 1 GB of RAM and you run 5 passes of Argon2d with 4 parallel
|
||||
lanes.
|
||||
You want to erase the password, but you're OK with last pass not being erased.
|
||||
You want to use the default memory allocator.
|
||||
*/
|
||||
typedef struct Argon2_Context {
|
||||
uint8_t *out; /* output array */
|
||||
uint8_t *pwd; /* password array */
|
||||
uint8_t *salt; /* salt array */
|
||||
/*uint8_t *secret;*/ /* key array */
|
||||
/*uint8_t *ad;*/ /* associated data array */
|
||||
|
||||
allocate_fptr allocate_cbk; /* pointer to memory allocator */
|
||||
deallocate_fptr free_cbk; /* pointer to memory deallocator */
|
||||
|
||||
/*uint32_t outlen;*/ /* digest length */
|
||||
uint32_t pwdlen; /* password length */
|
||||
/*uint32_t saltlen;*/ /* salt length */
|
||||
/*uint32_t secretlen;*/ /* key length */
|
||||
/*uint32_t adlen;*/ /* associated data length */
|
||||
/*uint32_t t_cost;*/ /* number of passes */
|
||||
/*uint32_t m_cost;*/ /* amount of memory requested (KB) */
|
||||
/*uint32_t lanes;*/ /* number of lanes */
|
||||
/*uint32_t threads;*/ /* maximum number of threads */
|
||||
/*uint32_t flags;*/ /* array of bool options */
|
||||
|
||||
} argon2_context;
|
||||
|
||||
/**
|
||||
* Function to hash the inputs in the memory-hard fashion (uses Argon2i)
|
||||
* @param out Pointer to the memory where the hash digest will be written
|
||||
* @param outlen Digest length in bytes
|
||||
* @param in Pointer to the input (password)
|
||||
* @param inlen Input length in bytes
|
||||
* @param salt Pointer to the salt
|
||||
* @param saltlen Salt length in bytes
|
||||
* @pre @a out must have at least @a outlen bytes allocated
|
||||
* @pre @a in must be at least @inlen bytes long
|
||||
* @pre @a saltlen must be at least @saltlen bytes long
|
||||
* @return Zero if successful, 1 otherwise.
|
||||
*/
|
||||
/*int hash_argon2i(void *out, size_t outlen, const void *in, size_t inlen,
|
||||
const void *salt, size_t saltlen, unsigned int t_cost,
|
||||
unsigned int m_cost);*/
|
||||
|
||||
/* same for argon2d */
|
||||
/*int hash_argon2d(void *out, size_t outlen, const void *in, size_t inlen,
|
||||
const void *salt, size_t saltlen, unsigned int t_cost,
|
||||
unsigned int m_cost);*/
|
||||
|
||||
/*
|
||||
* **************Argon2d: Version of Argon2 that picks memory blocks depending
|
||||
* on the password and salt. Only for side-channel-free
|
||||
* environment!!***************
|
||||
* @param context Pointer to current Argon2 context
|
||||
* @return Zero if successful, a non zero error code otherwise
|
||||
*/
|
||||
int argon2d(argon2_context *context);
|
||||
|
||||
/*
|
||||
* * **************Argon2i: Version of Argon2 that picks memory blocks
|
||||
*independent on the password and salt. Good for side-channels,
|
||||
******************* but worse w.r.t. tradeoff attacks if
|
||||
*******************only one pass is used***************
|
||||
* @param context Pointer to current Argon2 context
|
||||
* @return Zero if successful, a non zero error code otherwise
|
||||
*/
|
||||
int argon2i(argon2_context *context);
|
||||
|
||||
/*
|
||||
* * **************Argon2di: Reserved name***************
|
||||
* @param context Pointer to current Argon2 context
|
||||
* @return Zero if successful, a non zero error code otherwise
|
||||
*/
|
||||
int argon2di(argon2_context *context);
|
||||
|
||||
/*
|
||||
* * **************Argon2ds: Argon2d hardened against GPU attacks, 20%
|
||||
* slower***************
|
||||
* @param context Pointer to current Argon2 context
|
||||
* @return Zero if successful, a non zero error code otherwise
|
||||
*/
|
||||
int argon2ds(argon2_context *context);
|
||||
|
||||
/*
|
||||
* * **************Argon2id: First half-pass over memory is
|
||||
*password-independent, the rest are password-dependent
|
||||
********************OK against side channels: they reduce to 1/2-pass
|
||||
*Argon2i***************
|
||||
* @param context Pointer to current Argon2 context
|
||||
* @return Zero if successful, a non zero error code otherwise
|
||||
*/
|
||||
int argon2id(argon2_context *context);
|
||||
|
||||
/*
|
||||
* Verify if a given password is correct for Argon2d hashing
|
||||
* @param context Pointer to current Argon2 context
|
||||
* @param hash The password hash to verify. The length of the hash is
|
||||
* specified by the context outlen member
|
||||
* @return Zero if successful, a non zero error code otherwise
|
||||
*/
|
||||
int ar2_verify_d(argon2_context *context, const char *hash);
|
||||
|
||||
/*
|
||||
* Get the associated error message for given error code
|
||||
* @return The error message associated with the given error code
|
||||
*/
|
||||
const char *error_message(int error_code);
|
||||
|
||||
/* ==================================================================== */
|
||||
/*
|
||||
* Code specific to Argon2i.
|
||||
*
|
||||
* The code below applies the following format:
|
||||
*
|
||||
* $argon2i$m=<num>,t=<num>,p=<num>[,keyid=<bin>][,data=<bin>][$<bin>[$<bin>]]
|
||||
*
|
||||
* where <num> is a decimal integer (positive, fits in an 'unsigned long')
|
||||
* and <bin> is Base64-encoded data (no '=' padding characters, no newline
|
||||
* or whitespace). The "keyid" is a binary identifier for a key (up to 8
|
||||
* bytes); "data" is associated data (up to 32 bytes). When the 'keyid'
|
||||
* (resp. the 'data') is empty, then it is ommitted from the output.
|
||||
*
|
||||
* The last two binary chunks (encoded in Base64) are, in that order,
|
||||
* the salt and the output. Both are optional, but you cannot have an
|
||||
* output without a salt. The binary salt length is between 8 and 48 bytes.
|
||||
* The output length is always exactly 32 bytes.
|
||||
*/
|
||||
|
||||
int ar2_encode_string(char *dst, size_t dst_len, argon2_context *ctx);
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -1,114 +0,0 @@
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
#include "argon2.h"
|
||||
|
||||
static uint64_t rdtsc(void)
|
||||
{
|
||||
#ifdef _MSC_VER
|
||||
return __rdtsc();
|
||||
#else
|
||||
#if defined(__amd64__) || defined(__x86_64__)
|
||||
uint64_t rax, rdx;
|
||||
__asm__ __volatile__("rdtsc" : "=a"(rax), "=d"(rdx) : :);
|
||||
return (rdx << 32) | rax;
|
||||
#elif defined(__i386__) || defined(__i386) || defined(__X86__)
|
||||
uint64_t rax;
|
||||
__asm__ __volatile__("rdtsc" : "=A"(rax) : :);
|
||||
return rax;
|
||||
#else
|
||||
#error "Not implemented!"
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* Benchmarks Argon2 with salt length 16, password length 16, t_cost 1,
|
||||
and different m_cost and threads
|
||||
*/
|
||||
static void benchmark()
|
||||
{
|
||||
#define BENCH_OUTLEN 16
|
||||
#define BENCH_INLEN 16
|
||||
const uint32_t inlen = BENCH_INLEN;
|
||||
const unsigned outlen = BENCH_OUTLEN;
|
||||
unsigned char out[BENCH_OUTLEN];
|
||||
unsigned char pwd_array[BENCH_INLEN];
|
||||
unsigned char salt_array[BENCH_INLEN];
|
||||
#undef BENCH_INLEN
|
||||
#undef BENCH_OUTLEN
|
||||
|
||||
uint32_t t_cost = 1;
|
||||
uint32_t m_cost;
|
||||
uint32_t thread_test[6] = {1, 2, 4, 6, 8, 16};
|
||||
|
||||
memset(pwd_array, 0, inlen);
|
||||
memset(salt_array, 1, inlen);
|
||||
|
||||
for (m_cost = (uint32_t)1 << 10; m_cost <= (uint32_t)1 << 22; m_cost *= 2) {
|
||||
unsigned i;
|
||||
for (i = 0; i < 6; ++i) {
|
||||
argon2_context context;
|
||||
uint32_t thread_n = thread_test[i];
|
||||
uint64_t stop_cycles, stop_cycles_i;
|
||||
clock_t stop_time;
|
||||
uint64_t delta_d, delta_i;
|
||||
double mcycles_d, mcycles_i, run_time;
|
||||
|
||||
clock_t start_time = clock();
|
||||
uint64_t start_cycles = rdtsc();
|
||||
|
||||
context.out = out;
|
||||
context.outlen = outlen;
|
||||
context.pwd = pwd_array;
|
||||
context.pwdlen = inlen;
|
||||
context.salt = salt_array;
|
||||
context.saltlen = inlen;
|
||||
context.secret = NULL;
|
||||
context.secretlen = 0;
|
||||
context.ad = NULL;
|
||||
context.adlen = 0;
|
||||
context.t_cost = t_cost;
|
||||
context.m_cost = m_cost;
|
||||
context.lanes = thread_n;
|
||||
context.threads = thread_n;
|
||||
context.allocate_cbk = NULL;
|
||||
context.free_cbk = NULL;
|
||||
context.flags = 0;
|
||||
|
||||
argon2d(&context);
|
||||
stop_cycles = rdtsc();
|
||||
argon2i(&context);
|
||||
stop_cycles_i = rdtsc();
|
||||
stop_time = clock();
|
||||
|
||||
delta_d = (stop_cycles - start_cycles) / (m_cost);
|
||||
delta_i = (stop_cycles_i - stop_cycles) / (m_cost);
|
||||
mcycles_d = (double)(stop_cycles - start_cycles) / (1UL << 20);
|
||||
mcycles_i = (double)(stop_cycles_i - stop_cycles) / (1UL << 20);
|
||||
printf("Argon2d %d iterations %d MiB %d threads: %2.2f cpb %2.2f "
|
||||
"Mcycles \n",
|
||||
t_cost, m_cost >> 10, thread_n, (float)delta_d / 1024,
|
||||
mcycles_d);
|
||||
printf("Argon2i %d iterations %d MiB %d threads: %2.2f cpb %2.2f "
|
||||
"Mcycles \n",
|
||||
t_cost, m_cost >> 10, thread_n, (float)delta_i / 1024,
|
||||
mcycles_i);
|
||||
|
||||
run_time = ((double)stop_time - start_time) / (CLOCKS_PER_SEC);
|
||||
printf("%2.4f seconds\n\n", run_time);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
benchmark();
|
||||
return ARGON2_OK;
|
||||
}
|
||||
@@ -1,143 +0,0 @@
|
||||
#ifndef PORTABLE_BLAKE2_IMPL_H
|
||||
#define PORTABLE_BLAKE2_IMPL_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#define BLAKE2_INLINE __inline
|
||||
#elif defined(__GNUC__) || defined(__clang__)
|
||||
#define BLAKE2_INLINE __inline__
|
||||
#else
|
||||
#define BLAKE2_INLINE
|
||||
#endif
|
||||
|
||||
/* Argon2 Team - Begin Code */
|
||||
/*
|
||||
Not an exhaustive list, but should cover the majority of modern platforms
|
||||
Additionally, the code will always be correct---this is only a performance
|
||||
tweak.
|
||||
*/
|
||||
#if (defined(__BYTE_ORDER__) && \
|
||||
(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \
|
||||
defined(__LITTLE_ENDIAN__) || defined(__ARMEL__) || defined(__MIPSEL__) || \
|
||||
defined(__AARCH64EL__) || defined(__amd64__) || defined(__i386__) || \
|
||||
defined(_M_IX86) || defined(_M_X64) || defined(_M_AMD64) || \
|
||||
defined(_M_ARM)
|
||||
#define NATIVE_LITTLE_ENDIAN
|
||||
#endif
|
||||
/* Argon2 Team - End Code */
|
||||
|
||||
static BLAKE2_INLINE uint32_t load32(const void *src) {
|
||||
#if defined(NATIVE_LITTLE_ENDIAN)
|
||||
uint32_t w;
|
||||
memcpy(&w, src, sizeof w);
|
||||
return w;
|
||||
#else
|
||||
const uint8_t *p = (const uint8_t *)src;
|
||||
uint32_t w = *p++;
|
||||
w |= (uint32_t)(*p++) << 8;
|
||||
w |= (uint32_t)(*p++) << 16;
|
||||
w |= (uint32_t)(*p++) << 24;
|
||||
return w;
|
||||
#endif
|
||||
}
|
||||
|
||||
static BLAKE2_INLINE uint64_t load64(const void *src) {
|
||||
#if defined(NATIVE_LITTLE_ENDIAN)
|
||||
uint64_t w;
|
||||
memcpy(&w, src, sizeof w);
|
||||
return w;
|
||||
#else
|
||||
const uint8_t *p = (const uint8_t *)src;
|
||||
uint64_t w = *p++;
|
||||
w |= (uint64_t)(*p++) << 8;
|
||||
w |= (uint64_t)(*p++) << 16;
|
||||
w |= (uint64_t)(*p++) << 24;
|
||||
w |= (uint64_t)(*p++) << 32;
|
||||
w |= (uint64_t)(*p++) << 40;
|
||||
w |= (uint64_t)(*p++) << 48;
|
||||
w |= (uint64_t)(*p++) << 56;
|
||||
return w;
|
||||
#endif
|
||||
}
|
||||
|
||||
static BLAKE2_INLINE void store32(void *dst, uint32_t w) {
|
||||
#if defined(NATIVE_LITTLE_ENDIAN)
|
||||
memcpy(dst, &w, sizeof w);
|
||||
#else
|
||||
uint8_t *p = (uint8_t *)dst;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
#endif
|
||||
}
|
||||
|
||||
static BLAKE2_INLINE void store64(void *dst, uint64_t w) {
|
||||
#if defined(NATIVE_LITTLE_ENDIAN)
|
||||
memcpy(dst, &w, sizeof w);
|
||||
#else
|
||||
uint8_t *p = (uint8_t *)dst;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
#endif
|
||||
}
|
||||
|
||||
static BLAKE2_INLINE uint64_t load48(const void *src) {
|
||||
const uint8_t *p = (const uint8_t *)src;
|
||||
uint64_t w = *p++;
|
||||
w |= (uint64_t)(*p++) << 8;
|
||||
w |= (uint64_t)(*p++) << 16;
|
||||
w |= (uint64_t)(*p++) << 24;
|
||||
w |= (uint64_t)(*p++) << 32;
|
||||
w |= (uint64_t)(*p++) << 40;
|
||||
return w;
|
||||
}
|
||||
|
||||
static BLAKE2_INLINE void store48(void *dst, uint64_t w) {
|
||||
uint8_t *p = (uint8_t *)dst;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
}
|
||||
|
||||
static BLAKE2_INLINE uint32_t rotr32(const uint32_t w, const unsigned c) {
|
||||
return (w >> c) | (w << (32 - c));
|
||||
}
|
||||
|
||||
static BLAKE2_INLINE uint64_t rotr64(const uint64_t w, const unsigned c) {
|
||||
return (w >> c) | (w << (64 - c));
|
||||
}
|
||||
|
||||
/* prevents compiler optimizing out memset() */
|
||||
static BLAKE2_INLINE void burn(void *v, size_t n) {
|
||||
static void *(*const volatile memset_v)(void *, int, size_t) = &memset;
|
||||
memset_v(v, 0, n);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,76 +0,0 @@
|
||||
#ifndef PORTABLE_BLAKE2_H
|
||||
#define PORTABLE_BLAKE2_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <limits.h>
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
enum blake2b_constant {
|
||||
BLAKE2B_BLOCKBYTES = 128,
|
||||
BLAKE2B_OUTBYTES = 64,
|
||||
BLAKE2B_KEYBYTES = 64,
|
||||
BLAKE2B_SALTBYTES = 16,
|
||||
BLAKE2B_PERSONALBYTES = 16
|
||||
};
|
||||
|
||||
#pragma pack(push, 1)
|
||||
typedef struct __blake2b_param {
|
||||
uint8_t digest_length; /* 1 */
|
||||
uint8_t key_length; /* 2 */
|
||||
uint8_t fanout; /* 3 */
|
||||
uint8_t depth; /* 4 */
|
||||
uint32_t leaf_length; /* 8 */
|
||||
uint64_t node_offset; /* 16 */
|
||||
uint8_t node_depth; /* 17 */
|
||||
uint8_t inner_length; /* 18 */
|
||||
uint8_t reserved[14]; /* 32 */
|
||||
uint8_t salt[BLAKE2B_SALTBYTES]; /* 48 */
|
||||
uint8_t personal[BLAKE2B_PERSONALBYTES]; /* 64 */
|
||||
} blake2b_param;
|
||||
#pragma pack(pop)
|
||||
|
||||
typedef struct __blake2b_state {
|
||||
uint64_t h[8];
|
||||
uint64_t t[2];
|
||||
uint64_t f[2];
|
||||
unsigned buflen;
|
||||
unsigned outlen;
|
||||
uint8_t last_node;
|
||||
uint8_t buf[BLAKE2B_BLOCKBYTES];
|
||||
} blake2b_state;
|
||||
|
||||
/* Ensure param structs have not been wrongly padded */
|
||||
/* Poor man's static_assert */
|
||||
enum {
|
||||
blake2_size_check_0 = 1 / !!(CHAR_BIT == 8),
|
||||
blake2_size_check_2 =
|
||||
1 / !!(sizeof(blake2b_param) == sizeof(uint64_t) * CHAR_BIT)
|
||||
};
|
||||
|
||||
/* Streaming API */
|
||||
int ar2_blake2b_init(blake2b_state *S, size_t outlen);
|
||||
int ar2_blake2b_init_key(blake2b_state *S, size_t outlen, const void *key,
|
||||
size_t keylen);
|
||||
int ar2_blake2b_init_param(blake2b_state *S, const blake2b_param *P);
|
||||
int ar2_blake2b_update(blake2b_state *S, const void *in, size_t inlen);
|
||||
void my_blake2b_update(blake2b_state *S, const void *in, size_t inlen);
|
||||
int ar2_blake2b_final(blake2b_state *S, void *out, size_t outlen);
|
||||
|
||||
/* Simple API */
|
||||
int ar2_blake2b(void *out, const void *in, const void *key, size_t keylen);
|
||||
|
||||
/* Argon2 Team - Begin Code */
|
||||
int ar2_blake2b_long(void *out, const void *in);
|
||||
/* Argon2 Team - End Code */
|
||||
/* Miouyouyou */
|
||||
void ar2_blake2b_too(void *out, const void *in);
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -1,162 +0,0 @@
|
||||
#ifndef BLAKE_ROUND_MKA_OPT_H
|
||||
#define BLAKE_ROUND_MKA_OPT_H
|
||||
|
||||
#include "blake2-impl.h"
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
#include <immintrin.h>
|
||||
#if defined(__XOP__) && (defined(__GNUC__) || defined(__clang__))
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(__XOP__)
|
||||
#if defined(__SSSE3__)
|
||||
#define r16 \
|
||||
(_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
|
||||
#define r24 \
|
||||
(_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
|
||||
#define _mm_roti_epi64(x, c) \
|
||||
(-(c) == 32) \
|
||||
? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
|
||||
: (-(c) == 24) \
|
||||
? _mm_shuffle_epi8((x), r24) \
|
||||
: (-(c) == 16) \
|
||||
? _mm_shuffle_epi8((x), r16) \
|
||||
: (-(c) == 63) \
|
||||
? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
|
||||
_mm_add_epi64((x), (x))) \
|
||||
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
|
||||
_mm_slli_epi64((x), 64 - (-(c))))
|
||||
#else /* defined(__SSE2__) */
|
||||
#define _mm_roti_epi64(r, c) \
|
||||
_mm_xor_si128(_mm_srli_epi64((r), -(c)), _mm_slli_epi64((r), 64 - (-(c))))
|
||||
#endif
|
||||
#else
|
||||
#endif
|
||||
|
||||
static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
|
||||
const __m128i z = _mm_mul_epu32(x, y);
|
||||
return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z));
|
||||
}
|
||||
|
||||
#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
do { \
|
||||
A0 = fBlaMka(A0, B0); \
|
||||
A1 = fBlaMka(A1, B1); \
|
||||
\
|
||||
D0 = _mm_xor_si128(D0, A0); \
|
||||
D1 = _mm_xor_si128(D1, A1); \
|
||||
\
|
||||
D0 = _mm_roti_epi64(D0, -32); \
|
||||
D1 = _mm_roti_epi64(D1, -32); \
|
||||
\
|
||||
C0 = fBlaMka(C0, D0); \
|
||||
C1 = fBlaMka(C1, D1); \
|
||||
\
|
||||
B0 = _mm_xor_si128(B0, C0); \
|
||||
B1 = _mm_xor_si128(B1, C1); \
|
||||
\
|
||||
B0 = _mm_roti_epi64(B0, -24); \
|
||||
B1 = _mm_roti_epi64(B1, -24); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
do { \
|
||||
A0 = fBlaMka(A0, B0); \
|
||||
A1 = fBlaMka(A1, B1); \
|
||||
\
|
||||
D0 = _mm_xor_si128(D0, A0); \
|
||||
D1 = _mm_xor_si128(D1, A1); \
|
||||
\
|
||||
D0 = _mm_roti_epi64(D0, -16); \
|
||||
D1 = _mm_roti_epi64(D1, -16); \
|
||||
\
|
||||
C0 = fBlaMka(C0, D0); \
|
||||
C1 = fBlaMka(C1, D1); \
|
||||
\
|
||||
B0 = _mm_xor_si128(B0, C0); \
|
||||
B1 = _mm_xor_si128(B1, C1); \
|
||||
\
|
||||
B0 = _mm_roti_epi64(B0, -63); \
|
||||
B1 = _mm_roti_epi64(B1, -63); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
do { \
|
||||
__m128i t0 = _mm_alignr_epi8(B1, B0, 8); \
|
||||
__m128i t1 = _mm_alignr_epi8(B0, B1, 8); \
|
||||
B0 = t0; \
|
||||
B1 = t1; \
|
||||
\
|
||||
t0 = C0; \
|
||||
C0 = C1; \
|
||||
C1 = t0; \
|
||||
\
|
||||
t0 = _mm_alignr_epi8(D1, D0, 8); \
|
||||
t1 = _mm_alignr_epi8(D0, D1, 8); \
|
||||
D0 = t1; \
|
||||
D1 = t0; \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
do { \
|
||||
__m128i t0 = _mm_alignr_epi8(B0, B1, 8); \
|
||||
__m128i t1 = _mm_alignr_epi8(B1, B0, 8); \
|
||||
B0 = t0; \
|
||||
B1 = t1; \
|
||||
\
|
||||
t0 = C0; \
|
||||
C0 = C1; \
|
||||
C1 = t0; \
|
||||
\
|
||||
t0 = _mm_alignr_epi8(D0, D1, 8); \
|
||||
t1 = _mm_alignr_epi8(D1, D0, 8); \
|
||||
D0 = t1; \
|
||||
D1 = t0; \
|
||||
} while ((void)0, 0)
|
||||
#else /* SSE2 */
|
||||
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
do { \
|
||||
__m128i t0 = D0; \
|
||||
__m128i t1 = B0; \
|
||||
D0 = C0; \
|
||||
C0 = C1; \
|
||||
C1 = D0; \
|
||||
D0 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t0, t0)); \
|
||||
D1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(D1, D1)); \
|
||||
B0 = _mm_unpackhi_epi64(B0, _mm_unpacklo_epi64(B1, B1)); \
|
||||
B1 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(t1, t1)); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
do { \
|
||||
__m128i t0 = C0; \
|
||||
C0 = C1; \
|
||||
C1 = t0; \
|
||||
t0 = B0; \
|
||||
__m128i t1 = D0; \
|
||||
B0 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(B0, B0)); \
|
||||
B1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(B1, B1)); \
|
||||
D0 = _mm_unpackhi_epi64(D0, _mm_unpacklo_epi64(D1, D1)); \
|
||||
D1 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t1, t1)); \
|
||||
} while ((void)0, 0)
|
||||
#endif
|
||||
|
||||
#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||
do { \
|
||||
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||
\
|
||||
DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||
\
|
||||
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||
\
|
||||
UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#endif
|
||||
@@ -1,39 +0,0 @@
|
||||
#ifndef BLAKE_ROUND_MKA_H
|
||||
#define BLAKE_ROUND_MKA_H
|
||||
|
||||
#include "blake2.h"
|
||||
#include "blake2-impl.h"
|
||||
|
||||
/*designed by the Lyra PHC team */
|
||||
static BLAKE2_INLINE uint64_t fBlaMka(uint64_t x, uint64_t y) {
|
||||
const uint64_t m = UINT64_C(0xFFFFFFFF);
|
||||
const uint64_t xy = (x & m) * (y & m);
|
||||
return x + y + 2 * xy;
|
||||
}
|
||||
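/* In other words, fBlaMka(x, y) computes x + y + 2 * lo32(x) * lo32(y) modulo 2^64, where lo32 takes the
   low 32 bits of its argument; this multiplication-hardened addition replaces the plain additions of the
   standard BLAKE2b G function in the G macro and rounds defined below. (Explanatory note, not original code.) */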
|
||||
#define G(a, b, c, d) \
|
||||
do { \
|
||||
a = fBlaMka(a, b); \
|
||||
d = rotr64(d ^ a, 32); \
|
||||
c = fBlaMka(c, d); \
|
||||
b = rotr64(b ^ c, 24); \
|
||||
a = fBlaMka(a, b); \
|
||||
d = rotr64(d ^ a, 16); \
|
||||
c = fBlaMka(c, d); \
|
||||
b = rotr64(b ^ c, 63); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#define BLAKE2_ROUND_NOMSG(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, \
|
||||
v12, v13, v14, v15) \
|
||||
do { \
|
||||
G(v0, v4, v8, v12); \
|
||||
G(v1, v5, v9, v13); \
|
||||
G(v2, v6, v10, v14); \
|
||||
G(v3, v7, v11, v15); \
|
||||
G(v0, v5, v10, v15); \
|
||||
G(v1, v6, v11, v12); \
|
||||
G(v2, v7, v8, v13); \
|
||||
G(v3, v4, v9, v14); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#endif
|
||||
@@ -1,316 +0,0 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
#include "blake2/blake2.h"
|
||||
#include "blake2/blake2-impl.h"
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// i know there is a trick but nvm :p
|
||||
#define PRIu64 "%llu"
|
||||
#define PRIx64 "%llx"
|
||||
#endif
|
||||
|
||||
static const uint64_t blake2b_IV[8] = {
|
||||
UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b),
|
||||
UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1),
|
||||
UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f),
|
||||
UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179)
|
||||
};
|
||||
|
||||
static const unsigned int blake2b_sigma[12][16] = {
|
||||
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
|
||||
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
|
||||
{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
|
||||
{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
|
||||
{9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
|
||||
{2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
|
||||
{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
|
||||
{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
|
||||
{6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
|
||||
{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
|
||||
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
|
||||
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
|
||||
};
|
||||
|
||||
static BLAKE2_INLINE void blake2b_set_lastnode(blake2b_state *S) {
|
||||
S->f[1] = (uint64_t)-1;
|
||||
}
|
||||
|
||||
static BLAKE2_INLINE void blake2b_set_lastblock(blake2b_state *S) {
|
||||
if (S->last_node) {
|
||||
blake2b_set_lastnode(S);
|
||||
}
|
||||
S->f[0] = (uint64_t)-1;
|
||||
}
|
||||
|
||||
static BLAKE2_INLINE void blake2b_increment_counter(blake2b_state *S, uint64_t inc) {
|
||||
S->t[0] += inc;
|
||||
S->t[1] += (S->t[0] < inc);
|
||||
}
|
||||
|
||||
static BLAKE2_INLINE void blake2b_invalidate_state(blake2b_state *S) {
|
||||
burn(S, sizeof(*S)); /* wipe */
|
||||
blake2b_set_lastblock(S); /* invalidate for further use */
|
||||
}
|
||||
|
||||
static BLAKE2_INLINE void blake2b_init0(blake2b_state *S) {
|
||||
memset(S, 0, sizeof(*S));
|
||||
memcpy(S->h, blake2b_IV, sizeof(S->h));
|
||||
}
|
||||
|
||||
/*
|
||||
void print_state(blake2b_state BlakeHash)
|
||||
{
|
||||
printf(".h = {UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 "),\n"
|
||||
"UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 "),\n"
|
||||
"UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 "),\n"
|
||||
"UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 ")},\n"
|
||||
".t = {UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 ")},\n"
|
||||
".f = {UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 ")}\n",
|
||||
BlakeHash.h[0], BlakeHash.h[1], BlakeHash.h[2], BlakeHash.h[3],
|
||||
BlakeHash.h[4], BlakeHash.h[5], BlakeHash.h[6], BlakeHash.h[7],
|
||||
BlakeHash.t[0], BlakeHash.t[1],
|
||||
BlakeHash.f[0], BlakeHash.f[1]);
|
||||
printf(".buf = {");
|
||||
for (register uint8_t i = 0; i < BLAKE2B_BLOCKBYTES; i++)
|
||||
printf("%" PRIu8 ", ", BlakeHash.buf[i]);
|
||||
puts("\n");
|
||||
printf("}\n.buflen = %d\n.outlen = %d\n",
|
||||
BlakeHash.buflen, BlakeHash.outlen);
|
||||
printf(".last_node = %" PRIu8 "\n", BlakeHash.last_node);
|
||||
fflush(stdout);
|
||||
}
|
||||
*/
|
||||
|
||||
static const blake2b_state miou = {
|
||||
.h = {
|
||||
UINT64_C(7640891576939301128), UINT64_C(13503953896175478587),
|
||||
UINT64_C(4354685564936845355), UINT64_C(11912009170470909681),
|
||||
UINT64_C(5840696475078001361), UINT64_C(11170449401992604703),
|
||||
UINT64_C(2270897969802886507), UINT64_C(6620516959819538809)
|
||||
},
|
||||
.t = {UINT64_C(0), UINT64_C(0)},
|
||||
.f = {UINT64_C(0), UINT64_C(0)},
|
||||
.buf = {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
},
|
||||
.buflen = 0,
|
||||
.outlen = 64,
|
||||
.last_node = 0
|
||||
};
|
||||
|
||||
|
||||
int ar2_blake2b_init_param(blake2b_state *S, const blake2b_param *P)
|
||||
{
|
||||
const unsigned char *p = (const unsigned char *)P;
|
||||
unsigned int i;
|
||||
|
||||
if (NULL == P || NULL == S) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
blake2b_init0(S);
|
||||
/* IV XOR Parameter Block */
|
||||
for (i = 0; i < 8; ++i) {
|
||||
S->h[i] ^= load64(&p[i * sizeof(S->h[i])]);
|
||||
}
|
||||
S->outlen = P->digest_length;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void compare_buffs(uint64_t *h, size_t outlen)
|
||||
{
|
||||
// printf("CMP : %d", memcmp(h, miou.h, 8*(sizeof(uint64_t))));
|
||||
printf("miou : %" PRIu64 " - h : %" PRIu64 " - outlen : %ld\n", miou.h[0], h[0], outlen);
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
/* Sequential blake2b initialization */
|
||||
int ar2_blake2b_init(blake2b_state *S, size_t outlen)
|
||||
{
|
||||
memcpy(S, &miou, sizeof(*S));
|
||||
S->h[0] += outlen;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void print64(const char *name, const uint64_t *array, uint16_t size)
|
||||
{
|
||||
printf("%s = {", name);
|
||||
for (uint8_t i = 0; i < size; i++) printf("UINT64_C(%" PRIu64 "), ", array[i]);
|
||||
printf("};\n");
|
||||
}
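
/* Keyed initialisation is not needed by this Argon2 variant; the stub below is
   apparently kept only to preserve the original BLAKE2 interface. */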
|
||||
|
||||
int ar2_blake2b_init_key(blake2b_state *S, size_t outlen, const void *key, size_t keylen)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void blake2b_compress(blake2b_state *S, const uint8_t *block)
|
||||
{
|
||||
uint64_t m[16];
|
||||
uint64_t v[16];
|
||||
unsigned int i, r;
|
||||
|
||||
for (i = 0; i < 16; ++i) {
|
||||
m[i] = load64(block + i * 8);
|
||||
}
|
||||
|
||||
for (i = 0; i < 8; ++i) {
|
||||
v[i] = S->h[i];
|
||||
}
|
||||
|
||||
v[8] = blake2b_IV[0];
|
||||
v[9] = blake2b_IV[1];
|
||||
v[10] = blake2b_IV[2];
|
||||
v[11] = blake2b_IV[3];
|
||||
v[12] = blake2b_IV[4] ^ S->t[0];
|
||||
v[13] = blake2b_IV[5]/* ^ S->t[1]*/;
|
||||
v[14] = blake2b_IV[6] ^ S->f[0];
|
||||
v[15] = blake2b_IV[7]/* ^ S->f[1]*/;
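    /* In this specialised variant t[1] and f[1] always stay zero, so their
       XORs have been commented out. */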
|
||||
|
||||
#define G(r, i, a, b, c, d) \
|
||||
do { \
|
||||
a = a + b + m[blake2b_sigma[r][2 * i + 0]]; \
|
||||
d = rotr64(d ^ a, 32); \
|
||||
c = c + d; \
|
||||
b = rotr64(b ^ c, 24); \
|
||||
a = a + b + m[blake2b_sigma[r][2 * i + 1]]; \
|
||||
d = rotr64(d ^ a, 16); \
|
||||
c = c + d; \
|
||||
b = rotr64(b ^ c, 63); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#define ROUND(r) \
|
||||
do { \
|
||||
G(r, 0, v[0], v[4], v[8], v[12]); \
|
||||
G(r, 1, v[1], v[5], v[9], v[13]); \
|
||||
G(r, 2, v[2], v[6], v[10], v[14]); \
|
||||
G(r, 3, v[3], v[7], v[11], v[15]); \
|
||||
G(r, 4, v[0], v[5], v[10], v[15]); \
|
||||
G(r, 5, v[1], v[6], v[11], v[12]); \
|
||||
G(r, 6, v[2], v[7], v[8], v[13]); \
|
||||
G(r, 7, v[3], v[4], v[9], v[14]); \
|
||||
} while ((void)0, 0)
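
/* Each ROUND mixes the 16-word state column-wise (G calls 0-3) and then
   diagonal-wise (G calls 4-7), as in the BLAKE2b specification. */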
|
||||
|
||||
for (r = 0; r < 12; ++r) ROUND(r);
|
||||
|
||||
for (i = 0; i < 8; ++i) S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
|
||||
|
||||
#undef G
|
||||
#undef ROUND
|
||||
}
|
||||
|
||||
/* Specialised update used only by ar2_blake2b_long(): inlen is ignored and the
   input is assumed to be the 1024-byte final block, with 4 bytes (the output
   length prefix) already sitting in the buffer. */
int ar2_blake2b_update(blake2b_state *S, const void *in, size_t inlen)
{
    const uint8_t *pin = (const uint8_t *)in;
    /* Complete current block */
    memcpy(&S->buf[4], pin, 124);
    blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES);
    blake2b_compress(S, S->buf);
    S->buflen = 0;
    pin += 124;

    register int8_t i = 7;
    /* Avoid buffer copies when possible */
    while (i--) {
        blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES);
        blake2b_compress(S, pin);
        pin += BLAKE2B_BLOCKBYTES;
    }
    memcpy(&S->buf[S->buflen], pin, 4);
    S->buflen += 4;
    return 0;
}
|
||||
|
||||
void my_blake2b_update(blake2b_state *S, const void *in, size_t inlen)
|
||||
{
|
||||
memcpy(&S->buf[S->buflen], in, inlen);
|
||||
S->buflen += (unsigned int)inlen;
|
||||
}
|
||||
|
||||
int ar2_blake2b_final(blake2b_state *S, void *out, size_t outlen)
|
||||
{
|
||||
uint8_t buffer[BLAKE2B_OUTBYTES] = {0};
|
||||
unsigned int i;
|
||||
|
||||
blake2b_increment_counter(S, S->buflen);
|
||||
blake2b_set_lastblock(S);
|
||||
memset(&S->buf[S->buflen], 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */
|
||||
blake2b_compress(S, S->buf);
|
||||
|
||||
for (i = 0; i < 8; ++i) { /* Output full hash to temp buffer */
|
||||
store64(buffer + sizeof(S->h[i]) * i, S->h[i]);
|
||||
}
|
||||
|
||||
memcpy(out, buffer, S->outlen);
|
||||
|
||||
burn(buffer, sizeof(buffer));
|
||||
burn(S->buf, sizeof(S->buf));
|
||||
burn(S->h, sizeof(S->h));
|
||||
return 0;
|
||||
}
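
/* Unkeyed 64-byte -> 64-byte BLAKE2b used by ar2_blake2b_too(); the key and
   keylen parameters are ignored. */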
|
||||
|
||||
int ar2_blake2b(void *out, const void *in, const void *key, size_t keylen)
|
||||
{
|
||||
blake2b_state S;
|
||||
|
||||
ar2_blake2b_init(&S, 64);
|
||||
my_blake2b_update(&S, in, 64);
|
||||
ar2_blake2b_final(&S, out, 64);
|
||||
burn(&S, sizeof(S));
|
||||
return 0;
|
||||
}
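
/* Variable-length hash H' from the Argon2 spec, specialised to a 72-byte input
   and a 1024-byte output: the 4-byte length prefix (1024, hence buf[1] = 4) is
   followed by the input, then 32 bytes are taken from each of the first 30
   chained BLAKE2b calls and all 64 bytes from the last one. */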
|
||||
|
||||
void ar2_blake2b_too(void *pout, const void *in)
|
||||
{
|
||||
uint8_t *out = (uint8_t *)pout;
|
||||
uint8_t out_buffer[64];
|
||||
uint8_t in_buffer[64];
|
||||
|
||||
blake2b_state blake_state;
|
||||
ar2_blake2b_init(&blake_state, 64);
|
||||
blake_state.buflen = blake_state.buf[1] = 4;
|
||||
my_blake2b_update(&blake_state, in, 72);
|
||||
ar2_blake2b_final(&blake_state, out_buffer, 64);
|
||||
memcpy(out, out_buffer, 32);
|
||||
out += 32;
|
||||
|
||||
register uint8_t i = 29;
|
||||
while (i--) {
|
||||
memcpy(in_buffer, out_buffer, 64);
|
||||
ar2_blake2b(out_buffer, in_buffer, NULL, 0);
|
||||
memcpy(out, out_buffer, 32);
|
||||
out += 32;
|
||||
}
|
||||
|
||||
memcpy(in_buffer, out_buffer, 64);
|
||||
ar2_blake2b(out_buffer, in_buffer, NULL, 0);
|
||||
memcpy(out, out_buffer, 64);
|
||||
|
||||
burn(&blake_state, sizeof(blake_state));
|
||||
}
|
||||
|
||||
/* Argon2 Team - Begin Code */
|
||||
int ar2_blake2b_long(void *pout, const void *in)
|
||||
{
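    /* H' with a 32-byte tag: hash the little-endian output length followed by
       the 1024-byte final block. */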
|
||||
uint8_t *out = (uint8_t *)pout;
|
||||
blake2b_state blake_state;
|
||||
uint8_t outlen_bytes[sizeof(uint32_t)] = {0};
|
||||
|
||||
store32(outlen_bytes, 32);
|
||||
|
||||
ar2_blake2b_init(&blake_state, 32);
|
||||
my_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes));
|
||||
ar2_blake2b_update(&blake_state, in, 1024);
|
||||
ar2_blake2b_final(&blake_state, out, 32);
|
||||
burn(&blake_state, sizeof(blake_state));
|
||||
return 0;
|
||||
}
|
||||
/* Argon2 Team - End Code */
|
||||
@@ -1,349 +0,0 @@
|
||||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
/*For memory wiping*/
|
||||
#ifdef _MSC_VER
|
||||
#include <windows.h>
|
||||
#include <winbase.h> /* For SecureZeroMemory */
|
||||
#endif
|
||||
#if defined __STDC_LIB_EXT1__
|
||||
#define __STDC_WANT_LIB_EXT1__ 1
|
||||
#endif
|
||||
#define VC_GE_2005(version) (version >= 1400)
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "argon2.h"
|
||||
#include "cores.h"
|
||||
#include "blake2/blake2.h"
|
||||
#include "blake2/blake2-impl.h"
|
||||
|
||||
#ifdef GENKAT
|
||||
#include "genkat.h"
|
||||
#endif
|
||||
|
||||
#if defined(__clang__)
|
||||
#if __has_attribute(optnone)
|
||||
#define NOT_OPTIMIZED __attribute__((optnone))
|
||||
#endif
|
||||
#elif defined(__GNUC__)
|
||||
#define GCC_VERSION \
|
||||
(__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
|
||||
#if GCC_VERSION >= 40400
|
||||
#define NOT_OPTIMIZED __attribute__((optimize("O0")))
|
||||
#endif
|
||||
#endif
|
||||
#ifndef NOT_OPTIMIZED
|
||||
#define NOT_OPTIMIZED
|
||||
#endif
|
||||
|
||||
/***************Instance and Position constructors**********/
|
||||
void ar2_init_block_value(block *b, uint8_t in) { memset(b->v, in, sizeof(b->v)); }
|
||||
//inline void init_block_value(block *b, uint8_t in) { memset(b->v, in, sizeof(b->v)); }
|
||||
|
||||
void ar2_copy_block(block *dst, const block *src) {
|
||||
//inline void copy_block(block *dst, const block *src) {
|
||||
memcpy(dst->v, src->v, sizeof(uint64_t) * ARGON2_WORDS_IN_BLOCK);
|
||||
}
|
||||
|
||||
void ar2_xor_block(block *dst, const block *src) {
|
||||
//inline void xor_block(block *dst, const block *src) {
|
||||
int i;
|
||||
for (i = 0; i < ARGON2_WORDS_IN_BLOCK; ++i) {
|
||||
dst->v[i] ^= src->v[i];
|
||||
}
|
||||
}
|
||||
|
||||
static void ar2_load_block(block *dst, const void *input) {
|
||||
//static inline void load_block(block *dst, const void *input) {
|
||||
unsigned i;
|
||||
for (i = 0; i < ARGON2_WORDS_IN_BLOCK; ++i) {
|
||||
dst->v[i] = load64((const uint8_t *)input + i * sizeof(dst->v[i]));
|
||||
}
|
||||
}
|
||||
|
||||
static void ar2_store_block(void *output, const block *src) {
|
||||
//static inline void store_block(void *output, const block *src) {
|
||||
unsigned i;
|
||||
for (i = 0; i < ARGON2_WORDS_IN_BLOCK; ++i) {
|
||||
store64((uint8_t *)output + i * sizeof(src->v[i]), src->v[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/***************Memory allocators*****************/
|
||||
int ar2_allocate_memory(block **memory, uint32_t m_cost) {
|
||||
if (memory != NULL) {
|
||||
size_t memory_size = sizeof(block) * m_cost;
|
||||
if (m_cost != 0 &&
|
||||
memory_size / m_cost !=
|
||||
sizeof(block)) { /*1. Check for multiplication overflow*/
|
||||
return ARGON2_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
|
||||
*memory = (block *)malloc(memory_size); /*2. Try to allocate*/
|
||||
|
||||
if (!*memory) {
|
||||
return ARGON2_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
|
||||
return ARGON2_OK;
|
||||
} else {
|
||||
return ARGON2_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
void ar2_secure_wipe_memory(void *v, size_t n) { memset(v, 0, n); }
|
||||
//inline void secure_wipe_memory(void *v, size_t n) { memset(v, 0, n); }
|
||||
|
||||
/*********Memory functions*/
|
||||
|
||||
void ar2_clear_memory(argon2_instance_t *instance, int clear) {
|
||||
//inline void clear_memory(argon2_instance_t *instance, int clear) {
|
||||
if (instance->memory != NULL && clear) {
|
||||
ar2_secure_wipe_memory(instance->memory,
|
||||
sizeof(block) * /*instance->memory_blocks*/16);
|
||||
}
|
||||
}
|
||||
|
||||
void ar2_free_memory(block *memory) { free(memory); }
|
||||
//inline void free_memory(block *memory) { free(memory); }
|
||||
|
||||
void ar2_finalize(const argon2_context *context, argon2_instance_t *instance) {
|
||||
if (context != NULL && instance != NULL) {
|
||||
block blockhash;
|
||||
ar2_copy_block(&blockhash, instance->memory + 15);
|
||||
|
||||
/* Hash the result */
|
||||
{
|
||||
uint8_t blockhash_bytes[ARGON2_BLOCK_SIZE];
|
||||
ar2_store_block(blockhash_bytes, &blockhash);
|
||||
ar2_blake2b_long(context->out, blockhash_bytes);
|
||||
ar2_secure_wipe_memory(blockhash.v, ARGON2_BLOCK_SIZE);
|
||||
ar2_secure_wipe_memory(blockhash_bytes, ARGON2_BLOCK_SIZE); /* clear blockhash_bytes */
|
||||
}
|
||||
|
||||
#ifdef GENKAT
|
||||
print_tag(context->out, context->outlen);
|
||||
#endif
|
||||
|
||||
/* Clear memory */
|
||||
// clear_memory(instance, 1);
|
||||
|
||||
ar2_free_memory(instance->memory);
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t ar2_index_alpha(const argon2_instance_t *instance,
|
||||
const argon2_position_t *position, uint32_t pseudo_rand,
|
||||
int same_lane) {
|
||||
/*
|
||||
* Pass 0:
|
||||
* This lane : all already finished segments plus already constructed
|
||||
* blocks in this segment
|
||||
* Other lanes : all already finished segments
|
||||
* Pass 1+:
|
||||
* This lane : (SYNC_POINTS - 1) last segments plus already constructed
|
||||
* blocks in this segment
|
||||
* Other lanes : (SYNC_POINTS - 1) last segments
|
||||
*/
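    /* In this stripped-down instance the lane is 16 blocks long and a segment
       is 4 blocks, which is why the generic formulas below use the constants
       4, 11, 12 and 16. */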
|
||||
uint32_t reference_area_size;
|
||||
uint64_t relative_position;
|
||||
uint32_t start_position, absolute_position;
|
||||
|
||||
if (0 == position->pass) {
|
||||
/* First pass */
|
||||
if (0 == position->slice) {
|
||||
/* First slice */
|
||||
reference_area_size =
|
||||
position->index - 1; /* all but the previous */
|
||||
} else {
|
||||
if (same_lane) {
|
||||
/* The same lane => add current segment */
|
||||
reference_area_size =
|
||||
position->slice * 4 +
|
||||
position->index - 1;
|
||||
} else {
|
||||
reference_area_size =
|
||||
position->slice * 4 +
|
||||
((position->index == 0) ? (-1) : 0);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* Second pass */
|
||||
if (same_lane) {reference_area_size = 11 + position->index;}
|
||||
else {reference_area_size = 12 - (position->index == 0);}
|
||||
}
|
||||
|
||||
    /* 1.2.4. Mapping pseudo_rand to 0..<reference_area_size-1> and producing
     * the relative position; squaring the 32-bit value biases the choice
     * towards more recently written blocks. */
|
||||
relative_position = pseudo_rand;
|
||||
relative_position = relative_position * relative_position >> 32;
|
||||
relative_position = reference_area_size - 1 -
|
||||
(reference_area_size * relative_position >> 32);
|
||||
|
||||
/* 1.2.5 Computing starting position */
|
||||
start_position = 0;
|
||||
|
||||
if (0 != position->pass) {
|
||||
start_position = (position->slice == ARGON2_SYNC_POINTS - 1)
|
||||
? 0 : (position->slice + 1) * 4;
|
||||
}
|
||||
|
||||
/* 1.2.6. Computing absolute position */
|
||||
absolute_position = (start_position + relative_position) % 16;
|
||||
return absolute_position;
|
||||
}
|
||||
|
||||
void ar2_fill_memory_blocks(argon2_instance_t *instance) {
|
||||
uint32_t r, s;
|
||||
|
||||
for (r = 0; r < 2; ++r) {
|
||||
for (s = 0; s < ARGON2_SYNC_POINTS; ++s) {
|
||||
|
||||
argon2_position_t position;
|
||||
position.pass = r;
|
||||
position.lane = 0;
|
||||
position.slice = (uint8_t)s;
|
||||
position.index = 0;
|
||||
ar2_fill_segment(instance, position);
|
||||
}
|
||||
|
||||
#ifdef GENKAT
|
||||
internal_kat(instance, r); /* Print all memory blocks */
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
void ar2_fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance) {
|
||||
/* Make the first and second block in each lane as G(H0||i||0) or
|
||||
G(H0||i||1) */
|
||||
uint8_t blockhash_bytes[ARGON2_BLOCK_SIZE];
|
||||
store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, 0);
|
||||
store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH + 4, 0);
|
||||
ar2_blake2b_too(blockhash_bytes, blockhash);
|
||||
ar2_load_block(&instance->memory[0], blockhash_bytes);
|
||||
|
||||
store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, 1);
|
||||
ar2_blake2b_too(blockhash_bytes, blockhash);
|
||||
ar2_load_block(&instance->memory[1], blockhash_bytes);
|
||||
ar2_secure_wipe_memory(blockhash_bytes, ARGON2_BLOCK_SIZE);
|
||||
}
|
||||
|
||||
|
||||
static const blake2b_state base_hash = {
|
||||
.h = {
|
||||
UINT64_C(7640891576939301192), UINT64_C(13503953896175478587),
|
||||
UINT64_C(4354685564936845355), UINT64_C(11912009170470909681),
|
||||
UINT64_C(5840696475078001361), UINT64_C(11170449401992604703),
|
||||
UINT64_C(2270897969802886507), UINT64_C(6620516959819538809)
|
||||
},
|
||||
.t = {UINT64_C(0),UINT64_C(0)},
|
||||
.f = {UINT64_C(0),UINT64_C(0)},
|
||||
.buf = {
|
||||
1, 0, 0, 0, 32, 0, 0, 0, 16, 0, 0, 0, 2, 0, 0, 0, 16, 0, 0, 0, 1, 0,
|
||||
0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||
.buflen = 28,
|
||||
.outlen = 64,
|
||||
.last_node = 0
|
||||
};
|
||||
|
||||
#define PWDLEN 32
|
||||
#define SALTLEN 32
|
||||
#define SECRETLEN 0
|
||||
#define ADLEN 0
|
||||
void ar2_initial_hash(uint8_t *blockhash, argon2_context *context,
|
||||
argon2_type type) {
|
||||
|
||||
uint8_t value[sizeof(uint32_t)];
|
||||
|
||||
/* Is it generating cache invalidation between cores ? */
|
||||
blake2b_state BlakeHash = base_hash;
|
||||
BlakeHash.buf[20] = (uint8_t) type;
|
||||
my_blake2b_update(&BlakeHash, (const uint8_t *)context->pwd,
|
||||
PWDLEN);
|
||||
|
||||
|
||||
ar2_secure_wipe_memory(context->pwd, PWDLEN);
|
||||
context->pwdlen = 0;
|
||||
|
||||
store32(&value, SALTLEN);
|
||||
my_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
|
||||
|
||||
my_blake2b_update(&BlakeHash, (const uint8_t *)context->salt,
|
||||
SALTLEN);
|
||||
|
||||
store32(&value, SECRETLEN);
|
||||
my_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
|
||||
|
||||
store32(&value, ADLEN);
|
||||
my_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
|
||||
|
||||
ar2_blake2b_final(&BlakeHash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH);
|
||||
}
|
||||
|
||||
int ar2_initialize(argon2_instance_t *instance, argon2_context *context) {
|
||||
/* 1. Memory allocation */
|
||||
|
||||
|
||||
ar2_allocate_memory(&(instance->memory), 16);
|
||||
|
||||
/* 2. Initial hashing */
|
||||
/* H_0 + 8 extra bytes to produce the first blocks */
|
||||
/* Hashing all inputs */
|
||||
uint8_t blockhash[ARGON2_PREHASH_SEED_LENGTH];
|
||||
ar2_initial_hash(blockhash, context, instance->type);
|
||||
/* Zeroing 8 extra bytes */
|
||||
ar2_secure_wipe_memory(blockhash + ARGON2_PREHASH_DIGEST_LENGTH,
|
||||
ARGON2_PREHASH_SEED_LENGTH -
|
||||
ARGON2_PREHASH_DIGEST_LENGTH);
|
||||
|
||||
#ifdef GENKAT
|
||||
initial_kat(blockhash, context, instance->type);
|
||||
#endif
|
||||
|
||||
/* 3. Creating first blocks, we always have at least two blocks in a slice
|
||||
*/
|
||||
ar2_fill_first_blocks(blockhash, instance);
|
||||
/* Clearing the hash */
|
||||
ar2_secure_wipe_memory(blockhash, ARGON2_PREHASH_SEED_LENGTH);
|
||||
|
||||
return ARGON2_OK;
|
||||
}
|
||||
|
||||
int ar2_argon2_core(argon2_context *context, argon2_type type) {
|
||||
argon2_instance_t instance;
|
||||
instance.memory = NULL;
|
||||
instance.type = type;
|
||||
|
||||
/* 3. Initialization: Hashing inputs, allocating memory, filling first
|
||||
* blocks
|
||||
*/
|
||||
|
||||
int result = ar2_initialize(&instance, context);
|
||||
if (ARGON2_OK != result) return result;
|
||||
|
||||
/* 4. Filling memory */
|
||||
ar2_fill_memory_blocks(&instance);
|
||||
|
||||
/* 5. Finalization */
|
||||
ar2_finalize(context, &instance);
|
||||
|
||||
return ARGON2_OK;
|
||||
}
|
||||
@@ -1,216 +0,0 @@
|
||||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#ifndef ARGON2_CORES_H
|
||||
#define ARGON2_CORES_H
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#include <Windows.h>
|
||||
#include <process.h>
|
||||
#define ALIGN(n) __declspec(align(n))
|
||||
#elif defined(__GNUC__) || defined(__clang)
|
||||
#define ALIGN(x) __attribute__((__aligned__(x)))
|
||||
#else
|
||||
#define ALIGN(x)
|
||||
#endif
|
||||
|
||||
/*************************Argon2 internal
|
||||
* constants**************************************************/
|
||||
|
||||
enum argon2_core_constants {
|
||||
/* Version of the algorithm */
|
||||
ARGON2_VERSION_NUMBER = 0x10,
|
||||
|
||||
/* Memory block size in bytes */
|
||||
ARGON2_BLOCK_SIZE = 1024,
|
||||
ARGON2_WORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 8,
|
||||
ARGON2_QWORDS_IN_BLOCK = 64,
|
||||
|
||||
/* Number of pseudo-random values generated by one call to Blake in Argon2i
|
||||
to
|
||||
generate reference block positions */
|
||||
ARGON2_ADDRESSES_IN_BLOCK = 128,
|
||||
|
||||
/* Pre-hashing digest length and its extension*/
|
||||
ARGON2_PREHASH_DIGEST_LENGTH = 64,
|
||||
ARGON2_PREHASH_SEED_LENGTH = 72
|
||||
};
|
||||
|
||||
/* Argon2 primitive type */
|
||||
typedef enum Argon2_type { Argon2_d = 0, Argon2_i = 1 } argon2_type;
|
||||
|
||||
/*************************Argon2 internal data
|
||||
* types**************************************************/
|
||||
|
||||
/*
|
||||
* Structure for the (1KB) memory block implemented as 128 64-bit words.
|
||||
* Memory blocks can be copied, XORed. Internal words can be accessed by [] (no
|
||||
* bounds checking).
|
||||
*/
|
||||
typedef struct _block { uint64_t v[ARGON2_WORDS_IN_BLOCK]; } ALIGN(16) block;
|
||||
|
||||
/*****************Functions that work with the block******************/
|
||||
|
||||
/* Initialize each byte of the block with @in */
|
||||
void ar2_init_block_value(block *b, uint8_t in);
|
||||
|
||||
/* Copy block @src to block @dst */
|
||||
void ar2_copy_block(block *dst, const block *src);
|
||||
|
||||
/* XOR @src onto @dst bytewise */
|
||||
void ar2_xor_block(block *dst, const block *src);
|
||||
|
||||
/*
|
||||
* Argon2 instance: memory pointer, number of passes, amount of memory, type,
|
||||
* and derived values.
|
||||
* Used to evaluate the number and location of blocks to construct in each
|
||||
* thread
|
||||
*/
|
||||
typedef struct Argon2_instance_t {
|
||||
block *memory; /* Memory pointer */
|
||||
argon2_type type;
|
||||
int print_internals; /* whether to print the memory blocks */
|
||||
} argon2_instance_t;
|
||||
|
||||
/*
|
||||
* Argon2 position: where we construct the block right now. Used to distribute
|
||||
* work between threads.
|
||||
*/
|
||||
typedef struct Argon2_position_t {
|
||||
uint32_t pass;
|
||||
uint32_t lane;
|
||||
uint8_t slice;
|
||||
uint32_t index;
|
||||
} argon2_position_t;
|
||||
|
||||
/*************************Argon2 core
|
||||
* functions**************************************************/
|
||||
|
||||
/* Allocates memory to the given pointer
|
||||
* @param memory pointer to the pointer to the memory
|
||||
* @param m_cost number of blocks to allocate in the memory
|
||||
* @return ARGON2_OK if @memory is a valid pointer and memory is allocated
|
||||
*/
|
||||
int ar2_allocate_memory(block **memory, uint32_t m_cost);
|
||||
|
||||
/* Function that securely cleans the memory
|
||||
* @param mem Pointer to the memory
|
||||
* @param s Memory size in bytes
|
||||
*/
|
||||
void ar2_secure_wipe_memory(void *v, size_t n);
|
||||
|
||||
/* Clears memory
|
||||
* @param instance pointer to the current instance
|
||||
* @param clear_memory indicates if we clear the memory with zeros.
|
||||
*/
|
||||
void ar2_clear_memory(argon2_instance_t *instance, int clear);
|
||||
|
||||
/* Deallocates memory
|
||||
* @param memory pointer to the blocks
|
||||
*/
|
||||
void ar2_free_memory(block *memory);
|
||||
|
||||
/*
|
||||
* Computes absolute position of reference block in the lane following a skewed
|
||||
* distribution and using a pseudo-random value as input
|
||||
* @param instance Pointer to the current instance
|
||||
* @param position Pointer to the current position
|
||||
* @param pseudo_rand 32-bit pseudo-random value used to determine the position
|
||||
* @param same_lane Indicates if the block will be taken from the current lane.
|
||||
* If so we can reference the current segment
|
||||
* @pre All pointers must be valid
|
||||
*/
|
||||
uint32_t ar2_index_alpha(const argon2_instance_t *instance,
|
||||
const argon2_position_t *position, uint32_t pseudo_rand,
|
||||
int same_lane);
|
||||
|
||||
/*
|
||||
 * Function that validates all inputs against predefined restrictions and
 * returns an error code
 * @param context Pointer to current Argon2 context
 * @return ARGON2_OK if everything is all right, otherwise one of the error
 * codes (all defined in <argon2.h>)
|
||||
*/
|
||||
int ar2_validate_inputs(const argon2_context *context);
|
||||
|
||||
/*
|
||||
* Hashes all the inputs into @a blockhash[PREHASH_DIGEST_LENGTH], clears
|
||||
* password and secret if needed
|
||||
* @param context Pointer to the Argon2 internal structure containing memory
|
||||
* pointer, and parameters for time and space requirements.
|
||||
* @param blockhash Buffer for pre-hashing digest
|
||||
* @param type Argon2 type
|
||||
* @pre @a blockhash must have at least @a PREHASH_DIGEST_LENGTH bytes
|
||||
* allocated
|
||||
*/
|
||||
void ar2_initial_hash(uint8_t *blockhash, argon2_context *context,
|
||||
argon2_type type);
|
||||
|
||||
/*
|
||||
* Function creates first 2 blocks per lane
|
||||
* @param instance Pointer to the current instance
|
||||
* @param blockhash Pointer to the pre-hashing digest
|
||||
* @pre blockhash must point to @a PREHASH_SEED_LENGTH allocated values
|
||||
*/
|
||||
void ar2_fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance);
|
||||
|
||||
/*
|
||||
* Function allocates memory, hashes the inputs with Blake, and creates first
|
||||
* two blocks. Returns the pointer to the main memory with 2 blocks per lane
|
||||
* initialized
|
||||
* @param context Pointer to the Argon2 internal structure containing memory
|
||||
* pointer, and parameters for time and space requirements.
|
||||
* @param instance Current Argon2 instance
|
||||
* @return Zero if successful, -1 if memory failed to allocate. @context->state
|
||||
* will be modified if successful.
|
||||
*/
|
||||
int ar2_initialize(argon2_instance_t *instance, argon2_context *context);
|
||||
|
||||
/*
|
||||
* XORing the last block of each lane, hashing it, making the tag. Deallocates
|
||||
* the memory.
|
||||
* @param context Pointer to current Argon2 context (use only the out parameters
|
||||
* from it)
|
||||
* @param instance Pointer to current instance of Argon2
|
||||
* @pre instance->state must point to necessary amount of memory
|
||||
* @pre context->out must point to outlen bytes of memory
|
||||
* @pre if context->free_cbk is not NULL, it should point to a function that
|
||||
* deallocates memory
|
||||
*/
|
||||
void ar2_finalize(const argon2_context *context, argon2_instance_t *instance);
|
||||
|
||||
/*
|
||||
* Function that fills the segment using previous segments also from other
|
||||
* threads
|
||||
* @param instance Pointer to the current instance
|
||||
* @param position Current position
|
||||
* @pre all block pointers must be valid
|
||||
*/
|
||||
void ar2_fill_segment(const argon2_instance_t *instance,
|
||||
argon2_position_t position);
|
||||
|
||||
/*
|
||||
* Function that fills the entire memory t_cost times based on the first two
|
||||
* blocks in each lane
|
||||
* @param instance Pointer to the current instance
|
||||
*/
|
||||
void ar2_fill_memory_blocks(argon2_instance_t *instance);
|
||||
|
||||
/*
|
||||
 * Function that performs memory-hard hashing with a certain degree of parallelism
 * @param context Pointer to the Argon2 internal structure
 * @return Error code if something is wrong, ARGON2_OK otherwise
|
||||
*/
|
||||
int ar2_argon2_core(argon2_context *context, argon2_type type);
|
||||
|
||||
#endif
|
||||
@@ -1,186 +0,0 @@
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "argon2.h"
|
||||
#include "cores.h"
|
||||
|
||||
void initial_kat(const uint8_t *blockhash, const argon2_context *context,
|
||||
argon2_type type)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
if (blockhash != NULL && context != NULL) {
|
||||
printf("=======================================");
|
||||
|
||||
switch (type) {
|
||||
case Argon2_d:
|
||||
printf("Argon2d\n");
|
||||
break;
|
||||
|
||||
case Argon2_i:
|
||||
printf("Argon2i\n");
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
printf("Memory: %u KiB, Iterations: %u, Parallelism: %u lanes, Tag "
|
||||
"length: %u bytes\n",
|
||||
context->m_cost, context->t_cost, context->lanes,
|
||||
context->outlen);
|
||||
|
||||
printf("Password[%u]: ", context->pwdlen);
|
||||
|
||||
if (context->flags & ARGON2_FLAG_CLEAR_PASSWORD) {
|
||||
printf("CLEARED\n");
|
||||
} else {
|
||||
for (i = 0; i < context->pwdlen; ++i) {
|
||||
printf("%2.2x ", ((unsigned char *)context->pwd)[i]);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
printf("Salt[%u]: ", context->saltlen);
|
||||
|
||||
for (i = 0; i < context->saltlen; ++i) {
|
||||
printf("%2.2x ", ((unsigned char *)context->salt)[i]);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
|
||||
printf("Secret[%u]: ", context->secretlen);
|
||||
|
||||
if (context->flags & ARGON2_FLAG_CLEAR_SECRET) {
|
||||
printf("CLEARED\n");
|
||||
} else {
|
||||
for (i = 0; i < context->secretlen; ++i) {
|
||||
printf("%2.2x ", ((unsigned char *)context->secret)[i]);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
printf("Associated data[%u]: ", context->adlen);
|
||||
|
||||
for (i = 0; i < context->adlen; ++i) {
|
||||
printf("%2.2x ", ((unsigned char *)context->ad)[i]);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
|
||||
printf("Pre-hashing digest: ");
|
||||
|
||||
for (i = 0; i < ARGON2_PREHASH_DIGEST_LENGTH; ++i) {
|
||||
printf("%2.2x ", ((unsigned char *)blockhash)[i]);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
void print_tag(const void *out, uint32_t outlen)
|
||||
{
|
||||
unsigned i;
|
||||
if (out != NULL) {
|
||||
printf("Tag: ");
|
||||
|
||||
for (i = 0; i < outlen; ++i) {
|
||||
printf("%2.2x ", ((uint8_t *)out)[i]);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
void internal_kat(const argon2_instance_t *instance, uint32_t pass)
|
||||
{
|
||||
if (instance != NULL) {
|
||||
uint32_t i, j;
|
||||
printf("\n After pass %u:\n", pass);
|
||||
|
||||
for (i = 0; i < instance->memory_blocks; ++i) {
|
||||
uint32_t how_many_words =
|
||||
(instance->memory_blocks > ARGON2_WORDS_IN_BLOCK)
|
||||
? 1
|
||||
: ARGON2_WORDS_IN_BLOCK;
|
||||
|
||||
for (j = 0; j < how_many_words; ++j)
|
||||
printf("Block %.4u [%3u]: %016" PRIx64 "\n", i, j,
|
||||
instance->memory[i].v[j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void fatal(const char *error) {
|
||||
fprintf(stderr, "Error: %s\n", error);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
static void generate_testvectors(const char *type)
|
||||
{
|
||||
#define TEST_OUTLEN 32
|
||||
#define TEST_PWDLEN 32
|
||||
#define TEST_SALTLEN 16
|
||||
#define TEST_SECRETLEN 8
|
||||
#define TEST_ADLEN 12
|
||||
argon2_context context;
|
||||
|
||||
unsigned char out[TEST_OUTLEN];
|
||||
unsigned char pwd[TEST_PWDLEN];
|
||||
unsigned char salt[TEST_SALTLEN];
|
||||
unsigned char secret[TEST_SECRETLEN];
|
||||
unsigned char ad[TEST_ADLEN];
|
||||
const allocate_fptr myown_allocator = NULL;
|
||||
const deallocate_fptr myown_deallocator = NULL;
|
||||
|
||||
unsigned t_cost = 3;
|
||||
unsigned m_cost = 16;
|
||||
unsigned lanes = 4;
|
||||
|
||||
memset(pwd, 1, TEST_PWDLEN);
|
||||
memset(salt, 2, TEST_SALTLEN);
|
||||
memset(secret, 3, TEST_SECRETLEN);
|
||||
memset(ad, 4, TEST_ADLEN);
|
||||
|
||||
context.out = out;
|
||||
context.outlen = TEST_OUTLEN;
|
||||
context.pwd = pwd;
|
||||
context.pwdlen = TEST_PWDLEN;
|
||||
context.salt = salt;
|
||||
context.saltlen = TEST_SALTLEN;
|
||||
context.secret = secret;
|
||||
context.secretlen = TEST_SECRETLEN;
|
||||
context.ad = ad;
|
||||
context.adlen = TEST_ADLEN;
|
||||
context.t_cost = t_cost;
|
||||
context.m_cost = m_cost;
|
||||
context.lanes = lanes;
|
||||
context.threads = lanes;
|
||||
context.allocate_cbk = myown_allocator;
|
||||
context.free_cbk = myown_deallocator;
|
||||
context.flags = 0;
|
||||
|
||||
#undef TEST_OUTLEN
|
||||
#undef TEST_PWDLEN
|
||||
#undef TEST_SALTLEN
|
||||
#undef TEST_SECRETLEN
|
||||
#undef TEST_ADLEN
|
||||
|
||||
if (!strcmp(type, "d")) {
|
||||
argon2d(&context);
|
||||
} else if (!strcmp(type, "i")) {
|
||||
argon2i(&context);
|
||||
} else
|
||||
fatal("wrong Argon2 type");
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
const char *type = (argc > 1) ? argv[1] : "i";
|
||||
generate_testvectors(type);
|
||||
return ARGON2_OK;
|
||||
}
|
||||
@@ -1,45 +0,0 @@
|
||||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#ifndef ARGON2_KAT_H
|
||||
#define ARGON2_KAT_H
|
||||
|
||||
/*
|
||||
* Initial KAT function that prints the inputs to the file
|
||||
* @param blockhash Array that contains pre-hashing digest
|
||||
* @param context Holds inputs
|
||||
* @param type Argon2 type
|
||||
* @pre blockhash must point to INPUT_INITIAL_HASH_LENGTH bytes
|
||||
* @pre context member pointers must point to allocated memory of size according
|
||||
* to the length values
|
||||
*/
|
||||
void initial_kat(const uint8_t *blockhash, const argon2_context *context,
|
||||
argon2_type type);
|
||||
|
||||
/*
|
||||
* Function that prints the output tag
|
||||
* @param out output array pointer
|
||||
* @param outlen digest length
|
||||
* @pre out must point to @a outlen bytes
|
||||
**/
|
||||
void print_tag(const void *out, uint32_t outlen);
|
||||
|
||||
/*
|
||||
* Function that prints the internal state at given moment
|
||||
* @param instance pointer to the current instance
|
||||
* @param pass current pass number
|
||||
* @pre instance must have necessary memory allocated
|
||||
**/
|
||||
void internal_kat(const argon2_instance_t *instance, uint32_t pass);
|
||||
|
||||
#endif
|
||||
@@ -1,189 +0,0 @@
|
||||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
#if defined(__SSE2__)
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#include "argon2.h"
|
||||
#include "cores.h"
|
||||
#include "opt.h"
|
||||
|
||||
#include "blake2/blake2.h"
|
||||
#include "blake2/blamka-round-opt.h"
|
||||
|
||||
void ar2_fill_block(__m128i *state, __m128i const *ref_block, __m128i *next_block)
|
||||
{
|
||||
__m128i ALIGN(16) block_XY[ARGON2_QWORDS_IN_BLOCK];
|
||||
uint32_t i;
|
||||
for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; i++) {
|
||||
block_XY[i] = state[i] = _mm_xor_si128(
|
||||
state[i], _mm_load_si128(&ref_block[i]));
|
||||
}
|
||||
|
||||
BLAKE2_ROUND(state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7]);
|
||||
BLAKE2_ROUND(state[8], state[9], state[10], state[11], state[12], state[13], state[14], state[15]);
|
||||
BLAKE2_ROUND(state[16], state[17], state[18], state[19], state[20], state[21], state[22], state[23]);
|
||||
BLAKE2_ROUND(state[24], state[25], state[26], state[27], state[28], state[29], state[30], state[31]);
|
||||
BLAKE2_ROUND(state[32], state[33], state[34], state[35], state[36], state[37], state[38], state[39]);
|
||||
BLAKE2_ROUND(state[40], state[41], state[42], state[43], state[44], state[45], state[46], state[47]);
|
||||
BLAKE2_ROUND(state[48], state[49], state[50], state[51], state[52], state[53], state[54], state[55]);
|
||||
BLAKE2_ROUND(state[56], state[57], state[58], state[59], state[60], state[61], state[62], state[63]);
|
||||
/*for (i = 0; i < 8; ++i) {
|
||||
BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2],
|
||||
state[8 * i + 3], state[8 * i + 4], state[8 * i + 5],
|
||||
state[8 * i + 6], state[8 * i + 7]);
|
||||
}*/
|
||||
|
||||
BLAKE2_ROUND(state[0], state[8], state[16], state[24], state[32], state[40], state[48], state[56]);
|
||||
BLAKE2_ROUND(state[1], state[9], state[17], state[25], state[33], state[41], state[49], state[57]);
|
||||
BLAKE2_ROUND(state[2], state[10], state[18], state[26], state[34], state[42], state[50], state[58]);
|
||||
BLAKE2_ROUND(state[3], state[11], state[19], state[27], state[35], state[43], state[51], state[59]);
|
||||
BLAKE2_ROUND(state[4], state[12], state[20], state[28], state[36], state[44], state[52], state[60]);
|
||||
BLAKE2_ROUND(state[5], state[13], state[21], state[29], state[37], state[45], state[53], state[61]);
|
||||
BLAKE2_ROUND(state[6], state[14], state[22], state[30], state[38], state[46], state[54], state[62]);
|
||||
BLAKE2_ROUND(state[7], state[15], state[23], state[31], state[39], state[47], state[55], state[63]);
|
||||
/*for (i = 0; i < 8; ++i) {
|
||||
BLAKE2_ROUND(state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i],
|
||||
state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i],
|
||||
state[8 * 6 + i], state[8 * 7 + i]);
|
||||
}*/
|
||||
|
||||
for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm_xor_si128(state[i], block_XY[i]);
|
||||
_mm_storeu_si128(&next_block[i], state[i]);
|
||||
}
|
||||
}
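
/* Presumably the precomputed output of Argon2i address generation for the
   fixed parameters used here (16 blocks, 2 passes, 1 lane): 4 reference
   indices per segment, 8 segments in total. */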
|
||||
|
||||
static const uint64_t bad_rands[32] = {
|
||||
UINT64_C(17023632018251376180), UINT64_C(4911461131397773491),
|
||||
UINT64_C(15927076453364631751), UINT64_C(7860239898779391109),
|
||||
|
||||
UINT64_C(11820267568857244377), UINT64_C(12188179869468676617),
|
||||
UINT64_C(3732913385414474778), UINT64_C(7651458777762572084),
|
||||
|
||||
UINT64_C(3062274162574341415), UINT64_C(17922653540258786897),
|
||||
UINT64_C(17393848266100524980), UINT64_C(8539695715554563839),
|
||||
|
||||
UINT64_C(13824538050656654359), UINT64_C(12078939433126460936),
|
||||
UINT64_C(15331979418564540430), UINT64_C(12058346794217174273),
|
||||
|
||||
UINT64_C(13593922096015221049), UINT64_C(18356682276374416500),
|
||||
UINT64_C(4968040514092703824), UINT64_C(11202790346130235567),
|
||||
|
||||
UINT64_C(2276229735041314644), UINT64_C(220837743321691382),
|
||||
UINT64_C(4861211596230784273), UINT64_C(6330592584132590331),
|
||||
|
||||
UINT64_C(3515580430960296763), UINT64_C(9869356316971855173),
|
||||
UINT64_C(485533243489193056), UINT64_C(14596447761048148032),
|
||||
|
||||
UINT64_C(16531790085730132900), UINT64_C(17328824500878824371),
|
||||
UINT64_C(8548260058287621283), UINT64_C(8641748798041936364)
|
||||
};
|
||||
|
||||
void ar2_generate_addresses(const argon2_instance_t *instance,
|
||||
const argon2_position_t *position,
|
||||
uint64_t *pseudo_rands)
|
||||
{
|
||||
uint8_t offset = position->pass * 16 + position->slice * 4;
|
||||
pseudo_rands[0] = bad_rands[offset++];
|
||||
pseudo_rands[1] = bad_rands[offset++];
|
||||
pseudo_rands[2] = bad_rands[offset++];
|
||||
pseudo_rands[3] = bad_rands[offset++];
|
||||
|
||||
/*if ((position->pass == 1 && position->slice == 3))
|
||||
print64("pseudo_rands", pseudo_rands, 4);*/
|
||||
}
|
||||
|
||||
#define SEGMENT_LENGTH 4
|
||||
#define LANE_LENGTH 16
|
||||
#define POS_LANE 0
|
||||
|
||||
void ar2_fill_segment(const argon2_instance_t *instance,
|
||||
argon2_position_t position)
|
||||
{
|
||||
block *ref_block = NULL, *curr_block = NULL;
|
||||
uint64_t pseudo_rand, ref_index;
|
||||
uint32_t prev_offset, curr_offset;
|
||||
uint8_t i;
|
||||
__m128i state[64];
|
||||
int data_independent_addressing = (instance->type == Argon2_i);
|
||||
|
||||
/* Pseudo-random values that determine the reference block position */
|
||||
uint64_t *pseudo_rands = NULL;
|
||||
|
||||
pseudo_rands = (uint64_t *)malloc(/*sizeof(uint64_t) * 4*/32);
|
||||
|
||||
if (data_independent_addressing) {
|
||||
ar2_generate_addresses(instance, &position, pseudo_rands);
|
||||
}
|
||||
|
||||
i = 0;
|
||||
|
||||
if ((0 == position.pass) && (0 == position.slice)) {
|
||||
i = 2; /* we have already generated the first two blocks */
|
||||
}
|
||||
|
||||
/*printf("Position.lane = %d\nPosition.slice = %d\nStarting index : %d\n", position.lane, position.slice, starting_index);*/
|
||||
/* Offset of the current block */
|
||||
curr_offset = position.slice * 4 + i;
|
||||
|
||||
if (0 == curr_offset % 16) {
|
||||
/* Last block in this lane */
|
||||
prev_offset = curr_offset + /*instance->lane_length - 1*/15;
|
||||
} else {
|
||||
/* Previous block */
|
||||
prev_offset = curr_offset - 1;
|
||||
}
|
||||
|
||||
memcpy(state, ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE);
|
||||
|
||||
for (; i < SEGMENT_LENGTH;
|
||||
++i, ++curr_offset, ++prev_offset) {
|
||||
/*1.1 Rotating prev_offset if needed */
|
||||
if (curr_offset % LANE_LENGTH == 1) {
|
||||
prev_offset = curr_offset - 1;
|
||||
}
|
||||
|
||||
/* 1.2 Computing the index of the reference block */
|
||||
/* 1.2.1 Taking pseudo-random value from the previous block */
|
||||
if (data_independent_addressing) {
|
||||
pseudo_rand = pseudo_rands[i];
|
||||
} else {
|
||||
pseudo_rand = instance->memory[prev_offset].v[0];
|
||||
}
|
||||
|
||||
/* 1.2.2 Computing the lane of the reference block */
|
||||
|
||||
/* 1.2.3 Computing the number of possible reference block within the
|
||||
* lane.
|
||||
*/
|
||||
position.index = i;
|
||||
ref_index = ar2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,1);
|
||||
|
||||
/* 2 Creating a new block */
|
||||
ref_block = instance->memory + ref_index;
|
||||
curr_block = instance->memory + curr_offset;
|
||||
ar2_fill_block(state, (__m128i const *)ref_block->v, (__m128i *)curr_block->v);
|
||||
}
|
||||
|
||||
free(pseudo_rands);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,49 +0,0 @@
|
||||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#ifndef ARGON2_OPT_H
|
||||
#define ARGON2_OPT_H
|
||||
|
||||
/*
|
||||
 * Function fills a new memory block. Differs from the reference version in that
 * the working state is kept in SSE2 registers across the whole segment.
|
||||
* @param state Pointer to the just produced block. Content will be updated(!)
|
||||
* @param ref_block Pointer to the reference block
|
||||
* @param next_block Pointer to the block to be constructed
|
||||
* @pre all block pointers must be valid
|
||||
*/
|
||||
void ar2_fill_block(__m128i *state, __m128i const *ref_block, __m128i *next_block);
|
||||
|
||||
/*
|
||||
* Generate pseudo-random values to reference blocks in the segment and puts
|
||||
* them into the array
|
||||
* @param instance Pointer to the current instance
|
||||
* @param position Pointer to the current position
|
||||
* @param pseudo_rands Pointer to the array of 64-bit values
|
||||
* @pre pseudo_rands must point to @a instance->segment_length allocated values
|
||||
*/
|
||||
void ar2_generate_addresses(const argon2_instance_t *instance,
|
||||
const argon2_position_t *position,
|
||||
uint64_t *pseudo_rands);
|
||||
|
||||
/*
|
||||
* Function that fills the segment using previous segments also from other
|
||||
* threads.
|
||||
* Identical to the reference code except that it calls optimized FillBlock()
|
||||
* @param instance Pointer to the current instance
|
||||
* @param position Current position
|
||||
* @pre all block pointers must be valid
|
||||
*/
|
||||
void ar2_fill_segment(const argon2_instance_t *instance,
|
||||
argon2_position_t position);
|
||||
|
||||
#endif /* ARGON2_OPT_H */
|
||||
@@ -1,174 +0,0 @@
|
||||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "argon2.h"
|
||||
#include "cores.h"
|
||||
#include "ref.h"
|
||||
|
||||
#include "blake2/blamka-round-ref.h"
|
||||
#include "blake2/blake2-impl.h"
|
||||
#include "blake2/blake2.h"
|
||||
|
||||
void fill_block(const block *prev_block, const block *ref_block,
|
||||
block *next_block) {
|
||||
block blockR, block_tmp;
|
||||
unsigned i;
|
||||
|
||||
copy_block(&blockR, ref_block);
|
||||
xor_block(&blockR, prev_block);
|
||||
copy_block(&block_tmp, &blockR);
|
||||
|
||||
/* Apply Blake2 on columns of 64-bit words: (0,1,...,15) , then
|
||||
(16,17,..31)... finally (112,113,...127) */
|
||||
for (i = 0; i < 8; ++i) {
|
||||
BLAKE2_ROUND_NOMSG(
|
||||
blockR.v[16 * i], blockR.v[16 * i + 1], blockR.v[16 * i + 2],
|
||||
blockR.v[16 * i + 3], blockR.v[16 * i + 4], blockR.v[16 * i + 5],
|
||||
blockR.v[16 * i + 6], blockR.v[16 * i + 7], blockR.v[16 * i + 8],
|
||||
blockR.v[16 * i + 9], blockR.v[16 * i + 10], blockR.v[16 * i + 11],
|
||||
blockR.v[16 * i + 12], blockR.v[16 * i + 13], blockR.v[16 * i + 14],
|
||||
blockR.v[16 * i + 15]);
|
||||
}
|
||||
|
||||
/* Apply Blake2 on rows of 64-bit words: (0,1,16,17,...112,113), then
|
||||
(2,3,18,19,...,114,115).. finally (14,15,30,31,...,126,127) */
|
||||
for (i = 0; i < 8; i++) {
|
||||
BLAKE2_ROUND_NOMSG(
|
||||
blockR.v[2 * i], blockR.v[2 * i + 1], blockR.v[2 * i + 16],
|
||||
blockR.v[2 * i + 17], blockR.v[2 * i + 32], blockR.v[2 * i + 33],
|
||||
blockR.v[2 * i + 48], blockR.v[2 * i + 49], blockR.v[2 * i + 64],
|
||||
blockR.v[2 * i + 65], blockR.v[2 * i + 80], blockR.v[2 * i + 81],
|
||||
blockR.v[2 * i + 96], blockR.v[2 * i + 97], blockR.v[2 * i + 112],
|
||||
blockR.v[2 * i + 113]);
|
||||
}
|
||||
|
||||
copy_block(next_block, &block_tmp);
|
||||
xor_block(next_block, &blockR);
|
||||
}
|
||||
|
||||
void generate_addresses(const argon2_instance_t *instance,
|
||||
const argon2_position_t *position,
|
||||
uint64_t *pseudo_rands) {
|
||||
block zero_block, input_block, address_block;
|
||||
uint32_t i;
|
||||
|
||||
init_block_value(&zero_block, 0);
|
||||
init_block_value(&input_block, 0);
|
||||
init_block_value(&address_block, 0);
|
||||
|
||||
if (instance != NULL && position != NULL) {
|
||||
input_block.v[0] = position->pass;
|
||||
input_block.v[1] = position->lane;
|
||||
input_block.v[2] = position->slice;
|
||||
input_block.v[3] = 16;
|
||||
input_block.v[4] = 2;
|
||||
input_block.v[5] = instance->type;
|
||||
|
||||
for (i = 0; i < 4; ++i) {
|
||||
if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) {
|
||||
input_block.v[6]++;
|
||||
fill_block(&zero_block, &input_block, &address_block);
|
||||
fill_block(&zero_block, &address_block, &address_block);
|
||||
}
|
||||
|
||||
pseudo_rands[i] = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void fill_segment(const argon2_instance_t *instance,
|
||||
argon2_position_t position) {
|
||||
block *ref_block = NULL, *curr_block = NULL;
|
||||
uint64_t pseudo_rand, ref_index, ref_lane;
|
||||
uint32_t prev_offset, curr_offset;
|
||||
uint32_t starting_index;
|
||||
uint32_t i;
|
||||
int data_independent_addressing = (instance->type == Argon2_i);
|
||||
/* Pseudo-random values that determine the reference block position */
|
||||
uint64_t *pseudo_rands = NULL;
|
||||
|
||||
if (instance == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
pseudo_rands =
|
||||
(uint64_t *)malloc(sizeof(uint64_t) * 4);
|
||||
|
||||
if (pseudo_rands == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (data_independent_addressing) {
|
||||
generate_addresses(instance, &position, pseudo_rands);
|
||||
}
|
||||
|
||||
starting_index = 0;
|
||||
|
||||
if ((0 == position.pass) && (0 == position.slice)) {
|
||||
starting_index = 2; /* we have already generated the first two blocks */
|
||||
}
|
||||
|
||||
/* Offset of the current block */
|
||||
curr_offset = position.lane * 16 +
|
||||
position.slice * 4 + starting_index;
|
||||
|
||||
if (0 == curr_offset % 16) {
|
||||
/* Last block in this lane */
|
||||
prev_offset = curr_offset + 16 - 1;
|
||||
} else {
|
||||
/* Previous block */
|
||||
prev_offset = curr_offset - 1;
|
||||
}
|
||||
|
||||
for (i = starting_index; i < 4; ++i, ++curr_offset, ++prev_offset) {
|
||||
/*1.1 Rotating prev_offset if needed */
|
||||
if (curr_offset % 16 == 1) {
|
||||
prev_offset = curr_offset - 1;
|
||||
}
|
||||
|
||||
/* 1.2 Computing the index of the reference block */
|
||||
/* 1.2.1 Taking pseudo-random value from the previous block */
|
||||
if (data_independent_addressing) {
|
||||
pseudo_rand = pseudo_rands[i];
|
||||
} else {
|
||||
pseudo_rand = instance->memory[prev_offset].v[0];
|
||||
}
|
||||
|
||||
/* 1.2.2 Computing the lane of the reference block */
|
||||
ref_lane = ((pseudo_rand >> 32)) % 1;
|
||||
|
||||
if ((position.pass == 0) && (position.slice == 0)) {
|
||||
/* Can not reference other lanes yet */
|
||||
ref_lane = position.lane;
|
||||
}
|
||||
|
||||
/* 1.2.3 Computing the number of possible reference block within the
|
||||
* lane.
|
||||
*/
|
||||
position.index = i;
|
||||
ref_index = index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,
|
||||
ref_lane == position.lane);
|
||||
|
||||
/* 2 Creating a new block */
|
||||
ref_block =
|
||||
instance->memory + 16 * ref_lane + ref_index;
|
||||
curr_block = instance->memory + curr_offset;
|
||||
fill_block(instance->memory + prev_offset, ref_block, curr_block);
|
||||
}
|
||||
|
||||
free(pseudo_rands);
|
||||
}
|
||||
@@ -1,49 +0,0 @@
|
||||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#ifndef ARGON2_REF_H
|
||||
#define ARGON2_REF_H
|
||||
|
||||
/*
|
||||
* Function fills a new memory block
|
||||
* @param prev_block Pointer to the previous block
|
||||
* @param ref_block Pointer to the reference block
|
||||
* @param next_block Pointer to the block to be constructed
|
||||
* @pre all block pointers must be valid
|
||||
*/
|
||||
void fill_block(const block *prev_block, const block *ref_block,
|
||||
block *next_block);
|
||||
|
||||
/*
|
||||
* Generate pseudo-random values to reference blocks in the segment and puts
|
||||
* them into the array
|
||||
* @param instance Pointer to the current instance
|
||||
* @param position Pointer to the current position
|
||||
* @param pseudo_rands Pointer to the array of 64-bit values
|
||||
* @pre pseudo_rands must point to @a instance->segment_length allocated values
|
||||
*/
|
||||
void generate_addresses(const argon2_instance_t *instance,
|
||||
const argon2_position_t *position,
|
||||
uint64_t *pseudo_rands);
|
||||
|
||||
/*
|
||||
* Function that fills the segment using previous segments also from other
|
||||
* threads
|
||||
* @param instance Pointer to the current instance
|
||||
* @param position Current position
|
||||
* @pre all block pointers must be valid
|
||||
*/
|
||||
void fill_segment(const argon2_instance_t *instance,
|
||||
argon2_position_t position);
|
||||
|
||||
#endif /* ARGON2_REF_H */
|
||||
@@ -1,223 +0,0 @@
|
||||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <inttypes.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
|
||||
#include "argon2.h"
|
||||
#include "cores.h"
|
||||
|
||||
#define T_COST_DEF 3
|
||||
#define LOG_M_COST_DEF 12 /* 2^12 = 4 MiB */
|
||||
#define LANES_DEF 1
|
||||
#define THREADS_DEF 1
|
||||
#define OUT_LEN 32
|
||||
#define SALT_LEN 16
|
||||
|
||||
#define UNUSED_PARAMETER(x) (void)(x)
|
||||
|
||||
static void usage(const char *cmd) {
|
||||
printf("Usage: %s pwd salt [-y version] [-t iterations] [-m memory] [-p "
|
||||
"parallelism]\n",
|
||||
cmd);
|
||||
|
||||
printf("Parameters:\n");
|
||||
printf("\tpwd\t\tThe password to hash\n");
|
||||
printf("\tsalt\t\tThe salt to use, at most 16 characters\n");
|
||||
printf("\t-d\t\tUse Argon2d instead of Argon2i (which is the default)\n");
|
||||
printf("\t-t N\t\tSets the number of iterations to N (default = %d)\n",
|
||||
T_COST_DEF);
|
||||
printf("\t-m N\t\tSets the memory usage of 2^N KiB (default %d)\n",
|
||||
LOG_M_COST_DEF);
|
||||
printf("\t-p N\t\tSets parallelism to N threads (default %d)\n",
|
||||
THREADS_DEF);
|
||||
}
|
||||
|
||||
static void fatal(const char *error) {
|
||||
fprintf(stderr, "Error: %s\n", error);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/*
|
||||
Runs Argon2 with certain inputs and parameters, inputs not cleared. Prints the
|
||||
Base64-encoded hash string
|
||||
@out output array with at least 32 bytes allocated
|
||||
@pwd NULL-terminated string, presumably from argv[]
|
||||
@salt salt array with at least SALTLEN_DEF bytes allocated
|
||||
@t_cost number of iterations
|
||||
@m_cost amount of requested memory in KB
|
||||
@lanes amount of requested parallelism
|
||||
@threads actual parallelism
|
||||
@type String, only "d" and "i" are accepted
|
||||
*/
|
||||
static void run(uint8_t *out, char *pwd, uint8_t *salt, uint32_t t_cost,
|
||||
uint32_t m_cost, uint32_t lanes, uint32_t threads,
|
||||
const char *type) {
|
||||
clock_t start_time, stop_time;
|
||||
unsigned pwd_length;
|
||||
argon2_context context;
|
||||
int i;
|
||||
|
||||
start_time = clock();
|
||||
|
||||
if (!pwd) {
|
||||
fatal("password missing");
|
||||
}
|
||||
|
||||
if (!salt) {
|
||||
secure_wipe_memory(pwd, strlen(pwd));
|
||||
fatal("salt missing");
|
||||
}
|
||||
|
||||
pwd_length = strlen(pwd);
|
||||
|
||||
UNUSED_PARAMETER(threads);
|
||||
|
||||
context.out = out;
|
||||
context.outlen = OUT_LEN;
|
||||
context.pwd = (uint8_t *)pwd;
|
||||
context.pwdlen = pwd_length;
|
||||
context.salt = salt;
|
||||
context.saltlen = SALT_LEN;
|
||||
context.secret = NULL;
|
||||
context.secretlen = 0;
|
||||
context.ad = NULL;
|
||||
context.adlen = 0;
|
||||
context.t_cost = t_cost;
|
||||
context.m_cost = m_cost;
|
||||
context.lanes = lanes;
|
||||
context.threads = lanes;
|
||||
context.allocate_cbk = NULL;
|
||||
context.free_cbk = NULL;
|
||||
context.flags = ARGON2_FLAG_CLEAR_PASSWORD;
|
||||
|
||||
if (!strcmp(type, "d")) {
|
||||
int result = argon2d(&context);
|
||||
if (result != ARGON2_OK)
|
||||
fatal(error_message(result));
|
||||
} else if (!strcmp(type, "i")) {
|
||||
int result = argon2i(&context);
|
||||
if (result != ARGON2_OK)
|
||||
fatal(error_message(result));
|
||||
} else {
|
||||
secure_wipe_memory(pwd, strlen(pwd));
|
||||
fatal("wrong Argon2 type");
|
||||
}
|
||||
|
||||
stop_time = clock();
|
||||
|
||||
/* add back when proper decoding */
|
||||
/*
|
||||
char encoded[300];
|
||||
encode_string(encoded, sizeof encoded, &context);
|
||||
printf("%s\n", encoded);
|
||||
*/
|
||||
printf("Hash:\t\t");
|
||||
for (i = 0; i < context.outlen; ++i) {
|
||||
printf("%02x", context.out[i]);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
printf("%2.3f seconds\n",
|
||||
((double)stop_time - start_time) / (CLOCKS_PER_SEC));
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
    unsigned char out[OUT_LEN];
    uint32_t m_cost = 1 << LOG_M_COST_DEF;
    uint32_t t_cost = T_COST_DEF;
    uint32_t lanes = LANES_DEF;
    uint32_t threads = THREADS_DEF;
    char *pwd = NULL;
    uint8_t salt[SALT_LEN];
    const char *type = "i";
    int i;

    if (argc < 3) {
        usage(argv[0]);
        return ARGON2_MISSING_ARGS;
    }

    /* get password and salt from command line */
    pwd = argv[1];
    if (strlen(argv[2]) > SALT_LEN) {
        fatal("salt too long");
    }
    memset(salt, 0x00, SALT_LEN); /* pad with null bytes */
    memcpy(salt, argv[2], strlen(argv[2]));

    /* parse options */
    for (i = 3; i < argc; i++) {
        const char *a = argv[i];
        unsigned long input = 0;
        if (!strcmp(a, "-m")) {
            if (i < argc - 1) {
                i++;
                input = strtoul(argv[i], NULL, 10);
                if (input == 0 || input == ULONG_MAX ||
                    input > ARGON2_MAX_MEMORY_BITS) {
                    fatal("bad numeric input for -m");
                }
                m_cost = ARGON2_MIN(UINT64_C(1) << input, UINT32_C(0xFFFFFFFF));
                if (m_cost > ARGON2_MAX_MEMORY) {
                    fatal("m_cost overflow");
                }
                continue;
            } else {
                fatal("missing -m argument");
            }
        } else if (!strcmp(a, "-t")) {
            if (i < argc - 1) {
                i++;
                input = strtoul(argv[i], NULL, 10);
                if (input == 0 || input == ULONG_MAX ||
                    input > ARGON2_MAX_TIME) {
                    fatal("bad numeric input for -t");
                }
                t_cost = input;
                continue;
            } else {
                fatal("missing -t argument");
            }
        } else if (!strcmp(a, "-p")) {
            if (i < argc - 1) {
                i++;
                input = strtoul(argv[i], NULL, 10);
                if (input == 0 || input == ULONG_MAX ||
                    input > ARGON2_MAX_THREADS || input > ARGON2_MAX_LANES) {
                    fatal("bad numeric input for -p");
                }
                threads = input;
                lanes = threads;
                continue;
            } else {
                fatal("missing -p argument");
            }
        } else if (!strcmp(a, "-d")) {
            type = "d";
        } else {
            fatal("unknown argument");
        }
    }
    printf("Type:\t\tArgon2%c\n", type[0]);
    printf("Iterations:\t%" PRIu32 " \n", t_cost);
    printf("Memory:\t\t%" PRIu32 " KiB\n", m_cost);
    printf("Parallelism:\t%" PRIu32 " \n", lanes);
    run(out, pwd, salt, t_cost, m_cost, lanes, threads, type);

    return ARGON2_OK;
}
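
A usage sketch for the test driver above, inferred from its argument parsing; the binary name argon2-test is an assumption, not taken from the diff. argv[1] is the password, argv[2] the salt (at most SALT_LEN bytes, zero-padded), -t sets the iteration count, -m gives the memory cost as a power of two in KiB, -p sets lanes and threads, and -d selects Argon2d instead of the default Argon2i:

    ./argon2-test mypassword somesalt -t 3 -m 12 -p 4 -d

Here -m 12 requests 1 << 12 = 4096 KiB of memory and -p 4 runs four lanes/threads.
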
@@ -1,38 +0,0 @@
#if defined(SCRYPT_SKEIN512)
#include "scrypt-jane-hash_skein512.h"
#else
    #define SCRYPT_HASH "ERROR"
    #define SCRYPT_HASH_BLOCK_SIZE 64
    #define SCRYPT_HASH_DIGEST_SIZE 64
    typedef struct scrypt_hash_state_t { size_t dummy; } scrypt_hash_state;
    typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
    static void scrypt_hash_init(scrypt_hash_state *S) {}
    static void scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {}
    static void scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {}
    static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {0};
    #error must define a hash function!
#endif

#include "scrypt-jane-pbkdf2.h"

#define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */

static int
scrypt_test_hash(void) {
    scrypt_hash_state st;
    scrypt_hash_digest hash, final;
    uint8_t msg[SCRYPT_TEST_HASH_LEN];
    size_t i;

    for (i = 0; i < SCRYPT_TEST_HASH_LEN; i++)
        msg[i] = (uint8_t)i;

    scrypt_hash_init(&st);
    for (i = 0; i < SCRYPT_TEST_HASH_LEN + 1; i++) {
        scrypt_hash(hash, msg, i);
        scrypt_hash_update(&st, hash, sizeof(hash));
    }
    scrypt_hash_finish(&st, final);
    return scrypt_verify(final, scrypt_test_hash_expected, SCRYPT_HASH_DIGEST_SIZE);
}

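A brief note on the self-test above (added commentary, not part of the original file): scrypt_test_hash hashes every prefix length from 0 to SCRYPT_TEST_HASH_LEN of a fixed 0,1,2,... byte pattern with the one-shot scrypt_hash, feeds each 64-byte digest into a single streaming scrypt_hash_init/update/finish chain, and checks the final digest against the hash-specific scrypt_test_hash_expected vector, so both the one-shot and streaming paths are exercised across block boundaries.
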
@@ -1,188 +0,0 @@
#define SCRYPT_HASH "Skein-512"
#define SCRYPT_HASH_BLOCK_SIZE 64
#define SCRYPT_HASH_DIGEST_SIZE 64

typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];

typedef struct scrypt_hash_state_t {
    uint64_t X[8], T[2];
    uint32_t leftover;
    uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
} scrypt_hash_state;

#include <stdio.h>

static void
|
||||
skein512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks, size_t add) {
|
||||
uint64_t X[8], key[8], Xt[9+18], T[3+1];
|
||||
size_t r;
|
||||
|
||||
while (blocks--) {
|
||||
T[0] = S->T[0] + add;
|
||||
T[1] = S->T[1];
|
||||
T[2] = T[0] ^ T[1];
|
||||
key[0] = U8TO64_LE(in + 0); Xt[0] = S->X[0]; X[0] = key[0] + Xt[0];
|
||||
key[1] = U8TO64_LE(in + 8); Xt[1] = S->X[1]; X[1] = key[1] + Xt[1];
|
||||
key[2] = U8TO64_LE(in + 16); Xt[2] = S->X[2]; X[2] = key[2] + Xt[2];
|
||||
key[3] = U8TO64_LE(in + 24); Xt[3] = S->X[3]; X[3] = key[3] + Xt[3];
|
||||
key[4] = U8TO64_LE(in + 32); Xt[4] = S->X[4]; X[4] = key[4] + Xt[4];
|
||||
key[5] = U8TO64_LE(in + 40); Xt[5] = S->X[5]; X[5] = key[5] + Xt[5] + T[0];
|
||||
key[6] = U8TO64_LE(in + 48); Xt[6] = S->X[6]; X[6] = key[6] + Xt[6] + T[1];
|
||||
key[7] = U8TO64_LE(in + 56); Xt[7] = S->X[7]; X[7] = key[7] + Xt[7];
|
||||
Xt[8] = 0x1BD11BDAA9FC1A22ull ^ Xt[0] ^ Xt[1] ^ Xt[2] ^ Xt[3] ^ Xt[4] ^ Xt[5] ^ Xt[6] ^ Xt[7];
|
||||
in += SCRYPT_HASH_BLOCK_SIZE;
|
||||
|
||||
for (r = 0; r < 18; r++)
|
||||
Xt[r + 9] = Xt[r + 0];
|
||||
|
||||
for (r = 0; r < 18; r += 2) {
|
||||
X[0] += X[1]; X[1] = ROTL64(X[1], 46) ^ X[0];
|
||||
X[2] += X[3]; X[3] = ROTL64(X[3], 36) ^ X[2];
|
||||
X[4] += X[5]; X[5] = ROTL64(X[5], 19) ^ X[4];
|
||||
X[6] += X[7]; X[7] = ROTL64(X[7], 37) ^ X[6];
|
||||
X[2] += X[1]; X[1] = ROTL64(X[1], 33) ^ X[2];
|
||||
X[0] += X[3]; X[3] = ROTL64(X[3], 42) ^ X[0];
|
||||
X[6] += X[5]; X[5] = ROTL64(X[5], 14) ^ X[6];
|
||||
X[4] += X[7]; X[7] = ROTL64(X[7], 27) ^ X[4];
|
||||
X[4] += X[1]; X[1] = ROTL64(X[1], 17) ^ X[4];
|
||||
X[6] += X[3]; X[3] = ROTL64(X[3], 49) ^ X[6];
|
||||
X[0] += X[5]; X[5] = ROTL64(X[5], 36) ^ X[0];
|
||||
X[2] += X[7]; X[7] = ROTL64(X[7], 39) ^ X[2];
|
||||
X[6] += X[1]; X[1] = ROTL64(X[1], 44) ^ X[6];
|
||||
X[4] += X[3]; X[3] = ROTL64(X[3], 56) ^ X[4];
|
||||
X[2] += X[5]; X[5] = ROTL64(X[5], 54) ^ X[2];
|
||||
X[0] += X[7]; X[7] = ROTL64(X[7], 9) ^ X[0];
|
||||
|
||||
X[0] += Xt[r + 1];
|
||||
X[1] += Xt[r + 2];
|
||||
X[2] += Xt[r + 3];
|
||||
X[3] += Xt[r + 4];
|
||||
X[4] += Xt[r + 5];
|
||||
X[5] += Xt[r + 6] + T[1];
|
||||
X[6] += Xt[r + 7] + T[2];
|
||||
X[7] += Xt[r + 8] + r + 1;
|
||||
|
||||
T[3] = T[0];
|
||||
T[0] = T[1];
|
||||
T[1] = T[2];
|
||||
T[2] = T[3];
|
||||
|
||||
X[0] += X[1]; X[1] = ROTL64(X[1], 39) ^ X[0];
|
||||
X[2] += X[3]; X[3] = ROTL64(X[3], 30) ^ X[2];
|
||||
X[4] += X[5]; X[5] = ROTL64(X[5], 34) ^ X[4];
|
||||
X[6] += X[7]; X[7] = ROTL64(X[7], 24) ^ X[6];
|
||||
X[2] += X[1]; X[1] = ROTL64(X[1], 13) ^ X[2];
|
||||
X[0] += X[3]; X[3] = ROTL64(X[3], 17) ^ X[0];
|
||||
X[6] += X[5]; X[5] = ROTL64(X[5], 10) ^ X[6];
|
||||
X[4] += X[7]; X[7] = ROTL64(X[7], 50) ^ X[4];
|
||||
X[4] += X[1]; X[1] = ROTL64(X[1], 25) ^ X[4];
|
||||
X[6] += X[3]; X[3] = ROTL64(X[3], 29) ^ X[6];
|
||||
X[0] += X[5]; X[5] = ROTL64(X[5], 39) ^ X[0];
|
||||
X[2] += X[7]; X[7] = ROTL64(X[7], 43) ^ X[2];
|
||||
X[6] += X[1]; X[1] = ROTL64(X[1], 8) ^ X[6];
|
||||
X[4] += X[3]; X[3] = ROTL64(X[3], 22) ^ X[4];
|
||||
X[2] += X[5]; X[5] = ROTL64(X[5], 56) ^ X[2];
|
||||
X[0] += X[7]; X[7] = ROTL64(X[7], 35) ^ X[0];
|
||||
|
||||
X[0] += Xt[r + 2];
|
||||
X[1] += Xt[r + 3];
|
||||
X[2] += Xt[r + 4];
|
||||
X[3] += Xt[r + 5];
|
||||
X[4] += Xt[r + 6];
|
||||
X[5] += Xt[r + 7] + T[1];
|
||||
X[6] += Xt[r + 8] + T[2];
|
||||
X[7] += Xt[r + 9] + r + 2;
|
||||
|
||||
T[3] = T[0];
|
||||
T[0] = T[1];
|
||||
T[1] = T[2];
|
||||
T[2] = T[3];
|
||||
}
|
||||
|
||||
S->X[0] = key[0] ^ X[0];
|
||||
S->X[1] = key[1] ^ X[1];
|
||||
S->X[2] = key[2] ^ X[2];
|
||||
S->X[3] = key[3] ^ X[3];
|
||||
S->X[4] = key[4] ^ X[4];
|
||||
S->X[5] = key[5] ^ X[5];
|
||||
S->X[6] = key[6] ^ X[6];
|
||||
S->X[7] = key[7] ^ X[7];
|
||||
|
||||
S->T[0] = T[0];
|
||||
S->T[1] = T[1] & ~0x4000000000000000ull;
|
||||
}
|
||||
}
|
||||
|
||||
static void
scrypt_hash_init(scrypt_hash_state *S) {
    S->X[0] = 0x4903ADFF749C51CEull;
    S->X[1] = 0x0D95DE399746DF03ull;
    S->X[2] = 0x8FD1934127C79BCEull;
    S->X[3] = 0x9A255629FF352CB1ull;
    S->X[4] = 0x5DB62599DF6CA7B0ull;
    S->X[5] = 0xEABE394CA9D5C3F4ull;
    S->X[6] = 0x991112C71A75B523ull;
    S->X[7] = 0xAE18A40B660FCC33ull;
    S->T[0] = 0x0000000000000000ull;
    S->T[1] = 0x7000000000000000ull;
    S->leftover = 0;
}

static void
scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
    size_t blocks, want;

    /* skein processes the final <=64 bytes raw, so we can only update if there are at least 64+1 bytes available */
    if ((S->leftover + inlen) > SCRYPT_HASH_BLOCK_SIZE) {
        /* handle the previous data, we know there is enough for at least one block */
        if (S->leftover) {
            want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
            memcpy(S->buffer + S->leftover, in, want);
            in += want;
            inlen -= want;
            S->leftover = 0;
            skein512_blocks(S, S->buffer, 1, SCRYPT_HASH_BLOCK_SIZE);
        }

        /* handle the current data if there's more than one block */
        if (inlen > SCRYPT_HASH_BLOCK_SIZE) {
            blocks = ((inlen - 1) & ~(SCRYPT_HASH_BLOCK_SIZE - 1));
            skein512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE, SCRYPT_HASH_BLOCK_SIZE);
            inlen -= blocks;
            in += blocks;
        }
    }

    /* handle leftover data */
    memcpy(S->buffer + S->leftover, in, inlen);
    S->leftover += (int) inlen;
}

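A worked example of the buffering rule above, added for clarity (the byte counts are illustrative only):

    /* Example: with leftover = 40 buffered bytes and an update of inlen = 30,
       40 + 30 > 64, so want = 24 bytes top up the buffer, one 64-byte block is
       compressed with add = SCRYPT_HASH_BLOCK_SIZE, and the remaining 6 bytes are
       re-buffered; the final <= 64 bytes are always held back for scrypt_hash_finish,
       which compresses the last block with add = S->leftover. */
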
static void
scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
    memset(S->buffer + S->leftover, 0, SCRYPT_HASH_BLOCK_SIZE - S->leftover);
    S->T[1] |= 0x8000000000000000ull;
    skein512_blocks(S, S->buffer, 1, S->leftover);

    memset(S->buffer, 0, SCRYPT_HASH_BLOCK_SIZE);
    S->T[0] = 0;
    S->T[1] = 0xff00000000000000ull;
    skein512_blocks(S, S->buffer, 1, 8);

    U64TO8_LE(&hash[ 0], S->X[0]);
    U64TO8_LE(&hash[ 8], S->X[1]);
    U64TO8_LE(&hash[16], S->X[2]);
    U64TO8_LE(&hash[24], S->X[3]);
    U64TO8_LE(&hash[32], S->X[4]);
    U64TO8_LE(&hash[40], S->X[5]);
    U64TO8_LE(&hash[48], S->X[6]);
    U64TO8_LE(&hash[56], S->X[7]);
}


static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
    0x4d,0x52,0x29,0xff,0x10,0xbc,0xd2,0x62,0xd1,0x61,0x83,0xc8,0xe6,0xf0,0x83,0xc4,
    0x9f,0xf5,0x6a,0x42,0x75,0x2a,0x26,0x4e,0xf0,0x28,0x72,0x28,0x47,0xe8,0x23,0xdf,
    0x1e,0x64,0xf1,0x51,0x38,0x35,0x9d,0xc2,0x83,0xfc,0x35,0x4e,0xc0,0x52,0x5f,0x41,
    0x6a,0x0b,0x7d,0xf5,0xce,0x98,0xde,0x6f,0x36,0xd8,0x51,0x15,0x78,0x78,0x93,0x67,
};
@@ -1,367 +0,0 @@
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
|
||||
|
||||
#define SCRYPT_SALSA64_AVX
|
||||
|
||||
asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_avx)
|
||||
a1(push rbp)
|
||||
a2(mov rbp, rsp)
|
||||
a2(and rsp, ~63)
|
||||
a2(sub rsp, 128)
|
||||
a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
|
||||
a2(shl rcx,7)
|
||||
a2(lea r9,[rcx-128])
|
||||
a2(lea rax,[rsi+r9])
|
||||
a2(lea r9,[rdx+r9])
|
||||
a2(and rdx, rdx)
|
||||
a2(vmovdqa xmm0,[rax+0])
|
||||
a2(vmovdqa xmm1,[rax+16])
|
||||
a2(vmovdqa xmm2,[rax+32])
|
||||
a2(vmovdqa xmm3,[rax+48])
|
||||
a2(vmovdqa xmm4,[rax+64])
|
||||
a2(vmovdqa xmm5,[rax+80])
|
||||
a2(vmovdqa xmm6,[rax+96])
|
||||
a2(vmovdqa xmm7,[rax+112])
|
||||
aj(jz scrypt_ChunkMix_avx_no_xor1)
|
||||
a3(vpxor xmm0,xmm0,[r9+0])
|
||||
a3(vpxor xmm1,xmm1,[r9+16])
|
||||
a3(vpxor xmm2,xmm2,[r9+32])
|
||||
a3(vpxor xmm3,xmm3,[r9+48])
|
||||
a3(vpxor xmm4,xmm4,[r9+64])
|
||||
a3(vpxor xmm5,xmm5,[r9+80])
|
||||
a3(vpxor xmm6,xmm6,[r9+96])
|
||||
a3(vpxor xmm7,xmm7,[r9+112])
|
||||
a1(scrypt_ChunkMix_avx_no_xor1:)
|
||||
a2(xor r9,r9)
|
||||
a2(xor r8,r8)
|
||||
a1(scrypt_ChunkMix_avx_loop:)
|
||||
a2(and rdx, rdx)
|
||||
a3(vpxor xmm0,xmm0,[rsi+r9+0])
|
||||
a3(vpxor xmm1,xmm1,[rsi+r9+16])
|
||||
a3(vpxor xmm2,xmm2,[rsi+r9+32])
|
||||
a3(vpxor xmm3,xmm3,[rsi+r9+48])
|
||||
a3(vpxor xmm4,xmm4,[rsi+r9+64])
|
||||
a3(vpxor xmm5,xmm5,[rsi+r9+80])
|
||||
a3(vpxor xmm6,xmm6,[rsi+r9+96])
|
||||
a3(vpxor xmm7,xmm7,[rsi+r9+112])
|
||||
aj(jz scrypt_ChunkMix_avx_no_xor2)
|
||||
a3(vpxor xmm0,xmm0,[rdx+r9+0])
|
||||
a3(vpxor xmm1,xmm1,[rdx+r9+16])
|
||||
a3(vpxor xmm2,xmm2,[rdx+r9+32])
|
||||
a3(vpxor xmm3,xmm3,[rdx+r9+48])
|
||||
a3(vpxor xmm4,xmm4,[rdx+r9+64])
|
||||
a3(vpxor xmm5,xmm5,[rdx+r9+80])
|
||||
a3(vpxor xmm6,xmm6,[rdx+r9+96])
|
||||
a3(vpxor xmm7,xmm7,[rdx+r9+112])
|
||||
a1(scrypt_ChunkMix_avx_no_xor2:)
|
||||
a2(vmovdqa [rsp+0],xmm0)
|
||||
a2(vmovdqa [rsp+16],xmm1)
|
||||
a2(vmovdqa [rsp+32],xmm2)
|
||||
a2(vmovdqa [rsp+48],xmm3)
|
||||
a2(vmovdqa [rsp+64],xmm4)
|
||||
a2(vmovdqa [rsp+80],xmm5)
|
||||
a2(vmovdqa [rsp+96],xmm6)
|
||||
a2(vmovdqa [rsp+112],xmm7)
|
||||
a2(mov rax,8)
|
||||
a1(scrypt_salsa64_avx_loop: )
|
||||
a3(vpaddq xmm8, xmm0, xmm2)
|
||||
a3(vpaddq xmm9, xmm1, xmm3)
|
||||
a3(vpshufd xmm8, xmm8, 0xb1)
|
||||
a3(vpshufd xmm9, xmm9, 0xb1)
|
||||
a3(vpxor xmm6, xmm6, xmm8)
|
||||
a3(vpxor xmm7, xmm7, xmm9)
|
||||
a3(vpaddq xmm10, xmm0, xmm6)
|
||||
a3(vpaddq xmm11, xmm1, xmm7)
|
||||
a3(vpsrlq xmm8, xmm10, 51)
|
||||
a3(vpsrlq xmm9, xmm11, 51)
|
||||
a3(vpsllq xmm10, xmm10, 13)
|
||||
a3(vpsllq xmm11, xmm11, 13)
|
||||
a3(vpxor xmm4, xmm4, xmm8)
|
||||
a3(vpxor xmm5, xmm5, xmm9)
|
||||
a3(vpxor xmm4, xmm4, xmm10)
|
||||
a3(vpxor xmm5, xmm5, xmm11)
|
||||
a3(vpaddq xmm8, xmm6, xmm4)
|
||||
a3(vpaddq xmm9, xmm7, xmm5)
|
||||
a3(vpsrlq xmm10, xmm8, 25)
|
||||
a3(vpsrlq xmm11, xmm9, 25)
|
||||
a3(vpsllq xmm8, xmm8, 39)
|
||||
a3(vpsllq xmm9, xmm9, 39)
|
||||
a3(vpxor xmm2, xmm2, xmm10)
|
||||
a3(vpxor xmm3, xmm3, xmm11)
|
||||
a3(vpxor xmm2, xmm2, xmm8)
|
||||
a3(vpxor xmm3, xmm3, xmm9)
|
||||
a3(vpaddq xmm10, xmm4, xmm2)
|
||||
a3(vpaddq xmm11, xmm5, xmm3)
|
||||
a3(vpshufd xmm10, xmm10, 0xb1)
|
||||
a3(vpshufd xmm11, xmm11, 0xb1)
|
||||
a3(vpxor xmm0, xmm0, xmm10)
|
||||
a3(vpxor xmm1, xmm1, xmm11)
|
||||
a2(vmovdqa xmm8, xmm2)
|
||||
a2(vmovdqa xmm9, xmm3)
|
||||
a4(vpalignr xmm2, xmm6, xmm7, 8)
|
||||
a4(vpalignr xmm3, xmm7, xmm6, 8)
|
||||
a4(vpalignr xmm6, xmm9, xmm8, 8)
|
||||
a4(vpalignr xmm7, xmm8, xmm9, 8)
|
||||
a3(vpaddq xmm10, xmm0, xmm2)
|
||||
a3(vpaddq xmm11, xmm1, xmm3)
|
||||
a3(vpshufd xmm10, xmm10, 0xb1)
|
||||
a3(vpshufd xmm11, xmm11, 0xb1)
|
||||
a3(vpxor xmm6, xmm6, xmm10)
|
||||
a3(vpxor xmm7, xmm7, xmm11)
|
||||
a3(vpaddq xmm8, xmm0, xmm6)
|
||||
a3(vpaddq xmm9, xmm1, xmm7)
|
||||
a3(vpsrlq xmm10, xmm8, 51)
|
||||
a3(vpsrlq xmm11, xmm9, 51)
|
||||
a3(vpsllq xmm8, xmm8, 13)
|
||||
a3(vpsllq xmm9, xmm9, 13)
|
||||
a3(vpxor xmm5, xmm5, xmm10)
|
||||
a3(vpxor xmm4, xmm4, xmm11)
|
||||
a3(vpxor xmm5, xmm5, xmm8)
|
||||
a3(vpxor xmm4, xmm4, xmm9)
|
||||
a3(vpaddq xmm10, xmm6, xmm5)
|
||||
a3(vpaddq xmm11, xmm7, xmm4)
|
||||
a3(vpsrlq xmm8, xmm10, 25)
|
||||
a3(vpsrlq xmm9, xmm11, 25)
|
||||
a3(vpsllq xmm10, xmm10, 39)
|
||||
a3(vpsllq xmm11, xmm11, 39)
|
||||
a3(vpxor xmm2, xmm2, xmm8)
|
||||
a3(vpxor xmm3, xmm3, xmm9)
|
||||
a3(vpxor xmm2, xmm2, xmm10)
|
||||
a3(vpxor xmm3, xmm3, xmm11)
|
||||
a3(vpaddq xmm8, xmm5, xmm2)
|
||||
a3(vpaddq xmm9, xmm4, xmm3)
|
||||
a3(vpshufd xmm8, xmm8, 0xb1)
|
||||
a3(vpshufd xmm9, xmm9, 0xb1)
|
||||
a3(vpxor xmm0, xmm0, xmm8)
|
||||
a3(vpxor xmm1, xmm1, xmm9)
|
||||
a2(vmovdqa xmm10, xmm2)
|
||||
a2(vmovdqa xmm11, xmm3)
|
||||
a4(vpalignr xmm2, xmm6, xmm7, 8)
|
||||
a4(vpalignr xmm3, xmm7, xmm6, 8)
|
||||
a4(vpalignr xmm6, xmm11, xmm10, 8)
|
||||
a4(vpalignr xmm7, xmm10, xmm11, 8)
|
||||
a2(sub rax, 2)
|
||||
aj(ja scrypt_salsa64_avx_loop)
|
||||
a3(vpaddq xmm0,xmm0,[rsp+0])
|
||||
a3(vpaddq xmm1,xmm1,[rsp+16])
|
||||
a3(vpaddq xmm2,xmm2,[rsp+32])
|
||||
a3(vpaddq xmm3,xmm3,[rsp+48])
|
||||
a3(vpaddq xmm4,xmm4,[rsp+64])
|
||||
a3(vpaddq xmm5,xmm5,[rsp+80])
|
||||
a3(vpaddq xmm6,xmm6,[rsp+96])
|
||||
a3(vpaddq xmm7,xmm7,[rsp+112])
|
||||
a2(lea rax,[r8+r9])
|
||||
a2(xor r8,rcx)
|
||||
a2(and rax,~0xff)
|
||||
a2(add r9,128)
|
||||
a2(shr rax,1)
|
||||
a2(add rax, rdi)
|
||||
a2(cmp r9,rcx)
|
||||
a2(vmovdqa [rax+0],xmm0)
|
||||
a2(vmovdqa [rax+16],xmm1)
|
||||
a2(vmovdqa [rax+32],xmm2)
|
||||
a2(vmovdqa [rax+48],xmm3)
|
||||
a2(vmovdqa [rax+64],xmm4)
|
||||
a2(vmovdqa [rax+80],xmm5)
|
||||
a2(vmovdqa [rax+96],xmm6)
|
||||
a2(vmovdqa [rax+112],xmm7)
|
||||
aj(jne scrypt_ChunkMix_avx_loop)
|
||||
a2(mov rsp, rbp)
|
||||
a1(pop rbp)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_avx)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA64_AVX
|
||||
|
||||
static void asm_calling_convention
|
||||
scrypt_ChunkMix_avx(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
x4 = xmmp[4];
|
||||
x5 = xmmp[5];
|
||||
x6 = xmmp[6];
|
||||
x7 = xmmp[7];
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
t4 = x4;
|
||||
t5 = x5;
|
||||
t6 = x6;
|
||||
t7 = x7;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z2 = _mm_srli_epi64(z0, 64-13);
|
||||
z3 = _mm_srli_epi64(z1, 64-13);
|
||||
z0 = _mm_slli_epi64(z0, 13);
|
||||
z1 = _mm_slli_epi64(z1, 13);
|
||||
x4 = _mm_xor_si128(x4, z2);
|
||||
x5 = _mm_xor_si128(x5, z3);
|
||||
x4 = _mm_xor_si128(x4, z0);
|
||||
x5 = _mm_xor_si128(x5, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x4, x6);
|
||||
z1 = _mm_add_epi64(x5, x7);
|
||||
z2 = _mm_srli_epi64(z0, 64-39);
|
||||
z3 = _mm_srli_epi64(z1, 64-39);
|
||||
z0 = _mm_slli_epi64(z0, 39);
|
||||
z1 = _mm_slli_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z2);
|
||||
x3 = _mm_xor_si128(x3, z3);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x4);
|
||||
z1 = _mm_add_epi64(x3, x5);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x2;
|
||||
z1 = x3;
|
||||
x2 = _mm_alignr_epi8(x6, x7, 8);
|
||||
x3 = _mm_alignr_epi8(x7, x6, 8);
|
||||
x6 = _mm_alignr_epi8(z1, z0, 8);
|
||||
x7 = _mm_alignr_epi8(z0, z1, 8);
|
||||
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z2 = _mm_srli_epi64(z0, 64-13);
|
||||
z3 = _mm_srli_epi64(z1, 64-13);
|
||||
z0 = _mm_slli_epi64(z0, 13);
|
||||
z1 = _mm_slli_epi64(z1, 13);
|
||||
x5 = _mm_xor_si128(x5, z2);
|
||||
x4 = _mm_xor_si128(x4, z3);
|
||||
x5 = _mm_xor_si128(x5, z0);
|
||||
x4 = _mm_xor_si128(x4, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x5, x6);
|
||||
z1 = _mm_add_epi64(x4, x7);
|
||||
z2 = _mm_srli_epi64(z0, 64-39);
|
||||
z3 = _mm_srli_epi64(z1, 64-39);
|
||||
z0 = _mm_slli_epi64(z0, 39);
|
||||
z1 = _mm_slli_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z2);
|
||||
x3 = _mm_xor_si128(x3, z3);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x5);
|
||||
z1 = _mm_add_epi64(x3, x4);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x2;
|
||||
z1 = x3;
|
||||
x2 = _mm_alignr_epi8(x6, x7, 8);
|
||||
x3 = _mm_alignr_epi8(x7, x6, 8);
|
||||
x6 = _mm_alignr_epi8(z1, z0, 8);
|
||||
x7 = _mm_alignr_epi8(z0, z1, 8);
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi64(x0, t0);
|
||||
x1 = _mm_add_epi64(x1, t1);
|
||||
x2 = _mm_add_epi64(x2, t2);
|
||||
x3 = _mm_add_epi64(x3, t3);
|
||||
x4 = _mm_add_epi64(x4, t4);
|
||||
x5 = _mm_add_epi64(x5, t5);
|
||||
x6 = _mm_add_epi64(x6, t6);
|
||||
x7 = _mm_add_epi64(x7, t7);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
xmmp[4] = x4;
|
||||
xmmp[5] = x5;
|
||||
xmmp[6] = x6;
|
||||
xmmp[7] = x7;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX)
    /* uses salsa64_core_tangle_sse2 */

    #undef SCRYPT_MIX
    #define SCRYPT_MIX "Salsa64/8-AVX"
    #undef SCRYPT_SALSA64_INCLUDED
    #define SCRYPT_SALSA64_INCLUDED
#endif
@@ -1,221 +0,0 @@
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_AVX2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
|
||||
|
||||
#define SCRYPT_SALSA64_AVX2
|
||||
|
||||
asm_naked_fn_proto(void, scrypt_ChunkMix_avx2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_avx2)
|
||||
a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
|
||||
a2(shl rcx,7)
|
||||
a2(lea r9,[rcx-128])
|
||||
a2(lea rax,[rsi+r9])
|
||||
a2(lea r9,[rdx+r9])
|
||||
a2(and rdx, rdx)
|
||||
a2(vmovdqa ymm0,[rax+0])
|
||||
a2(vmovdqa ymm1,[rax+32])
|
||||
a2(vmovdqa ymm2,[rax+64])
|
||||
a2(vmovdqa ymm3,[rax+96])
|
||||
aj(jz scrypt_ChunkMix_avx2_no_xor1)
|
||||
a3(vpxor ymm0,ymm0,[r9+0])
|
||||
a3(vpxor ymm1,ymm1,[r9+32])
|
||||
a3(vpxor ymm2,ymm2,[r9+64])
|
||||
a3(vpxor ymm3,ymm3,[r9+96])
|
||||
a1(scrypt_ChunkMix_avx2_no_xor1:)
|
||||
a2(xor r9,r9)
|
||||
a2(xor r8,r8)
|
||||
a1(scrypt_ChunkMix_avx2_loop:)
|
||||
a2(and rdx, rdx)
|
||||
a3(vpxor ymm0,ymm0,[rsi+r9+0])
|
||||
a3(vpxor ymm1,ymm1,[rsi+r9+32])
|
||||
a3(vpxor ymm2,ymm2,[rsi+r9+64])
|
||||
a3(vpxor ymm3,ymm3,[rsi+r9+96])
|
||||
aj(jz scrypt_ChunkMix_avx2_no_xor2)
|
||||
a3(vpxor ymm0,ymm0,[rdx+r9+0])
|
||||
a3(vpxor ymm1,ymm1,[rdx+r9+32])
|
||||
a3(vpxor ymm2,ymm2,[rdx+r9+64])
|
||||
a3(vpxor ymm3,ymm3,[rdx+r9+96])
|
||||
a1(scrypt_ChunkMix_avx2_no_xor2:)
|
||||
a2(vmovdqa ymm6,ymm0)
|
||||
a2(vmovdqa ymm7,ymm1)
|
||||
a2(vmovdqa ymm8,ymm2)
|
||||
a2(vmovdqa ymm9,ymm3)
|
||||
a2(mov rax,4)
|
||||
a1(scrypt_salsa64_avx2_loop: )
|
||||
a3(vpaddq ymm4, ymm1, ymm0)
|
||||
a3(vpshufd ymm4, ymm4, 0xb1)
|
||||
a3(vpxor ymm3, ymm3, ymm4)
|
||||
a3(vpaddq ymm4, ymm0, ymm3)
|
||||
a3(vpsrlq ymm5, ymm4, 51)
|
||||
a3(vpxor ymm2, ymm2, ymm5)
|
||||
a3(vpsllq ymm4, ymm4, 13)
|
||||
a3(vpxor ymm2, ymm2, ymm4)
|
||||
a3(vpaddq ymm4, ymm3, ymm2)
|
||||
a3(vpsrlq ymm5, ymm4, 25)
|
||||
a3(vpxor ymm1, ymm1, ymm5)
|
||||
a3(vpsllq ymm4, ymm4, 39)
|
||||
a3(vpxor ymm1, ymm1, ymm4)
|
||||
a3(vpaddq ymm4, ymm2, ymm1)
|
||||
a3(vpshufd ymm4, ymm4, 0xb1)
|
||||
a3(vpermq ymm1, ymm1, 0x39)
|
||||
a3(vpermq ymm10, ymm2, 0x4e)
|
||||
a3(vpxor ymm0, ymm0, ymm4)
|
||||
a3(vpermq ymm3, ymm3, 0x93)
|
||||
a3(vpaddq ymm4, ymm3, ymm0)
|
||||
a3(vpshufd ymm4, ymm4, 0xb1)
|
||||
a3(vpxor ymm1, ymm1, ymm4)
|
||||
a3(vpaddq ymm4, ymm0, ymm1)
|
||||
a3(vpsrlq ymm5, ymm4, 51)
|
||||
a3(vpxor ymm10, ymm10, ymm5)
|
||||
a3(vpsllq ymm4, ymm4, 13)
|
||||
a3(vpxor ymm10, ymm10, ymm4)
|
||||
a3(vpaddq ymm4, ymm1, ymm10)
|
||||
a3(vpsrlq ymm5, ymm4, 25)
|
||||
a3(vpxor ymm3, ymm3, ymm5)
|
||||
a3(vpsllq ymm4, ymm4, 39)
|
||||
a3(vpermq ymm1, ymm1, 0x93)
|
||||
a3(vpxor ymm3, ymm3, ymm4)
|
||||
a3(vpermq ymm2, ymm10, 0x4e)
|
||||
a3(vpaddq ymm4, ymm10, ymm3)
|
||||
a3(vpshufd ymm4, ymm4, 0xb1)
|
||||
a3(vpermq ymm3, ymm3, 0x39)
|
||||
a3(vpxor ymm0, ymm0, ymm4)
|
||||
a1(dec rax)
|
||||
aj(jnz scrypt_salsa64_avx2_loop)
|
||||
a3(vpaddq ymm0,ymm0,ymm6)
|
||||
a3(vpaddq ymm1,ymm1,ymm7)
|
||||
a3(vpaddq ymm2,ymm2,ymm8)
|
||||
a3(vpaddq ymm3,ymm3,ymm9)
|
||||
a2(lea rax,[r8+r9])
|
||||
a2(xor r8,rcx)
|
||||
a2(and rax,~0xff)
|
||||
a2(add r9,128)
|
||||
a2(shr rax,1)
|
||||
a2(add rax, rdi)
|
||||
a2(cmp r9,rcx)
|
||||
a2(vmovdqa [rax+0],ymm0)
|
||||
a2(vmovdqa [rax+32],ymm1)
|
||||
a2(vmovdqa [rax+64],ymm2)
|
||||
a2(vmovdqa [rax+96],ymm3)
|
||||
aj(jne scrypt_ChunkMix_avx2_loop)
|
||||
a1(vzeroupper)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_avx2)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_AVX2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA64_AVX2
|
||||
|
||||
static void asm_calling_convention
|
||||
scrypt_ChunkMix_avx2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
ymmi *ymmp,y0,y1,y2,y3,t0,t1,t2,t3,z0,z1;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
ymmp = (ymmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
y0 = ymmp[0];
|
||||
y1 = ymmp[1];
|
||||
y2 = ymmp[2];
|
||||
y3 = ymmp[3];
|
||||
|
||||
if (Bxor) {
|
||||
ymmp = (ymmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
y0 = _mm256_xor_si256(y0, ymmp[0]);
|
||||
y1 = _mm256_xor_si256(y1, ymmp[1]);
|
||||
y2 = _mm256_xor_si256(y2, ymmp[2]);
|
||||
y3 = _mm256_xor_si256(y3, ymmp[3]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
ymmp = (ymmi *)scrypt_block(Bin, i);
|
||||
y0 = _mm256_xor_si256(y0, ymmp[0]);
|
||||
y1 = _mm256_xor_si256(y1, ymmp[1]);
|
||||
y2 = _mm256_xor_si256(y2, ymmp[2]);
|
||||
y3 = _mm256_xor_si256(y3, ymmp[3]);
|
||||
|
||||
if (Bxor) {
|
||||
ymmp = (ymmi *)scrypt_block(Bxor, i);
|
||||
y0 = _mm256_xor_si256(y0, ymmp[0]);
|
||||
y1 = _mm256_xor_si256(y1, ymmp[1]);
|
||||
y2 = _mm256_xor_si256(y2, ymmp[2]);
|
||||
y3 = _mm256_xor_si256(y3, ymmp[3]);
|
||||
}
|
||||
|
||||
t0 = y0;
|
||||
t1 = y1;
|
||||
t2 = y2;
|
||||
t3 = y3;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
z0 = _mm256_add_epi64(y0, y1);
|
||||
z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
y3 = _mm256_xor_si256(y3, z0);
|
||||
z0 = _mm256_add_epi64(y3, y0);
|
||||
z1 = _mm256_srli_epi64(z0, 64-13);
|
||||
y2 = _mm256_xor_si256(y2, z1);
|
||||
z0 = _mm256_slli_epi64(z0, 13);
|
||||
y2 = _mm256_xor_si256(y2, z0);
|
||||
z0 = _mm256_add_epi64(y2, y3);
|
||||
z1 = _mm256_srli_epi64(z0, 64-39);
|
||||
y1 = _mm256_xor_si256(y1, z1);
|
||||
z0 = _mm256_slli_epi64(z0, 39);
|
||||
y1 = _mm256_xor_si256(y1, z0);
|
||||
y1 = _mm256_permute4x64_epi64(y1, _MM_SHUFFLE(0,3,2,1));
|
||||
y2 = _mm256_permute4x64_epi64(y2, _MM_SHUFFLE(1,0,3,2));
|
||||
y3 = _mm256_permute4x64_epi64(y3, _MM_SHUFFLE(2,1,0,3));
|
||||
z0 = _mm256_add_epi64(y1, y2);
|
||||
z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
y0 = _mm256_xor_si256(y0, z0);
|
||||
z0 = _mm256_add_epi64(y0, y3);
|
||||
z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
y1 = _mm256_xor_si256(y1, z0);
|
||||
z0 = _mm256_add_epi64(y1, y0);
|
||||
z1 = _mm256_srli_epi64(z0, 64-13);
|
||||
y2 = _mm256_xor_si256(y2, z1);
|
||||
z0 = _mm256_slli_epi64(z0, 13);
|
||||
y2 = _mm256_xor_si256(y2, z0);
|
||||
z0 = _mm256_add_epi64(y2, y1);
|
||||
z1 = _mm256_srli_epi64(z0, 64-39);
|
||||
y3 = _mm256_xor_si256(y3, z1);
|
||||
z0 = _mm256_slli_epi64(z0, 39);
|
||||
y3 = _mm256_xor_si256(y3, z0);
|
||||
z0 = _mm256_add_epi64(y3, y2);
|
||||
z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
y0 = _mm256_xor_si256(y0, z0);
|
||||
y1 = _mm256_permute4x64_epi64(y1, _MM_SHUFFLE(2,1,0,3));
|
||||
y2 = _mm256_permute4x64_epi64(y2, _MM_SHUFFLE(1,0,3,2));
|
||||
y3 = _mm256_permute4x64_epi64(y3, _MM_SHUFFLE(0,3,2,1));
|
||||
}
|
||||
|
||||
y0 = _mm256_add_epi64(y0, t0);
|
||||
y1 = _mm256_add_epi64(y1, t1);
|
||||
y2 = _mm256_add_epi64(y2, t2);
|
||||
y3 = _mm256_add_epi64(y3, t3);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
ymmp = (ymmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
ymmp[0] = y0;
|
||||
ymmp[1] = y1;
|
||||
ymmp[2] = y2;
|
||||
ymmp[3] = y3;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX2)
    /* uses salsa64_core_tangle_sse2 */

    #undef SCRYPT_MIX
    #define SCRYPT_MIX "Salsa64/8-AVX2"
    #undef SCRYPT_SALSA64_INCLUDED
    #define SCRYPT_SALSA64_INCLUDED
#endif
@@ -1,449 +0,0 @@
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
|
||||
|
||||
#define SCRYPT_SALSA64_SSE2
|
||||
|
||||
asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_sse2)
|
||||
a1(push rbp)
|
||||
a2(mov rbp, rsp)
|
||||
a2(and rsp, ~63)
|
||||
a2(sub rsp, 128)
|
||||
a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
|
||||
a2(shl rcx,7)
|
||||
a2(lea r9,[rcx-128])
|
||||
a2(lea rax,[rsi+r9])
|
||||
a2(lea r9,[rdx+r9])
|
||||
a2(and rdx, rdx)
|
||||
a2(movdqa xmm0,[rax+0])
|
||||
a2(movdqa xmm1,[rax+16])
|
||||
a2(movdqa xmm2,[rax+32])
|
||||
a2(movdqa xmm3,[rax+48])
|
||||
a2(movdqa xmm4,[rax+64])
|
||||
a2(movdqa xmm5,[rax+80])
|
||||
a2(movdqa xmm6,[rax+96])
|
||||
a2(movdqa xmm7,[rax+112])
|
||||
aj(jz scrypt_ChunkMix_sse2_no_xor1)
|
||||
a2(pxor xmm0,[r9+0])
|
||||
a2(pxor xmm1,[r9+16])
|
||||
a2(pxor xmm2,[r9+32])
|
||||
a2(pxor xmm3,[r9+48])
|
||||
a2(pxor xmm4,[r9+64])
|
||||
a2(pxor xmm5,[r9+80])
|
||||
a2(pxor xmm6,[r9+96])
|
||||
a2(pxor xmm7,[r9+112])
|
||||
a1(scrypt_ChunkMix_sse2_no_xor1:)
|
||||
a2(xor r9,r9)
|
||||
a2(xor r8,r8)
|
||||
a1(scrypt_ChunkMix_sse2_loop:)
|
||||
a2(and rdx, rdx)
|
||||
a2(pxor xmm0,[rsi+r9+0])
|
||||
a2(pxor xmm1,[rsi+r9+16])
|
||||
a2(pxor xmm2,[rsi+r9+32])
|
||||
a2(pxor xmm3,[rsi+r9+48])
|
||||
a2(pxor xmm4,[rsi+r9+64])
|
||||
a2(pxor xmm5,[rsi+r9+80])
|
||||
a2(pxor xmm6,[rsi+r9+96])
|
||||
a2(pxor xmm7,[rsi+r9+112])
|
||||
aj(jz scrypt_ChunkMix_sse2_no_xor2)
|
||||
a2(pxor xmm0,[rdx+r9+0])
|
||||
a2(pxor xmm1,[rdx+r9+16])
|
||||
a2(pxor xmm2,[rdx+r9+32])
|
||||
a2(pxor xmm3,[rdx+r9+48])
|
||||
a2(pxor xmm4,[rdx+r9+64])
|
||||
a2(pxor xmm5,[rdx+r9+80])
|
||||
a2(pxor xmm6,[rdx+r9+96])
|
||||
a2(pxor xmm7,[rdx+r9+112])
|
||||
a1(scrypt_ChunkMix_sse2_no_xor2:)
|
||||
a2(movdqa [rsp+0],xmm0)
|
||||
a2(movdqa [rsp+16],xmm1)
|
||||
a2(movdqa [rsp+32],xmm2)
|
||||
a2(movdqa [rsp+48],xmm3)
|
||||
a2(movdqa [rsp+64],xmm4)
|
||||
a2(movdqa [rsp+80],xmm5)
|
||||
a2(movdqa [rsp+96],xmm6)
|
||||
a2(movdqa [rsp+112],xmm7)
|
||||
a2(mov rax,8)
|
||||
a1(scrypt_salsa64_sse2_loop: )
|
||||
a2(movdqa xmm8, xmm0)
|
||||
a2(movdqa xmm9, xmm1)
|
||||
a2(paddq xmm8, xmm2)
|
||||
a2(paddq xmm9, xmm3)
|
||||
a3(pshufd xmm8, xmm8, 0xb1)
|
||||
a3(pshufd xmm9, xmm9, 0xb1)
|
||||
a2(pxor xmm6, xmm8)
|
||||
a2(pxor xmm7, xmm9)
|
||||
a2(movdqa xmm10, xmm0)
|
||||
a2(movdqa xmm11, xmm1)
|
||||
a2(paddq xmm10, xmm6)
|
||||
a2(paddq xmm11, xmm7)
|
||||
a2(movdqa xmm8, xmm10)
|
||||
a2(movdqa xmm9, xmm11)
|
||||
a2(psrlq xmm10, 51)
|
||||
a2(psrlq xmm11, 51)
|
||||
a2(psllq xmm8, 13)
|
||||
a2(psllq xmm9, 13)
|
||||
a2(pxor xmm4, xmm10)
|
||||
a2(pxor xmm5, xmm11)
|
||||
a2(pxor xmm4, xmm8)
|
||||
a2(pxor xmm5, xmm9)
|
||||
a2(movdqa xmm10, xmm6)
|
||||
a2(movdqa xmm11, xmm7)
|
||||
a2(paddq xmm10, xmm4)
|
||||
a2(paddq xmm11, xmm5)
|
||||
a2(movdqa xmm8, xmm10)
|
||||
a2(movdqa xmm9, xmm11)
|
||||
a2(psrlq xmm10, 25)
|
||||
a2(psrlq xmm11, 25)
|
||||
a2(psllq xmm8, 39)
|
||||
a2(psllq xmm9, 39)
|
||||
a2(pxor xmm2, xmm10)
|
||||
a2(pxor xmm3, xmm11)
|
||||
a2(pxor xmm2, xmm8)
|
||||
a2(pxor xmm3, xmm9)
|
||||
a2(movdqa xmm8, xmm4)
|
||||
a2(movdqa xmm9, xmm5)
|
||||
a2(paddq xmm8, xmm2)
|
||||
a2(paddq xmm9, xmm3)
|
||||
a3(pshufd xmm8, xmm8, 0xb1)
|
||||
a3(pshufd xmm9, xmm9, 0xb1)
|
||||
a2(pxor xmm0, xmm8)
|
||||
a2(pxor xmm1, xmm9)
|
||||
a2(movdqa xmm8, xmm2)
|
||||
a2(movdqa xmm9, xmm3)
|
||||
a2(movdqa xmm10, xmm6)
|
||||
a2(movdqa xmm11, xmm7)
|
||||
a2(movdqa xmm2, xmm7)
|
||||
a2(movdqa xmm3, xmm6)
|
||||
a2(punpcklqdq xmm10, xmm6)
|
||||
a2(punpcklqdq xmm11, xmm7)
|
||||
a2(movdqa xmm6, xmm8)
|
||||
a2(movdqa xmm7, xmm9)
|
||||
a2(punpcklqdq xmm9, xmm9)
|
||||
a2(punpcklqdq xmm8, xmm8)
|
||||
a2(punpckhqdq xmm2, xmm10)
|
||||
a2(punpckhqdq xmm3, xmm11)
|
||||
a2(punpckhqdq xmm6, xmm9)
|
||||
a2(punpckhqdq xmm7, xmm8)
|
||||
a2(sub rax, 2)
|
||||
a2(movdqa xmm8, xmm0)
|
||||
a2(movdqa xmm9, xmm1)
|
||||
a2(paddq xmm8, xmm2)
|
||||
a2(paddq xmm9, xmm3)
|
||||
a3(pshufd xmm8, xmm8, 0xb1)
|
||||
a3(pshufd xmm9, xmm9, 0xb1)
|
||||
a2(pxor xmm6, xmm8)
|
||||
a2(pxor xmm7, xmm9)
|
||||
a2(movdqa xmm10, xmm0)
|
||||
a2(movdqa xmm11, xmm1)
|
||||
a2(paddq xmm10, xmm6)
|
||||
a2(paddq xmm11, xmm7)
|
||||
a2(movdqa xmm8, xmm10)
|
||||
a2(movdqa xmm9, xmm11)
|
||||
a2(psrlq xmm10, 51)
|
||||
a2(psrlq xmm11, 51)
|
||||
a2(psllq xmm8, 13)
|
||||
a2(psllq xmm9, 13)
|
||||
a2(pxor xmm5, xmm10)
|
||||
a2(pxor xmm4, xmm11)
|
||||
a2(pxor xmm5, xmm8)
|
||||
a2(pxor xmm4, xmm9)
|
||||
a2(movdqa xmm10, xmm6)
|
||||
a2(movdqa xmm11, xmm7)
|
||||
a2(paddq xmm10, xmm5)
|
||||
a2(paddq xmm11, xmm4)
|
||||
a2(movdqa xmm8, xmm10)
|
||||
a2(movdqa xmm9, xmm11)
|
||||
a2(psrlq xmm10, 25)
|
||||
a2(psrlq xmm11, 25)
|
||||
a2(psllq xmm8, 39)
|
||||
a2(psllq xmm9, 39)
|
||||
a2(pxor xmm2, xmm10)
|
||||
a2(pxor xmm3, xmm11)
|
||||
a2(pxor xmm2, xmm8)
|
||||
a2(pxor xmm3, xmm9)
|
||||
a2(movdqa xmm8, xmm5)
|
||||
a2(movdqa xmm9, xmm4)
|
||||
a2(paddq xmm8, xmm2)
|
||||
a2(paddq xmm9, xmm3)
|
||||
a3(pshufd xmm8, xmm8, 0xb1)
|
||||
a3(pshufd xmm9, xmm9, 0xb1)
|
||||
a2(pxor xmm0, xmm8)
|
||||
a2(pxor xmm1, xmm9)
|
||||
a2(movdqa xmm8, xmm2)
|
||||
a2(movdqa xmm9, xmm3)
|
||||
a2(movdqa xmm10, xmm6)
|
||||
a2(movdqa xmm11, xmm7)
|
||||
a2(movdqa xmm2, xmm7)
|
||||
a2(movdqa xmm3, xmm6)
|
||||
a2(punpcklqdq xmm10, xmm6)
|
||||
a2(punpcklqdq xmm11, xmm7)
|
||||
a2(movdqa xmm6, xmm8)
|
||||
a2(movdqa xmm7, xmm9)
|
||||
a2(punpcklqdq xmm9, xmm9)
|
||||
a2(punpcklqdq xmm8, xmm8)
|
||||
a2(punpckhqdq xmm2, xmm10)
|
||||
a2(punpckhqdq xmm3, xmm11)
|
||||
a2(punpckhqdq xmm6, xmm9)
|
||||
a2(punpckhqdq xmm7, xmm8)
|
||||
aj(ja scrypt_salsa64_sse2_loop)
|
||||
a2(paddq xmm0,[rsp+0])
|
||||
a2(paddq xmm1,[rsp+16])
|
||||
a2(paddq xmm2,[rsp+32])
|
||||
a2(paddq xmm3,[rsp+48])
|
||||
a2(paddq xmm4,[rsp+64])
|
||||
a2(paddq xmm5,[rsp+80])
|
||||
a2(paddq xmm6,[rsp+96])
|
||||
a2(paddq xmm7,[rsp+112])
|
||||
a2(lea rax,[r8+r9])
|
||||
a2(xor r8,rcx)
|
||||
a2(and rax,~0xff)
|
||||
a2(add r9,128)
|
||||
a2(shr rax,1)
|
||||
a2(add rax, rdi)
|
||||
a2(cmp r9,rcx)
|
||||
a2(movdqa [rax+0],xmm0)
|
||||
a2(movdqa [rax+16],xmm1)
|
||||
a2(movdqa [rax+32],xmm2)
|
||||
a2(movdqa [rax+48],xmm3)
|
||||
a2(movdqa [rax+64],xmm4)
|
||||
a2(movdqa [rax+80],xmm5)
|
||||
a2(movdqa [rax+96],xmm6)
|
||||
a2(movdqa [rax+112],xmm7)
|
||||
aj(jne scrypt_ChunkMix_sse2_loop)
|
||||
a2(mov rsp, rbp)
|
||||
a1(pop rbp)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_sse2)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA64_SSE2
|
||||
|
||||
static void asm_calling_convention
|
||||
scrypt_ChunkMix_sse2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
x4 = xmmp[4];
|
||||
x5 = xmmp[5];
|
||||
x6 = xmmp[6];
|
||||
x7 = xmmp[7];
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
t4 = x4;
|
||||
t5 = x5;
|
||||
t6 = x6;
|
||||
t7 = x7;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z2 = _mm_srli_epi64(z0, 64-13);
|
||||
z3 = _mm_srli_epi64(z1, 64-13);
|
||||
z0 = _mm_slli_epi64(z0, 13);
|
||||
z1 = _mm_slli_epi64(z1, 13);
|
||||
x4 = _mm_xor_si128(x4, z2);
|
||||
x5 = _mm_xor_si128(x5, z3);
|
||||
x4 = _mm_xor_si128(x4, z0);
|
||||
x5 = _mm_xor_si128(x5, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x4, x6);
|
||||
z1 = _mm_add_epi64(x5, x7);
|
||||
z2 = _mm_srli_epi64(z0, 64-39);
|
||||
z3 = _mm_srli_epi64(z1, 64-39);
|
||||
z0 = _mm_slli_epi64(z0, 39);
|
||||
z1 = _mm_slli_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z2);
|
||||
x3 = _mm_xor_si128(x3, z3);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x4);
|
||||
z1 = _mm_add_epi64(x3, x5);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x4;
|
||||
z1 = x5;
|
||||
z2 = x2;
|
||||
z3 = x3;
|
||||
x4 = z1;
|
||||
x5 = z0;
|
||||
x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6));
|
||||
x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7));
|
||||
x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3));
|
||||
x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2));
|
||||
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z2 = _mm_srli_epi64(z0, 64-13);
|
||||
z3 = _mm_srli_epi64(z1, 64-13);
|
||||
z0 = _mm_slli_epi64(z0, 13);
|
||||
z1 = _mm_slli_epi64(z1, 13);
|
||||
x4 = _mm_xor_si128(x4, z2);
|
||||
x5 = _mm_xor_si128(x5, z3);
|
||||
x4 = _mm_xor_si128(x4, z0);
|
||||
x5 = _mm_xor_si128(x5, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x4, x6);
|
||||
z1 = _mm_add_epi64(x5, x7);
|
||||
z2 = _mm_srli_epi64(z0, 64-39);
|
||||
z3 = _mm_srli_epi64(z1, 64-39);
|
||||
z0 = _mm_slli_epi64(z0, 39);
|
||||
z1 = _mm_slli_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z2);
|
||||
x3 = _mm_xor_si128(x3, z3);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x4);
|
||||
z1 = _mm_add_epi64(x3, x5);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x4;
|
||||
z1 = x5;
|
||||
z2 = x2;
|
||||
z3 = x3;
|
||||
x4 = z1;
|
||||
x5 = z0;
|
||||
x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6));
|
||||
x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7));
|
||||
x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3));
|
||||
x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2));
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi64(x0, t0);
|
||||
x1 = _mm_add_epi64(x1, t1);
|
||||
x2 = _mm_add_epi64(x2, t2);
|
||||
x3 = _mm_add_epi64(x3, t3);
|
||||
x4 = _mm_add_epi64(x4, t4);
|
||||
x5 = _mm_add_epi64(x5, t5);
|
||||
x6 = _mm_add_epi64(x6, t6);
|
||||
x7 = _mm_add_epi64(x7, t7);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
xmmp[4] = x4;
|
||||
xmmp[5] = x5;
|
||||
xmmp[6] = x6;
|
||||
xmmp[7] = x7;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSE2)
    #undef SCRYPT_MIX
    #define SCRYPT_MIX "Salsa64/8-SSE2"
    #undef SCRYPT_SALSA64_INCLUDED
    #define SCRYPT_SALSA64_INCLUDED
#endif

/* sse3/avx use this as well */
#if defined(SCRYPT_SALSA64_INCLUDED)
    /*
        Default layout:
         0  1  2  3
         4  5  6  7
         8  9 10 11
        12 13 14 15

        SSE2 layout:
         0  5 10 15
        12  1  6 11
         8 13  2  7
         4  9 14  3
    */


    static void asm_calling_convention
    salsa64_core_tangle_sse2(uint64_t *blocks, size_t count) {
        uint64_t t;
        while (count--) {
            t = blocks[1]; blocks[1] = blocks[5]; blocks[5] = t;
            t = blocks[2]; blocks[2] = blocks[10]; blocks[10] = t;
            t = blocks[3]; blocks[3] = blocks[15]; blocks[15] = t;
            t = blocks[4]; blocks[4] = blocks[12]; blocks[12] = t;
            t = blocks[7]; blocks[7] = blocks[11]; blocks[11] = t;
            t = blocks[9]; blocks[9] = blocks[13]; blocks[13] = t;
            blocks += 16;
        }
    }
#endif
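
A note added for clarity on the tangle function above: the six exchanges are disjoint transpositions (indices 0, 6, 8 and 14 are untouched), so salsa64_core_tangle_sse2 is its own inverse; the same call converts a block from the default layout to the SSE2 layout shown in the comment and back again.
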
@@ -1,399 +0,0 @@
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
|
||||
|
||||
#define SCRYPT_SALSA64_SSSE3
|
||||
|
||||
asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_ssse3)
|
||||
a1(push rbp)
|
||||
a2(mov rbp, rsp)
|
||||
a2(and rsp, ~63)
|
||||
a2(sub rsp, 128)
|
||||
a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
|
||||
a2(shl rcx,7)
|
||||
a2(lea r9,[rcx-128])
|
||||
a2(lea rax,[rsi+r9])
|
||||
a2(lea r9,[rdx+r9])
|
||||
a2(and rdx, rdx)
|
||||
a2(movdqa xmm0,[rax+0])
|
||||
a2(movdqa xmm1,[rax+16])
|
||||
a2(movdqa xmm2,[rax+32])
|
||||
a2(movdqa xmm3,[rax+48])
|
||||
a2(movdqa xmm4,[rax+64])
|
||||
a2(movdqa xmm5,[rax+80])
|
||||
a2(movdqa xmm6,[rax+96])
|
||||
a2(movdqa xmm7,[rax+112])
|
||||
aj(jz scrypt_ChunkMix_ssse3_no_xor1)
|
||||
a2(pxor xmm0,[r9+0])
|
||||
a2(pxor xmm1,[r9+16])
|
||||
a2(pxor xmm2,[r9+32])
|
||||
a2(pxor xmm3,[r9+48])
|
||||
a2(pxor xmm4,[r9+64])
|
||||
a2(pxor xmm5,[r9+80])
|
||||
a2(pxor xmm6,[r9+96])
|
||||
a2(pxor xmm7,[r9+112])
|
||||
a1(scrypt_ChunkMix_ssse3_no_xor1:)
|
||||
a2(xor r9,r9)
|
||||
a2(xor r8,r8)
|
||||
a1(scrypt_ChunkMix_ssse3_loop:)
|
||||
a2(and rdx, rdx)
|
||||
a2(pxor xmm0,[rsi+r9+0])
|
||||
a2(pxor xmm1,[rsi+r9+16])
|
||||
a2(pxor xmm2,[rsi+r9+32])
|
||||
a2(pxor xmm3,[rsi+r9+48])
|
||||
a2(pxor xmm4,[rsi+r9+64])
|
||||
a2(pxor xmm5,[rsi+r9+80])
|
||||
a2(pxor xmm6,[rsi+r9+96])
|
||||
a2(pxor xmm7,[rsi+r9+112])
|
||||
aj(jz scrypt_ChunkMix_ssse3_no_xor2)
|
||||
a2(pxor xmm0,[rdx+r9+0])
|
||||
a2(pxor xmm1,[rdx+r9+16])
|
||||
a2(pxor xmm2,[rdx+r9+32])
|
||||
a2(pxor xmm3,[rdx+r9+48])
|
||||
a2(pxor xmm4,[rdx+r9+64])
|
||||
a2(pxor xmm5,[rdx+r9+80])
|
||||
a2(pxor xmm6,[rdx+r9+96])
|
||||
a2(pxor xmm7,[rdx+r9+112])
|
||||
a1(scrypt_ChunkMix_ssse3_no_xor2:)
|
||||
a2(movdqa [rsp+0],xmm0)
|
||||
a2(movdqa [rsp+16],xmm1)
|
||||
a2(movdqa [rsp+32],xmm2)
|
||||
a2(movdqa [rsp+48],xmm3)
|
||||
a2(movdqa [rsp+64],xmm4)
|
||||
a2(movdqa [rsp+80],xmm5)
|
||||
a2(movdqa [rsp+96],xmm6)
|
||||
a2(movdqa [rsp+112],xmm7)
|
||||
a2(mov rax,8)
|
||||
a1(scrypt_salsa64_ssse3_loop: )
|
||||
a2(movdqa xmm8, xmm0)
|
||||
a2(movdqa xmm9, xmm1)
|
||||
a2(paddq xmm8, xmm2)
|
||||
a2(paddq xmm9, xmm3)
|
||||
a3(pshufd xmm8, xmm8, 0xb1)
|
||||
a3(pshufd xmm9, xmm9, 0xb1)
|
||||
a2(pxor xmm6, xmm8)
|
||||
a2(pxor xmm7, xmm9)
|
||||
a2(movdqa xmm10, xmm0)
|
||||
a2(movdqa xmm11, xmm1)
|
||||
a2(paddq xmm10, xmm6)
|
||||
a2(paddq xmm11, xmm7)
|
||||
a2(movdqa xmm8, xmm10)
|
||||
a2(movdqa xmm9, xmm11)
|
||||
a2(psrlq xmm10, 51)
|
||||
a2(psrlq xmm11, 51)
|
||||
a2(psllq xmm8, 13)
|
||||
a2(psllq xmm9, 13)
|
||||
a2(pxor xmm4, xmm10)
|
||||
a2(pxor xmm5, xmm11)
|
||||
a2(pxor xmm4, xmm8)
|
||||
a2(pxor xmm5, xmm9)
|
||||
a2(movdqa xmm10, xmm6)
|
||||
a2(movdqa xmm11, xmm7)
|
||||
a2(paddq xmm10, xmm4)
|
||||
a2(paddq xmm11, xmm5)
|
||||
a2(movdqa xmm8, xmm10)
|
||||
a2(movdqa xmm9, xmm11)
|
||||
a2(psrlq xmm10, 25)
|
||||
a2(psrlq xmm11, 25)
|
||||
a2(psllq xmm8, 39)
|
||||
a2(psllq xmm9, 39)
|
||||
a2(pxor xmm2, xmm10)
|
||||
a2(pxor xmm3, xmm11)
|
||||
a2(pxor xmm2, xmm8)
|
||||
a2(pxor xmm3, xmm9)
|
||||
a2(movdqa xmm8, xmm4)
|
||||
a2(movdqa xmm9, xmm5)
|
||||
a2(paddq xmm8, xmm2)
|
||||
a2(paddq xmm9, xmm3)
|
||||
a3(pshufd xmm8, xmm8, 0xb1)
|
||||
a3(pshufd xmm9, xmm9, 0xb1)
|
||||
a2(pxor xmm0, xmm8)
|
||||
a2(pxor xmm1, xmm9)
|
||||
a2(movdqa xmm10, xmm2)
|
||||
a2(movdqa xmm11, xmm3)
|
||||
a2(movdqa xmm2, xmm6)
|
||||
a2(movdqa xmm3, xmm7)
|
||||
a3(palignr xmm2, xmm7, 8)
|
||||
a3(palignr xmm3, xmm6, 8)
|
||||
a2(movdqa xmm6, xmm11)
|
||||
a2(movdqa xmm7, xmm10)
|
||||
a3(palignr xmm6, xmm10, 8)
|
||||
a3(palignr xmm7, xmm11, 8)
|
||||
a2(sub rax, 2)
|
||||
a2(movdqa xmm8, xmm0)
|
||||
a2(movdqa xmm9, xmm1)
|
||||
a2(paddq xmm8, xmm2)
|
||||
a2(paddq xmm9, xmm3)
|
||||
a3(pshufd xmm8, xmm8, 0xb1)
|
||||
a3(pshufd xmm9, xmm9, 0xb1)
|
||||
a2(pxor xmm6, xmm8)
|
||||
a2(pxor xmm7, xmm9)
|
||||
a2(movdqa xmm10, xmm0)
|
||||
a2(movdqa xmm11, xmm1)
|
||||
a2(paddq xmm10, xmm6)
|
||||
a2(paddq xmm11, xmm7)
|
||||
a2(movdqa xmm8, xmm10)
|
||||
a2(movdqa xmm9, xmm11)
|
||||
a2(psrlq xmm10, 51)
|
||||
a2(psrlq xmm11, 51)
|
||||
a2(psllq xmm8, 13)
|
||||
a2(psllq xmm9, 13)
|
||||
a2(pxor xmm5, xmm10)
|
||||
a2(pxor xmm4, xmm11)
|
||||
a2(pxor xmm5, xmm8)
|
||||
a2(pxor xmm4, xmm9)
|
||||
a2(movdqa xmm10, xmm6)
|
||||
a2(movdqa xmm11, xmm7)
|
||||
a2(paddq xmm10, xmm5)
|
||||
a2(paddq xmm11, xmm4)
|
||||
a2(movdqa xmm8, xmm10)
|
||||
a2(movdqa xmm9, xmm11)
|
||||
a2(psrlq xmm10, 25)
|
||||
a2(psrlq xmm11, 25)
|
||||
a2(psllq xmm8, 39)
|
||||
a2(psllq xmm9, 39)
|
||||
a2(pxor xmm2, xmm10)
|
||||
a2(pxor xmm3, xmm11)
|
||||
a2(pxor xmm2, xmm8)
|
||||
a2(pxor xmm3, xmm9)
|
||||
a2(movdqa xmm8, xmm5)
|
||||
a2(movdqa xmm9, xmm4)
|
||||
a2(paddq xmm8, xmm2)
|
||||
a2(paddq xmm9, xmm3)
|
||||
a3(pshufd xmm8, xmm8, 0xb1)
|
||||
a3(pshufd xmm9, xmm9, 0xb1)
|
||||
a2(pxor xmm0, xmm8)
|
||||
a2(pxor xmm1, xmm9)
|
||||
a2(movdqa xmm10, xmm2)
|
||||
a2(movdqa xmm11, xmm3)
|
||||
a2(movdqa xmm2, xmm6)
|
||||
a2(movdqa xmm3, xmm7)
|
||||
a3(palignr xmm2, xmm7, 8)
|
||||
a3(palignr xmm3, xmm6, 8)
|
||||
a2(movdqa xmm6, xmm11)
|
||||
a2(movdqa xmm7, xmm10)
|
||||
a3(palignr xmm6, xmm10, 8)
|
||||
a3(palignr xmm7, xmm11, 8)
|
||||
aj(ja scrypt_salsa64_ssse3_loop)
|
||||
a2(paddq xmm0,[rsp+0])
|
||||
a2(paddq xmm1,[rsp+16])
|
||||
a2(paddq xmm2,[rsp+32])
|
||||
a2(paddq xmm3,[rsp+48])
|
||||
a2(paddq xmm4,[rsp+64])
|
||||
a2(paddq xmm5,[rsp+80])
|
||||
a2(paddq xmm6,[rsp+96])
|
||||
a2(paddq xmm7,[rsp+112])
|
||||
a2(lea rax,[r8+r9])
|
||||
a2(xor r8,rcx)
|
||||
a2(and rax,~0xff)
|
||||
a2(add r9,128)
|
||||
a2(shr rax,1)
|
||||
a2(add rax, rdi)
|
||||
a2(cmp r9,rcx)
|
||||
a2(movdqa [rax+0],xmm0)
|
||||
a2(movdqa [rax+16],xmm1)
|
||||
a2(movdqa [rax+32],xmm2)
|
||||
a2(movdqa [rax+48],xmm3)
|
||||
a2(movdqa [rax+64],xmm4)
|
||||
a2(movdqa [rax+80],xmm5)
|
||||
a2(movdqa [rax+96],xmm6)
|
||||
a2(movdqa [rax+112],xmm7)
|
||||
aj(jne scrypt_ChunkMix_ssse3_loop)
|
||||
a2(mov rsp, rbp)
|
||||
a1(pop rbp)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_ssse3)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA64_SSSE3
|
||||
|
||||
static void asm_calling_convention
|
||||
scrypt_ChunkMix_ssse3(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
x4 = xmmp[4];
|
||||
x5 = xmmp[5];
|
||||
x6 = xmmp[6];
|
||||
x7 = xmmp[7];
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
t4 = x4;
|
||||
t5 = x5;
|
||||
t6 = x6;
|
||||
t7 = x7;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z2 = _mm_srli_epi64(z0, 64-13);
|
||||
z3 = _mm_srli_epi64(z1, 64-13);
|
||||
z0 = _mm_slli_epi64(z0, 13);
|
||||
z1 = _mm_slli_epi64(z1, 13);
|
||||
x4 = _mm_xor_si128(x4, z2);
|
||||
x5 = _mm_xor_si128(x5, z3);
|
||||
x4 = _mm_xor_si128(x4, z0);
|
||||
x5 = _mm_xor_si128(x5, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x4, x6);
|
||||
z1 = _mm_add_epi64(x5, x7);
|
||||
z2 = _mm_srli_epi64(z0, 64-39);
|
||||
z3 = _mm_srli_epi64(z1, 64-39);
|
||||
z0 = _mm_slli_epi64(z0, 39);
|
||||
z1 = _mm_slli_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z2);
|
||||
x3 = _mm_xor_si128(x3, z3);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x4);
|
||||
z1 = _mm_add_epi64(x3, x5);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x2;
|
||||
z1 = x3;
|
||||
x2 = _mm_alignr_epi8(x6, x7, 8);
|
||||
x3 = _mm_alignr_epi8(x7, x6, 8);
|
||||
x6 = _mm_alignr_epi8(z1, z0, 8);
|
||||
x7 = _mm_alignr_epi8(z0, z1, 8);
|
||||
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z2 = _mm_srli_epi64(z0, 64-13);
|
||||
z3 = _mm_srli_epi64(z1, 64-13);
|
||||
z0 = _mm_slli_epi64(z0, 13);
|
||||
z1 = _mm_slli_epi64(z1, 13);
|
||||
x5 = _mm_xor_si128(x5, z2);
|
||||
x4 = _mm_xor_si128(x4, z3);
|
||||
x5 = _mm_xor_si128(x5, z0);
|
||||
x4 = _mm_xor_si128(x4, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x5, x6);
|
||||
z1 = _mm_add_epi64(x4, x7);
|
||||
z2 = _mm_srli_epi64(z0, 64-39);
|
||||
z3 = _mm_srli_epi64(z1, 64-39);
|
||||
z0 = _mm_slli_epi64(z0, 39);
|
||||
z1 = _mm_slli_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z2);
|
||||
x3 = _mm_xor_si128(x3, z3);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x5);
|
||||
z1 = _mm_add_epi64(x3, x4);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x2;
|
||||
z1 = x3;
|
||||
x2 = _mm_alignr_epi8(x6, x7, 8);
|
||||
x3 = _mm_alignr_epi8(x7, x6, 8);
|
||||
x6 = _mm_alignr_epi8(z1, z0, 8);
|
||||
x7 = _mm_alignr_epi8(z0, z1, 8);
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi64(x0, t0);
|
||||
x1 = _mm_add_epi64(x1, t1);
|
||||
x2 = _mm_add_epi64(x2, t2);
|
||||
x3 = _mm_add_epi64(x3, t3);
|
||||
x4 = _mm_add_epi64(x4, t4);
|
||||
x5 = _mm_add_epi64(x5, t5);
|
||||
x6 = _mm_add_epi64(x6, t6);
|
||||
x7 = _mm_add_epi64(x7, t7);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
xmmp[4] = x4;
|
||||
xmmp[5] = x5;
|
||||
xmmp[6] = x6;
|
||||
xmmp[7] = x7;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSSE3)
    /* uses salsa64_core_tangle_sse2 */

    #undef SCRYPT_MIX
    #define SCRYPT_MIX "Salsa64/8-SSSE3"
    #undef SCRYPT_SALSA64_INCLUDED
    #define SCRYPT_SALSA64_INCLUDED
#endif
@@ -1,335 +0,0 @@
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
|
||||
|
||||
#define SCRYPT_SALSA64_XOP
|
||||
|
||||
asm_naked_fn_proto(void, scrypt_ChunkMix_xop)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_xop)
|
||||
a1(push rbp)
|
||||
a2(mov rbp, rsp)
|
||||
a2(and rsp, ~63)
|
||||
a2(sub rsp, 128)
|
||||
a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
|
||||
a2(shl rcx,7)
|
||||
a2(lea r9,[rcx-128])
|
||||
a2(lea rax,[rsi+r9])
|
||||
a2(lea r9,[rdx+r9])
|
||||
a2(and rdx, rdx)
|
||||
a2(vmovdqa xmm0,[rax+0])
|
||||
a2(vmovdqa xmm1,[rax+16])
|
||||
a2(vmovdqa xmm2,[rax+32])
|
||||
a2(vmovdqa xmm3,[rax+48])
|
||||
a2(vmovdqa xmm4,[rax+64])
|
||||
a2(vmovdqa xmm5,[rax+80])
|
||||
a2(vmovdqa xmm6,[rax+96])
|
||||
a2(vmovdqa xmm7,[rax+112])
|
||||
aj(jz scrypt_ChunkMix_xop_no_xor1)
|
||||
a3(vpxor xmm0,xmm0,[r9+0])
|
||||
a3(vpxor xmm1,xmm1,[r9+16])
|
||||
a3(vpxor xmm2,xmm2,[r9+32])
|
||||
a3(vpxor xmm3,xmm3,[r9+48])
|
||||
a3(vpxor xmm4,xmm4,[r9+64])
|
||||
a3(vpxor xmm5,xmm5,[r9+80])
|
||||
a3(vpxor xmm6,xmm6,[r9+96])
|
||||
a3(vpxor xmm7,xmm7,[r9+112])
|
||||
a1(scrypt_ChunkMix_xop_no_xor1:)
|
||||
a2(xor r9,r9)
|
||||
a2(xor r8,r8)
|
||||
a1(scrypt_ChunkMix_xop_loop:)
|
||||
a2(and rdx, rdx)
|
||||
a3(vpxor xmm0,xmm0,[rsi+r9+0])
|
||||
a3(vpxor xmm1,xmm1,[rsi+r9+16])
|
||||
a3(vpxor xmm2,xmm2,[rsi+r9+32])
|
||||
a3(vpxor xmm3,xmm3,[rsi+r9+48])
|
||||
a3(vpxor xmm4,xmm4,[rsi+r9+64])
|
||||
a3(vpxor xmm5,xmm5,[rsi+r9+80])
|
||||
a3(vpxor xmm6,xmm6,[rsi+r9+96])
|
||||
a3(vpxor xmm7,xmm7,[rsi+r9+112])
|
||||
aj(jz scrypt_ChunkMix_xop_no_xor2)
|
||||
a3(vpxor xmm0,xmm0,[rdx+r9+0])
|
||||
a3(vpxor xmm1,xmm1,[rdx+r9+16])
|
||||
a3(vpxor xmm2,xmm2,[rdx+r9+32])
|
||||
a3(vpxor xmm3,xmm3,[rdx+r9+48])
|
||||
a3(vpxor xmm4,xmm4,[rdx+r9+64])
|
||||
a3(vpxor xmm5,xmm5,[rdx+r9+80])
|
||||
a3(vpxor xmm6,xmm6,[rdx+r9+96])
|
||||
a3(vpxor xmm7,xmm7,[rdx+r9+112])
|
||||
a1(scrypt_ChunkMix_xop_no_xor2:)
|
||||
a2(vmovdqa [rsp+0],xmm0)
|
||||
a2(vmovdqa [rsp+16],xmm1)
|
||||
a2(vmovdqa [rsp+32],xmm2)
|
||||
a2(vmovdqa [rsp+48],xmm3)
|
||||
a2(vmovdqa [rsp+64],xmm4)
|
||||
a2(vmovdqa [rsp+80],xmm5)
|
||||
a2(vmovdqa [rsp+96],xmm6)
|
||||
a2(vmovdqa [rsp+112],xmm7)
|
||||
a2(mov rax,8)
|
||||
a1(scrypt_salsa64_xop_loop: )
|
||||
a3(vpaddq xmm8, xmm0, xmm2)
|
||||
a3(vpaddq xmm9, xmm1, xmm3)
|
||||
a3(vpshufd xmm8, xmm8, 0xb1)
|
||||
a3(vpshufd xmm9, xmm9, 0xb1)
|
||||
a3(vpxor xmm6, xmm6, xmm8)
|
||||
a3(vpxor xmm7, xmm7, xmm9)
|
||||
a3(vpaddq xmm10, xmm0, xmm6)
|
||||
a3(vpaddq xmm11, xmm1, xmm7)
|
||||
a3(vprotq xmm10, xmm10, 13)
|
||||
a3(vprotq xmm11, xmm11, 13)
|
||||
a3(vpxor xmm4, xmm4, xmm10)
|
||||
a3(vpxor xmm5, xmm5, xmm11)
|
||||
a3(vpaddq xmm8, xmm6, xmm4)
|
||||
a3(vpaddq xmm9, xmm7, xmm5)
|
||||
a3(vprotq xmm8, xmm8, 39)
|
||||
a3(vprotq xmm9, xmm9, 39)
|
||||
a3(vpxor xmm2, xmm2, xmm8)
|
||||
a3(vpxor xmm3, xmm3, xmm9)
|
||||
a3(vpaddq xmm10, xmm4, xmm2)
|
||||
a3(vpaddq xmm11, xmm5, xmm3)
|
||||
a3(vpshufd xmm10, xmm10, 0xb1)
|
||||
a3(vpshufd xmm11, xmm11, 0xb1)
|
||||
a3(vpxor xmm0, xmm0, xmm10)
|
||||
a3(vpxor xmm1, xmm1, xmm11)
|
||||
a2(vmovdqa xmm8, xmm2)
|
||||
a2(vmovdqa xmm9, xmm3)
|
||||
a4(vpalignr xmm2, xmm6, xmm7, 8)
|
||||
a4(vpalignr xmm3, xmm7, xmm6, 8)
|
||||
a4(vpalignr xmm6, xmm9, xmm8, 8)
|
||||
a4(vpalignr xmm7, xmm8, xmm9, 8)
|
||||
a3(vpaddq xmm10, xmm0, xmm2)
|
||||
a3(vpaddq xmm11, xmm1, xmm3)
|
||||
a3(vpshufd xmm10, xmm10, 0xb1)
|
||||
a3(vpshufd xmm11, xmm11, 0xb1)
|
||||
a3(vpxor xmm6, xmm6, xmm10)
|
||||
a3(vpxor xmm7, xmm7, xmm11)
|
||||
a3(vpaddq xmm8, xmm0, xmm6)
|
||||
a3(vpaddq xmm9, xmm1, xmm7)
|
||||
a3(vprotq xmm8, xmm8, 13)
|
||||
a3(vprotq xmm9, xmm9, 13)
|
||||
a3(vpxor xmm5, xmm5, xmm8)
|
||||
a3(vpxor xmm4, xmm4, xmm9)
|
||||
a3(vpaddq xmm10, xmm6, xmm5)
|
||||
a3(vpaddq xmm11, xmm7, xmm4)
|
||||
a3(vprotq xmm10, xmm10, 39)
|
||||
a3(vprotq xmm11, xmm11, 39)
|
||||
a3(vpxor xmm2, xmm2, xmm10)
|
||||
a3(vpxor xmm3, xmm3, xmm11)
|
||||
a3(vpaddq xmm8, xmm5, xmm2)
|
||||
a3(vpaddq xmm9, xmm4, xmm3)
|
||||
a3(vpshufd xmm8, xmm8, 0xb1)
|
||||
a3(vpshufd xmm9, xmm9, 0xb1)
|
||||
a3(vpxor xmm0, xmm0, xmm8)
|
||||
a3(vpxor xmm1, xmm1, xmm9)
|
||||
a2(vmovdqa xmm10, xmm2)
|
||||
a2(vmovdqa xmm11, xmm3)
|
||||
a4(vpalignr xmm2, xmm6, xmm7, 8)
|
||||
a4(vpalignr xmm3, xmm7, xmm6, 8)
|
||||
a4(vpalignr xmm6, xmm11, xmm10, 8)
|
||||
a4(vpalignr xmm7, xmm10, xmm11, 8)
|
||||
a2(sub rax, 2)
|
||||
aj(ja scrypt_salsa64_xop_loop)
|
||||
a3(vpaddq xmm0,xmm0,[rsp+0])
|
||||
a3(vpaddq xmm1,xmm1,[rsp+16])
|
||||
a3(vpaddq xmm2,xmm2,[rsp+32])
|
||||
a3(vpaddq xmm3,xmm3,[rsp+48])
|
||||
a3(vpaddq xmm4,xmm4,[rsp+64])
|
||||
a3(vpaddq xmm5,xmm5,[rsp+80])
|
||||
a3(vpaddq xmm6,xmm6,[rsp+96])
|
||||
a3(vpaddq xmm7,xmm7,[rsp+112])
|
||||
a2(lea rax,[r8+r9])
|
||||
a2(xor r8,rcx)
|
||||
a2(and rax,~0xff)
|
||||
a2(add r9,128)
|
||||
a2(shr rax,1)
|
||||
a2(add rax, rdi)
|
||||
a2(cmp r9,rcx)
|
||||
a2(vmovdqa [rax+0],xmm0)
|
||||
a2(vmovdqa [rax+16],xmm1)
|
||||
a2(vmovdqa [rax+32],xmm2)
|
||||
a2(vmovdqa [rax+48],xmm3)
|
||||
a2(vmovdqa [rax+64],xmm4)
|
||||
a2(vmovdqa [rax+80],xmm5)
|
||||
a2(vmovdqa [rax+96],xmm6)
|
||||
a2(vmovdqa [rax+112],xmm7)
|
||||
aj(jne scrypt_ChunkMix_xop_loop)
|
||||
a2(mov rsp, rbp)
|
||||
a1(pop rbp)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_xop)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA64_XOP
|
||||
|
||||
static void asm_calling_convention
|
||||
scrypt_ChunkMix_xop(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
x4 = xmmp[4];
|
||||
x5 = xmmp[5];
|
||||
x6 = xmmp[6];
|
||||
x7 = xmmp[7];
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
t4 = x4;
|
||||
t5 = x5;
|
||||
t6 = x6;
|
||||
t7 = x7;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z0 = _mm_roti_epi64(z0, 13);
|
||||
z1 = _mm_roti_epi64(z1, 13);
|
||||
x4 = _mm_xor_si128(x4, z0);
|
||||
x5 = _mm_xor_si128(x5, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x4, x6);
|
||||
z1 = _mm_add_epi64(x5, x7);
|
||||
z0 = _mm_roti_epi64(z0, 39);
|
||||
z1 = _mm_roti_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x4);
|
||||
z1 = _mm_add_epi64(x3, x5);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x2;
|
||||
z1 = x3;
|
||||
x2 = _mm_alignr_epi8(x6, x7, 8);
|
||||
x3 = _mm_alignr_epi8(x7, x6, 8);
|
||||
x6 = _mm_alignr_epi8(z1, z0, 8);
|
||||
x7 = _mm_alignr_epi8(z0, z1, 8);
|
||||
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z0 = _mm_roti_epi64(z0, 13);
|
||||
z1 = _mm_roti_epi64(z1, 13);
|
||||
x5 = _mm_xor_si128(x5, z0);
|
||||
x4 = _mm_xor_si128(x4, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x5, x6);
|
||||
z1 = _mm_add_epi64(x4, x7);
|
||||
z0 = _mm_roti_epi64(z0, 39);
|
||||
z1 = _mm_roti_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x5);
|
||||
z1 = _mm_add_epi64(x3, x4);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x2;
|
||||
z1 = x3;
|
||||
x2 = _mm_alignr_epi8(x6, x7, 8);
|
||||
x3 = _mm_alignr_epi8(x7, x6, 8);
|
||||
x6 = _mm_alignr_epi8(z1, z0, 8);
|
||||
x7 = _mm_alignr_epi8(z0, z1, 8);
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi64(x0, t0);
|
||||
x1 = _mm_add_epi64(x1, t1);
|
||||
x2 = _mm_add_epi64(x2, t2);
|
||||
x3 = _mm_add_epi64(x3, t3);
|
||||
x4 = _mm_add_epi64(x4, t4);
|
||||
x5 = _mm_add_epi64(x5, t5);
|
||||
x6 = _mm_add_epi64(x6, t6);
|
||||
x7 = _mm_add_epi64(x7, t7);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
xmmp[4] = x4;
|
||||
xmmp[5] = x5;
|
||||
xmmp[6] = x6;
|
||||
xmmp[7] = x7;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_XOP)
|
||||
/* uses salsa64_core_tangle_sse2 */
|
||||
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "Salsa64/8-XOP"
|
||||
#undef SCRYPT_SALSA64_INCLUDED
|
||||
#define SCRYPT_SALSA64_INCLUDED
|
||||
#endif
|
||||
@@ -1,41 +0,0 @@
#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)

#undef SCRYPT_MIX
#define SCRYPT_MIX "Salsa64/8 Ref"

#undef SCRYPT_SALSA64_INCLUDED
#define SCRYPT_SALSA64_INCLUDED
#define SCRYPT_SALSA64_BASIC

static void
salsa64_core_basic(uint64_t state[16]) {
	const size_t rounds = 8;
	uint64_t v[16], t;
	size_t i;

	for (i = 0; i < 16; i++) v[i] = state[i];

	#define G(a,b,c,d) \
		t = v[a]+v[d]; t = ROTL64(t, 32); v[b] ^= t; \
		t = v[b]+v[a]; t = ROTL64(t, 13); v[c] ^= t; \
		t = v[c]+v[b]; t = ROTL64(t, 39); v[d] ^= t; \
		t = v[d]+v[c]; t = ROTL64(t, 32); v[a] ^= t; \

	for (i = 0; i < rounds; i += 2) {
		G( 0, 4, 8,12);
		G( 5, 9,13, 1);
		G(10,14, 2, 6);
		G(15, 3, 7,11);
		G( 0, 1, 2, 3);
		G( 5, 6, 7, 4);
		G(10,11, 8, 9);
		G(15,12,13,14);
	}

	for (i = 0; i < 16; i++) state[i] += v[i];

	#undef G
}

#endif

@@ -1,112 +0,0 @@
typedef struct scrypt_hmac_state_t {
	scrypt_hash_state inner, outer;
} scrypt_hmac_state;


static void
scrypt_hash(scrypt_hash_digest hash, const uint8_t *m, size_t mlen) {
	scrypt_hash_state st;
	scrypt_hash_init(&st);
	scrypt_hash_update(&st, m, mlen);
	scrypt_hash_finish(&st, hash);
}

/* hmac */
static void
scrypt_hmac_init(scrypt_hmac_state *st, const uint8_t *key, size_t keylen) {
	uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0};
	size_t i;

	scrypt_hash_init(&st->inner);
	scrypt_hash_init(&st->outer);

	if (keylen <= SCRYPT_HASH_BLOCK_SIZE) {
		/* use the key directly if it's <= blocksize bytes */
		memcpy(pad, key, keylen);
	} else {
		/* if it's > blocksize bytes, hash it */
		scrypt_hash(pad, key, keylen);
	}

	/* inner = (key ^ 0x36) */
	/* h(inner || ...) */
	for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++)
		pad[i] ^= 0x36;
	scrypt_hash_update(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE);

	/* outer = (key ^ 0x5c) */
	/* h(outer || ...) */
	for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++)
		pad[i] ^= (0x5c ^ 0x36);
	scrypt_hash_update(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE);

	scrypt_ensure_zero(pad, sizeof(pad));
}

static void
scrypt_hmac_update(scrypt_hmac_state *st, const uint8_t *m, size_t mlen) {
	/* h(inner || m...) */
	scrypt_hash_update(&st->inner, m, mlen);
}

static void
scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac) {
	/* h(inner || m) */
	scrypt_hash_digest innerhash;
	scrypt_hash_finish(&st->inner, innerhash);

	/* h(outer || h(inner || m)) */
	scrypt_hash_update(&st->outer, innerhash, sizeof(innerhash));
	scrypt_hash_finish(&st->outer, mac);

	scrypt_ensure_zero(st, sizeof(*st));
}

static void
scrypt_pbkdf2(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint64_t N, uint8_t *out, size_t bytes) {
	scrypt_hmac_state hmac_pw, hmac_pw_salt, work;
	scrypt_hash_digest ti, u;
	uint8_t be[4];
	uint32_t i, j, blocks;
	uint64_t c;

	/* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */

	/* hmac(password, ...) */
	scrypt_hmac_init(&hmac_pw, password, password_len);

	/* hmac(password, salt...) */
	hmac_pw_salt = hmac_pw;
	scrypt_hmac_update(&hmac_pw_salt, salt, salt_len);

	blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE;
	for (i = 1; i <= blocks; i++) {
		/* U1 = hmac(password, salt || be(i)) */
		U32TO8_BE(be, i);
		work = hmac_pw_salt;
		scrypt_hmac_update(&work, be, 4);
		scrypt_hmac_finish(&work, ti);
		memcpy(u, ti, sizeof(u));

		/* T[i] = U1 ^ U2 ^ U3... */
		for (c = 0; c < N - 1; c++) {
			/* UX = hmac(password, U{X-1}) */
			work = hmac_pw;
			scrypt_hmac_update(&work, u, SCRYPT_HASH_DIGEST_SIZE);
			scrypt_hmac_finish(&work, u);

			/* T[i] ^= UX */
			for (j = 0; j < sizeof(u); j++)
				ti[j] ^= u[j];
		}

		memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : bytes);
		out += SCRYPT_HASH_DIGEST_SIZE;
		bytes -= SCRYPT_HASH_DIGEST_SIZE;
	}

	scrypt_ensure_zero(ti, sizeof(ti));
	scrypt_ensure_zero(u, sizeof(u));
	scrypt_ensure_zero(&hmac_pw, sizeof(hmac_pw));
	scrypt_ensure_zero(&hmac_pw_salt, sizeof(hmac_pw_salt));
}
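/* Context note (added, not part of the original file): scrypt invokes this
   PBKDF2 with a single iteration, so with N = 1 the inner "T[i] ^= UX" loop
   above never runs and each output block is simply HMAC(password, salt || BE32(i)).
   A typical pair of calls, with hypothetical buffer names B, chunk_bytes, p,
   out and out_bytes, would look like:

       scrypt_pbkdf2(password, password_len, salt, salt_len, 1, (uint8_t *)B, chunk_bytes * p);
       ... ROMix each chunk of B ...
       scrypt_pbkdf2(password, password_len, (uint8_t *)B, chunk_bytes * p, 1, out, out_bytes);
*/
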
@@ -1,463 +0,0 @@
|
||||
#if defined(CPU_X86) && (defined(COMPILER_MSVC) || defined(COMPILER_GCC))
|
||||
#define X86ASM
|
||||
|
||||
/* gcc 2.95 royally screws up stack alignments on variables */
|
||||
#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS6PP)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 30000)))
|
||||
#define X86ASM_SSE
|
||||
#define X86ASM_SSE2
|
||||
#endif
|
||||
#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2005)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40102)))
|
||||
#define X86ASM_SSSE3
|
||||
#endif
|
||||
#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2010SP1)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40400)))
|
||||
#define X86ASM_AVX
|
||||
#define X86ASM_XOP
|
||||
#endif
|
||||
#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2012)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40700)))
|
||||
#define X86ASM_AVX2
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(CPU_X86_64) && defined(COMPILER_GCC)
|
||||
#define X86_64ASM
|
||||
#define X86_64ASM_SSE2
|
||||
#if (COMPILER_GCC >= 40102)
|
||||
#define X86_64ASM_SSSE3
|
||||
#endif
|
||||
#if (COMPILER_GCC >= 40400)
|
||||
#define X86_64ASM_AVX
|
||||
#define X86_64ASM_XOP
|
||||
#endif
|
||||
#if (COMPILER_GCC >= 40700)
|
||||
#define X86_64ASM_AVX2
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(COMPILER_MSVC) && (defined(CPU_X86_FORCE_INTRINSICS) || defined(CPU_X86_64))
|
||||
#define X86_INTRINSIC
|
||||
#if defined(CPU_X86_64) || defined(X86ASM_SSE)
|
||||
#define X86_INTRINSIC_SSE
|
||||
#endif
|
||||
#if defined(CPU_X86_64) || defined(X86ASM_SSE2)
|
||||
#define X86_INTRINSIC_SSE2
|
||||
#endif
|
||||
#if (COMPILER_MSVC >= COMPILER_MSVC_VS2005)
|
||||
#define X86_INTRINSIC_SSSE3
|
||||
#endif
|
||||
#if (COMPILER_MSVC >= COMPILER_MSVC_VS2010SP1)
|
||||
#define X86_INTRINSIC_AVX
|
||||
#define X86_INTRINSIC_XOP
|
||||
#endif
|
||||
#if (COMPILER_MSVC >= COMPILER_MSVC_VS2012)
|
||||
#define X86_INTRINSIC_AVX2
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(COMPILER_GCC) && defined(CPU_X86_FORCE_INTRINSICS)
|
||||
#define X86_INTRINSIC
|
||||
#if defined(__SSE__)
|
||||
#define X86_INTRINSIC_SSE
|
||||
#endif
|
||||
#if defined(__SSE2__)
|
||||
#define X86_INTRINSIC_SSE2
|
||||
#endif
|
||||
#if defined(__SSSE3__)
|
||||
#define X86_INTRINSIC_SSSE3
|
||||
#endif
|
||||
#if defined(__AVX__)
|
||||
#define X86_INTRINSIC_AVX
|
||||
#endif
|
||||
#if defined(__XOP__)
|
||||
#define X86_INTRINSIC_XOP
|
||||
#endif
|
||||
#if defined(__AVX2__)
|
||||
#define X86_INTRINSIC_AVX2
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* only use simd on windows (or SSE2 on gcc)! */
|
||||
#if defined(CPU_X86_FORCE_INTRINSICS) || defined(X86_INTRINSIC)
|
||||
#if defined(X86_INTRINSIC_SSE)
|
||||
#include <mmintrin.h>
|
||||
#include <xmmintrin.h>
|
||||
typedef __m64 qmm;
|
||||
typedef __m128 xmm;
|
||||
typedef __m128d xmmd;
|
||||
#endif
|
||||
#if defined(X86_INTRINSIC_SSE2)
|
||||
#include <emmintrin.h>
|
||||
typedef __m128i xmmi;
|
||||
#endif
|
||||
#if defined(X86_INTRINSIC_SSSE3)
|
||||
#include <tmmintrin.h>
|
||||
#endif
|
||||
#if defined(X86_INTRINSIC_AVX)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
#if defined(X86_INTRINSIC_XOP)
|
||||
#if defined(COMPILER_MSVC)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
#endif
|
||||
#if defined(X86_INTRINSIC_AVX2)
|
||||
typedef __m256i ymmi;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(X86_INTRINSIC_SSE2)
|
||||
typedef union packedelem8_t {
|
||||
uint8_t u[16];
|
||||
xmmi v;
|
||||
} packedelem8;
|
||||
|
||||
typedef union packedelem32_t {
|
||||
uint32_t u[4];
|
||||
xmmi v;
|
||||
} packedelem32;
|
||||
|
||||
typedef union packedelem64_t {
|
||||
uint64_t u[2];
|
||||
xmmi v;
|
||||
} packedelem64;
|
||||
#else
|
||||
typedef union packedelem8_t {
|
||||
uint8_t u[16];
|
||||
uint32_t dw[4];
|
||||
} packedelem8;
|
||||
|
||||
typedef union packedelem32_t {
|
||||
uint32_t u[4];
|
||||
uint8_t b[16];
|
||||
} packedelem32;
|
||||
|
||||
typedef union packedelem64_t {
|
||||
uint64_t u[2];
|
||||
uint8_t b[16];
|
||||
} packedelem64;
|
||||
#endif
|
||||
|
||||
#if defined(X86_INTRINSIC_SSSE3)
|
||||
static const packedelem8 ALIGN(16) ssse3_rotl16_32bit = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}};
|
||||
static const packedelem8 ALIGN(16) ssse3_rotl8_32bit = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}};
|
||||
#endif
|
||||
|
||||
/*
|
||||
x86 inline asm for gcc/msvc. usage:
|
||||
|
||||
asm_naked_fn_proto(return_type, name) (type parm1, type parm2..)
|
||||
asm_naked_fn(name)
|
||||
a1(..)
|
||||
a2(.., ..)
|
||||
a3(.., .., ..)
|
||||
	64bit OR 0 parameters: a1(ret)
	32bit AND n parameters: aret(4n), e.g. aret(16) for 4 parameters
|
||||
asm_naked_fn_end(name)
|
||||
*/
|
||||
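/* Added illustrative sketch (not part of the original header): a hypothetical
   zero-argument function written with the macros described above, kept inside
   a comment so it does not affect the build.

   asm_naked_fn_proto(int, example_zero)(void)
   asm_naked_fn(example_zero)
   a2(xor eax, eax)
   a1(ret)
   asm_naked_fn_end(example_zero)
*/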
|
||||
#if defined(X86ASM) || defined(X86_64ASM)
|
||||
|
||||
#if defined(COMPILER_MSVC)
|
||||
#pragma warning(disable : 4731) /* frame pointer modified by inline assembly */
|
||||
#define a1(x) __asm {x}
|
||||
#define a2(x, y) __asm {x, y}
|
||||
#define a3(x, y, z) __asm {x, y, z}
|
||||
#define a4(x, y, z, w) __asm {x, y, z, w}
|
||||
#define aj(x) __asm {x}
|
||||
#define asm_align8 a1(ALIGN 8)
|
||||
#define asm_align16 a1(ALIGN 16)
|
||||
|
||||
#define asm_calling_convention STDCALL
|
||||
#define aret(n) a1(ret n)
|
||||
#define asm_naked_fn_proto(type, fn) static NAKED type asm_calling_convention fn
|
||||
#define asm_naked_fn(fn) {
|
||||
#define asm_naked_fn_end(fn) }
|
||||
#elif defined(COMPILER_GCC)
|
||||
#define GNU_AS1(x) #x ";\n"
|
||||
#define GNU_AS2(x, y) #x ", " #y ";\n"
|
||||
#define GNU_AS3(x, y, z) #x ", " #y ", " #z ";\n"
|
||||
#define GNU_AS4(x, y, z, w) #x ", " #y ", " #z ", " #w ";\n"
|
||||
#define GNU_ASFN(x) "\n_" #x ":\n" #x ":\n"
|
||||
#define GNU_ASJ(x) ".att_syntax prefix\n" #x "\n.intel_syntax noprefix\n"
|
||||
|
||||
#define a1(x) GNU_AS1(x)
|
||||
#define a2(x, y) GNU_AS2(x, y)
|
||||
#define a3(x, y, z) GNU_AS3(x, y, z)
|
||||
#define a4(x, y, z, w) GNU_AS4(x, y, z, w)
|
||||
#define aj(x) GNU_ASJ(x)
|
||||
#define asm_align8 ".p2align 3,,7"
|
||||
#define asm_align16 ".p2align 4,,15"
|
||||
|
||||
#if defined(OS_WINDOWS)
|
||||
#define asm_calling_convention CDECL
|
||||
#define aret(n) a1(ret)
|
||||
|
||||
#if defined(X86_64ASM)
|
||||
#define asm_naked_fn(fn) ; __asm__ ( \
|
||||
".text\n" \
|
||||
asm_align16 GNU_ASFN(fn) \
|
||||
"subq $136, %rsp;" \
|
||||
"movdqa %xmm6, 0(%rsp);" \
|
||||
"movdqa %xmm7, 16(%rsp);" \
|
||||
"movdqa %xmm8, 32(%rsp);" \
|
||||
"movdqa %xmm9, 48(%rsp);" \
|
||||
"movdqa %xmm10, 64(%rsp);" \
|
||||
"movdqa %xmm11, 80(%rsp);" \
|
||||
"movdqa %xmm12, 96(%rsp);" \
|
||||
"movq %rdi, 112(%rsp);" \
|
||||
"movq %rsi, 120(%rsp);" \
|
||||
"movq %rcx, %rdi;" \
|
||||
"movq %rdx, %rsi;" \
|
||||
"movq %r8, %rdx;" \
|
||||
"movq %r9, %rcx;" \
|
||||
"call 1f;" \
|
||||
"movdqa 0(%rsp), %xmm6;" \
|
||||
"movdqa 16(%rsp), %xmm7;" \
|
||||
"movdqa 32(%rsp), %xmm8;" \
|
||||
"movdqa 48(%rsp), %xmm9;" \
|
||||
"movdqa 64(%rsp), %xmm10;" \
|
||||
"movdqa 80(%rsp), %xmm11;" \
|
||||
"movdqa 96(%rsp), %xmm12;" \
|
||||
"movq 112(%rsp), %rdi;" \
|
||||
"movq 120(%rsp), %rsi;" \
|
||||
"addq $136, %rsp;" \
|
||||
"ret;" \
|
||||
".intel_syntax noprefix;" \
|
||||
".p2align 4,,15;" \
|
||||
"1:;"
|
||||
#else
|
||||
#define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn)
|
||||
#endif
|
||||
#else
|
||||
#define asm_calling_convention STDCALL
|
||||
#define aret(n) a1(ret n)
|
||||
#define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn)
|
||||
#endif
|
||||
|
||||
#define asm_naked_fn_proto(type, fn) extern type asm_calling_convention fn
|
||||
#define asm_naked_fn_end(fn) ".att_syntax prefix;\n" );
|
||||
|
||||
#define asm_gcc() __asm__ __volatile__(".intel_syntax noprefix;\n"
|
||||
#define asm_gcc_parms() ".att_syntax prefix;"
|
||||
#define asm_gcc_trashed() __asm__ __volatile__("" :::
|
||||
#define asm_gcc_end() );
|
||||
#else
|
||||
need x86 asm
|
||||
#endif
|
||||
|
||||
#endif /* X86ASM || X86_64ASM */
|
||||
|
||||
|
||||
#if defined(CPU_X86) || defined(CPU_X86_64)
|
||||
|
||||
typedef enum cpu_flags_x86_t {
|
||||
cpu_mmx = 1 << 0,
|
||||
cpu_sse = 1 << 1,
|
||||
cpu_sse2 = 1 << 2,
|
||||
cpu_sse3 = 1 << 3,
|
||||
cpu_ssse3 = 1 << 4,
|
||||
cpu_sse4_1 = 1 << 5,
|
||||
cpu_sse4_2 = 1 << 6,
|
||||
cpu_avx = 1 << 7,
|
||||
cpu_xop = 1 << 8,
|
||||
cpu_avx2 = 1 << 9
|
||||
} cpu_flags_x86;
|
||||
|
||||
typedef enum cpu_vendors_x86_t {
|
||||
cpu_nobody,
|
||||
cpu_intel,
|
||||
cpu_amd
|
||||
} cpu_vendors_x86;
|
||||
|
||||
typedef struct x86_regs_t {
|
||||
uint32_t eax, ebx, ecx, edx;
|
||||
} x86_regs;
|
||||
|
||||
#if defined(X86ASM)
|
||||
asm_naked_fn_proto(int, has_cpuid)(void)
|
||||
asm_naked_fn(has_cpuid)
|
||||
a1(pushfd)
|
||||
a1(pop eax)
|
||||
a2(mov ecx, eax)
|
||||
a2(xor eax, 0x200000)
|
||||
a1(push eax)
|
||||
a1(popfd)
|
||||
a1(pushfd)
|
||||
a1(pop eax)
|
||||
a2(xor eax, ecx)
|
||||
a2(shr eax, 21)
|
||||
a2(and eax, 1)
|
||||
a1(push ecx)
|
||||
a1(popfd)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(has_cpuid)
|
||||
#endif /* X86ASM */
|
||||
|
||||
|
||||
static void NOINLINE
|
||||
get_cpuid(x86_regs *regs, uint32_t flags) {
|
||||
#if defined(COMPILER_MSVC)
|
||||
__cpuid((int *)regs, (int)flags);
|
||||
#else
|
||||
#if defined(CPU_X86_64)
|
||||
#define cpuid_bx rbx
|
||||
#else
|
||||
#define cpuid_bx ebx
|
||||
#endif
|
||||
|
||||
asm_gcc()
|
||||
a1(push cpuid_bx)
|
||||
a2(xor ecx, ecx)
|
||||
a1(cpuid)
|
||||
a2(mov [%1 + 0], eax)
|
||||
a2(mov [%1 + 4], ebx)
|
||||
a2(mov [%1 + 8], ecx)
|
||||
a2(mov [%1 + 12], edx)
|
||||
a1(pop cpuid_bx)
|
||||
asm_gcc_parms() : "+a"(flags) : "S"(regs) : "%ecx", "%edx", "cc"
|
||||
asm_gcc_end()
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
|
||||
static uint64_t NOINLINE
|
||||
get_xgetbv(uint32_t flags) {
|
||||
#if defined(COMPILER_MSVC)
|
||||
return _xgetbv(flags);
|
||||
#else
|
||||
uint32_t lo, hi;
|
||||
asm_gcc()
|
||||
a1(xgetbv)
|
||||
asm_gcc_parms() : "+c"(flags), "=a" (lo), "=d" (hi)
|
||||
asm_gcc_end()
|
||||
return ((uint64_t)lo | ((uint64_t)hi << 32));
|
||||
#endif
|
||||
}
|
||||
#endif // AVX support
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
|
||||
size_t cpu_detect_mask = (size_t)-1;
|
||||
#endif
|
||||
|
||||
static size_t
|
||||
detect_cpu(void) {
|
||||
//union { uint8_t s[12]; uint32_t i[3]; } vendor_string;
|
||||
//cpu_vendors_x86 vendor = cpu_nobody;
|
||||
x86_regs regs; regs.eax = regs.ebx = regs.ecx = 0;
|
||||
uint32_t max_level, max_ext_level;
|
||||
size_t cpu_flags = 0;
|
||||
#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
|
||||
uint64_t xgetbv_flags;
|
||||
#endif
|
||||
|
||||
#if defined(CPU_X86)
|
||||
if (!has_cpuid())
|
||||
return cpu_flags;
|
||||
#endif
|
||||
|
||||
get_cpuid(®s, 0);
|
||||
max_level = regs.eax;
|
||||
#if 0
|
||||
vendor_string.i[0] = regs.ebx;
|
||||
vendor_string.i[1] = regs.edx;
|
||||
vendor_string.i[2] = regs.ecx;
|
||||
|
||||
if (scrypt_verify(vendor_string.s, (const uint8_t *)"GenuineIntel", 12))
|
||||
vendor = cpu_intel;
|
||||
else if (scrypt_verify(vendor_string.s, (const uint8_t *)"AuthenticAMD", 12))
|
||||
vendor = cpu_amd;
|
||||
#endif
|
||||
if (max_level & 0x00000500) {
|
||||
/* "Intel P5 pre-B0" */
|
||||
cpu_flags |= cpu_mmx;
|
||||
return cpu_flags;
|
||||
}
|
||||
|
||||
if (max_level < 1)
|
||||
return cpu_flags;
|
||||
|
||||
get_cpuid(®s, 1);
|
||||
#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
|
||||
	/* xsave/xrestore: CPUID.1:ECX bit 27 = OSXSAVE, bit 28 = AVX.
	   XCR0 bits 1 and 2 (mask 0x6) confirm the OS saves XMM and YMM state. */
	if (regs.ecx & (1 << 27)) {
		xgetbv_flags = get_xgetbv(0);
		if ((regs.ecx & (1 << 28)) && (xgetbv_flags & 0x6)) cpu_flags |= cpu_avx;
	}
|
||||
#endif
|
||||
	if (regs.ecx & (1 << 20)) cpu_flags |= cpu_sse4_2;
	if (regs.ecx & (1 << 19)) cpu_flags |= cpu_sse4_1;
	if (regs.ecx & (1 << 9)) cpu_flags |= cpu_ssse3;
	if (regs.ecx & (1 )) cpu_flags |= cpu_sse3;
	if (regs.edx & (1 << 26)) cpu_flags |= cpu_sse2;
	if (regs.edx & (1 << 25)) cpu_flags |= cpu_sse;
	if (regs.edx & (1 << 23)) cpu_flags |= cpu_mmx;
|
||||
|
||||
if (cpu_flags & cpu_avx) {
|
||||
if (max_level >= 7) {
|
||||
get_cpuid(®s, 7);
|
||||
if (regs.ebx & (1 << 5)) cpu_flags |= cpu_avx2;
|
||||
}
|
||||
|
||||
get_cpuid(®s, 0x80000000);
|
||||
max_ext_level = regs.eax;
|
||||
if (max_ext_level >= 0x80000001) {
|
||||
get_cpuid(®s, 0x80000001);
|
||||
if (regs.ecx & (1 << 11)) cpu_flags |= cpu_xop;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
|
||||
cpu_flags &= cpu_detect_mask;
|
||||
#endif
|
||||
|
||||
return cpu_flags;
|
||||
}
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
|
||||
static const char *
|
||||
get_top_cpuflag_desc(size_t flag) {
|
||||
if (flag & cpu_avx2) return "AVX2";
|
||||
else if (flag & cpu_xop) return "XOP";
|
||||
else if (flag & cpu_avx) return "AVX";
|
||||
else if (flag & cpu_sse4_2) return "SSE4.2";
|
||||
else if (flag & cpu_sse4_1) return "SSE4.1";
|
||||
else if (flag & cpu_ssse3) return "SSSE3";
|
||||
else if (flag & cpu_sse2) return "SSE2";
|
||||
else if (flag & cpu_sse) return "SSE";
|
||||
else if (flag & cpu_mmx) return "MMX";
|
||||
else return "Basic";
|
||||
}
|
||||
#endif
|
||||
|
||||
/* enable the highest system-wide option */
|
||||
#if defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
#if !defined(__AVX2__)
|
||||
#undef X86_64ASM_AVX2
|
||||
#undef X86ASM_AVX2
|
||||
#undef X86_INTRINSIC_AVX2
|
||||
#endif
|
||||
#if !defined(__XOP__)
|
||||
#undef X86_64ASM_XOP
|
||||
#undef X86ASM_XOP
|
||||
#undef X86_INTRINSIC_XOP
|
||||
#endif
|
||||
#if !defined(__AVX__)
|
||||
#undef X86_64ASM_AVX
|
||||
#undef X86ASM_AVX
|
||||
#undef X86_INTRINSIC_AVX
|
||||
#endif
|
||||
#if !defined(__SSSE3__)
|
||||
#undef X86_64ASM_SSSE3
|
||||
#undef X86ASM_SSSE3
|
||||
#undef X86_INTRINSIC_SSSE3
|
||||
#endif
|
||||
#if !defined(__SSE2__)
|
||||
#undef X86_64ASM_SSE2
|
||||
#undef X86ASM_SSE2
|
||||
#undef X86_INTRINSIC_SSE2
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif /* defined(CPU_X86) || defined(CPU_X86_64) */
|
||||
@@ -1,310 +0,0 @@
|
||||
/* determine os */
|
||||
#if defined(_WIN32) || defined(_WIN64) || defined(__TOS_WIN__) || defined(__WINDOWS__)
|
||||
#include <windows.h>
|
||||
#include <wincrypt.h>
|
||||
#define OS_WINDOWS
|
||||
#elif defined(sun) || defined(__sun) || defined(__SVR4) || defined(__svr4__)
|
||||
#include <sys/mman.h>
|
||||
#include <sys/time.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
#define OS_SOLARIS
|
||||
#else
|
||||
#include <sys/mman.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/param.h> /* need this to define BSD */
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
#define OS_NIX
|
||||
#if defined(__linux__)
|
||||
#include <endian.h>
|
||||
#define OS_LINUX
|
||||
#elif defined(BSD)
|
||||
#define OS_BSD
|
||||
|
||||
#if defined(MACOS_X) || (defined(__APPLE__) & defined(__MACH__))
|
||||
#define OS_OSX
|
||||
#elif defined(macintosh) || defined(Macintosh)
|
||||
#define OS_MAC
|
||||
#elif defined(__OpenBSD__)
|
||||
#define OS_OPENBSD
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
/* determine compiler */
|
||||
#if defined(_MSC_VER)
|
||||
#define COMPILER_MSVC_VS6 120000000
|
||||
#define COMPILER_MSVC_VS6PP 121000000
|
||||
#define COMPILER_MSVC_VS2002 130000000
|
||||
#define COMPILER_MSVC_VS2003 131000000
|
||||
#define COMPILER_MSVC_VS2005 140050727
|
||||
#define COMPILER_MSVC_VS2008 150000000
|
||||
#define COMPILER_MSVC_VS2008SP1 150030729
|
||||
#define COMPILER_MSVC_VS2010 160000000
|
||||
#define COMPILER_MSVC_VS2010SP1 160040219
|
||||
#define COMPILER_MSVC_VS2012RC 170000000
|
||||
#define COMPILER_MSVC_VS2012 170050727
|
||||
|
||||
#if _MSC_FULL_VER > 100000000
|
||||
#define COMPILER_MSVC (_MSC_FULL_VER)
|
||||
#else
|
||||
#define COMPILER_MSVC (_MSC_FULL_VER * 10)
|
||||
#endif
|
||||
|
||||
#if ((_MSC_VER == 1200) && defined(_mm_free))
|
||||
#undef COMPILER_MSVC
|
||||
#define COMPILER_MSVC COMPILER_MSVC_VS6PP
|
||||
#endif
|
||||
|
||||
#pragma warning(disable : 4127) /* conditional expression is constant */
|
||||
#pragma warning(disable : 4100) /* unreferenced formal parameter */
|
||||
|
||||
#ifndef _CRT_SECURE_NO_WARNINGS
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#endif
|
||||
|
||||
#include <float.h>
|
||||
#include <stdlib.h> /* _rotl */
|
||||
#include <intrin.h>
|
||||
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned short uint16_t;
|
||||
typedef unsigned int uint32_t;
|
||||
typedef signed int int32_t;
|
||||
typedef unsigned __int64 uint64_t;
|
||||
typedef signed __int64 int64_t;
|
||||
|
||||
#define ROTL32(a,b) _rotl(a,b)
|
||||
#define ROTR32(a,b) _rotr(a,b)
|
||||
#define ROTL64(a,b) _rotl64(a,b)
|
||||
#define ROTR64(a,b) _rotr64(a,b)
|
||||
#undef NOINLINE
|
||||
#define NOINLINE __declspec(noinline)
|
||||
#undef NORETURN
|
||||
#define NORETURN
|
||||
#undef INLINE
|
||||
#define INLINE __forceinline
|
||||
#undef FASTCALL
|
||||
#define FASTCALL __fastcall
|
||||
#undef CDECL
|
||||
#define CDECL __cdecl
|
||||
#undef STDCALL
|
||||
#define STDCALL __stdcall
|
||||
#undef NAKED
|
||||
#define NAKED __declspec(naked)
|
||||
#define ALIGN(n) __declspec(align(n))
|
||||
#endif
|
||||
#if defined(__ICC)
|
||||
#define COMPILER_INTEL
|
||||
#endif
|
||||
#if defined(__GNUC__)
|
||||
#if (__GNUC__ >= 3)
|
||||
#define COMPILER_GCC_PATCHLEVEL __GNUC_PATCHLEVEL__
|
||||
#else
|
||||
#define COMPILER_GCC_PATCHLEVEL 0
|
||||
#endif
|
||||
#define COMPILER_GCC (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + COMPILER_GCC_PATCHLEVEL)
|
||||
#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b)))
|
||||
#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b)))
|
||||
#define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - b)))
|
||||
#define ROTR64(a,b) (((a) >> (b)) | ((a) << (64 - b)))
|
||||
#undef NOINLINE
|
||||
#if (COMPILER_GCC >= 30000)
|
||||
#define NOINLINE __attribute__((noinline))
|
||||
#else
|
||||
#define NOINLINE
|
||||
#endif
|
||||
#undef NORETURN
|
||||
#if (COMPILER_GCC >= 30000)
|
||||
#define NORETURN __attribute__((noreturn))
|
||||
#else
|
||||
#define NORETURN
|
||||
#endif
|
||||
#undef INLINE
|
||||
#if (COMPILER_GCC >= 30000)
|
||||
#define INLINE __attribute__((always_inline))
|
||||
#else
|
||||
#define INLINE inline
|
||||
#endif
|
||||
#undef FASTCALL
|
||||
#if (COMPILER_GCC >= 30400)
|
||||
#define FASTCALL __attribute__((fastcall))
|
||||
#else
|
||||
#define FASTCALL
|
||||
#endif
|
||||
#undef CDECL
|
||||
#define CDECL __attribute__((cdecl))
|
||||
#undef STDCALL
|
||||
#define STDCALL __attribute__((stdcall))
|
||||
#define ALIGN(n) __attribute__((aligned(n)))
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
#if defined(__MINGW32__) || defined(__MINGW64__)
|
||||
#define COMPILER_MINGW
|
||||
#endif
|
||||
#if defined(__PATHCC__)
|
||||
#define COMPILER_PATHCC
|
||||
#endif
|
||||
|
||||
#define OPTIONAL_INLINE
|
||||
#if defined(OPTIONAL_INLINE)
|
||||
#undef OPTIONAL_INLINE
|
||||
#define OPTIONAL_INLINE INLINE
|
||||
#else
|
||||
#define OPTIONAL_INLINE
|
||||
#endif
|
||||
|
||||
#define CRYPTO_FN NOINLINE STDCALL
|
||||
|
||||
/* determine cpu */
|
||||
#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__ ) || defined(_M_X64)
|
||||
#define CPU_X86_64
|
||||
#elif defined(__i586__) || defined(__i686__) || (defined(_M_IX86) && (_M_IX86 >= 500))
|
||||
#define CPU_X86 500
|
||||
#elif defined(__i486__) || (defined(_M_IX86) && (_M_IX86 >= 400))
|
||||
#define CPU_X86 400
|
||||
#elif defined(__i386__) || (defined(_M_IX86) && (_M_IX86 >= 300)) || defined(__X86__) || defined(_X86_) || defined(__I86__)
|
||||
#define CPU_X86 300
|
||||
#elif defined(__ia64__) || defined(_IA64) || defined(__IA64__) || defined(_M_IA64) || defined(__ia64)
|
||||
#define CPU_IA64
|
||||
#endif
|
||||
|
||||
#if defined(__sparc__) || defined(__sparc) || defined(__sparcv9)
|
||||
#define CPU_SPARC
|
||||
#if defined(__sparcv9)
|
||||
#define CPU_SPARC64
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(CPU_X86_64) || defined(CPU_IA64) || defined(CPU_SPARC64) || defined(__64BIT__) || defined(__LP64__) || defined(_LP64) || (defined(_MIPS_SZLONG) && (_MIPS_SZLONG == 64))
|
||||
#define CPU_64BITS
|
||||
#undef FASTCALL
|
||||
#define FASTCALL
|
||||
#undef CDECL
|
||||
#define CDECL
|
||||
#undef STDCALL
|
||||
#define STDCALL
|
||||
#endif
|
||||
|
||||
#if defined(powerpc) || defined(__PPC__) || defined(__ppc__) || defined(_ARCH_PPC) || defined(__powerpc__) || defined(__powerpc) || defined(POWERPC) || defined(_M_PPC)
|
||||
#define CPU_PPC
|
||||
#if defined(_ARCH_PWR7)
|
||||
#define CPU_POWER7
|
||||
#elif defined(__64BIT__)
|
||||
#define CPU_PPC64
|
||||
#else
|
||||
#define CPU_PPC32
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__hppa__) || defined(__hppa)
|
||||
#define CPU_HPPA
|
||||
#endif
|
||||
|
||||
#if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA)
|
||||
#define CPU_ALPHA
|
||||
#endif
|
||||
|
||||
/* endian */
|
||||
|
||||
#if ((defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && (__BYTE_ORDER == __LITTLE_ENDIAN)) || \
|
||||
(defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN)) || \
|
||||
(defined(CPU_X86) || defined(CPU_X86_64)) || \
|
||||
(defined(vax) || defined(MIPSEL) || defined(_MIPSEL)))
|
||||
#define CPU_LE
|
||||
#elif ((defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && (__BYTE_ORDER == __BIG_ENDIAN)) || \
|
||||
(defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)) || \
|
||||
(defined(CPU_SPARC) || defined(CPU_PPC) || defined(mc68000) || defined(sel)) || defined(_MIPSEB))
|
||||
#define CPU_BE
|
||||
#else
|
||||
/* unknown endian! */
|
||||
#endif
|
||||
|
||||
|
||||
#define U8TO32_BE(p) \
|
||||
(((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \
|
||||
((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) ))
|
||||
|
||||
#define U8TO32_LE(p) \
|
||||
(((uint32_t)((p)[0]) ) | ((uint32_t)((p)[1]) << 8) | \
|
||||
((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24))
|
||||
|
||||
#define U32TO8_BE(p, v) \
|
||||
(p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \
|
||||
(p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) );
|
||||
|
||||
#define U32TO8_LE(p, v) \
|
||||
(p)[0] = (uint8_t)((v) ); (p)[1] = (uint8_t)((v) >> 8); \
|
||||
(p)[2] = (uint8_t)((v) >> 16); (p)[3] = (uint8_t)((v) >> 24);
|
||||
|
||||
#define U8TO64_BE(p) \
|
||||
(((uint64_t)U8TO32_BE(p) << 32) | (uint64_t)U8TO32_BE((p) + 4))
|
||||
|
||||
#define U8TO64_LE(p) \
|
||||
(((uint64_t)U8TO32_LE(p)) | ((uint64_t)U8TO32_LE((p) + 4) << 32))
|
||||
|
||||
#define U64TO8_BE(p, v) \
|
||||
U32TO8_BE((p), (uint32_t)((v) >> 32)); \
|
||||
U32TO8_BE((p) + 4, (uint32_t)((v) ));
|
||||
|
||||
#define U64TO8_LE(p, v) \
|
||||
U32TO8_LE((p), (uint32_t)((v) )); \
|
||||
U32TO8_LE((p) + 4, (uint32_t)((v) >> 32));
|
||||
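/* Added worked example: U32TO8_BE(p, 0x11223344) stores the bytes
   {0x11,0x22,0x33,0x44} and U8TO32_BE(p) reads them back as 0x11223344;
   the _LE variants do the same with the byte order reversed. */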
|
||||
#define U32_SWAP(v) { \
|
||||
(v) = (((v) << 8) & 0xFF00FF00 ) | (((v) >> 8) & 0xFF00FF ); \
|
||||
(v) = ((v) << 16) | ((v) >> 16); \
|
||||
}
|
||||
|
||||
#define U64_SWAP(v) { \
|
||||
(v) = (((v) << 8) & 0xFF00FF00FF00FF00ull ) | (((v) >> 8) & 0x00FF00FF00FF00FFull ); \
|
||||
(v) = (((v) << 16) & 0xFFFF0000FFFF0000ull ) | (((v) >> 16) & 0x0000FFFF0000FFFFull ); \
|
||||
(v) = ((v) << 32) | ((v) >> 32); \
|
||||
}
|
||||
|
||||
/* constant-time comparison: returns 1 if the two len-byte buffers are equal,
   0 otherwise, without branching on the data */
static int
scrypt_verify(const uint8_t *x, const uint8_t *y, size_t len) {
	uint32_t differentbits = 0;
	while (len--)
		differentbits |= (*x++ ^ *y++);
	return (1 & ((differentbits - 1) >> 8));
}
|
||||
|
||||
static void
|
||||
scrypt_ensure_zero(void *p, size_t len) {
|
||||
#if ((defined(CPU_X86) || defined(CPU_X86_64)) && defined(COMPILER_MSVC))
|
||||
__stosb((unsigned char *)p, 0, len);
|
||||
#elif (defined(CPU_X86) && defined(COMPILER_GCC))
|
||||
__asm__ __volatile__(
|
||||
"pushl %%edi;\n"
|
||||
"pushl %%ecx;\n"
|
||||
"rep stosb;\n"
|
||||
"popl %%ecx;\n"
|
||||
"popl %%edi;\n"
|
||||
:: "a"(0), "D"(p), "c"(len) : "cc", "memory"
|
||||
);
|
||||
#elif (defined(CPU_X86_64) && defined(COMPILER_GCC))
|
||||
__asm__ __volatile__(
|
||||
"pushq %%rdi;\n"
|
||||
"pushq %%rcx;\n"
|
||||
"rep stosb;\n"
|
||||
"popq %%rcx;\n"
|
||||
"popq %%rdi;\n"
|
||||
:: "a"(0), "D"(p), "c"(len) : "cc", "memory"
|
||||
);
|
||||
#else
|
||||
volatile uint8_t *b = (volatile uint8_t *)p;
|
||||
size_t i;
|
||||
for (i = 0; i < len; i++)
|
||||
b[i] = 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
#include "scrypt-jane-portable-x86.h"
|
||||
|
||||
#if !defined(asm_calling_convention)
|
||||
#define asm_calling_convention
|
||||
#endif
|
||||
@@ -1,75 +0,0 @@
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
/* function type returned by scrypt_getROMix, used with cpu detection */
|
||||
typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r);
|
||||
#endif
|
||||
|
||||
/* romix pre/post nop function */
|
||||
/*
|
||||
static void asm_calling_convention
|
||||
scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) {
|
||||
(void)blocks; (void)nblocks;
|
||||
}
|
||||
*/
|
||||
/* romix pre/post endian conversion function */
|
||||
static void asm_calling_convention
|
||||
scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) {
|
||||
#if !defined(CPU_LE)
|
||||
static const union { uint8_t b[2]; uint16_t w; } endian_test = {{1,0}};
|
||||
size_t i;
|
||||
if (endian_test.w == 0x100) {
|
||||
nblocks *= SCRYPT_BLOCK_WORDS;
|
||||
for (i = 0; i < nblocks; i++) {
|
||||
SCRYPT_WORD_ENDIAN_SWAP(blocks[i]);
|
||||
}
|
||||
}
|
||||
#else
|
||||
(void)blocks; (void)nblocks;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* chunkmix test function */
|
||||
typedef void (asm_calling_convention *chunkmixfn)(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r);
|
||||
typedef void (asm_calling_convention *blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks);
|
||||
|
||||
static int
|
||||
scrypt_test_mix_instance(chunkmixfn mixfn, blockfixfn prefn, blockfixfn postfn, const uint8_t expected[16]) {
|
||||
/* r = 2, (2 * r) = 4 blocks in a chunk, 4 * SCRYPT_BLOCK_WORDS total */
|
||||
const uint32_t r = 2, blocks = 2 * r, words = blocks * SCRYPT_BLOCK_WORDS;
|
||||
#if (defined(X86ASM_AVX2) || defined(X86_64ASM_AVX2) || defined(X86_INTRINSIC_AVX2))
|
||||
scrypt_mix_word_t ALIGN(32) chunk[2][4 * SCRYPT_BLOCK_WORDS], v;
|
||||
#else
|
||||
scrypt_mix_word_t ALIGN(16) chunk[2][4 * SCRYPT_BLOCK_WORDS], v;
|
||||
#endif
|
||||
uint8_t final[16];
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < words; i++) {
|
||||
v = (scrypt_mix_word_t)i;
|
||||
v = (v << 8) | v;
|
||||
v = (v << 16) | v;
|
||||
chunk[0][i] = v;
|
||||
}
|
||||
|
||||
prefn(chunk[0], blocks);
|
||||
mixfn(chunk[1], chunk[0], NULL, r);
|
||||
postfn(chunk[1], blocks);
|
||||
|
||||
/* grab the last 16 bytes of the final block */
|
||||
for (i = 0; i < 16; i += sizeof(scrypt_mix_word_t)) {
|
||||
SCRYPT_WORDTO8_LE(final + i, chunk[1][words - (16 / sizeof(scrypt_mix_word_t)) + (i / sizeof(scrypt_mix_word_t))]);
|
||||
}
|
||||
|
||||
return scrypt_verify(expected, final, 16);
|
||||
}
|
||||
|
||||
/* returns a pointer to item i, where item is len scrypt_mix_word_t's long */
|
||||
static scrypt_mix_word_t *
|
||||
scrypt_item(scrypt_mix_word_t *base, scrypt_mix_word_t i, scrypt_mix_word_t len) {
|
||||
return base + (i * len);
|
||||
}
|
||||
|
||||
/* returns a pointer to block i */
|
||||
static scrypt_mix_word_t *
|
||||
scrypt_block(scrypt_mix_word_t *base, scrypt_mix_word_t i) {
|
||||
return base + (i * SCRYPT_BLOCK_WORDS);
|
||||
}
|
||||
@@ -1,122 +0,0 @@
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX)
|
||||
|
||||
#if defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
#undef SCRYPT_ROMIX_FN
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix
|
||||
#endif
|
||||
|
||||
#undef SCRYPT_HAVE_ROMIX
|
||||
#define SCRYPT_HAVE_ROMIX
|
||||
|
||||
#if !defined(SCRYPT_CHUNKMIX_FN)
|
||||
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_basic
|
||||
|
||||
/*
|
||||
Bout = ChunkMix(Bin)
|
||||
|
||||
2*r: number of blocks in the chunk
|
||||
*/
|
||||
static void asm_calling_convention
|
||||
SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r) {
|
||||
#if (defined(X86ASM_AVX2) || defined(X86_64ASM_AVX2) || defined(X86_INTRINSIC_AVX2))
|
||||
scrypt_mix_word_t ALIGN(32) X[SCRYPT_BLOCK_WORDS], *block;
|
||||
#else
|
||||
scrypt_mix_word_t ALIGN(16) X[SCRYPT_BLOCK_WORDS], *block;
|
||||
#endif
|
||||
uint32_t i, j, blocksPerChunk = /*r * 2*/2, half = 0;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
block = scrypt_block(Bin, blocksPerChunk - 1);
|
||||
for (i = 0; i < SCRYPT_BLOCK_WORDS; i++)
|
||||
X[i] = block[i];
|
||||
|
||||
if (Bxor) {
|
||||
block = scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
for (i = 0; i < SCRYPT_BLOCK_WORDS; i++)
|
||||
X[i] ^= block[i];
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= /*r*/1) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
block = scrypt_block(Bin, i);
|
||||
for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
|
||||
X[j] ^= block[j];
|
||||
|
||||
if (Bxor) {
|
||||
block = scrypt_block(Bxor, i);
|
||||
for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
|
||||
X[j] ^= block[j];
|
||||
}
|
||||
SCRYPT_MIX_FN(X);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
block = scrypt_block(Bout, (i / 2) + half);
|
||||
for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
|
||||
block[j] = X[j];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
X = ROMix(X)
|
||||
|
||||
X: chunk to mix
|
||||
Y: scratch chunk
|
||||
N: number of rounds
|
||||
V[N]: array of chunks to randomly index in to
|
||||
2*r: number of blocks in a chunk
|
||||
*/
|
||||
|
||||
static void NOINLINE FASTCALL
|
||||
SCRYPT_ROMIX_FN(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N, uint32_t r) {
|
||||
uint32_t i, j, chunkWords = (uint32_t)(SCRYPT_BLOCK_WORDS * 2);
|
||||
scrypt_mix_word_t *block = V;
|
||||
|
||||
SCRYPT_ROMIX_TANGLE_FN(X, 2);
|
||||
|
||||
/* 1: X = B */
|
||||
/* implicit */
|
||||
|
||||
/* 2: for i = 0 to N - 1 do */
|
||||
memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t));
|
||||
for (i = 0; i < /*N - 1*/511; i++, block += chunkWords) {
|
||||
/* 3: V_i = X */
|
||||
/* 4: X = H(X) */
|
||||
SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, /*r*/1);
|
||||
}
|
||||
SCRYPT_CHUNKMIX_FN(X, block, NULL, 1);
|
||||
|
||||
/* 6: for i = 0 to N - 1 do */
|
||||
for (i = 0; i < /*N*/512; i += 2) {
|
||||
/* 7: j = Integerify(X) % N */
|
||||
j = X[chunkWords - SCRYPT_BLOCK_WORDS] & /*(N - 1)*/511;
|
||||
|
||||
/* 8: X = H(Y ^ V_j) */
|
||||
SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), 1);
|
||||
|
||||
/* 7: j = Integerify(Y) % N */
|
||||
j = Y[chunkWords - SCRYPT_BLOCK_WORDS] & /*(N - 1)*/511;
|
||||
|
||||
/* 8: X = H(Y ^ V_j) */
|
||||
SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), 1);
|
||||
}
|
||||
|
||||
/* 10: B' = X */
|
||||
/* implicit */
|
||||
|
||||
SCRYPT_ROMIX_UNTANGLE_FN(X, 2);
|
||||
}
|
||||
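/* Added note: this instantiation of ROMix is hard-coded for N = 512 and r = 1;
   the literals 511, 512 and 1 in the loops above replace the generic N - 1, N
   and r parameters, so V must provide room for 512 chunks of two blocks each. */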
|
||||
#endif /* !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) */
|
||||
|
||||
|
||||
#undef SCRYPT_CHUNKMIX_FN
|
||||
#undef SCRYPT_ROMIX_FN
|
||||
#undef SCRYPT_MIX_FN
|
||||
#undef SCRYPT_ROMIX_TANGLE_FN
|
||||
#undef SCRYPT_ROMIX_UNTANGLE_FN
|
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
#if defined(SCRYPT_SALSA64)
|
||||
#include "scrypt-jane-salsa64.h"
|
||||
#else
|
||||
#define SCRYPT_MIX_BASE "ERROR"
|
||||
typedef uint32_t scrypt_mix_word_t;
|
||||
#define SCRYPT_WORDTO8_LE U32TO8_LE
|
||||
#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP
|
||||
#define SCRYPT_BLOCK_BYTES 64
|
||||
#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
static void FASTCALL scrypt_ROMix_error(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r) {}
|
||||
static scrypt_ROMixfn scrypt_getROMix(void) { return scrypt_ROMix_error; }
|
||||
#else
|
||||
static void FASTCALL scrypt_ROMix(scrypt_mix_word_t *X, scrypt_mix_word_t *Y, scrypt_mix_word_t *V, uint32_t N, uint32_t r) {}
|
||||
#endif
|
||||
static int scrypt_test_mix(void) { return 0; }
|
||||
#error must define a mix function!
|
||||
#endif
|
||||
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX SCRYPT_MIX_BASE
|
||||
#endif
|
||||
@@ -1,183 +0,0 @@
|
||||
#define SCRYPT_MIX_BASE "Salsa64/8"
|
||||
|
||||
typedef uint64_t scrypt_mix_word_t;
|
||||
|
||||
#define SCRYPT_WORDTO8_LE U64TO8_LE
|
||||
#define SCRYPT_WORD_ENDIAN_SWAP U64_SWAP
|
||||
|
||||
#define SCRYPT_BLOCK_BYTES 128
|
||||
#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))
|
||||
|
||||
/* must have these here in case block bytes is ever != 64 */
|
||||
#include "scrypt-jane-romix-basic.h"
|
||||
|
||||
#include "scrypt-jane-mix_salsa64-avx2.h"
|
||||
#include "scrypt-jane-mix_salsa64-xop.h"
|
||||
#include "scrypt-jane-mix_salsa64-avx.h"
|
||||
#include "scrypt-jane-mix_salsa64-ssse3.h"
|
||||
#include "scrypt-jane-mix_salsa64-sse2.h"
|
||||
#include "scrypt-jane-mix_salsa64.h"
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX2)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx2
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_avx2
|
||||
#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_XOP)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_xop
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_xop
|
||||
#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_avx
|
||||
#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSSE3)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_ssse3
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_ssse3
|
||||
#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSE2)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_sse2
|
||||
#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
/* cpu agnostic */
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_basic
|
||||
#define SCRYPT_MIX_FN salsa64_core_basic
|
||||
#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
static scrypt_ROMixfn
|
||||
scrypt_getROMix(void) {
|
||||
size_t cpuflags = detect_cpu();
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX2)
|
||||
if (cpuflags & cpu_avx2)
|
||||
return scrypt_ROMix_avx2;
|
||||
else
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_XOP)
|
||||
if (cpuflags & cpu_xop)
|
||||
return scrypt_ROMix_xop;
|
||||
else
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX)
|
||||
if (cpuflags & cpu_avx)
|
||||
return scrypt_ROMix_avx;
|
||||
else
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSSE3)
|
||||
if (cpuflags & cpu_ssse3)
|
||||
return scrypt_ROMix_ssse3;
|
||||
else
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSE2)
|
||||
if (cpuflags & cpu_sse2)
|
||||
return scrypt_ROMix_sse2;
|
||||
else
|
||||
#endif
|
||||
|
||||
return scrypt_ROMix_basic;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
|
||||
static size_t
|
||||
available_implementations(void) {
|
||||
size_t cpuflags = detect_cpu();
|
||||
size_t flags = 0;
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX2)
|
||||
if (cpuflags & cpu_avx2)
|
||||
flags |= cpu_avx2;
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_XOP)
|
||||
if (cpuflags & cpu_xop)
|
||||
flags |= cpu_xop;
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX)
|
||||
if (cpuflags & cpu_avx)
|
||||
flags |= cpu_avx;
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSSE3)
|
||||
if (cpuflags & cpu_ssse3)
|
||||
flags |= cpu_ssse3;
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSE2)
|
||||
if (cpuflags & cpu_sse2)
|
||||
flags |= cpu_sse2;
|
||||
#endif
|
||||
|
||||
return flags;
|
||||
}
|
||||
#endif
|
||||
|
||||
static int
|
||||
scrypt_test_mix(void) {
|
||||
static const uint8_t expected[16] = {
|
||||
0xf8,0x92,0x9b,0xf8,0xcc,0x1d,0xce,0x2e,0x13,0x82,0xac,0x96,0xb2,0x6c,0xee,0x2c,
|
||||
};
|
||||
|
||||
int ret = 1;
|
||||
size_t cpuflags = detect_cpu();
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX2)
|
||||
if (cpuflags & cpu_avx2)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx2, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_XOP)
|
||||
if (cpuflags & cpu_xop)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_xop, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX)
|
||||
if (cpuflags & cpu_avx)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSSE3)
|
||||
if (cpuflags & cpu_ssse3)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_ssse3, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSE2)
|
||||
if (cpuflags & cpu_sse2)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_BASIC)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected);
|
||||
#endif
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -1,28 +0,0 @@
|
||||
typedef struct scrypt_test_setting_t {
|
||||
const char *pw, *salt;
|
||||
uint8_t Nfactor, rfactor, pfactor;
|
||||
} scrypt_test_setting;
|
||||
|
||||
static const scrypt_test_setting post_settings[] = {
|
||||
{"", "", 3, 0, 0},
|
||||
{"password", "NaCl", 9, 3, 4},
|
||||
{0, 0, 0, 0, 0}
|
||||
};
|
||||
|
||||
#if defined(SCRYPT_SKEIN512)
|
||||
#if defined(SCRYPT_SALSA64)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xd2,0xad,0x32,0x05,0xee,0x80,0xe3,0x44,0x70,0xc6,0x34,0xde,0x05,0xb6,0xcf,0x60,
|
||||
0x89,0x98,0x70,0xc0,0xb8,0xf5,0x54,0xf1,0xa6,0xb2,0xc8,0x76,0x34,0xec,0xc4,0x59,
|
||||
0x8e,0x64,0x42,0xd0,0xa9,0xed,0xe7,0x19,0xb2,0x8a,0x11,0xc6,0xa6,0xbf,0xa7,0xa9,
|
||||
0x4e,0x44,0x32,0x7e,0x12,0x91,0x9d,0xfe,0x52,0x48,0xa8,0x27,0xb3,0xfc,0xb1,0x89},
|
||||
{0xd6,0x67,0xd2,0x3e,0x30,0x1e,0x9d,0xe2,0x55,0x68,0x17,0x3d,0x2b,0x75,0x5a,0xe5,
|
||||
0x04,0xfb,0x3d,0x0e,0x86,0xe0,0xaa,0x1d,0xd4,0x72,0xda,0xb0,0x79,0x41,0xb7,0x99,
|
||||
0x68,0xe5,0xd9,0x55,0x79,0x7d,0xc3,0xd1,0xa6,0x56,0xc1,0xbe,0x0b,0x6c,0x62,0x23,
|
||||
0x66,0x67,0x91,0x47,0x99,0x13,0x6b,0xe3,0xda,0x59,0x55,0x18,0x67,0x8f,0x2e,0x3b}
|
||||
};
|
||||
#endif
|
||||
#else
|
||||
static const uint8_t post_vectors[][64] = {{0}};
|
||||
#endif
|
||||
|
||||
@@ -1,85 +0,0 @@
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <openssl/sha.h>
|
||||
#include "ar2/argon2.h"
|
||||
#include "ar2/cores.h"
|
||||
#include "ar2/ar2-scrypt-jane.h"
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#define T_COSTS 2
|
||||
#define M_COSTS 16
|
||||
#define MASK 8
|
||||
#define ZERO 0
|
||||
|
||||
inline void argon_call(void *out, void *in, void *salt, int type)
|
||||
{
|
||||
argon2_context context;
|
||||
|
||||
context.out = (uint8_t *)out;
|
||||
context.pwd = (uint8_t *)in;
|
||||
context.salt = (uint8_t*)salt;
|
||||
context.pwdlen = 0;
|
||||
context.allocate_cbk = NULL;
|
||||
context.free_cbk = NULL;
|
||||
|
||||
ar2_argon2_core(&context, type);
|
||||
}
|
||||
|
||||
void argon2hash(void *output, const void *input)
|
||||
{
|
||||
uint32_t _ALIGN(64) hashA[8], hashB[8];
|
||||
|
||||
my_scrypt((const unsigned char *)input, 80,
|
||||
(const unsigned char *)input, 80,
|
||||
(unsigned char *)hashA);
|
||||
|
||||
argon_call(hashB, hashA, hashA, (hashA[0] & MASK) == ZERO);
|
||||
|
||||
my_scrypt((const unsigned char *)hashB, 32,
|
||||
(const unsigned char *)hashB, 32,
|
||||
(unsigned char *)output);
|
||||
}
|
||||
|
||||
int scanhash_argon2( struct work* work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
uint32_t _ALIGN(64) hash[8];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint32_t nonce = first_nonce;
|
||||
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
do {
|
||||
be32enc(&endiandata[19], nonce);
|
||||
argon2hash(hash, endiandata);
|
||||
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
|
||||
pdata[19] = nonce;
|
||||
submit_solution( work, hash, mythr );
|
||||
}
|
||||
nonce++;
|
||||
} while (nonce < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool register_argon2_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
|
||||
gate->scanhash = (void*)&scanhash_argon2;
|
||||
gate->hash = (void*)&argon2hash;
|
||||
gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
|
||||
opt_target_factor = 65536.0;
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
@@ -68,7 +68,7 @@ bool register_argon2d_crds_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->scanhash = (void*)&scanhash_argon2d_crds;
|
||||
gate->hash = (void*)&argon2d_crds_hash;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
|
||||
opt_target_factor = 65536.0;
|
||||
return true;
|
||||
}
|
||||
@@ -137,7 +137,7 @@ bool register_argon2d_dyn_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->scanhash = (void*)&scanhash_argon2d_dyn;
|
||||
gate->hash = (void*)&argon2d_dyn_hash;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
|
||||
opt_target_factor = 65536.0;
|
||||
return true;
|
||||
}
|
||||
@@ -182,7 +182,7 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
|
||||
bool register_argon2d4096_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->scanhash = (void*)&scanhash_argon2d4096;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
|
||||
opt_target_factor = 65536.0;
|
||||
return true;
|
||||
}
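// Note: NEON_OPT is added alongside the x86 SIMD flags in the three argon2d
// gate registrations above, presumably to mark these algos as having NEON
// code paths on AArch64 builds.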
|
||||
@@ -28,7 +28,7 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <mm_malloc.h>
|
||||
//#include <mm_malloc.h>
|
||||
|
||||
#include "core.h"
|
||||
#include "argon2d_thread.h"
|
||||
@@ -100,7 +100,7 @@ int allocate_memory(const argon2_context *context, uint8_t **memory,
|
||||
if (context->allocate_cbk) {
|
||||
(context->allocate_cbk)(memory, memory_size);
|
||||
} else {
|
||||
*memory = _mm_malloc( memory_size, 64 );
|
||||
*memory = mm_malloc( memory_size, 64 );
|
||||
// *memory = malloc(memory_size);
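// mm_malloc()/mm_free() (no leading underscore) are presumably the project's
// portable aligned-allocation wrappers, replacing the x86-only _mm_malloc()
// and _mm_free() now that <mm_malloc.h> is commented out above, so this file
// can also build on AArch64.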
|
||||
}
|
||||
|
||||
@@ -119,7 +119,7 @@ void free_memory(const argon2_context *context, uint8_t *memory,
|
||||
(context->free_cbk)(memory, memory_size);
|
||||
} else {
|
||||
// free(memory);
|
||||
_mm_free( memory );
|
||||
mm_free( memory );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
#ifndef ARGON2_CORE_H
|
||||
#define ARGON2_CORE_H
|
||||
|
||||
#include "miner.h"
|
||||
#include "argon2.h"
|
||||
|
||||
#define CONST_CAST(x) (x)(uintptr_t)
|
||||
@@ -86,23 +86,27 @@ static void fill_block( __m512i *state, const block *ref_block,
|
||||
|
||||
#elif defined(__AVX2__)
|
||||
|
||||
static void fill_block(__m256i *state, const block *ref_block,
|
||||
block *next_block, int with_xor) {
|
||||
static void fill_block( __m256i *state, const block *ref_block,
|
||||
block *next_block, int with_xor )
|
||||
{
|
||||
__m256i block_XY[ARGON2_HWORDS_IN_BLOCK];
|
||||
unsigned int i;
|
||||
|
||||
if (with_xor) {
|
||||
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm256_xor_si256(
|
||||
state[i], _mm256_load_si256((const __m256i *)ref_block->v + i));
|
||||
block_XY[i] = _mm256_xor_si256(
|
||||
state[i], _mm256_load_si256((const __m256i *)next_block->v + i));
|
||||
if ( with_xor )
|
||||
{
|
||||
for ( i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++ )
|
||||
{
|
||||
state[i] = _mm256_xor_si256( state[i],
|
||||
_mm256_load_si256( (const __m256i*)ref_block->v + i) );
|
||||
block_XY[i] = _mm256_xor_si256( state[i],
|
||||
_mm256_load_si256( (const __m256i*)next_block->v + i) );
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
|
||||
block_XY[i] = state[i] = _mm256_xor_si256(
|
||||
state[i], _mm256_load_si256((const __m256i *)ref_block->v + i));
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++ )
|
||||
block_XY[i] = state[i] = _mm256_xor_si256( state[i],
|
||||
_mm256_load_si256( (const __m256i*)ref_block->v + i) );
|
||||
}
|
||||
|
||||
BLAKE2_ROUND_1( state[ 0], state[ 4], state[ 1], state[ 5],
|
||||
@@ -123,31 +127,36 @@ static void fill_block(__m256i *state, const block *ref_block,
|
||||
BLAKE2_ROUND_2( state[ 3], state[ 7], state[11], state[15],
|
||||
state[19], state[23], state[27], state[31] );
|
||||
|
||||
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm256_xor_si256(state[i], block_XY[i]);
|
||||
_mm256_store_si256((__m256i *)next_block->v + i, state[i]);
|
||||
for ( i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++ )
|
||||
{
|
||||
state[i] = _mm256_xor_si256( state[i], block_XY[i] );
|
||||
_mm256_store_si256( (__m256i*)next_block->v + i, state[i] );
|
||||
}
|
||||
}
|
||||
|
||||
#else // SSE2
|
||||
|
||||
static void fill_block( v128_t *state, const block *ref_block,
|
||||
block *next_block, int with_xor) {
|
||||
block *next_block, int with_xor )
|
||||
{
|
||||
v128_t block_XY[ARGON2_OWORDS_IN_BLOCK];
|
||||
unsigned int i;
|
||||
|
||||
if (with_xor) {
|
||||
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
|
||||
state[i] = v128_xor(
|
||||
state[i], v128_load((const v128_t *)ref_block->v + i));
|
||||
block_XY[i] = v128_xor(
|
||||
state[i], v128_load((const v128_t *)next_block->v + i));
|
||||
if ( with_xor )
|
||||
{
|
||||
for ( i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++ )
|
||||
{
|
||||
state[i] = v128_xor( state[i],
|
||||
v128_load( (const v128_t*)ref_block->v + i) );
|
||||
block_XY[i] = v128_xor( state[i],
|
||||
v128_load( (const v128_t*)next_block->v + i) );
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
|
||||
block_XY[i] = state[i] = v128_xor(
|
||||
state[i], v128_load((const v128_t *)ref_block->v + i));
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++ )
|
||||
block_XY[i] = state[i] = v128_xor( state[i],
|
||||
v128_load( (const v128_t*)ref_block->v + i) );
|
||||
}
|
||||
|
||||
BLAKE2_ROUND( state[ 0], state[ 1], state[ 2], state[ 3],
|
||||
@@ -184,9 +193,10 @@ static void fill_block( v128_t *state, const block *ref_block,
|
||||
BLAKE2_ROUND( state[ 7], state[15], state[23], state[31],
|
||||
state[39], state[47], state[55], state[63] );
|
||||
|
||||
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
|
||||
state[i] = v128_xor(state[i], block_XY[i]);
|
||||
v128_store((v128_t *)next_block->v + i, state[i]);
|
||||
for ( i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++ )
|
||||
{
|
||||
state[i] = v128_xor( state[i], block_XY[i] );
|
||||
v128_store( (v128_t*)next_block->v + i, state[i] );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -22,34 +22,13 @@
|
||||
#include "simd-utils.h"
|
||||
|
||||
#if !defined(__AVX512F__)
|
||||
|
||||
|
||||
#if !defined(__AVX2__)
|
||||
#if !defined(__XOP__)
|
||||
#if defined(__SSSE3__)
|
||||
#define r16 \
|
||||
(_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
|
||||
#define r24 \
|
||||
(_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
|
||||
#define v128_ror64(x, c) \
|
||||
(-(c) == 32) \
|
||||
? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
|
||||
: (-(c) == 24) \
|
||||
? _mm_shuffle_epi8((x), r24) \
|
||||
: (-(c) == 16) \
|
||||
? _mm_shuffle_epi8((x), r16) \
|
||||
: (-(c) == 63) \
|
||||
? v128_xor(v128_sr64((x), -(c)), \
|
||||
v128_add64((x), (x))) \
|
||||
: v128_xor(v128_sr64((x), -(c)), \
|
||||
v128_sl64((x), 64 - (-(c))))
|
||||
#else /* defined(__SSE2__) */
|
||||
#define v128_ror64(r, c) \
|
||||
v128_xor(v128_sr64((r), -(c)), v128_sl64((r), 64 - (-(c))))
|
||||
#endif
|
||||
#else
|
||||
#endif
|
||||
|
||||
|
||||
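// fBlaMka is Argon2's multiplication-hardened addition: per 64-bit lane it
// returns x + y + 2 * (lo32(x) * lo32(y)), with v128_mulw32 supplying the
// widening 32x32 -> 64-bit multiply.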
static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y) {
|
||||
const v128_t z = v128_mul32(x, y);
|
||||
const v128_t z = v128_mulw32(x, y);
|
||||
return v128_add64(v128_add64(x, y), v128_add64(z, z));
|
||||
}
|
||||
|
||||
@@ -61,8 +40,8 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y) {
|
||||
D0 = v128_xor(D0, A0); \
|
||||
D1 = v128_xor(D1, A1); \
|
||||
\
|
||||
D0 = v128_ror64(D0, -32); \
|
||||
D1 = v128_ror64(D1, -32); \
|
||||
D0 = v128_ror64(D0, 32); \
|
||||
D1 = v128_ror64(D1, 32); \
|
||||
\
|
||||
C0 = fBlaMka(C0, D0); \
|
||||
C1 = fBlaMka(C1, D1); \
|
||||
@@ -70,8 +49,8 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y) {
|
||||
B0 = v128_xor(B0, C0); \
|
||||
B1 = v128_xor(B1, C1); \
|
||||
\
|
||||
B0 = v128_ror64(B0, -24); \
|
||||
B1 = v128_ror64(B1, -24); \
|
||||
B0 = v128_ror64(B0, 24); \
|
||||
B1 = v128_ror64(B1, 24); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
@@ -82,8 +61,8 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y) {
|
||||
D0 = v128_xor(D0, A0); \
|
||||
D1 = v128_xor(D1, A1); \
|
||||
\
|
||||
D0 = v128_ror64(D0, -16); \
|
||||
D1 = v128_ror64(D1, -16); \
|
||||
D0 = v128_ror64(D0, 16); \
|
||||
D1 = v128_ror64(D1, 16); \
|
||||
\
|
||||
C0 = fBlaMka(C0, D0); \
|
||||
C1 = fBlaMka(C1, D1); \
|
||||
@@ -91,11 +70,12 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y) {
|
||||
B0 = v128_xor(B0, C0); \
|
||||
B1 = v128_xor(B1, C1); \
|
||||
\
|
||||
B0 = v128_ror64(B0, -63); \
|
||||
B1 = v128_ror64(B1, -63); \
|
||||
B0 = v128_ror64(B0, 63); \
|
||||
B1 = v128_ror64(B1, 63); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
#if defined(__SSSE3__) || defined(__ARM_NEON)
|
||||
|
||||
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
do { \
|
||||
v128_t t0 = v128_alignr8(B1, B0, 8); \
|
||||
@@ -129,7 +109,9 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y) {
|
||||
D0 = t1; \
|
||||
D1 = t0; \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#else /* SSE2 */
|
||||
|
||||
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
do { \
|
||||
v128_t t0 = D0; \
|
||||
@@ -273,43 +273,43 @@ static const unsigned sigma[16][16] = {
|
||||
/////////////////////////////////////////
|
||||
//
|
||||
// Blake-256 1 way SIMD
|
||||
// Only used for prehash, otherwise 4way is used with SSE2.
|
||||
// Only used for prehash, otherwise 4x32 is used with SSE2.
|
||||
|
||||
#define BLAKE256_ROUND( r ) \
|
||||
{ \
|
||||
V0 = v128_add32( V0, v128_add32( V1, \
|
||||
v128_set_32( CSx( r, 7 ) ^ Mx( r, 6 ), \
|
||||
v128_set32( CSx( r, 7 ) ^ Mx( r, 6 ), \
|
||||
CSx( r, 5 ) ^ Mx( r, 4 ), \
|
||||
CSx( r, 3 ) ^ Mx( r, 2 ), \
|
||||
CSx( r, 1 ) ^ Mx( r, 0 ) ) ) ); \
|
||||
V3 = v128_swap32_16( v128_xor( V3, V0 ) ); \
|
||||
V3 = v128_ror32( v128_xor( V3, V0 ), 16 ); \
|
||||
V2 = v128_add32( V2, V3 ); \
|
||||
V1 = v128_ror32( v128_xor( V1, V2 ), 12 ); \
|
||||
V0 = v128_add32( V0, v128_add32( V1, \
|
||||
v128_set_32( CSx( r, 6 ) ^ Mx( r, 7 ), \
|
||||
v128_set32( CSx( r, 6 ) ^ Mx( r, 7 ), \
|
||||
CSx( r, 4 ) ^ Mx( r, 5 ), \
|
||||
CSx( r, 2 ) ^ Mx( r, 3 ), \
|
||||
CSx( r, 0 ) ^ Mx( r, 1 ) ) ) ); \
|
||||
V3 = v128_shuflr32_8( v128_xor( V3, V0 ) ); \
|
||||
V3 = v128_ror32( v128_xor( V3, V0 ), 8 ); \
|
||||
V2 = v128_add32( V2, V3 ); \
|
||||
V1 = v128_ror32( v128_xor( V1, V2 ), 7 ); \
|
||||
V0 = v128_shufll32( V0 ); \
|
||||
V3 = v128_swap64( V3 ); \
|
||||
V2 = v128_shuflr32( V2 ); \
|
||||
V0 = v128_add32( V0, v128_add32( V1, \
|
||||
v128_set_32( CSx( r, D ) ^ Mx( r, C ), \
|
||||
v128_set32( CSx( r, D ) ^ Mx( r, C ), \
|
||||
CSx( r, B ) ^ Mx( r, A ), \
|
||||
CSx( r, 9 ) ^ Mx( r, 8 ), \
|
||||
CSx( r, F ) ^ Mx( r, E ) ) ) ); \
|
||||
V3 = v128_swap32_16( v128_xor( V3, V0 ) ); \
|
||||
V3 = v128_ror32( v128_xor( V3, V0 ), 16 ); \
|
||||
V2 = v128_add32( V2, V3 ); \
|
||||
V1 = v128_ror32( v128_xor( V1, V2 ), 12 ); \
|
||||
V0 = v128_add32( V0, v128_add32( V1, \
|
||||
v128_set_32( CSx( r, C ) ^ Mx( r, D ), \
|
||||
v128_set32( CSx( r, C ) ^ Mx( r, D ), \
|
||||
CSx( r, A ) ^ Mx( r, B ), \
|
||||
CSx( r, 8 ) ^ Mx( r, 9 ), \
|
||||
CSx( r, E ) ^ Mx( r, F ) ) ) ); \
|
||||
V3 = v128_shuflr32_8( v128_xor( V3, V0 ) ); \
|
||||
V3 = v128_ror32( v128_xor( V3, V0 ), 8 ); \
|
||||
V2 = v128_add32( V2, V3 ); \
|
||||
V1 = v128_ror32( v128_xor( V1, V2 ), 7 ); \
|
||||
V0 = v128_shuflr32( V0 ); \
|
||||
@@ -325,8 +325,8 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
uint32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
|
||||
V0 = casti_v128( H, 0 );
|
||||
V1 = casti_v128( H, 1 );
|
||||
V2 = v128_set_32( 0x03707344, 0x13198A2E, 0x85A308D3, 0x243F6A88 );
|
||||
V3 = v128_set_32( T1 ^ 0xEC4E6C89, T1 ^ 0x082EFA98,
|
||||
V2 = v128_set32( 0x03707344, 0x13198A2E, 0x85A308D3, 0x243F6A88 );
|
||||
V3 = v128_set32( T1 ^ 0xEC4E6C89, T1 ^ 0x082EFA98,
|
||||
T0 ^ 0x299F31D0, T0 ^ 0xA4093822 );
|
||||
M0 = buf[ 0];
|
||||
M1 = buf[ 1];
|
||||
@@ -367,39 +367,37 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
|
||||
////////////////////////////////////////////
|
||||
//
|
||||
// Blake-256 4 way
|
||||
// Blake-256 4 way SSE2, NEON
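// GS_4X32 below is one BLAKE-256 G quarter-round computed on four independent
// lanes at once: add, xor-rotate by 16, add, rotate by 12, then the same with
// the other message/constant pair and rotates by 8 and 7. The byte-shuffle
// rotate helpers are replaced by generic v128_ror32 so the same code builds
// for both SSE2 and NEON.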
|
||||
|
||||
#define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
|
||||
#define GS_4X32( m0, m1, c0, c1, a, b, c, d ) \
|
||||
{ \
|
||||
a = v128_add32( v128_add32( a, b ), \
|
||||
v128_xor( v128_32( c1 ), m0 ) ); \
|
||||
d = v128_swap32_16( v128_xor( d, a ) ); \
|
||||
a = v128_add32( v128_add32( a, b ), v128_xor( v128_32( c1 ), m0 ) ); \
|
||||
d = v128_ror32( v128_xor( d, a ), 16 ); \
|
||||
c = v128_add32( c, d ); \
|
||||
b = v128_ror32( v128_xor( b, c ), 12 ); \
|
||||
a = v128_add32( v128_add32( a, b ), \
|
||||
v128_xor( v128_32( c0 ), m1 ) ); \
|
||||
d = v128_shuflr32_8( v128_xor( d, a ) ); \
|
||||
a = v128_add32( v128_add32( a, b ), v128_xor( v128_32( c0 ), m1 ) ); \
|
||||
d = v128_ror32( v128_xor( d, a ), 8 ); \
|
||||
c = v128_add32( c, d ); \
|
||||
b = v128_ror32( v128_xor( b, c ), 7 ); \
|
||||
}
|
||||
|
||||
#define ROUND_S_4WAY(r) \
|
||||
#define ROUND_S_4X32(r) \
|
||||
{ \
|
||||
GS_4WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
|
||||
GS_4WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
|
||||
GS_4WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
|
||||
GS_4WAY(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
|
||||
GS_4WAY(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
|
||||
GS_4WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
|
||||
GS_4WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
|
||||
GS_4WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
|
||||
GS_4X32(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
|
||||
GS_4X32(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
|
||||
GS_4X32(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
|
||||
GS_4X32(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
|
||||
GS_4X32(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
|
||||
GS_4X32(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
|
||||
GS_4X32(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
|
||||
GS_4X32(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
|
||||
}
|
||||
|
||||
#define DECL_STATE32_4WAY \
|
||||
#define DECL_STATE32_4X32 \
|
||||
v128_t H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
uint32_t T0, T1;
|
||||
|
||||
#define READ_STATE32_4WAY(state) do { \
|
||||
#define READ_STATE32_4X32(state) do { \
|
||||
H0 = casti_v128( state->H, 0 ); \
|
||||
H1 = casti_v128( state->H, 1 ); \
|
||||
H2 = casti_v128( state->H, 2 ); \
|
||||
@@ -412,7 +410,7 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
T1 = (state)->T1; \
|
||||
} while (0)
|
||||
|
||||
#define WRITE_STATE32_4WAY(state) do { \
|
||||
#define WRITE_STATE32_4X32(state) do { \
|
||||
casti_v128( state->H, 0 ) = H0; \
|
||||
casti_v128( state->H, 1 ) = H1; \
|
||||
casti_v128( state->H, 2 ) = H2; \
|
||||
@@ -428,9 +426,9 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
#define BLAKE256_4WAY_BLOCK_BSWAP32 \
|
||||
#define BLAKE256_4X32_BLOCK_BSWAP32 \
|
||||
{ \
|
||||
v128_t shuf_bswap32 = _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
|
||||
v128_t shuf_bswap32 = v128_set64( 0x0c0d0e0f08090a0b, \
|
||||
0x0405060700010203 ); \
|
||||
M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
|
||||
M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
|
||||
@@ -452,7 +450,7 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
|
||||
#else // SSE2
|
||||
|
||||
#define BLAKE256_4WAY_BLOCK_BSWAP32 \
|
||||
#define BLAKE256_4X32_BLOCK_BSWAP32 \
|
||||
{ \
|
||||
M0 = v128_bswap32( buf[0] ); \
|
||||
M1 = v128_bswap32( buf[1] ); \
|
||||
@@ -474,7 +472,7 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
|
||||
#endif // SSSE3 else SSE2
|
||||
|
||||
#define COMPRESS32_4WAY( rounds ) \
|
||||
#define COMPRESS32_4X32( rounds ) \
|
||||
{ \
|
||||
v128_t M0, M1, M2, M3, M4, M5, M6, M7; \
|
||||
v128_t M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
@@ -488,31 +486,31 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = v128_64( 0x243F6A88243F6A88 ); \
|
||||
V9 = v128_64( 0x85A308D385A308D3 ); \
|
||||
VA = v128_64( 0x13198A2E13198A2E ); \
|
||||
VB = v128_64( 0x0370734403707344 ); \
|
||||
VC = v128_32( T0 ^ 0xA4093822 ); \
|
||||
VD = v128_32( T0 ^ 0x299F31D0 ); \
|
||||
VE = v128_32( T1 ^ 0x082EFA98 ); \
|
||||
VF = v128_32( T1 ^ 0xEC4E6C89 ); \
|
||||
BLAKE256_4WAY_BLOCK_BSWAP32; \
|
||||
ROUND_S_4WAY(0); \
|
||||
ROUND_S_4WAY(1); \
|
||||
ROUND_S_4WAY(2); \
|
||||
ROUND_S_4WAY(3); \
|
||||
ROUND_S_4WAY(4); \
|
||||
ROUND_S_4WAY(5); \
|
||||
ROUND_S_4WAY(6); \
|
||||
ROUND_S_4WAY(7); \
|
||||
V8 = v128_32( 0x243F6A88 ); \
|
||||
V9 = v128_32( 0x85A308D3 ); \
|
||||
VA = v128_32( 0x13198A2E ); \
|
||||
VB = v128_32( 0x03707344 ); \
|
||||
VC = v128_32( 0xA4093822 ^ T0 ); \
|
||||
VD = v128_32( 0x299F31D0 ^ T0 ); \
|
||||
VE = v128_32( 0x082EFA98 ^ T1 ); \
|
||||
VF = v128_32( 0xEC4E6C89 ^ T1 ); \
|
||||
BLAKE256_4X32_BLOCK_BSWAP32; \
|
||||
ROUND_S_4X32(0); \
|
||||
ROUND_S_4X32(1); \
|
||||
ROUND_S_4X32(2); \
|
||||
ROUND_S_4X32(3); \
|
||||
ROUND_S_4X32(4); \
|
||||
ROUND_S_4X32(5); \
|
||||
ROUND_S_4X32(6); \
|
||||
ROUND_S_4X32(7); \
|
||||
if (rounds == 14) \
|
||||
{ \
|
||||
ROUND_S_4WAY(8); \
|
||||
ROUND_S_4WAY(9); \
|
||||
ROUND_S_4WAY(0); \
|
||||
ROUND_S_4WAY(1); \
|
||||
ROUND_S_4WAY(2); \
|
||||
ROUND_S_4WAY(3); \
|
||||
ROUND_S_4X32(8); \
|
||||
ROUND_S_4X32(9); \
|
||||
ROUND_S_4X32(0); \
|
||||
ROUND_S_4X32(1); \
|
||||
ROUND_S_4X32(2); \
|
||||
ROUND_S_4X32(3); \
|
||||
} \
|
||||
H0 = v128_xor( v128_xor( V8, V0 ), H0 ); \
|
||||
H1 = v128_xor( v128_xor( V9, V1 ), H1 ); \
|
||||
@@ -524,6 +522,438 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
H7 = v128_xor( v128_xor( VF, V7 ), H7 ); \
|
||||
}
|
||||
|
||||
#define G256_4X32_ALT( a, b, c, d, m0, m1 ) \
|
||||
{ \
|
||||
a = v128_add32( v128_add32( a, b ), m0 ); \
|
||||
d = v128_ror32( v128_xor( d, a ), 16 ); \
|
||||
c = v128_add32( c, d ); \
|
||||
b = v128_ror32( v128_xor( b, c ), 12 ); \
|
||||
a = v128_add32( v128_add32( a, b ), m1 ); \
|
||||
d = v128_ror32( v128_xor( d, a ), 8 ); \
|
||||
c = v128_add32( c, d ); \
|
||||
b = v128_ror32( v128_xor( b, c ), 7 ); \
|
||||
}
|
||||
|
||||
// Message expansion optimized to ignore padding M[5..12,14] for each round.
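// Because M[5..12] and M[14] are zero in this padded second block, the usual
// ( Mi ^ CSj ) term collapses to just the constant, which is why many of the
// G arguments below are plain v128_32( CSx ) with no message word.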
|
||||
#define ROUND_S_4X32_0 \
|
||||
{ \
|
||||
G256_4X32_ALT( V0, V4, V8, VC, v128_xor( M0, v128_32( CS1 ) ), \
|
||||
v128_xor( M1, v128_32( CS0 ) ) ); \
|
||||
G256_4X32_ALT( V1, V5, V9, VD, v128_xor( M2, v128_32( CS3 ) ), \
|
||||
v128_xor( M3, v128_32( CS2 ) ) ); \
|
||||
G256_4X32_ALT( V2, V6, VA, VE, v128_xor( M4, v128_32( CS5 ) ), \
|
||||
v128_32( CS4 ) ); \
|
||||
G256_4X32_ALT( V3, V7, VB, VF, v128_32( CS7 ) , \
|
||||
v128_32( CS6 ) ); \
|
||||
G256_4X32_ALT( V0, V5, VA, VF, v128_32( CS9 ) , \
|
||||
v128_32( CS8 ) ); \
|
||||
G256_4X32_ALT( V1, V6, VB, VC, v128_32( CSB ) , \
|
||||
v128_32( CSA ) ); \
|
||||
G256_4X32_ALT( V2, V7, V8, VD, v128_32( CSD ) , \
|
||||
v128_xor( MD, v128_32( CSC ) ) ); \
|
||||
G256_4X32_ALT( V3, V4, V9, VE, v128_32( CSF ) , \
|
||||
v128_xor( MF, v128_32( CSE ) ) ); \
|
||||
}
|
||||
|
||||
#define ROUND_S_4X32_1 \
|
||||
{ \
|
||||
G256_4X32_ALT( V0, V4, V8, VC, v128_32( CSA ) , \
|
||||
v128_32( CSE ) ); \
|
||||
G256_4X32_ALT( V1, V5, V9, VD, v128_xor( M4, v128_32( CS8 ) ), \
|
||||
v128_32( CS4 ) ); \
|
||||
G256_4X32_ALT( V2, V6, VA, VE, v128_32( CSF ) , \
|
||||
v128_xor( MF, v128_32( CS9 ) ) ); \
|
||||
G256_4X32_ALT( V3, V7, VB, VF, v128_xor( MD, v128_32( CS6 ) ), \
|
||||
v128_32( CSD ) ); \
|
||||
G256_4X32_ALT( V0, V5, VA, VF, v128_xor( M1, v128_32( CSC ) ), \
|
||||
v128_32( CS1 ) ); \
|
||||
G256_4X32_ALT( V1, V6, VB, VC, v128_xor( M0, v128_32( CS2 ) ), \
|
||||
v128_xor( M2, v128_32( CS0 ) ) ); \
|
||||
G256_4X32_ALT( V2, V7, V8, VD, v128_32( CS7 ) , \
|
||||
v128_32( CSB ) ); \
|
||||
G256_4X32_ALT( V3, V4, V9, VE, v128_32( CS3 ) , \
|
||||
v128_xor( M3, v128_32( CS5 ) ) ); \
|
||||
}
|
||||
|
||||
#define ROUND_S_4X32_2 \
|
||||
{ \
|
||||
G256_4X32_ALT( V0, V4, V8, VC, v128_32( CS8 ) , \
|
||||
v128_32( CSB ) ); \
|
||||
G256_4X32_ALT( V1, V5, V9, VD, v128_32( CS0 ) , \
|
||||
v128_xor( M0, v128_32( CSC ) ) ); \
|
||||
G256_4X32_ALT( V2, V6, VA, VE, v128_32( CS2 ) , \
|
||||
v128_xor( M2, v128_32( CS5 ) ) ); \
|
||||
G256_4X32_ALT( V3, V7, VB, VF, v128_xor( MF, v128_32( CSD ) ), \
|
||||
v128_xor( MD, v128_32( CSF ) ) ); \
|
||||
G256_4X32_ALT( V0, V5, VA, VF, v128_32( CSE ) , \
|
||||
v128_32( CSA ) ); \
|
||||
G256_4X32_ALT( V1, V6, VB, VC, v128_xor( M3, v128_32( CS6 ) ), \
|
||||
v128_32( CS3 ) ); \
|
||||
G256_4X32_ALT( V2, V7, V8, VD, v128_32( CS1 ) , \
|
||||
v128_xor( M1, v128_32( CS7 ) ) ); \
|
||||
G256_4X32_ALT( V3, V4, V9, VE, v128_32( CS4 ) , \
|
||||
v128_xor( M4, v128_32( CS9 ) ) ); \
|
||||
}
|
||||
|
||||
#define ROUND_S_4X32_3 \
|
||||
{ \
|
||||
G256_4X32_ALT( V0, V4, V8, VC, v128_32( CS9 ) , \
|
||||
v128_32( CS7 ) ); \
|
||||
G256_4X32_ALT( V1, V5, V9, VD, \
|
||||
v128_xor( M3, v128_32( CS1 ) ), \
|
||||
v128_xor( M1, v128_32( CS3 ) ) ); \
|
||||
G256_4X32_ALT( V2, V6, VA, VE, v128_xor( MD, v128_32( CSC ) ), \
|
||||
v128_32( CSD ) ); \
|
||||
G256_4X32_ALT( V3, V7, VB, VF, v128_32( CSE ) , \
|
||||
v128_32( CSB ) ); \
|
||||
G256_4X32_ALT( V0, V5, VA, VF, \
|
||||
v128_xor( M2, v128_32( CS6 ) ), \
|
||||
v128_32( CS2 ) ); \
|
||||
G256_4X32_ALT( V1, V6, VB, VC, v128_32( CSA ) , \
|
||||
v128_32( CS5 ) ); \
|
||||
G256_4X32_ALT( V2, V7, V8, VD, v128_xor( M4, v128_32( CS0 ) ), \
|
||||
v128_xor( M0, v128_32( CS4 ) ) ); \
|
||||
G256_4X32_ALT( V3, V4, V9, VE, \
|
||||
v128_xor( MF, v128_32( CS8 ) ), \
|
||||
v128_32( CSF ) ); \
|
||||
}
|
||||
|
||||
#define ROUND_S_4X32_4 \
|
||||
{ \
|
||||
G256_4X32_ALT( V0, V4, V8, VC, v128_32( CS0 ) , \
|
||||
v128_xor( M0, v128_32( CS9 ) ) ); \
|
||||
G256_4X32_ALT( V1, V5, V9, VD, v128_32( CS7 ) , \
|
||||
v128_32( CS5 ) ); \
|
||||
G256_4X32_ALT( V2, V6, VA, VE, v128_xor( M2, v128_32( CS4 ) ), \
|
||||
v128_xor( M4, v128_32( CS2 ) ) ); \
|
||||
G256_4X32_ALT( V3, V7, VB, VF, v128_32( CSF ) , \
|
||||
v128_xor( MF, v128_32( CSA ) ) ); \
|
||||
G256_4X32_ALT( V0, V5, VA, VF, v128_32( CS1 ) , \
|
||||
v128_xor( M1, v128_32( CSE ) ) ); \
|
||||
G256_4X32_ALT( V1, V6, VB, VC, v128_32( CSC ) , \
|
||||
v128_32( CSB ) ); \
|
||||
G256_4X32_ALT( V2, V7, V8, VD, v128_32( CS8 ) , \
|
||||
v128_32( CS6 ) ); \
|
||||
G256_4X32_ALT( V3, V4, V9, VE, v128_xor( M3, v128_32( CSD ) ), \
|
||||
v128_xor( MD, v128_32( CS3 ) ) ); \
|
||||
}
|
||||
#define ROUND_S_4X32_5 \
|
||||
{ \
|
||||
G256_4X32_ALT( V0, V4, V8, VC, v128_xor( M2, v128_32( CSC ) ), \
|
||||
v128_32( CS2 ) ); \
|
||||
G256_4X32_ALT( V1, V5, V9, VD, v128_32( CSA ) , \
|
||||
v128_32( CS6 ) ); \
|
||||
G256_4X32_ALT( V2, V6, VA, VE, \
|
||||
v128_xor( M0, v128_32( CSB ) ), \
|
||||
v128_32( CS0 ) ); \
|
||||
G256_4X32_ALT( V3, V7, VB, VF, v128_32( CS3 ) , \
|
||||
v128_xor( M3, v128_32( CS8 ) ) ); \
|
||||
G256_4X32_ALT( V0, V5, VA, VF, v128_xor( M4, v128_32( CSD ) ), \
|
||||
v128_xor( MD, v128_32( CS4 ) ) ); \
|
||||
G256_4X32_ALT( V1, V6, VB, VC, v128_32( CS5 ) , \
|
||||
v128_32( CS7 ) ); \
|
||||
G256_4X32_ALT( V2, V7, V8, VD, \
|
||||
v128_xor( MF, v128_32( CSE ) ), \
|
||||
v128_32( CSF ) ); \
|
||||
G256_4X32_ALT( V3, V4, V9, VE, \
|
||||
v128_xor( M1, v128_32( CS9 ) ), \
|
||||
v128_32( CS1 ) ); \
|
||||
}
|
||||
#define ROUND_S_4X32_6 \
|
||||
{ \
|
||||
G256_4X32_ALT( V0, V4, V8, VC, v128_32( CS5 ) , \
|
||||
v128_32( CSC ) ); \
|
||||
G256_4X32_ALT( V1, V5, V9, VD, v128_xor( M1, v128_32( CSF ) ), \
|
||||
v128_xor( MF, v128_32( CS1 ) ) ); \
|
||||
G256_4X32_ALT( V2, V6, VA, VE, v128_32( CSD ) , \
|
||||
v128_xor( MD, v128_32( CSE ) ) );\
|
||||
G256_4X32_ALT( V3, V7, VB, VF, v128_xor( M4, v128_32( CSA ) ), \
|
||||
v128_32( CS4 ) ); \
|
||||
G256_4X32_ALT( V0, V5, VA, VF, v128_xor( M0, v128_32( CS7 ) ), \
|
||||
v128_32( CS0 ) ); \
|
||||
G256_4X32_ALT( V1, V6, VB, VC, v128_32( CS3 ) , \
|
||||
v128_xor( M3, v128_32( CS6 ) ) ); \
|
||||
G256_4X32_ALT( V2, V7, V8, VD, v128_32( CS2 ) , \
|
||||
v128_xor( M2, v128_32( CS9 ) ) ); \
|
||||
G256_4X32_ALT( V3, V4, V9, VE, v128_32( CSB ) , \
|
||||
v128_32( CS8 ) ); \
|
||||
}
|
||||
|
||||
#define ROUND_S_4X32_7 \
|
||||
{ \
|
||||
G256_4X32_ALT( V0, V4, V8, VC, v128_xor( MD, v128_32( CSB ) ), \
|
||||
v128_32( CSD ) ); \
|
||||
G256_4X32_ALT( V1, V5, V9, VD, v128_32( CSE ) , \
|
||||
v128_32( CS7 ) ); \
|
||||
G256_4X32_ALT( V2, V6, VA, VE, v128_32( CS1 ) , \
|
||||
v128_xor( M1, v128_32( CSC ) ) ); \
|
||||
G256_4X32_ALT( V3, V7, VB, VF, v128_xor( M3, v128_32( CS9 ) ), \
|
||||
v128_32( CS3 ) ); \
|
||||
G256_4X32_ALT( V0, V5, VA, VF, v128_32( CS0 ) , \
|
||||
v128_xor( M0, v128_32( CS5 ) ) ); \
|
||||
G256_4X32_ALT( V1, V6, VB, VC, v128_xor( MF, v128_32( CS4 ) ), \
|
||||
v128_xor( M4, v128_32( CSF ) ) ); \
|
||||
G256_4X32_ALT( V2, V7, V8, VD, v128_32( CS6 ) , \
|
||||
v128_32( CS8 ) ); \
|
||||
G256_4X32_ALT( V3, V4, V9, VE, v128_xor( M2, v128_32( CSA ) ), \
|
||||
v128_32( CS2 ) ); \
|
||||
}
|
||||
|
||||
#define ROUND_S_4X32_8 \
|
||||
{ \
|
||||
G256_4X32_ALT( V0, V4, V8, VC, v128_32( CSF ), \
|
||||
v128_xor( MF, v128_32( CS6 ) ) ); \
|
||||
G256_4X32_ALT( V1, V5, V9, VD, v128_32( CS9 ) , \
|
||||
v128_32( CSE ) ); \
|
||||
G256_4X32_ALT( V2, V6, VA, VE, v128_32( CS3 ) , \
|
||||
v128_xor( M3, v128_32( CSB ) ) ); \
|
||||
G256_4X32_ALT( V3, V7, VB, VF, v128_xor( M0, v128_32( CS8 ) ), \
|
||||
v128_32( CS0 ) ); \
|
||||
G256_4X32_ALT( V0, V5, VA, VF, v128_32( CS2 ) , \
|
||||
v128_xor( M2, v128_32( CSC ) ) ); \
|
||||
G256_4X32_ALT( V1, V6, VB, VC, \
|
||||
v128_xor( MD, v128_32( CS7 ) ), \
|
||||
v128_32( CSD ) ); \
|
||||
G256_4X32_ALT( V2, V7, V8, VD, v128_xor( M1, v128_32( CS4 ) ), \
|
||||
v128_xor( M4, v128_32( CS1 ) ) ); \
|
||||
G256_4X32_ALT( V3, V4, V9, VE, v128_32( CS5 ) , \
|
||||
v128_32( CSA ) ); \
|
||||
}
|
||||
|
||||
#define ROUND_S_4X32_9 \
|
||||
{ \
|
||||
G256_4X32_ALT( V0, V4, V8, VC, v128_32( CS2 ) , \
|
||||
v128_xor( M2, v128_32( CSA ) ) ); \
|
||||
G256_4X32_ALT( V1, V5, V9, VD, v128_32( CS4 ) , \
|
||||
v128_xor( M4, v128_32( CS8 ) ) ); \
|
||||
G256_4X32_ALT( V2, V6, VA, VE, v128_32( CS6 ) , \
|
||||
v128_32( CS7 ) ); \
|
||||
G256_4X32_ALT( V3, V7, VB, VF, v128_xor( M1, v128_32( CS5 ) ), \
|
||||
v128_32( CS1 ) ); \
|
||||
G256_4X32_ALT( V0, V5, VA, VF, v128_xor( MF, v128_32( CSB ) ), \
|
||||
v128_32( CSF ) ); \
|
||||
G256_4X32_ALT( V1, V6, VB, VC, v128_32( CSE ) , \
|
||||
v128_32( CS9 ) ); \
|
||||
G256_4X32_ALT( V2, V7, V8, VD, v128_xor( M3, v128_32( CSC ) ), \
|
||||
v128_32( CS3 ) ); \
|
||||
G256_4X32_ALT( V3, V4, V9, VE, v128_xor( MD, v128_32( CS0 ) ), \
|
||||
v128_xor( M0, v128_32( CSD ) ) ); \
|
||||
}
|
||||
|
||||
void blake256_4x32_round0_prehash_le( void *midstate, const void *midhash,
|
||||
void *data )
|
||||
{
|
||||
v128_t *M = (v128_t*)data;
|
||||
v128_t *V = (v128_t*)midstate;
|
||||
const v128_t *H = (const v128_t*)midhash;
|
||||
|
||||
V[ 0] = H[0];
|
||||
V[ 1] = H[1];
|
||||
V[ 2] = H[2];
|
||||
V[ 3] = H[3];
|
||||
V[ 4] = H[4];
|
||||
V[ 5] = H[5];
|
||||
V[ 6] = H[6];
|
||||
V[ 7] = H[7];
|
||||
V[ 8] = v128_32( CS0 );
|
||||
V[ 9] = v128_32( CS1 );
|
||||
V[10] = v128_32( CS2 );
|
||||
V[11] = v128_32( CS3 );
|
||||
V[12] = v128_32( CS4 ^ 0x280 );
|
||||
V[13] = v128_32( CS5 ^ 0x280 );
|
||||
V[14] = v128_32( CS6 );
|
||||
V[15] = v128_32( CS7 );
|
||||
|
||||
// M[ 0:3 ] contain new message data including unique nonces in M[ 3].
|
||||
// M[ 5:12,14 ] are always zero and not needed or used.
|
||||
// M[ 4], M[13], M[15] are constant and are initialized here.
|
||||
// M[ 5] is a special case, used as a cache for (M[13] ^ CSC).
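// Caching (M[13] ^ CSC) in M[ 5] is safe because message word 5 is part of the
// all-zero padding and is never loaded as data; blake256_4x32_final_rounds_le
// reads it back as MDxorCSC when it finishes G6 of round 0.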
|
||||
|
||||
M[ 4] = v128_32( 0x80000000 );
|
||||
M[13] = v128_32( 1 );
|
||||
M[15] = v128_32( 80*8 );
|
||||
|
||||
M[ 5] = v128_xor( M[13], v128_32( CSC ) );
|
||||
|
||||
// G0
|
||||
GS_4X32( M[ 0], M[ 1], CS0, CS1, V[ 0], V[ 4], V[ 8], V[12] );
|
||||
|
||||
// G1
|
||||
V[ 1] = v128_add32( v128_add32( V[ 1], V[ 5] ),
|
||||
v128_xor( v128_32( CS3 ), M[ 2] ) );
|
||||
V[13] = v128_ror32( v128_xor( V[13], V[ 1] ), 16 );
|
||||
V[ 9] = v128_add32( V[ 9], V[13] );
|
||||
V[ 5] = v128_ror32( v128_xor( V[ 5], V[ 9] ), 12 );
|
||||
V[ 1] = v128_add32( V[ 1], V[ 5] );
|
||||
|
||||
// G2
|
||||
// GS_4X32( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
|
||||
V[ 2] = v128_add32( v128_add32( V[ 2], V[ 6] ),
|
||||
v128_xor( v128_32( CS5 ), M[ 4] ) );
|
||||
V[14] = v128_ror32( v128_xor( V[14], V[ 2] ), 16 );
|
||||
V[10] = v128_add32( V[10], V[14] );
|
||||
V[ 6] = v128_ror32( v128_xor( V[ 6], V[10] ), 12 );
|
||||
V[ 2] = v128_add32( v128_add32( V[ 2], V[ 6] ), v128_32( CS4 ) );
|
||||
V[14] = v128_ror32( v128_xor( V[14], V[ 2] ), 8 );
|
||||
V[10] = v128_add32( V[10], V[14] );
|
||||
V[ 6] = v128_ror32( v128_xor( V[ 6], V[10] ), 7 );
|
||||
|
||||
// G3
|
||||
// GS_4X32( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
|
||||
V[ 3] = v128_add32( v128_add32( V[ 3], V[ 7] ), v128_32( CS7 ) );
|
||||
V[15] = v128_ror32( v128_xor( V[15], V[ 3] ), 16 );
|
||||
V[11] = v128_add32( V[11], V[15] );
|
||||
V[ 7] = v128_ror32( v128_xor( V[ 7], V[11] ), 12 );
|
||||
V[ 3] = v128_add32( v128_add32( V[ 3], V[ 7] ), v128_32( CS6 ) );
|
||||
V[15] = v128_ror32( v128_xor( V[15], V[ 3] ), 8 );
|
||||
V[11] = v128_add32( V[11], V[15] );
|
||||
V[ 7] = v128_ror32( v128_xor( V[ 7], V[11] ), 7 );
|
||||
|
||||
// G4
|
||||
V[ 0] = v128_add32( V[ 0], v128_32( CS9 ) );
|
||||
|
||||
// G5
|
||||
// GS_4X32( M[10], M[11], CSA, CSB, V1, V6, VB, VC );
|
||||
|
||||
// G6
|
||||
V[ 2] = v128_add32( v128_add32( V[ 2], V[ 7] ), v128_32( CSD ) );
|
||||
|
||||
// G7
|
||||
V[ 3] = v128_add32( v128_add32( V[ 3], V[ 4] ), v128_32( CSF ) );
|
||||
V[14] = v128_ror32( v128_xor( V[14], V[ 3] ), 16 );
|
||||
V[ 3] = v128_add32( V[ 3], v128_xor( v128_32( CSE ), M[15] ) );
|
||||
}
|
||||
|
||||
void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
|
||||
const void *midhash, const void *data, const int rounds )
|
||||
{
|
||||
v128_t *H = (v128_t*)final_hash;
|
||||
const v128_t *h = (const v128_t*)midhash;
|
||||
v128_t V0, V1, V2, V3, V4, V5, V6, V7;
|
||||
v128_t V8, V9, VA, VB, VC, VD, VE, VF;
|
||||
v128_t M0, M1, M2, M3, M4, MD, MF;
|
||||
v128_t MDxorCSC;
|
||||
|
||||
V0 = v128_load( (v128_t*)midstate + 0 );
|
||||
V1 = v128_load( (v128_t*)midstate + 1 );
|
||||
V2 = v128_load( (v128_t*)midstate + 2 );
|
||||
V3 = v128_load( (v128_t*)midstate + 3 );
|
||||
V4 = v128_load( (v128_t*)midstate + 4 );
|
||||
V5 = v128_load( (v128_t*)midstate + 5 );
|
||||
V6 = v128_load( (v128_t*)midstate + 6 );
|
||||
V7 = v128_load( (v128_t*)midstate + 7 );
|
||||
V8 = v128_load( (v128_t*)midstate + 8 );
|
||||
V9 = v128_load( (v128_t*)midstate + 9 );
|
||||
VA = v128_load( (v128_t*)midstate + 10 );
|
||||
VB = v128_load( (v128_t*)midstate + 11 );
|
||||
VC = v128_load( (v128_t*)midstate + 12 );
|
||||
VD = v128_load( (v128_t*)midstate + 13 );
|
||||
VE = v128_load( (v128_t*)midstate + 14 );
|
||||
VF = v128_load( (v128_t*)midstate + 15 );
|
||||
|
||||
M0 = v128_load( (v128_t*)data + 0 );
|
||||
M1 = v128_load( (v128_t*)data + 1 );
|
||||
M2 = v128_load( (v128_t*)data + 2 );
|
||||
M3 = v128_load( (v128_t*)data + 3 );
|
||||
M4 = v128_load( (v128_t*)data + 4 );
|
||||
// M5 to MC & ME are zero padding and are optimised out.
|
||||
MD = v128_load( (v128_t*)data + 13 );
|
||||
MF = v128_load( (v128_t*)data + 15 );
|
||||
// precalculated MD^CSC, used in round0 G6.
|
||||
MDxorCSC = v128_load( (v128_t*)data + 5 );
|
||||
|
||||
// Finish round 0 with nonce in M3
|
||||
// G1
|
||||
V1 = v128_add32( V1,
|
||||
v128_xor( v128_32( CS2 ), M3 ) );
|
||||
VD = v128_ror32( v128_xor( VD, V1 ), 8 );
|
||||
V9 = v128_add32( V9, VD );
|
||||
V5 = v128_ror32( v128_xor( V5, V9 ), 7 );
|
||||
|
||||
// G4
|
||||
V0 = v128_add32( V0, V5 );
|
||||
VF = v128_ror32( v128_xor( VF, V0 ), 16 );
|
||||
VA = v128_add32( VA, VF );
|
||||
V5 = v128_ror32( v128_xor( V5, VA ), 12 );
|
||||
V0 = v128_add32( V0, v128_add32( V5, v128_32( CS8 ) ) );
|
||||
VF = v128_ror32( v128_xor( VF, V0 ), 8 );
|
||||
VA = v128_add32( VA, VF );
|
||||
V5 = v128_ror32( v128_xor( V5, VA ), 7 );
|
||||
|
||||
// G5
|
||||
// GS_4X32( MA, MB, CSA, CSB, V1, V6, VB, VC );
|
||||
V1 = v128_add32( v128_add32( V1, V6 ), v128_32( CSB ) );
|
||||
VC = v128_ror32( v128_xor( VC, V1 ), 16 );
|
||||
VB = v128_add32( VB, VC );
|
||||
V6 = v128_ror32( v128_xor( V6, VB ), 12 );
|
||||
V1 = v128_add32( v128_add32( V1, V6 ), v128_32( CSA ) );
|
||||
VC = v128_ror32( v128_xor( VC, V1 ), 8 );
|
||||
VB = v128_add32( VB, VC );
|
||||
V6 = v128_ror32( v128_xor( V6, VB ), 7 );
|
||||
|
||||
// G6
|
||||
VD = v128_ror32( v128_xor( VD, V2 ), 16 );
|
||||
V8 = v128_add32( V8, VD );
|
||||
V7 = v128_ror32( v128_xor( V7, V8 ), 12 );
|
||||
V2 = v128_add32( V2, v128_add32( V7, MDxorCSC ) );
|
||||
VD = v128_ror32( v128_xor( VD, V2 ), 8 );
|
||||
V8 = v128_add32( V8, VD );
|
||||
V7 = v128_ror32( v128_xor( V7, V8 ), 7 );
|
||||
|
||||
// G7
|
||||
V9 = v128_add32( V9, VE );
|
||||
V4 = v128_ror32( v128_xor( V4, V9 ), 12 );
|
||||
V3 = v128_add32( V3, V4 );
|
||||
VE = v128_ror32( v128_xor( VE, V3 ), 8 );
|
||||
V9 = v128_add32( V9, VE );
|
||||
V4 = v128_ror32( v128_xor( V4, V9 ), 7 );
|
||||
|
||||
// Remaining rounds
|
||||
ROUND_S_4X32_1;
|
||||
ROUND_S_4X32_2;
|
||||
ROUND_S_4X32_3;
|
||||
ROUND_S_4X32_4;
|
||||
ROUND_S_4X32_5;
|
||||
ROUND_S_4X32_6;
|
||||
ROUND_S_4X32_7;
|
||||
if ( rounds > 8 )
|
||||
{
|
||||
ROUND_S_4X32_8;
|
||||
ROUND_S_4X32_9;
|
||||
ROUND_S_4X32_0;
|
||||
ROUND_S_4X32_1;
|
||||
ROUND_S_4X32_2;
|
||||
ROUND_S_4X32_3;
|
||||
}
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
const v128_t shuf_bswap32 =
|
||||
v128_set64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
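// Shuffle control that reverses the bytes within each 32-bit lane,
// i.e. a per-word byte swap to big-endian output.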
|
||||
|
||||
H[0] = _mm_shuffle_epi8( mm128_xor3( V8, V0, h[0] ), shuf_bswap32 );
|
||||
H[1] = _mm_shuffle_epi8( mm128_xor3( V9, V1, h[1] ), shuf_bswap32 );
|
||||
H[2] = _mm_shuffle_epi8( mm128_xor3( VA, V2, h[2] ), shuf_bswap32 );
|
||||
H[3] = _mm_shuffle_epi8( mm128_xor3( VB, V3, h[3] ), shuf_bswap32 );
|
||||
H[4] = _mm_shuffle_epi8( mm128_xor3( VC, V4, h[4] ), shuf_bswap32 );
|
||||
H[5] = _mm_shuffle_epi8( mm128_xor3( VD, V5, h[5] ), shuf_bswap32 );
|
||||
H[6] = _mm_shuffle_epi8( mm128_xor3( VE, V6, h[6] ), shuf_bswap32 );
|
||||
H[7] = _mm_shuffle_epi8( mm128_xor3( VF, V7, h[7] ), shuf_bswap32 );
|
||||
|
||||
#else
|
||||
|
||||
H[0] = v128_bswap32( v128_xor3( V8, V0, h[0] ) );
|
||||
H[1] = v128_bswap32( v128_xor3( V9, V1, h[1] ) );
|
||||
H[2] = v128_bswap32( v128_xor3( VA, V2, h[2] ) );
|
||||
H[3] = v128_bswap32( v128_xor3( VB, V3, h[3] ) );
|
||||
H[4] = v128_bswap32( v128_xor3( VC, V4, h[4] ) );
|
||||
H[5] = v128_bswap32( v128_xor3( VD, V5, h[5] ) );
|
||||
H[6] = v128_bswap32( v128_xor3( VE, V6, h[6] ) );
|
||||
H[7] = v128_bswap32( v128_xor3( VF, V7, h[7] ) );
|
||||
|
||||
#endif
|
||||
}
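/*
 * Illustrative sketch only (not part of the original source): how the
 * prehash / final-rounds pair above is meant to be used by a 4-lane scanhash
 * loop. The blake256_4x32_* calls, v128_t and v128_set32 come from this file
 * and simd-utils.h; the loop structure, variable names and the lane handling
 * are simplified assumptions, not the project's actual scanhash code.
 *
 *   v128_t midstate[16], block[16], hash[8];
 *   v128_t midhash[8];   // interleaved Blake-256 state after the first
 *                        // 64-byte block, identical in all four lanes
 *
 *   // block[0..2] = header words 16..18 (little-endian), duplicated across
 *   // the lanes; block[3] receives the per-lane nonces below.
 *   blake256_4x32_round0_prehash_le( midstate, midhash, block );
 *
 *   for ( uint32_t n = first_nonce; n < max_nonce; n += 4 )
 *   {
 *      block[3] = v128_set32( n+3, n+2, n+1, n );    // one nonce per lane
 *      blake256_4x32_final_rounds_le( hash, midstate, midhash, block, 14 );
 *      // ... de-interleave the eight hash words of each lane and test
 *      // against the target ...
 *   }
 */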
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
/////////////////////////////////
|
||||
@@ -534,12 +964,12 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
{ \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
|
||||
_mm256_xor_si256( v256_32( c1 ), m0 ) ); \
|
||||
d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \
|
||||
d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
|
||||
_mm256_xor_si256( v256_32( c0 ), m1 ) ); \
|
||||
d = mm256_shuflr32_8( _mm256_xor_si256( d, a ) ); \
|
||||
d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
|
||||
}
|
||||
@@ -562,11 +992,11 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
#define G256_8WAY_ALT( a, b, c, d, m0, m1 ) \
|
||||
{ \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m0 ); \
|
||||
d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \
|
||||
d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m1 ); \
|
||||
d = mm256_shuflr32_8( _mm256_xor_si256( d, a ) ); \
|
||||
d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
|
||||
}
|
||||
@@ -807,7 +1237,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
_mm256_xor_si256( M0, v256_32( CSD ) ) ); \
|
||||
}
|
||||
|
||||
|
||||
#define DECL_STATE32_8WAY \
|
||||
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
uint32_t T0, T1;
|
||||
@@ -1013,7 +1442,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
|
||||
// G1
|
||||
V[ 1] = _mm256_add_epi32( _mm256_add_epi32( V[ 1], V[ 5] ),
|
||||
_mm256_xor_si256( v256_32( CS3 ), M[ 2] ) );
|
||||
V[13] = mm256_swap32_16( _mm256_xor_si256( V[13], V[ 1] ) );
|
||||
V[13] = mm256_ror_32( _mm256_xor_si256( V[13], V[ 1] ), 16 );
|
||||
V[ 9] = _mm256_add_epi32( V[ 9], V[13] );
|
||||
V[ 5] = mm256_ror_32( _mm256_xor_si256( V[ 5], V[ 9] ), 12 );
|
||||
V[ 1] = _mm256_add_epi32( V[ 1], V[ 5] );
|
||||
@@ -1022,7 +1451,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
|
||||
// GS_8WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
|
||||
V[ 2] = _mm256_add_epi32( _mm256_add_epi32( V[ 2], V[ 6] ),
|
||||
_mm256_xor_si256( v256_32( CS5 ), M[ 4] ) );
|
||||
V[14] = mm256_swap32_16( _mm256_xor_si256( V[14], V[ 2] ) );
|
||||
V[14] = mm256_ror_32( _mm256_xor_si256( V[14], V[ 2] ), 16 );
|
||||
V[10] = _mm256_add_epi32( V[10], V[14] );
|
||||
V[ 6] = mm256_ror_32( _mm256_xor_si256( V[ 6], V[10] ), 12 );
|
||||
V[ 2] = _mm256_add_epi32( _mm256_add_epi32( V[ 2], V[ 6] ),
|
||||
@@ -1035,7 +1464,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
|
||||
// GS_8WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
|
||||
V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 7] ),
|
||||
v256_32( CS7 ) );
|
||||
V[15] = mm256_swap32_16( _mm256_xor_si256( V[15], V[ 3] ) );
|
||||
V[15] = mm256_ror_32( _mm256_xor_si256( V[15], V[ 3] ), 16 );
|
||||
V[11] = _mm256_add_epi32( V[11], V[15] );
|
||||
V[ 7] = mm256_ror_32( _mm256_xor_si256( V[ 7], V[11] ), 12 );
|
||||
V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 7] ),
|
||||
@@ -1057,7 +1486,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
|
||||
// G7
|
||||
V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 4] ),
|
||||
v256_32( CSF ) );
|
||||
V[14] = mm256_swap32_16( _mm256_xor_si256( V[14], V[ 3] ) );
|
||||
V[14] = mm256_ror_32( _mm256_xor_si256( V[14], V[ 3] ), 16 );
|
||||
V[ 3] = _mm256_add_epi32( V[ 3],
|
||||
_mm256_xor_si256( v256_32( CSE ), M[15] ) );
|
||||
}
|
||||
@@ -1104,18 +1533,18 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
// G1
|
||||
V1 = _mm256_add_epi32( V1,
|
||||
_mm256_xor_si256( v256_32( CS2 ), M3 ) );
|
||||
VD = mm256_shuflr32_8( _mm256_xor_si256( VD, V1 ) );
|
||||
VD = mm256_ror_32( _mm256_xor_si256( VD, V1 ), 8 );
|
||||
V9 = _mm256_add_epi32( V9, VD );
|
||||
V5 = mm256_ror_32( _mm256_xor_si256( V5, V9 ), 7 );
|
||||
|
||||
// G4
|
||||
V0 = _mm256_add_epi32( V0, V5 );
|
||||
VF = mm256_swap32_16( _mm256_xor_si256( VF, V0 ) );
|
||||
VF = mm256_ror_32( _mm256_xor_si256( VF, V0 ), 16 );
|
||||
VA = _mm256_add_epi32( VA, VF );
|
||||
V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 12 );
|
||||
V0 = _mm256_add_epi32( V0, _mm256_add_epi32( V5,
|
||||
v256_32( CS8 ) ) );
|
||||
VF = mm256_shuflr32_8( _mm256_xor_si256( VF, V0 ) );
|
||||
VF = mm256_ror_32( _mm256_xor_si256( VF, V0 ), 8 );
|
||||
VA = _mm256_add_epi32( VA, VF );
|
||||
V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 7 );
|
||||
|
||||
@@ -1123,7 +1552,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
// GS_8WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
|
||||
V1 = _mm256_add_epi32( _mm256_add_epi32( V1, V6 ),
|
||||
v256_32( CSB ) );
|
||||
VC = mm256_swap32_16( _mm256_xor_si256( VC, V1 ) );
|
||||
VC = mm256_ror_32( _mm256_xor_si256( VC, V1 ), 16 );
|
||||
VB = _mm256_add_epi32( VB, VC );
|
||||
V6 = mm256_ror_32( _mm256_xor_si256( V6, VB ), 12 );
|
||||
V1 = _mm256_add_epi32( _mm256_add_epi32( V1, V6 ),
|
||||
@@ -1133,11 +1562,11 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
V6 = mm256_ror_32( _mm256_xor_si256( V6, VB ), 7 );
|
||||
|
||||
// G6
|
||||
VD = mm256_swap32_16( _mm256_xor_si256( VD, V2 ) );
|
||||
VD = mm256_ror_32( _mm256_xor_si256( VD, V2 ), 16 );
|
||||
V8 = _mm256_add_epi32( V8, VD );
|
||||
V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 12 );
|
||||
V2 = _mm256_add_epi32( V2, _mm256_add_epi32( V7, MDxorCSC ) );
|
||||
VD = mm256_shuflr32_8( _mm256_xor_si256( VD, V2 ) );
|
||||
VD = mm256_ror_32( _mm256_xor_si256( VD, V2 ), 8 );
|
||||
V8 = _mm256_add_epi32( V8, VD );
|
||||
V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 7 );
|
||||
|
||||
@@ -1145,7 +1574,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
V9 = _mm256_add_epi32( V9, VE );
|
||||
V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 12 );
|
||||
V3 = _mm256_add_epi32( V3, V4 );
|
||||
VE = mm256_shuflr32_8( _mm256_xor_si256( VE, V3 ) );
|
||||
VE = mm256_ror_32( _mm256_xor_si256( VE, V3 ), 8 );
|
||||
V9 = _mm256_add_epi32( V9, VE );
|
||||
V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 7 );
|
||||
|
||||
@@ -1504,7 +1933,7 @@ do { \
|
||||
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
__m512i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
__m512i V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||
const __m512i shuf_bswap32 = mm512_bcast_m128( _mm_set_epi64x( \
|
||||
const __m512i shuf_bswap32 = mm512_bcast_m128( v128_set64( \
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); \
|
||||
V0 = H0; \
|
||||
V1 = H1; \
|
||||
@@ -1845,7 +2274,7 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
}
|
||||
|
||||
// Byte swap final hash
|
||||
const __m512i shuf_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
|
||||
const __m512i shuf_bswap32 = mm512_bcast_m128( v128_set64(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
H[0] = _mm512_shuffle_epi8( mm512_xor3( V8, V0, h[0] ), shuf_bswap32 );
|
||||
H[1] = _mm512_shuffle_epi8( mm512_xor3( V9, V1, h[1] ), shuf_bswap32 );
|
||||
@@ -1861,10 +2290,10 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
|
||||
// Blake-256 4 way
|
||||
|
||||
static const uint32_t salt_zero_4way_small[4] = { 0, 0, 0, 0 };
|
||||
static const uint32_t salt_zero_4x32_small[4] = { 0, 0, 0, 0 };
|
||||
|
||||
static void
|
||||
blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
|
||||
blake32_4x32_init( blake_4x32_small_context *ctx, const uint32_t *iv,
|
||||
const uint32_t *salt, int rounds )
|
||||
{
|
||||
casti_v128( ctx->H, 0 ) = v128_64( 0x6A09E6676A09E667 );
|
||||
@@ -1881,14 +2310,14 @@ blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
|
||||
}
|
||||
|
||||
static void
|
||||
blake32_4way( blake_4way_small_context *ctx, const void *data,
|
||||
blake32_4x32( blake_4x32_small_context *ctx, const void *data,
|
||||
size_t len )
|
||||
{
|
||||
v128_t *buf = (v128_t*)ctx->buf;
|
||||
size_t bptr = ctx->ptr<<2;
|
||||
size_t vptr = ctx->ptr >> 2;
|
||||
size_t blen = len << 2;
|
||||
DECL_STATE32_4WAY
|
||||
DECL_STATE32_4X32;
|
||||
|
||||
if ( blen < (sizeof ctx->buf) - bptr )
|
||||
{
|
||||
@@ -1898,7 +2327,7 @@ blake32_4way( blake_4way_small_context *ctx, const void *data,
|
||||
return;
|
||||
}
|
||||
|
||||
READ_STATE32_4WAY( ctx );
|
||||
READ_STATE32_4X32( ctx );
|
||||
while ( blen > 0 )
|
||||
{
|
||||
size_t clen = ( sizeof ctx->buf ) - bptr;
|
||||
@@ -1913,16 +2342,16 @@ blake32_4way( blake_4way_small_context *ctx, const void *data,
|
||||
{
|
||||
if ( ( T0 = T0 + 512 ) < 512 )
|
||||
T1 = T1 + 1;
|
||||
COMPRESS32_4WAY( ctx->rounds );
|
||||
COMPRESS32_4X32( ctx->rounds );
|
||||
bptr = 0;
|
||||
}
|
||||
}
|
||||
WRITE_STATE32_4WAY( ctx );
|
||||
WRITE_STATE32_4X32( ctx );
|
||||
ctx->ptr = bptr>>2;
|
||||
}
|
||||
|
||||
static void
|
||||
blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
|
||||
blake32_4x32_close( blake_4x32_small_context *ctx, unsigned ub, unsigned n,
|
||||
void *dst, size_t out_size_w32 )
|
||||
{
|
||||
v128_t buf[16] __attribute__ ((aligned (64)));
|
||||
@@ -1953,22 +2382,22 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
|
||||
buf[ 13 ] = v128_or( buf[ 13 ], v128_64( 0x0100000001000000ULL ) );
|
||||
buf[ 14 ] = v128_32( bswap_32( th ) );
|
||||
buf[ 15 ] = v128_32( bswap_32( tl ) );
|
||||
blake32_4way( ctx, buf + vptr, 64 - ptr );
|
||||
blake32_4x32( ctx, buf + vptr, 64 - ptr );
|
||||
}
|
||||
else
|
||||
{
|
||||
v128_memset_zero( buf + vptr + 1, (60-ptr) >> 2 );
|
||||
blake32_4way( ctx, buf + vptr, 64 - ptr );
|
||||
blake32_4x32( ctx, buf + vptr, 64 - ptr );
|
||||
ctx->T0 = 0xFFFFFE00UL;
|
||||
ctx->T1 = 0xFFFFFFFFUL;
|
||||
v128_memset_zero( buf, 56>>2 );
|
||||
buf[ 13 ] = v128_or( buf[ 13 ], v128_64( 0x0100000001000000ULL ) );
|
||||
buf[ 14 ] = v128_32( bswap_32( th ) );
|
||||
buf[ 15 ] = v128_32( bswap_32( tl ) );
|
||||
blake32_4way( ctx, buf, 64 );
|
||||
blake32_4x32( ctx, buf, 64 );
|
||||
}
|
||||
|
||||
v128_block_bswap32( (v128_t*)dst, (v128_t*)ctx->H );
|
||||
v128_block_bswap32_256( (v128_t*)dst, (v128_t*)ctx->H );
|
||||
}
|
||||
|
||||
#if defined (__AVX2__)
|
||||
@@ -2087,7 +2516,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||
*(buf+(60>>2)) = v256_32( bswap_32( tl ) );
|
||||
blake32_8way( sc, buf, 64 );
|
||||
}
|
||||
mm256_block_bswap_32( (__m256i*)dst, (__m256i*)sc->H );
|
||||
mm256_block_bswap32_256( (__m256i*)dst, (__m256i*)sc->H );
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -2182,7 +2611,7 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||
*(buf+(60>>2)) = v256_32( tl );
|
||||
blake32_8way_le( sc, buf, 64 );
|
||||
}
|
||||
mm256_block_bswap_32( (__m256i*)dst, (__m256i*)sc->H );
|
||||
mm256_block_bswap32_256( (__m256i*)dst, (__m256i*)sc->H );
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -2300,7 +2729,7 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
|
||||
buf[60>>2] = v512_32( bswap_32( tl ) );
|
||||
blake32_16way( sc, buf, 64 );
|
||||
}
|
||||
mm512_block_bswap_32( (__m512i*)dst, (__m512i*)sc->H );
|
||||
mm512_block_bswap32_256( (__m512i*)dst, (__m512i*)sc->H );
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -2394,7 +2823,7 @@ blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
|
||||
buf[60>>2] = v512_32( tl );
|
||||
blake32_16way_le( sc, buf, 64 );
|
||||
}
|
||||
mm512_block_bswap_32( (__m512i*)dst, (__m512i*)sc->H );
|
||||
mm512_block_bswap32_256( (__m512i*)dst, (__m512i*)sc->H );
|
||||
}
|
||||
|
||||
void
|
||||
@@ -2467,21 +2896,21 @@ blake256r8_16way_close(void *cc, void *dst)
|
||||
|
||||
// default 14 rounds, backward compatibility
|
||||
void
|
||||
blake256_4way_init(void *ctx)
|
||||
blake256_4x32_init(void *ctx)
|
||||
{
|
||||
blake32_4way_init( ctx, IV256, salt_zero_4way_small, 14 );
|
||||
blake32_4x32_init( ctx, IV256, salt_zero_4x32_small, 14 );
|
||||
}
|
||||
|
||||
void
|
||||
blake256_4way_update(void *ctx, const void *data, size_t len)
|
||||
blake256_4x32_update(void *ctx, const void *data, size_t len)
|
||||
{
|
||||
blake32_4way(ctx, data, len);
|
||||
blake32_4x32(ctx, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
blake256_4way_close(void *ctx, void *dst)
|
||||
blake256_4x32_close(void *ctx, void *dst)
|
||||
{
|
||||
blake32_4way_close(ctx, 0, 0, dst, 8);
|
||||
blake32_4x32_close(ctx, 0, 0, dst, 8);
|
||||
}
|
||||
|
||||
#if defined(__AVX2__)
|
||||
@@ -2521,21 +2950,21 @@ blake256_8way_close_le(void *cc, void *dst)
|
||||
#endif
|
||||
|
||||
// 14 rounds Blake, Decred
|
||||
void blake256r14_4way_init(void *cc)
|
||||
void blake256r14_4x32_init(void *cc)
|
||||
{
|
||||
blake32_4way_init( cc, IV256, salt_zero_4way_small, 14 );
|
||||
blake32_4x32_init( cc, IV256, salt_zero_4x32_small, 14 );
|
||||
}
|
||||
|
||||
void
|
||||
blake256r14_4way_update(void *cc, const void *data, size_t len)
|
||||
blake256r14_4x32_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake32_4way(cc, data, len);
|
||||
blake32_4x32(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
blake256r14_4way_close(void *cc, void *dst)
|
||||
blake256r14_4x32_close(void *cc, void *dst)
|
||||
{
|
||||
blake32_4way_close(cc, 0, 0, dst, 8);
|
||||
blake32_4x32_close(cc, 0, 0, dst, 8);
|
||||
}
|
||||
|
||||
#if defined(__AVX2__)
|
||||
@@ -2560,21 +2989,21 @@ blake256r14_8way_close(void *cc, void *dst)
|
||||
#endif
|
||||
|
||||
// 8 rounds Blakecoin, Vanilla
|
||||
void blake256r8_4way_init(void *cc)
|
||||
void blake256r8_4x32_init(void *cc)
|
||||
{
|
||||
blake32_4way_init( cc, IV256, salt_zero_4way_small, 8 );
|
||||
blake32_4x32_init( cc, IV256, salt_zero_4x32_small, 8 );
|
||||
}
|
||||
|
||||
void
|
||||
blake256r8_4way_update(void *cc, const void *data, size_t len)
|
||||
blake256r8_4x32_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake32_4way(cc, data, len);
|
||||
blake32_4x32(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
blake256r8_4way_close(void *cc, void *dst)
|
||||
blake256r8_4x32_close(void *cc, void *dst)
|
||||
{
|
||||
blake32_4way_close(cc, 0, 0, dst, 8);
|
||||
blake32_4x32_close(cc, 0, 0, dst, 8);
|
||||
}
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
@@ -3,51 +3,102 @@
|
||||
|
||||
#include <stddef.h>
|
||||
#include "simd-utils.h"
|
||||
#include "sph_blake.h"
|
||||
|
||||
/////////////////////////
|
||||
////////////////////////////
|
||||
//
|
||||
// Blake-256 1 way SSE2
|
||||
//
|
||||
|
||||
//#define blake256_context sph_blake256_context
|
||||
#define blake256_init sph_blake256_init
|
||||
#define blake256_update sph_blake256
|
||||
#define blake256_update_le sph_blake256_update_le
|
||||
#define blake256_close sph_blake256_close
|
||||
|
||||
//TODO decouple from SPH
|
||||
|
||||
typedef struct
|
||||
{
|
||||
unsigned char buf[64];
|
||||
size_t ptr;
|
||||
uint32_t H[8];
|
||||
uint32_t S[4];
|
||||
uint32_t T0, T1;
|
||||
} blake256_context __attribute__ ((aligned (32)));
|
||||
|
||||
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
const uint32_t T0, const uint32_t T1, int rounds );

/*
void blake256_init( blake256_context *sc );
void blake256_update( blake256_context *sc, const void *data, size_t len );
void blake256_close( blake256_context *sc, void *dst );
void blake256_full( blake256_context *sc, void *dst, const void *data,
                    size_t len );
*/

//////////////////////////
//////////////////////////////////
//
// Blake-256 4 way SSE2
// Blake-256 4 way SSE2, NEON

typedef struct {
typedef struct
{
   unsigned char buf[64<<2];
   uint32_t H[8<<2];
   size_t ptr;
   uint32_t T0, T1;
   int rounds;   // 14 for blake, 8 for blakecoin & vanilla
} blake_4way_small_context __attribute__ ((aligned (64)));
} blake_4x32_small_context __attribute__ ((aligned (64)));

// Default, 14 rounds
typedef blake_4way_small_context blake256_4way_context;
void blake256_4way_init(void *ctx);
void blake256_4way_update(void *ctx, const void *data, size_t len);
void blake256_4way_close(void *ctx, void *dst);
typedef blake_4x32_small_context blake256_4x32_context;
void blake256_4x32_init(void *ctx);
void blake256_4x32_update(void *ctx, const void *data, size_t len);
void blake256_4x32_close(void *ctx, void *dst);

// 14 rounds
typedef blake_4way_small_context blake256r14_4way_context;
void blake256r14_4way_init(void *cc);
void blake256r14_4way_update(void *cc, const void *data, size_t len);
void blake256r14_4way_close(void *cc, void *dst);
typedef blake_4x32_small_context blake256r14_4x32_context;
void blake256r14_4x32_init(void *cc);
void blake256r14_4x32_update(void *cc, const void *data, size_t len);
void blake256r14_4x32_close(void *cc, void *dst);

// 8 rounds, blakecoin, vanilla
typedef blake_4way_small_context blake256r8_4way_context;
void blake256r8_4way_init(void *cc);
void blake256r8_4way_update(void *cc, const void *data, size_t len);
void blake256r8_4way_close(void *cc, void *dst);
typedef blake_4x32_small_context blake256r8_4x32_context;
void blake256r8_4x32_init(void *cc);
void blake256r8_4x32_update(void *cc, const void *data, size_t len);
void blake256r8_4x32_close(void *cc, void *dst);

void blake256_4x32_round0_prehash_le( void *midstate, const void *midhash,
                                      void *data );
void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
                    const void *midhash, const void *data, const int rounds );

#define blake_4way_small_context blake256_4x32_context
#define blake256_4way_context blake256_4x32_context
#define blake256_4way_init blake256_4x32_init
#define blake256_4way_update blake256_4x32_update
#define blake256_4way_close blake256_4x32_close
#define blake256_4way_update_le blake256_4x32_update_le
#define blake256_4way_close_le blake256_4x32_close_le
#define blake256_4way_round0_prehash_le blake256_4x32_round0_prehash_le
#define blake256_4way_final_rounds_le blake256_4x32_final_rounds_le
#define blake256r14_4way_context blake256r14_4x32_context
#define blake256r14_4way_init blake256r14_4x32_init
#define blake256r14_4way_update blake256r14_4x32_update
#define blake256r14_4way_close blake256r14_4x32_close
#define blake256r8_4way_context blake256r14_4x32_context
#define blake256r8_4way_init blake256r14_4x32_init
#define blake256r8_4way_update blake256r14_4x32_update
#define blake256r8_4way_close blake256r14_4x32_close

#ifdef __AVX2__

//////////////////////////
//////////////////////////////////
//
// Blake-256 8 way AVX2

typedef struct {
typedef struct
{
   __m256i buf[16] __attribute__ ((aligned (64)));
   __m256i H[8];
   size_t ptr;
@@ -79,13 +130,31 @@ void blake256r8_8way_init(void *cc);
void blake256r8_8way_update(void *cc, const void *data, size_t len);
void blake256r8_8way_close(void *cc, void *dst);

#define blake_8x32_small_context blake256_8way_context
#define blake_8x32_init blake256_8way_init
#define blake_8x32_update blake256_8way_update
#define blake_8x32_close blake256_8way_close
#define blake_8x32_update_le blake256_8way_update_le
#define blake_8x32_close_le blake256_8way_close_le
#define blake_8x32_round0_prehash_le blake256_8way_round0_prehash
#define blake_8x32_final_rounds_le blake256_8way_final_rounds_le
#define blake256r14_8x32_context blake256r14_8way_context
#define blake256r14_8x32_init blake256r14_8way_init
#define blake256r14_8x32_update blake256r14_8way_update
#define blake256r14_8x32_close blake256r14_8way_close
#define blake256r8_8x32_context blake256r14_8way_context
#define blake256r8_8x32_init blake256r14_8way_init
#define blake256r8_8x32_update blake256r14_8way_update
#define blake256r8_8x32_close blake256r14_8way_close

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

////////////////////////////
///////////////////////////////////
//
// Blake-256 16 way AVX512

typedef struct {
typedef struct
{
   __m512i buf[16];
   __m512i H[8];
   size_t ptr;
@@ -118,6 +187,23 @@ void blake256r8_16way_init(void *cc);
void blake256r8_16way_update(void *cc, const void *data, size_t len);
void blake256r8_16way_close(void *cc, void *dst);

#define blake_16x32_small_context blake256_16way_context
#define blake_16x32_init blake256_16way_init
#define blake_16x32_update blake256_16way_update
#define blake_16x32_close blake256_16way_close
#define blake_16x32_update_le blake256_16way_update_le
#define blake_16x32_close_le blake256_16way_close_le
#define blake_16x32_round0_prehash_le blake256_16way_round0_prehash
#define blake_16x32_final_rounds_le blake256_16way_final_rounds_le
#define blake256r14_16x32_context blake256r14_16way_context
#define blake256r14_16x32_init blake256r14_16way_init
#define blake256r14_16x32_update blake256r14_16way_update
#define blake256r14_16x32_close blake256r14_16way_close
#define blake256r8_16x32_context blake256r8_16way_context
#define blake256r8_16x32_init blake256r8_16way_init
#define blake256r8_16x32_update blake256r8_16way_update
#define blake256r8_16x32_close blake256r8_16way_close

#endif // AVX512
#endif // AVX2


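For orientation, the renames above follow a lanes-times-lane-width scheme: blake256_4x32 means four interleaved 32-bit lanes, and the #define aliases keep the legacy 4way/8way/16way names building. A minimal caller sketch of the renamed 4x32 interface follows; the header name, the 80-byte per-lane length and the interleaved buffer layout are assumptions based on the usual cpuminer-opt conventions, not taken from this hunk.

#include <stdint.h>
#include "blake256-hash.h"   // assumed header exposing the declarations above

static void blake256_hash_4_lanes( uint32_t *vhash, const uint32_t *vdata )
{
   // vdata: four interleaved 80-byte block headers (assumed layout);
   // vhash: four interleaved 32-byte digests.
   blake256_4x32_context ctx;
   blake256_4x32_init( &ctx );
   blake256_4x32_update( &ctx, vdata, 80 );   // length is per lane (assumption)
   blake256_4x32_close( &ctx, vhash );
}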
@@ -388,11 +388,11 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
#define B2B_G(a, b, c, d, x, y) \
{ \
   v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), x ); \
   v[d] = mm256_swap64_32( _mm256_xor_si256( v[d], v[a] ) ); \
   v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 32 ); \
   v[c] = _mm256_add_epi64( v[c], v[d] ); \
   v[b] = mm256_shuflr64_24( _mm256_xor_si256( v[b], v[c] ) ); \
   v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 24 ); \
   v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), y ); \
   v[d] = mm256_shuflr64_16( _mm256_xor_si256( v[d], v[a] ) ); \
   v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 16 ); \
   v[c] = _mm256_add_epi64( v[c], v[d] ); \
   v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 63 ); \
}

@@ -108,11 +108,11 @@ do { \
   uint8_t s0 = sigma0; \
   uint8_t s1 = sigma1; \
   a = v128_add32( v128_add32( a, b ), m[ s0 ] ); \
   d = v128_swap32_16( v128_xor( d, a ) ); \
   d = v128_ror32( v128_xor( d, a ), 16 ); \
   c = v128_add32( c, d ); \
   b = v128_ror32( v128_xor( b, c ), 12 ); \
   a = v128_add32( v128_add32( a, b ), m[ s1 ] ); \
   d = v128_shuflr32_8( v128_xor( d, a ) ); \
   d = v128_ror32( v128_xor( d, a ), 8 ); \
   c = v128_add32( c, d ); \
   b = v128_ror32( v128_xor( b, c ), 7 ); \
} while(0)
@@ -320,11 +320,11 @@ do { \
   uint8_t s0 = sigma0; \
   uint8_t s1 = sigma1; \
   a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s0 ] ); \
   d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \
   d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
   c = _mm256_add_epi32( c, d ); \
   b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
   a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s1 ] ); \
   d = mm256_shuflr32_8( _mm256_xor_si256( d, a ) ); \
   d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
   c = _mm256_add_epi32( c, d ); \
   b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
} while(0)

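Both 32-bit hunks above are the quarter round with the 16/12/8/7 rotation schedule shared by BLAKE-256 and BLAKE2s; the change replaces the specialised byte-shuffle rotate helpers with plain right rotates. A scalar rendering of the same step, shown only to make the rotation schedule explicit (not code from this commit):

#include <stdint.h>

static inline uint32_t rotr32( uint32_t x, int c )
{   return ( x >> c ) | ( x << ( 32 - c ) );   }

// One G step: a,b,c,d are state words, mx/my the sigma-selected message words.
static inline void g32( uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
                        uint32_t mx, uint32_t my )
{
   *a += *b + mx;   *d = rotr32( *d ^ *a, 16 );
   *c += *d;        *b = rotr32( *b ^ *c, 12 );
   *a += *b + my;   *d = rotr32( *d ^ *a,  8 );
   *c += *d;        *b = rotr32( *b ^ *c,  7 );
}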
@@ -7,8 +7,8 @@
#define BLAKE2S_16WAY
#elif defined(__AVX2__)
#define BLAKE2S_8WAY
#elif defined(__SSE2__)
#define BLAKE2S_4WAY
#elif defined(__SSE2__) || defined(__ARM_NEON)
// #define BLAKE2S_4WAY
#endif

#if defined(BLAKE2S_16WAY)
@@ -145,7 +145,7 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   __m128i *noncev = (__m128i*)vdata + 19;   // aligned
   v128_t *noncev = (v128_t*)vdata + 19;   // aligned
   uint32_t n = first_nonce;
   int thr_id = mythr->id;

@@ -154,7 +154,7 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
   blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );

   do {
      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
      *noncev = v128_bswap32( v128_set32( n+3, n+2, n+1, n ) );
      pdata[19] = n;

      blake2s_4way_hash( hash, vdata );
@@ -245,7 +245,7 @@ bool register_blake2s_algo( algo_gate_t* gate )
   gate->scanhash = (void*)&scanhash_blake2s;
   gate->hash = (void*)&blake2s_hash;
#endif
   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
   return true;
};


File diff suppressed because it is too large
@@ -4,11 +4,14 @@
#include <stddef.h>
#include "simd-utils.h"

#if defined(__SSE2__) || defined(__ARM_NEON)

/////////////////////////
//
// Blake-512 1 way SSE2 & AVX2
// Blake-512 1 way SSE2, AVX2, NEON

typedef struct {
typedef struct
{
   unsigned char buf[128];    /* first field, for alignment */
   uint64_t H[8];
   uint64_t T0, T1;
@@ -23,61 +26,113 @@ void blake512_close( blake512_context *sc, void *dst );
void blake512_full( blake512_context *sc, void *dst, const void *data,
                    size_t len );

/////////////////////////
//
// Blake-512 2 way SSE2 & NEON

typedef struct
{
   v128u64_t buf[16];
   v128u64_t H[8];
   v128u64_t S[4];
   size_t ptr;
   uint64_t T0, T1;
} blake_2x64_big_context __attribute__ ((aligned (32)));

typedef blake_2x64_big_context blake512_2x64_context;

void blake512_2x64_init( blake_2x64_big_context *sc );
void blake512_2x64_update( void *cc, const void *data, size_t len );
void blake512_2x64_close( void *cc, void *dst );
void blake512_2x64_full( blake_2x64_big_context *sc, void * dst,
                         const void *data, size_t len );
void blake512_2x64_full_le( blake_2x64_big_context *sc, void * dst,
                            const void *data, size_t len );
void blake512_2x64_prehash_part1_le( blake_2x64_big_context *sc,
                                     v128u64_t *midstate, const void *data );
void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc,
                   void *hash, const v128u64_t nonce, const v128u64_t *midstate );

#ifdef __AVX2__

/////////////////////////
//
// Blake-512 4 way AVX2

typedef struct {
typedef struct
{
   __m256i buf[16];
   __m256i H[8];
   __m256i S[4];
   size_t ptr;
   uint64_t T0, T1;
} blake_4way_big_context __attribute__ ((aligned (64)));
} blake_4x64_big_context __attribute__ ((aligned (64)));

typedef blake_4way_big_context blake512_4way_context;
typedef blake_4x64_big_context blake512_4x64_context;

void blake512_4way_init( blake_4way_big_context *sc );
void blake512_4way_update( void *cc, const void *data, size_t len );
void blake512_4way_close( void *cc, void *dst );
void blake512_4way_full( blake_4way_big_context *sc, void * dst,
void blake512_4x64_init( blake_4x64_big_context *sc );
void blake512_4x64_update( void *cc, const void *data, size_t len );
void blake512_4x64_close( void *cc, void *dst );
void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
                         const void *data, size_t len );
void blake512_4way_full_le( blake_4way_big_context *sc, void * dst,
void blake512_4x64_full_le( blake_4x64_big_context *sc, void * dst,
                            const void *data, size_t len );
void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
void blake512_4x64_prehash_le( blake_4x64_big_context *sc, __m256i *midstate,
                               const void *data );
void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
                             const __m256i nonce, const __m256i *midstate );

#define blake_4way_big_context blake_4x64_big_context
#define blake512_4way_context blake512_4x64_context
#define blake512_4way_init blake512_4x64_init
#define blake512_4way_update blake512_4x64_update
#define blake512_4way_close blake512_4x64_close
#define blake512_4way_full blake512_4x64_full
#define blake512_4way_full_le blake512_4x64_full_le
#define blake512_4way_prehash_le blake512_4x64_prehash_le
#define blake512_4way_final_le blake512_4x64_final_le

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

////////////////////////////
//
//// Blake-512 8 way AVX512
// Blake-512 8 way AVX512

typedef struct {
typedef struct
{
   __m512i buf[16];
   __m512i H[8];
   __m512i S[4];
   size_t ptr;
   uint64_t T0, T1;
} blake_8way_big_context __attribute__ ((aligned (128)));
} blake_8x64_big_context __attribute__ ((aligned (128)));

typedef blake_8way_big_context blake512_8way_context;
typedef blake_8x64_big_context blake512_8x64_context;

void blake512_8way_init( blake_8way_big_context *sc );
void blake512_8way_update( void *cc, const void *data, size_t len );
void blake512_8way_close( void *cc, void *dst );
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
void blake512_8x64_init( blake_8x64_big_context *sc );
void blake512_8x64_update( void *cc, const void *data, size_t len );
void blake512_8x64_close( void *cc, void *dst );
void blake512_8x64_full( blake_8x64_big_context *sc, void * dst,
                         const void *data, size_t len );
void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
void blake512_8x64_full_le( blake_8x64_big_context *sc, void * dst,
                            const void *data, size_t len );
void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
void blake512_8x64_prehash_le( blake_8x64_big_context *sc, __m512i *midstate,
                               const void *data );
void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
void blake512_8x64_final_le( blake_8x64_big_context *sc, void *hash,
                             const __m512i nonce, const __m512i *midstate );

#define blake_8way_big_context blake_8x64_big_context
#define blake512_8way_context blake512_8x64_context
#define blake512_8way_init blake512_8x64_init
#define blake512_8way_update blake512_8x64_update
#define blake512_8way_close blake512_8x64_close
#define blake512_8way_full blake512_8x64_full
#define blake512_8way_full_le blake512_8x64_full_le
#define blake512_8way_prehash_le blake512_8x64_prehash_le
#define blake512_8way_final_le blake512_8x64_final_le

#endif // AVX512
#endif // AVX2
#endif // SSE2 or NEON

#endif // BLAKE512_HASH_H__

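The new 2x64 interface above is the SSE2/NEON-width counterpart of the 4x64 and 8x64 ones. A minimal caller sketch, assuming two interleaved 64-bit lanes and a per-lane byte length; the header name is an assumption:

#include <stddef.h>
#include "blake512-hash.h"   // assumed header carrying the declarations above

static void blake512_hash_2_lanes( void *vhash, const void *vdata, size_t len )
{
   // One-shot helper: init, absorb len bytes per lane, emit two
   // interleaved 64-byte digests (assumed layout).
   blake512_2x64_context ctx;
   blake512_2x64_full( &ctx, vhash, vdata, len );
}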
@@ -229,39 +229,39 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )

#if defined(__SSE2__)

   __m128i *V = (__m128i*)v;
   v128_t *V = (v128_t*)v;

#define BLAKE2S_ROUND( r ) \
   V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], _mm_set_epi32( \
   V[0] = v128_add32( V[0], v128_add32( V[1], v128_set32( \
                   m[blake2s_sigma[r][ 6]], m[blake2s_sigma[r][ 4]], \
                   m[blake2s_sigma[r][ 2]], m[blake2s_sigma[r][ 0]] ) ) ); \
   V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
   V[2] = _mm_add_epi32( V[2], V[3] ); \
   V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
   V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], _mm_set_epi32( \
   V[3] = v128_ror32( v128_xor( V[3], V[0] ), 16 ); \
   V[2] = v128_add32( V[2], V[3] ); \
   V[1] = v128_ror32( v128_xor( V[1], V[2] ), 12 ); \
   V[0] = v128_add32( V[0], v128_add32( V[1], v128_set32( \
                   m[blake2s_sigma[r][ 7]], m[blake2s_sigma[r][ 5]], \
                   m[blake2s_sigma[r][ 3]], m[blake2s_sigma[r][ 1]] ) ) ); \
   V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
   V[2] = _mm_add_epi32( V[2], V[3] ); \
   V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
   V[0] = mm128_shufll_32( V[0] ); \
   V[3] = mm128_swap_64( V[3] ); \
   V[2] = mm128_shuflr_32( V[2] ); \
   V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], _mm_set_epi32( \
   V[3] = v128_ror32( v128_xor( V[3], V[0] ), 8 ); \
   V[2] = v128_add32( V[2], V[3] ); \
   V[1] = v128_ror32( v128_xor( V[1], V[2] ), 7 ); \
   V[0] = v128_shufll32( V[0] ); \
   V[3] = v128_swap64( V[3] ); \
   V[2] = v128_shuflr32( V[2] ); \
   V[0] = v128_add32( V[0], v128_add32( V[1], v128_set32( \
                   m[blake2s_sigma[r][12]], m[blake2s_sigma[r][10]], \
                   m[blake2s_sigma[r][ 8]], m[blake2s_sigma[r][14]] ) ) ); \
   V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
   V[2] = _mm_add_epi32( V[2], V[3] ); \
   V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
   V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], _mm_set_epi32( \
   V[3] = v128_ror32( v128_xor( V[3], V[0] ), 16 ); \
   V[2] = v128_add32( V[2], V[3] ); \
   V[1] = v128_ror32( v128_xor( V[1], V[2] ), 12 ); \
   V[0] = v128_add32( V[0], v128_add32( V[1], v128_set32( \
                   m[blake2s_sigma[r][13]], m[blake2s_sigma[r][11]], \
                   m[blake2s_sigma[r][ 9]], m[blake2s_sigma[r][15]] ) ) ); \
   V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
   V[2] = _mm_add_epi32( V[2], V[3] ); \
   V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
   V[0] = mm128_shuflr_32( V[0] ); \
   V[3] = mm128_swap_64( V[3] ); \
   V[2] = mm128_shufll_32( V[2] )
   V[3] = v128_ror32( v128_xor( V[3], V[0] ), 8 ); \
   V[2] = v128_add32( V[2], V[3] ); \
   V[1] = v128_ror32( v128_xor( V[1], V[2] ), 7 ); \
   V[0] = v128_shuflr32( V[0] ); \
   V[3] = v128_swap64( V[3] ); \
   V[2] = v128_shufll32( V[2] )

   BLAKE2S_ROUND(0);
   BLAKE2S_ROUND(1);

@@ -82,9 +82,9 @@ typedef struct {
#ifndef DOXYGEN_IGNORE
   unsigned char buf[64];    /* first field, for alignment */
   size_t ptr;
   sph_u32 H[8];
   sph_u32 S[4];
   sph_u32 T0, T1;
   uint32_t H[8];
   uint32_t S[4];
   uint32_t T0, T1;
#endif
} sph_blake_small_context;


@@ -52,14 +52,14 @@
   V[0] = _mm256_add_epi64( V[0], _mm256_add_epi64( V[1], \
             _mm256_set_epi64x( m[ sigmaR[ Sg ] ], m[ sigmaR[ Se ] ], \
                                m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
   V[3] = mm256_swap64_32( _mm256_xor_si256( V[3], V[0] ) ); \
   V[3] = mm256_ror_64( _mm256_xor_si256( V[3], V[0] ), 32 ); \
   V[2] = _mm256_add_epi64( V[2], V[3] ); \
   V[1] = mm256_shuflr64_24( _mm256_xor_si256( V[1], V[2] ) ); \
   V[1] = mm256_ror_64( _mm256_xor_si256( V[1], V[2] ), 24 ); \
\
   V[0] = _mm256_add_epi64( V[0], _mm256_add_epi64( V[1], \
             _mm256_set_epi64x( m[ sigmaR[ Sh ] ], m[ sigmaR[ Sf ] ], \
                                m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
   V[3] = mm256_shuflr64_16( _mm256_xor_si256( V[3], V[0] ) ); \
   V[3] = mm256_ror_64( _mm256_xor_si256( V[3], V[0] ), 16 ); \
   V[2] = _mm256_add_epi64( V[2], V[3] ); \
   V[1] = mm256_ror_64( _mm256_xor_si256( V[1], V[2] ), 63 ); \
}
@@ -95,27 +95,27 @@
}
*/

#elif defined(__SSE2__) || defined(__NEON__)    // ready for NEON
#elif defined(__SSE2__) || defined(__ARM_NEON)

#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
{ \
   Va = v128_add64( Va, v128_add64( Vb, \
                    v128_set_64( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
   Vd = v128_swap64_32( v128_xor( Vd, Va ) ); \
                    v128_set64( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
   Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \
   Vc = v128_add64( Vc, Vd ); \
   Vb = v128_shuflr64_24( v128_xor( Vb, Vc ) ); \
   Vb = v128_ror64( v128_xor( Vb, Vc ), 24 ); \
\
   Va = v128_add64( Va, v128_add64( Vb, \
                    v128_set_64( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
   Vd = v128_shuflr64_16( v128_xor( Vd, Va ) ); \
                    v128_set64( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
   Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \
   Vc = v128_add64( Vc, Vd ); \
   Vb = v128_ror64( v128_xor( Vb, Vc ), 63 ); \
}

#define BLAKE2B_ROUND( R ) \
{ \
   __m128i *V = (__m128i*)v; \
   __m128i V2, V3, V6, V7; \
   v128_t *V = (v128_t*)v; \
   v128_t V2, V3, V6, V7; \
   const uint8_t *sigmaR = sigma[R]; \
   BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
   BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
@@ -152,8 +152,8 @@

#define BLAKE2B_ROUND( R ) \
{ \
   __m128i *V = (__m128i*)v; \
   __m128i V2, V3, V6, V7; \
   v128_t *V = (v128_t*)v; \
   v128_t V2, V3, V6, V7; \
   const uint8_t *sigmaR = sigma[R]; \
   BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
   BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \

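BLAKE2B_G above is the 64-bit BLAKE2b G function; its 32/24/16/63 rotation schedule is now written with the generic v128_ror64 helper instead of the dedicated shuffle rotates. A scalar reference of the same step, for comparison only (not code from this commit):

#include <stdint.h>

static inline uint64_t rotr64( uint64_t x, int c )
{   return ( x >> c ) | ( x << ( 64 - c ) );   }

// BLAKE2b G: a,b,c,d are state words, mx/my the sigma-selected message words.
static inline void b2b_g( uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d,
                          uint64_t mx, uint64_t my )
{
   *a += *b + mx;   *d = rotr64( *d ^ *a, 32 );
   *c += *d;        *b = rotr64( *b ^ *c, 24 );
   *a += *b + my;   *d = rotr64( *d ^ *a, 16 );
   *c += *d;        *b = rotr64( *b ^ *c, 63 );
}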
@@ -36,10 +36,6 @@
#ifndef BMW_HASH_H__
#define BMW_HASH_H__

#ifdef __cplusplus
extern "C"{
#endif

#include <stddef.h>
#include "simd-utils.h"

@@ -47,13 +43,12 @@ extern "C"{

#define SPH_SIZE_bmw512   512

#if defined(__SSE2__)

// BMW-256 4 way 32

typedef struct {
   __m128i buf[64];
   __m128i H[16];
typedef struct
{
   v128_t buf[64];
   v128_t H[16];
   size_t ptr;
   uint32_t bit_count;  // assume bit_count fits in 32 bits
} bmw_4way_small_context;
@@ -70,13 +65,12 @@ void bmw256_4way_close(void *cc, void *dst);
void bmw256_4way_addbits_and_close(
   void *cc, unsigned ub, unsigned n, void *dst);

#endif  // __SSE2__

#if defined(__AVX2__)

// BMW-256 8 way 32

typedef struct {
typedef struct
{
   __m256i buf[16];
   __m256i H[16];
   size_t ptr;
@@ -97,7 +91,8 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );

// BMW-256 16 way 32

typedef struct {
typedef struct
{
   __m512i buf[16];
   __m512i H[16];
   size_t ptr;
@@ -113,73 +108,82 @@ void bmw256_16way_close( bmw256_16way_context *ctx, void *dst );

#endif


#if defined(__SSE2__)

// BMW-512 2 way 64

typedef struct {
   __m128i buf[16];
   __m128i H[16];
typedef struct
{
   v128u64_t buf[16];
   v128u64_t H[16];
   size_t ptr;
   uint64_t bit_count;
} bmw_2way_big_context __attribute__ ((aligned (64)));

typedef bmw_2way_big_context bmw512_2way_context;
typedef bmw_2way_big_context bmw512_2x64_context;

void bmw512_2way_init( bmw512_2way_context *ctx );
void bmw512_2way_update( bmw512_2way_context *ctx, const void *data,
void bmw512_2x64_init( bmw512_2x64_context *ctx );
void bmw512_2x64_update( bmw512_2x64_context *ctx, const void *data,
                         size_t len );
void bmw512_2way_close( bmw512_2way_context *ctx, void *dst );

#endif // __SSE2__
void bmw512_2x64_close( bmw512_2x64_context *ctx, void *dst );
void bmw512_2x64_ctx( bmw512_2x64_context *ctx, void *dst, const void *data,
                      size_t len );
void bmw512_2x64( void *dst, const void *data, size_t len );

#if defined(__AVX2__)

// BMW-512 64 bit 4 way

typedef struct {
typedef struct
{
   __m256i buf[16];
   __m256i H[16];
   size_t ptr;
   uint64_t bit_count;
} bmw_4way_big_context __attribute__((aligned(128)));

typedef bmw_4way_big_context bmw512_4way_context;

void bmw512_4way_init(void *cc);

void bmw512_4way_update(void *cc, const void *data, size_t len);
#define bmw512_4way bmw512_4way_update

void bmw512_4way_close(void *cc, void *dst);
typedef bmw_4way_big_context bmw512_4x64_context;

void bmw512_4x64_init(void *cc);
void bmw512_4x64_update(void *cc, const void *data, size_t len);
void bmw512_4x64_close(void *cc, void *dst);
void bmw512_4way_addbits_and_close(
   void *cc, unsigned ub, unsigned n, void *dst);

// legacy names
#define bmw512_4way_context bmw512_4x64_context
#define bmw512_4way_init bmw512_4x64_init
#define bmw512_4way_update bmw512_4x64_update
#define bmw512_4way bmw512_4x64_update
#define bmw512_4way_close bmw512_4x64_close

#endif // __AVX2__

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

// BMW-512 64 bit 8 way
typedef struct {
typedef struct
{
   __m512i buf[16];
   __m512i H[16];
   size_t ptr;
   uint64_t bit_count;
} bmw512_8way_context __attribute__((aligned(128)));
} bmw512_8x64_context __attribute__((aligned(128)));

void bmw512_8way_full( bmw512_8way_context *ctx, void *out, const void *data,
void bmw512_8x64( bmw512_8x64_context *ctx, void *out, const void *data,
                  size_t len );
void bmw512_8way_init( bmw512_8way_context *ctx );
void bmw512_8way_update( bmw512_8way_context *ctx, const void *data,
void bmw512_8x64_init( bmw512_8x64_context *ctx );
void bmw512_8x64_update( bmw512_8x64_context *ctx, const void *data,
                         size_t len );
void bmw512_8way_close( bmw512_8way_context *ctx, void *dst );
void bmw512_8x64_close( bmw512_8x64_context *ctx, void *dst );

// legacy names
#define bmw512_8way_context bmw512_8x64_context
#define bmw512_8way_init bmw512_8x64_init
#define bmw512_8way_update bmw512_8x64_update
#define bmw512_8way_close bmw512_8x64_close
#define bmw512_8way bmw512_8x64
#define bmw512_8way_full bmw512_8x64
#define bmw512_8x64_full bmw512_8x64

#endif // AVX512

#ifdef __cplusplus
}
#endif

#endif // BMW_HASH_H__

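The header above declares both a streaming and a one-shot form of the new 2x64 interface, and they can be used interchangeably. A small caller sketch; the lane-interleaved buffer layout and per-lane length are assumptions based on the usual cpuminer-opt conventions:

#include <stddef.h>
#include "bmw-hash-4way.h"   // the header shown in the hunk above

static void bmw512_two_lanes( void *vhash, const void *vdata, size_t len )
{
   // Streaming form: init, absorb len bytes per lane, emit two interleaved
   // 64-byte digests (assumed layout).
   bmw512_2x64_context ctx;
   bmw512_2x64_init( &ctx );
   bmw512_2x64_update( &ctx, vdata, len );
   bmw512_2x64_close( &ctx, vhash );

   // Equivalent one-shot helper; either form alone is sufficient.
   bmw512_2x64( vhash, vdata, len );
}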
@@ -35,14 +35,6 @@
#include <limits.h>
#include "bmw-hash-4way.h"

#ifdef __cplusplus
extern "C"{
#endif

#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif

#define LPAR (

static const uint64_t IV512[] = {
@@ -56,509 +48,453 @@ static const uint64_t IV512[] = {
   0xF0F1F2F3F4F5F6F7, 0xF8F9FAFBFCFDFEFF
};

#if defined(__SSE2__)
// SSE2 or NEON BMW-512 2 way 64

// BMW-512 2 way 64

#define s2b0(x) \
   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi64( (x), 1), \
                                 _mm_slli_epi64( (x), 3) ), \
                  _mm_xor_si128( mm128_rol_64( (x), 4), \
                                 mm128_rol_64( (x), 37) ) )

#define s2b1(x) \
   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi64( (x), 1), \
                                 _mm_slli_epi64( (x), 2) ), \
                  _mm_xor_si128( mm128_rol_64( (x), 13), \
                                 mm128_rol_64( (x), 43) ) )

#define s2b2(x) \
   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi64( (x), 2), \
                                 _mm_slli_epi64( (x), 1) ), \
                  _mm_xor_si128( mm128_rol_64( (x), 19), \
                                 mm128_rol_64( (x), 53) ) )

#define s2b3(x) \
   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi64( (x), 2), \
                                 _mm_slli_epi64( (x), 2) ), \
                  _mm_xor_si128( mm128_rol_64( (x), 28), \
                                 mm128_rol_64( (x), 59) ) )

#define s2b4(x) \
   _mm_xor_si128( (x), _mm_srli_epi64( (x), 1 ) )

#define s2b5(x) \
   _mm_xor_si128( (x), _mm_srli_epi64( (x), 2 ) )


#define r2b1(x) mm128_rol_64( x, 5 )
#define r2b2(x) mm128_rol_64( x, 11 )
#define r2b3(x) mm128_rol_64( x, 27 )
#define r2b4(x) mm128_rol_64( x, 32 )
#define r2b5(x) mm128_rol_64( x, 37 )
#define r2b6(x) mm128_rol_64( x, 43 )
#define r2b7(x) mm128_rol_64( x, 53 )

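The s2b0..s2b5 and r2b1..r2b7 helpers above are BMW-512's s- and r-functions expressed on two 64-bit lanes. A scalar rendering of a few of them (illustrative only, not code from this commit), to make the shift and rotate constants easy to check against the intrinsics:

#include <stdint.h>

static inline uint64_t rotl64( uint64_t x, int c )
{   return ( x << c ) | ( x >> ( 64 - c ) );   }

// Scalar equivalents of s2b0, s2b1 and s2b4 above (BMW-512 s0, s1, s4).
static inline uint64_t bmw_s0( uint64_t x )
{   return ( x >> 1 ) ^ ( x << 3 ) ^ rotl64( x, 4 ) ^ rotl64( x, 37 );   }

static inline uint64_t bmw_s1( uint64_t x )
{   return ( x >> 1 ) ^ ( x << 2 ) ^ rotl64( x, 13 ) ^ rotl64( x, 43 );   }

static inline uint64_t bmw_s4( uint64_t x )
{   return x ^ ( x >> 1 );   }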
#define mm128_rol_off_64( M, j, off ) \
|
||||
mm128_rol_64( M[ ( (j) + (off) ) & 0xF ] , \
|
||||
( ( (j) + (off) ) & 0xF ) + 1 )
|
||||
|
||||
#define add_elt_2b( M, H, j ) \
|
||||
_mm_xor_si128( \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( _mm_add_epi64( mm128_rol_off_64( M, j, 0 ), \
|
||||
mm128_rol_off_64( M, j, 3 ) ), \
|
||||
mm128_rol_off_64( M, j, 10 ) ), \
|
||||
_mm_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
|
||||
H[ ( (j)+7 ) & 0xF ] )
|
||||
|
||||
|
||||
#define expand1_2b( qt, M, H, i ) \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( s2b1( qt[ (i)-16 ] ), \
|
||||
s2b2( qt[ (i)-15 ] ) ), \
|
||||
_mm_add_epi64( s2b3( qt[ (i)-14 ] ), \
|
||||
s2b0( qt[ (i)-13 ] ) ) ), \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( s2b1( qt[ (i)-12 ] ), \
|
||||
s2b2( qt[ (i)-11 ] ) ), \
|
||||
_mm_add_epi64( s2b3( qt[ (i)-10 ] ), \
|
||||
s2b0( qt[ (i)- 9 ] ) ) ) ), \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( s2b1( qt[ (i)- 8 ] ), \
|
||||
s2b2( qt[ (i)- 7 ] ) ), \
|
||||
_mm_add_epi64( s2b3( qt[ (i)- 6 ] ), \
|
||||
s2b0( qt[ (i)- 5 ] ) ) ), \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( s2b1( qt[ (i)- 4 ] ), \
|
||||
s2b2( qt[ (i)- 3 ] ) ), \
|
||||
_mm_add_epi64( s2b3( qt[ (i)- 2 ] ), \
|
||||
s2b0( qt[ (i)- 1 ] ) ) ) ) ), \
|
||||
add_elt_2b( M, H, (i)-16 ) )
|
||||
|
||||
#define expand2_2b( qt, M, H, i) \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( qt[ (i)-16 ], r2b1( qt[ (i)-15 ] ) ), \
|
||||
_mm_add_epi64( qt[ (i)-14 ], r2b2( qt[ (i)-13 ] ) ) ), \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( qt[ (i)-12 ], r2b3( qt[ (i)-11 ] ) ), \
|
||||
_mm_add_epi64( qt[ (i)-10 ], r2b4( qt[ (i)- 9 ] ) ) ) ), \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( qt[ (i)- 8 ], r2b5( qt[ (i)- 7 ] ) ), \
|
||||
_mm_add_epi64( qt[ (i)- 6 ], r2b6( qt[ (i)- 5 ] ) ) ), \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( qt[ (i)- 4 ], r2b7( qt[ (i)- 3 ] ) ), \
|
||||
_mm_add_epi64( s2b4( qt[ (i)- 2 ] ), \
|
||||
s2b5( qt[ (i)- 1 ] ) ) ) ) ), \
|
||||
add_elt_2b( M, H, (i)-16 ) )
|
||||
|
||||
|
||||
#define W2b0 \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( _mm_xor_si128( M[ 5], H[ 5] ), \
|
||||
_mm_xor_si128( M[ 7], H[ 7] ) ), \
|
||||
_mm_xor_si128( M[10], H[10] ) ), \
|
||||
_mm_xor_si128( M[13], H[13] ) ), \
|
||||
_mm_xor_si128( M[14], H[14] ) )
|
||||
|
||||
#define W2b1 \
|
||||
_mm_sub_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( _mm_xor_si128( M[ 6], H[ 6] ), \
|
||||
_mm_xor_si128( M[ 8], H[ 8] ) ), \
|
||||
_mm_xor_si128( M[11], H[11] ) ), \
|
||||
_mm_xor_si128( M[14], H[14] ) ), \
|
||||
_mm_xor_si128( M[15], H[15] ) )
|
||||
|
||||
#define W2b2 \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( _mm_xor_si128( M[ 0], H[ 0] ), \
|
||||
_mm_xor_si128( M[ 7], H[ 7] ) ), \
|
||||
_mm_xor_si128( M[ 9], H[ 9] ) ), \
|
||||
_mm_xor_si128( M[12], H[12] ) ), \
|
||||
_mm_xor_si128( M[15], H[15] ) )
|
||||
|
||||
#define W2b3 \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( _mm_xor_si128( M[ 0], H[ 0] ), \
|
||||
_mm_xor_si128( M[ 1], H[ 1] ) ), \
|
||||
_mm_xor_si128( M[ 8], H[ 8] ) ), \
|
||||
_mm_xor_si128( M[10], H[10] ) ), \
|
||||
_mm_xor_si128( M[13], H[13] ) )
|
||||
|
||||
#define W2b4 \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( _mm_xor_si128( M[ 1], H[ 1] ), \
|
||||
_mm_xor_si128( M[ 2], H[ 2] ) ), \
|
||||
_mm_xor_si128( M[ 9], H[ 9] ) ), \
|
||||
_mm_xor_si128( M[11], H[11] ) ), \
|
||||
_mm_xor_si128( M[14], H[14] ) )
|
||||
|
||||
#define W2b5 \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( _mm_xor_si128( M[ 3], H[ 3] ), \
|
||||
_mm_xor_si128( M[ 2], H[ 2] ) ), \
|
||||
_mm_xor_si128( M[10], H[10] ) ), \
|
||||
_mm_xor_si128( M[12], H[12] ) ), \
|
||||
_mm_xor_si128( M[15], H[15] ) )
|
||||
|
||||
#define W2b6 \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( _mm_xor_si128( M[ 4], H[ 4] ), \
|
||||
_mm_xor_si128( M[ 0], H[ 0] ) ), \
|
||||
_mm_xor_si128( M[ 3], H[ 3] ) ), \
|
||||
_mm_xor_si128( M[11], H[11] ) ), \
|
||||
_mm_xor_si128( M[13], H[13] ) )
|
||||
|
||||
#define W2b7 \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( _mm_xor_si128( M[ 1], H[ 1] ), \
|
||||
_mm_xor_si128( M[ 4], H[ 4] ) ), \
|
||||
_mm_xor_si128( M[ 5], H[ 5] ) ), \
|
||||
_mm_xor_si128( M[12], H[12] ) ), \
|
||||
_mm_xor_si128( M[14], H[14] ) )
|
||||
|
||||
#define W2b8 \
|
||||
_mm_sub_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( _mm_xor_si128( M[ 2], H[ 2] ), \
|
||||
_mm_xor_si128( M[ 5], H[ 5] ) ), \
|
||||
_mm_xor_si128( M[ 6], H[ 6] ) ), \
|
||||
_mm_xor_si128( M[13], H[13] ) ), \
|
||||
_mm_xor_si128( M[15], H[15] ) )
|
||||
|
||||
#define W2b9 \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( _mm_xor_si128( M[ 0], H[ 0] ), \
|
||||
_mm_xor_si128( M[ 3], H[ 3] ) ), \
|
||||
_mm_xor_si128( M[ 6], H[ 6] ) ), \
|
||||
_mm_xor_si128( M[ 7], H[ 7] ) ), \
|
||||
_mm_xor_si128( M[14], H[14] ) )
|
||||
|
||||
#define W2b10 \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( _mm_xor_si128( M[ 8], H[ 8] ), \
|
||||
_mm_xor_si128( M[ 1], H[ 1] ) ), \
|
||||
_mm_xor_si128( M[ 4], H[ 4] ) ), \
|
||||
_mm_xor_si128( M[ 7], H[ 7] ) ), \
|
||||
_mm_xor_si128( M[15], H[15] ) )
|
||||
|
||||
#define W2b11 \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( _mm_xor_si128( M[ 8], H[ 8] ), \
|
||||
_mm_xor_si128( M[ 0], H[ 0] ) ), \
|
||||
_mm_xor_si128( M[ 2], H[ 2] ) ), \
|
||||
_mm_xor_si128( M[ 5], H[ 5] ) ), \
|
||||
_mm_xor_si128( M[ 9], H[ 9] ) )
|
||||
|
||||
#define W2b12 \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_add_epi64( _mm_xor_si128( M[ 1], H[ 1] ), \
|
||||
_mm_xor_si128( M[ 3], H[ 3] ) ), \
|
||||
_mm_xor_si128( M[ 6], H[ 6] ) ), \
|
||||
_mm_xor_si128( M[ 9], H[ 9] ) ), \
|
||||
_mm_xor_si128( M[10], H[10] ) )
|
||||
|
||||
#define W2b13 \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( _mm_xor_si128( M[ 2], H[ 2] ), \
|
||||
_mm_xor_si128( M[ 4], H[ 4] ) ), \
|
||||
_mm_xor_si128( M[ 7], H[ 7] ) ), \
|
||||
_mm_xor_si128( M[10], H[10] ) ), \
|
||||
_mm_xor_si128( M[11], H[11] ) )
|
||||
|
||||
#define W2b14 \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( _mm_xor_si128( M[ 3], H[ 3] ), \
|
||||
_mm_xor_si128( M[ 5], H[ 5] ) ), \
|
||||
_mm_xor_si128( M[ 8], H[ 8] ) ), \
|
||||
_mm_xor_si128( M[11], H[11] ) ), \
|
||||
_mm_xor_si128( M[12], H[12] ) )
|
||||
|
||||
#define W2b15 \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( _mm_xor_si128( M[12], H[12] ), \
|
||||
_mm_xor_si128( M[ 4], H[4] ) ), \
|
||||
_mm_xor_si128( M[ 6], H[ 6] ) ), \
|
||||
_mm_xor_si128( M[ 9], H[ 9] ) ), \
|
||||
_mm_xor_si128( M[13], H[13] ) )
|
||||
|
||||
|
||||
void compress_big_2way( const __m128i *M, const __m128i H[16],
|
||||
__m128i dH[16] )
|
||||
{
|
||||
__m128i qt[32], xl, xh;
|
||||
|
||||
qt[ 0] = _mm_add_epi64( s2b0( W2b0 ), H[ 1] );
|
||||
qt[ 1] = _mm_add_epi64( s2b1( W2b1 ), H[ 2] );
|
||||
qt[ 2] = _mm_add_epi64( s2b2( W2b2 ), H[ 3] );
|
||||
qt[ 3] = _mm_add_epi64( s2b3( W2b3 ), H[ 4] );
|
||||
qt[ 4] = _mm_add_epi64( s2b4( W2b4 ), H[ 5] );
|
||||
qt[ 5] = _mm_add_epi64( s2b0( W2b5 ), H[ 6] );
|
||||
qt[ 6] = _mm_add_epi64( s2b1( W2b6 ), H[ 7] );
|
||||
qt[ 7] = _mm_add_epi64( s2b2( W2b7 ), H[ 8] );
|
||||
qt[ 8] = _mm_add_epi64( s2b3( W2b8 ), H[ 9] );
|
||||
qt[ 9] = _mm_add_epi64( s2b4( W2b9 ), H[10] );
|
||||
qt[10] = _mm_add_epi64( s2b0( W2b10), H[11] );
|
||||
qt[11] = _mm_add_epi64( s2b1( W2b11), H[12] );
|
||||
qt[12] = _mm_add_epi64( s2b2( W2b12), H[13] );
|
||||
qt[13] = _mm_add_epi64( s2b3( W2b13), H[14] );
|
||||
qt[14] = _mm_add_epi64( s2b4( W2b14), H[15] );
|
||||
qt[15] = _mm_add_epi64( s2b0( W2b15), H[ 0] );
|
||||
qt[16] = expand1_2b( qt, M, H, 16 );
|
||||
qt[17] = expand1_2b( qt, M, H, 17 );
|
||||
qt[18] = expand2_2b( qt, M, H, 18 );
|
||||
qt[19] = expand2_2b( qt, M, H, 19 );
|
||||
qt[20] = expand2_2b( qt, M, H, 20 );
|
||||
qt[21] = expand2_2b( qt, M, H, 21 );
|
||||
qt[22] = expand2_2b( qt, M, H, 22 );
|
||||
qt[23] = expand2_2b( qt, M, H, 23 );
|
||||
qt[24] = expand2_2b( qt, M, H, 24 );
|
||||
qt[25] = expand2_2b( qt, M, H, 25 );
|
||||
qt[26] = expand2_2b( qt, M, H, 26 );
|
||||
qt[27] = expand2_2b( qt, M, H, 27 );
|
||||
qt[28] = expand2_2b( qt, M, H, 28 );
|
||||
qt[29] = expand2_2b( qt, M, H, 29 );
|
||||
qt[30] = expand2_2b( qt, M, H, 30 );
|
||||
qt[31] = expand2_2b( qt, M, H, 31 );
|
||||
|
||||
xl = _mm_xor_si128(
|
||||
_mm_xor_si128( _mm_xor_si128( qt[16], qt[17] ),
|
||||
_mm_xor_si128( qt[18], qt[19] ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( qt[20], qt[21] ),
|
||||
_mm_xor_si128( qt[22], qt[23] ) ) );
|
||||
xh = _mm_xor_si128( xl,
|
||||
_mm_xor_si128(
|
||||
_mm_xor_si128( _mm_xor_si128( qt[24], qt[25] ),
|
||||
_mm_xor_si128( qt[26], qt[27] ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( qt[28], qt[29] ),
|
||||
_mm_xor_si128( qt[30], qt[31] ) ) ) );
|
||||
|
||||
dH[ 0] = _mm_add_epi64(
|
||||
_mm_xor_si128( M[0],
|
||||
_mm_xor_si128( _mm_slli_epi64( xh, 5 ),
|
||||
_mm_srli_epi64( qt[16], 5 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[24] ), qt[ 0] ) );
|
||||
dH[ 1] = _mm_add_epi64(
|
||||
_mm_xor_si128( M[1],
|
||||
_mm_xor_si128( _mm_srli_epi64( xh, 7 ),
|
||||
_mm_slli_epi64( qt[17], 8 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[25] ), qt[ 1] ) );
|
||||
dH[ 2] = _mm_add_epi64(
|
||||
_mm_xor_si128( M[2],
|
||||
_mm_xor_si128( _mm_srli_epi64( xh, 5 ),
|
||||
_mm_slli_epi64( qt[18], 5 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[26] ), qt[ 2] ) );
|
||||
dH[ 3] = _mm_add_epi64(
|
||||
_mm_xor_si128( M[3],
|
||||
_mm_xor_si128( _mm_srli_epi64( xh, 1 ),
|
||||
_mm_slli_epi64( qt[19], 5 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[27] ), qt[ 3] ) );
|
||||
dH[ 4] = _mm_add_epi64(
|
||||
_mm_xor_si128( M[4],
|
||||
_mm_xor_si128( _mm_srli_epi64( xh, 3 ),
|
||||
_mm_slli_epi64( qt[20], 0 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[28] ), qt[ 4] ) );
|
||||
dH[ 5] = _mm_add_epi64(
|
||||
_mm_xor_si128( M[5],
|
||||
_mm_xor_si128( _mm_slli_epi64( xh, 6 ),
|
||||
_mm_srli_epi64( qt[21], 6 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[29] ), qt[ 5] ) );
|
||||
dH[ 6] = _mm_add_epi64(
|
||||
_mm_xor_si128( M[6],
|
||||
_mm_xor_si128( _mm_srli_epi64( xh, 4 ),
|
||||
_mm_slli_epi64( qt[22], 6 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[30] ), qt[ 6] ) );
|
||||
dH[ 7] = _mm_add_epi64(
|
||||
_mm_xor_si128( M[7],
|
||||
_mm_xor_si128( _mm_srli_epi64( xh, 11 ),
|
||||
_mm_slli_epi64( qt[23], 2 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[31] ), qt[ 7] ) );
|
||||
dH[ 8] = _mm_add_epi64( _mm_add_epi64(
|
||||
mm128_rol_64( dH[4], 9 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[24] ), M[ 8] ) ),
|
||||
_mm_xor_si128( _mm_slli_epi64( xl, 8 ),
|
||||
_mm_xor_si128( qt[23], qt[ 8] ) ) );
|
||||
dH[ 9] = _mm_add_epi64( _mm_add_epi64(
|
||||
mm128_rol_64( dH[5], 10 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[25] ), M[ 9] ) ),
|
||||
_mm_xor_si128( _mm_srli_epi64( xl, 6 ),
|
||||
_mm_xor_si128( qt[16], qt[ 9] ) ) );
|
||||
dH[10] = _mm_add_epi64( _mm_add_epi64(
|
||||
mm128_rol_64( dH[6], 11 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[26] ), M[10] ) ),
|
||||
_mm_xor_si128( _mm_slli_epi64( xl, 6 ),
|
||||
_mm_xor_si128( qt[17], qt[10] ) ) );
|
||||
dH[11] = _mm_add_epi64( _mm_add_epi64(
|
||||
mm128_rol_64( dH[7], 12 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[27] ), M[11] )),
|
||||
_mm_xor_si128( _mm_slli_epi64( xl, 4 ),
|
||||
_mm_xor_si128( qt[18], qt[11] ) ) );
|
||||
dH[12] = _mm_add_epi64( _mm_add_epi64(
|
||||
mm128_rol_64( dH[0], 13 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[28] ), M[12] ) ),
|
||||
_mm_xor_si128( _mm_srli_epi64( xl, 3 ),
|
||||
_mm_xor_si128( qt[19], qt[12] ) ) );
|
||||
dH[13] = _mm_add_epi64( _mm_add_epi64(
|
||||
mm128_rol_64( dH[1], 14 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[29] ), M[13] ) ),
|
||||
_mm_xor_si128( _mm_srli_epi64( xl, 4 ),
|
||||
_mm_xor_si128( qt[20], qt[13] ) ) );
|
||||
dH[14] = _mm_add_epi64( _mm_add_epi64(
|
||||
mm128_rol_64( dH[2], 15 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[30] ), M[14] ) ),
|
||||
_mm_xor_si128( _mm_srli_epi64( xl, 7 ),
|
||||
_mm_xor_si128( qt[21], qt[14] ) ) );
|
||||
dH[15] = _mm_add_epi64( _mm_add_epi64(
|
||||
mm128_rol_64( dH[3], 16 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[31] ), M[15] ) ),
|
||||
_mm_xor_si128( _mm_srli_epi64( xl, 2 ),
|
||||
_mm_xor_si128( qt[22], qt[15] ) ) );
|
||||
}
|
||||
|
||||
static const __m128i final_b2[16] =
|
||||
static const v128u64_t final_2x64[16] =
|
||||
{
|
||||
{ 0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0 },
|
||||
{ 0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0 },
|
||||
{ 0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1 },
|
||||
{ 0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1 },
|
||||
{ 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2 },
|
||||
{ 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2 },
|
||||
{ 0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3 },
|
||||
{ 0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3 },
|
||||
{ 0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4 },
|
||||
{ 0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4 },
|
||||
{ 0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5 },
|
||||
{ 0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5 },
|
||||
{ 0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6 },
|
||||
{ 0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6 },
|
||||
{ 0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7 },
|
||||
{ 0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8 },
|
||||
{ 0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9 },
|
||||
{ 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa },
|
||||
{ 0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab },
|
||||
{ 0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac },
|
||||
{ 0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad },
|
||||
{ 0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae },
|
||||
{ 0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf }
|
||||
};
|
||||
|
||||
void bmw512_2way_init( bmw_2way_big_context *ctx )
|
||||
#define xor4( v3, v2, v1, v0 ) \
|
||||
v128_xor( v128_xor( v3, v2 ), v128_xor( v1, v0 ) )
|
||||
|
||||
#define s2b0(x) \
|
||||
xor4( v128_sr64( (x), 1), v128_sl64( (x), 3), \
|
||||
v128_rol64( (x), 4), v128_rol64( (x),37) )
|
||||
|
||||
#define s2b1(x) \
|
||||
xor4( v128_sr64( (x), 1), v128_sl64( (x), 2), \
|
||||
v128_rol64( (x),13), v128_rol64( (x),43) )
|
||||
|
||||
#define s2b2(x) \
|
||||
xor4( v128_sr64( (x), 2), v128_sl64( (x), 1), \
|
||||
v128_rol64( (x),19), v128_rol64( (x),53) )
|
||||
|
||||
#define s2b3(x) \
|
||||
xor4( v128_sr64( (x), 2), v128_sl64( (x), 2), \
|
||||
v128_rol64( (x),28), v128_rol64( (x),59) )
|
||||
|
||||
#define s2b4(x) \
|
||||
v128_xor( (x), v128_sr64( (x), 1 ) )
|
||||
|
||||
#define s2b5(x) \
|
||||
v128_xor( (x), v128_sr64( (x), 2 ) )
|
||||
|
||||
#define r2b1(x) v128_rol64( x, 5 )
|
||||
#define r2b2(x) v128_rol64( x, 11 )
|
||||
#define r2b3(x) v128_rol64( x, 27 )
|
||||
#define r2b4(x) v128_rol64( x, 32 )
|
||||
#define r2b5(x) v128_rol64( x, 37 )
|
||||
#define r2b6(x) v128_rol64( x, 43 )
|
||||
#define r2b7(x) v128_rol64( x, 53 )
|
||||
|
||||
#define add_elt_b( mj0, mj3, mj10, h, K ) \
|
||||
v128_xor( h, v128_add64( K, \
|
||||
v128_sub64( v128_add64( mj0, mj3 ), mj10 ) ) )
|
||||
|
||||
#define expand1_b( qt, i ) \
|
||||
v128_add4_64( \
|
||||
v128_add4_64( s2b1( qt[ (i)-16 ] ), s2b2( qt[ (i)-15 ] ), \
|
||||
s2b3( qt[ (i)-14 ] ), s2b0( qt[ (i)-13 ] )), \
|
||||
v128_add4_64( s2b1( qt[ (i)-12 ] ), s2b2( qt[ (i)-11 ] ), \
|
||||
s2b3( qt[ (i)-10 ] ), s2b0( qt[ (i)- 9 ] )), \
|
||||
v128_add4_64( s2b1( qt[ (i)- 8 ] ), s2b2( qt[ (i)- 7 ] ), \
|
||||
s2b3( qt[ (i)- 6 ] ), s2b0( qt[ (i)- 5 ] )), \
|
||||
v128_add4_64( s2b1( qt[ (i)- 4 ] ), s2b2( qt[ (i)- 3 ] ), \
|
||||
s2b3( qt[ (i)- 2 ] ), s2b0( qt[ (i)- 1 ] ) ) )
|
||||
|
||||
#define expand2_b( qt, i) \
|
||||
v128_add4_64( \
|
||||
v128_add4_64( qt[ (i)-16 ], r2b1( qt[ (i)-15 ] ), \
|
||||
qt[ (i)-14 ], r2b2( qt[ (i)-13 ] ) ), \
|
||||
v128_add4_64( qt[ (i)-12 ], r2b3( qt[ (i)-11 ] ), \
|
||||
qt[ (i)-10 ], r2b4( qt[ (i)- 9 ] ) ), \
|
||||
v128_add4_64( qt[ (i)- 8 ], r2b5( qt[ (i)- 7 ] ), \
|
||||
qt[ (i)- 6 ], r2b6( qt[ (i)- 5 ] ) ), \
|
||||
v128_add4_64( qt[ (i)- 4 ], r2b7( qt[ (i)- 3 ] ), \
|
||||
s2b4( qt[ (i)- 2 ] ), s2b5( qt[ (i)- 1 ] ) ) )
|
||||
|
||||
#define W2b0 \
|
||||
v128_add64( \
|
||||
v128_add64( v128_sub64( mh[ 5], mh[ 7] ), mh[10] ), \
|
||||
v128_add64( mh[13], mh[14] ) )
|
||||
|
||||
#define W2b1 \
|
||||
v128_add64( \
|
||||
v128_add64( v128_sub64( mh[ 6], mh[ 8] ), mh[11] ), \
|
||||
v128_sub64( mh[14], mh[15] ) )
|
||||
|
||||
#define W2b2 \
|
||||
v128_sub64( \
|
||||
v128_add64( v128_add64( mh[ 0], mh[ 7] ), mh[ 9] ), \
|
||||
v128_sub64( mh[12], mh[15] ) )
|
||||
|
||||
#define W2b3 \
|
||||
v128_sub64( \
|
||||
v128_add64( v128_sub64( mh[ 0], mh[ 1] ), mh[ 8] ), \
|
||||
v128_sub64( mh[10], mh[13] ) )
|
||||
|
||||
#define W2b4 \
|
||||
v128_sub64( \
|
||||
v128_add64( v128_add64( mh[ 1], mh[ 2] ), mh[ 9] ), \
|
||||
v128_add64( mh[11], mh[14] ) )
|
||||
|
||||
#define W2b5 \
|
||||
v128_sub64( \
|
||||
v128_add64( v128_sub64( mh[ 3], mh[ 2] ), mh[10] ), \
|
||||
v128_sub64( mh[12], mh[15] ) )
|
||||
|
||||
#define W2b6 \
|
||||
v128_sub64( \
|
||||
v128_sub64( v128_sub64( mh[ 4], mh[ 0] ), mh[ 3] ), \
|
||||
v128_sub64( mh[11], mh[13] ) )
|
||||
|
||||
#define W2b7 \
|
||||
v128_sub64( \
|
||||
v128_sub64( v128_sub64( mh[ 1], mh[ 4] ), mh[ 5] ), \
|
||||
v128_add64( mh[12], mh[14] ) )
|
||||
|
||||
#define W2b8 \
|
||||
v128_add64( \
|
||||
v128_sub64( v128_sub64( mh[ 2], mh[ 5] ), mh[ 6] ), \
|
||||
v128_sub64( mh[13], mh[15] ) )
|
||||
|
||||
#define W2b9 \
|
||||
v128_sub64( \
|
||||
v128_add64( v128_sub64( mh[ 0], mh[ 3] ), mh[ 6] ), \
|
||||
v128_sub64( mh[ 7], mh[14] ) )
|
||||
|
||||
#define W2b10 \
|
||||
v128_sub64( \
|
||||
v128_sub64( v128_sub64( mh[ 8], mh[ 1] ), mh[ 4] ), \
|
||||
v128_sub64( mh[ 7], mh[15] ) )
|
||||
|
||||
#define W2b11 \
|
||||
v128_sub64( \
|
||||
v128_sub64( v128_sub64( mh[ 8], mh[ 0] ), mh[ 2] ), \
|
||||
v128_sub64( mh[ 5], mh[ 9] ) )
|
||||
|
||||
#define W2b12 \
|
||||
v128_sub64( \
|
||||
v128_sub64( v128_add64( mh[ 1], mh[ 3] ), mh[ 6] ), \
|
||||
v128_sub64( mh[ 9], mh[10] ) )
|
||||
|
||||
#define W2b13 \
|
||||
v128_add64( \
|
||||
v128_add64( v128_add64( mh[ 2], mh[ 4] ), mh[ 7] ), \
|
||||
v128_add64( mh[10], mh[11] ) )
|
||||
|
||||
#define W2b14 \
|
||||
v128_sub64( \
|
||||
v128_add64( v128_sub64( mh[ 3], mh[ 5] ), mh[ 8] ), \
|
||||
v128_add64( mh[11], mh[12] ) )
|
||||
|
||||
#define W2b15 \
|
||||
v128_sub64( \
|
||||
v128_sub64( v128_sub64( mh[12], mh[ 4] ), mh[ 6] ), \
|
||||
v128_sub64( mh[ 9], mh[13] ) )
|
||||
|
||||
void compress_2x64( const v128u64_t *M, const v128u64_t H[16], v128u64_t dH[16] )
|
||||
{
|
||||
ctx->H[ 0] = _mm_set1_epi64x( IV512[ 0] );
|
||||
ctx->H[ 1] = _mm_set1_epi64x( IV512[ 1] );
|
||||
ctx->H[ 2] = _mm_set1_epi64x( IV512[ 2] );
|
||||
ctx->H[ 3] = _mm_set1_epi64x( IV512[ 3] );
|
||||
ctx->H[ 4] = _mm_set1_epi64x( IV512[ 4] );
|
||||
ctx->H[ 5] = _mm_set1_epi64x( IV512[ 5] );
|
||||
ctx->H[ 6] = _mm_set1_epi64x( IV512[ 6] );
|
||||
ctx->H[ 7] = _mm_set1_epi64x( IV512[ 7] );
|
||||
ctx->H[ 8] = _mm_set1_epi64x( IV512[ 8] );
|
||||
ctx->H[ 9] = _mm_set1_epi64x( IV512[ 9] );
|
||||
ctx->H[10] = _mm_set1_epi64x( IV512[10] );
|
||||
ctx->H[11] = _mm_set1_epi64x( IV512[11] );
|
||||
ctx->H[12] = _mm_set1_epi64x( IV512[12] );
|
||||
ctx->H[13] = _mm_set1_epi64x( IV512[13] );
|
||||
ctx->H[14] = _mm_set1_epi64x( IV512[14] );
|
||||
ctx->H[15] = _mm_set1_epi64x( IV512[15] );
|
||||
ctx->ptr = 0;
|
||||
ctx->bit_count = 0;
|
||||
v128u64_t qt[32], xl, xh;
|
||||
v128u64_t mh[16];
|
||||
int i;
|
||||
v128u64_t K = v128_64( 16 * 0x0555555555555555ULL );
|
||||
const v128u64_t Kincr = v128_64( 0x0555555555555555ULL );
|
||||
|
||||
for ( i = 0; i < 16; i++ )
|
||||
mh[i] = v128_xor( M[i], H[i] );
|
||||
qt[ 0] = v128_add64( s2b0( W2b0 ), H[ 1] );
|
||||
qt[ 1] = v128_add64( s2b1( W2b1 ), H[ 2] );
|
||||
qt[ 2] = v128_add64( s2b2( W2b2 ), H[ 3] );
|
||||
qt[ 3] = v128_add64( s2b3( W2b3 ), H[ 4] );
|
||||
qt[ 4] = v128_add64( s2b4( W2b4 ), H[ 5] );
|
||||
qt[ 5] = v128_add64( s2b0( W2b5 ), H[ 6] );
|
||||
qt[ 6] = v128_add64( s2b1( W2b6 ), H[ 7] );
|
||||
qt[ 7] = v128_add64( s2b2( W2b7 ), H[ 8] );
|
||||
qt[ 8] = v128_add64( s2b3( W2b8 ), H[ 9] );
|
||||
qt[ 9] = v128_add64( s2b4( W2b9 ), H[10] );
|
||||
qt[10] = v128_add64( s2b0( W2b10), H[11] );
|
||||
qt[11] = v128_add64( s2b1( W2b11), H[12] );
|
||||
qt[12] = v128_add64( s2b2( W2b12), H[13] );
|
||||
qt[13] = v128_add64( s2b3( W2b13), H[14] );
|
||||
qt[14] = v128_add64( s2b4( W2b14), H[15] );
|
||||
qt[15] = v128_add64( s2b0( W2b15), H[ 0] );
|
||||
|
||||
v128u64_t mj[16];
|
||||
|
||||
mj[ 0] = v128_rol64( M[ 0], 1 );
|
||||
mj[ 1] = v128_rol64( M[ 1], 2 );
|
||||
mj[ 2] = v128_rol64( M[ 2], 3 );
|
||||
mj[ 3] = v128_rol64( M[ 3], 4 );
|
||||
mj[ 4] = v128_rol64( M[ 4], 5 );
|
||||
mj[ 5] = v128_rol64( M[ 5], 6 );
|
||||
mj[ 6] = v128_rol64( M[ 6], 7 );
|
||||
mj[ 7] = v128_rol64( M[ 7], 8 );
|
||||
mj[ 8] = v128_rol64( M[ 8], 9 );
|
||||
mj[ 9] = v128_rol64( M[ 9], 10 );
|
||||
mj[10] = v128_rol64( M[10], 11 );
|
||||
mj[11] = v128_rol64( M[11], 12 );
|
||||
mj[12] = v128_rol64( M[12], 13 );
|
||||
mj[13] = v128_rol64( M[13], 14 );
|
||||
mj[14] = v128_rol64( M[14], 15 );
|
||||
mj[15] = v128_rol64( M[15], 16 );
|
||||
|
||||
qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[17] = add_elt_b( mj[ 1], mj[ 4], mj[11], H[ 8], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[18] = add_elt_b( mj[ 2], mj[ 5], mj[12], H[ 9], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[19] = add_elt_b( mj[ 3], mj[ 6], mj[13], H[10], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[20] = add_elt_b( mj[ 4], mj[ 7], mj[14], H[11], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[21] = add_elt_b( mj[ 5], mj[ 8], mj[15], H[12], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[22] = add_elt_b( mj[ 6], mj[ 9], mj[ 0], H[13], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[23] = add_elt_b( mj[ 7], mj[10], mj[ 1], H[14], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[24] = add_elt_b( mj[ 8], mj[11], mj[ 2], H[15], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[25] = add_elt_b( mj[ 9], mj[12], mj[ 3], H[ 0], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[26] = add_elt_b( mj[10], mj[13], mj[ 4], H[ 1], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[27] = add_elt_b( mj[11], mj[14], mj[ 5], H[ 2], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[28] = add_elt_b( mj[12], mj[15], mj[ 6], H[ 3], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[29] = add_elt_b( mj[13], mj[ 0], mj[ 7], H[ 4], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[30] = add_elt_b( mj[14], mj[ 1], mj[ 8], H[ 5], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[31] = add_elt_b( mj[15], mj[ 2], mj[ 9], H[ 6], K );
|
||||
|
||||
qt[16] = v128_add64( qt[16], expand1_b( qt, 16 ) );
|
||||
qt[17] = v128_add64( qt[17], expand1_b( qt, 17 ) );
|
||||
qt[18] = v128_add64( qt[18], expand2_b( qt, 18 ) );
|
||||
qt[19] = v128_add64( qt[19], expand2_b( qt, 19 ) );
|
||||
qt[20] = v128_add64( qt[20], expand2_b( qt, 20 ) );
|
||||
qt[21] = v128_add64( qt[21], expand2_b( qt, 21 ) );
|
||||
qt[22] = v128_add64( qt[22], expand2_b( qt, 22 ) );
|
||||
qt[23] = v128_add64( qt[23], expand2_b( qt, 23 ) );
|
||||
qt[24] = v128_add64( qt[24], expand2_b( qt, 24 ) );
|
||||
qt[25] = v128_add64( qt[25], expand2_b( qt, 25 ) );
|
||||
qt[26] = v128_add64( qt[26], expand2_b( qt, 26 ) );
|
||||
qt[27] = v128_add64( qt[27], expand2_b( qt, 27 ) );
|
||||
qt[28] = v128_add64( qt[28], expand2_b( qt, 28 ) );
|
||||
qt[29] = v128_add64( qt[29], expand2_b( qt, 29 ) );
|
||||
qt[30] = v128_add64( qt[30], expand2_b( qt, 30 ) );
|
||||
qt[31] = v128_add64( qt[31], expand2_b( qt, 31 ) );
|
||||
|
||||
xl = v128_xor3( v128_xor3( qt[16], qt[17], qt[18] ),
|
||||
v128_xor3( qt[19], qt[20], qt[21] ),
|
||||
v128_xor( qt[22], qt[23] ) );
|
||||
|
||||
xh = v128_xor3( v128_xor3( xl, qt[24], qt[25] ),
|
||||
v128_xor3( qt[26], qt[27], qt[28] ),
|
||||
v128_xor3( qt[29], qt[30], qt[31] ) );
|
||||
|
||||
#define DH1L( m, sl, sr, a, b, c ) \
|
||||
v128_add64( \
|
||||
v128_xor( M[m], \
|
||||
v128_xor( v128_sl64( xh, sl ), \
|
||||
v128_sr64( qt[a], sr ) ) ), \
|
||||
v128_xor( v128_xor( xl, qt[b] ), qt[c] ) )
|
||||
|
||||
#define DH1R( m, sl, sr, a, b, c ) \
|
||||
v128_add64( \
|
||||
v128_xor( M[m], \
|
||||
v128_xor( v128_sr64( xh, sl ), \
|
||||
v128_sl64( qt[a], sr ) ) ), \
|
||||
v128_xor( v128_xor( xl, qt[b] ), qt[c] ) )
|
||||
|
||||
#define DH2L( m, rl, sl, h, a, b, c ) \
|
||||
v128_add64( v128_add64( \
|
||||
v128_rol64( dH[h], rl ), \
|
||||
v128_xor( v128_xor( xh, qt[a] ), M[m] )), \
|
||||
v128_xor( v128_sl64( xl, sl ), \
|
||||
v128_xor( qt[b], qt[c] ) ) );
|
||||
|
||||
#define DH2R( m, rl, sr, h, a, b, c ) \
|
||||
v128_add64( v128_add64( \
|
||||
v128_rol64( dH[h], rl ), \
|
||||
v128_xor( v128_xor( xh, qt[a] ), M[m] )), \
|
||||
v128_xor( v128_sr64( xl, sr ), \
|
||||
v128_xor( qt[b], qt[c] ) ) );
|
||||
|
||||
dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
|
||||
dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 );
|
||||
dH[ 2] = DH1R( 2, 5, 5, 18, 26, 2 );
|
||||
dH[ 3] = DH1R( 3, 1, 5, 19, 27, 3 );
|
||||
dH[ 4] = DH1R( 4, 3, 0, 20, 28, 4 );
|
||||
dH[ 5] = DH1L( 5, 6, 6, 21, 29, 5 );
|
||||
dH[ 6] = DH1R( 6, 4, 6, 22, 30, 6 );
|
||||
dH[ 7] = DH1R( 7, 11, 2, 23, 31, 7 );
|
||||
dH[ 8] = DH2L( 8, 9, 8, 4, 24, 23, 8 );
|
||||
dH[ 9] = DH2R( 9, 10, 6, 5, 25, 16, 9 );
|
||||
dH[10] = DH2L( 10, 11, 6, 6, 26, 17, 10 );
|
||||
dH[11] = DH2L( 11, 12, 4, 7, 27, 18, 11 );
|
||||
dH[12] = DH2R( 12, 13, 3, 0, 28, 19, 12 );
|
||||
dH[13] = DH2R( 13, 14, 4, 1, 29, 20, 13 );
|
||||
dH[14] = DH2R( 14, 15, 7, 2, 30, 21, 14 );
|
||||
dH[15] = DH2R( 15, 16, 2, 3, 31, 22, 15 );
|
||||
|
||||
#undef DH1L
|
||||
#undef DH1R
|
||||
#undef DH2L
|
||||
#undef DH2R
|
||||
}
|
||||
static void
|

bmw64_2way_init( bmw_2way_big_context *sc, const uint64_t *iv )
{
sc->H[ 0] = v128_64( 0x8081828384858687 );
sc->H[ 1] = v128_64( 0x88898A8B8C8D8E8F );
sc->H[ 2] = v128_64( 0x9091929394959697 );
sc->H[ 3] = v128_64( 0x98999A9B9C9D9E9F );
sc->H[ 4] = v128_64( 0xA0A1A2A3A4A5A6A7 );
sc->H[ 5] = v128_64( 0xA8A9AAABACADAEAF );
sc->H[ 6] = v128_64( 0xB0B1B2B3B4B5B6B7 );
sc->H[ 7] = v128_64( 0xB8B9BABBBCBDBEBF );
sc->H[ 8] = v128_64( 0xC0C1C2C3C4C5C6C7 );
sc->H[ 9] = v128_64( 0xC8C9CACBCCCDCECF );
sc->H[10] = v128_64( 0xD0D1D2D3D4D5D6D7 );
sc->H[11] = v128_64( 0xD8D9DADBDCDDDEDF );
sc->H[12] = v128_64( 0xE0E1E2E3E4E5E6E7 );
sc->H[13] = v128_64( 0xE8E9EAEBECEDEEEF );
sc->H[14] = v128_64( 0xF0F1F2F3F4F5F6F7 );
sc->H[15] = v128_64( 0xF8F9FAFBFCFDFEFF );
sc->ptr = 0;
sc->bit_count = 0;
}

void bmw512_2way( bmw_2way_big_context *ctx, const void *data, size_t len )
static void
bmw64_2way( bmw_2way_big_context *sc, const void *data, size_t len )
{
__m128i *buf = (__m128i*)ctx->buf;
__m128i htmp[16];
__m128i *h1 = ctx->H;
__m128i *h2 = htmp;
size_t blen = len << 1;
size_t ptr = ctx->ptr;
size_t bptr = ctx->ptr << 1;
size_t vptr = ctx->ptr >> 3;
// const int buf_size = 128; // bytes of one lane, compatible with len
v128u64_t *vdata = (v128u64_t*)data;
v128u64_t *buf;
v128u64_t htmp[16];
v128u64_t *h1, *h2;
size_t ptr;
const int buf_size = 128; // bytes of one lane, compatible with len

ctx->bit_count += len << 3;
while ( blen > 0 )
sc->bit_count += (uint64_t)len << 3;
buf = sc->buf;
ptr = sc->ptr;
h1 = sc->H;
h2 = htmp;
while ( len > 0 )
{
size_t clen = (sizeof ctx->buf ) - bptr;
if ( clen > blen )
clen = blen;
memcpy( buf + vptr, data, clen );
bptr += clen;
vptr = bptr >> 4;
data = (const unsigned char *)data + clen;
blen -= clen;
if ( ptr == (sizeof ctx->buf ) )
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
v128_memcpy( buf + (ptr>>3), vdata, clen >> 3 );
vdata = vdata + (clen>>3);
len -= clen;
ptr += clen;
if ( ptr == buf_size )
{
__m128i *ht;
compress_big_2way( buf, h1, h2 );
v128u64_t *ht;
compress_2x64( buf, h1, h2 );
ht = h1;
h1 = h2;
h2 = ht;
ptr = 0;
}
}
ctx->ptr = ptr;
if ( h1 != ctx->H )
memcpy_128( ctx->H, h1, 16 );
sc->ptr = ptr;
if ( h1 != sc->H )
v128_memcpy( sc->H, h1, 16 );
}

void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
static void
bmw64_2way_close( bmw_2way_big_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w64)
{
__m128i h1[16], h2[16], *h;
__m128i *buf = (__m128i*)ctx->buf;
size_t vptr = ctx->ptr >> 3;
// unsigned bit_len = ( (unsigned)(ctx->ptr) << 1 );
v128u64_t *buf;
v128u64_t h1[16], h2[16], *h;
size_t ptr, u, v;
const int buf_size = 128; // bytes of one lane, compatible with len

buf[ vptr++ ] = _mm_set1_epi64x( 0x80 );
h = ctx->H;
buf = sc->buf;
ptr = sc->ptr;
buf[ ptr>>3 ] = v128_64( 0x80 );
ptr += 8;
h = sc->H;

if ( vptr == 16 )
if ( ptr > (buf_size - 8) )
{
compress_big_2way( buf, h, h1 );
vptr = 0;
v128_memset_zero( buf + (ptr>>3), (buf_size - ptr) >> 3 );
compress_2x64( buf, h, h1 );
ptr = 0;
h = h1;
}
memset_zero_128( buf + vptr, 16 - vptr - 1 );
buf[ 15 ] = _mm_set1_epi64x( ctx->bit_count );
compress_big_2way( buf, h, h2 );
memcpy_128( buf, h2, 16 );
compress_big_2way( buf, final_b2, h1 );
memcpy( (__m128i*)dst, h1+8, 8 );
v128_memset_zero( buf + (ptr>>3), (buf_size - 8 - ptr) >> 3 );
buf[ (buf_size - 8) >> 3 ] = v128_64( sc->bit_count + n );
compress_2x64( buf, h, h2 );
for ( u = 0; u < 16; u ++ )
buf[u] = h2[u];
compress_2x64( buf, (const v128u64_t*)final_2x64, h1 );
for (u = 0, v = 16 - out_size_w64; u < out_size_w64; u ++, v ++)
casti_v128u64(dst,u) = h1[v];
}
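// A worked sketch of the close-step arithmetic above, assuming a 64-byte
// update per lane: after the update ptr == 64; the v128_64( 0x80 ) pad word
// raises it to 72, which is still <= buf_size - 8 == 120, so no extra
// compression is triggered.  The remaining (128 - 8 - 72) >> 3 == 6 vectors
// are zeroed and the bit count (64 * 8 == 512, plus n) lands in
// buf[ (128 - 8) >> 3 ] == buf[15] before the two final compress_2x64 calls.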

#endif // __SSE2__
void
bmw512_2x64_init( bmw512_2x64_context *cc )
{
bmw64_2way_init( cc, IV512 );
}

void
bmw512_2x64_update( bmw512_2x64_context *cc, const void *data, size_t len )
{
bmw64_2way( cc, data, len );
}

void
bmw512_2x64_close( bmw512_2x64_context *cc, void *dst )
{
bmw64_2way_close( cc, 0, 0, dst, 8 );
}

void bmw512_2x64_ctx( bmw512_2x64_context *ctx, void *dst, const void *data,
size_t len )
{
bmw512_2x64_init( ctx );
bmw512_2x64_update( ctx, data, len );
bmw512_2x64_close( ctx, dst );
}

void bmw512_2x64( void *dst, const void *data, size_t len )
{
bmw512_2x64_context ctx;
bmw512_2x64_init( &ctx );
bmw512_2x64_update( &ctx, data, len );
bmw512_2x64_close( &ctx, dst );
}
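// A minimal usage sketch for the 2x64 interface above, assuming the lane
// layout implied by the v128u64_t pointers: 64-bit word i of lane 0 sits in
// the low half and word i of lane 1 in the high half of each 128-bit
// element, and len counts bytes per lane.  The helper name and the 80-byte
// block-header size are illustrative only; uint64_t comes from <stdint.h>.
static void bmw512_hash_two_headers( void *out0, void *out1,
                                     const void *in0, const void *in1 )
{
   uint64_t vin [ 2*10 ] __attribute__ ((aligned (32)));  // 2 x 80 bytes, interleaved
   uint64_t vout[ 2*8  ] __attribute__ ((aligned (32)));  // 2 x 64 byte digests
   for ( int i = 0; i < 10; i++ )
   {
      vin[ 2*i     ] = ((const uint64_t*)in0)[i];   // lane 0
      vin[ 2*i + 1 ] = ((const uint64_t*)in1)[i];   // lane 1
   }
   bmw512_2x64( vout, vin, 80 );                    // 80 bytes per lane
   for ( int i = 0; i < 8; i++ )
   {
      ((uint64_t*)out0)[i] = vout[ 2*i     ];
      ((uint64_t*)out1)[i] = vout[ 2*i + 1 ];
   }
}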

#undef add_elt_b
#undef expand1_b
#undef expand2_b

#if defined(__AVX2__)

@@ -1472,13 +1408,15 @@ void bmw512_8way_close( bmw512_8way_context *ctx, void *dst )
void bmw512_8way_full( bmw512_8way_context *ctx, void *out, const void *data,
size_t len )
{
__m512i h1[16];
__m512i h2[16];
__m512i *vdata = (__m512i*)data;
__m512i *buf = ctx->buf;
__m512i htmp[16];
__m512i *H = ctx->H;
__m512i *h2 = htmp;
__m512i *hptr = h2;
uint64_t bit_count = len * 8;
size_t ptr = 0;
size_t u, v;
const int buf_size = 128; // bytes of one lane, compatible with len

// Init
@@ -1515,10 +1453,10 @@ void bmw512_8way_full( bmw512_8way_context *ctx, void *out, const void *data,
if ( ptr == buf_size )
{
__m512i *ht;
compress_big_8way( buf, H, h2 );
compress_big_8way( buf, H, hptr );
ht = H;
H = h2;
h2 = ht;
H = hptr;
hptr = ht;
ptr = 0;
}
}
@@ -1526,9 +1464,6 @@ void bmw512_8way_full( bmw512_8way_context *ctx, void *out, const void *data,
memcpy_512( ctx->H, H, 16 );

// Close
{
__m512i h1[16], h2[16];
size_t u, v;

buf[ ptr>>3 ] = _mm512_set1_epi64( 0x80 );
ptr += 8;
@@ -1546,19 +1481,10 @@ void bmw512_8way_full( bmw512_8way_context *ctx, void *out, const void *data,
for ( u = 0; u < 16; u ++ )
buf[ u ] = h2[ u ];
compress_big_8way( buf, final_b8, h1 );
for (u = 0, v = 8; u < 8; u ++, v ++)
for ( u = 0, v = 8; u < 8; u ++, v ++ )
casti_m512i( out, u ) = h1[ v ];
}



}



#endif // AVX512

#ifdef __cplusplus
}
#endif


@@ -75,8 +75,6 @@ static void transform( cubehashParam *sp )
|
||||
|
||||
#else // AVX, SSE2, NEON
|
||||
|
||||
#pragma message "NEON for Cubehash"
|
||||
|
||||
v128_t x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3;
|
||||
|
||||
x0 = casti_v128( sp->x, 0 );
|
||||
@@ -175,25 +173,25 @@ int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
|
||||
if ( hashbitlen == 512 )
|
||||
{
|
||||
|
||||
x[0] = v128_set_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
|
||||
x[1] = v128_set_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
|
||||
x[2] = v128_set_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
|
||||
x[3] = v128_set_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
|
||||
x[4] = v128_set_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
|
||||
x[5] = v128_set_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
|
||||
x[6] = v128_set_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
|
||||
x[7] = v128_set_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
|
||||
x[0] = v128_set64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
|
||||
x[1] = v128_set64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
|
||||
x[2] = v128_set64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
|
||||
x[3] = v128_set64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
|
||||
x[4] = v128_set64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
|
||||
x[5] = v128_set64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
|
||||
x[6] = v128_set64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
|
||||
x[7] = v128_set64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
|
||||
}
|
||||
else
|
||||
{
|
||||
x[0] = v128_set_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
|
||||
x[1] = v128_set_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
|
||||
x[2] = v128_set_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
|
||||
x[3] = v128_set_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
|
||||
x[4] = v128_set_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
|
||||
x[5] = v128_set_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
|
||||
x[6] = v128_set_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
|
||||
x[7] = v128_set_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
|
||||
x[0] = v128_set64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
|
||||
x[1] = v128_set64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
|
||||
x[2] = v128_set64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
|
||||
x[3] = v128_set64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
|
||||
x[4] = v128_set64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
|
||||
x[5] = v128_set64( 0x93CB628565C892FD, 0x5FA2560309392549 );
|
||||
x[6] = v128_set64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
|
||||
x[7] = v128_set64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
|
||||
}
|
||||
|
||||
return 0;
|
||||
@@ -229,10 +227,10 @@ int cubehashDigest( cubehashParam *sp, void *digest )
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ],
|
||||
v128_set_64( 0, 0x80 ) );
|
||||
v128_set64( 0, 0x80 ) );
|
||||
transform( sp );
|
||||
|
||||
sp->x[7] = v128_xor( sp->x[7], v128_set_64( 0x100000000, 0 ) );
|
||||
sp->x[7] = v128_xor( sp->x[7], v128_set64( 0x100000000, 0 ) );
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
@@ -274,10 +272,10 @@ int cubehashUpdateDigest( cubehashParam *sp, void *digest,
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ],
|
||||
v128_set_64( 0, 0x80 ) );
|
||||
v128_set64( 0, 0x80 ) );
|
||||
transform( sp );
|
||||
|
||||
sp->x[7] = v128_xor( sp->x[7], v128_set_64( 0x100000000, 0 ) );
|
||||
sp->x[7] = v128_xor( sp->x[7], v128_set64( 0x100000000, 0 ) );
|
||||
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
@@ -308,37 +306,34 @@ int cubehash_full( cubehashParam *sp, void *digest, int hashbitlen,
|
||||
if ( hashbitlen == 512 )
|
||||
{
|
||||
|
||||
x[0] = v128_set_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
|
||||
x[1] = v128_set_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
|
||||
x[2] = v128_set_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
|
||||
x[3] = v128_set_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
|
||||
x[4] = v128_set_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
|
||||
x[5] = v128_set_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
|
||||
x[6] = v128_set_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
|
||||
x[7] = v128_set_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
|
||||
x[0] = v128_set64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
|
||||
x[1] = v128_set64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
|
||||
x[2] = v128_set64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
|
||||
x[3] = v128_set64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
|
||||
x[4] = v128_set64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
|
||||
x[5] = v128_set64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
|
||||
x[6] = v128_set64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
|
||||
x[7] = v128_set64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
|
||||
}
|
||||
else
|
||||
{
|
||||
x[0] = v128_set_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
|
||||
x[1] = v128_set_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
|
||||
x[2] = v128_set_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
|
||||
x[3] = v128_set_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
|
||||
x[4] = v128_set_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
|
||||
x[5] = v128_set_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
|
||||
x[6] = v128_set_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
|
||||
x[7] = v128_set_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
|
||||
x[0] = v128_set64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
|
||||
x[1] = v128_set64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
|
||||
x[2] = v128_set64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
|
||||
x[3] = v128_set64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
|
||||
x[4] = v128_set64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
|
||||
x[5] = v128_set64( 0x93CB628565C892FD, 0x5FA2560309392549 );
|
||||
x[6] = v128_set64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
|
||||
x[7] = v128_set64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
const int len = size / 16;
|
||||
const v128_t* in = (v128_t*)data;
|
||||
v128_t* hash = (v128_t*)digest;
|
||||
int i;
|
||||
|
||||
// It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
|
||||
// Current usage sata is either 64 or 80 bytes.
|
||||
// Current usage data is either 64 or 80 bytes.
|
||||
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
@@ -353,10 +348,10 @@ int cubehash_full( cubehashParam *sp, void *digest, int hashbitlen,
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ],
|
||||
v128_set_64( 0, 0x80 ) );
|
||||
v128_set64( 0, 0x80 ) );
|
||||
transform( sp );
|
||||
|
||||
sp->x[7] = v128_xor( sp->x[7], v128_set_64( 0x100000000, 0 ) );
|
||||
sp->x[7] = v128_xor( sp->x[7], v128_set64( 0x100000000, 0 ) );
|
||||
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
|
||||
@@ -14,15 +14,11 @@
|
||||
*
|
||||
*/
|
||||
|
||||
//TODO NEON support, funky shuffles
|
||||
|
||||
#if defined(__AES__)
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
|
||||
#include <memory.h>
|
||||
#include "miner.h"
|
||||
#include "hash_api.h"
|
||||
//#include "vperm.h"
|
||||
#include <immintrin.h>
|
||||
#include "simd-utils.h"
|
||||
|
||||
MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
|
||||
@@ -57,61 +53,61 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
|
||||
|
||||
|
||||
#define ECHO_SUBBYTES4(state, j) \
|
||||
state[0][j] = _mm_aesenc_si128(state[0][j], k1);\
|
||||
k1 = _mm_add_epi32(k1, M128(const1));\
|
||||
state[1][j] = _mm_aesenc_si128(state[1][j], k1);\
|
||||
k1 = _mm_add_epi32(k1, M128(const1));\
|
||||
state[2][j] = _mm_aesenc_si128(state[2][j], k1);\
|
||||
k1 = _mm_add_epi32(k1, M128(const1));\
|
||||
state[3][j] = _mm_aesenc_si128(state[3][j], k1);\
|
||||
k1 = _mm_add_epi32(k1, M128(const1));\
|
||||
state[0][j] = _mm_aesenc_si128(state[0][j], m128_zero ); \
|
||||
state[1][j] = _mm_aesenc_si128(state[1][j], m128_zero ); \
|
||||
state[2][j] = _mm_aesenc_si128(state[2][j], m128_zero ); \
|
||||
state[3][j] = _mm_aesenc_si128(state[3][j], m128_zero )
|
||||
state[0][j] = v128_aesenc(state[0][j], k1);\
|
||||
k1 = v128_add32(k1, cast_v128(const1));\
|
||||
state[1][j] = v128_aesenc(state[1][j], k1);\
|
||||
k1 = v128_add32(k1, cast_v128(const1));\
|
||||
state[2][j] = v128_aesenc(state[2][j], k1);\
|
||||
k1 = v128_add32(k1, cast_v128(const1));\
|
||||
state[3][j] = v128_aesenc(state[3][j], k1);\
|
||||
k1 = v128_add32(k1, cast_v128(const1));\
|
||||
state[0][j] = v128_aesenc(state[0][j], v128_zero ); \
|
||||
state[1][j] = v128_aesenc(state[1][j], v128_zero ); \
|
||||
state[2][j] = v128_aesenc(state[2][j], v128_zero ); \
|
||||
state[3][j] = v128_aesenc(state[3][j], v128_zero )
|
||||
|
||||
#define ECHO_SUBBYTES(state, i, j) \
|
||||
state[i][j] = _mm_aesenc_si128(state[i][j], k1);\
|
||||
k1 = _mm_add_epi32(k1, M128(const1));\
|
||||
state[i][j] = _mm_aesenc_si128(state[i][j], M128(zero))
|
||||
state[i][j] = v128_aesenc(state[i][j], k1);\
|
||||
k1 = v128_add32(k1, cast_v128(const1));\
|
||||
state[i][j] = v128_aesenc(state[i][j], cast_v128(zero))
|
||||
|
||||
#define ECHO_MIXBYTES(state1, state2, j, t1, t2, s2) \
|
||||
s2 = _mm_add_epi8(state1[0][j], state1[0][j]);\
|
||||
t1 = _mm_srli_epi16(state1[0][j], 7);\
|
||||
t1 = _mm_and_si128(t1, M128(lsbmask));\
|
||||
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
|
||||
s2 = _mm_xor_si128(s2, t2);\
|
||||
s2 = v128_add8(state1[0][j], state1[0][j]);\
|
||||
t1 = v128_sr16(state1[0][j], 7);\
|
||||
t1 = v128_and(t1, cast_v128(lsbmask));\
|
||||
t2 = v128_shuffle8(cast_v128(mul2mask), t1);\
|
||||
s2 = v128_xor(s2, t2);\
|
||||
state2[0][j] = s2;\
|
||||
state2[1][j] = state1[0][j];\
|
||||
state2[2][j] = state1[0][j];\
|
||||
state2[3][j] = _mm_xor_si128(s2, state1[0][j]);\
|
||||
s2 = _mm_add_epi8(state1[1][(j + 1) & 3], state1[1][(j + 1) & 3]);\
|
||||
t1 = _mm_srli_epi16(state1[1][(j + 1) & 3], 7);\
|
||||
t1 = _mm_and_si128(t1, M128(lsbmask));\
|
||||
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
|
||||
s2 = _mm_xor_si128(s2, t2);\
|
||||
state2[0][j] = mm128_xor3(state2[0][j], s2, state1[1][(j + 1) & 3] );\
|
||||
state2[1][j] = _mm_xor_si128(state2[1][j], s2);\
|
||||
state2[2][j] = _mm_xor_si128(state2[2][j], state1[1][(j + 1) & 3]);\
|
||||
state2[3][j] = _mm_xor_si128(state2[3][j], state1[1][(j + 1) & 3]);\
|
||||
s2 = _mm_add_epi8(state1[2][(j + 2) & 3], state1[2][(j + 2) & 3]);\
|
||||
t1 = _mm_srli_epi16(state1[2][(j + 2) & 3], 7);\
|
||||
t1 = _mm_and_si128(t1, M128(lsbmask));\
|
||||
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
|
||||
s2 = _mm_xor_si128(s2, t2);\
|
||||
state2[0][j] = _mm_xor_si128(state2[0][j], state1[2][(j + 2) & 3]);\
|
||||
state2[1][j] = mm128_xor3(state2[1][j], s2, state1[2][(j + 2) & 3] );\
|
||||
state2[2][j] = _mm_xor_si128(state2[2][j], s2);\
|
||||
state2[3][j] = _mm_xor_si128(state2[3][j], state1[2][(j + 2) & 3]);\
|
||||
s2 = _mm_add_epi8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\
|
||||
t1 = _mm_srli_epi16(state1[3][(j + 3) & 3], 7);\
|
||||
t1 = _mm_and_si128(t1, M128(lsbmask));\
|
||||
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
|
||||
s2 = _mm_xor_si128(s2, t2);\
|
||||
state2[0][j] = _mm_xor_si128(state2[0][j], state1[3][(j + 3) & 3]);\
|
||||
state2[1][j] = _mm_xor_si128(state2[1][j], state1[3][(j + 3) & 3]);\
|
||||
state2[2][j] = mm128_xor3(state2[2][j], s2, state1[3][(j + 3) & 3] );\
|
||||
state2[3][j] = _mm_xor_si128(state2[3][j], s2)
|
||||
state2[3][j] = v128_xor(s2, state1[0][j]);\
|
||||
s2 = v128_add8(state1[1][(j + 1) & 3], state1[1][(j + 1) & 3]);\
|
||||
t1 = v128_sr16(state1[1][(j + 1) & 3], 7);\
|
||||
t1 = v128_and(t1, cast_v128(lsbmask));\
|
||||
t2 = v128_shuffle8(cast_v128(mul2mask), t1);\
|
||||
s2 = v128_xor(s2, t2);\
|
||||
state2[0][j] = v128_xor3(state2[0][j], s2, state1[1][(j + 1) & 3] );\
|
||||
state2[1][j] = v128_xor(state2[1][j], s2);\
|
||||
state2[2][j] = v128_xor(state2[2][j], state1[1][(j + 1) & 3]);\
|
||||
state2[3][j] = v128_xor(state2[3][j], state1[1][(j + 1) & 3]);\
|
||||
s2 = v128_add8(state1[2][(j + 2) & 3], state1[2][(j + 2) & 3]);\
|
||||
t1 = v128_sr16(state1[2][(j + 2) & 3], 7);\
|
||||
t1 = v128_and(t1, cast_v128(lsbmask));\
|
||||
t2 = v128_shuffle8(cast_v128(mul2mask), t1);\
|
||||
s2 = v128_xor(s2, t2);\
|
||||
state2[0][j] = v128_xor(state2[0][j], state1[2][(j + 2) & 3]);\
|
||||
state2[1][j] = v128_xor3(state2[1][j], s2, state1[2][(j + 2) & 3] );\
|
||||
state2[2][j] = v128_xor(state2[2][j], s2);\
|
||||
state2[3][j] = v128_xor(state2[3][j], state1[2][(j + 2) & 3]);\
|
||||
s2 = v128_add8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\
|
||||
t1 = v128_sr16(state1[3][(j + 3) & 3], 7);\
|
||||
t1 = v128_and(t1, cast_v128(lsbmask));\
|
||||
t2 = v128_shuffle8(cast_v128(mul2mask), t1);\
|
||||
s2 = v128_xor(s2, t2);\
|
||||
state2[0][j] = v128_xor(state2[0][j], state1[3][(j + 3) & 3]);\
|
||||
state2[1][j] = v128_xor(state2[1][j], state1[3][(j + 3) & 3]);\
|
||||
state2[2][j] = v128_xor3(state2[2][j], s2, state1[3][(j + 3) & 3] );\
|
||||
state2[3][j] = v128_xor(state2[3][j], s2)
|
||||
|
||||
|
||||
#define ECHO_ROUND_UNROLL2 \
|
||||
@@ -199,8 +195,8 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
|
||||
void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
|
||||
{
|
||||
unsigned int r, b, i, j;
|
||||
__m128i t1, t2, s2, k1;
|
||||
__m128i _state[4][4], _state2[4][4], _statebackup[4][4];
|
||||
v128_t t1, t2, s2, k1;
|
||||
v128_t _state[4][4], _state2[4][4], _statebackup[4][4];
|
||||
|
||||
for(i = 0; i < 4; i++)
|
||||
for(j = 0; j < ctx->uHashSize / 256; j++)
|
||||
@@ -208,14 +204,14 @@ void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBloc
|
||||
|
||||
for(b = 0; b < uBlockCount; b++)
|
||||
{
|
||||
ctx->k = _mm_add_epi64(ctx->k, ctx->const1536);
|
||||
ctx->k = v128_add64(ctx->k, ctx->const1536);
|
||||
|
||||
// load message
|
||||
for(j = ctx->uHashSize / 256; j < 4; j++)
|
||||
{
|
||||
for(i = 0; i < 4; i++)
|
||||
{
|
||||
_state[i][j] = _mm_load_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i);
|
||||
_state[i][j] = v128_load((v128_t*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -233,25 +229,25 @@ void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBloc
|
||||
{
|
||||
for(i = 0; i < 4; i++)
|
||||
{
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][3]);
|
||||
_state[i][0] = v128_xor(_state[i][0], _state[i][1]);
|
||||
_state[i][0] = v128_xor(_state[i][0], _state[i][2]);
|
||||
_state[i][0] = v128_xor(_state[i][0], _state[i][3]);
|
||||
_state[i][0] = v128_xor(_state[i][0], _statebackup[i][0]);
|
||||
_state[i][0] = v128_xor(_state[i][0], _statebackup[i][1]);
|
||||
_state[i][0] = v128_xor(_state[i][0], _statebackup[i][2]);
|
||||
_state[i][0] = v128_xor(_state[i][0], _statebackup[i][3]);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(i = 0; i < 4; i++)
|
||||
{
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
|
||||
_state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
|
||||
_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]);
|
||||
_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]);
|
||||
_state[i][0] = v128_xor(_state[i][0], _state[i][2]);
|
||||
_state[i][1] = v128_xor(_state[i][1], _state[i][3]);
|
||||
_state[i][0] = v128_xor(_state[i][0], _statebackup[i][0]);
|
||||
_state[i][0] = v128_xor(_state[i][0], _statebackup[i][2]);
|
||||
_state[i][1] = v128_xor(_state[i][1], _statebackup[i][1]);
|
||||
_state[i][1] = v128_xor(_state[i][1], _statebackup[i][3]);
|
||||
}
|
||||
}
|
||||
pmsg += ctx->uBlockLength;
|
||||
@@ -266,7 +262,7 @@ HashReturn init_echo(hashState_echo *ctx, int nHashSize)
|
||||
{
|
||||
int i, j;
|
||||
|
||||
ctx->k = _mm_setzero_si128();
|
||||
ctx->k = v128_zero;
|
||||
ctx->processed_bits = 0;
|
||||
ctx->uBufferBytes = 0;
|
||||
|
||||
@@ -276,16 +272,16 @@ HashReturn init_echo(hashState_echo *ctx, int nHashSize)
|
||||
ctx->uHashSize = 256;
|
||||
ctx->uBlockLength = 192;
|
||||
ctx->uRounds = 8;
|
||||
ctx->hashsize = _mm_set_epi32(0, 0, 0, 0x00000100);
|
||||
ctx->const1536 = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000600);
|
||||
ctx->hashsize = v128_set32(0, 0, 0, 0x00000100);
|
||||
ctx->const1536 = v128_set32(0x00000000, 0x00000000, 0x00000000, 0x00000600);
|
||||
break;
|
||||
|
||||
case 512:
|
||||
ctx->uHashSize = 512;
|
||||
ctx->uBlockLength = 128;
|
||||
ctx->uRounds = 10;
|
||||
ctx->hashsize = _mm_set_epi32(0, 0, 0, 0x00000200);
|
||||
ctx->const1536 = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000400);
|
||||
ctx->hashsize = v128_set32(0, 0, 0, 0x00000200);
|
||||
ctx->const1536 = v128_set32(0x00000000, 0x00000000, 0x00000000, 0x00000400);
|
||||
break;
|
||||
|
||||
default:
|
||||
@@ -299,7 +295,7 @@ HashReturn init_echo(hashState_echo *ctx, int nHashSize)
|
||||
|
||||
for(i = 0; i < 4; i++)
|
||||
for(j = nHashSize / 256; j < 4; j++)
|
||||
ctx->state[i][j] = _mm_set_epi32(0, 0, 0, 0);
|
||||
ctx->state[i][j] = v128_set32(0, 0, 0, 0);
|
||||
|
||||
return SUCCESS;
|
||||
}
|
||||
@@ -356,12 +352,12 @@ HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLengt
|
||||
|
||||
HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
|
||||
{
|
||||
__m128i remainingbits;
|
||||
v128_t remainingbits;
|
||||
|
||||
// Add remaining bytes in the buffer
|
||||
state->processed_bits += state->uBufferBytes * 8;
|
||||
|
||||
remainingbits = _mm_set_epi32(0, 0, 0, state->uBufferBytes * 8);
|
||||
remainingbits = v128_set32(0, 0, 0, state->uBufferBytes * 8);
|
||||
|
||||
// Pad with 0x80
|
||||
state->buffer[state->uBufferBytes++] = 0x80;
|
||||
@@ -382,13 +378,13 @@ HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
|
||||
// Last block contains message bits?
|
||||
if(state->uBufferBytes == 1)
|
||||
{
|
||||
state->k = _mm_xor_si128(state->k, state->k);
|
||||
state->k = _mm_sub_epi64(state->k, state->const1536);
|
||||
state->k = v128_xor(state->k, state->k);
|
||||
state->k = v128_sub64(state->k, state->const1536);
|
||||
}
|
||||
else
|
||||
{
|
||||
state->k = _mm_add_epi64(state->k, remainingbits);
|
||||
state->k = _mm_sub_epi64(state->k, state->const1536);
|
||||
state->k = v128_add64(state->k, remainingbits);
|
||||
state->k = v128_sub64(state->k, state->const1536);
|
||||
}
|
||||
|
||||
// Compress
|
||||
@@ -398,8 +394,8 @@ HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
|
||||
{
|
||||
// Fill with zero and compress
|
||||
memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - state->uBufferBytes);
|
||||
state->k = _mm_add_epi64(state->k, remainingbits);
|
||||
state->k = _mm_sub_epi64(state->k, state->const1536);
|
||||
state->k = v128_add64(state->k, remainingbits);
|
||||
state->k = v128_sub64(state->k, state->const1536);
|
||||
Compress(state, state->buffer, 1);
|
||||
|
||||
// Last block
|
||||
@@ -413,19 +409,19 @@ HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
|
||||
*((DataLength*)(state->buffer + state->uBlockLength - 8)) = 0;
|
||||
|
||||
// Compress the last block
|
||||
state->k = _mm_xor_si128(state->k, state->k);
|
||||
state->k = _mm_sub_epi64(state->k, state->const1536);
|
||||
state->k = v128_xor(state->k, state->k);
|
||||
state->k = v128_sub64(state->k, state->const1536);
|
||||
Compress(state, state->buffer, 1);
|
||||
}
|
||||
|
||||
// Store the hash value
|
||||
_mm_store_si128((__m128i*)hashval + 0, state->state[0][0]);
|
||||
_mm_store_si128((__m128i*)hashval + 1, state->state[1][0]);
|
||||
v128_store((v128_t*)hashval + 0, state->state[0][0]);
|
||||
v128_store((v128_t*)hashval + 1, state->state[1][0]);
|
||||
|
||||
if(state->uHashSize == 512)
|
||||
{
|
||||
_mm_store_si128((__m128i*)hashval + 2, state->state[2][0]);
|
||||
_mm_store_si128((__m128i*)hashval + 3, state->state[3][0]);
|
||||
v128_store((v128_t*)hashval + 2, state->state[2][0]);
|
||||
v128_store((v128_t*)hashval + 3, state->state[3][0]);
|
||||
}
|
||||
|
||||
return SUCCESS;
|
||||
@@ -477,12 +473,12 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
|
||||
state->uBufferBytes += uByteLength;
|
||||
}
|
||||
|
||||
__m128i remainingbits;
|
||||
v128_t remainingbits;
|
||||
|
||||
// Add remaining bytes in the buffer
|
||||
state->processed_bits += state->uBufferBytes * 8;
|
||||
|
||||
remainingbits = _mm_set_epi32( 0, 0, 0, state->uBufferBytes * 8 );
|
||||
remainingbits = v128_set32( 0, 0, 0, state->uBufferBytes * 8 );
|
||||
|
||||
// Pad with 0x80
|
||||
state->buffer[state->uBufferBytes++] = 0x80;
|
||||
@@ -503,13 +499,13 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
|
||||
// Last block contains message bits?
|
||||
if( state->uBufferBytes == 1 )
|
||||
{
|
||||
state->k = _mm_xor_si128( state->k, state->k );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
state->k = v128_xor( state->k, state->k );
|
||||
state->k = v128_sub64( state->k, state->const1536 );
|
||||
}
|
||||
else
|
||||
{
|
||||
state->k = _mm_add_epi64( state->k, remainingbits );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
state->k = v128_add64( state->k, remainingbits );
|
||||
state->k = v128_sub64( state->k, state->const1536 );
|
||||
}
|
||||
|
||||
// Compress
|
||||
@@ -520,8 +516,8 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
|
||||
// Fill with zero and compress
|
||||
memset( state->buffer + state->uBufferBytes, 0,
|
||||
state->uBlockLength - state->uBufferBytes );
|
||||
state->k = _mm_add_epi64( state->k, remainingbits );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
state->k = v128_add64( state->k, remainingbits );
|
||||
state->k = v128_sub64( state->k, state->const1536 );
|
||||
Compress( state, state->buffer, 1 );
|
||||
|
||||
// Last block
|
||||
@@ -536,19 +532,19 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
|
||||
state->processed_bits;
|
||||
*( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
|
||||
// Compress the last block
|
||||
state->k = _mm_xor_si128( state->k, state->k );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
state->k = v128_xor( state->k, state->k );
|
||||
state->k = v128_sub64( state->k, state->const1536 );
|
||||
Compress( state, state->buffer, 1) ;
|
||||
}
|
||||
|
||||
// Store the hash value
|
||||
_mm_store_si128( (__m128i*)hashval + 0, state->state[0][0] );
|
||||
_mm_store_si128( (__m128i*)hashval + 1, state->state[1][0] );
|
||||
v128_store( (v128_t*)hashval + 0, state->state[0][0] );
|
||||
v128_store( (v128_t*)hashval + 1, state->state[1][0] );
|
||||
|
||||
if( state->uHashSize == 512 )
|
||||
{
|
||||
_mm_store_si128( (__m128i*)hashval + 2, state->state[2][0] );
|
||||
_mm_store_si128( (__m128i*)hashval + 3, state->state[3][0] );
|
||||
v128_store( (v128_t*)hashval + 2, state->state[2][0] );
|
||||
v128_store( (v128_t*)hashval + 3, state->state[3][0] );
|
||||
|
||||
}
|
||||
return SUCCESS;
|
||||
@@ -559,7 +555,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
|
||||
{
|
||||
int i, j;
|
||||
|
||||
state->k = m128_zero;
|
||||
state->k = v128_zero;
|
||||
state->processed_bits = 0;
|
||||
state->uBufferBytes = 0;
|
||||
|
||||
@@ -569,16 +565,16 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
|
||||
state->uHashSize = 256;
|
||||
state->uBlockLength = 192;
|
||||
state->uRounds = 8;
|
||||
state->hashsize = _mm_set_epi64x( 0, 0x100 );
|
||||
state->const1536 = _mm_set_epi64x( 0, 0x600 );
|
||||
state->hashsize = v128_set64( 0, 0x100 );
|
||||
state->const1536 = v128_set64( 0, 0x600 );
|
||||
break;
|
||||
|
||||
case 512:
|
||||
state->uHashSize = 512;
|
||||
state->uBlockLength = 128;
|
||||
state->uRounds = 10;
|
||||
state->hashsize = _mm_set_epi64x( 0, 0x200 );
|
||||
state->const1536 = _mm_set_epi64x( 0, 0x400 );
|
||||
state->hashsize = v128_set64( 0, 0x200 );
|
||||
state->const1536 = v128_set64( 0, 0x400 );
|
||||
break;
|
||||
|
||||
default:
|
||||
@@ -591,7 +587,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
|
||||
|
||||
for(i = 0; i < 4; i++)
|
||||
for(j = nHashSize / 256; j < 4; j++)
|
||||
state->state[i][j] = m128_zero;
|
||||
state->state[i][j] = v128_zero;
|
||||
|
||||
|
||||
unsigned int uBlockCount, uRemainingBytes;
|
||||
@@ -635,12 +631,12 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
|
||||
state->uBufferBytes += datalen;
|
||||
}
|
||||
|
||||
__m128i remainingbits;
|
||||
v128_t remainingbits;
|
||||
|
||||
// Add remaining bytes in the buffer
|
||||
state->processed_bits += state->uBufferBytes * 8;
|
||||
|
||||
remainingbits = _mm_set_epi32( 0, 0, 0, state->uBufferBytes * 8 );
|
||||
remainingbits = v128_set32( 0, 0, 0, state->uBufferBytes * 8 );
|
||||
|
||||
// Pad with 0x80
|
||||
state->buffer[state->uBufferBytes++] = 0x80;
|
||||
@@ -661,13 +657,13 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
|
||||
// Last block contains message bits?
|
||||
if( state->uBufferBytes == 1 )
|
||||
{
|
||||
state->k = _mm_xor_si128( state->k, state->k );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
state->k = v128_xor( state->k, state->k );
|
||||
state->k = v128_sub64( state->k, state->const1536 );
|
||||
}
|
||||
else
|
||||
{
|
||||
state->k = _mm_add_epi64( state->k, remainingbits );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
state->k = v128_add64( state->k, remainingbits );
|
||||
state->k = v128_sub64( state->k, state->const1536 );
|
||||
}
|
||||
|
||||
// Compress
|
||||
@@ -678,8 +674,8 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
|
||||
// Fill with zero and compress
|
||||
memset( state->buffer + state->uBufferBytes, 0,
|
||||
state->uBlockLength - state->uBufferBytes );
|
||||
state->k = _mm_add_epi64( state->k, remainingbits );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
state->k = v128_add64( state->k, remainingbits );
|
||||
state->k = v128_sub64( state->k, state->const1536 );
|
||||
Compress( state, state->buffer, 1 );
|
||||
|
||||
// Last block
|
||||
@@ -694,19 +690,19 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
|
||||
state->processed_bits;
|
||||
*( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
|
||||
// Compress the last block
|
||||
state->k = _mm_xor_si128( state->k, state->k );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
state->k = v128_xor( state->k, state->k );
|
||||
state->k = v128_sub64( state->k, state->const1536 );
|
||||
Compress( state, state->buffer, 1) ;
|
||||
}
|
||||
|
||||
// Store the hash value
|
||||
_mm_store_si128( (__m128i*)hashval + 0, state->state[0][0] );
|
||||
_mm_store_si128( (__m128i*)hashval + 1, state->state[1][0] );
|
||||
v128_store( (v128_t*)hashval + 0, state->state[0][0] );
|
||||
v128_store( (v128_t*)hashval + 1, state->state[1][0] );
|
||||
|
||||
if( state->uHashSize == 512 )
|
||||
{
|
||||
_mm_store_si128( (__m128i*)hashval + 2, state->state[2][0] );
|
||||
_mm_store_si128( (__m128i*)hashval + 3, state->state[3][0] );
|
||||
v128_store( (v128_t*)hashval + 2, state->state[2][0] );
|
||||
v128_store( (v128_t*)hashval + 3, state->state[3][0] );
|
||||
|
||||
}
|
||||
return SUCCESS;
|
||||
@@ -721,12 +717,12 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit
|
||||
|
||||
/////
|
||||
/*
|
||||
__m128i a, b, c, d, t[4], u[4], v[4];
|
||||
v128_t a, b, c, d, t[4], u[4], v[4];
|
||||
|
||||
a = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100);
|
||||
b = _mm_set_epi32(0x1f1e1d1c, 0x1b1a1918, 0x17161514, 0x13121110);
|
||||
c = _mm_set_epi32(0x2f2e2d2c, 0x2b2a2928, 0x27262524, 0x23222120);
|
||||
d = _mm_set_epi32(0x3f3e3d3c, 0x3b3a3938, 0x37363534, 0x33323130);
|
||||
a = v128_set32(0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100);
|
||||
b = v128_set32(0x1f1e1d1c, 0x1b1a1918, 0x17161514, 0x13121110);
|
||||
c = v128_set32(0x2f2e2d2c, 0x2b2a2928, 0x27262524, 0x23222120);
|
||||
d = v128_set32(0x3f3e3d3c, 0x3b3a3938, 0x37363534, 0x33323130);
|
||||
|
||||
t[0] = _mm_unpacklo_epi8(a, b);
|
||||
t[1] = _mm_unpackhi_epi8(a, b);
|
||||
|
||||
@@ -10,11 +10,9 @@
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
#include <smmintrin.h>
|
||||
#include <wmmintrin.h>
|
||||
#include "hash-groestl.h"
|
||||
|
||||
static const __m128i round_const_p[] __attribute__ ((aligned (64))) =
|
||||
static const v128u64_t round_const_p[] __attribute__ ((aligned (64))) =
|
||||
{
|
||||
{ 0x7060504030201000, 0xf0e0d0c0b0a09080 },
|
||||
{ 0x7161514131211101, 0xf1e1d1c1b1a19181 },
|
||||
@@ -32,7 +30,7 @@ static const __m128i round_const_p[] __attribute__ ((aligned (64))) =
|
||||
{ 0x7d6d5d4d3d2d1d0d, 0xfdedddcdbdad9d8d }
|
||||
};
|
||||
|
||||
static const __m128i round_const_q[] __attribute__ ((aligned (64))) =
|
||||
static const v128u64_t round_const_q[] __attribute__ ((aligned (64))) =
|
||||
{
|
||||
{ 0x8f9fafbfcfdfefff, 0x0f1f2f3f4f5f6f7f },
|
||||
{ 0x8e9eaebecedeeefe, 0x0e1e2e3e4e5e6e7e },
|
||||
@@ -50,15 +48,29 @@ static const __m128i round_const_q[] __attribute__ ((aligned (64))) =
|
||||
{ 0x8292a2b2c2d2e2f2, 0x0212223242526272 }
|
||||
};
|
||||
|
||||
static const __m128i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02 };
|
||||
static const __m128i SUBSH_MASK0 = { 0x0b0e0104070a0d00, 0x0306090c0f020508 };
|
||||
static const __m128i SUBSH_MASK1 = { 0x0c0f0205080b0e01, 0x04070a0d00030609 };
|
||||
static const __m128i SUBSH_MASK2 = { 0x0d000306090c0f02, 0x05080b0e0104070a };
|
||||
static const __m128i SUBSH_MASK3 = { 0x0e0104070a0d0003, 0x06090c0f0205080b };
|
||||
static const __m128i SUBSH_MASK4 = { 0x0f0205080b0e0104, 0x070a0d000306090c };
|
||||
static const __m128i SUBSH_MASK5 = { 0x000306090c0f0205, 0x080b0e0104070a0d };
|
||||
static const __m128i SUBSH_MASK6 = { 0x0104070a0d000306, 0x090c0f0205080b0e };
|
||||
static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
static const v128u64_t TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02 };
|
||||
static const v128u64_t SUBSH_MASK0 = { 0x0b0e0104070a0d00, 0x0306090c0f020508 };
|
||||
static const v128u64_t SUBSH_MASK1 = { 0x0c0f0205080b0e01, 0x04070a0d00030609 };
|
||||
static const v128u64_t SUBSH_MASK2 = { 0x0d000306090c0f02, 0x05080b0e0104070a };
|
||||
static const v128u64_t SUBSH_MASK3 = { 0x0e0104070a0d0003, 0x06090c0f0205080b };
|
||||
static const v128u64_t SUBSH_MASK4 = { 0x0f0205080b0e0104, 0x070a0d000306090c };
|
||||
static const v128u64_t SUBSH_MASK5 = { 0x000306090c0f0205, 0x080b0e0104070a0d };
|
||||
static const v128u64_t SUBSH_MASK6 = { 0x0104070a0d000306, 0x090c0f0205080b0e };
|
||||
static const v128u64_t SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
|
||||
// No fast shuffle on NEON
|
||||
static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
|
||||
|
||||
#define gr_shuffle32( v ) v128_shufflev32( v, vmask_d8 )
|
||||
|
||||
#else
|
||||
|
||||
#define gr_shuffle32( v ) _mm_shuffle_epi32( v, 0xd8 )
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
@@ -67,9 +79,9 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b */
|
||||
#define MUL2(i, j, k){\
|
||||
j = _mm_cmpgt_epi8( m128_zero, i);\
|
||||
i = _mm_add_epi8(i, i);\
|
||||
i = mm128_xorand(i, j, k );\
|
||||
j = v128_cmpgt8( v128_zero, i);\
|
||||
i = v128_add8(i, i);\
|
||||
i = v128_xorand(i, j, k );\
|
||||
}
|
||||
|
||||
/**/
|
||||
@@ -98,85 +110,85 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
b7 = a1;\
|
||||
a0 = _mm_xor_si128(a0, a1);\
|
||||
a0 = v128_xor(a0, a1);\
|
||||
b0 = a2;\
|
||||
a1 = _mm_xor_si128(a1, a2);\
|
||||
a1 = v128_xor(a1, a2);\
|
||||
b1 = a3;\
|
||||
TEMP2 = _mm_xor_si128(a2, a3);\
|
||||
TEMP2 = v128_xor(a2, a3);\
|
||||
b2 = a4;\
|
||||
a3 = _mm_xor_si128(a3, a4);\
|
||||
a3 = v128_xor(a3, a4);\
|
||||
b3 = a5;\
|
||||
a4 = _mm_xor_si128(a4, a5);\
|
||||
a4 = v128_xor(a4, a5);\
|
||||
b4 = a6;\
|
||||
a5 = _mm_xor_si128(a5, a6);\
|
||||
a5 = v128_xor(a5, a6);\
|
||||
b5 = a7;\
|
||||
a6 = _mm_xor_si128(a6, a7);\
|
||||
a7 = _mm_xor_si128(a7, b6);\
|
||||
a6 = v128_xor(a6, a7);\
|
||||
a7 = v128_xor(a7, b6);\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
TEMP0 = mm128_xor3( b0, a4, a6 ); \
|
||||
TEMP0 = v128_xor3( b0, a4, a6 ); \
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP1 = mm128_xor3( b1, a5, a7 );\
|
||||
b2 = mm128_xor3( b2, a6, a0 ); \
|
||||
TEMP1 = v128_xor3( b1, a5, a7 );\
|
||||
b2 = v128_xor3( b2, a6, a0 ); \
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0;\
|
||||
b3 = mm128_xor3( b3, a7, a1 ); \
|
||||
b3 = v128_xor3( b3, a7, a1 ); \
|
||||
b1 = a1;\
|
||||
b6 = mm128_xor3( b6, a4, TEMP2 ); \
|
||||
b4 = mm128_xor3( b4, a0, TEMP2 ); \
|
||||
b7 = mm128_xor3( b7, a5, a3 ); \
|
||||
b5 = mm128_xor3( b5, a1, a3 ); \
|
||||
b6 = v128_xor3( b6, a4, TEMP2 ); \
|
||||
b4 = v128_xor3( b4, a0, TEMP2 ); \
|
||||
b7 = v128_xor3( b7, a5, a3 ); \
|
||||
b5 = v128_xor3( b5, a1, a3 ); \
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm_xor_si128(a0, a3);\
|
||||
a1 = _mm_xor_si128(a1, a4);\
|
||||
a2 = _mm_xor_si128(TEMP2, a5);\
|
||||
a3 = _mm_xor_si128(a3, a6);\
|
||||
a4 = _mm_xor_si128(a4, a7);\
|
||||
a5 = _mm_xor_si128(a5, b0);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, TEMP2);\
|
||||
a0 = v128_xor(a0, a3);\
|
||||
a1 = v128_xor(a1, a4);\
|
||||
a2 = v128_xor(TEMP2, a5);\
|
||||
a3 = v128_xor(a3, a6);\
|
||||
a4 = v128_xor(a4, a7);\
|
||||
a5 = v128_xor(a5, b0);\
|
||||
a6 = v128_xor(a6, b1);\
|
||||
a7 = v128_xor(a7, TEMP2);\
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = _mm_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
|
||||
b1 = v128_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
a0 = v128_xor(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
a1 = _mm_xor_si128(a1, TEMP1);\
|
||||
a1 = v128_xor(a1, TEMP1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
a2 = _mm_xor_si128(a2, b2);\
|
||||
a2 = v128_xor(a2, b2);\
|
||||
MUL2(a3, b0, b1);\
|
||||
a3 = _mm_xor_si128(a3, b3);\
|
||||
a3 = v128_xor(a3, b3);\
|
||||
MUL2(a4, b0, b1);\
|
||||
a4 = _mm_xor_si128(a4, b4);\
|
||||
a4 = v128_xor(a4, b4);\
|
||||
MUL2(a5, b0, b1);\
|
||||
a5 = _mm_xor_si128(a5, b5);\
|
||||
a5 = v128_xor(a5, b5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
a6 = _mm_xor_si128(a6, b6);\
|
||||
a6 = v128_xor(a6, b6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
a7 = _mm_xor_si128(a7, b7);\
|
||||
a7 = v128_xor(a7, b7);\
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2(a0, b0, b1);\
|
||||
b5 = _mm_xor_si128(b5, a0);\
|
||||
b5 = v128_xor(b5, a0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
b6 = _mm_xor_si128(b6, a1);\
|
||||
b6 = v128_xor(b6, a1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
b7 = _mm_xor_si128(b7, a2);\
|
||||
b7 = v128_xor(b7, a2);\
|
||||
MUL2(a5, b0, b1);\
|
||||
b2 = _mm_xor_si128(b2, a5);\
|
||||
b2 = v128_xor(b2, a5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
b3 = _mm_xor_si128(b3, a6);\
|
||||
b3 = v128_xor(b3, a6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
b4 = _mm_xor_si128(b4, a7);\
|
||||
b4 = v128_xor(b4, a7);\
|
||||
MUL2(a3, b0, b1);\
|
||||
MUL2(a4, b0, b1);\
|
||||
b0 = TEMP0;\
|
||||
b1 = TEMP1;\
|
||||
b0 = _mm_xor_si128(b0, a3);\
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
b0 = v128_xor(b0, a3);\
|
||||
b1 = v128_xor(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
#else
|
||||
@@ -185,96 +197,96 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
b7 = a1;\
|
||||
a0 = _mm_xor_si128(a0, a1);\
|
||||
a0 = v128_xor(a0, a1);\
|
||||
b0 = a2;\
|
||||
a1 = _mm_xor_si128(a1, a2);\
|
||||
a1 = v128_xor(a1, a2);\
|
||||
b1 = a3;\
|
||||
a2 = _mm_xor_si128(a2, a3);\
|
||||
a2 = v128_xor(a2, a3);\
|
||||
b2 = a4;\
|
||||
a3 = _mm_xor_si128(a3, a4);\
|
||||
a3 = v128_xor(a3, a4);\
|
||||
b3 = a5;\
|
||||
a4 = _mm_xor_si128(a4, a5);\
|
||||
a4 = v128_xor(a4, a5);\
|
||||
b4 = a6;\
|
||||
a5 = _mm_xor_si128(a5, a6);\
|
||||
a5 = v128_xor(a5, a6);\
|
||||
b5 = a7;\
|
||||
a6 = _mm_xor_si128(a6, a7);\
|
||||
a7 = _mm_xor_si128(a7, b6);\
|
||||
a6 = v128_xor(a6, a7);\
|
||||
a7 = v128_xor(a7, b6);\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
b0 = _mm_xor_si128(b0, a4);\
|
||||
b6 = _mm_xor_si128(b6, a4);\
|
||||
b1 = _mm_xor_si128(b1, a5);\
|
||||
b7 = _mm_xor_si128(b7, a5);\
|
||||
b2 = _mm_xor_si128(b2, a6);\
|
||||
b0 = _mm_xor_si128(b0, a6);\
|
||||
b0 = v128_xor(b0, a4);\
|
||||
b6 = v128_xor(b6, a4);\
|
||||
b1 = v128_xor(b1, a5);\
|
||||
b7 = v128_xor(b7, a5);\
|
||||
b2 = v128_xor(b2, a6);\
|
||||
b0 = v128_xor(b0, a6);\
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP0 = b0;\
|
||||
b3 = _mm_xor_si128(b3, a7);\
|
||||
b1 = _mm_xor_si128(b1, a7);\
|
||||
b3 = v128_xor(b3, a7);\
|
||||
b1 = v128_xor(b1, a7);\
|
||||
TEMP1 = b1;\
|
||||
b4 = _mm_xor_si128(b4, a0);\
|
||||
b2 = _mm_xor_si128(b2, a0);\
|
||||
b4 = v128_xor(b4, a0);\
|
||||
b2 = v128_xor(b2, a0);\
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0;\
|
||||
b5 = _mm_xor_si128(b5, a1);\
|
||||
b3 = _mm_xor_si128(b3, a1);\
|
||||
b5 = v128_xor(b5, a1);\
|
||||
b3 = v128_xor(b3, a1);\
|
||||
b1 = a1;\
|
||||
b6 = _mm_xor_si128(b6, a2);\
|
||||
b4 = _mm_xor_si128(b4, a2);\
|
||||
b6 = v128_xor(b6, a2);\
|
||||
b4 = v128_xor(b4, a2);\
|
||||
TEMP2 = a2;\
|
||||
b7 = _mm_xor_si128(b7, a3);\
|
||||
b5 = _mm_xor_si128(b5, a3);\
|
||||
b7 = v128_xor(b7, a3);\
|
||||
b5 = v128_xor(b5, a3);\
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm_xor_si128(a0, a3);\
|
||||
a1 = _mm_xor_si128(a1, a4);\
|
||||
a2 = _mm_xor_si128(a2, a5);\
|
||||
a3 = _mm_xor_si128(a3, a6);\
|
||||
a4 = _mm_xor_si128(a4, a7);\
|
||||
a5 = _mm_xor_si128(a5, b0);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, TEMP2);\
|
||||
a0 = v128_xor(a0, a3);\
|
||||
a1 = v128_xor(a1, a4);\
|
||||
a2 = v128_xor(a2, a5);\
|
||||
a3 = v128_xor(a3, a6);\
|
||||
a4 = v128_xor(a4, a7);\
|
||||
a5 = v128_xor(a5, b0);\
|
||||
a6 = v128_xor(a6, b1);\
|
||||
a7 = v128_xor(a7, TEMP2);\
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = _mm_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
|
||||
b1 = v128_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
a0 = v128_xor(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
a1 = _mm_xor_si128(a1, TEMP1);\
|
||||
a1 = v128_xor(a1, TEMP1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
a2 = _mm_xor_si128(a2, b2);\
|
||||
a2 = v128_xor(a2, b2);\
|
||||
MUL2(a3, b0, b1);\
|
||||
a3 = _mm_xor_si128(a3, b3);\
|
||||
a3 = v128_xor(a3, b3);\
|
||||
MUL2(a4, b0, b1);\
|
||||
a4 = _mm_xor_si128(a4, b4);\
|
||||
a4 = v128_xor(a4, b4);\
|
||||
MUL2(a5, b0, b1);\
|
||||
a5 = _mm_xor_si128(a5, b5);\
|
||||
a5 = v128_xor(a5, b5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
a6 = _mm_xor_si128(a6, b6);\
|
||||
a6 = v128_xor(a6, b6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
a7 = _mm_xor_si128(a7, b7);\
|
||||
a7 = v128_xor(a7, b7);\
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2(a0, b0, b1);\
|
||||
b5 = _mm_xor_si128(b5, a0);\
|
||||
b5 = v128_xor(b5, a0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
b6 = _mm_xor_si128(b6, a1);\
|
||||
b6 = v128_xor(b6, a1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
b7 = _mm_xor_si128(b7, a2);\
|
||||
b7 = v128_xor(b7, a2);\
|
||||
MUL2(a5, b0, b1);\
|
||||
b2 = _mm_xor_si128(b2, a5);\
|
||||
b2 = v128_xor(b2, a5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
b3 = _mm_xor_si128(b3, a6);\
|
||||
b3 = v128_xor(b3, a6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
b4 = _mm_xor_si128(b4, a7);\
|
||||
b4 = v128_xor(b4, a7);\
|
||||
MUL2(a3, b0, b1);\
|
||||
MUL2(a4, b0, b1);\
|
||||
b0 = TEMP0;\
|
||||
b1 = TEMP1;\
|
||||
b0 = _mm_xor_si128(b0, a3);\
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
b0 = v128_xor(b0, a3);\
|
||||
b1 = v128_xor(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
#endif
|
||||
@@ -286,15 +298,15 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
*/
|
||||
#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* SubBytes */\
|
||||
b0 = _mm_xor_si128(b0, b0);\
|
||||
a0 = _mm_aesenclast_si128(a0, b0);\
|
||||
a1 = _mm_aesenclast_si128(a1, b0);\
|
||||
a2 = _mm_aesenclast_si128(a2, b0);\
|
||||
a3 = _mm_aesenclast_si128(a3, b0);\
|
||||
a4 = _mm_aesenclast_si128(a4, b0);\
|
||||
a5 = _mm_aesenclast_si128(a5, b0);\
|
||||
a6 = _mm_aesenclast_si128(a6, b0);\
|
||||
a7 = _mm_aesenclast_si128(a7, b0);\
|
||||
b0 = v128_xor(b0, b0);\
|
||||
a0 = v128_aesenclast(a0, b0);\
|
||||
a1 = v128_aesenclast(a1, b0);\
|
||||
a2 = v128_aesenclast(a2, b0);\
|
||||
a3 = v128_aesenclast(a3, b0);\
|
||||
a4 = v128_aesenclast(a4, b0);\
|
||||
a5 = v128_aesenclast(a5, b0);\
|
||||
a6 = v128_aesenclast(a6, b0);\
|
||||
a7 = v128_aesenclast(a7, b0);\
|
||||
/* MixBytes */\
|
||||
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
}
|
||||
@@ -303,32 +315,32 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
u8 round_counter = 0;\
|
||||
for(round_counter = 0; round_counter < 14; round_counter+=2) {\
|
||||
/* AddRoundConstant P1024 */\
|
||||
xmm8 = _mm_xor_si128( xmm8, \
|
||||
casti_m128i( round_const_p, round_counter ) ); \
|
||||
xmm8 = v128_xor( xmm8, \
|
||||
casti_v128( round_const_p, round_counter ) ); \
|
||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||
xmm8 = _mm_shuffle_epi8( xmm8, SUBSH_MASK0 ); \
|
||||
xmm9 = _mm_shuffle_epi8( xmm9, SUBSH_MASK1 ); \
|
||||
xmm10 = _mm_shuffle_epi8( xmm10, SUBSH_MASK2 ); \
|
||||
xmm11 = _mm_shuffle_epi8( xmm11, SUBSH_MASK3 ); \
|
||||
xmm12 = _mm_shuffle_epi8( xmm12, SUBSH_MASK4 ); \
|
||||
xmm13 = _mm_shuffle_epi8( xmm13, SUBSH_MASK5 ); \
|
||||
xmm14 = _mm_shuffle_epi8( xmm14, SUBSH_MASK6 ); \
|
||||
xmm15 = _mm_shuffle_epi8( xmm15, SUBSH_MASK7 ); \
|
||||
xmm8 = v128_shuffle8( xmm8, SUBSH_MASK0 ); \
|
||||
xmm9 = v128_shuffle8( xmm9, SUBSH_MASK1 ); \
|
||||
xmm10 = v128_shuffle8( xmm10, SUBSH_MASK2 ); \
|
||||
xmm11 = v128_shuffle8( xmm11, SUBSH_MASK3 ); \
|
||||
xmm12 = v128_shuffle8( xmm12, SUBSH_MASK4 ); \
|
||||
xmm13 = v128_shuffle8( xmm13, SUBSH_MASK5 ); \
|
||||
xmm14 = v128_shuffle8( xmm14, SUBSH_MASK6 ); \
|
||||
xmm15 = v128_shuffle8( xmm15, SUBSH_MASK7 ); \
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
|
||||
xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 ); \
|
||||
\
|
||||
/* AddRoundConstant P1024 */\
|
||||
xmm0 = _mm_xor_si128( xmm0, \
|
||||
casti_m128i( round_const_p, round_counter+1 ) ); \
|
||||
xmm0 = _mm_shuffle_epi8( xmm0, SUBSH_MASK0 ); \
|
||||
xmm1 = _mm_shuffle_epi8( xmm1, SUBSH_MASK1 ); \
|
||||
xmm2 = _mm_shuffle_epi8( xmm2, SUBSH_MASK2 ); \
|
||||
xmm3 = _mm_shuffle_epi8( xmm3, SUBSH_MASK3 ); \
|
||||
xmm4 = _mm_shuffle_epi8( xmm4, SUBSH_MASK4 ); \
|
||||
xmm5 = _mm_shuffle_epi8( xmm5, SUBSH_MASK5 ); \
|
||||
xmm6 = _mm_shuffle_epi8( xmm6, SUBSH_MASK6 ); \
|
||||
xmm7 = _mm_shuffle_epi8( xmm7, SUBSH_MASK7 ); \
|
||||
xmm0 = v128_xor( xmm0, \
|
||||
casti_v128( round_const_p, round_counter+1 ) ); \
|
||||
xmm0 = v128_shuffle8( xmm0, SUBSH_MASK0 ); \
|
||||
xmm1 = v128_shuffle8( xmm1, SUBSH_MASK1 ); \
|
||||
xmm2 = v128_shuffle8( xmm2, SUBSH_MASK2 ); \
|
||||
xmm3 = v128_shuffle8( xmm3, SUBSH_MASK3 ); \
|
||||
xmm4 = v128_shuffle8( xmm4, SUBSH_MASK4 ); \
|
||||
xmm5 = v128_shuffle8( xmm5, SUBSH_MASK5 ); \
|
||||
xmm6 = v128_shuffle8( xmm6, SUBSH_MASK6 ); \
|
||||
xmm7 = v128_shuffle8( xmm7, SUBSH_MASK7 ); \
|
||||
SUBMIX( xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, \
|
||||
xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 ); \
|
||||
}\
|
||||
@@ -338,49 +350,49 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
u8 round_counter = 0;\
|
||||
for(round_counter = 0; round_counter < 14; round_counter+=2) {\
|
||||
/* AddRoundConstant Q1024 */\
|
||||
xmm1 = m128_neg1;\
|
||||
xmm8 = _mm_xor_si128( xmm8, xmm1 ); \
|
||||
xmm9 = _mm_xor_si128( xmm9, xmm1 ); \
|
||||
xmm10 = _mm_xor_si128( xmm10, xmm1 ); \
|
||||
xmm11 = _mm_xor_si128( xmm11, xmm1 ); \
|
||||
xmm12 = _mm_xor_si128( xmm12, xmm1 ); \
|
||||
xmm13 = _mm_xor_si128( xmm13, xmm1 ); \
|
||||
xmm14 = _mm_xor_si128( xmm14, xmm1 ); \
|
||||
xmm15 = _mm_xor_si128( xmm15, \
|
||||
casti_m128i( round_const_q, round_counter ) ); \
|
||||
xmm1 = v128_neg1;\
|
||||
xmm8 = v128_xor( xmm8, xmm1 ); \
|
||||
xmm9 = v128_xor( xmm9, xmm1 ); \
|
||||
xmm10 = v128_xor( xmm10, xmm1 ); \
|
||||
xmm11 = v128_xor( xmm11, xmm1 ); \
|
||||
xmm12 = v128_xor( xmm12, xmm1 ); \
|
||||
xmm13 = v128_xor( xmm13, xmm1 ); \
|
||||
xmm14 = v128_xor( xmm14, xmm1 ); \
|
||||
xmm15 = v128_xor( xmm15, \
|
||||
casti_v128( round_const_q, round_counter ) ); \
|
||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||
xmm8 = _mm_shuffle_epi8( xmm8, SUBSH_MASK1 ); \
|
||||
xmm9 = _mm_shuffle_epi8( xmm9, SUBSH_MASK3 ); \
|
||||
xmm10 = _mm_shuffle_epi8( xmm10, SUBSH_MASK5 ); \
|
||||
xmm11 = _mm_shuffle_epi8( xmm11, SUBSH_MASK7 ); \
|
||||
xmm12 = _mm_shuffle_epi8( xmm12, SUBSH_MASK0 ); \
|
||||
xmm13 = _mm_shuffle_epi8( xmm13, SUBSH_MASK2 ); \
|
||||
xmm14 = _mm_shuffle_epi8( xmm14, SUBSH_MASK4 ); \
|
||||
xmm15 = _mm_shuffle_epi8( xmm15, SUBSH_MASK6 ); \
|
||||
xmm8 = v128_shuffle8( xmm8, SUBSH_MASK1 ); \
|
||||
xmm9 = v128_shuffle8( xmm9, SUBSH_MASK3 ); \
|
||||
xmm10 = v128_shuffle8( xmm10, SUBSH_MASK5 ); \
|
||||
xmm11 = v128_shuffle8( xmm11, SUBSH_MASK7 ); \
|
||||
xmm12 = v128_shuffle8( xmm12, SUBSH_MASK0 ); \
|
||||
xmm13 = v128_shuffle8( xmm13, SUBSH_MASK2 ); \
|
||||
xmm14 = v128_shuffle8( xmm14, SUBSH_MASK4 ); \
|
||||
xmm15 = v128_shuffle8( xmm15, SUBSH_MASK6 ); \
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
|
||||
xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 , xmm7 ); \
|
||||
\
|
||||
/* AddRoundConstant Q1024 */\
|
||||
xmm9 = m128_neg1;\
|
||||
xmm0 = _mm_xor_si128( xmm0, xmm9 ); \
|
||||
xmm1 = _mm_xor_si128( xmm1, xmm9 ); \
|
||||
xmm2 = _mm_xor_si128( xmm2, xmm9 ); \
|
||||
xmm3 = _mm_xor_si128( xmm3, xmm9 ); \
|
||||
xmm4 = _mm_xor_si128( xmm4, xmm9 ); \
|
||||
xmm5 = _mm_xor_si128( xmm5, xmm9 ); \
|
||||
xmm6 = _mm_xor_si128( xmm6, xmm9 ); \
|
||||
xmm7 = _mm_xor_si128( xmm7, \
|
||||
casti_m128i( round_const_q, round_counter+1 ) ); \
|
||||
xmm9 = v128_neg1;\
|
||||
xmm0 = v128_xor( xmm0, xmm9 ); \
|
||||
xmm1 = v128_xor( xmm1, xmm9 ); \
|
||||
xmm2 = v128_xor( xmm2, xmm9 ); \
|
||||
xmm3 = v128_xor( xmm3, xmm9 ); \
|
||||
xmm4 = v128_xor( xmm4, xmm9 ); \
|
||||
xmm5 = v128_xor( xmm5, xmm9 ); \
|
||||
xmm6 = v128_xor( xmm6, xmm9 ); \
|
||||
xmm7 = v128_xor( xmm7, \
|
||||
casti_v128( round_const_q, round_counter+1 ) ); \
|
||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||
xmm0 = _mm_shuffle_epi8( xmm0, SUBSH_MASK1 ); \
|
||||
xmm1 = _mm_shuffle_epi8( xmm1, SUBSH_MASK3 ); \
|
||||
xmm2 = _mm_shuffle_epi8( xmm2, SUBSH_MASK5 ); \
|
||||
xmm3 = _mm_shuffle_epi8( xmm3, SUBSH_MASK7 ); \
|
||||
xmm4 = _mm_shuffle_epi8( xmm4, SUBSH_MASK0 ); \
|
||||
xmm5 = _mm_shuffle_epi8( xmm5, SUBSH_MASK2 ); \
|
||||
xmm6 = _mm_shuffle_epi8( xmm6, SUBSH_MASK4 ); \
|
||||
xmm7 = _mm_shuffle_epi8( xmm7, SUBSH_MASK6 ); \
|
||||
xmm0 = v128_shuffle8( xmm0, SUBSH_MASK1 ); \
|
||||
xmm1 = v128_shuffle8( xmm1, SUBSH_MASK3 ); \
|
||||
xmm2 = v128_shuffle8( xmm2, SUBSH_MASK5 ); \
|
||||
xmm3 = v128_shuffle8( xmm3, SUBSH_MASK7 ); \
|
||||
xmm4 = v128_shuffle8( xmm4, SUBSH_MASK0 ); \
|
||||
xmm5 = v128_shuffle8( xmm5, SUBSH_MASK2 ); \
|
||||
xmm6 = v128_shuffle8( xmm6, SUBSH_MASK4 ); \
|
||||
xmm7 = v128_shuffle8( xmm7, SUBSH_MASK6 ); \
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX( xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, \
|
||||
xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 ); \
|
||||
@@ -397,70 +409,70 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\
|
||||
t0 = TRANSP_MASK; \
|
||||
\
|
||||
i6 = _mm_shuffle_epi8(i6, t0);\
|
||||
i0 = _mm_shuffle_epi8(i0, t0);\
|
||||
i1 = _mm_shuffle_epi8(i1, t0);\
|
||||
i2 = _mm_shuffle_epi8(i2, t0);\
|
||||
i3 = _mm_shuffle_epi8(i3, t0);\
|
||||
i6 = v128_shuffle8(i6, t0);\
|
||||
i0 = v128_shuffle8(i0, t0);\
|
||||
i1 = v128_shuffle8(i1, t0);\
|
||||
i2 = v128_shuffle8(i2, t0);\
|
||||
i3 = v128_shuffle8(i3, t0);\
|
||||
t1 = i2;\
|
||||
i4 = _mm_shuffle_epi8(i4, t0);\
|
||||
i5 = _mm_shuffle_epi8(i5, t0);\
|
||||
i4 = v128_shuffle8(i4, t0);\
|
||||
i5 = v128_shuffle8(i5, t0);\
|
||||
t2 = i4;\
|
||||
t3 = i6;\
|
||||
i7 = _mm_shuffle_epi8(i7, t0);\
|
||||
i7 = v128_shuffle8(i7, t0);\
|
||||
\
|
||||
/* continue with unpack using 4 temp registers */\
|
||||
t0 = i0;\
|
||||
t2 = _mm_unpackhi_epi16(t2, i5);\
|
||||
i4 = _mm_unpacklo_epi16(i4, i5);\
|
||||
t3 = _mm_unpackhi_epi16(t3, i7);\
|
||||
i6 = _mm_unpacklo_epi16(i6, i7);\
|
||||
t0 = _mm_unpackhi_epi16(t0, i1);\
|
||||
t1 = _mm_unpackhi_epi16(t1, i3);\
|
||||
i2 = _mm_unpacklo_epi16(i2, i3);\
|
||||
i0 = _mm_unpacklo_epi16(i0, i1);\
|
||||
t2 = v128_unpackhi16(t2, i5);\
|
||||
i4 = v128_unpacklo16(i4, i5);\
|
||||
t3 = v128_unpackhi16(t3, i7);\
|
||||
i6 = v128_unpacklo16(i6, i7);\
|
||||
t0 = v128_unpackhi16(t0, i1);\
|
||||
t1 = v128_unpackhi16(t1, i3);\
|
||||
i2 = v128_unpacklo16(i2, i3);\
|
||||
i0 = v128_unpacklo16(i0, i1);\
|
||||
\
|
||||
/* shuffle with immediate */\
|
||||
t0 = _mm_shuffle_epi32(t0, 216);\
|
||||
t1 = _mm_shuffle_epi32(t1, 216);\
|
||||
t2 = _mm_shuffle_epi32(t2, 216);\
|
||||
t3 = _mm_shuffle_epi32(t3, 216);\
|
||||
i0 = _mm_shuffle_epi32(i0, 216);\
|
||||
i2 = _mm_shuffle_epi32(i2, 216);\
|
||||
i4 = _mm_shuffle_epi32(i4, 216);\
|
||||
i6 = _mm_shuffle_epi32(i6, 216);\
|
||||
t0 = gr_shuffle32( t0 ); \
|
||||
t1 = gr_shuffle32( t1 ); \
|
||||
t2 = gr_shuffle32( t2 ); \
|
||||
t3 = gr_shuffle32( t3 ); \
|
||||
i0 = gr_shuffle32( i0 ); \
|
||||
i2 = gr_shuffle32( i2 ); \
|
||||
i4 = gr_shuffle32( i4 ); \
|
||||
i6 = gr_shuffle32( i6 ); \
|
||||
\
|
||||
/* continue with unpack */\
|
||||
t4 = i0;\
|
||||
i0 = _mm_unpacklo_epi32(i0, i2);\
|
||||
t4 = _mm_unpackhi_epi32(t4, i2);\
|
||||
i0 = v128_unpacklo32(i0, i2);\
|
||||
t4 = v128_unpackhi32(t4, i2);\
|
||||
t5 = t0;\
|
||||
t0 = _mm_unpacklo_epi32(t0, t1);\
|
||||
t5 = _mm_unpackhi_epi32(t5, t1);\
|
||||
t0 = v128_unpacklo32(t0, t1);\
|
||||
t5 = v128_unpackhi32(t5, t1);\
|
||||
t6 = i4;\
|
||||
i4 = _mm_unpacklo_epi32(i4, i6);\
|
||||
i4 = v128_unpacklo32(i4, i6);\
|
||||
t7 = t2;\
|
||||
t6 = _mm_unpackhi_epi32(t6, i6);\
|
||||
t6 = v128_unpackhi32(t6, i6);\
|
||||
i2 = t0;\
|
||||
t2 = _mm_unpacklo_epi32(t2, t3);\
|
||||
t2 = v128_unpacklo32(t2, t3);\
|
||||
i3 = t0;\
|
||||
t7 = _mm_unpackhi_epi32(t7, t3);\
|
||||
t7 = v128_unpackhi32(t7, t3);\
|
||||
\
|
||||
/* there are now 2 rows in each xmm */\
|
||||
/* unpack to get 1 row of CV in each xmm */\
|
||||
i1 = i0;\
|
||||
i1 = _mm_unpackhi_epi64(i1, i4);\
|
||||
i0 = _mm_unpacklo_epi64(i0, i4);\
|
||||
i1 = v128_unpackhi64(i1, i4);\
|
||||
i0 = v128_unpacklo64(i0, i4);\
|
||||
i4 = t4;\
|
||||
i3 = _mm_unpackhi_epi64(i3, t2);\
|
||||
i3 = v128_unpackhi64(i3, t2);\
|
||||
i5 = t4;\
|
||||
i2 = _mm_unpacklo_epi64(i2, t2);\
|
||||
i2 = v128_unpacklo64(i2, t2);\
|
||||
i6 = t5;\
|
||||
i5 = _mm_unpackhi_epi64(i5, t6);\
|
||||
i5 = v128_unpackhi64(i5, t6);\
|
||||
i7 = t5;\
|
||||
i4 = _mm_unpacklo_epi64(i4, t6);\
|
||||
i7 = _mm_unpackhi_epi64(i7, t7);\
|
||||
i6 = _mm_unpacklo_epi64(i6, t7);\
|
||||
i4 = v128_unpacklo64(i4, t6);\
|
||||
i7 = v128_unpackhi64(i7, t7);\
|
||||
i6 = v128_unpacklo64(i6, t7);\
|
||||
/* transpose done */\
|
||||
}/**/
|
||||
|
||||
@@ -471,74 +483,76 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
* outputs: (i0, o0, i1, i3, o1, o2, i5, i7)
|
||||
* clobbers: t0-t4
|
||||
*/
|
||||
#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\
|
||||
#define Matrix_Transpose_INV( i0, i1, i2, i3, i4, i5, i6, i7, \
|
||||
o0, o1, o2, t0, t1, t2, t3, t4 ) \
|
||||
{ \
|
||||
/* transpose matrix to get output format */\
|
||||
o1 = i0;\
|
||||
i0 = _mm_unpacklo_epi64(i0, i1);\
|
||||
o1 = _mm_unpackhi_epi64(o1, i1);\
|
||||
t0 = i2;\
|
||||
i2 = _mm_unpacklo_epi64(i2, i3);\
|
||||
t0 = _mm_unpackhi_epi64(t0, i3);\
|
||||
t1 = i4;\
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
t1 = _mm_unpackhi_epi64(t1, i5);\
|
||||
t2 = i6;\
|
||||
o1 = i0; \
|
||||
i0 = v128_unpacklo64( i0, i1 ); \
|
||||
o1 = v128_unpackhi64( o1, i1 ); \
|
||||
t0 = i2; \
|
||||
i2 = v128_unpacklo64( i2, i3 ); \
|
||||
t0 = v128_unpackhi64( t0, i3 ); \
|
||||
t1 = i4; \
|
||||
i4 = v128_unpacklo64( i4, i5 ); \
|
||||
t1 = v128_unpackhi64( t1, i5 ); \
|
||||
t2 = i6; \
|
||||
o0 = TRANSP_MASK; \
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
t2 = _mm_unpackhi_epi64(t2, i7);\
|
||||
i6 = v128_unpacklo64( i6, i7 ); \
|
||||
t2 = v128_unpackhi64( t2, i7 ); \
|
||||
/* load transpose mask into a register, because it will be used 8 times */\
|
||||
i0 = _mm_shuffle_epi8(i0, o0);\
|
||||
i2 = _mm_shuffle_epi8(i2, o0);\
|
||||
i4 = _mm_shuffle_epi8(i4, o0);\
|
||||
i6 = _mm_shuffle_epi8(i6, o0);\
|
||||
o1 = _mm_shuffle_epi8(o1, o0);\
|
||||
t0 = _mm_shuffle_epi8(t0, o0);\
|
||||
t1 = _mm_shuffle_epi8(t1, o0);\
|
||||
t2 = _mm_shuffle_epi8(t2, o0);\
|
||||
i0 = v128_shuffle8( i0, o0 ); \
|
||||
i2 = v128_shuffle8( i2, o0 ); \
|
||||
i4 = v128_shuffle8( i4, o0 ); \
|
||||
i6 = v128_shuffle8( i6, o0 ); \
|
||||
o1 = v128_shuffle8( o1, o0 ); \
|
||||
t0 = v128_shuffle8( t0, o0 ); \
|
||||
t1 = v128_shuffle8( t1, o0 ); \
|
||||
t2 = v128_shuffle8( t2, o0 ); \
|
||||
/* continue with unpack using 4 temp registers */\
|
||||
t3 = i4;\
|
||||
o2 = o1;\
|
||||
o0 = i0;\
|
||||
t4 = t1;\
|
||||
t3 = i4; \
|
||||
o2 = o1; \
|
||||
o0 = i0; \
|
||||
t4 = t1; \
|
||||
\
|
||||
t3 = _mm_unpackhi_epi16(t3, i6);\
|
||||
i4 = _mm_unpacklo_epi16(i4, i6);\
|
||||
o0 = _mm_unpackhi_epi16(o0, i2);\
|
||||
i0 = _mm_unpacklo_epi16(i0, i2);\
|
||||
o2 = _mm_unpackhi_epi16(o2, t0);\
|
||||
o1 = _mm_unpacklo_epi16(o1, t0);\
|
||||
t4 = _mm_unpackhi_epi16(t4, t2);\
|
||||
t1 = _mm_unpacklo_epi16(t1, t2);\
|
||||
t3 = v128_unpackhi16( t3, i6 ); \
|
||||
i4 = v128_unpacklo16( i4, i6 ); \
|
||||
o0 = v128_unpackhi16( o0, i2 ); \
|
||||
i0 = v128_unpacklo16( i0, i2 ); \
|
||||
o2 = v128_unpackhi16( o2, t0 ); \
|
||||
o1 = v128_unpacklo16( o1, t0 ); \
|
||||
t4 = v128_unpackhi16( t4, t2 ); \
|
||||
t1 = v128_unpacklo16( t1, t2 ); \
|
||||
/* shuffle with immediate */\
|
||||
i4 = _mm_shuffle_epi32(i4, 216);\
|
||||
t3 = _mm_shuffle_epi32(t3, 216);\
|
||||
o1 = _mm_shuffle_epi32(o1, 216);\
|
||||
o2 = _mm_shuffle_epi32(o2, 216);\
|
||||
i0 = _mm_shuffle_epi32(i0, 216);\
|
||||
o0 = _mm_shuffle_epi32(o0, 216);\
|
||||
t1 = _mm_shuffle_epi32(t1, 216);\
|
||||
t4 = _mm_shuffle_epi32(t4, 216);\
|
||||
i4 = gr_shuffle32( i4 ); \
|
||||
t3 = gr_shuffle32( t3 ); \
|
||||
o1 = gr_shuffle32( o1 ); \
|
||||
o2 = gr_shuffle32( o2 ); \
|
||||
i0 = gr_shuffle32( i0 ); \
|
||||
o0 = gr_shuffle32( o0 ); \
|
||||
t1 = gr_shuffle32( t1 ); \
|
||||
t4 = gr_shuffle32( t4 ); \
|
||||
/* continue with unpack */\
|
||||
i1 = i0;\
|
||||
i3 = o0;\
|
||||
i5 = o1;\
|
||||
i7 = o2;\
|
||||
i0 = _mm_unpacklo_epi32(i0, i4);\
|
||||
i1 = _mm_unpackhi_epi32(i1, i4);\
|
||||
o0 = _mm_unpacklo_epi32(o0, t3);\
|
||||
i3 = _mm_unpackhi_epi32(i3, t3);\
|
||||
o1 = _mm_unpacklo_epi32(o1, t1);\
|
||||
i5 = _mm_unpackhi_epi32(i5, t1);\
|
||||
o2 = _mm_unpacklo_epi32(o2, t4);\
|
||||
i7 = _mm_unpackhi_epi32(i7, t4);\
|
||||
i1 = i0; \
|
||||
i3 = o0; \
|
||||
i5 = o1; \
|
||||
i7 = o2; \
|
||||
i0 = v128_unpacklo32( i0, i4 ); \
|
||||
i1 = v128_unpackhi32( i1, i4 ); \
|
||||
o0 = v128_unpacklo32( o0, t3 ); \
|
||||
i3 = v128_unpackhi32( i3, t3 ); \
|
||||
o1 = v128_unpacklo32( o1, t1 ); \
|
||||
i5 = v128_unpackhi32( i5, t1 ); \
|
||||
o2 = v128_unpacklo32( o2, t4 ); \
|
||||
i7 = v128_unpackhi32( i7, t4 ); \
|
||||
/* transpose done */\
|
||||
}/**/
|
||||
|
||||
|
||||
void INIT( __m128i* chaining )
|
||||
void INIT( v128_t* chaining )
|
||||
{
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static v128_t xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static v128_t xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
|
||||
/* load IV into registers xmm8 - xmm15 */
|
||||
xmm8 = chaining[0];
|
||||
@@ -564,14 +578,14 @@ void INIT( __m128i* chaining )
|
||||
chaining[7] = xmm15;
|
||||
}
|
||||
|
||||
void TF1024( __m128i* chaining, const __m128i* message )
|
||||
void TF1024( v128_t* chaining, const v128_t* message )
|
||||
{
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i QTEMP[8];
|
||||
static __m128i TEMP0;
|
||||
static __m128i TEMP1;
|
||||
static __m128i TEMP2;
|
||||
static v128_t xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static v128_t xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static v128_t QTEMP[8];
|
||||
static v128_t TEMP0;
|
||||
static v128_t TEMP1;
|
||||
static v128_t TEMP2;
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_START;
|
||||
@@ -602,14 +616,14 @@ void TF1024( __m128i* chaining, const __m128i* message )
|
||||
|
||||
/* xor CV to message to get P input */
|
||||
/* result: CV+M in xmm8...xmm15 */
|
||||
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
|
||||
xmm9 = _mm_xor_si128(xmm9, (chaining[1]));
|
||||
xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
|
||||
xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
|
||||
xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
|
||||
xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
|
||||
xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
|
||||
xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
|
||||
xmm8 = v128_xor(xmm8, (chaining[0]));
|
||||
xmm9 = v128_xor(xmm9, (chaining[1]));
|
||||
xmm10 = v128_xor(xmm10, (chaining[2]));
|
||||
xmm11 = v128_xor(xmm11, (chaining[3]));
|
||||
xmm12 = v128_xor(xmm12, (chaining[4]));
|
||||
xmm13 = v128_xor(xmm13, (chaining[5]));
|
||||
xmm14 = v128_xor(xmm14, (chaining[6]));
|
||||
xmm15 = v128_xor(xmm15, (chaining[7]));
|
||||
|
||||
/* compute permutation P */
|
||||
/* result: P(CV+M) in xmm8...xmm15 */
|
||||
@@ -617,14 +631,14 @@ void TF1024( __m128i* chaining, const __m128i* message )
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV+M)+CV in xmm8...xmm15 */
|
||||
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
|
||||
xmm9 = _mm_xor_si128(xmm9, (chaining[1]));
|
||||
xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
|
||||
xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
|
||||
xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
|
||||
xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
|
||||
xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
|
||||
xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
|
||||
xmm8 = v128_xor(xmm8, (chaining[0]));
|
||||
xmm9 = v128_xor(xmm9, (chaining[1]));
|
||||
xmm10 = v128_xor(xmm10, (chaining[2]));
|
||||
xmm11 = v128_xor(xmm11, (chaining[3]));
|
||||
xmm12 = v128_xor(xmm12, (chaining[4]));
|
||||
xmm13 = v128_xor(xmm13, (chaining[5]));
|
||||
xmm14 = v128_xor(xmm14, (chaining[6]));
|
||||
xmm15 = v128_xor(xmm15, (chaining[7]));
|
||||
|
||||
/* store P(CV+M)+CV */
|
||||
chaining[0] = xmm8;
|
||||
@@ -652,14 +666,14 @@ void TF1024( __m128i* chaining, const __m128i* message )
|
||||
|
||||
/* xor Q output */
|
||||
/* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */
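/* i.e. the Groestl compression function f(h,m) = P(h ^ m) ^ Q(m) ^ h, with h the
   chaining value held in chaining[] and m the message block */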
|
||||
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
|
||||
xmm9 = _mm_xor_si128(xmm9, (chaining[1]));
|
||||
xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
|
||||
xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
|
||||
xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
|
||||
xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
|
||||
xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
|
||||
xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
|
||||
xmm8 = v128_xor(xmm8, (chaining[0]));
|
||||
xmm9 = v128_xor(xmm9, (chaining[1]));
|
||||
xmm10 = v128_xor(xmm10, (chaining[2]));
|
||||
xmm11 = v128_xor(xmm11, (chaining[3]));
|
||||
xmm12 = v128_xor(xmm12, (chaining[4]));
|
||||
xmm13 = v128_xor(xmm13, (chaining[5]));
|
||||
xmm14 = v128_xor(xmm14, (chaining[6]));
|
||||
xmm15 = v128_xor(xmm15, (chaining[7]));
|
||||
|
||||
/* store CV */
|
||||
chaining[0] = xmm8;
|
||||
@@ -678,13 +692,13 @@ void TF1024( __m128i* chaining, const __m128i* message )
|
||||
return;
|
||||
}
|
||||
|
||||
void OF1024( __m128i* chaining )
|
||||
void OF1024( v128_t* chaining )
|
||||
{
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i TEMP0;
|
||||
static __m128i TEMP1;
|
||||
static __m128i TEMP2;
|
||||
static v128_t xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static v128_t xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static v128_t TEMP0;
|
||||
static v128_t TEMP1;
|
||||
static v128_t TEMP2;
|
||||
|
||||
/* load CV into registers xmm8 - xmm15 */
|
||||
xmm8 = chaining[0];
|
||||
@@ -702,14 +716,14 @@ void OF1024( __m128i* chaining )
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV)+CV in xmm8...xmm15 */
|
||||
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
|
||||
xmm9 = _mm_xor_si128(xmm9, (chaining[1]));
|
||||
xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
|
||||
xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
|
||||
xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
|
||||
xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
|
||||
xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
|
||||
xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
|
||||
xmm8 = v128_xor(xmm8, (chaining[0]));
|
||||
xmm9 = v128_xor(xmm9, (chaining[1]));
|
||||
xmm10 = v128_xor(xmm10, (chaining[2]));
|
||||
xmm11 = v128_xor(xmm11, (chaining[3]));
|
||||
xmm12 = v128_xor(xmm12, (chaining[4]));
|
||||
xmm13 = v128_xor(xmm13, (chaining[5]));
|
||||
xmm14 = v128_xor(xmm14, (chaining[6]));
|
||||
xmm15 = v128_xor(xmm15, (chaining[7]));
|
||||
|
||||
/* transpose CV back from row ordering to column ordering */
|
||||
/* result: final hash value in xmm0, xmm6, xmm13, xmm15 */
|
||||
|
||||
@@ -7,11 +7,9 @@
 * This code is placed in the public domain
 */

#include <smmintrin.h>
#include <wmmintrin.h>
#include "hash-groestl256.h"

static const __m128i round_const_l0[] __attribute__ ((aligned (64))) =
static const v128u64_t round_const_l0[] __attribute__ ((aligned (64))) =
{
|
||||
{ 0x7060504030201000, 0xffffffffffffffff },
|
||||
{ 0x7161514131211101, 0xffffffffffffffff },
|
||||
@@ -25,7 +23,7 @@ static const __m128i round_const_l0[] __attribute__ ((aligned (64))) =
|
||||
{ 0x7969594939291909, 0xffffffffffffffff }
|
||||
};
|
||||
|
||||
static const __m128i round_const_l7[] __attribute__ ((aligned (64))) =
|
||||
static const v128u64_t round_const_l7[] __attribute__ ((aligned (64))) =
|
||||
{
|
||||
{ 0x0000000000000000, 0x8f9fafbfcfdfefff },
|
||||
{ 0x0000000000000000, 0x8e9eaebecedeeefe },
|
||||
@@ -39,16 +37,30 @@ static const __m128i round_const_l7[] __attribute__ ((aligned (64))) =
|
||||
{ 0x0000000000000000, 0x8696a6b6c6d6e6f6 }
|
||||
};
|
||||
|
||||
static const __m128i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02 };
|
||||
static const v128u64_t TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02 };
|
||||
|
||||
static const v128u64_t SUBSH_MASK0 = { 0x0c0f0104070b0e00, 0x03060a0d08020509 };
|
||||
static const v128u64_t SUBSH_MASK1 = { 0x0e090205000d0801, 0x04070c0f0a03060b };
|
||||
static const v128u64_t SUBSH_MASK2 = { 0x080b0306010f0a02, 0x05000e090c04070d };
|
||||
static const v128u64_t SUBSH_MASK3 = { 0x0a0d040702090c03, 0x0601080b0e05000f };
|
||||
static const v128u64_t SUBSH_MASK4 = { 0x0b0e0500030a0d04, 0x0702090c0f060108 };
|
||||
static const v128u64_t SUBSH_MASK5 = { 0x0d080601040c0f05, 0x00030b0e0907020a };
|
||||
static const v128u64_t SUBSH_MASK6 = { 0x0f0a0702050e0906, 0x01040d080b00030c };
|
||||
static const v128u64_t SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
|
||||
#if defined(__ARM_NEON)

// No fast shuffle on NEON
static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };

#define gr_shuffle32( v )   v128_shufflev32( v, vmask_d8 )

#else

#define gr_shuffle32( v )   _mm_shuffle_epi32( v, 0xd8 )

#endif

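/* Note: the immediate 0xd8 (0b11011000) permutes the 32-bit lanes to (0,2,1,3),
   i.e. it swaps the two middle dwords.  AArch64 has no immediate dword shuffle, so
   the NEON path does the same permutation with a table-driven shuffle; vmask_d8 is
   assumed to use the lane-index convention of the v128_shufflev32 helper. */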
static const __m128i SUBSH_MASK0 = { 0x0c0f0104070b0e00, 0x03060a0d08020509 };
|
||||
static const __m128i SUBSH_MASK1 = { 0x0e090205000d0801, 0x04070c0f0a03060b };
|
||||
static const __m128i SUBSH_MASK2 = { 0x080b0306010f0a02, 0x05000e090c04070d };
|
||||
static const __m128i SUBSH_MASK3 = { 0x0a0d040702090c03, 0x0601080b0e05000f };
|
||||
static const __m128i SUBSH_MASK4 = { 0x0b0e0500030a0d04, 0x0702090c0f060108 };
|
||||
static const __m128i SUBSH_MASK5 = { 0x0d080601040c0f05, 0x00030b0e0907020a };
|
||||
static const __m128i SUBSH_MASK6 = { 0x0f0a0702050e0906, 0x01040d080b00030c };
|
||||
static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
@@ -57,11 +69,11 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
/* xmm[i] will be multiplied by 2
 * xmm[j] will be lost
 * xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
  j = _mm_cmpgt_epi8( m128_zero, i);\
  i = _mm_add_epi8(i, i);\
  i = mm128_xorand(i, j, k );\
}
#define MUL2( i, j, k ) \
   j = v128_cmpgt8( v128_zero, i ); \
   i = v128_add8( i, i ); \
   i = v128_xorand( i, j, k );

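/* MUL2 doubles every byte in GF(2^8) with the AES/Groestl reduction polynomial 0x11b:
   after the byte-wise shift, lanes whose top bit was set are xored with 0x1b.  The
   signed compare yields an all-ones mask exactly in those lanes, and xorand is assumed
   to compute i ^ (j & k).  A scalar sketch of the same step, for illustration only:

     static inline uint8_t gf256_mul2( uint8_t x )
     {
        return (uint8_t)( ( x << 1 ) ^ ( ( x & 0x80 ) ? 0x1b : 0 ) );
     }
*/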
/* Yet another implementation of MixBytes.
|
||||
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
|
||||
@@ -87,85 +99,85 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
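/* MixBytes multiplies each state column by the circulant GF(2^8) matrix \
   circ(02,02,03,04,05,03,05,07); the t_i, x_i, y_i and z_i temporaries below follow \
   the byte-sliced factorisation of that matrix mentioned above */ \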
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
b7 = a1;\
|
||||
a0 = _mm_xor_si128(a0, a1);\
|
||||
a0 = v128_xor(a0, a1);\
|
||||
b0 = a2;\
|
||||
a1 = _mm_xor_si128(a1, a2);\
|
||||
a1 = v128_xor(a1, a2);\
|
||||
b1 = a3;\
|
||||
TEMP2 = _mm_xor_si128(a2, a3);\
|
||||
TEMP2 = v128_xor(a2, a3);\
|
||||
b2 = a4;\
|
||||
a3 = _mm_xor_si128(a3, a4);\
|
||||
a3 = v128_xor(a3, a4);\
|
||||
b3 = a5;\
|
||||
a4 = _mm_xor_si128(a4, a5);\
|
||||
a4 = v128_xor(a4, a5);\
|
||||
b4 = a6;\
|
||||
a5 = _mm_xor_si128(a5, a6);\
|
||||
a5 = v128_xor(a5, a6);\
|
||||
b5 = a7;\
|
||||
a6 = _mm_xor_si128(a6, a7);\
|
||||
a7 = _mm_xor_si128(a7, b6);\
|
||||
a6 = v128_xor(a6, a7);\
|
||||
a7 = v128_xor(a7, b6);\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
TEMP0 = mm128_xor3( b0, a4, a6 ); \
|
||||
TEMP0 = v128_xor3( b0, a4, a6 ); \
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP1 = mm128_xor3( b1, a5, a7 );\
|
||||
b2 = mm128_xor3( b2, a6, a0 ); \
|
||||
TEMP1 = v128_xor3( b1, a5, a7 );\
|
||||
b2 = v128_xor3( b2, a6, a0 ); \
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0;\
|
||||
b3 = mm128_xor3( b3, a7, a1 ); \
|
||||
b3 = v128_xor3( b3, a7, a1 ); \
|
||||
b1 = a1;\
|
||||
b6 = mm128_xor3( b6, a4, TEMP2 ); \
|
||||
b4 = mm128_xor3( b4, a0, TEMP2 ); \
|
||||
b7 = mm128_xor3( b7, a5, a3 ); \
|
||||
b5 = mm128_xor3( b5, a1, a3 ); \
|
||||
b6 = v128_xor3( b6, a4, TEMP2 ); \
|
||||
b4 = v128_xor3( b4, a0, TEMP2 ); \
|
||||
b7 = v128_xor3( b7, a5, a3 ); \
|
||||
b5 = v128_xor3( b5, a1, a3 ); \
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm_xor_si128(a0, a3);\
|
||||
a1 = _mm_xor_si128(a1, a4);\
|
||||
a2 = _mm_xor_si128(TEMP2, a5);\
|
||||
a3 = _mm_xor_si128(a3, a6);\
|
||||
a4 = _mm_xor_si128(a4, a7);\
|
||||
a5 = _mm_xor_si128(a5, b0);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, TEMP2);\
|
||||
a0 = v128_xor(a0, a3);\
|
||||
a1 = v128_xor(a1, a4);\
|
||||
a2 = v128_xor(TEMP2, a5);\
|
||||
a3 = v128_xor(a3, a6);\
|
||||
a4 = v128_xor(a4, a7);\
|
||||
a5 = v128_xor(a5, b0);\
|
||||
a6 = v128_xor(a6, b1);\
|
||||
a7 = v128_xor(a7, TEMP2);\
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = _mm_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
|
||||
b1 = v128_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
a0 = v128_xor(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
a1 = _mm_xor_si128(a1, TEMP1);\
|
||||
a1 = v128_xor(a1, TEMP1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
a2 = _mm_xor_si128(a2, b2);\
|
||||
a2 = v128_xor(a2, b2);\
|
||||
MUL2(a3, b0, b1);\
|
||||
a3 = _mm_xor_si128(a3, b3);\
|
||||
a3 = v128_xor(a3, b3);\
|
||||
MUL2(a4, b0, b1);\
|
||||
a4 = _mm_xor_si128(a4, b4);\
|
||||
a4 = v128_xor(a4, b4);\
|
||||
MUL2(a5, b0, b1);\
|
||||
a5 = _mm_xor_si128(a5, b5);\
|
||||
a5 = v128_xor(a5, b5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
a6 = _mm_xor_si128(a6, b6);\
|
||||
a6 = v128_xor(a6, b6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
a7 = _mm_xor_si128(a7, b7);\
|
||||
a7 = v128_xor(a7, b7);\
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2(a0, b0, b1);\
|
||||
b5 = _mm_xor_si128(b5, a0);\
|
||||
b5 = v128_xor(b5, a0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
b6 = _mm_xor_si128(b6, a1);\
|
||||
b6 = v128_xor(b6, a1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
b7 = _mm_xor_si128(b7, a2);\
|
||||
b7 = v128_xor(b7, a2);\
|
||||
MUL2(a5, b0, b1);\
|
||||
b2 = _mm_xor_si128(b2, a5);\
|
||||
b2 = v128_xor(b2, a5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
b3 = _mm_xor_si128(b3, a6);\
|
||||
b3 = v128_xor(b3, a6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
b4 = _mm_xor_si128(b4, a7);\
|
||||
b4 = v128_xor(b4, a7);\
|
||||
MUL2(a3, b0, b1);\
|
||||
MUL2(a4, b0, b1);\
|
||||
b0 = TEMP0;\
|
||||
b1 = TEMP1;\
|
||||
b0 = _mm_xor_si128(b0, a3);\
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
b0 = v128_xor(b0, a3);\
|
||||
b1 = v128_xor(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
#else
|
||||
@@ -174,96 +186,96 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
b7 = a1;\
|
||||
a0 = _mm_xor_si128(a0, a1);\
|
||||
a0 = v128_xor(a0, a1);\
|
||||
b0 = a2;\
|
||||
a1 = _mm_xor_si128(a1, a2);\
|
||||
a1 = v128_xor(a1, a2);\
|
||||
b1 = a3;\
|
||||
a2 = _mm_xor_si128(a2, a3);\
|
||||
a2 = v128_xor(a2, a3);\
|
||||
b2 = a4;\
|
||||
a3 = _mm_xor_si128(a3, a4);\
|
||||
a3 = v128_xor(a3, a4);\
|
||||
b3 = a5;\
|
||||
a4 = _mm_xor_si128(a4, a5);\
|
||||
a4 = v128_xor(a4, a5);\
|
||||
b4 = a6;\
|
||||
a5 = _mm_xor_si128(a5, a6);\
|
||||
a5 = v128_xor(a5, a6);\
|
||||
b5 = a7;\
|
||||
a6 = _mm_xor_si128(a6, a7);\
|
||||
a7 = _mm_xor_si128(a7, b6);\
|
||||
a6 = v128_xor(a6, a7);\
|
||||
a7 = v128_xor(a7, b6);\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
b0 = _mm_xor_si128(b0, a4);\
|
||||
b6 = _mm_xor_si128(b6, a4);\
|
||||
b1 = _mm_xor_si128(b1, a5);\
|
||||
b7 = _mm_xor_si128(b7, a5);\
|
||||
b2 = _mm_xor_si128(b2, a6);\
|
||||
b0 = _mm_xor_si128(b0, a6);\
|
||||
b0 = v128_xor(b0, a4);\
|
||||
b6 = v128_xor(b6, a4);\
|
||||
b1 = v128_xor(b1, a5);\
|
||||
b7 = v128_xor(b7, a5);\
|
||||
b2 = v128_xor(b2, a6);\
|
||||
b0 = v128_xor(b0, a6);\
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP0 = b0;\
|
||||
b3 = _mm_xor_si128(b3, a7);\
|
||||
b1 = _mm_xor_si128(b1, a7);\
|
||||
b3 = v128_xor(b3, a7);\
|
||||
b1 = v128_xor(b1, a7);\
|
||||
TEMP1 = b1;\
|
||||
b4 = _mm_xor_si128(b4, a0);\
|
||||
b2 = _mm_xor_si128(b2, a0);\
|
||||
b4 = v128_xor(b4, a0);\
|
||||
b2 = v128_xor(b2, a0);\
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0;\
|
||||
b5 = _mm_xor_si128(b5, a1);\
|
||||
b3 = _mm_xor_si128(b3, a1);\
|
||||
b5 = v128_xor(b5, a1);\
|
||||
b3 = v128_xor(b3, a1);\
|
||||
b1 = a1;\
|
||||
b6 = _mm_xor_si128(b6, a2);\
|
||||
b4 = _mm_xor_si128(b4, a2);\
|
||||
b6 = v128_xor(b6, a2);\
|
||||
b4 = v128_xor(b4, a2);\
|
||||
TEMP2 = a2;\
|
||||
b7 = _mm_xor_si128(b7, a3);\
|
||||
b5 = _mm_xor_si128(b5, a3);\
|
||||
b7 = v128_xor(b7, a3);\
|
||||
b5 = v128_xor(b5, a3);\
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm_xor_si128(a0, a3);\
|
||||
a1 = _mm_xor_si128(a1, a4);\
|
||||
a2 = _mm_xor_si128(a2, a5);\
|
||||
a3 = _mm_xor_si128(a3, a6);\
|
||||
a4 = _mm_xor_si128(a4, a7);\
|
||||
a5 = _mm_xor_si128(a5, b0);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, TEMP2);\
|
||||
a0 = v128_xor(a0, a3);\
|
||||
a1 = v128_xor(a1, a4);\
|
||||
a2 = v128_xor(a2, a5);\
|
||||
a3 = v128_xor(a3, a6);\
|
||||
a4 = v128_xor(a4, a7);\
|
||||
a5 = v128_xor(a5, b0);\
|
||||
a6 = v128_xor(a6, b1);\
|
||||
a7 = v128_xor(a7, TEMP2);\
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = _mm_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
|
||||
b1 = v128_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
a0 = v128_xor(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
a1 = _mm_xor_si128(a1, TEMP1);\
|
||||
a1 = v128_xor(a1, TEMP1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
a2 = _mm_xor_si128(a2, b2);\
|
||||
a2 = v128_xor(a2, b2);\
|
||||
MUL2(a3, b0, b1);\
|
||||
a3 = _mm_xor_si128(a3, b3);\
|
||||
a3 = v128_xor(a3, b3);\
|
||||
MUL2(a4, b0, b1);\
|
||||
a4 = _mm_xor_si128(a4, b4);\
|
||||
a4 = v128_xor(a4, b4);\
|
||||
MUL2(a5, b0, b1);\
|
||||
a5 = _mm_xor_si128(a5, b5);\
|
||||
a5 = v128_xor(a5, b5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
a6 = _mm_xor_si128(a6, b6);\
|
||||
a6 = v128_xor(a6, b6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
a7 = _mm_xor_si128(a7, b7);\
|
||||
a7 = v128_xor(a7, b7);\
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2(a0, b0, b1);\
|
||||
b5 = _mm_xor_si128(b5, a0);\
|
||||
b5 = v128_xor(b5, a0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
b6 = _mm_xor_si128(b6, a1);\
|
||||
b6 = v128_xor(b6, a1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
b7 = _mm_xor_si128(b7, a2);\
|
||||
b7 = v128_xor(b7, a2);\
|
||||
MUL2(a5, b0, b1);\
|
||||
b2 = _mm_xor_si128(b2, a5);\
|
||||
b2 = v128_xor(b2, a5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
b3 = _mm_xor_si128(b3, a6);\
|
||||
b3 = v128_xor(b3, a6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
b4 = _mm_xor_si128(b4, a7);\
|
||||
b4 = v128_xor(b4, a7);\
|
||||
MUL2(a3, b0, b1);\
|
||||
MUL2(a4, b0, b1);\
|
||||
b0 = TEMP0;\
|
||||
b1 = TEMP1;\
|
||||
b0 = _mm_xor_si128(b0, a3);\
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
b0 = v128_xor(b0, a3);\
|
||||
b1 = v128_xor(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
#endif
|
||||
@@ -275,34 +287,34 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
*/
|
||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant */\
|
||||
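/* note: each register is assumed to hold one P row in its low qword and the matching \
   Q row in its high qword, so round_const_l0 carries P's row-0 constant plus Q's \
   all-ones mask, b1 applies only the Q mask, and round_const_l7 adds Q's row-7 \
   constant */ \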
b1 = _mm_set_epi64x( 0xffffffffffffffff, 0 ); \
|
||||
a0 = _mm_xor_si128( a0, casti_m128i( round_const_l0, i ) ); \
|
||||
a1 = _mm_xor_si128( a1, b1 ); \
|
||||
a2 = _mm_xor_si128( a2, b1 ); \
|
||||
a3 = _mm_xor_si128( a3, b1 ); \
|
||||
a4 = _mm_xor_si128( a4, b1 ); \
|
||||
a5 = _mm_xor_si128( a5, b1 ); \
|
||||
a6 = _mm_xor_si128( a6, b1 ); \
|
||||
a7 = _mm_xor_si128( a7, casti_m128i( round_const_l7, i ) ); \
|
||||
b1 = v128_set64( 0xffffffffffffffff, 0 ); \
|
||||
a0 = v128_xor( a0, casti_v128( round_const_l0, i ) ); \
|
||||
a1 = v128_xor( a1, b1 ); \
|
||||
a2 = v128_xor( a2, b1 ); \
|
||||
a3 = v128_xor( a3, b1 ); \
|
||||
a4 = v128_xor( a4, b1 ); \
|
||||
a5 = v128_xor( a5, b1 ); \
|
||||
a6 = v128_xor( a6, b1 ); \
|
||||
a7 = v128_xor( a7, casti_v128( round_const_l7, i ) ); \
|
||||
\
|
||||
/* ShiftBytes + SubBytes (interleaved) */\
|
||||
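/* note: aesenclast with an all-zero round key (b0, cleared below) reduces to AES \
   ShiftRows + SubBytes; the SUBSH_MASK shuffles are arranged so that, combined with \
   that ShiftRows, the net permutation is Groestl's ShiftBytes, while the AES S-box \
   is exactly Groestl's SubBytes */ \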
b0 = _mm_xor_si128(b0, b0);\
|
||||
a0 = _mm_shuffle_epi8( a0, SUBSH_MASK0 ); \
|
||||
a0 = _mm_aesenclast_si128( a0, b0 );\
|
||||
a1 = _mm_shuffle_epi8( a1, SUBSH_MASK1 ); \
|
||||
a1 = _mm_aesenclast_si128( a1, b0 );\
|
||||
a2 = _mm_shuffle_epi8( a2, SUBSH_MASK2 ); \
|
||||
a2 = _mm_aesenclast_si128( a2, b0 );\
|
||||
a3 = _mm_shuffle_epi8( a3, SUBSH_MASK3 ); \
|
||||
a3 = _mm_aesenclast_si128( a3, b0 );\
|
||||
a4 = _mm_shuffle_epi8( a4, SUBSH_MASK4 ); \
|
||||
a4 = _mm_aesenclast_si128( a4, b0 );\
|
||||
a5 = _mm_shuffle_epi8( a5, SUBSH_MASK5 ); \
|
||||
a5 = _mm_aesenclast_si128( a5, b0 );\
|
||||
a6 = _mm_shuffle_epi8( a6, SUBSH_MASK6 ); \
|
||||
a6 = _mm_aesenclast_si128( a6, b0 );\
|
||||
a7 = _mm_shuffle_epi8( a7, SUBSH_MASK7 ); \
|
||||
a7 = _mm_aesenclast_si128( a7, b0 );\
|
||||
b0 = v128_xor(b0, b0);\
|
||||
a0 = v128_shuffle8( a0, SUBSH_MASK0 ); \
|
||||
a0 = v128_aesenclast( a0, b0 );\
|
||||
a1 = v128_shuffle8( a1, SUBSH_MASK1 ); \
|
||||
a1 = v128_aesenclast( a1, b0 );\
|
||||
a2 = v128_shuffle8( a2, SUBSH_MASK2 ); \
|
||||
a2 = v128_aesenclast( a2, b0 );\
|
||||
a3 = v128_shuffle8( a3, SUBSH_MASK3 ); \
|
||||
a3 = v128_aesenclast( a3, b0 );\
|
||||
a4 = v128_shuffle8( a4, SUBSH_MASK4 ); \
|
||||
a4 = v128_aesenclast( a4, b0 );\
|
||||
a5 = v128_shuffle8( a5, SUBSH_MASK5 ); \
|
||||
a5 = v128_aesenclast( a5, b0 );\
|
||||
a6 = v128_shuffle8( a6, SUBSH_MASK6 ); \
|
||||
a6 = v128_aesenclast( a6, b0 );\
|
||||
a7 = v128_shuffle8( a7, SUBSH_MASK7 ); \
|
||||
a7 = v128_aesenclast( a7, b0 );\
|
||||
\
|
||||
/* MixBytes */\
|
||||
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
@@ -334,31 +346,31 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
|
||||
t0 = TRANSP_MASK; \
|
||||
\
|
||||
i0 = _mm_shuffle_epi8(i0, t0);\
|
||||
i1 = _mm_shuffle_epi8(i1, t0);\
|
||||
i2 = _mm_shuffle_epi8(i2, t0);\
|
||||
i3 = _mm_shuffle_epi8(i3, t0);\
|
||||
i0 = v128_shuffle8(i0, t0);\
|
||||
i1 = v128_shuffle8(i1, t0);\
|
||||
i2 = v128_shuffle8(i2, t0);\
|
||||
i3 = v128_shuffle8(i3, t0);\
|
||||
\
|
||||
o1 = i0;\
|
||||
t0 = i2;\
|
||||
\
|
||||
i0 = _mm_unpacklo_epi16(i0, i1);\
|
||||
o1 = _mm_unpackhi_epi16(o1, i1);\
|
||||
i2 = _mm_unpacklo_epi16(i2, i3);\
|
||||
t0 = _mm_unpackhi_epi16(t0, i3);\
|
||||
i0 = v128_unpacklo16(i0, i1);\
|
||||
o1 = v128_unpackhi16(o1, i1);\
|
||||
i2 = v128_unpacklo16(i2, i3);\
|
||||
t0 = v128_unpackhi16(t0, i3);\
|
||||
\
|
||||
i0 = _mm_shuffle_epi32(i0, 216);\
|
||||
o1 = _mm_shuffle_epi32(o1, 216);\
|
||||
i2 = _mm_shuffle_epi32(i2, 216);\
|
||||
t0 = _mm_shuffle_epi32(t0, 216);\
|
||||
i0 = gr_shuffle32( i0 ); \
|
||||
o1 = gr_shuffle32( o1 ); \
|
||||
i2 = gr_shuffle32( i2 ); \
|
||||
t0 = gr_shuffle32( t0 ); \
|
||||
\
|
||||
o2 = i0;\
|
||||
o3 = o1;\
|
||||
\
|
||||
i0 = _mm_unpacklo_epi32(i0, i2);\
|
||||
o1 = _mm_unpacklo_epi32(o1, t0);\
|
||||
o2 = _mm_unpackhi_epi32(o2, i2);\
|
||||
o3 = _mm_unpackhi_epi32(o3, t0);\
|
||||
i0 = v128_unpacklo32(i0, i2);\
|
||||
o1 = v128_unpacklo32(o1, t0);\
|
||||
o2 = v128_unpackhi32(o2, i2);\
|
||||
o3 = v128_unpackhi32(o3, t0);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Step 2
|
||||
@@ -376,19 +388,19 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
|
||||
o1 = i0;\
|
||||
o2 = i1;\
|
||||
i0 = _mm_unpacklo_epi64(i0, i4);\
|
||||
o1 = _mm_unpackhi_epi64(o1, i4);\
|
||||
i0 = v128_unpacklo64(i0, i4);\
|
||||
o1 = v128_unpackhi64(o1, i4);\
|
||||
o3 = i1;\
|
||||
o4 = i2;\
|
||||
o2 = _mm_unpacklo_epi64(o2, i5);\
|
||||
o3 = _mm_unpackhi_epi64(o3, i5);\
|
||||
o2 = v128_unpacklo64(o2, i5);\
|
||||
o3 = v128_unpackhi64(o3, i5);\
|
||||
o5 = i2;\
|
||||
o6 = i3;\
|
||||
o4 = _mm_unpacklo_epi64(o4, i6);\
|
||||
o5 = _mm_unpackhi_epi64(o5, i6);\
|
||||
o4 = v128_unpacklo64(o4, i6);\
|
||||
o5 = v128_unpackhi64(o5, i6);\
|
||||
o7 = i3;\
|
||||
o6 = _mm_unpacklo_epi64(o6, i7);\
|
||||
o7 = _mm_unpackhi_epi64(o7, i7);\
|
||||
o6 = v128_unpacklo64(o6, i7);\
|
||||
o7 = v128_unpackhi64(o7, i7);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Inverse Step 2
|
||||
@@ -399,17 +411,17 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
*/
|
||||
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
|
||||
o0 = i0;\
|
||||
i0 = _mm_unpacklo_epi64(i0, i1);\
|
||||
o0 = _mm_unpackhi_epi64(o0, i1);\
|
||||
i0 = v128_unpacklo64(i0, i1);\
|
||||
o0 = v128_unpackhi64(o0, i1);\
|
||||
o1 = i2;\
|
||||
i2 = _mm_unpacklo_epi64(i2, i3);\
|
||||
o1 = _mm_unpackhi_epi64(o1, i3);\
|
||||
i2 = v128_unpacklo64(i2, i3);\
|
||||
o1 = v128_unpackhi64(o1, i3);\
|
||||
o2 = i4;\
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
o2 = _mm_unpackhi_epi64(o2, i5);\
|
||||
i4 = v128_unpacklo64(i4, i5);\
|
||||
o2 = v128_unpackhi64(o2, i5);\
|
||||
o3 = i6;\
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
o3 = _mm_unpackhi_epi64(o3, i7);\
|
||||
i6 = v128_unpacklo64(i6, i7);\
|
||||
o3 = v128_unpackhi64(o3, i7);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Step 2
|
||||
@@ -419,19 +431,19 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
* outputs: (i0-7) = (0|S)
|
||||
*/
|
||||
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
|
||||
t0 = _mm_xor_si128(t0, t0);\
|
||||
t0 = v128_xor(t0, t0);\
|
||||
i1 = i0;\
|
||||
i3 = i2;\
|
||||
i5 = i4;\
|
||||
i7 = i6;\
|
||||
i0 = _mm_unpacklo_epi64(i0, t0);\
|
||||
i1 = _mm_unpackhi_epi64(i1, t0);\
|
||||
i2 = _mm_unpacklo_epi64(i2, t0);\
|
||||
i3 = _mm_unpackhi_epi64(i3, t0);\
|
||||
i4 = _mm_unpacklo_epi64(i4, t0);\
|
||||
i5 = _mm_unpackhi_epi64(i5, t0);\
|
||||
i6 = _mm_unpacklo_epi64(i6, t0);\
|
||||
i7 = _mm_unpackhi_epi64(i7, t0);\
|
||||
i0 = v128_unpacklo64(i0, t0);\
|
||||
i1 = v128_unpackhi64(i1, t0);\
|
||||
i2 = v128_unpacklo64(i2, t0);\
|
||||
i3 = v128_unpackhi64(i3, t0);\
|
||||
i4 = v128_unpacklo64(i4, t0);\
|
||||
i5 = v128_unpackhi64(i5, t0);\
|
||||
i6 = v128_unpacklo64(i6, t0);\
|
||||
i7 = v128_unpackhi64(i7, t0);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Inverse Step 2
|
||||
@@ -441,17 +453,17 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
* outputs: (i0, i2, i4, i6) = S
|
||||
*/
|
||||
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
|
||||
i0 = _mm_unpacklo_epi64(i0, i1);\
|
||||
i2 = _mm_unpacklo_epi64(i2, i3);\
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
i0 = v128_unpacklo64(i0, i1);\
|
||||
i2 = v128_unpacklo64(i2, i3);\
|
||||
i4 = v128_unpacklo64(i4, i5);\
|
||||
i6 = v128_unpacklo64(i6, i7);\
|
||||
}/**/
|
||||
|
||||
|
||||
void INIT256( __m128i* chaining )
|
||||
void INIT256( v128_t* chaining )
|
||||
{
|
||||
static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7;
|
||||
static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15;
|
||||
static v128_t xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7;
|
||||
static v128_t /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15;
|
||||
|
||||
/* load IV into registers xmm12 - xmm15 */
|
||||
xmm12 = chaining[0];
|
||||
@@ -470,13 +482,13 @@ void INIT256( __m128i* chaining )
|
||||
chaining[3] = xmm7;
|
||||
}
|
||||
|
||||
void TF512( __m128i* chaining, __m128i* message )
|
||||
void TF512( v128_t* chaining, v128_t* message )
|
||||
{
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i TEMP0;
|
||||
static __m128i TEMP1;
|
||||
static __m128i TEMP2;
|
||||
static v128_t xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static v128_t xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static v128_t TEMP0;
|
||||
static v128_t TEMP1;
|
||||
static v128_t TEMP2;
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_START;
|
||||
@@ -501,10 +513,10 @@ void TF512( __m128i* chaining, __m128i* message )
|
||||
|
||||
/* xor message to CV get input of P */
|
||||
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
|
||||
xmm8 = _mm_xor_si128(xmm8, xmm12);
|
||||
xmm0 = _mm_xor_si128(xmm0, xmm2);
|
||||
xmm4 = _mm_xor_si128(xmm4, xmm6);
|
||||
xmm5 = _mm_xor_si128(xmm5, xmm7);
|
||||
xmm8 = v128_xor(xmm8, xmm12);
|
||||
xmm0 = v128_xor(xmm0, xmm2);
|
||||
xmm4 = v128_xor(xmm4, xmm6);
|
||||
xmm5 = v128_xor(xmm5, xmm7);
|
||||
|
||||
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
|
||||
@@ -519,17 +531,17 @@ void TF512( __m128i* chaining, __m128i* message )
|
||||
|
||||
/* xor output of P and Q */
|
||||
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
|
||||
xmm0 = _mm_xor_si128(xmm0, xmm8);
|
||||
xmm1 = _mm_xor_si128(xmm1, xmm10);
|
||||
xmm2 = _mm_xor_si128(xmm2, xmm12);
|
||||
xmm3 = _mm_xor_si128(xmm3, xmm14);
|
||||
xmm0 = v128_xor(xmm0, xmm8);
|
||||
xmm1 = v128_xor(xmm1, xmm10);
|
||||
xmm2 = v128_xor(xmm2, xmm12);
|
||||
xmm3 = v128_xor(xmm3, xmm14);
|
||||
|
||||
/* xor CV (feed-forward) */
|
||||
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
|
||||
xmm0 = _mm_xor_si128(xmm0, (chaining[0]));
|
||||
xmm1 = _mm_xor_si128(xmm1, (chaining[1]));
|
||||
xmm2 = _mm_xor_si128(xmm2, (chaining[2]));
|
||||
xmm3 = _mm_xor_si128(xmm3, (chaining[3]));
|
||||
xmm0 = v128_xor(xmm0, (chaining[0]));
|
||||
xmm1 = v128_xor(xmm1, (chaining[1]));
|
||||
xmm2 = v128_xor(xmm2, (chaining[2]));
|
||||
xmm3 = v128_xor(xmm3, (chaining[3]));
|
||||
|
||||
/* store CV */
|
||||
chaining[0] = xmm0;
|
||||
@@ -543,13 +555,13 @@ void TF512( __m128i* chaining, __m128i* message )
|
||||
return;
|
||||
}
|
||||
|
||||
void OF512( __m128i* chaining )
|
||||
void OF512( v128_t* chaining )
|
||||
{
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i TEMP0;
|
||||
static __m128i TEMP1;
|
||||
static __m128i TEMP2;
|
||||
static v128_t xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static v128_t xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static v128_t TEMP0;
|
||||
static v128_t TEMP1;
|
||||
static v128_t TEMP2;
|
||||
|
||||
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = chaining[0];
|
||||
@@ -572,10 +584,10 @@ void OF512( __m128i* chaining )
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
|
||||
xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
|
||||
xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
|
||||
xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
|
||||
xmm8 = v128_xor(xmm8, (chaining[0]));
|
||||
xmm10 = v128_xor(xmm10, (chaining[1]));
|
||||
xmm12 = v128_xor(xmm12, (chaining[2]));
|
||||
xmm14 = v128_xor(xmm14, (chaining[3]));
|
||||
|
||||
/* transform state back from row ordering into column ordering */
|
||||
/* result: final hash value in xmm9, xmm11 */
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
// Optimized for hash and data length that are integrals of __m128i
// Optimized for hash and data lengths that are integral multiples of v128_t
|
||||
|
||||
#include <memory.h>
|
||||
@@ -14,11 +14,11 @@
|
||||
#include "miner.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#ifdef __AES__
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
|
||||
#include "groestl-intr-aes.h"
|
||||
|
||||
HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
|
||||
int init_groestl( hashState_groestl* ctx, int hashlen )
|
||||
{
|
||||
int i;
|
||||
|
||||
@@ -26,52 +26,40 @@ HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
|
||||
|
||||
for ( i = 0; i < SIZE512; i++ )
|
||||
{
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
ctx->chaining[i] = v128_zero;
|
||||
ctx->buffer[i] = v128_zero;
|
||||
}
|
||||
|
||||
// The only non-zero in the IV is len. It can be hard coded.
|
||||
ctx->chaining[ 6 ] = _mm_set_epi64x( 0x0200000000000000, 0 );
|
||||
ctx->chaining[ 6 ] = v128_set64( 0x0200000000000000, 0 );
|
||||
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
return SUCCESS_GR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
HashReturn_gr reinit_groestl( hashState_groestl* ctx )
|
||||
int reinit_groestl( hashState_groestl* ctx )
|
||||
{
|
||||
int i;
|
||||
|
||||
for ( i = 0; i < SIZE512; i++ )
|
||||
{
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
ctx->chaining[i] = v128_zero;
|
||||
ctx->buffer[i] = v128_zero;
|
||||
}
|
||||
ctx->chaining[ 6 ] = _mm_set_epi64x( 0x0200000000000000, 0 );
|
||||
ctx->chaining[ 6 ] = v128_set64( 0x0200000000000000, 0 );
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
return SUCCESS_GR;
|
||||
return 0;
|
||||
}
|
||||
//// midstate is broken
// To use midstate:
// 1. midstate must process all full blocks.
// 2. tail must be less than a full block and may not straddle a
//    block boundary.
// 3. midstate and tail each must be multiples of 128 bits.
// 4. For best performance midstate length is a multiple of block size.
// 5. Midstate gives a smaller benefit than a full hash if the total hash
//    (midstate + tail) is less than 1 block.
//    This, unfortunately, is the case with all current users.
// 6. The more full blocks, the bigger the gain.

// use only for midstate precalc
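// Example (illustrative): an 80-byte block header is smaller than groestl-512's
// 128-byte block, so midstate + tail never fill a whole block and only the reduced
// case 5 above applies; there are no full blocks to precompute.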
HashReturn_gr update_groestl( hashState_groestl* ctx, const void* input,
|
||||
DataLength_gr databitlen )
|
||||
int update_groestl( hashState_groestl* ctx, const void* input,
|
||||
int databitlen )
|
||||
{
|
||||
__m128i* in = (__m128i*)input;
|
||||
const int len = (int)databitlen / 128; // bits to __m128i
|
||||
v128_t* in = (v128_t*)input;
|
||||
const int len = (int)databitlen / 128; // bits to v128_t
|
||||
const int blocks = len / SIZE512; // __M128i to blocks
|
||||
int rem = ctx->rem_ptr;
|
||||
int i;
|
||||
@@ -92,16 +80,16 @@ HashReturn_gr update_groestl( hashState_groestl* ctx, const void* input,
|
||||
// adjust rem_ptr for possible new data
|
||||
ctx->rem_ptr += i;
|
||||
|
||||
return SUCCESS_GR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// deprecated do not use
|
||||
HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
|
||||
int final_groestl( hashState_groestl* ctx, void* output )
|
||||
{
|
||||
const int len = (int)ctx->databitlen / 128; // bits to __m128i
|
||||
const int len = (int)ctx->databitlen / 128; // bits to v128_t
|
||||
const uint64_t blocks = ctx->blk_count + 1; // adjust for final block
|
||||
const int rem_ptr = ctx->rem_ptr; // end of data start of padding
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to v128_t
|
||||
const int hash_offset = SIZE512 - hashlen_m128i; // where in buffer
|
||||
int i;
|
||||
|
||||
@@ -111,18 +99,18 @@ HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
|
||||
if ( rem_ptr == len - 1 )
|
||||
{
|
||||
// only 128 bits left in buffer, all padding at once
|
||||
ctx->buffer[rem_ptr] = _mm_set_epi64x( blocks << 56, 0x80 );
|
||||
ctx->buffer[rem_ptr] = v128_set64( blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[rem_ptr] = _mm_set_epi64x( 0, 0x80 );
|
||||
ctx->buffer[rem_ptr] = v128_set64( 0, 0x80 );
|
||||
// add zero padding
|
||||
for ( i = rem_ptr + 1; i < SIZE512 - 1; i++ )
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = v128_zero;
|
||||
|
||||
// add length padding, second last byte is zero unless blocks > 255
|
||||
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 );
|
||||
ctx->buffer[i] = v128_set64( blocks << 56, 0 );
|
||||
}
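/* Illustrative layout of the final 128-byte padding block when the data ends on a
   16-byte boundary: the first free byte is 0x80, everything after it is zero, and
   the very last byte holds the total block count, which is why the shortcut above
   only covers counts below 256. */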
|
||||
|
||||
// digest final padding block and do output transform
|
||||
@@ -131,13 +119,13 @@ HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
|
||||
|
||||
// store hash result in output
|
||||
for ( i = 0; i < hashlen_m128i; i++ )
|
||||
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i];
|
||||
casti_v128( output, i ) = ctx->chaining[ hash_offset + i];
|
||||
|
||||
return SUCCESS_GR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int groestl512_full( hashState_groestl* ctx, void* output,
|
||||
const void* input, uint64_t databitlen )
|
||||
int groestl512( hashState_groestl* ctx, void* output, const void* input,
|
||||
uint64_t databitlen )
|
||||
{
|
||||
|
||||
int i;
|
||||
@@ -145,19 +133,19 @@ int groestl512_full( hashState_groestl* ctx, void* output,
|
||||
|
||||
for ( i = 0; i < SIZE512; i++ )
|
||||
{
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
ctx->chaining[i] = v128_zero;
|
||||
ctx->buffer[i] = v128_zero;
|
||||
}
|
||||
ctx->chaining[ 6 ] = _mm_set_epi64x( 0x0200000000000000, 0 );
|
||||
ctx->chaining[ 6 ] = v128_set64( 0x0200000000000000, 0 );
|
||||
ctx->buf_ptr = 0;
|
||||
|
||||
// --- update ---
|
||||
|
||||
const int len = (int)databitlen / 128;
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to v128_t
|
||||
const int hash_offset = SIZE512 - hashlen_m128i;
|
||||
uint64_t blocks = len / SIZE512;
|
||||
__m128i* in = (__m128i*)input;
|
||||
v128_t* in = (v128_t*)input;
|
||||
|
||||
// digest any full blocks, process directly from input
|
||||
for ( i = 0; i < blocks; i++ )
|
||||
@@ -177,18 +165,18 @@ int groestl512_full( hashState_groestl* ctx, void* output,
|
||||
if ( i == len -1 )
|
||||
{
|
||||
// only 128 bits left in buffer, all padding at once
|
||||
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0x80 );
|
||||
ctx->buffer[i] = v128_set64( blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = _mm_set_epi64x( 0, 0x80 );
|
||||
ctx->buffer[i] = v128_set64( 0, 0x80 );
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE512 - 1; i++ )
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = v128_zero;
|
||||
|
||||
// add length padding, second last byte is zero unless blocks > 255
|
||||
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 );
|
||||
ctx->buffer[i] = v128_set64( blocks << 56, 0 );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
@@ -197,21 +185,21 @@ int groestl512_full( hashState_groestl* ctx, void* output,
|
||||
|
||||
// store hash result in output
|
||||
for ( i = 0; i < hashlen_m128i; i++ )
|
||||
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
casti_v128( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
|
||||
const void* input, DataLength_gr databitlen )
|
||||
int update_and_final_groestl( hashState_groestl* ctx, void* output,
|
||||
const void* input, int databitlen )
|
||||
{
|
||||
const int len = (int)databitlen / 128;
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to v128_t
|
||||
const int hash_offset = SIZE512 - hashlen_m128i;
|
||||
int rem = ctx->rem_ptr;
|
||||
uint64_t blocks = len / SIZE512;
|
||||
__m128i* in = (__m128i*)input;
|
||||
v128_t* in = (v128_t*)input;
|
||||
int i;
|
||||
|
||||
// --- update ---
|
||||
@@ -234,18 +222,18 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
|
||||
if ( i == len -1 )
|
||||
{
|
||||
// only 128 bits left in buffer, all padding at once
|
||||
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0x80 );
|
||||
ctx->buffer[i] = v128_set64( blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = _mm_set_epi64x( 0, 0x80 );
|
||||
ctx->buffer[i] = v128_set64( 0, 0x80 );
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE512 - 1; i++ )
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = v128_zero;
|
||||
|
||||
// add length padding, second last byte is zero unless blocks > 255
|
||||
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 );
|
||||
ctx->buffer[i] = v128_set64( blocks << 56, 0 );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
@@ -254,17 +242,16 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
|
||||
|
||||
// store hash result in output
|
||||
for ( i = 0; i < hashlen_m128i; i++ )
|
||||
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
casti_v128( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
|
||||
return SUCCESS_GR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* hash bit sequence */
|
||||
HashReturn_gr hash_groestl(int hashbitlen,
|
||||
const BitSequence_gr* data,
|
||||
DataLength_gr databitlen,
|
||||
BitSequence_gr* hashval) {
|
||||
HashReturn_gr ret;
|
||||
int hash_groestl( int hashbitlen, const BitSequence_gr* data, int databitlen,
|
||||
uint8_t* hashval )
|
||||
{
|
||||
int ret;
|
||||
hashState_groestl context;
|
||||
|
||||
/* initialise */
|
||||
@@ -290,4 +277,5 @@ int crypto_hash(unsigned char *out, const unsigned char *in, unsigned long long
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#endif /// SSSE3 or NEON
|
||||
|
||||
@@ -16,8 +16,6 @@
|
||||
#include <stdlib.h>
|
||||
#include "simd-utils.h"
|
||||
|
||||
#define LENGTH (512)
|
||||
|
||||
#include "brg_endian.h"
|
||||
//#define NEED_UINT_64T
|
||||
#include "compat/brg_types.h"
|
||||
@@ -32,6 +30,8 @@
|
||||
//#define ROUNDS512 (10)
|
||||
#define ROUNDS1024 (14)
|
||||
|
||||
#define LENGTH 512
|
||||
|
||||
//#if LENGTH<=256
|
||||
//#define COLS (COLS512)
|
||||
//#define SIZE (SIZE512)
|
||||
@@ -76,17 +76,17 @@ typedef struct {
|
||||
} hashState_groestl;
|
||||
|
||||
|
||||
HashReturn_gr init_groestl( hashState_groestl*, int );
|
||||
int init_groestl( hashState_groestl*, int );
|
||||
|
||||
HashReturn_gr reinit_groestl( hashState_groestl* );
|
||||
int reinit_groestl( hashState_groestl* );
|
||||
|
||||
HashReturn_gr update_groestl( hashState_groestl*, const void*,
|
||||
DataLength_gr );
|
||||
int update_groestl( hashState_groestl*, const void*, int );
|
||||
|
||||
HashReturn_gr final_groestl( hashState_groestl*, void* );
|
||||
int final_groestl( hashState_groestl*, void* );
|
||||
|
||||
int update_and_final_groestl( hashState_groestl*, void*, const void*, int );
|
||||
int groestl512( hashState_groestl*, void*, const void*, uint64_t );
|
||||
#define groestl512_full groestl512
|
||||
|
||||
HashReturn_gr update_and_final_groestl( hashState_groestl*, void*,
|
||||
const void*, DataLength_gr );
|
||||
int groestl512_full( hashState_groestl*, void*, const void*, uint64_t );
|
||||
|
||||
#endif /* __hash_h */
|
||||
|
||||
@@ -11,12 +11,12 @@
|
||||
#include "miner.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#ifdef __AES__
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
|
||||
#include "groestl256-intr-aes.h"
|
||||
|
||||
/* initialise context */
|
||||
HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
|
||||
int init_groestl256( hashState_groestl256* ctx, int hashlen )
|
||||
{
|
||||
int i;
|
||||
|
||||
@@ -24,42 +24,42 @@ HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
|
||||
|
||||
for ( i = 0; i < SIZE256; i++ )
|
||||
{
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
ctx->chaining[i] = v128_zero;
|
||||
ctx->buffer[i] = v128_zero;
|
||||
}
|
||||
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
|
||||
INIT256( ctx->chaining );
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
return SUCCESS_GR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
|
||||
int reinit_groestl256(hashState_groestl256* ctx)
|
||||
{
|
||||
int i;
|
||||
|
||||
for ( i = 0; i < SIZE256; i++ )
|
||||
{
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
ctx->chaining[i] = v128_zero;
|
||||
ctx->buffer[i] = v128_zero;
|
||||
}
|
||||
|
||||
ctx->chaining[ 3 ] = _mm_set_epi64x( 0, 0x0100000000000000 );
|
||||
ctx->chaining[ 3 ] = v128_set64( 0, 0x0100000000000000 );
|
||||
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
return SUCCESS_GR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Use this only for midstate and never for cryptonight
|
||||
HashReturn_gr update_groestl256( hashState_groestl256* ctx, const void* input,
|
||||
DataLength_gr databitlen )
|
||||
int update_groestl256( hashState_groestl256* ctx, const void* input,
|
||||
int databitlen )
|
||||
{
|
||||
__m128i* in = (__m128i*)input;
|
||||
const int len = (int)databitlen / 128; // bits to __m128i
|
||||
v128_t* in = (v128_t*)input;
|
||||
const int len = (int)databitlen / 128; // bits to v128_t
|
||||
const int blocks = len / SIZE256; // __M128i to blocks
|
||||
int rem = ctx->rem_ptr;
|
||||
int i;
|
||||
@@ -79,16 +79,16 @@ HashReturn_gr update_groestl256( hashState_groestl256* ctx, const void* input,
|
||||
// adjust rem_ptr for new data
|
||||
ctx->rem_ptr += i;
|
||||
|
||||
return SUCCESS_GR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// don't use this at all
|
||||
HashReturn_gr final_groestl256( hashState_groestl256* ctx, void* output )
|
||||
int final_groestl256( hashState_groestl256* ctx, void* output )
|
||||
{
|
||||
const int len = (int)ctx->databitlen / 128; // bits to __m128i
|
||||
const int len = (int)ctx->databitlen / 128; // bits to v128_t
|
||||
const int blocks = ctx->blk_count + 1; // adjust for final block
|
||||
const int rem_ptr = ctx->rem_ptr; // end of data start of padding
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to v128_t
|
||||
const int hash_offset = SIZE256 - hashlen_m128i; // where in buffer
|
||||
int i;
|
||||
|
||||
@@ -98,21 +98,20 @@ HashReturn_gr final_groestl256( hashState_groestl256* ctx, void* output )
|
||||
if ( rem_ptr == len - 1 )
|
||||
{
|
||||
// all padding at once
|
||||
ctx->buffer[rem_ptr] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
|
||||
ctx->buffer[rem_ptr] = v128_set8( blocks,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[rem_ptr] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
|
||||
ctx->buffer[rem_ptr] = v128_set8( 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0x80 );
|
||||
// add zero padding
|
||||
for ( i = rem_ptr + 1; i < SIZE256 - 1; i++ )
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = v128_zero;
|
||||
// add length padding
|
||||
// cheat since we know the block count is trivial, good if block < 256
|
||||
ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0 );
|
||||
ctx->buffer[i] = v128_set8( blocks,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0 );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
@@ -121,20 +120,20 @@ HashReturn_gr final_groestl256( hashState_groestl256* ctx, void* output )
|
||||
|
||||
// store hash result in output
|
||||
for ( i = 0; i < hashlen_m128i; i++ )
|
||||
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i];
|
||||
casti_v128( output, i ) = ctx->chaining[ hash_offset + i];
|
||||
|
||||
return SUCCESS_GR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
HashReturn_gr update_and_final_groestl256( hashState_groestl256* ctx,
|
||||
void* output, const void* input, DataLength_gr databitlen )
|
||||
int update_and_final_groestl256( hashState_groestl256* ctx,
|
||||
void* output, const void* input, int databitlen )
|
||||
{
|
||||
const int len = (int)databitlen / 128;
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to v128_t
|
||||
const int hash_offset = SIZE256 - hashlen_m128i;
|
||||
int rem = ctx->rem_ptr;
|
||||
int blocks = len / SIZE256;
|
||||
__m128i* in = (__m128i*)input;
|
||||
v128_t* in = (v128_t*)input;
|
||||
int i;
|
||||
|
||||
// --- update ---
|
||||
@@ -144,7 +143,7 @@ HashReturn_gr update_and_final_groestl256( hashState_groestl256* ctx,
|
||||
TF512( ctx->chaining, &in[ i * SIZE256 ] );
|
||||
ctx->buf_ptr = blocks * SIZE256;
|
||||
|
||||
// cryptonight has 200 byte input, an odd number of __m128i
|
||||
// cryptonight has 200 byte input, an odd number of v128_t
|
||||
// remainder is only 8 bytes, ie u64.
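// e.g. 200 bytes = 12 full 16-byte vectors plus one trailing 8-byte (u64) word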
|
||||
if ( databitlen % 128 !=0 )
|
||||
{
|
||||
@@ -168,7 +167,7 @@ HashReturn_gr update_and_final_groestl256( hashState_groestl256* ctx,
|
||||
if ( i == len - 1 )
|
||||
{
|
||||
// all padding at once
|
||||
ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
|
||||
ctx->buffer[i] = v128_set8( blocks,blocks>>8,0,0, 0,0,0,0,
|
||||
0, 0,0,0, 0,0,0,0x80 );
|
||||
}
|
||||
else
|
||||
@@ -183,15 +182,15 @@ HashReturn_gr update_and_final_groestl256( hashState_groestl256* ctx,
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
|
||||
ctx->buffer[i] = v128_set8( 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0x80 );
|
||||
}
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE256 - 1; i++ )
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = v128_zero;
|
||||
// add length padding
|
||||
// cheat since we know the block count is trivial, good if block < 256
|
||||
ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
|
||||
ctx->buffer[i] = v128_set8( blocks, blocks>>8,0,0, 0,0,0,0,
|
||||
0, 0,0,0, 0,0,0,0 );
|
||||
}
|
||||
|
||||
@@ -201,30 +200,30 @@ HashReturn_gr update_and_final_groestl256( hashState_groestl256* ctx,
|
||||
|
||||
// store hash result in output
|
||||
for ( i = 0; i < hashlen_m128i; i++ )
|
||||
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
casti_v128( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
|
||||
return SUCCESS_GR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int groestl256_full( hashState_groestl256* ctx,
|
||||
void* output, const void* input, DataLength_gr databitlen )
|
||||
void* output, const void* input, int databitlen )
|
||||
{
|
||||
int i;
|
||||
ctx->hashlen = 32;
|
||||
for ( i = 0; i < SIZE256; i++ )
|
||||
{
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
ctx->chaining[i] = v128_zero;
|
||||
ctx->buffer[i] = v128_zero;
|
||||
}
|
||||
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
|
||||
INIT256( ctx->chaining );
|
||||
ctx->buf_ptr = 0;
|
||||
|
||||
const int len = (int)databitlen / 128;
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to v128_t
|
||||
const int hash_offset = SIZE256 - hashlen_m128i;
|
||||
int blocks = len / SIZE256;
|
||||
__m128i* in = (__m128i*)input;
|
||||
v128_t* in = (v128_t*)input;
|
||||
|
||||
// --- update ---
|
||||
|
||||
@@ -233,7 +232,7 @@ int groestl256_full( hashState_groestl256* ctx,
|
||||
TF512( ctx->chaining, &in[ i * SIZE256 ] );
|
||||
ctx->buf_ptr = blocks * SIZE256;
|
||||
|
||||
// cryptonight has 200 byte input, an odd number of __m128i
|
||||
// cryptonight has 200 byte input, an odd number of v128_t
|
||||
// remainder is only 8 bytes, ie u64.
|
||||
if ( databitlen % 128 != 0 )
|
||||
{
|
||||
@@ -257,7 +256,7 @@ int groestl256_full( hashState_groestl256* ctx,
|
||||
if ( i == len - 1 )
|
||||
{
|
||||
// all padding at once
|
||||
ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
|
||||
ctx->buffer[i] = v128_set8( blocks,blocks>>8,0,0, 0,0,0,0,
|
||||
0, 0,0,0, 0,0,0,0x80 );
|
||||
}
|
||||
else
|
||||
@@ -272,15 +271,15 @@ int groestl256_full( hashState_groestl256* ctx,
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
|
||||
ctx->buffer[i] = v128_set8( 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0x80 );
|
||||
}
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE256 - 1; i++ )
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = v128_zero;
|
||||
// add length padding
|
||||
// cheat since we know the block count is trivial; good if blocks < 256
|
||||
ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
|
||||
ctx->buffer[i] = v128_set8( blocks,blocks>>8,0,0, 0,0,0,0,
|
||||
0, 0,0,0, 0,0,0,0 );
|
||||
}
|
||||
|
||||
@@ -290,18 +289,17 @@ int groestl256_full( hashState_groestl256* ctx,
|
||||
|
||||
// store hash result in output
|
||||
for ( i = 0; i < hashlen_m128i; i++ )
|
||||
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
casti_v128( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
|
||||
return SUCCESS_GR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/* hash bit sequence */
|
||||
HashReturn_gr hash_groestl256(int hashbitlen,
|
||||
const BitSequence_gr* data,
|
||||
DataLength_gr databitlen,
|
||||
BitSequence_gr* hashval) {
|
||||
HashReturn_gr ret;
|
||||
int hash_groestl256(int hashbitlen, const void* data, int databitlen,
|
||||
uint8_t* hashval)
|
||||
{
|
||||
int ret;
|
||||
hashState_groestl256 context;
|
||||
|
||||
/* initialise */
|
||||
@@ -327,4 +325,4 @@ HashReturn_gr hash_groestl256(int hashbitlen,
|
||||
//}
|
||||
//#endif
|
||||
|
||||
#endif
|
||||
#endif // SSSE3 or NEON
|
||||
|
||||
@@ -100,22 +100,20 @@ typedef struct {
|
||||
int databitlen;
|
||||
} hashState_groestl256;
|
||||
|
||||
HashReturn_gr init_groestl256( hashState_groestl256*, int );
|
||||
int init_groestl256( hashState_groestl256*, int );
|
||||
|
||||
HashReturn_gr reinit_groestl256( hashState_groestl256* );
|
||||
int reinit_groestl256( hashState_groestl256* );
|
||||
|
||||
HashReturn_gr update_groestl256( hashState_groestl256*, const void*,
|
||||
DataLength_gr );
|
||||
int update_groestl256( hashState_groestl256*, const void*, int );
|
||||
|
||||
HashReturn_gr final_groestl256( hashState_groestl256*, void* );
|
||||
int final_groestl256( hashState_groestl256*, void* );
|
||||
|
||||
HashReturn_gr hash_groestli256( int, const BitSequence_gr*, DataLength_gr,
|
||||
BitSequence_gr* );
|
||||
int hash_groestl256( int, const void*, int, uint8_t* );
|
||||
|
||||
HashReturn_gr update_and_final_groestl256( hashState_groestl256*, void*,
|
||||
const void*, DataLength_gr );
|
||||
int update_and_final_groestl256( hashState_groestl256*, void*,
|
||||
const void*, int );
|
||||
|
||||
int groestl256_full( hashState_groestl256* ctx,
|
||||
void* output, const void* input, DataLength_gr databitlen );
|
||||
void* output, const void* input, int databitlen );
|
||||
|
||||
#endif /* __hash_h */
|
||||
|
||||
@@ -11,8 +11,6 @@
|
||||
|
||||
#if defined(__AVX2__) && defined(__VAES__)
|
||||
|
||||
#define LENGTH (512)
|
||||
|
||||
/* some sizes (number of bytes) */
|
||||
#define ROWS (8)
|
||||
#define LENGTHFIELDLEN (ROWS)
|
||||
|
||||
@@ -32,13 +32,8 @@
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#include "hamsi-hash-4way.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
//#include "hamsi-helper-4way.c"
|
||||
#include "hamsi-hash-4way.h"
|
||||
|
||||
static const uint32_t HAMSI_IV512[] =
|
||||
{
|
||||
@@ -1120,6 +1115,8 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
// Hamsi 4 way AVX2
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
@@ -1896,15 +1893,15 @@ void hamsi512_4way_init( hamsi_4way_big_context *sc )
|
||||
{
|
||||
sc->partial_len = 0;
|
||||
sc->count_high = sc->count_low = 0;
|
||||
|
||||
sc->h[0] = v256_64( 0x6c70617273746565 );
|
||||
sc->h[1] = v256_64( 0x656e62656b204172 );
|
||||
sc->h[2] = v256_64( 0x302c206272672031 );
|
||||
sc->h[3] = v256_64( 0x3434362c75732032 );
|
||||
sc->h[4] = v256_64( 0x3030312020422d33 );
|
||||
sc->h[5] = v256_64( 0x656e2d484c657576 );
|
||||
sc->h[6] = v256_64( 0x6c65652c65766572 );
|
||||
sc->h[7] = v256_64( 0x6769756d2042656c );
|
||||
uint64_t *iv = (uint64_t*)HAMSI_IV512;
|
||||
sc->h[0] = v256_64( iv[0] );
|
||||
sc->h[1] = v256_64( iv[1] );
|
||||
sc->h[2] = v256_64( iv[2] );
|
||||
sc->h[3] = v256_64( iv[3] );
|
||||
sc->h[4] = v256_64( iv[4] );
|
||||
sc->h[5] = v256_64( iv[5] );
|
||||
sc->h[6] = v256_64( iv[6] );
|
||||
sc->h[7] = v256_64( iv[7] );
|
||||
}
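
The init above now broadcasts 64-bit words read straight out of the 32-bit HAMSI_IV512 table instead of hard-coded literals. A minimal sketch of that reinterpretation, assuming a little-endian host and that the refactor is behaviour-preserving; the helper name is hypothetical.

#include <stdint.h>
#include <string.h>

/* Hypothetical helper: fetch IV words 2*i and 2*i+1 as one 64-bit value,
   the same aliasing the (uint64_t*) cast in the init performs; memcpy
   sidesteps strict-aliasing concerns. */
static uint64_t hamsi_iv64_sketch( const uint32_t *iv32, int i )
{
    uint64_t v;
    memcpy( &v, iv32 + 2*i, sizeof v );
    return v;
}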
|
||||
|
||||
void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data,
|
||||
@@ -1935,3 +1932,332 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__SSE4_2__) || defined(__ARM_NEON)
|
||||
|
||||
#define DECL_STATE_2x64 \
|
||||
v128_t c0, c1, c2, c3, c4, c5, c6, c7; \
|
||||
|
||||
#define READ_STATE_2x64(sc) \
|
||||
c0 = sc->h[0]; \
|
||||
c1 = sc->h[1]; \
|
||||
c2 = sc->h[2]; \
|
||||
c3 = sc->h[3]; \
|
||||
c4 = sc->h[4]; \
|
||||
c5 = sc->h[5]; \
|
||||
c6 = sc->h[6]; \
|
||||
c7 = sc->h[7];
|
||||
|
||||
#define WRITE_STATE_2x64(sc) \
|
||||
sc->h[0] = c0; \
|
||||
sc->h[1] = c1; \
|
||||
sc->h[2] = c2; \
|
||||
sc->h[3] = c3; \
|
||||
sc->h[4] = c4; \
|
||||
sc->h[5] = c5; \
|
||||
sc->h[6] = c6; \
|
||||
sc->h[7] = c7;
|
||||
|
||||
#define INPUT_2x64 \
|
||||
{ \
|
||||
v128_t db = *buf; \
|
||||
const v128_t zero = v128_zero; \
|
||||
const uint64_t *tp = (const uint64_t*)T512; \
|
||||
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
|
||||
for ( int i = 63; i >= 0; i-- ) \
|
||||
{ \
|
||||
v128_t dm = v128_cmpgt64( zero, v128_sl64( db, i ) ); \
|
||||
m0 = v128_xor( m0, v128_and( dm, v128_64( tp[0] ) ) ); \
|
||||
m1 = v128_xor( m1, v128_and( dm, v128_64( tp[1] ) ) ); \
|
||||
m2 = v128_xor( m2, v128_and( dm, v128_64( tp[2] ) ) ); \
|
||||
m3 = v128_xor( m3, v128_and( dm, v128_64( tp[3] ) ) ); \
|
||||
m4 = v128_xor( m4, v128_and( dm, v128_64( tp[4] ) ) ); \
|
||||
m5 = v128_xor( m5, v128_and( dm, v128_64( tp[5] ) ) ); \
|
||||
m6 = v128_xor( m6, v128_and( dm, v128_64( tp[6] ) ) ); \
|
||||
m7 = v128_xor( m7, v128_and( dm, v128_64( tp[7] ) ) ); \
|
||||
tp += 8; \
|
||||
} \
|
||||
}
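
The loop above expands the 64 message bits of each lane by conditionally XOR-ing rows of T512, using a sign-based mask: shifting bit (63-i) into the sign position and comparing against zero yields an all-ones lane exactly when that bit is set. A scalar sketch of the same idea (illustrative only, helper name assumed):

#include <stdint.h>

static void hamsi_input_sketch( uint64_t m[8], uint64_t db,
                                const uint64_t *tp /* 64 rows x 8 words */ )
{
    for ( int k = 0; k < 8; k++ ) m[k] = 0;
    for ( int i = 63; i >= 0; i-- )
    {
        uint64_t mask = (uint64_t)0 - ( (db << i) >> 63 );  /* bit 63-i */
        for ( int k = 0; k < 8; k++ )
            m[k] ^= mask & tp[k];
        tp += 8;
    }
}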
|
||||
|
||||
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
|
||||
#define SBOX_2x64( a, b, c, d ) \
|
||||
{ \
|
||||
v128_t tb, td; \
|
||||
td = v128_xorand( d, a, c ); \
|
||||
tb = v128_xoror( b, d, a ); \
|
||||
c = v128_xor3( c, td, b ); \
|
||||
a = v128_xor( a, c ); \
|
||||
b = v128_xoror( td, tb, a ); \
|
||||
td = v128_xorand( a, td, tb ); \
|
||||
a = c; \
|
||||
c = v128_xor3( tb, b, td ); \
|
||||
d = v128_not( td ); \
|
||||
}
|
||||
|
||||
#define L_2x64( a, b, c, d ) \
|
||||
{ \
|
||||
a = v128_rol32( a, 13 ); \
|
||||
c = v128_rol32( c, 3 ); \
|
||||
b = v128_xor3( a, b, c ); \
|
||||
d = v128_xor3( d, c, v128_sl32( a, 3 ) ); \
|
||||
b = v128_rol32( b, 1 ); \
|
||||
d = v128_rol32( d, 7 ); \
|
||||
a = v128_xor3( a, b, d ); \
|
||||
c = v128_xor3( c, d, v128_sl32( b, 7 ) ); \
|
||||
a = v128_rol32( a, 5 ); \
|
||||
c = v128_rol32( c, 22 ); \
|
||||
}
|
||||
|
||||
#define ROUND_2x64( alpha ) \
|
||||
{ \
|
||||
v128_t t0, t1, t2, t3, t4, t5; \
|
||||
const v128_t mask = v128_64( 0x00000000ffffffff ); \
|
||||
s0 = v128_xor( s0, alpha[ 0] ); \
|
||||
s1 = v128_xor( s1, alpha[ 1] ); \
|
||||
s2 = v128_xor( s2, alpha[ 2] ); \
|
||||
s3 = v128_xor( s3, alpha[ 3] ); \
|
||||
s4 = v128_xor( s4, alpha[ 4] ); \
|
||||
s5 = v128_xor( s5, alpha[ 5] ); \
|
||||
s6 = v128_xor( s6, alpha[ 6] ); \
|
||||
s7 = v128_xor( s7, alpha[ 7] ); \
|
||||
s8 = v128_xor( s8, alpha[ 8] ); \
|
||||
s9 = v128_xor( s9, alpha[ 9] ); \
|
||||
sA = v128_xor( sA, alpha[10] ); \
|
||||
sB = v128_xor( sB, alpha[11] ); \
|
||||
sC = v128_xor( sC, alpha[12] ); \
|
||||
sD = v128_xor( sD, alpha[13] ); \
|
||||
sE = v128_xor( sE, alpha[14] ); \
|
||||
sF = v128_xor( sF, alpha[15] ); \
|
||||
\
|
||||
SBOX_2x64( s0, s4, s8, sC ); \
|
||||
SBOX_2x64( s1, s5, s9, sD ); \
|
||||
SBOX_2x64( s2, s6, sA, sE ); \
|
||||
SBOX_2x64( s3, s7, sB, sF ); \
|
||||
\
|
||||
s4 = v128_swap64_32( s4 ); \
|
||||
s5 = v128_swap64_32( s5 ); \
|
||||
sD = v128_swap64_32( sD ); \
|
||||
sE = v128_swap64_32( sE ); \
|
||||
t0 = v128_blendv( s5, s4, mask ); \
|
||||
t1 = v128_blendv( sE, sD, mask ); \
|
||||
L_2x64( s0, t0, s9, t1 ); \
|
||||
\
|
||||
s6 = v128_swap64_32( s6 ); \
|
||||
sF = v128_swap64_32( sF ); \
|
||||
t2 = v128_blendv( s6, s5, mask ); \
|
||||
t3 = v128_blendv( sF, sE, mask ); \
|
||||
L_2x64( s1, t2, sA, t3 ); \
|
||||
s5 = v128_blendv( t0, t2, mask ); \
|
||||
sE = v128_blendv( t1, t3, mask ); \
|
||||
\
|
||||
s7 = v128_swap64_32( s7 ); \
|
||||
sC = v128_swap64_32( sC ); \
|
||||
t4 = v128_blendv( s7, s6, mask ); \
|
||||
t5 = v128_blendv( sC, sF, mask ); \
|
||||
L_2x64( s2, t4, sB, t5 ); \
|
||||
s6 = v128_blendv( t2, t4, mask ); \
|
||||
sF = v128_blendv( t3, t5, mask ); \
|
||||
s6 = v128_swap64_32( s6 ); \
|
||||
sF = v128_swap64_32( sF ); \
|
||||
\
|
||||
t2 = v128_blendv( s4, s7, mask ); \
|
||||
t3 = v128_blendv( sD, sC, mask ); \
|
||||
L_2x64( s3, t2, s8, t3 ); \
|
||||
s7 = v128_blendv( t4, t2, mask ); \
|
||||
s4 = v128_blendv( t2, t0, mask ); \
|
||||
sC = v128_blendv( t5, t3, mask ); \
|
||||
sD = v128_blendv( t3, t1, mask ); \
|
||||
s7 = v128_swap64_32( s7 ); \
|
||||
sC = v128_swap64_32( sC ); \
|
||||
\
|
||||
t0 = v128_blendv( v128_swap64_32( s8 ), s0, mask ); \
|
||||
t1 = v128_blendv( s9, s1, mask ); \
|
||||
t2 = v128_blendv( sA, v128_swap64_32( s2 ), mask ); \
|
||||
t3 = v128_blendv( s3, sB, mask ); \
|
||||
t3 = v128_swap64_32( t3 ); \
|
||||
L_2x64( t0, t1, t2, t3 ); \
|
||||
t3 = v128_swap64_32( t3 ); \
|
||||
s0 = v128_blendv( s0, t0, mask ); \
|
||||
s8 = v128_blendv( s8, v128_swap64_32( t0 ), mask ); \
|
||||
s1 = v128_blendv( s1, t1, mask ); \
|
||||
s9 = v128_blendv( t1, s9, mask ); \
|
||||
s2 = v128_blendv( v128_swap64_32( t2 ), s2, mask ); \
|
||||
sA = v128_blendv( t2, sA, mask ); \
|
||||
s3 = v128_blendv( t3, s3, mask ); \
|
||||
sB = v128_blendv( sB, t3, mask ); \
|
||||
\
|
||||
t0 = v128_blendv( sC, s4, mask ); \
|
||||
t1 = v128_blendv( sD, s5, mask ); \
|
||||
t2 = v128_blendv( sE, s6, mask ); \
|
||||
t3 = v128_blendv( sF, s7, mask ); \
|
||||
L_2x64( t0, t1, t2, t3 ); \
|
||||
s4 = v128_blendv( s4, t0, mask ); \
|
||||
sC = v128_blendv( t0, sC, mask ); \
|
||||
s5 = v128_blendv( s5, t1, mask ); \
|
||||
sD = v128_blendv( t1, sD, mask ); \
|
||||
s6 = v128_blendv( s6, t2, mask ); \
|
||||
sE = v128_blendv( t2, sE, mask ); \
|
||||
s7 = v128_blendv( s7, t3, mask ); \
|
||||
sF = v128_blendv( t3, sF, mask ); \
|
||||
s4 = v128_swap64_32( s4 ); \
|
||||
s5 = v128_swap64_32( s5 ); \
|
||||
sD = v128_swap64_32( sD ); \
|
||||
sE = v128_swap64_32( sE ); \
|
||||
}
|
||||
|
||||
#define P_2x64 \
|
||||
{ \
|
||||
v128_t alpha[16]; \
|
||||
const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \
|
||||
for( int i = 0; i < 16; i++ ) \
|
||||
alpha[i] = v128_64( ( (uint64_t*)alpha_n )[i] ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( (1ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( (2ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( (3ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( (4ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( (5ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
}
|
||||
|
||||
#define PF_2x64 \
|
||||
{ \
|
||||
v128_t alpha[16]; \
|
||||
const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \
|
||||
for( int i = 0; i < 16; i++ ) \
|
||||
alpha[i] = v128_64( ( (uint64_t*)alpha_f )[i] ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( ( 1ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( ( 2ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( ( 3ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( ( 4ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( ( 5ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( ( 6ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( ( 7ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( ( 8ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( ( 9ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( (10ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( (11ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
}
|
||||
|
||||
#define T_2x64 \
|
||||
{ /* order is important */ \
|
||||
c7 = sc->h[ 7 ] = v128_xor( sc->h[ 7 ], sB ); \
|
||||
c6 = sc->h[ 6 ] = v128_xor( sc->h[ 6 ], sA ); \
|
||||
c5 = sc->h[ 5 ] = v128_xor( sc->h[ 5 ], s9 ); \
|
||||
c4 = sc->h[ 4 ] = v128_xor( sc->h[ 4 ], s8 ); \
|
||||
c3 = sc->h[ 3 ] = v128_xor( sc->h[ 3 ], s3 ); \
|
||||
c2 = sc->h[ 2 ] = v128_xor( sc->h[ 2 ], s2 ); \
|
||||
c1 = sc->h[ 1 ] = v128_xor( sc->h[ 1 ], s1 ); \
|
||||
c0 = sc->h[ 0 ] = v128_xor( sc->h[ 0 ], s0 ); \
|
||||
}
|
||||
|
||||
void hamsi64_big( hamsi_2x64_context *sc, v128_t *buf, size_t num )
|
||||
{
|
||||
DECL_STATE_2x64;
|
||||
uint32_t tmp;
|
||||
|
||||
tmp = (uint32_t)num << 6;
|
||||
sc->count_low = sc->count_low + tmp;
|
||||
sc->count_high += (uint32_t)( (num >> 13) >> 13 );
|
||||
if ( sc->count_low < tmp )
|
||||
sc->count_high++;
|
||||
|
||||
READ_STATE_2x64( sc );
|
||||
while ( num-- > 0 )
|
||||
{
|
||||
v128_t m0, m1, m2, m3, m4, m5, m6, m7;
|
||||
|
||||
INPUT_2x64;
|
||||
P_2x64;
|
||||
T_2x64;
|
||||
buf++;
|
||||
}
|
||||
WRITE_STATE_2x64( sc );
|
||||
}
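
hamsi64_big above advances a 64-bit message bit counter split across two 32-bit words: each 8-byte block adds 64 bits, and (num >> 13) >> 13 is simply num >> 26, the part of num*64 that lands in the high word. A small sketch of that bookkeeping (assumption: num counts 8-byte blocks; helper name is illustrative):

#include <stdint.h>
#include <stddef.h>

static void hamsi_count_sketch( uint32_t *hi, uint32_t *lo, size_t num )
{
    uint32_t t = (uint32_t)num << 6;        /* low 32 bits of num*64     */
    *hi += (uint32_t)( (num >> 13) >> 13 ); /* high 32 bits of num*64    */
    *lo += t;
    if ( *lo < t )                          /* carry out of the low word */
        (*hi)++;
}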
|
||||
|
||||
void hamsi64_big_final( hamsi_2x64_context *sc, v128_t *buf )
|
||||
{
|
||||
v128_t m0, m1, m2, m3, m4, m5, m6, m7;
|
||||
DECL_STATE_2x64;
|
||||
READ_STATE_2x64( sc );
|
||||
INPUT_2x64;
|
||||
PF_2x64;
|
||||
T_2x64;
|
||||
WRITE_STATE_2x64( sc );
|
||||
}
|
||||
|
||||
void hamsi512_2x64_init( hamsi_2x64_context *sc )
|
||||
{
|
||||
sc->partial_len = 0;
|
||||
sc->count_high = sc->count_low = 0;
|
||||
uint64_t * iv = (uint64_t*)HAMSI_IV512;
|
||||
sc->h[0] = v128_64( iv[0] );
|
||||
sc->h[1] = v128_64( iv[1] );
|
||||
sc->h[2] = v128_64( iv[2] );
|
||||
sc->h[3] = v128_64( iv[3] );
|
||||
sc->h[4] = v128_64( iv[4] );
|
||||
sc->h[5] = v128_64( iv[5] );
|
||||
sc->h[6] = v128_64( iv[6] );
|
||||
sc->h[7] = v128_64( iv[7] );
|
||||
}
|
||||
|
||||
void hamsi512_2x64_update( hamsi_2x64_context *sc, const void *data,
|
||||
size_t len )
|
||||
{
|
||||
v128_t *vdata = (v128_t*)data;
|
||||
|
||||
hamsi64_big( sc, vdata, len>>3 );
|
||||
vdata += ( (len& ~(size_t)7) >> 3 );
|
||||
len &= (size_t)7;
|
||||
v128_memcpy( sc->buf, vdata, len>>3 );
|
||||
sc->partial_len = len;
|
||||
}
|
||||
|
||||
void hamsi512_2x64_close( hamsi_2x64_context *sc, void *dst )
|
||||
{
|
||||
v128_t pad[1];
|
||||
uint32_t ch, cl;
|
||||
|
||||
ch = bswap_32( sc->count_high );
|
||||
cl = bswap_32( sc->count_low + ( sc->partial_len << 3 ) );
|
||||
pad[0] = v128_64( ((uint64_t)cl << 32 ) | (uint64_t)ch );
|
||||
sc->buf[0] = v128_64( 0x80 );
|
||||
hamsi64_big( sc, sc->buf, 1 );
|
||||
hamsi64_big_final( sc, pad );
|
||||
|
||||
v128_block_bswap32( (v128_t*)dst, sc->h );
|
||||
}
|
||||
|
||||
void hamsi512_2x64_ctx( hamsi512_2x64_context *sc, void *dst, const void *data,
|
||||
size_t len )
|
||||
{
|
||||
hamsi512_2x64_init( sc );
|
||||
hamsi512_2x64_update( sc, data, len );
|
||||
hamsi512_2x64_close( sc, dst );
|
||||
}
|
||||
|
||||
void hamsi512_2x64( void *dst, const void *data, size_t len )
|
||||
{
|
||||
hamsi512_2x64_context sc;
|
||||
hamsi512_2x64_init( &sc );
|
||||
hamsi512_2x64_update( &sc, data, len );
|
||||
hamsi512_2x64_close( &sc, dst );
|
||||
}
|
||||
|
||||
#endif // SSE4.1 or NEON
|
||||
|
||||
@@ -36,11 +36,29 @@
|
||||
#define HAMSI_4WAY_H__
|
||||
|
||||
#include <stddef.h>
|
||||
#include "simd-utils.h"
|
||||
|
||||
// SSE2 or NEON Hamsi-512 2x64
|
||||
|
||||
typedef struct
|
||||
{
|
||||
v128_t h[8];
|
||||
v128_t buf[1];
|
||||
size_t partial_len;
|
||||
uint32_t count_high, count_low;
|
||||
} hamsi_2x64_context;
|
||||
typedef hamsi_2x64_context hamsi512_2x64_context;
|
||||
|
||||
void hamsi512_2x64_init( hamsi512_2x64_context *sc );
|
||||
void hamsi512_2x64_update( hamsi512_2x64_context *sc, const void *data,
|
||||
size_t len );
|
||||
void hamsi512_2x64_close( hamsi512_2x64_context *sc, void *dst );
|
||||
void hamsi512_2x64_ctx( hamsi512_2x64_context *sc, void *dst, const void *data,
|
||||
size_t len );
|
||||
void hamsi512_2x64( void *dst, const void *data, size_t len );
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
#include "simd-utils.h"
|
||||
|
||||
// Hamsi-512 4x64
|
||||
|
||||
// Partial is only scalar but needs pointer ref for hamsi-helper
|
||||
@@ -88,7 +106,8 @@ void hamsi512_8x32_full( hamsi512_8x32_context *sc, void *dst, const void *data,
|
||||
|
||||
// Hamsi-512 8x64
|
||||
|
||||
typedef struct {
|
||||
typedef struct
|
||||
{
|
||||
__m512i h[8];
|
||||
__m512i buf[1];
|
||||
size_t partial_len;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#include <memory.h>
|
||||
#include <mm_malloc.h>
|
||||
//#include <mm_malloc.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "hodl-gate.h"
|
||||
@@ -176,7 +176,7 @@ bool register_hodl_algo( algo_gate_t* gate )
|
||||
gate->resync_threads = (void*)&hodl_resync_threads;
|
||||
gate->do_this_thread = (void*)&hodl_do_this_thread;
|
||||
gate->work_cmp_size = 76;
|
||||
hodl_scratchbuf = (unsigned char*)_mm_malloc( 1 << 30, 64 );
|
||||
hodl_scratchbuf = (unsigned char*)mm_malloc( 1 << 30, 64 );
|
||||
allow_getwork = false;
|
||||
opt_target_factor = 8388608.0;
|
||||
return ( hodl_scratchbuf != NULL );
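
The patch swaps the x86-specific _mm_malloc for a project wrapper named mm_malloc (mm_malloc.h is also commented out above), presumably so this allocation path also builds on ARM. A hypothetical sketch of such a wrapper; the real implementation in the tree may differ.

#include <stdlib.h>

/* Hypothetical portable aligned allocation; an assumption about what an
   mm_malloc/mm_free pair could look like, not taken from the project.
   align must be a power of two and a multiple of sizeof(void*). */
static inline void *mm_malloc_sketch( size_t size, size_t align )
{
    void *p = NULL;
    return posix_memalign( &p, align, size ) == 0 ? p : NULL;
}

static inline void mm_free_sketch( void *p ) { free( p ); }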
|
||||
|
||||
@@ -36,11 +36,6 @@
|
||||
#ifndef JH_HASH_4WAY_H__
|
||||
#define JH_HASH_4WAY_H__
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "simd-utils.h"
|
||||
@@ -60,61 +55,96 @@ extern "C"{
|
||||
* <code>memcpy()</code>).
|
||||
*/
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
typedef struct {
|
||||
typedef struct
|
||||
{
|
||||
__m512i buf[8];
|
||||
__m512i H[16];
|
||||
size_t ptr;
|
||||
uint64_t block_count;
|
||||
} jh_8way_context __attribute__ ((aligned (128)));
|
||||
} jh_8x64_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef jh_8way_context jh256_8way_context;
|
||||
typedef jh_8x64_context jh256_8x64_context;
|
||||
typedef jh_8x64_context jh512_8x64_context;
|
||||
#define jh256_8way_context jh256_8x64_context
|
||||
#define jh512_8way_context jh512_8x64_context
|
||||
|
||||
typedef jh_8way_context jh512_8way_context;
|
||||
void jh256_8x64_init( jh_8x64_context *sc);
|
||||
void jh256_8x64_update(void *cc, const void *data, size_t len);
|
||||
void jh256_8x64_close(void *cc, void *dst);
|
||||
void jh256_8x64_ctx( jh_8x64_context *cc, void *dst, const void *data, size_t len );
|
||||
|
||||
void jh256_8way_init( jh_8way_context *sc);
|
||||
void jh512_8x64_init( jh_8x64_context *sc );
|
||||
void jh512_8x64_update(void *cc, const void *data, size_t len);
|
||||
void jh512_8x64_close(void *cc, void *dst);
|
||||
void jh512_8x64_ctx( jh_8x64_context *cc, void *dst, const void *data, size_t len );
|
||||
|
||||
void jh256_8way_update(void *cc, const void *data, size_t len);
|
||||
#define jh256_8way_init jh256_8x64_init
|
||||
#define jh256_8way_update jh256_8x64_update
|
||||
#define jh256_8way_close jh256_8x64_close
|
||||
|
||||
void jh256_8way_close(void *cc, void *dst);
|
||||
|
||||
void jh512_8way_init( jh_8way_context *sc );
|
||||
|
||||
void jh512_8way_update(void *cc, const void *data, size_t len);
|
||||
|
||||
void jh512_8way_close(void *cc, void *dst);
|
||||
#define jh512_8way_init jh512_8x64_init
|
||||
#define jh512_8way_update jh512_8x64_update
|
||||
#define jh512_8way_close jh512_8x64_close
|
||||
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
#if defined(__AVX2__)
|
||||
|
||||
typedef struct
|
||||
{
|
||||
__m256i buf[8];
|
||||
__m256i H[16];
|
||||
size_t ptr;
|
||||
uint64_t block_count;
|
||||
} jh_4way_context __attribute__ ((aligned (128)));
|
||||
} jh_4x64_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef jh_4way_context jh256_4way_context;
|
||||
typedef jh_4x64_context jh256_4x64_context;
|
||||
typedef jh_4x64_context jh512_4x64_context;
|
||||
#define jh256_4way_context jh256_4x64_context
|
||||
#define jh512_4way_context jh512_4x64_context
|
||||
|
||||
typedef jh_4way_context jh512_4way_context;
|
||||
void jh256_4x64_init( jh_4x64_context *sc );
|
||||
void jh256_4x64_update( void *cc, const void *data, size_t len );
|
||||
void jh256_4x64_close( void *cc, void *dst );
|
||||
void jh256_4x64_ctx( jh_4x64_context *cc, void *dst, const void *data,
|
||||
size_t len );
|
||||
|
||||
void jh256_4way_init( jh_4way_context *sc);
|
||||
void jh512_4x64_init( jh_4x64_context *sc );
|
||||
void jh512_4x64_update( void *cc, const void *data, size_t len );
|
||||
void jh512_4x64_close( void *cc, void *dst );
|
||||
void jh512_4x64_ctx( jh_4x64_context *cc, void *dst, const void *data, size_t len );
|
||||
|
||||
void jh256_4way_update(void *cc, const void *data, size_t len);
|
||||
#define jh256_4way_init jh256_4x64_init
|
||||
#define jh256_4way_update jh256_4x64_update
|
||||
#define jh256_4way_close jh256_4x64_close
|
||||
|
||||
void jh256_4way_close(void *cc, void *dst);
|
||||
|
||||
void jh512_4way_init( jh_4way_context *sc );
|
||||
|
||||
void jh512_4way_update(void *cc, const void *data, size_t len);
|
||||
|
||||
void jh512_4way_close(void *cc, void *dst);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#define jh512_4way_init jh512_4x64_init
|
||||
#define jh512_4way_update jh512_4x64_update
|
||||
#define jh512_4way_close jh512_4x64_close
|
||||
|
||||
#endif // AVX2
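
A quick usage sketch of the renaming above: the old 4way identifiers survive as macro aliases over the width-explicit 4x64 names, so existing call sites compile unchanged. The header path and the 80-bytes-per-lane length are assumptions made for illustration only.

#include "jh-hash-4way.h"

void jh512_4way_alias_example( void *digest, const void *interleaved_data )
{
    jh512_4way_context ctx;               /* alias of jh512_4x64_context */
    jh512_4way_init( &ctx );              /* expands to jh512_4x64_init  */
    jh512_4way_update( &ctx, interleaved_data, 80 );  /* 4x64 interleaved input */
    jh512_4way_close( &ctx, digest );
}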
|
||||
|
||||
typedef struct
|
||||
{
|
||||
v128u64_t buf[8];
|
||||
v128u64_t H[16];
|
||||
size_t ptr;
|
||||
uint64_t block_count;
|
||||
} jh_2x64_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef jh_2x64_context jh256_2x64_context;
|
||||
typedef jh_2x64_context jh512_2x64_context;
|
||||
|
||||
void jh256_2x64_init( jh256_2x64_context *cc );
|
||||
void jh256_2x64_update( jh256_2x64_context *cc, const void *data, size_t len );
|
||||
void jh256_2x64_close( jh256_2x64_context *cc, void *dst );
|
||||
void jh256_2x64_ctx( jh256_2x64_context *cc, void *dst, const void *data, size_t len );
|
||||
|
||||
void jh512_2x64_init( jh512_2x64_context *cc );
|
||||
void jh512_2x64_update( jh256_2x64_context *cc, const void *data, size_t len );
|
||||
void jh512_2x64_close( jh256_2x64_context *cc, void *dst );
|
||||
void jh512_2x64_ctx( jh256_2x64_context *cc, void *dst, const void *data, size_t len );
|
||||
|
||||
#endif
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
#include "keccak-hash-4way.h"
|
||||
#include "keccak-gate.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
//#if defined(__AVX2__)
|
||||
|
||||
static const uint64_t RC[] = {
|
||||
0x0000000000000001, 0x0000000000008082,
|
||||
@@ -48,10 +48,6 @@ static const uint64_t RC[] = {
|
||||
#define a34 (kc->w[23])
|
||||
#define a44 (kc->w[24])
|
||||
|
||||
#define DECL_STATE
|
||||
#define READ_STATE(sc)
|
||||
#define WRITE_STATE(sc)
|
||||
|
||||
#define MOV64(d, s) (d = s)
|
||||
#define XOR64_IOTA XOR
|
||||
|
||||
@@ -131,7 +127,6 @@ keccak64_8way_core( keccak64_ctx_m512i *kc, const void *data, size_t len,
|
||||
__m512i *buf;
|
||||
__m512i *vdata = (__m512i*)data;
|
||||
size_t ptr;
|
||||
DECL_STATE
|
||||
|
||||
buf = kc->buf;
|
||||
ptr = kc->ptr;
|
||||
@@ -142,7 +137,6 @@ keccak64_8way_core( keccak64_ctx_m512i *kc, const void *data, size_t len,
|
||||
kc->ptr = ptr + len;
|
||||
return;
|
||||
}
|
||||
READ_STATE( kc );
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
@@ -161,7 +155,6 @@ keccak64_8way_core( keccak64_ctx_m512i *kc, const void *data, size_t len,
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
WRITE_STATE( kc );
|
||||
kc->ptr = ptr;
|
||||
}
|
||||
|
||||
@@ -218,6 +211,13 @@ keccak256_8x64_close(void *cc, void *dst)
|
||||
keccak64_8way_close(cc, dst, 32, 136);
|
||||
}
|
||||
|
||||
void keccak256_8x64_ctx( void *cc, void *dst, const void *data, size_t len )
|
||||
{
|
||||
keccak256_8x64_init( cc );
|
||||
keccak256_8x64_update( cc, data, len );
|
||||
keccak256_8x64_close( cc, dst );
|
||||
}
|
||||
|
||||
void keccak512_8x64_init( void *kc )
|
||||
{
|
||||
keccak64_8way_init( kc, 512 );
|
||||
@@ -235,6 +235,13 @@ keccak512_8x64_close(void *cc, void *dst)
|
||||
keccak64_8way_close(cc, dst, 64, 72);
|
||||
}
|
||||
|
||||
void keccak512_8x64_ctx( void *cc, void *dst, const void *data, size_t len )
|
||||
{
|
||||
keccak512_8x64_init( cc );
|
||||
keccak512_8x64_update( cc, data, len );
|
||||
keccak512_8x64_close( cc, dst );
|
||||
}
|
||||
|
||||
#undef INPUT_BUF
|
||||
#undef DECL64
|
||||
#undef XOR64
|
||||
@@ -247,9 +254,10 @@ keccak512_8x64_close(void *cc, void *dst)
|
||||
#undef XOROR
|
||||
#undef XORAND
|
||||
#undef XOR3
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
// AVX2
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#define INPUT_BUF(size) do { \
|
||||
size_t j; \
|
||||
@@ -318,7 +326,6 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
|
||||
__m256i *buf;
|
||||
__m256i *vdata = (__m256i*)data;
|
||||
size_t ptr;
|
||||
DECL_STATE
|
||||
|
||||
buf = kc->buf;
|
||||
ptr = kc->ptr;
|
||||
@@ -330,7 +337,6 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
|
||||
return;
|
||||
}
|
||||
|
||||
READ_STATE( kc );
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
@@ -349,7 +355,6 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
WRITE_STATE( kc );
|
||||
kc->ptr = ptr;
|
||||
}
|
||||
|
||||
@@ -389,7 +394,7 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
|
||||
memcpy_256( dst, kc->w, m256_len );
|
||||
}
|
||||
|
||||
void keccak256_4way_init( void *kc )
|
||||
void keccak256_4x64_init( void *kc )
|
||||
{
|
||||
keccak64_init( kc, 256 );
|
||||
}
|
||||
@@ -406,6 +411,13 @@ keccak256_4x64_close(void *cc, void *dst)
|
||||
keccak64_close(cc, dst, 32, 136);
|
||||
}
|
||||
|
||||
void keccak256_4x64_ctx( void *cc, void *dst, const void *data, size_t len )
|
||||
{
|
||||
keccak256_4x64_init( cc );
|
||||
keccak256_4x64_update( cc, data, len );
|
||||
keccak256_4x64_close( cc, dst );
|
||||
}
|
||||
|
||||
void keccak512_4x64_init( void *kc )
|
||||
{
|
||||
keccak64_init( kc, 512 );
|
||||
@@ -418,11 +430,219 @@ keccak512_4x64_update(void *cc, const void *data, size_t len)
|
||||
}
|
||||
|
||||
void
|
||||
keccak512_4way_close(void *cc, void *dst)
|
||||
keccak512_4x64_close(void *cc, void *dst)
|
||||
{
|
||||
keccak64_close(cc, dst, 64, 72);
|
||||
}
|
||||
|
||||
void keccak512_4x64_ctx( void *cc, void *dst, const void *data, size_t len )
|
||||
{
|
||||
keccak512_4x64_init( cc );
|
||||
keccak512_4x64_update( cc, data, len );
|
||||
keccak512_4x64_close( cc, dst );
|
||||
}
|
||||
|
||||
#undef INPUT_BUF
|
||||
#undef DECL64
|
||||
#undef XOR64
|
||||
#undef XOR
|
||||
#undef AND64
|
||||
#undef OR64
|
||||
#undef NOT64
|
||||
#undef ROL64
|
||||
#undef KECCAK_F_1600
|
||||
#undef KECCAK_F_1600_256
|
||||
#undef XOROR
|
||||
#undef XORAND
|
||||
#undef XOR3
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
// SSE2 & NEON
|
||||
|
||||
#define INPUT_BUF(size) do { \
|
||||
size_t j; \
|
||||
for (j = 0; j < (size>>3); j++ ) \
|
||||
kc->w[j ] = v128_xor( kc->w[j], buf[j] ); \
|
||||
} while (0)
|
||||
|
||||
#define DECL64(x) v128_t x
|
||||
#define XOR(d, a, b) (d = v128_xor(a,b))
|
||||
#define XOR64 XOR
|
||||
#define AND64(d, a, b) (d = v128_and(a,b))
|
||||
#define OR64(d, a, b) (d = v128_or(a,b))
|
||||
#define NOT64(d, s) (d = v128_not( s ) )
|
||||
#define ROL64(d, v, n) (d = v128_rol64(v, n))
|
||||
#define XOROR(d, a, b, c) (d = v128_xoror( a, b, c ) )
|
||||
#define XORAND(d, a, b, c) (d = v128_xorand( a, b, c ) )
|
||||
#define XOR3( d, a, b, c ) (d = v128_xor3( a, b, c ))
|
||||
|
||||
#include "keccak-macros.c"
|
||||
|
||||
#define KECCAK_F_1600 DO(KECCAK_F_1600_256)
|
||||
|
||||
#define KECCAK_F_1600_256 do { \
|
||||
int j; \
|
||||
for (j = 0; j < 24; j += 8) \
|
||||
{ \
|
||||
KF_ELT( 0, 1, v128_64( RC[j + 0] ) ); \
|
||||
KF_ELT( 1, 2, v128_64( RC[j + 1] ) ); \
|
||||
KF_ELT( 2, 3, v128_64( RC[j + 2] ) ); \
|
||||
KF_ELT( 3, 4, v128_64( RC[j + 3] ) ); \
|
||||
KF_ELT( 4, 5, v128_64( RC[j + 4] ) ); \
|
||||
KF_ELT( 5, 6, v128_64( RC[j + 5] ) ); \
|
||||
KF_ELT( 6, 7, v128_64( RC[j + 6] ) ); \
|
||||
KF_ELT( 7, 8, v128_64( RC[j + 7] ) ); \
|
||||
P8_TO_P0; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
static void keccak64x2_init( keccak64_ctx_v128 *kc, unsigned out_size )
|
||||
{
|
||||
v128_t zero = v128_zero;
|
||||
v128_t neg1 = v128_neg1;
|
||||
|
||||
// Initialization for the "lane complement".
|
||||
kc->w[ 0] = zero; kc->w[ 1] = neg1;
|
||||
kc->w[ 2] = neg1; kc->w[ 3] = zero;
|
||||
kc->w[ 4] = zero; kc->w[ 5] = zero;
|
||||
kc->w[ 6] = zero; kc->w[ 7] = zero;
|
||||
kc->w[ 8] = neg1; kc->w[ 9] = zero;
|
||||
kc->w[10] = zero; kc->w[11] = zero;
|
||||
kc->w[12] = neg1; kc->w[13] = zero;
|
||||
kc->w[14] = zero; kc->w[15] = zero;
|
||||
kc->w[16] = zero; kc->w[17] = neg1;
|
||||
kc->w[18] = zero; kc->w[19] = zero;
|
||||
kc->w[20] = neg1; kc->w[21] = zero;
|
||||
kc->w[22] = zero; kc->w[23] = zero;
|
||||
kc->w[24] = zero; kc->ptr = 0;
|
||||
kc->lim = 200 - (out_size >> 2);
|
||||
}
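
The zero/neg1 pattern above is the usual Keccak "lane complement" optimisation: lanes 1, 2, 8, 12, 17 and 20 are kept bitwise-inverted throughout the permutation (saving NOTs in the chi step) and are flipped back in keccak64x2_close before the digest is copied out. A scalar sketch of the flip-back step, for illustration only:

#include <stdint.h>

static void keccak_uncomplement_sketch( uint64_t w[25] )
{
    static const int lanes[6] = { 1, 2, 8, 12, 17, 20 };
    for ( int i = 0; i < 6; i++ )
        w[ lanes[i] ] = ~w[ lanes[i] ];
}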
|
||||
|
||||
static void
|
||||
keccak64x2_core( keccak64_ctx_v128 *kc, const void *data, size_t len,
|
||||
size_t lim )
|
||||
{
|
||||
v128_t *buf;
|
||||
v128_t *vdata = (v128_t*)data;
|
||||
size_t ptr;
|
||||
|
||||
buf = kc->buf;
|
||||
ptr = kc->ptr;
|
||||
|
||||
if ( len < (lim - ptr) )
|
||||
{
|
||||
v128_memcpy( buf + (ptr>>3), vdata, len>>3 );
|
||||
kc->ptr = ptr + len;
|
||||
return;
|
||||
}
|
||||
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
|
||||
clen = (lim - ptr);
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
v128_memcpy( buf + (ptr>>3), vdata, clen>>3 );
|
||||
ptr += clen;
|
||||
vdata = vdata + (clen>>3);
|
||||
len -= clen;
|
||||
if ( ptr == lim )
|
||||
{
|
||||
INPUT_BUF( lim );
|
||||
KECCAK_F_1600;
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
kc->ptr = ptr;
|
||||
}
|
||||
|
||||
static void keccak64x2_close( keccak64_ctx_v128 *kc, void *dst,
|
||||
size_t byte_len, size_t lim )
|
||||
{
|
||||
unsigned eb;
|
||||
union {
|
||||
v128_t tmp[lim + 1];
|
||||
uint64_t dummy; /* for alignment */
|
||||
} u;
|
||||
size_t j;
|
||||
size_t v128_len = byte_len >> 3;
|
||||
|
||||
eb = hard_coded_eb;
|
||||
if ( kc->ptr == (lim - 8) )
|
||||
{
|
||||
const uint64_t t = eb | 0x8000000000000000;
|
||||
u.tmp[0] = v128_64( t );
|
||||
j = 8;
|
||||
}
|
||||
else
|
||||
{
|
||||
j = lim - kc->ptr;
|
||||
u.tmp[0] = v128_64( eb );
|
||||
v128_memset_zero( u.tmp + 1, (j>>3) - 2 );
|
||||
u.tmp[ (j>>3) - 1] = v128_64( 0x8000000000000000 );
|
||||
}
|
||||
keccak64x2_core( kc, u.tmp, j, lim );
|
||||
/* Finalize the "lane complement" */
|
||||
NOT64( kc->w[ 1], kc->w[ 1] );
|
||||
NOT64( kc->w[ 2], kc->w[ 2] );
|
||||
NOT64( kc->w[ 8], kc->w[ 8] );
|
||||
NOT64( kc->w[12], kc->w[12] );
|
||||
NOT64( kc->w[17], kc->w[17] );
|
||||
NOT64( kc->w[20], kc->w[20] );
|
||||
v128_memcpy( dst, kc->w, v128_len );
|
||||
}
|
||||
|
||||
void keccak256_2x64_init( void *kc )
|
||||
{
|
||||
keccak64x2_init( kc, 256 );
|
||||
}
|
||||
|
||||
void
|
||||
keccak256_2x64_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
keccak64x2_core(cc, data, len, 136);
|
||||
}
|
||||
|
||||
void
|
||||
keccak256_2x64_close(void *cc, void *dst)
|
||||
{
|
||||
keccak64x2_close(cc, dst, 32, 136);
|
||||
}
|
||||
|
||||
void keccak256_2x64_ctx( void *cc, void *dst, const void *data, size_t len )
|
||||
{
|
||||
keccak256_2x64_init( cc );
|
||||
keccak256_2x64_update( cc, data, len );
|
||||
keccak256_2x64_close( cc, dst );
|
||||
}
|
||||
|
||||
void keccak512_2x64_init( void *kc )
|
||||
{
|
||||
keccak64x2_init( kc, 512 );
|
||||
}
|
||||
|
||||
void
|
||||
keccak512_2x64_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
keccak64x2_core(cc, data, len, 72);
|
||||
}
|
||||
|
||||
void
|
||||
keccak512_2x64_close(void *cc, void *dst)
|
||||
{
|
||||
keccak64x2_close(cc, dst, 64, 72);
|
||||
}
|
||||
|
||||
void keccak512_2x64_ctx( void *cc, void *dst, const void *data, size_t len )
|
||||
{
|
||||
keccak512_2x64_init( cc );
|
||||
keccak512_2x64_update( cc, data, len );
|
||||
keccak512_2x64_close( cc, dst );
|
||||
}
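
For callers, the new one-shot _ctx wrappers collapse init/update/close into a single call. A usage sketch; assumptions: keccak-hash-4way.h provides the declarations, the input is already 2-way 64-bit interleaved, and len is given in bytes per lane as in the allium code further below.

#include <stdint.h>
#include "keccak-hash-4way.h"

void keccak256_2x64_example( void )
{
    uint64_t in[8]  __attribute__ ((aligned (64))) = { 0 };  /* two interleaved 32-byte messages */
    uint64_t out[8] __attribute__ ((aligned (64)));
    keccak256_2x64_context ctx;
    keccak256_2x64_ctx( &ctx, out, in, 32 );   /* 32 bytes per lane */
}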
|
||||
|
||||
|
||||
#undef INPUT_BUF
|
||||
#undef DECL64
|
||||
#undef XOR64
|
||||
@@ -436,4 +656,4 @@ keccak512_4way_close(void *cc, void *dst)
|
||||
#undef XORAND
|
||||
#undef XOR3
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
|
||||
@@ -19,10 +19,12 @@ typedef keccak64_ctx_m512i keccak512_8x64_context;
|
||||
void keccak256_8x64_init(void *cc);
|
||||
void keccak256_8x64_update(void *cc, const void *data, size_t len);
|
||||
void keccak256_8x64_close(void *cc, void *dst);
|
||||
void keccak256_8x64_ctx( void *cc, void *dst, const void *data, size_t len );
|
||||
|
||||
void keccak512_8x64_init(void *cc);
|
||||
void keccak512_8x64_update(void *cc, const void *data, size_t len);
|
||||
void keccak512_8x64_close(void *cc, void *dst);
|
||||
void keccak512_8x64_ctx( void *cc, void *dst, const void *data, size_t len );
|
||||
|
||||
// legacy naming
|
||||
#define keccak512_8way_context keccak512_8x64_context
|
||||
@@ -51,10 +53,12 @@ typedef keccak64_ctx_m256i keccak512_4x64_context;
|
||||
void keccak256_4x64_init(void *cc);
|
||||
void keccak256_4x64_update(void *cc, const void *data, size_t len);
|
||||
void keccak256_4x64_close(void *cc, void *dst);
|
||||
void keccak256_4x64_ctx( void *cc, void *dst, const void *data, size_t len );
|
||||
|
||||
void keccak512_4x64_init(void *cc);
|
||||
void keccak512_4x64_update(void *cc, const void *data, size_t len);
|
||||
void keccak512_4x64_close(void *cc, void *dst);
|
||||
void keccak512_4x64_ctx( void *cc, void *dst, const void *data, size_t len );
|
||||
|
||||
// legacy naming
|
||||
#define keccak512_4way_context keccak512_4x64_context
|
||||
@@ -68,27 +72,27 @@ void keccak512_4x64_close(void *cc, void *dst);
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__SSE2__) || defined(__ARM_NEON)
|
||||
|
||||
typedef struct
|
||||
{
|
||||
v128_t buf[144*4];
|
||||
v128_t w[50];
|
||||
v128_t buf[144*8];
|
||||
v128_t w[25];
|
||||
size_t ptr, lim;
|
||||
} keccak32_ctx_v128 __attribute__((aligned(64)));
|
||||
} keccak64_ctx_v128 __attribute__((aligned(128)));
|
||||
|
||||
typedef keccak32_ctx_v128 keccak256_4x32_context;
|
||||
typedef keccak32_ctx_v128 keccak512_4x32_context;
|
||||
typedef keccak64_ctx_v128 keccak256_2x64_context;
|
||||
typedef keccak64_ctx_v128 keccak512_2x64_context;
|
||||
|
||||
void keccak256_2x64_init (void *cc );
|
||||
void keccak256_2x64_update( void *cc, const void *data, size_t len );
|
||||
void keccak256_2x64_close( void *cc, void *dst );
|
||||
void keccak256_2x64_ctx( void *cc, void *dst, const void *data, size_t len );
|
||||
|
||||
void keccak512_2x64_init( void *cc );
|
||||
void keccak512_2x64_update( void *cc, const void *data, size_t len );
|
||||
void keccak512_2x64_close( void *cc, void *dst );
|
||||
void keccak512_2x64_ctx( void *cc, void *dst, const void *data, size_t len );
|
||||
|
||||
void keccak256_4x32_init(void *cc);
|
||||
void keccak256_4x32_update(void *cc, const void *data, size_t len);
|
||||
void keccak256_4x32_close(void *cc, void *dst);
|
||||
|
||||
void keccak512_4x32_init(void *cc);
|
||||
void keccak512_4x32_update(void *cc, const void *data, size_t len);
|
||||
void keccak512_4x32_close(void *cc, void *dst);
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -50,8 +50,6 @@
|
||||
|
||||
#elif defined(__ARM_NEON)
|
||||
|
||||
#pragma message "NEON for Luffa"
|
||||
|
||||
const uint32x4_t mask = { 0xffffffff, 0, 0xffffffff, 0xffffffff };
|
||||
|
||||
// { a1_0, 0, a1_0, a1_0 }
|
||||
@@ -316,11 +314,11 @@ int update_and_final_luffa( hashState_luffa *state, void* output,
|
||||
// 16 byte partial block exists for 80 byte len
|
||||
if ( state->rembytes )
|
||||
// padding of partial block
|
||||
rnd512( state, v128_mov64( 0x80000000 ),
|
||||
rnd512( state, v128_set64( 0, 0x0000000080000000 ),
|
||||
v128_bswap32( cast_v128( data ) ) );
|
||||
else
|
||||
// empty pad block
|
||||
rnd512( state, v128_zero, v128_64( 0x80000000 ) );
|
||||
rnd512( state, v128_zero, v128_64( 0x0000000080000000 ) );
|
||||
|
||||
finalization512( state, (uint32_t*) output );
|
||||
if ( state->hashbitlen > 512 )
|
||||
@@ -338,7 +336,7 @@ int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
|
||||
state->hashbitlen = hashbitlen;
|
||||
#if !defined(__SSE4_1__)
|
||||
/* set the lower 32 bits to '1' */
|
||||
MASK= v128_set32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
|
||||
MASK= v128_set64( 0, 0x00000000ffffffff );
|
||||
#endif
|
||||
/* set the 32-bit round constant values to the 128-bit data field */
|
||||
for ( i=0; i<32; i++ )
|
||||
@@ -365,11 +363,11 @@ int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
|
||||
// 16 byte partial block exists for 80 byte len
|
||||
if ( state->rembytes )
|
||||
// padding of partial block
|
||||
rnd512( state, v128_mov64( 0x80000000 ),
|
||||
rnd512( state, v128_set64( 0, 0x0000000080000000 ),
|
||||
v128_bswap32( cast_v128( data ) ) );
|
||||
else
|
||||
// empty pad block
|
||||
rnd512( state, v128_zero, v128_mov64( 0x80000000 ) );
|
||||
rnd512( state, v128_zero, v128_set64( 0, 0x0000000080000000 ) );
|
||||
|
||||
finalization512( state, (uint32_t*) output );
|
||||
if ( state->hashbitlen > 512 )
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
#include "lyra2-gate.h"
|
||||
#include <memory.h>
|
||||
#include <mm_malloc.h>
|
||||
#include "algo/blake/blake256-hash.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
@@ -10,6 +9,19 @@
|
||||
#if defined(__VAES__)
|
||||
#include "algo/groestl/groestl256-hash-4way.h"
|
||||
#endif
|
||||
#include "algo/keccak/sph_keccak.h"
|
||||
#include "algo/skein/sph_skein.h"
|
||||
#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define ALLIUM_16WAY 1
|
||||
#elif defined(__AVX2__)
|
||||
#define ALLIUM_8WAY 1
|
||||
#elif defined(__SSE2__) || defined(__ARM_NEON)
|
||||
#define ALLIUM_4WAY 1
|
||||
#endif
|
||||
|
||||
#if defined (ALLIUM_16WAY)
|
||||
|
||||
@@ -443,4 +455,297 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(__SSE2__) || defined(__ARM_NEON)
|
||||
|
||||
///////////////////
|
||||
//
|
||||
// 4 way
|
||||
|
||||
typedef union
|
||||
{
|
||||
keccak256_2x64_context keccak;
|
||||
cubehashParam cube;
|
||||
#if defined(__x86_64__)
|
||||
skein256_2x64_context skein;
|
||||
#else
|
||||
sph_skein512_context skein;
|
||||
#endif
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
hashState_groestl256 groestl;
|
||||
#else
|
||||
sph_groestl256_context groestl;
|
||||
#endif
|
||||
} allium_4way_ctx_holder;
|
||||
|
||||
static void allium_4way_hash( void *hash, const void *midstate_vars,
|
||||
const void *midhash, const void *block )
|
||||
{
|
||||
uint64_t vhashA[4*4] __attribute__ ((aligned (64)));
|
||||
uint64_t *hash0 = (uint64_t*)hash;
|
||||
uint64_t *hash1 = (uint64_t*)hash+ 4;
|
||||
uint64_t *hash2 = (uint64_t*)hash+ 8;
|
||||
uint64_t *hash3 = (uint64_t*)hash+12;
|
||||
allium_4way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
|
||||
blake256_4way_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
|
||||
dintrlv_4x32( hash0, hash1, hash2, hash3, vhashA, 256 );
|
||||
|
||||
intrlv_2x64( vhashA, hash0, hash1, 256 );
|
||||
keccak256_2x64_init( &ctx.keccak );
|
||||
keccak256_2x64_update( &ctx.keccak, vhashA, 32 );
|
||||
keccak256_2x64_close( &ctx.keccak, vhashA );
|
||||
dintrlv_2x64( hash0, hash1, vhashA, 256 );
|
||||
intrlv_2x64( vhashA, hash2, hash3, 256 );
|
||||
keccak256_2x64_init( &ctx.keccak );
|
||||
keccak256_2x64_update( &ctx.keccak, vhashA, 32 );
|
||||
keccak256_2x64_close( &ctx.keccak, vhashA );
|
||||
dintrlv_2x64( hash2, hash3, vhashA, 256 );
|
||||
|
||||
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
|
||||
|
||||
cubehash_full( &ctx.cube, hash0, 256, hash0, 32 );
|
||||
cubehash_full( &ctx.cube, hash1, 256, hash1, 32 );
|
||||
cubehash_full( &ctx.cube, hash2, 256, hash2, 32 );
|
||||
cubehash_full( &ctx.cube, hash3, 256, hash3, 32 );
|
||||
|
||||
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
|
||||
|
||||
#if defined(__x86_64__)
|
||||
intrlv_2x64( vhashA, hash0, hash1, 256 );
|
||||
skein256_2x64_init( &ctx.skein );
|
||||
skein256_2x64_update( &ctx.skein, vhashA, 32 );
|
||||
skein256_2x64_close( &ctx.skein, vhashA );
|
||||
dintrlv_2x64( hash0, hash1, vhashA, 256 );
|
||||
intrlv_2x64( vhashA, hash2, hash3, 256 );
|
||||
skein256_2x64_init( &ctx.skein );
|
||||
skein256_2x64_update( &ctx.skein, vhashA, 32 );
|
||||
skein256_2x64_close( &ctx.skein, vhashA );
|
||||
dintrlv_2x64( hash2, hash3, vhashA, 256 );
|
||||
#else
|
||||
sph_skein256_init( &ctx.skein );
|
||||
sph_skein256( &ctx.skein, hash0, 32 );
|
||||
sph_skein256_close( &ctx.skein, hash0 );
|
||||
sph_skein256_init( &ctx.skein );
|
||||
sph_skein256( &ctx.skein, hash1, 32 );
|
||||
sph_skein256_close( &ctx.skein, hash1 );
|
||||
sph_skein256_init( &ctx.skein );
|
||||
sph_skein256( &ctx.skein, hash2, 32 );
|
||||
sph_skein256_close( &ctx.skein, hash2 );
|
||||
sph_skein256_init( &ctx.skein );
|
||||
sph_skein256( &ctx.skein, hash3, 32 );
|
||||
sph_skein256_close( &ctx.skein, hash3 );
|
||||
#endif
|
||||
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
groestl256_full( &ctx.groestl, hash0, hash0, 256 );
|
||||
groestl256_full( &ctx.groestl, hash1, hash1, 256 );
|
||||
groestl256_full( &ctx.groestl, hash2, hash2, 256 );
|
||||
groestl256_full( &ctx.groestl, hash3, hash3, 256 );
|
||||
#else
|
||||
sph_groestl256_init( &ctx.groestl );
|
||||
sph_groestl256( &ctx.groestl, hash0, 32 );
|
||||
sph_groestl256_close( &ctx.groestl, hash0 );
|
||||
sph_groestl256_init( &ctx.groestl );
|
||||
sph_groestl256( &ctx.groestl, hash1, 32 );
|
||||
sph_groestl256_close( &ctx.groestl, hash1 );
|
||||
sph_groestl256_init( &ctx.groestl );
|
||||
sph_groestl256( &ctx.groestl, hash2, 32 );
|
||||
sph_groestl256_close( &ctx.groestl, hash2 );
|
||||
sph_groestl256_init( &ctx.groestl );
|
||||
sph_groestl256( &ctx.groestl, hash3, 32 );
|
||||
sph_groestl256_close( &ctx.groestl, hash3 );
|
||||
#endif
|
||||
}
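
The hash above repeatedly re-groups the four lanes: 4x32 interleaving for blake, 2x64 interleaving for the keccak and skein passes, and plain per-lane buffers for lyra2, cubehash and groestl. A sketch of what the 2x64 interleave amounts to (an assumption about intrlv_2x64's layout, consistent with its use on 256-bit hashes here):

#include <stdint.h>

/* Illustrative only: lane0/lane1 64-bit words alternate so one v128 op
   can process both lanes at once. bits is the per-lane length in bits. */
static void intrlv_2x64_sketch( uint64_t *dst, const uint64_t *a,
                                const uint64_t *b, int bits )
{
    for ( int i = 0; i < bits/64; i++ )
    {
        dst[2*i]   = a[i];
        dst[2*i+1] = b[i];
    }
}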
|
||||
|
||||
int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint64_t hash[4*4] __attribute__ ((aligned (64)));
|
||||
uint32_t midstate_vars[16*4] __attribute__ ((aligned (64)));
|
||||
v128_t block0_hash[8] __attribute__ ((aligned (64)));
|
||||
v128_t block_buf[16] __attribute__ ((aligned (64)));
|
||||
uint32_t phash[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
uint32_t *pdata = work->data;
|
||||
uint64_t *ptarget = (uint64_t*)work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 4;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const v128u32_t four = v128_32(4);
|
||||
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0, 14 );
|
||||
|
||||
block0_hash[0] = v128_32( phash[0] );
|
||||
block0_hash[1] = v128_32( phash[1] );
|
||||
block0_hash[2] = v128_32( phash[2] );
|
||||
block0_hash[3] = v128_32( phash[3] );
|
||||
block0_hash[4] = v128_32( phash[4] );
|
||||
block0_hash[5] = v128_32( phash[5] );
|
||||
block0_hash[6] = v128_32( phash[6] );
|
||||
block0_hash[7] = v128_32( phash[7] );
|
||||
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces.
|
||||
block_buf[ 0] = v128_32( pdata[16] );
|
||||
block_buf[ 1] = v128_32( pdata[17] );
|
||||
block_buf[ 2] = v128_32( pdata[18] );
|
||||
block_buf[ 3] = v128_set32( n+3, n+2, n+1, n );
|
||||
block_buf[ 4] = v128_32( 0x80000000 );
|
||||
block_buf[13] = v128_32( 1 );
|
||||
block_buf[15] = v128_32( 640 );
|
||||
|
||||
// Partially prehash the second block without touching the nonces
|
||||
blake256_4way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||
|
||||
do {
|
||||
allium_4way_hash( hash, midstate_vars, block0_hash, block_buf );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
{
|
||||
const uint64_t *lane_hash = hash + (lane<<2);
|
||||
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
block_buf[3] = v128_add32( block_buf[3], four );
|
||||
} while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
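
The block_buf constants above are standard Blake-256 padding for an 80-byte header: only the last 16 data bytes plus the per-lane nonce occupy the second block, followed by the 0x80000000 terminator, the final-block flag and the 640-bit total length. A scalar sketch of that second-block layout; the helper name and scalar form are illustrative only.

#include <stdint.h>

static void allium_second_block_sketch( uint32_t blk[16],
                                        const uint32_t *pdata, uint32_t nonce )
{
    for ( int i = 0; i < 16; i++ ) blk[i] = 0;
    blk[ 0] = pdata[16];
    blk[ 1] = pdata[17];
    blk[ 2] = pdata[18];
    blk[ 3] = nonce;          /* vectored code puts n..n+3 here, one per lane */
    blk[ 4] = 0x80000000u;    /* padding terminator */
    blk[13] = 1;              /* final block flag */
    blk[15] = 640;            /* total message length in bits (80 bytes) */
}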
|
||||
|
||||
#endif
|
||||
|
||||
////////////
|
||||
//
|
||||
// 1 way
|
||||
|
||||
|
||||
typedef struct
|
||||
{
|
||||
blake256_context blake;
|
||||
sph_keccak256_context keccak;
|
||||
cubehashParam cube;
|
||||
sph_skein256_context skein;
|
||||
#if defined (__AES__) || defined(__ARM_FEATURE_AES)
|
||||
hashState_groestl256 groestl;
|
||||
#else
|
||||
sph_groestl256_context groestl;
|
||||
#endif
|
||||
} allium_ctx_holder;
|
||||
|
||||
static __thread allium_ctx_holder allium_ctx;
|
||||
|
||||
bool init_allium_ctx()
|
||||
{
|
||||
sph_keccak256_init( &allium_ctx.keccak );
|
||||
cubehashInit( &allium_ctx.cube, 256, 16, 32 );
|
||||
sph_skein256_init( &allium_ctx.skein );
|
||||
#if defined (__AES__) || defined(__ARM_FEATURE_AES)
|
||||
init_groestl256( &allium_ctx.groestl, 32 );
|
||||
#else
|
||||
sph_groestl256_init( &allium_ctx.groestl );
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
void allium_hash(void *state, const void *input)
|
||||
{
|
||||
uint32_t hash[8] __attribute__ ((aligned (64)));
|
||||
allium_ctx_holder ctx __attribute__ ((aligned (32)));
|
||||
|
||||
memcpy( &ctx, &allium_ctx, sizeof(allium_ctx) );
|
||||
blake256_update( &ctx.blake, input + 64, 16 );
|
||||
blake256_close( &ctx.blake, hash );
|
||||
|
||||
sph_keccak256( &ctx.keccak, hash, 32 );
|
||||
sph_keccak256_close( &ctx.keccak, hash );
|
||||
|
||||
LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 );
|
||||
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash, (const byte*)hash, 32 );
|
||||
|
||||
LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 );
|
||||
|
||||
sph_skein256( &ctx.skein, hash, 32 );
|
||||
sph_skein256_close( &ctx.skein, hash );
|
||||
|
||||
#if defined (__AES__) || defined(__ARM_FEATURE_AES)
|
||||
update_and_final_groestl256( &ctx.groestl, hash, hash, 256 );
|
||||
#else
|
||||
sph_groestl256( &ctx.groestl, hash, 32 );
|
||||
sph_groestl256_close( &ctx.groestl, hash );
|
||||
#endif
|
||||
|
||||
memcpy(state, hash, 32);
|
||||
}
|
||||
|
||||
int scanhash_allium( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(128) hash[8];
|
||||
uint32_t _ALIGN(128) edata[20];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t nonce = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
|
||||
if ( opt_benchmark )
|
||||
ptarget[7] = 0x3ffff;
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
edata[i] = bswap_32( pdata[i] );
|
||||
|
||||
blake256_init( &allium_ctx.blake );
|
||||
blake256_update( &allium_ctx.blake, edata, 64 );
|
||||
|
||||
do {
|
||||
edata[19] = nonce;
|
||||
allium_hash( hash, edata );
|
||||
if ( valid_hash( hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = bswap_32( nonce );
|
||||
submit_solution( work, hash, mythr );
|
||||
}
|
||||
nonce++;
|
||||
} while ( nonce < max_nonce && !work_restart[thr_id].restart );
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool register_allium_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (ALLIUM_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_allium_16way;
|
||||
#elif defined (ALLIUM_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_allium_8way;
|
||||
#elif defined (ALLIUM_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_allium_4way;
|
||||
#else
|
||||
gate->miner_thread_init = (void*)&init_allium_ctx;
|
||||
gate->scanhash = (void*)&scanhash_allium;
|
||||
gate->hash = (void*)&allium_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT
|
||||
| VAES_OPT | NEON_OPT;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
||||
@@ -1,110 +0,0 @@
|
||||
#include "lyra2-gate.h"
|
||||
|
||||
#if !( defined(ALLIUM_16WAY) || defined(ALLIUM_8WAY) || defined(ALLIUM_4WAY) )
|
||||
|
||||
#include <memory.h>
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/keccak/sph_keccak.h"
|
||||
#include "algo/skein/sph_skein.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#if defined(__AES__)
|
||||
#include "algo/groestl/aes_ni/hash-groestl256.h"
|
||||
#else
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#endif
|
||||
#include "lyra2.h"
|
||||
|
||||
typedef struct {
|
||||
sph_blake256_context blake;
|
||||
sph_keccak256_context keccak;
|
||||
cubehashParam cube;
|
||||
sph_skein256_context skein;
|
||||
#if defined (__AES__)
|
||||
hashState_groestl256 groestl;
|
||||
#else
|
||||
sph_groestl256_context groestl;
|
||||
#endif
|
||||
} allium_ctx_holder;
|
||||
|
||||
static __thread allium_ctx_holder allium_ctx;
|
||||
|
||||
bool init_allium_ctx()
|
||||
{
|
||||
sph_keccak256_init( &allium_ctx.keccak );
|
||||
cubehashInit( &allium_ctx.cube, 256, 16, 32 );
|
||||
sph_skein256_init( &allium_ctx.skein );
|
||||
#if defined (__AES__)
|
||||
init_groestl256( &allium_ctx.groestl, 32 );
|
||||
#else
|
||||
sph_groestl256_init( &allium_ctx.groestl );
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
void allium_hash(void *state, const void *input)
|
||||
{
|
||||
uint32_t hash[8] __attribute__ ((aligned (64)));
|
||||
allium_ctx_holder ctx __attribute__ ((aligned (32)));
|
||||
|
||||
memcpy( &ctx, &allium_ctx, sizeof(allium_ctx) );
|
||||
sph_blake256( &ctx.blake, input + 64, 16 );
|
||||
sph_blake256_close( &ctx.blake, hash );
|
||||
|
||||
sph_keccak256( &ctx.keccak, hash, 32 );
|
||||
sph_keccak256_close( &ctx.keccak, hash );
|
||||
|
||||
LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 );
|
||||
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash, (const byte*)hash, 32 );
|
||||
|
||||
LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 );
|
||||
|
||||
sph_skein256( &ctx.skein, hash, 32 );
|
||||
sph_skein256_close( &ctx.skein, hash );
|
||||
|
||||
#if defined (__AES__)
|
||||
update_and_final_groestl256( &ctx.groestl, hash, hash, 256 );
|
||||
#else
|
||||
sph_groestl256( &ctx.groestl, hash, 32 );
|
||||
sph_groestl256_close( &ctx.groestl, hash );
|
||||
#endif
|
||||
|
||||
memcpy(state, hash, 32);
|
||||
}
|
||||
|
||||
int scanhash_allium( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(128) hash[8];
|
||||
uint32_t _ALIGN(128) edata[20];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t nonce = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
|
||||
if ( opt_benchmark )
|
||||
ptarget[7] = 0x3ffff;
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
edata[i] = bswap_32( pdata[i] );
|
||||
|
||||
sph_blake256_init( &allium_ctx.blake );
|
||||
sph_blake256( &allium_ctx.blake, edata, 64 );
|
||||
|
||||
do {
|
||||
edata[19] = nonce;
|
||||
allium_hash( hash, edata );
|
||||
if ( valid_hash( hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = bswap_32( nonce );
|
||||
submit_solution( work, hash, mythr );
|
||||
}
|
||||
nonce++;
|
||||
} while ( nonce < max_nonce && !work_restart[thr_id].restart );
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,5 +1,5 @@
|
||||
#include "lyra2-gate.h"
|
||||
#include <mm_malloc.h>
|
||||
//#include <mm_malloc.h>
|
||||
|
||||
// huge pages
|
||||
//
|
||||
@@ -48,10 +48,10 @@ bool lyra2rev3_thread_init()
|
||||
|
||||
#if defined(LYRA2REV3_16WAY)
|
||||
// l2v3_wholeMatrix = _mm_malloc( 2*size, 128 );
|
||||
l2v3_wholeMatrix = _mm_malloc( 2*size, 64 );
|
||||
l2v3_wholeMatrix = mm_malloc( 2*size, 64 );
|
||||
init_lyra2rev3_16way_ctx();;
|
||||
#else
|
||||
l2v3_wholeMatrix = _mm_malloc( size, 64 );
|
||||
l2v3_wholeMatrix = mm_malloc( size, 64 );
|
||||
#if defined (LYRA2REV3_8WAY)
|
||||
init_lyra2rev3_8way_ctx();;
|
||||
#elif defined (LYRA2REV3_4WAY)
|
||||
@@ -95,13 +95,13 @@ bool lyra2rev2_thread_init()
|
||||
|
||||
int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
|
||||
#if defined (LYRA2REV2_16WAY)
|
||||
l2v2_wholeMatrix = _mm_malloc( 2 * size, 64 ); // 2 way
|
||||
l2v2_wholeMatrix = mm_malloc( 2 * size, 64 ); // 2 way
|
||||
init_lyra2rev2_16way_ctx();;
|
||||
#elif defined (LYRA2REV2_8WAY)
|
||||
l2v2_wholeMatrix = _mm_malloc( size, 64 );
|
||||
l2v2_wholeMatrix = mm_malloc( size, 64 );
|
||||
init_lyra2rev2_8way_ctx();;
|
||||
#else
|
||||
l2v2_wholeMatrix = _mm_malloc( size, 64 );
|
||||
l2v2_wholeMatrix = mm_malloc( size, 64 );
|
||||
init_lyra2rev2_ctx();
|
||||
#endif
|
||||
return l2v2_wholeMatrix;
|
||||
@@ -125,6 +125,7 @@ bool register_lyra2rev2_algo( algo_gate_t* gate )
|
||||
return true;
|
||||
};
|
||||
|
||||
/*
|
||||
/////////////////////////////
|
||||
|
||||
bool register_lyra2z_algo( algo_gate_t* gate )
|
||||
@@ -146,11 +147,11 @@ bool register_lyra2z_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_lyra2z;
|
||||
gate->hash = (void*)&lyra2z_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
};
|
||||
|
||||
*/
|
||||
|
||||
////////////////////////
|
||||
|
||||
@@ -171,7 +172,7 @@ bool register_lyra2h_algo( algo_gate_t* gate )
|
||||
};
|
||||
|
||||
/////////////////////////////////
|
||||
|
||||
/*
|
||||
bool register_allium_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (ALLIUM_16WAY)
|
||||
@@ -184,11 +185,11 @@ bool register_allium_algo( algo_gate_t* gate )
|
||||
gate->hash = (void*)&allium_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT
|
||||
| VAES_OPT;
|
||||
| VAES_OPT | NEON_OPT;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
};
|
||||
|
||||
*/
|
||||
/////////////////////////////////////////
|
||||
|
||||
bool phi2_has_roots = false;
|
||||
|
||||
@@ -5,7 +5,6 @@
#include <stdint.h>
#include "lyra2.h"


#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define LYRA2REV3_16WAY 1
#elif defined(__AVX2__)
@@ -74,7 +73,6 @@ int scanhash_lyra2rev2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_lyra2rev2_8way_ctx();


#else

void lyra2rev2_hash( void *state, const void *input );
@@ -84,49 +82,6 @@ bool init_lyra2rev2_ctx();

#endif

/////////////////////////

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define LYRA2Z_16WAY 1
#elif defined(__AVX2__)
#define LYRA2Z_8WAY 1
#elif defined(__SSE2__)
#define LYRA2Z_4WAY 1
#endif


#define LYRA2Z_MATRIX_SIZE BLOCK_LEN_INT64 * 8 * 8 * 8

#if defined(LYRA2Z_16WAY)

//void lyra2z_16way_hash( void *state, const void *input );
int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_16way_thread_init();

#elif defined(LYRA2Z_8WAY)

//void lyra2z_8way_hash( void *state, const void *input );
int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_8way_thread_init();

#elif defined(LYRA2Z_4WAY)

void lyra2z_4way_hash( void *state, const void *input );
int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_4way_thread_init();

#else

void lyra2z_hash( void *state, const void *input );
int scanhash_lyra2z( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_thread_init();

#endif

////////////////////

#if defined(__AVX2__)
@@ -151,35 +106,6 @@ bool lyra2h_thread_init();

#endif

//////////////////////////////////

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define ALLIUM_16WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define ALLIUM_8WAY 1
#endif

bool register_allium_algo( algo_gate_t* gate );

#if defined(ALLIUM_16WAY)

int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );

#elif defined(ALLIUM_8WAY)

int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );

#else

void allium_hash( void *state, const void *input );
int scanhash_allium( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_allium_ctx();

#endif

/////////////////////////////////////////

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

@@ -21,8 +21,9 @@
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <mm_malloc.h>
//#include <mm_malloc.h>
#include "compat.h"
#include "miner.h"
#include "lyra2.h"
#include "sponge.h"

@@ -468,7 +469,7 @@ int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd,
: BLOCK_LEN_BLAKE2_SAFE_BYTES;

i = (int64_t)ROW_LEN_BYTES * nRows;
uint64_t *wholeMatrix = _mm_malloc( 2*i, 64 );
uint64_t *wholeMatrix = mm_malloc( 2*i, 64 );
if (wholeMatrix == NULL)
return -1;

@@ -570,7 +571,7 @@ int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd,
squeeze_2way( state, K, (unsigned int) kLen );

//================== Freeing the memory =============================//
_mm_free(wholeMatrix);
mm_free(wholeMatrix);

return 0;
}
@@ -602,7 +603,7 @@ int LYRA2X_2WAY( void *K, uint64_t kLen, const void *pwd,
: BLOCK_LEN_BLAKE2_SAFE_BYTES;

i = (int64_t)ROW_LEN_BYTES * nRows;
uint64_t *wholeMatrix = _mm_malloc( 2*i, 64 );
uint64_t *wholeMatrix = mm_malloc( 2*i, 64 );
if (wholeMatrix == NULL)
return -1;

@@ -704,7 +705,7 @@ int LYRA2X_2WAY( void *K, uint64_t kLen, const void *pwd,
squeeze_2way( state, K, (unsigned int) kLen );

//================== Freeing the memory =============================//
_mm_free(wholeMatrix);
mm_free(wholeMatrix);

return 0;
}

@@ -21,7 +21,8 @@
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <mm_malloc.h>
//#include <mm_malloc.h>
#include "miner.h"
#include "compat.h"
#include "lyra2.h"
#include "sponge.h"
@@ -463,7 +464,7 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,

//=================== Initializing the Sponge State ====================//
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
// uint64_t *state = _mm_malloc(16 * sizeof(uint64_t), 32);
// uint64_t *state = mm_malloc(16 * sizeof(uint64_t), 32);
// if (state == NULL) {
// return -1;
// }
@@ -572,7 +573,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
: BLOCK_LEN_BLAKE2_SAFE_BYTES;

i = (int64_t)ROW_LEN_BYTES * nRows;
uint64_t *wholeMatrix = _mm_malloc( i, 64 );
uint64_t *wholeMatrix = mm_malloc( i, 64 );
if (wholeMatrix == NULL)
return -1;

@@ -720,7 +721,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
squeeze(state, K, (unsigned int) kLen);

//================== Freeing the memory =============================//
_mm_free(wholeMatrix);
mm_free(wholeMatrix);

return 0;
}

@@ -37,8 +37,8 @@ typedef unsigned char byte;
#define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes
#endif

#define BLOCK_LEN_M256I (BLOCK_LEN_INT64 / 4 )
#define BLOCK_LEN_M128I (BLOCK_LEN_INT64 / 2 )
#define BLOCK_LEN_256 (BLOCK_LEN_INT64 / 4 )
#define BLOCK_LEN_128 (BLOCK_LEN_INT64 / 2 )

int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, const void *salt, uint64_t saltlen,

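The rename from BLOCK_LEN_M256I / BLOCK_LEN_M128I to BLOCK_LEN_256 / BLOCK_LEN_128 drops the x86 vector type from the constant names, since the same block widths now also describe NEON code. As a worked example, assuming the usual Lyra2 value BLOCK_LEN_INT64 = 12 (its definition is outside this excerpt):

BLOCK_LEN_BYTES    = 12 * 8         =   96 bytes per sponge block
BLOCK_LEN_256      = 12 / 4         =    3 256-bit vectors per block
BLOCK_LEN_128      = 12 / 2         =    6 128-bit vectors per block
LYRA2Z_MATRIX_SIZE = 12 * 8 * 8 * 8 = 6144 bytes (8 rows x 8 cols) per thread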
@@ -3,7 +3,7 @@
#ifdef LYRA2H_4WAY

#include <memory.h>
#include <mm_malloc.h>
//#include <mm_malloc.h>
#include "lyra2.h"
#include "algo/blake/blake256-hash.h"

@@ -11,7 +11,7 @@ __thread uint64_t* lyra2h_4way_matrix;

bool lyra2h_4way_thread_init()
{
return ( lyra2h_4way_matrix = _mm_malloc( LYRA2H_MATRIX_SIZE, 64 ) );
return ( lyra2h_4way_matrix = mm_malloc( LYRA2H_MATRIX_SIZE, 64 ) );
}

static __thread blake256_4way_context l2h_4way_blake_mid;

@@ -3,7 +3,7 @@
#if !( defined(LYRA2H_8WAY) || defined(LYRA2H_4WAY) )

#include <memory.h>
#include <mm_malloc.h>
//#include <mm_malloc.h>
#include "lyra2.h"
#include "algo/blake/sph_blake.h"

@@ -11,7 +11,7 @@ __thread uint64_t* lyra2h_matrix;

bool lyra2h_thread_init()
{
lyra2h_matrix = _mm_malloc( LYRA2H_MATRIX_SIZE, 64 );
lyra2h_matrix = mm_malloc( LYRA2H_MATRIX_SIZE, 64 );
return lyra2h_matrix;
}


@@ -1,16 +1,27 @@
#include "lyra2-gate.h"
#include <memory.h>
#include <mm_malloc.h>
#include "lyra2.h"
#include "algo/blake/blake256-hash.h"

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define LYRA2Z_16WAY 1
#elif defined(__AVX2__)
#define LYRA2Z_8WAY 1
#elif defined(__SSE2__)
#define LYRA2Z_4WAY 1
//#else
// NEON 1 way SIMD
#endif

#define LYRA2Z_MATRIX_SIZE BLOCK_LEN_INT64 * 8 * 8 * 8

#if defined(LYRA2Z_16WAY)

__thread uint64_t* lyra2z_16way_matrix;

bool lyra2z_16way_thread_init()
{
return ( lyra2z_16way_matrix = _mm_malloc( 2*LYRA2Z_MATRIX_SIZE, 64 ) );
return ( lyra2z_16way_matrix = mm_malloc( 2*LYRA2Z_MATRIX_SIZE, 64 ) );
}

static void lyra2z_16way_hash( void *state, const void *midstate_vars,
@@ -153,7 +164,7 @@ __thread uint64_t* lyra2z_8way_matrix;

bool lyra2z_8way_thread_init()
{
return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
return ( lyra2z_8way_matrix = mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
}

static void lyra2z_8way_hash( void *state, const void *midstate_vars,
@@ -259,12 +270,13 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,

#elif defined(LYRA2Z_4WAY)

// SSE2 or NEON

__thread uint64_t* lyra2z_4way_matrix;

bool lyra2z_4way_thread_init()
{
return ( lyra2z_4way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
return ( lyra2z_4way_matrix = mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
}

static __thread blake256_4way_context l2z_4way_blake_mid;
@@ -275,59 +287,90 @@ void lyra2z_4way_midstate( const void* input )
blake256_4way_update( &l2z_4way_blake_mid, input, 64 );
}

void lyra2z_4way_hash( void *state, const void *input )
void lyra2z_4way_hash( void *hash, const void *midstate_vars,
const void *midhash, const void *block )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
// blake256_4way_context ctx_blake __attribute__ ((aligned (64)));

blake256_4way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );

/*
memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
blake256_4way_update( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
*/

dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

LYRA2Z( lyra2z_4way_matrix, state , 32, hash0, 32, hash0, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, state+32, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, state+64, 32, hash2, 32, hash2, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, state+96, 32, hash3, 32, hash3, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash , 32, hash0, 32, hash0, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash+32, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash+64, 32, hash2, 32, hash2, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash+96, 32, hash3, 32, hash3, 32, 8, 8, 8 );
}

int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[4*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t midstate_vars[16*4] __attribute__ ((aligned (64)));
v128_t block0_hash[8] __attribute__ ((aligned (64)));
v128_t block_buf[16] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint64_t *ptarget = (uint64_t*)work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m128i *noncev = (__m128i*)vdata + 19; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const v128u32_t four = v128_32(4);

if ( bench ) ptarget[7] = 0x0000ff;
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0, 14 );

v128_bswap32_intrlv80_4x32( vdata, pdata );
*noncev = _mm_set_epi32( n+3, n+2, n+1, n );
lyra2z_4way_midstate( vdata );
block0_hash[0] = v128_32( phash[0] );
block0_hash[1] = v128_32( phash[1] );
block0_hash[2] = v128_32( phash[2] );
block0_hash[3] = v128_32( phash[3] );
block0_hash[4] = v128_32( phash[4] );
block0_hash[5] = v128_32( phash[5] );
block0_hash[6] = v128_32( phash[6] );
block0_hash[7] = v128_32( phash[7] );

// Build vectored second block, interleave last 16 bytes of data using
// unique nonces.
block_buf[ 0] = v128_32( pdata[16] );
block_buf[ 1] = v128_32( pdata[17] );
block_buf[ 2] = v128_32( pdata[18] );
block_buf[ 3] = v128_set32( n+3, n+2, n+1, n );
block_buf[ 4] = v128_32( 0x80000000 );
block_buf[13] = v128_32( 1 );
block_buf[15] = v128_32( 640 );

// Partialy prehash second block without touching nonces
blake256_4way_round0_prehash_le( midstate_vars, block0_hash, block_buf );

do {
lyra2z_4way_hash( hash, vdata );
lyra2z_4way_hash( hash, midstate_vars, block0_hash, block_buf );
for ( int lane = 0; lane < 4; lane++ )
{
const uint64_t *lane_hash = hash + (lane<<2);
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n + lane );
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm_add_epi32( *noncev, _mm_set1_epi32( 4 ) );
block_buf[ 3] = v128_add32( block_buf[ 3], four );
n += 4;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );

@@ -336,5 +379,97 @@ int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
return 0;
}

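The reworked scanhash_lyra2z_4way above keeps one candidate nonce per 32-bit lane of block_buf[3] and advances all four by 4 each pass, so the vector loop covers exactly the same nonce range as a scalar scan. A self-contained toy loop showing just that bookkeeping (plain arrays stand in for the v128 lanes; nothing here is the real miner code):

// Toy check of the 4-lane nonce bookkeeping: iteration k hashes nonces
// n0+4k .. n0+4k+3, one per lane, matching a scalar walk of the same range.
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   const uint32_t first_nonce = 1000, last_nonce = 1016;
   uint32_t lane_nonce[4];
   for ( int i = 0; i < 4; i++ ) lane_nonce[i] = first_nonce + i;   // v128_set32(...)

   for ( uint32_t n = first_nonce; n < last_nonce; n += 4 )
   {
      for ( int lane = 0; lane < 4; lane++ )
         printf( "iteration %u, lane %d -> nonce %u\n",
                 ( n - first_nonce ) / 4, lane, lane_nonce[lane] );
      for ( int i = 0; i < 4; i++ ) lane_nonce[i] += 4;             // v128_add32( ..., four )
   }
   return 0;
}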
#else

// not used

__thread uint64_t* lyra2z_matrix;

bool lyra2z_thread_init()
{
const int i = BLOCK_LEN_INT64 * 8 * 8 * 8;
lyra2z_matrix = mm_malloc( i, 64 );
return lyra2z_matrix;
}

static __thread blake256_context lyra2z_blake_mid;

void lyra2z_midstate( const void* input )
{
blake256_init( &lyra2z_blake_mid );
blake256_update( &lyra2z_blake_mid, input, 64 );
}

void lyra2z_hash( void *state, const void *input )
{
uint32_t _ALIGN(32) hash[16];
blake256_context ctx_blake __attribute__ ((aligned (64)));

memcpy( &ctx_blake, &lyra2z_blake_mid, sizeof (blake256_context) );
blake256_update( &ctx_blake, input + 64, 16 );
blake256_close( &ctx_blake, hash );

LYRA2Z( lyra2z_matrix, hash, 32, hash, 32, hash, 32, 8, 8, 8 );

memcpy( state, hash, 32 );
}

int scanhash_lyra2z( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) hash[8];
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
int thr_id = mythr->id;

if (opt_benchmark) ptarget[7] = 0x0000ff;

// for ( int i = 0; i < 20; i++ ) endiandata[i] = bswap_32( pdata[i] );
v128_bswap32_80( endiandata, pdata );

lyra2z_midstate( endiandata );

do {
endiandata[19] = bswap_32( nonce );
lyra2z_hash( hash, endiandata );
if ( valid_hash( hash, ptarget ) && !opt_benchmark )
{
pdata[19] = nonce;
submit_solution( work, hash, mythr );
}
nonce++;
} while ( nonce < max_nonce && !work_restart[thr_id].restart );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}

#endif

bool register_lyra2z_algo( algo_gate_t* gate )
{
#if defined(LYRA2Z_16WAY)
gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_16way;
// gate->hash = (void*)&lyra2z_16way_hash;
#elif defined(LYRA2Z_8WAY)
gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_8way;
// gate->hash = (void*)&lyra2z_8way_hash;
#elif defined(LYRA2Z_4WAY)
gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_4way;
gate->hash = (void*)&lyra2z_4way_hash;
#else
gate->miner_thread_init = (void*)&lyra2z_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z;
gate->hash = (void*)&lyra2z_hash;
#endif
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
opt_target_factor = 256.0;
return true;
};


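register_lyra2z_algo only wires function pointers into the gate; the miner then calls whichever scanhash and thread_init were selected at compile time. A self-contained toy model of that gate pattern (names and signatures here are illustrative, the real algo_gate_t has many more members):

// Toy model of the algo-gate dispatch: registration fills in pointers,
// the worker only ever calls through the gate.
#include <stdbool.h>
#include <stdio.h>

typedef struct
{
   bool (*miner_thread_init)( void );
   int  (*scanhash)( unsigned max_nonce );
} toy_gate_t;

static bool toy_thread_init( void )       { puts( "allocate per-thread matrix" ); return true; }
static int  toy_scanhash( unsigned maxn ) { printf( "scan up to nonce %u\n", maxn ); return 0; }

static bool register_toy_algo( toy_gate_t *gate )
{
   gate->miner_thread_init = toy_thread_init;
   gate->scanhash          = toy_scanhash;
   return true;
}

int main(void)
{
   toy_gate_t gate;
   register_toy_algo( &gate );
   if ( gate.miner_thread_init() ) gate.scanhash( 100000 );
   return 0;
}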
@@ -1,84 +0,0 @@
#include <memory.h>
#include <mm_malloc.h>
#include "lyra2-gate.h"

#if !( defined(LYRA2Z_16WAY) || defined(LYRA2Z_8WAY) || defined(LYRA2Z_4WAY) )

#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "simd-utils.h"

__thread uint64_t* lyra2z_matrix;

bool lyra2z_thread_init()
{
// const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
// int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
const int i = BLOCK_LEN_INT64 * 8 * 8 * 8;
lyra2z_matrix = _mm_malloc( i, 64 );
return lyra2z_matrix;
}

static __thread sph_blake256_context lyra2z_blake_mid;

void lyra2z_midstate( const void* input )
{
sph_blake256_init( &lyra2z_blake_mid );
sph_blake256( &lyra2z_blake_mid, input, 64 );
}

// block 2050 new algo, blake plus new lyra parms. new input
// is power of 2 so normal lyra can be used
//void zcoin_hash(void *state, const void *input, uint32_t height)
void lyra2z_hash( void *state, const void *input )
{
uint32_t _ALIGN(64) hash[16];

sph_blake256_context ctx_blake __attribute__ ((aligned (64)));

memcpy( &ctx_blake, &lyra2z_blake_mid, sizeof lyra2z_blake_mid );
sph_blake256( &ctx_blake, input + 64, 16 );
sph_blake256_close( &ctx_blake, hash );

LYRA2Z( lyra2z_matrix, hash, 32, hash, 32, hash, 32, 8, 8, 8);

memcpy(state, hash, 32);
}

int scanhash_lyra2z( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) hash[8];
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
int thr_id = mythr->id;

if (opt_benchmark)
ptarget[7] = 0x0000ff;

for (int i=0; i < 19; i++) {
be32enc(&endiandata[i], pdata[i]);
}

lyra2z_midstate( endiandata );

do {
be32enc(&endiandata[19], nonce);
lyra2z_hash( hash, endiandata );

if ( valid_hash( hash, ptarget ) && !opt_benchmark )
{
pdata[19] = nonce;
submit_solution( work, hash, mythr );
}
nonce++;
} while ( nonce < max_nonce && !work_restart[thr_id].restart );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
#endif
@@ -2,7 +2,6 @@
#include "algo-gate-api.h"
#include "lyra2.h"
#include "simd-utils.h"
#include <mm_malloc.h>

static __thread uint64_t* lyra2z330_wholeMatrix;

@@ -62,14 +61,14 @@ bool lyra2z330_thread_init()
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;

int i = (int64_t)ROW_LEN_BYTES * 330; // nRows;
lyra2z330_wholeMatrix = _mm_malloc( i, 64 );
lyra2z330_wholeMatrix = mm_malloc( i, 64 );

return lyra2z330_wholeMatrix;
}

bool register_lyra2z330_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | NEON_OPT;
gate->miner_thread_init = (void*)&lyra2z330_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z330;
gate->hash = (void*)&lyra2z330_hash;

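lyra2z330 sizes its per-thread matrix as ROW_LEN_BYTES * 330 rows. A rough order-of-magnitude check, assuming BLOCK_LEN_INT64 = 12 and the 256 columns lyra2z330 normally uses (neither constant appears in this hunk):

ROW_LEN_INT64 = 12 * 256    =  3072 uint64 per row
ROW_LEN_BYTES = 3072 * 8    = 24576 bytes per row
matrix        = 24576 * 330 ≈ 7.7 MiB per mining thread

so this matrix runs to several megabytes per thread, versus about 6 KiB for lyra2z.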
@@ -32,7 +32,7 @@
inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
{
const int len_m256i = len / 32;
const int fullBlocks = len_m256i / BLOCK_LEN_M256I;
const int fullBlocks = len_m256i / BLOCK_LEN_256;
__m512i* state = (__m512i*)State;
__m512i* out = (__m512i*)Out;
int i;
@@ -40,12 +40,12 @@ inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
//Squeezes full blocks
for ( i = 0; i < fullBlocks; i++ )
{
memcpy_512( out, state, BLOCK_LEN_M256I );
memcpy_512( out, state, BLOCK_LEN_256 );
LYRA_ROUND_2WAY_AVX512( state[0], state[1], state[2], state[3] );
out += BLOCK_LEN_M256I;
out += BLOCK_LEN_256;
}
//Squeezes remaining bytes
memcpy_512( out, state, len_m256i % BLOCK_LEN_M256I );
memcpy_512( out, state, len_m256i % BLOCK_LEN_256 );
}

inline void absorbBlock_2way( uint64_t *State, const uint64_t *In0,
@@ -116,7 +116,7 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,


register __m512i state0, state1, state2, state3;
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_256 );

state0 = _mm512_load_si512( (__m512i*)State );
state1 = _mm512_load_si512( (__m512i*)State + 1 );
@@ -139,7 +139,7 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
out[2] = state2;

//Goes to next block (column) that will receive the squeezed data
out -= BLOCK_LEN_M256I;
out -= BLOCK_LEN_256;

LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
}
@@ -157,7 +157,7 @@ inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
int i;
register __m512i state0, state1, state2, state3;
__m512i *in = (__m512i*)rowIn;
__m512i *out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
__m512i *out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_256 );

state0 = _mm512_load_si512( (__m512i*)State );
state1 = _mm512_load_si512( (__m512i*)State + 1 );
@@ -177,9 +177,9 @@ inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
out[2] = _mm512_xor_si512( state2, in[2] );

//Input: next column (i.e., next block in sequence)
in += BLOCK_LEN_M256I;
in += BLOCK_LEN_256;
//Output: goes to previous column
out -= BLOCK_LEN_M256I;
out -= BLOCK_LEN_256;
}

_mm512_store_si512( (__m512i*)State, state0 );
@@ -195,7 +195,7 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
register __m512i state0, state1, state2, state3;
__m512i* in = (__m512i*)rowIn;
__m512i* inout = (__m512i*)rowInOut;
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_256 );

state0 = _mm512_load_si512( (__m512i*)State );
state1 = _mm512_load_si512( (__m512i*)State + 1 );
@@ -234,10 +234,10 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
}

//Inputs: next column (i.e., next block in sequence)
in += BLOCK_LEN_M256I;
inout += BLOCK_LEN_M256I;
in += BLOCK_LEN_256;
inout += BLOCK_LEN_256;
//Output: goes to previous column
out -= BLOCK_LEN_M256I;
out -= BLOCK_LEN_256;
}

_mm512_store_si512( (__m512i*)State, state0 );
@@ -336,10 +336,10 @@ static inline void reducedDuplexRow_2way_normal( uint64_t *State,
_mm512_mask_store_epi64( inout1 +2, 0xf0, io2 );

//Goes to next block
in += BLOCK_LEN_M256I;
inout0 += BLOCK_LEN_M256I;
inout1 += BLOCK_LEN_M256I;
out += BLOCK_LEN_M256I;
in += BLOCK_LEN_256;
inout0 += BLOCK_LEN_256;
inout1 += BLOCK_LEN_256;
out += BLOCK_LEN_256;
}

_mm512_store_si512( (__m512i*)State, state0 );
@@ -458,10 +458,10 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
_mm512_mask_store_epi64( inout1 +2, 0xf0, io.v512[2] );
*/
//Goes to next block
in += BLOCK_LEN_M256I;
inout0 += BLOCK_LEN_M256I;
inout1 += BLOCK_LEN_M256I;
out += BLOCK_LEN_M256I;
in += BLOCK_LEN_256;
inout0 += BLOCK_LEN_256;
inout1 += BLOCK_LEN_256;
out += BLOCK_LEN_256;
}

_mm512_store_si512( (__m512i*)State, state0 );
@@ -550,10 +550,10 @@ static inline void reducedDuplexRow_2way_overlap_X( uint64_t *State,
inout1[5] = inout.v256[5];

//Goes to next block
in += BLOCK_LEN_M256I;
inout0 += BLOCK_LEN_M256I * 2;
inout1 += BLOCK_LEN_M256I * 2;
out += BLOCK_LEN_M256I;
in += BLOCK_LEN_256;
inout0 += BLOCK_LEN_256 * 2;
inout1 += BLOCK_LEN_256 * 2;
out += BLOCK_LEN_256;
}

_mm512_store_si512( (__m512i*)State, state0 );
@@ -610,9 +610,9 @@ static inline void reducedDuplexRow_2way_unified( uint64_t *State,
}

//Goes to next block
in += BLOCK_LEN_M256I;
inout += BLOCK_LEN_M256I;
out += BLOCK_LEN_M256I;
in += BLOCK_LEN_256;
inout += BLOCK_LEN_256;
out += BLOCK_LEN_256;
}

_mm512_store_si512( (__m512i*)State, state0 );

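The BLOCK_LEN_M256I to BLOCK_LEN_256 substitutions in these sponge routines are a pure rename; the column walk itself is unchanged. For a concrete picture, again assuming BLOCK_LEN_INT64 = 12 and nCols = 8 (not shown in these hunks):

2-way block = 12 uint64 x 2 lanes = 192 bytes = 3 __m512i   (BLOCK_LEN_256 = 3)
out starts at rowOut + (8-1) * 3 = rowOut + 21              // last column, filled in reverse
per column:   in / inout advance by 3, out steps back by 3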
File diff suppressed because it is too large
@@ -97,11 +97,11 @@ static const uint64_t blake2b_IV[8] =

#define G_4X64(a,b,c,d) \
a = _mm256_add_epi64( a, b ); \
d = mm256_swap64_32( _mm256_xor_si256( d, a ) ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_shuflr64_24( _mm256_xor_si256( b, c ) ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 24 ); \
a = _mm256_add_epi64( a, b ); \
d = mm256_shuflr64_16( _mm256_xor_si256( d, a ) ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 63 );

@@ -144,38 +144,38 @@ static const uint64_t blake2b_IV[8] =

#endif

#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__ARM_NEON)

// process 2 columns in parallel
// returns void, all args updated
#define G_2X64(a,b,c,d) \
a = _mm_add_epi64( a, b ); \
d = mm128_swap64_32( _mm_xor_si128( d, a) ); \
c = _mm_add_epi64( c, d ); \
b = mm128_shuflr64_24( _mm_xor_si128( b, c ) ); \
a = _mm_add_epi64( a, b ); \
d = mm128_shuflr64_16( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi64( c, d ); \
b = mm128_ror_64( _mm_xor_si128( b, c ), 63 );
a = v128_add64( a, b ); \
d = v128_ror64( v128_xor( d, a), 32 ); \
c = v128_add64( c, d ); \
b = v128_ror64( v128_xor( b, c ), 24 ); \
a = v128_add64( a, b ); \
d = v128_ror64( v128_xor( d, a ), 16 ); \
c = v128_add64( c, d ); \
b = v128_ror64( v128_xor( b, c ), 63 );

#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
{ \
__m128i t; \
v128u64_t t; \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
t = mm128_alignr_64( s7, s6, 1 ); \
s6 = mm128_alignr_64( s6, s7, 1 ); \
t = v128_alignr64( s7, s6, 1 ); \
s6 = v128_alignr64( s6, s7, 1 ); \
s7 = t; \
t = mm128_alignr_64( s2, s3, 1 ); \
s2 = mm128_alignr_64( s3, s2, 1 ); \
t = v128_alignr64( s2, s3, 1 ); \
s2 = v128_alignr64( s3, s2, 1 ); \
s3 = t; \
G_2X64( s0, s2, s5, s6 ); \
G_2X64( s1, s3, s4, s7 ); \
t = mm128_alignr_64( s6, s7, 1 ); \
s6 = mm128_alignr_64( s7, s6, 1 ); \
t = v128_alignr64( s6, s7, 1 ); \
s6 = v128_alignr64( s7, s6, 1 ); \
s7 = t; \
t = mm128_alignr_64( s3, s2, 1 ); \
s2 = mm128_alignr_64( s2, s3, 1 ); \
t = v128_alignr64( s3, s2, 1 ); \
s2 = v128_alignr64( s2, s3, 1 ); \
s3 = t; \
}

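The G_2X64 rewrite above replaces the SSE-specific byte-shuffle rotates (mm128_swap64_32, mm128_shuflr64_24/16) with generic v128_ror64 calls so the same macro body builds for both SSE2 and NEON. The wrapper's definition lives in the project's simd-utils headers, not in this diff; a minimal sketch of how such a 64-bit lane rotate could be expressed on each target (hypothetical _sketch name, and the real version may special-case counts such as 32 or 24 with cheaper shuffles):

// Sketch: rotate each 64-bit lane right by a compile-time constant count.
#if defined(__ARM_NEON)
  #include <arm_neon.h>
  #define v128_ror64_sketch( v, c ) \
     vorrq_u64( vshrq_n_u64( (v), (c) ), vshlq_n_u64( (v), 64 - (c) ) )
#elif defined(__SSE2__)
  #include <emmintrin.h>
  #define v128_ror64_sketch( v, c ) \
     _mm_or_si128( _mm_srli_epi64( (v), (c) ), _mm_slli_epi64( (v), 64 - (c) ) )
#endif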
@@ -195,34 +195,31 @@ static const uint64_t blake2b_IV[8] =

#endif // AVX2 else SSE2


// Scalar, not used.

static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
return ( w >> c ) | ( w << ( 64 - c ) );
}

#define G(r,i,a,b,c,d) \
do { \
#define G( r, i, a, b, c, d ) \
{ \
a = a + b; \
d = rotr64(d ^ a, 32); \
d = ror64( (d) ^ (a), 32 ); \
c = c + d; \
b = rotr64(b ^ c, 24); \
b = ror64( (b) ^ (c), 24 ); \
a = a + b; \
d = rotr64(d ^ a, 16); \
d = ror64( (d) ^ (a), 16 ); \
c = c + d; \
b = rotr64(b ^ c, 63); \
} while(0)
b = ror64( (b) ^ (c), 63 ); \
}

#define ROUND_LYRA(r) \
G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
G(r,2,v[ 2],v[ 6],v[10],v[14]); \
G(r,3,v[ 3],v[ 7],v[11],v[15]); \
G(r,4,v[ 0],v[ 5],v[10],v[15]); \
G(r,5,v[ 1],v[ 6],v[11],v[12]); \
G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
G( r, 0, v[ 0], v[ 4], v[ 8], v[12] ); \
G( r, 1, v[ 1], v[ 5], v[ 9], v[13] ); \
G( r, 2, v[ 2], v[ 6], v[10], v[14] ); \
G( r, 3, v[ 3], v[ 7], v[11], v[15] ); \
G( r, 4, v[ 0], v[ 5], v[10], v[15] ); \
G( r, 5, v[ 1], v[ 6], v[11], v[12] ); \
G( r, 6, v[ 2], v[ 7], v[ 8], v[13] ); \
G( r, 7, v[ 3], v[ 4], v[ 9], v[14] );


#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

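In the scalar path the local rotr64 helper gives way to a shared ror64, and with it G(r,i,a,b,c,d) is the standard Blake2b quarter-round with rotation counts 32, 24, 16 and 63. A presumed equivalent of that helper, for reference only (the actual definition sits in the project's utility headers, outside this diff):

// Presumed equivalent of ror64: plain rotate-right of a 64-bit word.
static inline uint64_t ror64( uint64_t x, unsigned c )
{
   return ( x >> c ) | ( x << ( 64 - c ) );
}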
Some files were not shown because too many files have changed in this diff.