v3.11.3

2025-09-17 23:44:27 +00:00 · 2020-01-10 20:37:47 -05:00
64 changed files with 1475 additions and 1801 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -89,6 +89,7 @@ cpuminer_SOURCES = \
  algo/gost/sph_gost.c \
  algo/groestl/groestl-gate.c \
  algo/groestl/groestl512-hash-4way.c \
+  algo/groestl/groestl256-hash-4way.c \
  algo/groestl/sph_groestl.c \
  algo/groestl/groestl.c \
  algo/groestl/groestl-4way.c \
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -36,6 +36,14 @@ are not supported. FreeBSD YMMV.
 Change Log
 ----------

+v3.11.3
+
+Fixed x12 AVX2 again.
+
+More speed for allium: AVX2 +4%, AVX512 +6%, VAES +14%.
+
+Restored lost speed for x22i & x25x.
+
 v3.11.2

 Fixed x11gost (sib) AVX2 invalid shares.
--- a/algo/blake/blake2b-4way.c
+++ b/algo/blake/blake2b-4way.c
@@ -39,7 +39,7 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
      blake2b_8way_final( &ctx, hash );

      for ( int lane = 0; lane < 8; lane++ )
-      if ( hash7[ lane<<1 ] < Htarg )
+      if ( hash7[ lane<<1 ] <= Htarg )
      {
          extr_lane_8x64( lane_hash, hash, lane, 256 );
          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
@@ -94,7 +94,7 @@ int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
      blake2b_4way_final( &ctx, hash );

      for ( int lane = 0; lane < 4; lane++ )
-      if ( hash7[ lane<<1 ] < Htarg )
+      if ( hash7[ lane<<1 ] <= Htarg )
      {
          extr_lane_4x64( lane_hash, hash, lane, 256 );
          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
--- a/algo/blake/blake2b.c
+++ b/algo/blake/blake2b.c
@@ -45,7 +45,7 @@ int scanhash_blake2b( struct work *work, uint32_t max_nonce,
 		be32enc(&endiandata[19], n);
 		blake2b_hash(vhashcpu, endiandata);

-		if (vhashcpu[7] < Htarg && fulltest(vhashcpu, ptarget))
+		if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget))
      {
 			pdata[19] = n;
         submit_solution( work, vhashcpu, mythr );
--- a/algo/blake/blake2s.c
+++ b/algo/blake/blake2s.c
@@ -56,7 +56,7 @@ int scanhash_blake2s( struct work *work,
 	do {
 		be32enc(&endiandata[19], n);
 		blake2s_hash( hash64, endiandata );
-		if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
+		if (hash64[7] <= Htarg && fulltest(hash64, ptarget)) {
 			*hashes_done = n - first_nonce + 1;
 			pdata[19] = n;
 			return true;
--- a/algo/bmw/bmw512-4way.c
+++ b/algo/bmw/bmw512-4way.c
@@ -40,7 +40,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
      bmw512hash_8way( hash, vdata );

      for ( int lane = 0; lane < 8; lane++ )
-      if ( unlikely( hash7[ lane<<1 ] < Htarg ) )
+      if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
      {
          extr_lane_8x64( lane_hash, hash, lane, 256 );
          if ( fulltest( lane_hash, ptarget ) )
@@ -93,8 +93,7 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
      bmw512hash_4way( hash, vdata );

      for ( int lane = 0; lane < 4; lane++ )
-      if ( unlikely( hash7[ lane<<1 ] < Htarg ) )
-//      if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
+      if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
      {
          extr_lane_4x64( lane_hash, hash, lane, 256 );
          if ( fulltest( lane_hash, ptarget ) )
--- a/algo/echo/aes_ni/hash.c
+++ b/algo/echo/aes_ni/hash.c
@@ -7,7 +7,6 @@
 * - implements NIST hash api
 * - assumes that message lenght is multiple of 8-bits
 * - _ECHO_VPERM_ must be defined if compiling with ../main.c
- * -  define NO_AES_NI for aes_ni version
 *
 * Cagdas Calik
 * ccalik@metu.edu.tr
@@ -21,13 +20,6 @@
 #include "hash_api.h"
 //#include "vperm.h"
 #include <immintrin.h>
-/*
-#ifndef NO_AES_NI
-#include <wmmintrin.h>
-#else
-#include <tmmintrin.h>
-#endif
-*/

 MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
 MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
--- a/algo/echo/aes_ni/hash.c.test
+++ b/algo/echo/aes_ni/hash.c.test
@@ -1,620 +0,0 @@
-/*
- * file        : echo_vperm.c
- * version     : 1.0.208
- * date        : 14.12.2010
- * 
- * - vperm and aes_ni implementations of hash function ECHO
- * - implements NIST hash api
- * - assumes that message lenght is multiple of 8-bits
- * - _ECHO_VPERM_ must be defined if compiling with ../main.c
- * -  define NO_AES_NI for aes_ni version
- *
- * Cagdas Calik
- * ccalik@metu.edu.tr
- * Institute of Applied Mathematics, Middle East Technical University, Turkey.
- *
- */
-#if defined(__AES__)
-
-#include <memory.h>
-#include "miner.h"
-#include "hash_api.h"
-//#include "vperm.h"
-#include <immintrin.h>
-/*
-#ifndef NO_AES_NI
-#include <wmmintrin.h>
-#else
-#include <tmmintrin.h>
-#endif
-*/
-
-MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
-MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
-MYALIGN const unsigned int _k_opt[] = {0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121, 0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1};
-MYALIGN const unsigned int _k_inv[] = {0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309, 0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C};
-MYALIGN const unsigned int _k_sb1[] = {0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E, 0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1};
-MYALIGN const unsigned int _k_sb2[] = {0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955, 0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8};
-MYALIGN const unsigned int _k_sb3[] = {0xC0211A00, 0x53E17249, 0xA8B2DA89, 0xFB68933B, 0xF0030A00, 0x5FF35C55, 0xA6ACFAA5, 0xF956AF09};
-MYALIGN const unsigned int _k_sb4[] = {0x3FD64100, 0xE1E937A0, 0x49087E9F, 0xA876DE97, 0xC393EA00, 0x3D50AED7, 0x876D2914, 0xBA44FE79};
-MYALIGN const unsigned int _k_sb5[] = {0xF4867F00, 0x5072D62F, 0x5D228BDB, 0x0DA9A4F9, 0x3971C900, 0x0B487AC2, 0x8A43F0FB, 0x81B332B8};
-MYALIGN const unsigned int _k_sb7[] = {0xFFF75B00, 0xB20845E9, 0xE1BAA416, 0x531E4DAC, 0x3390E000, 0x62A3F282, 0x21C1D3B1, 0x43125170};
-MYALIGN const unsigned int _k_sbo[] = {0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A, 0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1};
-MYALIGN const unsigned int _k_h63[] = {0x63636363, 0x63636363, 0x63636363, 0x63636363};
-MYALIGN const unsigned int _k_hc6[] = {0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6};
-MYALIGN const unsigned int _k_h5b[] = {0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b};
-MYALIGN const unsigned int _k_h4e[] = {0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e};
-MYALIGN const unsigned int _k_h0e[] = {0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e};
-MYALIGN const unsigned int _k_h15[] = {0x15151515, 0x15151515, 0x15151515, 0x15151515};
-MYALIGN const unsigned int _k_aesmix1[] = {0x0f0a0500, 0x030e0904, 0x07020d08, 0x0b06010c};
-MYALIGN const unsigned int _k_aesmix2[] = {0x000f0a05, 0x04030e09, 0x0807020d, 0x0c0b0601};
-MYALIGN const unsigned int _k_aesmix3[] = {0x05000f0a, 0x0904030e, 0x0d080702, 0x010c0b06};
-MYALIGN const unsigned int _k_aesmix4[] = {0x0a05000f, 0x0e090403, 0x020d0807, 0x06010c0b};
-
-
-MYALIGN const unsigned int 	const1[]		= {0x00000001, 0x00000000, 0x00000000, 0x00000000};
-MYALIGN const unsigned int	mul2mask[]		= {0x00001b00, 0x00000000, 0x00000000, 0x00000000};
-MYALIGN const unsigned int	lsbmask[]		= {0x01010101, 0x01010101, 0x01010101, 0x01010101};
-MYALIGN const unsigned int	invshiftrows[]	= {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
-MYALIGN const unsigned int	zero[]			= {0x00000000, 0x00000000, 0x00000000, 0x00000000};
-MYALIGN const unsigned int	mul2ipt[]		= {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234};
-
-
-#define ECHO_SUBBYTES(state, i, j) \
-	state[i][j] = _mm_aesenc_si128(state[i][j], k1);\
-	state[i][j] = _mm_aesenc_si128(state[i][j], M128(zero));\
-	k1 = _mm_add_epi32(k1, M128(const1))
-
-#define ECHO_MIXBYTES(state1, state2, j, t1, t2, s2) \
-	s2 = _mm_add_epi8(state1[0][j], state1[0][j]);\
-	t1 = _mm_srli_epi16(state1[0][j], 7);\
-	t1 = _mm_and_si128(t1, M128(lsbmask));\
-	t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
-	s2 = _mm_xor_si128(s2, t2);\
-	state2[0][j] = s2;\
-	state2[1][j] = state1[0][j];\
-	state2[2][j] = state1[0][j];\
-	state2[3][j] = _mm_xor_si128(s2, state1[0][j]);\
-	s2 = _mm_add_epi8(state1[1][(j + 1) & 3], state1[1][(j + 1) & 3]);\
-	t1 = _mm_srli_epi16(state1[1][(j + 1) & 3], 7);\
-	t1 = _mm_and_si128(t1, M128(lsbmask));\
-	t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
-	s2 = _mm_xor_si128(s2, t2);\
-	state2[0][j] = _mm_xor_si128(state2[0][j], _mm_xor_si128(s2, state1[1][(j + 1) & 3]));\
-	state2[1][j] = _mm_xor_si128(state2[1][j], s2);\
-	state2[2][j] = _mm_xor_si128(state2[2][j], state1[1][(j + 1) & 3]);\
-	state2[3][j] = _mm_xor_si128(state2[3][j], state1[1][(j + 1) & 3]);\
-	s2 = _mm_add_epi8(state1[2][(j + 2) & 3], state1[2][(j + 2) & 3]);\
-	t1 = _mm_srli_epi16(state1[2][(j + 2) & 3], 7);\
-	t1 = _mm_and_si128(t1, M128(lsbmask));\
-	t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
-	s2 = _mm_xor_si128(s2, t2);\
-	state2[0][j] = _mm_xor_si128(state2[0][j], state1[2][(j + 2) & 3]);\
-	state2[1][j] = _mm_xor_si128(state2[1][j], _mm_xor_si128(s2, state1[2][(j + 2) & 3]));\
-	state2[2][j] = _mm_xor_si128(state2[2][j], s2);\
-	state2[3][j] = _mm_xor_si128(state2[3][j], state1[2][(j + 2) & 3]);\
-	s2 = _mm_add_epi8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\
-	t1 = _mm_srli_epi16(state1[3][(j + 3) & 3], 7);\
-	t1 = _mm_and_si128(t1, M128(lsbmask));\
-	t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
-	s2 = _mm_xor_si128(s2, t2);\
-	state2[0][j] = _mm_xor_si128(state2[0][j], state1[3][(j + 3) & 3]);\
-	state2[1][j] = _mm_xor_si128(state2[1][j], state1[3][(j + 3) & 3]);\
-	state2[2][j] = _mm_xor_si128(state2[2][j], _mm_xor_si128(s2, state1[3][(j + 3) & 3]));\
-	state2[3][j] = _mm_xor_si128(state2[3][j], s2)
-
-
-#define ECHO_ROUND_UNROLL2 \
-	ECHO_SUBBYTES(_state, 0, 0);\
-	ECHO_SUBBYTES(_state, 1, 0);\
-	ECHO_SUBBYTES(_state, 2, 0);\
-	ECHO_SUBBYTES(_state, 3, 0);\
-	ECHO_SUBBYTES(_state, 0, 1);\
-	ECHO_SUBBYTES(_state, 1, 1);\
-	ECHO_SUBBYTES(_state, 2, 1);\
-	ECHO_SUBBYTES(_state, 3, 1);\
-	ECHO_SUBBYTES(_state, 0, 2);\
-	ECHO_SUBBYTES(_state, 1, 2);\
-	ECHO_SUBBYTES(_state, 2, 2);\
-	ECHO_SUBBYTES(_state, 3, 2);\
-	ECHO_SUBBYTES(_state, 0, 3);\
-	ECHO_SUBBYTES(_state, 1, 3);\
-	ECHO_SUBBYTES(_state, 2, 3);\
-	ECHO_SUBBYTES(_state, 3, 3);\
-	ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
-	ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
-	ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
-	ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
-	ECHO_SUBBYTES(_state2, 0, 0);\
-	ECHO_SUBBYTES(_state2, 1, 0);\
-	ECHO_SUBBYTES(_state2, 2, 0);\
-	ECHO_SUBBYTES(_state2, 3, 0);\
-	ECHO_SUBBYTES(_state2, 0, 1);\
-	ECHO_SUBBYTES(_state2, 1, 1);\
-	ECHO_SUBBYTES(_state2, 2, 1);\
-	ECHO_SUBBYTES(_state2, 3, 1);\
-	ECHO_SUBBYTES(_state2, 0, 2);\
-	ECHO_SUBBYTES(_state2, 1, 2);\
-	ECHO_SUBBYTES(_state2, 2, 2);\
-	ECHO_SUBBYTES(_state2, 3, 2);\
-	ECHO_SUBBYTES(_state2, 0, 3);\
-	ECHO_SUBBYTES(_state2, 1, 3);\
-	ECHO_SUBBYTES(_state2, 2, 3);\
-	ECHO_SUBBYTES(_state2, 3, 3);\
-	ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
-	ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
-	ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
-	ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
-
-
-
-#define SAVESTATE(dst, src)\
-	dst[0][0] = src[0][0];\
-	dst[0][1] = src[0][1];\
-	dst[0][2] = src[0][2];\
-	dst[0][3] = src[0][3];\
-	dst[1][0] = src[1][0];\
-	dst[1][1] = src[1][1];\
-	dst[1][2] = src[1][2];\
-	dst[1][3] = src[1][3];\
-	dst[2][0] = src[2][0];\
-	dst[2][1] = src[2][1];\
-	dst[2][2] = src[2][2];\
-	dst[2][3] = src[2][3];\
-	dst[3][0] = src[3][0];\
-	dst[3][1] = src[3][1];\
-	dst[3][2] = src[3][2];\
-	dst[3][3] = src[3][3]
-
-
-void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
-{
-   unsigned int r, b, i, j;
-   __m128i t1, t2, s2, k1;
-   __m128i _state[4][4], _state2[4][4], _statebackup[4][4]; 
-
-   for(i = 0; i < 4; i++)
-	for(j = 0; j < ctx->uHashSize / 256; j++)
-		_state[i][j] = ctx->state[i][j];
-
-   for(b = 0; b < uBlockCount; b++)
-   {
-	ctx->k = _mm_add_epi64(ctx->k, ctx->const1536);
-
-	// load message
-	for(j = ctx->uHashSize / 256; j < 4; j++)
-	{
-	   for(i = 0; i < 4; i++)
-	   {
-		_state[i][j] = _mm_loadu_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i);
-	   }
-	}
-
-uint64_t *b = (uint64_t*)_state;
-//printf("Ss3: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
-   
-	// save state
-	SAVESTATE(_statebackup, _state);
-
-	k1 = ctx->k;
-
-	for(r = 0; r < ctx->uRounds / 2; r++)
-	{
-		ECHO_ROUND_UNROLL2;
-	}
-
-//printf("Ss4: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
-   
-   
-	if(ctx->uHashSize == 256)
-	{
-	   for(i = 0; i < 4; i++)
-	   {
-		_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]);
-		_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
-		_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]);
-		_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
-		_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]);
-		_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
-		_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][3]);
-	   }
-	}
-	else
-	{
-	   for(i = 0; i < 4; i++)
-	   {
-		_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
-		_state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]);
-		_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
-		_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
-		_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]);
-		_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]);
-           }
-	}
-	pmsg += ctx->uBlockLength;
-   }
-	SAVESTATE(ctx->state, _state);
-
-}
-
-
-
-HashReturn init_echo(hashState_echo *ctx, int nHashSize)
-{
-	int i, j;
-
-        ctx->k = _mm_setzero_si128(); 
-	ctx->processed_bits = 0;
-	ctx->uBufferBytes = 0;
-
-	switch(nHashSize)
-	{
-		case 256:
-			ctx->uHashSize = 256;
-			ctx->uBlockLength = 192;
-			ctx->uRounds = 8;
-			ctx->hashsize = _mm_set_epi32(0, 0, 0, 0x00000100);
-			ctx->const1536 = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000600);
-			break;
-
-		case 512:
-			ctx->uHashSize = 512;
-			ctx->uBlockLength = 128;
-			ctx->uRounds = 10;
-			ctx->hashsize = _mm_set_epi32(0, 0, 0, 0x00000200);
-			ctx->const1536 = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000400);
-			break;
-
-		default:
-			return BAD_HASHBITLEN;
-	}
-
-
-	for(i = 0; i < 4; i++)
-		for(j = 0; j < nHashSize / 256; j++)
-			ctx->state[i][j] = ctx->hashsize;
-
-	for(i = 0; i < 4; i++)
-		for(j = nHashSize / 256; j < 4; j++)
-			ctx->state[i][j] = _mm_set_epi32(0, 0, 0, 0);
-
-	return SUCCESS;
-}
-
-HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen)
-{
-	unsigned int uByteLength, uBlockCount, uRemainingBytes;
-
-	uByteLength = (unsigned int)(databitlen / 8);
-
-	if((state->uBufferBytes + uByteLength) >= state->uBlockLength)
-	{
-		if(state->uBufferBytes != 0)
-		{
-			// Fill the buffer
-			memcpy(state->buffer + state->uBufferBytes, (void*)data, state->uBlockLength - state->uBufferBytes);
-
-			// Process buffer
-			Compress(state, state->buffer, 1);
-			state->processed_bits += state->uBlockLength * 8;
-
-			data += state->uBlockLength - state->uBufferBytes;
-			uByteLength -= state->uBlockLength - state->uBufferBytes;
-		}
-
-		// buffer now does not contain any unprocessed bytes
-
-		uBlockCount = uByteLength / state->uBlockLength;
-		uRemainingBytes = uByteLength % state->uBlockLength;
-
-		if(uBlockCount > 0)
-		{
-			Compress(state, data, uBlockCount);
-
-			state->processed_bits += uBlockCount * state->uBlockLength * 8;
-			data += uBlockCount * state->uBlockLength;
-		}
-
-		if(uRemainingBytes > 0)
-		{
-			memcpy(state->buffer, (void*)data, uRemainingBytes);
-		}
-
-		state->uBufferBytes = uRemainingBytes;
-	}
-	else
-	{
-		memcpy(state->buffer + state->uBufferBytes, (void*)data, uByteLength);
-		state->uBufferBytes += uByteLength;
-	}
-
-	return SUCCESS;
-}
-
-HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
-{
-	__m128i remainingbits;
-
-	// Add remaining bytes in the buffer
-	state->processed_bits += state->uBufferBytes * 8;
-
-	remainingbits = _mm_set_epi32(0, 0, 0, state->uBufferBytes * 8);
-
-	// Pad with 0x80
-	state->buffer[state->uBufferBytes++] = 0x80;
-	
-	// Enough buffer space for padding in this block?
-	if((state->uBlockLength - state->uBufferBytes) >= 18)
-	{
-		// Pad with zeros
-		memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18));
-
-		// Hash size
-		*((unsigned short*)(state->buffer + state->uBlockLength - 18)) = state->uHashSize;
-
-		// Processed bits
-		*((DataLength*)(state->buffer + state->uBlockLength - 16)) = state->processed_bits;
-		*((DataLength*)(state->buffer + state->uBlockLength - 8)) = 0;
-
-		// Last block contains message bits?
-		if(state->uBufferBytes == 1)
-		{
-			state->k = _mm_xor_si128(state->k, state->k);
-			state->k = _mm_sub_epi64(state->k, state->const1536);
-		}
-		else
-		{
-			state->k = _mm_add_epi64(state->k, remainingbits);
-			state->k = _mm_sub_epi64(state->k, state->const1536);
-		}
-
-		// Compress
-		Compress(state, state->buffer, 1);
-	}
-	else
-	{
-		// Fill with zero and compress
-		memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - state->uBufferBytes);
-		state->k = _mm_add_epi64(state->k, remainingbits);
-		state->k = _mm_sub_epi64(state->k, state->const1536);
-		Compress(state, state->buffer, 1);
-
-		// Last block
-		memset(state->buffer, 0, state->uBlockLength - 18);
-
-		// Hash size
-		*((unsigned short*)(state->buffer + state->uBlockLength - 18)) = state->uHashSize;
-
-		// Processed bits
-		*((DataLength*)(state->buffer + state->uBlockLength - 16)) = state->processed_bits;
-		*((DataLength*)(state->buffer + state->uBlockLength - 8)) = 0;
-
-		// Compress the last block
-		state->k = _mm_xor_si128(state->k, state->k);
-		state->k = _mm_sub_epi64(state->k, state->const1536);
-		Compress(state, state->buffer, 1);
-	}
-
-	// Store the hash value
-	_mm_storeu_si128((__m128i*)hashval + 0, state->state[0][0]);
-	_mm_storeu_si128((__m128i*)hashval + 1, state->state[1][0]);
-
-	if(state->uHashSize == 512)
-	{
-		_mm_storeu_si128((__m128i*)hashval + 2, state->state[2][0]);
-		_mm_storeu_si128((__m128i*)hashval + 3, state->state[3][0]);
-	}
-
-	return SUCCESS;
-}
-
-HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
-                              const BitSequence *data, DataLength databitlen )
-{
-   unsigned int uByteLength, uBlockCount, uRemainingBytes;
-
-   uByteLength = (unsigned int)(databitlen / 8);
-
-/*   
-   if( (state->uBufferBytes + uByteLength) >= state->uBlockLength )
-   {
-printf("full block\n");
-      if( state->uBufferBytes != 0 )
-        {
-           // Fill the buffer
-           memcpy( state->buffer + state->uBufferBytes,
-                   (void*)data, state->uBlockLength - state->uBufferBytes );
-
-           // Process buffer
-           Compress( state, state->buffer, 1 );
-           state->processed_bits += state->uBlockLength * 8;
-
-           data += state->uBlockLength - state->uBufferBytes;
-           uByteLength -= state->uBlockLength - state->uBufferBytes;
-        }
-
-        // buffer now does not contain any unprocessed bytes
-
-        uBlockCount = uByteLength / state->uBlockLength;
-        uRemainingBytes = uByteLength % state->uBlockLength;
-
-        if( uBlockCount > 0 )
-        {
-           Compress( state, data, uBlockCount );
-           state->processed_bits += uBlockCount * state->uBlockLength * 8;
-           data += uBlockCount * state->uBlockLength;
-        }
-
-        if( uRemainingBytes > 0 )
-        memcpy(state->buffer, (void*)data, uRemainingBytes);
-
-        state->uBufferBytes = uRemainingBytes;
-   }
-   else
-   {
-*/
-   memcpy( state->buffer + state->uBufferBytes, (void*)data, uByteLength );
-        state->uBufferBytes += uByteLength;
-//   }
-
-   __m128i remainingbits;
-
-   // Add remaining bytes in the buffer
-   state->processed_bits += state->uBufferBytes * 8;
-
-   remainingbits = _mm_set_epi32( 0, 0, 0, state->uBufferBytes * 8 );
-
-   // Pad with 0x80
-   state->buffer[state->uBufferBytes++] = 0x80;
-
-   // Enough buffer space for padding in this block?
-
-//   if( (state->uBlockLength - state->uBufferBytes) >= 18 )
-//   {
-        // Pad with zeros
-
-        memset( state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18) );
-
-        // Hash size
-        *( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) = state->uHashSize;
-
-        // Processed bits
-        *( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
-                   state->processed_bits;
-        *( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
-
-
-        // Last block contains message bits?
-        if( state->uBufferBytes == 1 )
-        {
-           state->k = _mm_xor_si128( state->k, state->k );
-           state->k = _mm_sub_epi64( state->k, state->const1536 );
-        }
-        else
-        {
-           state->k = _mm_add_epi64( state->k, remainingbits );
-           state->k = _mm_sub_epi64( state->k, state->const1536 );
-        }
-
-uint64_t *b = (uint64_t*)&state->k;
-/*
-printf("Sk: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
-b = (uint64_t*)state->buffer;
-printf("Sb: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
-printf("Sb: %016lx %016lx %016lx %016lx\n",b[4],b[5],b[6],b[7]);
-printf("Sb: %016lx %016lx %016lx %016lx\n",b[8],b[9],b[10],b[11]);
-printf("Sb: %016lx %016lx %016lx %016lx\n",b[12],b[13],b[14],b[15]);
-
-b = (uint64_t*)state->state;
-printf("Ss1: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
-printf("Ss1: %016lx %016lx %016lx %016lx\n",b[4],b[5],b[6],b[7]);
-printf("Ss1: %016lx %016lx %016lx %016lx\n",b[8],b[9],b[10],b[11]);
-printf("Ss1: %016lx %016lx %016lx %016lx\n",b[12],b[13],b[14],b[15]);
-*/        
-        // Compress
-        Compress( state, state->buffer, 1 );
-
-//printf("Ss2: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
-
-        
-/*
-   }
-   else
-   {
-        // Fill with zero and compress
-        memset( state->buffer + state->uBufferBytes, 0,
-                state->uBlockLength - state->uBufferBytes );
-        state->k = _mm_add_epi64( state->k, remainingbits );
-        state->k = _mm_sub_epi64( state->k, state->const1536 );
-        Compress( state, state->buffer, 1 );
-
-        // Last block
-        memset( state->buffer, 0, state->uBlockLength - 18 );
-
-        // Hash size
-        *( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) =
-                 state->uHashSize;
-
-        // Processed bits
-        *( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
-                   state->processed_bits;
-        *( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
-        // Compress the last block
-        state->k = _mm_xor_si128( state->k, state->k );
-        state->k = _mm_sub_epi64( state->k, state->const1536 );
-        Compress( state, state->buffer, 1) ;
-   }
-*/
-
-   // Store the hash value
-   _mm_storeu_si128( (__m128i*)hashval + 0, state->state[0][0] );
-   _mm_storeu_si128( (__m128i*)hashval + 1, state->state[1][0] );
-
-   if( state->uHashSize == 512 )
-   {
-        _mm_storeu_si128( (__m128i*)hashval + 2, state->state[2][0] );
-        _mm_storeu_si128( (__m128i*)hashval + 3, state->state[3][0] );
-
-   }
-   return SUCCESS;
-}
-
-
-HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
-{
-	HashReturn hRet;
-	hashState_echo hs;
-
-	/////
-	/*
-	__m128i a, b, c, d, t[4], u[4], v[4];
-
-	a = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100);
-	b = _mm_set_epi32(0x1f1e1d1c, 0x1b1a1918, 0x17161514, 0x13121110);
-	c = _mm_set_epi32(0x2f2e2d2c, 0x2b2a2928, 0x27262524, 0x23222120);
-	d = _mm_set_epi32(0x3f3e3d3c, 0x3b3a3938, 0x37363534, 0x33323130);
-
-	t[0] = _mm_unpacklo_epi8(a, b);
-	t[1] = _mm_unpackhi_epi8(a, b);
-	t[2] = _mm_unpacklo_epi8(c, d);
-	t[3] = _mm_unpackhi_epi8(c, d);
-
-	u[0] = _mm_unpacklo_epi16(t[0], t[2]);
-	u[1] = _mm_unpackhi_epi16(t[0], t[2]);
-	u[2] = _mm_unpacklo_epi16(t[1], t[3]);
-	u[3] = _mm_unpackhi_epi16(t[1], t[3]);
-
-
-	t[0] = _mm_unpacklo_epi16(u[0], u[1]);
-	t[1] = _mm_unpackhi_epi16(u[0], u[1]);
-	t[2] = _mm_unpacklo_epi16(u[2], u[3]);
-	t[3] = _mm_unpackhi_epi16(u[2], u[3]);
-
-	u[0] = _mm_unpacklo_epi8(t[0], t[1]);
-	u[1] = _mm_unpackhi_epi8(t[0], t[1]);
-	u[2] = _mm_unpacklo_epi8(t[2], t[3]);
-	u[3] = _mm_unpackhi_epi8(t[2], t[3]);
-
-	a = _mm_unpacklo_epi8(u[0], u[1]);
-	b = _mm_unpackhi_epi8(u[0], u[1]);
-	c = _mm_unpacklo_epi8(u[2], u[3]);
-	d = _mm_unpackhi_epi8(u[2], u[3]);
-	*/
-	/////
-
-	hRet = init_echo(&hs, hashbitlen);
-	if(hRet != SUCCESS)
-		return hRet;
-
-	hRet = update_echo(&hs, data, databitlen);
-	if(hRet != SUCCESS)
-		return hRet;
-
-	hRet = final_echo(&hs, hashval);
-	if(hRet != SUCCESS)
-		return hRet;
-
-	return SUCCESS;
-}
-
-#endif
--- a/algo/echo/aes_ni/hash_api.h
+++ b/algo/echo/aes_ni/hash_api.h
@@ -15,7 +15,7 @@
 #ifndef HASH_API_H
 #define HASH_API_H

-#ifndef NO_AES_NI
+#ifdef __AES__
 #define HASH_IMPL_STR	"ECHO-aesni"
 #else
 #define HASH_IMPL_STR	"ECHO-vperm"
--- a/algo/groestl/aes_ni/groestl-version.h
+++ b/algo/groestl/aes_ni/groestl-version.h
@@ -2,13 +2,6 @@
 //#define TASM
 #define TINTR

-//#define AES_NI
-
-//#ifdef AES_NI
-// specify AES-NI, AVX (with AES-NI) or vector-permute implementation
-
-//#ifndef NO_AES_NI
-
 // Not to be confused with AVX512VAES
 #define VAES
 // #define VAVX
--- a/algo/groestl/aes_ni/hash-groestl.c
+++ b/algo/groestl/aes_ni/hash-groestl.c
@@ -14,7 +14,7 @@
 #include "miner.h"
 #include "simd-utils.h"

-#ifndef NO_AES_NI
+#ifdef __AES__

 #include "groestl-version.h"

@@ -91,8 +91,9 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx )
     ctx->chaining[i] = _mm_setzero_si128();
     ctx->buffer[i]   = _mm_setzero_si128();
  }
-  ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
-  INIT(ctx->chaining);
+  ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
+//  ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
+//  INIT(ctx->chaining);
  ctx->buf_ptr = 0;
  ctx->rem_ptr = 0;

--- a/algo/groestl/aes_ni/hash-groestl256.c
+++ b/algo/groestl/aes_ni/hash-groestl256.c
@@ -11,7 +11,7 @@
 #include "miner.h"
 #include "simd-utils.h"

-#ifndef NO_AES_NI
+#ifdef __AES__

 #include "groestl-version.h"

@@ -86,8 +86,11 @@ HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
     ctx->chaining[i] = _mm_setzero_si128();
     ctx->buffer[i]   = _mm_setzero_si128();
  }
-  ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
-  INIT256(ctx->chaining);
+
+  ctx->chaining[ 3 ] = m128_const_64( 0, 0x0100000000000000 );
+
+//  ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
+//  INIT256(ctx->chaining);
  ctx->buf_ptr = 0;
  ctx->rem_ptr = 0;

--- a/algo/groestl/aes_ni/hash-groestl256.h
+++ b/algo/groestl/aes_ni/hash-groestl256.h
@@ -93,9 +93,6 @@ typedef enum
 typedef struct {
  __attribute__ ((aligned (32))) __m128i chaining[SIZE256];
  __attribute__ ((aligned (32))) __m128i buffer[SIZE256];
-//  __attribute__ ((aligned (32))) u64 chaining[SIZE/8];      /* actual state */
-//  __attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE];  /* data buffer */
-//  u64 block_counter;        /* message block counter */
  int hashlen;              // bytes
  int blk_count;
  int buf_ptr;              /* data buffer pointer */
--- a/algo/groestl/groestl-4way.c
+++ b/algo/groestl/groestl-4way.c
@@ -49,7 +49,7 @@ int scanhash_groestl_4way( struct work *work, uint32_t max_nonce,
        pdata[19] = n;

        for ( int lane = 0; lane < 4; lane++ )
-        if ( ( hash+(lane<<3) )[7] < Htarg )
+        if ( ( hash+(lane<<3) )[7] <= Htarg )
        if ( fulltest( hash+(lane<<3), ptarget) && !opt_benchmark )
        {
           pdata[19] = n + lane;
--- a/algo/groestl/groestl.c
+++ b/algo/groestl/groestl.c
@@ -3,19 +3,18 @@
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
-
-#ifdef NO_AES_NI
-  #include "sph_groestl.h"
-#else
+#ifdef __AES__
  #include "algo/groestl/aes_ni/hash-groestl.h"
+#else
+  #include "sph_groestl.h"
 #endif

 typedef struct
 {
-#ifdef NO_AES_NI
-    sph_groestl512_context groestl1, groestl2;
-#else
+#ifdef __AES__
    hashState_groestl groestl1, groestl2;
+#else
+    sph_groestl512_context groestl1, groestl2;
 #endif

 } groestl_ctx_holder;
@@ -24,12 +23,12 @@ static groestl_ctx_holder groestl_ctx;

 void init_groestl_ctx()
 {
-#ifdef NO_AES_NI
-    sph_groestl512_init( &groestl_ctx.groestl1 );
-    sph_groestl512_init( &groestl_ctx.groestl2 );
-#else
+#ifdef __AES__
    init_groestl( &groestl_ctx.groestl1, 64 );
    init_groestl( &groestl_ctx.groestl2, 64 );
+#else
+    sph_groestl512_init( &groestl_ctx.groestl1 );
+    sph_groestl512_init( &groestl_ctx.groestl2 );
 #endif
 }

@@ -39,18 +38,18 @@ void groestlhash( void *output, const void *input )
     groestl_ctx_holder ctx __attribute__ ((aligned (64)));
     memcpy( &ctx, &groestl_ctx, sizeof(groestl_ctx) );

-#ifdef NO_AES_NI
-     sph_groestl512(&ctx.groestl1, input, 80);
-     sph_groestl512_close(&ctx.groestl1, hash);
-
-     sph_groestl512(&ctx.groestl2, hash, 64);
-     sph_groestl512_close(&ctx.groestl2, hash);
-#else
+#ifdef __AES__
     update_and_final_groestl( &ctx.groestl1, (char*)hash,
                               (const char*)input, 640 );

     update_and_final_groestl( &ctx.groestl2, (char*)hash,
                               (const char*)hash, 512 );
+#else
+     sph_groestl512(&ctx.groestl1, input, 80);
+     sph_groestl512_close(&ctx.groestl1, hash);
+
+     sph_groestl512(&ctx.groestl2, hash, 64);
+     sph_groestl512_close(&ctx.groestl2, hash);
 #endif
     memcpy(output, hash, 32);
 }
--- a/algo/groestl/groestl256-hash-4way.c
+++ b/algo/groestl/groestl256-hash-4way.c
@@ -1,4 +1,5 @@
 /* hash.c     Aug 2011
+ * groestl256-hash-4way https://github.com/JayDDee/cpuminer-opt  2019-12.
 *
 * Groestl implementation for different versions.
 * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
@@ -6,51 +7,18 @@
 * This code is placed in the public domain
 */

+// Optimized for hash and data lengths that are integral multiples of __m128i.
+
+
 #include <memory.h>
-#include "hash-groestl256.h"
+#include "groestl256-intr-4way.h"
 #include "miner.h"
 #include "simd-utils.h"

-#ifndef NO_AES_NI
+#if defined(__VAES__)

-#include "groestl-version.h"

-#ifdef TASM
-  #ifdef VAES
-    #include "groestl256-asm-aes.h"
-  #else
-    #ifdef VAVX
-      #include "groestl256-asm-avx.h"
-    #else
-      #ifdef VVPERM
-        #include "groestl256-asm-vperm.h"
-      #else
-        #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
-      #endif
-    #endif
-  #endif
-#else
-  #ifdef TINTR
-    #ifdef VAES
-      #include "groestl256-intr-aes.h"
-    #else
-      #ifdef VAVX
-        #include "groestl256-intr-avx.h"
-      #else
-        #ifdef VVPERM
-          #include "groestl256-intr-vperm.h"
-        #else
-          #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
-        #endif
-      #endif
-    #endif
-  #else
-    #error NO TYPE SPECIFIED (-DT[ASM/INTR])
-  #endif
-#endif
-
-/* initialise context */
-HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
+int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
 {
  int i;

@@ -58,223 +26,84 @@ HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
  SET_CONSTANTS();

  if (ctx->chaining == NULL || ctx->buffer == NULL)
-    return FAIL_GR;
+    return 1;

  for ( i = 0; i < SIZE256; i++ )
  {
-     ctx->chaining[i] = _mm_setzero_si128();
-     ctx->buffer[i]   = _mm_setzero_si128();
+     ctx->chaining[i] = m512_zero;
+     ctx->buffer[i]   = m512_zero;
  }
-  ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
-  INIT256( ctx->chaining );
+
+  // The only non-zero in the IV is len. It can be hard coded.
+  ctx->chaining[ 3 ] = m512_const2_64( 0, 0x0100000000000000 );
+//  uint64_t len = U64BIG((uint64_t)LENGTH);
+//  ctx->chaining[ COLS/2 -1 ] = _mm512_set4_epi64( len, 0, len, 0 );
+//  INIT256_4way(ctx->chaining);
+
  ctx->buf_ptr = 0;
  ctx->rem_ptr = 0;

-  return SUCCESS_GR;
+  return 0;
 }

-
-HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
- {
-  int i;
-
-  if (ctx->chaining == NULL || ctx->buffer == NULL)
-    return FAIL_GR;
-
-  for ( i = 0; i < SIZE256; i++ )
-  {
-     ctx->chaining[i] = _mm_setzero_si128();
-     ctx->buffer[i]   = _mm_setzero_si128();
-  }
-  ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
-  INIT256(ctx->chaining);
-  ctx->buf_ptr = 0;
-  ctx->rem_ptr = 0;
-
-  return SUCCESS_GR;
-}
-
-// Use this only for midstate and never for cryptonight
-HashReturn_gr update_groestl256( hashState_groestl256* ctx, const void* input,
-                                 DataLength_gr databitlen )
-{
-   __m128i* in = (__m128i*)input;
-   const int len = (int)databitlen / 128;  // bits to __m128i
-   const int blocks = len / SIZE256;    // __M128i to blocks
-   int rem = ctx->rem_ptr;
-   int i;
-
-   ctx->blk_count = blocks;
-   ctx->databitlen = databitlen;
-
-   // digest any full blocks 
-   for ( i = 0; i < blocks; i++ )
-       TF512( ctx->chaining, &in[ i * SIZE256 ] );
-   // adjust buf_ptr to last block
-   ctx->buf_ptr = blocks * SIZE256;
-
-   // Copy any remainder to buffer
-   for ( i = 0; i < len % SIZE256; i++ )
-       ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
-   // adjust rem_ptr for new data
-   ctx->rem_ptr += i;
-
-   return SUCCESS_GR;
-}
-
-// don't use this at all
-HashReturn_gr final_groestl256( hashState_groestl256* ctx, void* output )
-{
-   const int len = (int)ctx->databitlen / 128;  // bits to __m128i 
-   const int blocks = ctx->blk_count + 1;       // adjust for final block
-   const int rem_ptr = ctx->rem_ptr;      // end of data start of padding
-   const int hashlen_m128i = ctx->hashlen / 16;  // bytes to __m128i
-   const int hash_offset = SIZE256 - hashlen_m128i;  // where in buffer
-   int i;
-
-   // first pad byte = 0x80, last pad byte = block count
-   // everything in between is zero
-
-   if ( rem_ptr == len - 1 )
-   {
-       // all padding at once
-       ctx->buffer[rem_ptr] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
-                                                  0,0,0,0, 0,0,0,0x80 );
-   }
-   else
-   {
-       // add first padding
-       ctx->buffer[rem_ptr] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
-                                            0,0,0,0, 0,0,0,0x80 );
-       // add zero padding
-       for ( i = rem_ptr + 1; i < SIZE256 - 1; i++ )
-           ctx->buffer[i] = _mm_setzero_si128();
-       // add length padding
-       // cheat since we know the block count is trivial, good if block < 256
-       ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
-                                           0,0,0,0, 0,0,0,0 );
-   }
-
-   // digest final padding block and do output transform
-   TF512( ctx->chaining, ctx->buffer );
-   OF512( ctx->chaining );
-
-   // store hash result in output 
-   for ( i = 0; i < hashlen_m128i; i++ )
-      casti_m128i( output, i ) = ctx->chaining[ hash_offset + i];
-
-   return SUCCESS_GR;
-}
-
-HashReturn_gr update_and_final_groestl256( hashState_groestl256* ctx,
-                   void* output, const void* input, DataLength_gr databitlen )
+int groestl256_4way_update_close( groestl256_4way_context* ctx, void* output,
+                                const void* input, uint64_t databitlen )
 {
   const int len = (int)databitlen / 128;
   const int hashlen_m128i = ctx->hashlen / 16;   // bytes to __m128i
   const int hash_offset = SIZE256 - hashlen_m128i;
   int rem = ctx->rem_ptr;
   int blocks = len / SIZE256;
-   __m128i* in = (__m128i*)input;
+   __m512i* in = (__m512i*)input;
   int i;

   // --- update ---

   // digest any full blocks, process directly from input 
   for ( i = 0; i < blocks; i++ )
-      TF512( ctx->chaining, &in[ i * SIZE256 ] );
+      TF512_4way( ctx->chaining, &in[ i * SIZE256 ] );
   ctx->buf_ptr = blocks * SIZE256;

-   // cryptonight has 200 byte input, an odd number of __m128i
-   // remainder is only 8 bytes, ie u64.
-   if ( databitlen % 128 !=0 )
-   {
-      // must be cryptonight, copy 64 bits of data
-      *(uint64_t*)(ctx->buffer) = *(uint64_t*)(&in[ ctx->buf_ptr ] );
-      i = -1; // signal for odd length
-   }
-   else   
-   { 
-      // Copy any remaining data to buffer for final transform
-      for ( i = 0; i < len % SIZE256; i++ )
-          ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
-      i += rem;   // use i as rem_ptr in final
-   }
+   // copy any remaining data to buffer, it may already contain data
+   // from a previous update for a midstate precalc
+   for ( i = 0; i < len % SIZE256; i++ )
+       ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
+   i += rem;    // use i as rem_ptr in final

   //--- final ---

-   // adjust for final block
-   blocks++;
+   blocks++;      // adjust for final block

-   if ( i == len - 1 )
-   {
-       // all padding at once
-       ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
-                                           0,        0,0,0, 0,0,0,0x80 );
-   }
+   if ( i == SIZE256 - 1 )
+   {        
+       // only 1 vector left in buffer, all padding at once
+       ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
+                      blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
+   }   
   else
   {
-      if ( i == -1 )
-      {
-         // cryptonight odd length
-         ((uint64_t*)ctx->buffer)[ 1 ] = 0x80ull;
-         // finish the block with zero and length padding as normal
-         i = 0;
-       }
-       else
-       {
-          // add first padding
-          ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
-                                         0,0,0,0, 0,0,0,0x80 );
-       }
+       // add first padding
+       ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
       // add zero padding
       for ( i += 1; i < SIZE256 - 1; i++ )
-           ctx->buffer[i] = _mm_setzero_si128();
-       // add length padding
-       // cheat since we know the block count is trivial, good if block < 256
-       ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
-                                           0,        0,0,0, 0,0,0,0 );
+           ctx->buffer[i] = m512_zero;
+
+       // add length padding, second last byte is zero unless blocks > 255
+       ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
+                   blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) );
   }

-   // digest final padding block and do output transform
-   TF512( ctx->chaining, ctx->buffer );
-   OF512( ctx->chaining );
+// digest final padding block and do output transform
+   TF512_4way( ctx->chaining, ctx->buffer );
+
+   OF512_4way( ctx->chaining );

   // store hash result in output 
   for ( i = 0; i < hashlen_m128i; i++ )
-      casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
+      casti_m512i( output, i ) = ctx->chaining[ hash_offset + i ];

-   return SUCCESS_GR;
+   return 0;
 }

-/* hash bit sequence */
-HashReturn_gr hash_groestl256(int hashbitlen,
-                const BitSequence_gr* data,
-                DataLength_gr databitlen,
-                BitSequence_gr* hashval) {
-  HashReturn_gr ret;
-  hashState_groestl256 context;
+#endif   // VAES

-  /* initialise */
-  if ((ret = init_groestl256(&context, hashbitlen/8)) != SUCCESS_GR)
-    return ret;
-
-  /* process message */
-  if ((ret = update_groestl256(&context, data, databitlen)) != SUCCESS_GR)
-    return ret;
-
-  /* finalise */
-  ret = final_groestl256(&context, hashval);
-
-  return ret;
-}
-
-/* eBash API */
-//#ifdef crypto_hash_BYTES
-//int crypto_hash(unsigned char *out, const unsigned char *in, unsigned long long inlen)
-//{
-//  if (hash_groestl(crypto_hash_BYTES * 8, in, inlen * 8,out) == SUCCESS_GR) return 0;
-//  return -1;
-//}
-//#endif
-
-#endif
--- a/algo/groestl/groestl256-hash-4way.h
+++ b/algo/groestl/groestl256-hash-4way.h
@@ -6,56 +6,37 @@
 * This code is placed in the public domain
 */

-#ifndef __hash_h
-#define __hash_h
+#if !defined(GROESTL256_HASH_4WAY_H__)
+#define GROESTL256_HASH_4WAY_H__ 1

+#include "simd-utils.h"
 #include <immintrin.h>
+#include <stdint.h>
 #include <stdio.h>
 #if defined(_WIN64) || defined(__WINDOWS__)
 #include <windows.h>
 #endif
 #include <stdlib.h>

-/* eBash API begin */
-/*
-#include "crypto_hash.h"
-#ifdef crypto_hash_BYTES
-
-#include <crypto_uint8.h>
-#include <crypto_uint32.h>
-#include <crypto_uint64.h>
-typedef crypto_uint8 u8;
-typedef crypto_uint32 u32;
-typedef crypto_uint64 u64;
-#endif
- */
-/* eBash API end */
-
-//#define LENGTH (512)
-
-#include "brg_endian.h"
-#define NEED_UINT_64T
-#include "algo/sha/brg_types.h"
-
-#ifdef IACA_TRACE
-  #include IACA_MARKS
-#endif
-
 #define LENGTH (256)

+//#include "brg_endian.h"
+//#define NEED_UINT_64T
+//#include "algo/sha/brg_types.h"
+
 /* some sizes (number of bytes) */
 #define ROWS (8)
 #define LENGTHFIELDLEN (ROWS)
 #define COLS512 (8)
 //#define COLS1024 (16)
 #define SIZE_512 ((ROWS)*(COLS512))
-//#define SIZE1024 ((ROWS)*(COLS1024))
+//#define SIZE_1024 ((ROWS)*(COLS1024))
 #define ROUNDS512 (10)
 //#define ROUNDS1024 (14)

 //#if LENGTH<=256
 #define COLS (COLS512)
-//#define SIZE (SIZE512)
+#define SIZE (SIZE512)
 #define ROUNDS (ROUNDS512)
 //#else
 //#define COLS (COLS1024)
@@ -63,59 +44,29 @@ typedef crypto_uint64 u64;
 //#define ROUNDS (ROUNDS1024)
 //#endif

-#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
-
-#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
-#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
-#define U64BIG(a) (a)
-#endif /* IS_BIG_ENDIAN */
-
-#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
-#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n)))
-#define U64BIG(a) \
-  ((ROTL64(a, 8) & li_64(000000FF000000FF)) | \
-   (ROTL64(a,24) & li_64(0000FF000000FF00)) | \
-   (ROTL64(a,40) & li_64(00FF000000FF0000)) | \
-   (ROTL64(a,56) & li_64(FF000000FF000000)))
-#endif /* IS_LITTLE_ENDIAN */
-
-typedef unsigned char BitSequence_gr;
-typedef unsigned long long DataLength_gr;
-typedef enum
-{
-    SUCCESS_GR = 0,
-    FAIL_GR = 1,
-    BAD_HASHBITLEN_GR = 2
-} HashReturn_gr;
-
 #define SIZE256 (SIZE_512/16)

 typedef struct {
-  __attribute__ ((aligned (32))) __m128i chaining[SIZE256];
-  __attribute__ ((aligned (32))) __m128i buffer[SIZE256];
-//  __attribute__ ((aligned (32))) u64 chaining[SIZE/8];      /* actual state */
-//  __attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE];  /* data buffer */
-//  u64 block_counter;        /* message block counter */
-  int hashlen;              // bytes
-  int blk_count;
-  int buf_ptr;              /* data buffer pointer */
+  __attribute__ ((aligned (128))) __m512i chaining[SIZE256];
+  __attribute__ ((aligned (64))) __m512i buffer[SIZE256];
+  int hashlen;       // byte
+  int blk_count;     // SIZE_m128i
+  int buf_ptr;       // __m128i offset
  int rem_ptr;
-  int databitlen;
-} hashState_groestl256;
+  int databitlen;    // bits
+} groestl256_4way_context;

-HashReturn_gr init_groestl256( hashState_groestl256*, int );

-HashReturn_gr reinit_groestl256( hashState_groestl256* );
+int groestl256_4way_init( groestl256_4way_context*, uint64_t );

-HashReturn_gr update_groestl256( hashState_groestl256*, const void*,
-                              DataLength_gr );
+//int reinit_groestl( hashState_groestl* );

-HashReturn_gr final_groestl256( hashState_groestl256*, void* );
+//int groestl512_4way_update( groestl256_4way_context*, const void*,
+//                              uint64_t );

-HashReturn_gr hash_groestli256( int, const BitSequence_gr*, DataLength_gr,
-                            BitSequence_gr* );
+//int groestl512_4way_close( groestl512_4way_context*, void* );

-HashReturn_gr update_and_final_groestl256( hashState_groestl256*, void*,
-                                           const void*, DataLength_gr );
+int groestl256_4way_update_close( groestl256_4way_context*,  void*,
+                                        const void*, uint64_t );

-#endif /* __hash_h */
+#endif 
--- a/algo/groestl/groestl256-intr-4way.h
+++ b/algo/groestl/groestl256-intr-4way.h
@@ -7,35 +7,37 @@
 * This code is placed in the public domain
 */

-#include <smmintrin.h>
-#include <wmmintrin.h>
-#include "hash-groestl256.h"
+
+#if !defined(GROESTL256_INTR_4WAY_H__)
+#define GROESTL256_INTR_4WAY_H__ 1
+      
+#include "groestl256-hash-4way.h"
+
+#if defined(__VAES__)

 /* global constants  */
-__m128i ROUND_CONST_Lx;
-__m128i ROUND_CONST_L0[ROUNDS512];
-__m128i ROUND_CONST_L7[ROUNDS512];
-//__m128i ROUND_CONST_P[ROUNDS1024];
-//__m128i ROUND_CONST_Q[ROUNDS1024];
-__m128i TRANSP_MASK;
-__m128i SUBSH_MASK[8];
-__m128i ALL_1B;
-__m128i ALL_FF;
-
+__m512i ROUND_CONST_Lx;
+__m512i ROUND_CONST_L0[ROUNDS512];
+__m512i ROUND_CONST_L7[ROUNDS512];
+//__m512i ROUND_CONST_P[ROUNDS1024];
+//__m512i ROUND_CONST_Q[ROUNDS1024];
+__m512i TRANSP_MASK;
+__m512i SUBSH_MASK[8];
+__m512i ALL_1B;
+__m512i ALL_FF;

 #define tos(a)    #a
 #define tostr(a)  tos(a)

-
 /* xmm[i] will be multiplied by 2
 * xmm[j] will be lost
 * xmm[k] has to be all 0x1b */
 #define MUL2(i, j, k){\
-  j = _mm_xor_si128(j, j);\
-  j = _mm_cmpgt_epi8(j, i);\
-  i = _mm_add_epi8(i, i);\
-  j = _mm_and_si128(j, k);\
-  i = _mm_xor_si128(i, j);\
+  j = _mm512_xor_si512(j, j);\
+  j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\
+  i = _mm512_add_epi8(i, i);\
+  j = _mm512_and_si512(j, k);\
+  i = _mm512_xor_si512(i, j);\
 } 

 /**/
@@ -61,152 +63,188 @@ __m128i ALL_FF;
  /* t_i = a_i + a_{i+1} */\
  b6 = a0;\
  b7 = a1;\
-  a0 = _mm_xor_si128(a0, a1);\
+  a0 = _mm512_xor_si512(a0, a1);\
  b0 = a2;\
-  a1 = _mm_xor_si128(a1, a2);\
+  a1 = _mm512_xor_si512(a1, a2);\
  b1 = a3;\
-  a2 = _mm_xor_si128(a2, a3);\
+  a2 = _mm512_xor_si512(a2, a3);\
  b2 = a4;\
-  a3 = _mm_xor_si128(a3, a4);\
+  a3 = _mm512_xor_si512(a3, a4);\
  b3 = a5;\
-  a4 = _mm_xor_si128(a4, a5);\
+  a4 = _mm512_xor_si512(a4, a5);\
  b4 = a6;\
-  a5 = _mm_xor_si128(a5, a6);\
+  a5 = _mm512_xor_si512(a5, a6);\
  b5 = a7;\
-  a6 = _mm_xor_si128(a6, a7);\
-  a7 = _mm_xor_si128(a7, b6);\
+  a6 = _mm512_xor_si512(a6, a7);\
+  a7 = _mm512_xor_si512(a7, b6);\
  \
  /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
-  b0 = _mm_xor_si128(b0, a4);\
-  b6 = _mm_xor_si128(b6, a4);\
-  b1 = _mm_xor_si128(b1, a5);\
-  b7 = _mm_xor_si128(b7, a5);\
-  b2 = _mm_xor_si128(b2, a6);\
-  b0 = _mm_xor_si128(b0, a6);\
+  b0 = _mm512_xor_si512(b0, a4);\
+  b6 = _mm512_xor_si512(b6, a4);\
+  b1 = _mm512_xor_si512(b1, a5);\
+  b7 = _mm512_xor_si512(b7, a5);\
+  b2 = _mm512_xor_si512(b2, a6);\
+  b0 = _mm512_xor_si512(b0, a6);\
  /* spill values y_4, y_5 to memory */\
  TEMP0 = b0;\
-  b3 = _mm_xor_si128(b3, a7);\
-  b1 = _mm_xor_si128(b1, a7);\
+  b3 = _mm512_xor_si512(b3, a7);\
+  b1 = _mm512_xor_si512(b1, a7);\
  TEMP1 = b1;\
-  b4 = _mm_xor_si128(b4, a0);\
-  b2 = _mm_xor_si128(b2, a0);\
+  b4 = _mm512_xor_si512(b4, a0);\
+  b2 = _mm512_xor_si512(b2, a0);\
  /* save values t0, t1, t2 to xmm8, xmm9 and memory */\
  b0 = a0;\
-  b5 = _mm_xor_si128(b5, a1);\
-  b3 = _mm_xor_si128(b3, a1);\
+  b5 = _mm512_xor_si512(b5, a1);\
+  b3 = _mm512_xor_si512(b3, a1);\
  b1 = a1;\
-  b6 = _mm_xor_si128(b6, a2);\
-  b4 = _mm_xor_si128(b4, a2);\
+  b6 = _mm512_xor_si512(b6, a2);\
+  b4 = _mm512_xor_si512(b4, a2);\
  TEMP2 = a2;\
-  b7 = _mm_xor_si128(b7, a3);\
-  b5 = _mm_xor_si128(b5, a3);\
+  b7 = _mm512_xor_si512(b7, a3);\
+  b5 = _mm512_xor_si512(b5, a3);\
  \
  /* compute x_i = t_i + t_{i+3} */\
-  a0 = _mm_xor_si128(a0, a3);\
-  a1 = _mm_xor_si128(a1, a4);\
-  a2 = _mm_xor_si128(a2, a5);\
-  a3 = _mm_xor_si128(a3, a6);\
-  a4 = _mm_xor_si128(a4, a7);\
-  a5 = _mm_xor_si128(a5, b0);\
-  a6 = _mm_xor_si128(a6, b1);\
-  a7 = _mm_xor_si128(a7, TEMP2);\
+  a0 = _mm512_xor_si512(a0, a3);\
+  a1 = _mm512_xor_si512(a1, a4);\
+  a2 = _mm512_xor_si512(a2, a5);\
+  a3 = _mm512_xor_si512(a3, a6);\
+  a4 = _mm512_xor_si512(a4, a7);\
+  a5 = _mm512_xor_si512(a5, b0);\
+  a6 = _mm512_xor_si512(a6, b1);\
+  a7 = _mm512_xor_si512(a7, TEMP2);\
  \
  /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
  /* compute w_i : add y_{i+4} */\
-  b1 = ALL_1B;\
+  b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b );\
  MUL2(a0, b0, b1);\
-  a0 = _mm_xor_si128(a0, TEMP0);\
+  a0 = _mm512_xor_si512(a0, TEMP0);\
  MUL2(a1, b0, b1);\
-  a1 = _mm_xor_si128(a1, TEMP1);\
+  a1 = _mm512_xor_si512(a1, TEMP1);\
  MUL2(a2, b0, b1);\
-  a2 = _mm_xor_si128(a2, b2);\
+  a2 = _mm512_xor_si512(a2, b2);\
  MUL2(a3, b0, b1);\
-  a3 = _mm_xor_si128(a3, b3);\
+  a3 = _mm512_xor_si512(a3, b3);\
  MUL2(a4, b0, b1);\
-  a4 = _mm_xor_si128(a4, b4);\
+  a4 = _mm512_xor_si512(a4, b4);\
  MUL2(a5, b0, b1);\
-  a5 = _mm_xor_si128(a5, b5);\
+  a5 = _mm512_xor_si512(a5, b5);\
  MUL2(a6, b0, b1);\
-  a6 = _mm_xor_si128(a6, b6);\
+  a6 = _mm512_xor_si512(a6, b6);\
  MUL2(a7, b0, b1);\
-  a7 = _mm_xor_si128(a7, b7);\
+  a7 = _mm512_xor_si512(a7, b7);\
  \
  /* compute v_i : double w_i      */\
  /* add to y_4 y_5 .. v3, v4, ... */\
  MUL2(a0, b0, b1);\
-  b5 = _mm_xor_si128(b5, a0);\
+  b5 = _mm512_xor_si512(b5, a0);\
  MUL2(a1, b0, b1);\
-  b6 = _mm_xor_si128(b6, a1);\
+  b6 = _mm512_xor_si512(b6, a1);\
  MUL2(a2, b0, b1);\
-  b7 = _mm_xor_si128(b7, a2);\
+  b7 = _mm512_xor_si512(b7, a2);\
  MUL2(a5, b0, b1);\
-  b2 = _mm_xor_si128(b2, a5);\
+  b2 = _mm512_xor_si512(b2, a5);\
  MUL2(a6, b0, b1);\
-  b3 = _mm_xor_si128(b3, a6);\
+  b3 = _mm512_xor_si512(b3, a6);\
  MUL2(a7, b0, b1);\
-  b4 = _mm_xor_si128(b4, a7);\
+  b4 = _mm512_xor_si512(b4, a7);\
  MUL2(a3, b0, b1);\
  MUL2(a4, b0, b1);\
  b0 = TEMP0;\
  b1 = TEMP1;\
-  b0 = _mm_xor_si128(b0, a3);\
-  b1 = _mm_xor_si128(b1, a4);\
+  b0 = _mm512_xor_si512(b0, a3);\
+  b1 = _mm512_xor_si512(b1, a4);\
 }/*MixBytes*/

-#define SET_CONSTANTS(){\
-   ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
-  TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
-  SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
-  SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
-  SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
-  SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
-  SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
-  SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
-  SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
-  SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
-  for(i = 0; i < ROUNDS512; i++)\
-  {\
-    ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
-    ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
-  }\
-  ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
-}while(0); \
+// calculate the round constants separately and load at startup
+
+#define SET_CONSTANTS(){\
+  ALL_1B = _mm512_set1_epi32( 0x1b1b1b1b );\
+  TRANSP_MASK   = _mm512_set_epi32( \
+                         0x3f373b33, 0x3e363a32, 0x3d353931, 0x3c343830, \
+                         0x2f272b23, 0x2e262a22, 0x2d252921, 0x2c242820, \
+                         0x1f171b13, 0x1e161a12, 0x1d151911, 0x1c141810, \
+                         0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800 ); \
+  SUBSH_MASK[0] = _mm512_set_epi32( \
+                         0x33363a3d, 0x38323539, 0x3c3f3134, 0x373b3e30, \
+                         0x23262a2d, 0x28222529, 0x2c2f2124, 0x272b2e20, \
+                         0x13161a1d, 0x18121519, 0x1c1f1114, 0x171b1e10, \
+                         0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00 ); \
+  SUBSH_MASK[1] = _mm512_set_epi32( \
+                         0x34373c3f, 0x3a33363b, 0x3e393235, 0x303d3831, \
+                         0x24272c2f, 0x2a23262b, 0x2e292225, 0x202d2821, \
+                         0x14171c1f, 0x1a13161b, 0x1e191215, 0x101d1801, \
+                         0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801 );\
+  SUBSH_MASK[2] = _mm512_set_epi32( \
+                         0x35303e39, 0x3c34373d, 0x383b3336, 0x313f3a32, \
+                         0x25202e29, 0x2c24272d, 0x282b2326, 0x212f2a22, \
+                         0x15101e19, 0x1c14171d, 0x181b1316, 0x111f1a12, \
+                         0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02 );\
+  SUBSH_MASK[3] = _mm512_set_epi32( \
+                         0x3631383b, 0x3e35303f, 0x3a3d3437, 0x32393c33, \
+                         0x2621282b, 0x2e25202f, 0x2a2d2427, 0x22292c23, \
+                         0x1611181b, 0x1e15101f, 0x1a1d1417, 0x12191c13, \
+                         0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03 );\
+  SUBSH_MASK[4] = _mm512_set_epi32( \
+                         0x3732393c, 0x3f363138, 0x3b3e3530, 0x333a3d34, \
+                         0x2722292c, 0x2f262128, 0x2b2e2520, 0x232a2d24, \
+                         0x1712191c, 0x1f161118, 0x1b1e1510, 0x131a1d14, \
+                         0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04 );\
+  SUBSH_MASK[5] = _mm512_set_epi32( \
+                         0x30333b3e, 0x3937323a, 0x3d383631, 0x343c3f35, \
+                         0x20232b2e, 0x2927222a, 0x2d282621, 0x242c2f25, \
+                         0x10131b1e, 0x1917121a, 0x1d181611, 0x141c1f15, \
+                         0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05 );\
+  SUBSH_MASK[6] = _mm512_set_epi32( \
+                         0x31343d38, 0x3b30333c, 0x3f3a3732, 0x353e3936, \
+                         0x21242d28, 0x2b20232c, 0x2f2a2722, 0x252e2926, \
+                         0x11141d18, 0x1b10131c, 0x1f1a1712, 0x151e1916, \
+                         0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906 );\
+  SUBSH_MASK[7] = _mm512_set_epi32( \
+                         0x32353f3a, 0x3d31343e, 0x393c3033, 0x36383b37, \
+                         0x22252f2a, 0x2d21242e, 0x292c2023, 0x26282b27, \
+                         0x12151f1a, 0x1d11141e, 0x191c1013, 0x16181b17, \
+                         0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07 );\
+  for ( i = 0; i < ROUNDS512; i++ ) \
+  {\
+    ROUND_CONST_L0[i] = _mm512_set4_epi32( 0xffffffff, 0xffffffff, \
+          0x70605040 ^ ( i * 0x01010101 ), 0x30201000 ^ ( i * 0x01010101 ) ); \
+    ROUND_CONST_L7[i] = _mm512_set4_epi32( 0x8f9fafbf ^ ( i * 0x01010101 ), \
+          0xcfdfefff ^ ( i * 0x01010101 ), 0x00000000, 0x00000000 ); \
+  }\
+  ROUND_CONST_Lx = _mm512_set4_epi32( 0xffffffff, 0xffffffff, \
+                                      0x00000000, 0x00000000 ); \
+}while(0);\

-/* one round
- * i = round number
- * a0-a7 = input rows
- * b0-b7 = output rows
- */
 #define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* AddRoundConstant */\
  b1 = ROUND_CONST_Lx;\
-  a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
-  a1 = _mm_xor_si128(a1, b1);\
-  a2 = _mm_xor_si128(a2, b1);\
-  a3 = _mm_xor_si128(a3, b1);\
-  a4 = _mm_xor_si128(a4, b1);\
-  a5 = _mm_xor_si128(a5, b1);\
-  a6 = _mm_xor_si128(a6, b1);\
-  a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
+  a0 = _mm512_xor_si512( a0, (ROUND_CONST_L0[i]) );\
+  a1 = _mm512_xor_si512( a1, b1 );\
+  a2 = _mm512_xor_si512( a2, b1 );\
+  a3 = _mm512_xor_si512( a3, b1 );\
+  a4 = _mm512_xor_si512( a4, b1 );\
+  a5 = _mm512_xor_si512( a5, b1 );\
+  a6 = _mm512_xor_si512( a6, b1 );\
+  a7 = _mm512_xor_si512( a7, (ROUND_CONST_L7[i]) );\
  \
  /* ShiftBytes + SubBytes (interleaved) */\
-  b0 = _mm_xor_si128(b0,  b0);\
-  a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
-  a0 = _mm_aesenclast_si128(a0, b0);\
-  a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
-  a1 = _mm_aesenclast_si128(a1, b0);\
-  a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
-  a2 = _mm_aesenclast_si128(a2, b0);\
-  a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
-  a3 = _mm_aesenclast_si128(a3, b0);\
-  a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
-  a4 = _mm_aesenclast_si128(a4, b0);\
-  a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
-  a5 = _mm_aesenclast_si128(a5, b0);\
-  a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
-  a6 = _mm_aesenclast_si128(a6, b0);\
-  a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
-  a7 = _mm_aesenclast_si128(a7, b0);\
+  b0 = _mm512_xor_si512( b0, b0 );\
+  a0 = _mm512_shuffle_epi8( a0, (SUBSH_MASK[0]) );\
+  a0 = _mm512_aesenclast_epi128(a0, b0 );\
+  a1 = _mm512_shuffle_epi8( a1, (SUBSH_MASK[1]) );\
+  a1 = _mm512_aesenclast_epi128(a1, b0 );\
+  a2 = _mm512_shuffle_epi8( a2, (SUBSH_MASK[2]) );\
+  a2 = _mm512_aesenclast_epi128(a2, b0 );\
+  a3 = _mm512_shuffle_epi8( a3, (SUBSH_MASK[3]) );\
+  a3 = _mm512_aesenclast_epi128(a3, b0 );\
+  a4 = _mm512_shuffle_epi8( a4, (SUBSH_MASK[4]) );\
+  a4 = _mm512_aesenclast_epi128(a4, b0 );\
+  a5 = _mm512_shuffle_epi8( a5, (SUBSH_MASK[5]) );\
+  a5 = _mm512_aesenclast_epi128(a5, b0 );\
+  a6 = _mm512_shuffle_epi8( a6, (SUBSH_MASK[6]) );\
+  a6 = _mm512_aesenclast_epi128(a6, b0 );\
+  a7 = _mm512_shuffle_epi8( a7, (SUBSH_MASK[7]) );\
+  a7 = _mm512_aesenclast_epi128( a7, b0 );\
  \
  /* MixBytes */\
  MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
@@ -237,31 +275,31 @@ __m128i ALL_FF;
 #define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
  t0 = TRANSP_MASK;\
  \
-  i0 = _mm_shuffle_epi8(i0, t0);\
-  i1 = _mm_shuffle_epi8(i1, t0);\
-  i2 = _mm_shuffle_epi8(i2, t0);\
-  i3 = _mm_shuffle_epi8(i3, t0);\
+  i0 = _mm512_shuffle_epi8( i0, t0 );\
+  i1 = _mm512_shuffle_epi8( i1, t0 );\
+  i2 = _mm512_shuffle_epi8( i2, t0 );\
+  i3 = _mm512_shuffle_epi8( i3, t0 );\
  \
  o1 = i0;\
  t0 = i2;\
  \
-  i0 = _mm_unpacklo_epi16(i0, i1);\
-  o1 = _mm_unpackhi_epi16(o1, i1);\
-  i2 = _mm_unpacklo_epi16(i2, i3);\
-  t0 = _mm_unpackhi_epi16(t0, i3);\
+  i0 = _mm512_unpacklo_epi16( i0, i1 );\
+  o1 = _mm512_unpackhi_epi16( o1, i1 );\
+  i2 = _mm512_unpacklo_epi16( i2, i3 );\
+  t0 = _mm512_unpackhi_epi16( t0, i3 );\
  \
-  i0 = _mm_shuffle_epi32(i0, 216);\
-  o1 = _mm_shuffle_epi32(o1, 216);\
-  i2 = _mm_shuffle_epi32(i2, 216);\
-  t0 = _mm_shuffle_epi32(t0, 216);\
+  i0 = _mm512_shuffle_epi32( i0, 216 );\
+  o1 = _mm512_shuffle_epi32( o1, 216 );\
+  i2 = _mm512_shuffle_epi32( i2, 216 );\
+  t0 = _mm512_shuffle_epi32( t0, 216 );\
  \
  o2 = i0;\
  o3 = o1;\
  \
-  i0 = _mm_unpacklo_epi32(i0, i2);\
-  o1 = _mm_unpacklo_epi32(o1, t0);\
-  o2 = _mm_unpackhi_epi32(o2, i2);\
-  o3 = _mm_unpackhi_epi32(o3, t0);\
+  i0 = _mm512_unpacklo_epi32( i0, i2 );\
+  o1 = _mm512_unpacklo_epi32( o1, t0 );\
+  o2 = _mm512_unpackhi_epi32( o2, i2 );\
+  o3 = _mm512_unpackhi_epi32( o3, t0 );\
 }/**/

 /* Matrix Transpose Step 2
@@ -279,19 +317,19 @@ __m128i ALL_FF;
 #define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
  o1 = i0;\
  o2 = i1;\
-  i0 = _mm_unpacklo_epi64(i0, i4);\
-  o1 = _mm_unpackhi_epi64(o1, i4);\
+  i0 = _mm512_unpacklo_epi64( i0, i4 );\
+  o1 = _mm512_unpackhi_epi64( o1, i4 );\
  o3 = i1;\
  o4 = i2;\
-  o2 = _mm_unpacklo_epi64(o2, i5);\
-  o3 = _mm_unpackhi_epi64(o3, i5);\
+  o2 = _mm512_unpacklo_epi64( o2, i5 );\
+  o3 = _mm512_unpackhi_epi64( o3, i5 );\
  o5 = i2;\
  o6 = i3;\
-  o4 = _mm_unpacklo_epi64(o4, i6);\
-  o5 = _mm_unpackhi_epi64(o5, i6);\
+  o4 = _mm512_unpacklo_epi64( o4, i6 );\
+  o5 = _mm512_unpackhi_epi64( o5, i6 );\
  o7 = i3;\
-  o6 = _mm_unpacklo_epi64(o6, i7);\
-  o7 = _mm_unpackhi_epi64(o7, i7);\
+  o6 = _mm512_unpacklo_epi64( o6, i7 );\
+  o7 = _mm512_unpackhi_epi64( o7, i7 );\
 }/**/

 /* Matrix Transpose Inverse Step 2
@@ -302,19 +340,20 @@ __m128i ALL_FF;
 */
 #define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
  o0 = i0;\
-  i0 = _mm_unpacklo_epi64(i0, i1);\
-  o0 = _mm_unpackhi_epi64(o0, i1);\
+  i0 = _mm512_unpacklo_epi64( i0, i1 );\
+  o0 = _mm512_unpackhi_epi64( o0, i1 );\
  o1 = i2;\
-  i2 = _mm_unpacklo_epi64(i2, i3);\
-  o1 = _mm_unpackhi_epi64(o1, i3);\
+  i2 = _mm512_unpacklo_epi64( i2, i3 );\
+  o1 = _mm512_unpackhi_epi64( o1, i3 );\
  o2 = i4;\
-  i4 = _mm_unpacklo_epi64(i4, i5);\
-  o2 = _mm_unpackhi_epi64(o2, i5);\
+  i4 = _mm512_unpacklo_epi64( i4, i5 );\
+  o2 = _mm512_unpackhi_epi64( o2, i5 );\
  o3 = i6;\
-  i6 = _mm_unpacklo_epi64(i6, i7);\
-  o3 = _mm_unpackhi_epi64(o3, i7);\
+  i6 = _mm512_unpacklo_epi64( i6, i7 );\
+  o3 = _mm512_unpackhi_epi64( o3, i7 );\
 }/**/

+
 /* Matrix Transpose Output Step 2
 * input is one 512-bit state with two rows in one xmm
 * output is one 512-bit state with one row in the low 64-bits of one xmm
@@ -322,19 +361,19 @@ __m128i ALL_FF;
 * outputs: (i0-7) = (0|S)
 */
 #define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
-  t0 = _mm_xor_si128(t0, t0);\
+  t0 = _mm512_xor_si512( t0, t0 );\
  i1 = i0;\
  i3 = i2;\
  i5 = i4;\
  i7 = i6;\
-  i0 = _mm_unpacklo_epi64(i0, t0);\
-  i1 = _mm_unpackhi_epi64(i1, t0);\
-  i2 = _mm_unpacklo_epi64(i2, t0);\
-  i3 = _mm_unpackhi_epi64(i3, t0);\
-  i4 = _mm_unpacklo_epi64(i4, t0);\
-  i5 = _mm_unpackhi_epi64(i5, t0);\
-  i6 = _mm_unpacklo_epi64(i6, t0);\
-  i7 = _mm_unpackhi_epi64(i7, t0);\
+  i0 = _mm512_unpacklo_epi64( i0, t0 );\
+  i1 = _mm512_unpackhi_epi64( i1, t0 );\
+  i2 = _mm512_unpacklo_epi64( i2, t0 );\
+  i3 = _mm512_unpackhi_epi64( i3, t0 );\
+  i4 = _mm512_unpacklo_epi64( i4, t0 );\
+  i5 = _mm512_unpackhi_epi64( i5, t0 );\
+  i6 = _mm512_unpacklo_epi64( i6, t0 );\
+  i7 = _mm512_unpackhi_epi64( i7, t0 );\
 }/**/

 /* Matrix Transpose Output Inverse Step 2
@@ -344,17 +383,18 @@ __m128i ALL_FF;
 * outputs: (i0, i2, i4, i6) = S
 */
 #define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
-  i0 = _mm_unpacklo_epi64(i0, i1);\
-  i2 = _mm_unpacklo_epi64(i2, i3);\
-  i4 = _mm_unpacklo_epi64(i4, i5);\
-  i6 = _mm_unpacklo_epi64(i6, i7);\
+  i0 = _mm512_unpacklo_epi64( i0, i1 );\
+  i2 = _mm512_unpacklo_epi64( i2, i3 );\
+  i4 = _mm512_unpacklo_epi64( i4, i5 );\
+  i6 = _mm512_unpacklo_epi64( i6, i7 );\
 }/**/


-void INIT256( __m128i* chaining )
+
+void INIT256_4way( __m512i* chaining )
 {
-  static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7;
-  static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15;
+  static __m512i xmm0, xmm2, xmm6, xmm7;
+  static __m512i xmm12, xmm13, xmm14, xmm15;

  /* load IV into registers xmm12 - xmm15 */
  xmm12 = chaining[0];
@@ -373,17 +413,13 @@ void INIT256( __m128i* chaining )
  chaining[3] = xmm7;
 }

-void TF512( __m128i* chaining, __m128i* message )
+void TF512_4way( __m512i* chaining, __m512i* message )
 {
-  static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-  static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
-  static __m128i TEMP0;
-  static __m128i TEMP1;
-  static __m128i TEMP2;
-
-#ifdef IACA_TRACE
-  IACA_START;
-#endif
+  static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+  static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
+  static __m512i TEMP0;
+  static __m512i TEMP1;
+  static __m512i TEMP2;

  /* load message into registers xmm12 - xmm15 */
  xmm12 = message[0];
@@ -404,10 +440,10 @@ void TF512( __m128i* chaining, __m128i* message )

  /* xor message to CV get input of P */
  /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
-  xmm8 = _mm_xor_si128(xmm8, xmm12);
-  xmm0 = _mm_xor_si128(xmm0, xmm2);
-  xmm4 = _mm_xor_si128(xmm4, xmm6);
-  xmm5 = _mm_xor_si128(xmm5, xmm7);
+  xmm8 = _mm512_xor_si512( xmm8, xmm12 );
+  xmm0 = _mm512_xor_si512( xmm0, xmm2 );
+  xmm4 = _mm512_xor_si512( xmm4, xmm6 );
+  xmm5 = _mm512_xor_si512( xmm5, xmm7 );

  /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
  /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
@@ -422,17 +458,17 @@ void TF512( __m128i* chaining, __m128i* message )

  /* xor output of P and Q */
  /* result: P(CV+M)+Q(M) in xmm0...xmm3 */
-  xmm0 = _mm_xor_si128(xmm0, xmm8);
-  xmm1 = _mm_xor_si128(xmm1, xmm10);
-  xmm2 = _mm_xor_si128(xmm2, xmm12);
-  xmm3 = _mm_xor_si128(xmm3, xmm14);
+  xmm0 = _mm512_xor_si512( xmm0, xmm8 );
+  xmm1 = _mm512_xor_si512( xmm1, xmm10 );
+  xmm2 = _mm512_xor_si512( xmm2, xmm12 );
+  xmm3 = _mm512_xor_si512( xmm3, xmm14 );

  /* xor CV (feed-forward) */
  /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
-  xmm0 = _mm_xor_si128(xmm0, (chaining[0]));
-  xmm1 = _mm_xor_si128(xmm1, (chaining[1]));
-  xmm2 = _mm_xor_si128(xmm2, (chaining[2]));
-  xmm3 = _mm_xor_si128(xmm3, (chaining[3]));
+  xmm0 = _mm512_xor_si512( xmm0, (chaining[0]) );
+  xmm1 = _mm512_xor_si512( xmm1, (chaining[1]) );
+  xmm2 = _mm512_xor_si512( xmm2, (chaining[2]) );
+  xmm3 = _mm512_xor_si512( xmm3, (chaining[3]) );

  /* store CV */
  chaining[0] = xmm0;
@@ -440,19 +476,16 @@ void TF512( __m128i* chaining, __m128i* message )
  chaining[2] = xmm2;
  chaining[3] = xmm3;

-#ifdef IACA_TRACE
-  IACA_END;
-#endif
  return;
 }

-void OF512( __m128i* chaining )
+void OF512_4way( __m512i* chaining )
 {
-  static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-  static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
-  static __m128i TEMP0;
-  static __m128i TEMP1;
-  static __m128i TEMP2;
+  static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+  static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
+  static __m512i TEMP0;
+  static __m512i TEMP1;
+  static __m512i TEMP2;

  /* load CV into registers xmm8, xmm10, xmm12, xmm14 */
  xmm8 = chaining[0];
@@ -475,10 +508,10 @@ void OF512( __m128i* chaining )

  /* xor CV to P output (feed-forward) */
  /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
-  xmm8 = _mm_xor_si128(xmm8,  (chaining[0]));
-  xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
-  xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
-  xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
+  xmm8  = _mm512_xor_si512( xmm8,  (chaining[0]) );
+  xmm10 = _mm512_xor_si512( xmm10, (chaining[1]) );
+  xmm12 = _mm512_xor_si512( xmm12, (chaining[2]) );
+  xmm14 = _mm512_xor_si512( xmm14, (chaining[3]) );

  /* transform state back from row ordering into column ordering */
  /* result: final hash value in xmm9, xmm11 */
@@ -489,4 +522,5 @@ void OF512( __m128i* chaining )
  chaining[3] = xmm11;
 }

-
+#endif  // VAES
+#endif  // GROESTL512_INTR_4WAY_H__
--- a/algo/groestl/groestl512-hash-4way.c
+++ b/algo/groestl/groestl512-hash-4way.c
@@ -17,15 +17,6 @@

 #if defined(__VAES__)

-#define ROTL64(a,n) \
-   ( ( ( (a)<<(n) ) | ( (a) >> (64-(n)) ) ) & 0xffffffffffffffff )
-     
-#define U64BIG(a) \
-  ( ( ROTL64(a, 8) & 0x000000FF000000FF ) | \
-    ( ROTL64(a,24) & 0x0000FF000000FF00 ) | \
-    ( ROTL64(a,40) & 0x00FF000000FF0000 ) | \
-    ( ROTL64(a,56) & 0xFF000000FF000000 ) )
-
 int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
 {
  int i;
--- a/algo/groestl/myr-groestl.c
+++ b/algo/groestl/myr-groestl.c
@@ -1,22 +1,20 @@
 #include "myrgr-gate.h"
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
-
-#ifdef NO_AES_NI
-  #include "sph_groestl.h"
-#else
+#ifdef __AES__
  #include "aes_ni/hash-groestl.h"
+#else
+  #include "sph_groestl.h"
 #endif
 #include <openssl/sha.h>

 typedef struct {
-#ifdef NO_AES_NI
-    sph_groestl512_context  groestl;
-#else
+#ifdef __AES__
    hashState_groestl       groestl;
+#else
+    sph_groestl512_context  groestl;
 #endif
    SHA256_CTX              sha;
 } myrgr_ctx_holder;
@@ -25,10 +23,10 @@ myrgr_ctx_holder myrgr_ctx;

 void init_myrgr_ctx()
 {
-#ifdef NO_AES_NI
-     sph_groestl512_init( &myrgr_ctx.groestl );
-#else
+#ifdef __AES__
     init_groestl ( &myrgr_ctx.groestl, 64 );
+#else
+     sph_groestl512_init( &myrgr_ctx.groestl );
 #endif
     SHA256_Init( &myrgr_ctx.sha );
 }
@@ -40,12 +38,12 @@ void myriad_hash(void *output, const void *input)

 	uint32_t _ALIGN(32) hash[16];

-#ifdef NO_AES_NI
-	sph_groestl512(&ctx.groestl, input, 80);
-	sph_groestl512_close(&ctx.groestl, hash);
-#else
+#ifdef __AES__
   update_groestl( &ctx.groestl, (char*)input, 640 );
   final_groestl( &ctx.groestl, (char*)hash);
+#else
+	sph_groestl512(&ctx.groestl, input, 80);
+	sph_groestl512_close(&ctx.groestl, hash);
 #endif

   SHA256_Update( &ctx.sha, (unsigned char*)hash, 64 );
--- a/algo/heavy/bastion.c
+++ b/algo/heavy/bastion.c
@@ -1,13 +1,10 @@
 #include "algo-gate-api.h"
-
 #include <stdio.h>
 #include <string.h>
 #include <openssl/sha.h>
 #include <stdint.h>
 #include <stdlib.h>
-
 #include "sph_hefty1.h"
-
 #include "algo/luffa/sph_luffa.h"
 #include "algo/fugue/sph_fugue.h"
 #include "algo/skein/sph_skein.h"
@@ -16,8 +13,7 @@
 #include "algo/echo/sph_echo.h"
 #include "algo/hamsi/sph_hamsi.h"
 #include "algo/luffa/luffa_for_sse2.h"
-
-#ifndef NO_AES_NI
+#ifdef __AES__
  #include "algo/echo/aes_ni/hash_api.h"
 #endif

@@ -25,30 +21,23 @@ void bastionhash(void *output, const void *input)
 {
 	unsigned char hash[64] __attribute__ ((aligned (64)));

-#ifdef NO_AES_NI
-        sph_echo512_context     ctx_echo;
+#ifdef __AES__
+   hashState_echo          ctx_echo;
 #else
-        hashState_echo          ctx_echo;
+   sph_echo512_context     ctx_echo;
 #endif
-        hashState_luffa         ctx_luffa;
+   hashState_luffa         ctx_luffa;
 	sph_fugue512_context ctx_fugue;
 	sph_whirlpool_context ctx_whirlpool;
 	sph_shabal512_context ctx_shabal;
   sph_hamsi512_context ctx_hamsi;
 	sph_skein512_context ctx_skein;

-//        unsigned char hashbuf[128] __attribute__ ((aligned (16)));
-//        sph_u64 hashctA;
-//        sph_u64 hashctB;
-//        size_t hashptr;
-
 	HEFTY1(input, 80, hash);

        init_luffa( &ctx_luffa, 512 );
        update_and_final_luffa( &ctx_luffa, (BitSequence*)hash,
                                (const BitSequence*)hash, 64 );
-//        update_luffa( &ctx_luffa, hash, 64 );
-//        final_luffa( &ctx_luffa, hash );

 	if (hash[0] & 0x8)
 	{
@@ -71,23 +60,19 @@ void bastionhash(void *output, const void *input)

 	if (hash[0] & 0x8)
 	{
-#ifdef NO_AES_NI
+#ifdef __AES__
+      init_echo( &ctx_echo, 512 );
+      update_final_echo ( &ctx_echo,(BitSequence*)hash,
+                              (const BitSequence*)hash, 512 );
+#else
 		sph_echo512_init(&ctx_echo);
 		sph_echo512(&ctx_echo, hash, 64);
 		sph_echo512_close(&ctx_echo, hash);
-#else
-                init_echo( &ctx_echo, 512 );
-                update_final_echo ( &ctx_echo,(BitSequence*)hash,
-                                    (const BitSequence*)hash, 512 );
-//                update_echo ( &ctx_echo, hash, 512 );
-//                final_echo( &ctx_echo,  hash );
 #endif
 	} else {
-                init_luffa( &ctx_luffa, 512 );
-                update_and_final_luffa( &ctx_luffa, (BitSequence*)hash,
-                                        (const BitSequence*)hash, 64 );
-//                update_luffa( &ctx_luffa, hash, 64 );
-//                final_luffa( &ctx_luffa, hash );
+      init_luffa( &ctx_luffa, 512 );
+      update_and_final_luffa( &ctx_luffa, (BitSequence*)hash,
+                                    (const BitSequence*)hash, 64 );
 	}

 	sph_shabal512_init(&ctx_shabal);
@@ -119,11 +104,9 @@ void bastionhash(void *output, const void *input)
 		sph_hamsi512(&ctx_hamsi, hash, 64);
 		sph_hamsi512_close(&ctx_hamsi, hash);
 	} else {
-                init_luffa( &ctx_luffa, 512 );
-                update_and_final_luffa( &ctx_luffa, (BitSequence*)hash,
-                                        (const BitSequence*)hash, 64 );
-//                update_luffa( &ctx_luffa, hash, 64 );
-//                final_luffa( &ctx_luffa, hash );
+      init_luffa( &ctx_luffa, 512 );
+      update_and_final_luffa( &ctx_luffa, (BitSequence*)hash,
+                                    (const BitSequence*)hash, 64 );
 	}

 	memcpy(output, hash, 32);
--- a/algo/jh/jha.c
+++ b/algo/jh/jha.c
@@ -1,19 +1,16 @@
 #include "jha-gate.h"
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-
 #include "algo/blake/sph_blake.h"
 #include "algo/jh/sph_jh.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
-
-#ifdef NO_AES_NI
-  #include "algo/groestl/sph_groestl.h"
-#else
+#ifdef __AES__
  #include "algo/groestl/aes_ni/hash-groestl.h"
+#else
+  #include "algo/groestl/sph_groestl.h"
 #endif

 static __thread sph_keccak512_context jha_kec_mid __attribute__ ((aligned (64)));
@@ -28,12 +25,12 @@ void jha_hash(void *output, const void *input)
 {
 	uint8_t _ALIGN(128) hash[64];

-#ifdef NO_AES_NI
-	sph_groestl512_context ctx_groestl;
+#ifdef __AES__
+   hashState_groestl      ctx_groestl;
 #else
-        hashState_groestl      ctx_groestl;
+	sph_groestl512_context ctx_groestl;
 #endif
-        sph_blake512_context ctx_blake;
+   sph_blake512_context ctx_blake;
 	sph_jh512_context ctx_jh;
 	sph_keccak512_context ctx_keccak;
 	sph_skein512_context ctx_skein;
@@ -46,36 +43,36 @@ void jha_hash(void *output, const void *input)
 	for (int round = 0; round < 3; round++)
 	{
 	   if (hash[0] & 0x01)
-           {
-#ifdef NO_AES_NI
-		sph_groestl512_init(&ctx_groestl);
-		sph_groestl512(&ctx_groestl, hash, 64 );
-		sph_groestl512_close(&ctx_groestl, hash );
+      {
+#ifdef __AES__
+         init_groestl( &ctx_groestl, 64 );
+         update_and_final_groestl( &ctx_groestl, (char*)hash,
+                                              (char*)hash, 512 );
 #else
-                init_groestl( &ctx_groestl, 64 );
-                update_and_final_groestl( &ctx_groestl, (char*)hash,
-                                          (char*)hash, 512 );
+   		sph_groestl512_init(&ctx_groestl);
+	   	sph_groestl512(&ctx_groestl, hash, 64 );
+		   sph_groestl512_close(&ctx_groestl, hash );
 #endif
-	    }
-            else
-            {
-		sph_skein512_init(&ctx_skein);
-		sph_skein512(&ctx_skein, hash, 64);
-		sph_skein512_close(&ctx_skein, hash );
-	    }
+      }
+      else
+      {
+		   sph_skein512_init(&ctx_skein);
+		   sph_skein512(&ctx_skein, hash, 64);
+		   sph_skein512_close(&ctx_skein, hash );
+	   }

-	    if (hash[0] & 0x01)
-            {
-		sph_blake512_init(&ctx_blake);
-		sph_blake512(&ctx_blake, hash, 64);
-		sph_blake512_close(&ctx_blake, hash );
-	    }
-            else
-            {
-		sph_jh512_init(&ctx_jh);
-		sph_jh512(&ctx_jh, hash, 64 );
-		sph_jh512_close(&ctx_jh, hash );
-	    }
+	   if (hash[0] & 0x01)
+      {
+		   sph_blake512_init(&ctx_blake);
+		   sph_blake512(&ctx_blake, hash, 64);
+		   sph_blake512_close(&ctx_blake, hash );
+	   }
+      else
+      {
+		   sph_jh512_init(&ctx_jh);
+		   sph_jh512(&ctx_jh, hash, 64 );
+		   sph_jh512_close(&ctx_jh, hash );
+	   }
 	}

 	memcpy(output, hash, 32);
--- a/algo/keccak/keccak-4way.c
+++ b/algo/keccak/keccak-4way.c
@@ -39,7 +39,7 @@ int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
      keccakhash_8way( hash, vdata );

      for ( int lane = 0; lane < 8; lane++ )
-      if ( hash7[ lane<<1 ] < Htarg ) 
+      if ( hash7[ lane<<1 ] <= Htarg ) 
      {
          extr_lane_8x64( lane_hash, hash, lane, 256 );
          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
@@ -89,7 +89,7 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
      keccakhash_4way( hash, vdata );

      for ( int lane = 0; lane < 4; lane++ )
-      if ( hash7[ lane<<1 ] < Htarg )
+      if ( hash7[ lane<<1 ] <= Htarg )
      {
          extr_lane_4x64( lane_hash, hash, lane, 256 );
          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -7,33 +7,44 @@
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/cubehash/cube-hash-2way.h"
 #include "algo/groestl/aes_ni/hash-groestl256.h"
+#if defined(__VAES__)
+  #include "algo/groestl/groestl256-hash-4way.h"
+#endif

-#if defined (ALLIUM_8WAY)  
+#if defined (ALLIUM_16WAY)  

 typedef struct {
-   blake256_8way_context     blake;
+   blake256_16way_context     blake;
   keccak256_8way_context    keccak;
   cube_4way_context          cube;
   skein256_8way_context     skein;
+#if defined(__VAES__)
+   groestl256_4way_context groestl;
+#else
   hashState_groestl256      groestl;
-} allium_8way_ctx_holder;
+#endif
+} allium_16way_ctx_holder;

-static __thread allium_8way_ctx_holder allium_8way_ctx;
+static __thread allium_16way_ctx_holder allium_16way_ctx;

-bool init_allium_8way_ctx()
+bool init_allium_16way_ctx()
 {
-   keccak256_8way_init( &allium_8way_ctx.keccak );
-   cube_4way_init( &allium_8way_ctx.cube, 256, 16, 32 );
-   skein256_8way_init( &allium_8way_ctx.skein );
-   init_groestl256( &allium_8way_ctx.groestl, 32 );
+   keccak256_8way_init( &allium_16way_ctx.keccak );
+   cube_4way_init( &allium_16way_ctx.cube, 256, 16, 32 );
+   skein256_8way_init( &allium_16way_ctx.skein );
+#if defined(__VAES__)
+   groestl256_4way_init( &allium_16way_ctx.groestl, 32 );
+#else
+   init_groestl256( &allium_16way_ctx.groestl, 32 );
+#endif
   return true;
 }

-void allium_8way_hash( void *state, const void *input )
+void allium_16way_hash( void *state, const void *input )
 {
-   uint32_t vhash[8*8] __attribute__ ((aligned (128)));
-   uint32_t vhashA[8*8] __attribute__ ((aligned (64)));
-   uint32_t vhashB[8*8] __attribute__ ((aligned (64)));
+   uint32_t vhash[16*8] __attribute__ ((aligned (128)));
+   uint32_t vhashA[16*8] __attribute__ ((aligned (64)));
+   uint32_t vhashB[16*8] __attribute__ ((aligned (64)));
   uint32_t hash0[8] __attribute__ ((aligned (64)));
   uint32_t hash1[8] __attribute__ ((aligned (64)));
   uint32_t hash2[8] __attribute__ ((aligned (64)));
@@ -42,18 +53,39 @@ void allium_8way_hash( void *state, const void *input )
   uint32_t hash5[8] __attribute__ ((aligned (64)));
   uint32_t hash6[8] __attribute__ ((aligned (64)));
   uint32_t hash7[8] __attribute__ ((aligned (64)));
-   allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
+   uint32_t hash8[8] __attribute__ ((aligned (64)));
+   uint32_t hash9[8] __attribute__ ((aligned (64)));
+   uint32_t hash10[8] __attribute__ ((aligned (64)));
+   uint32_t hash11[8] __attribute__ ((aligned (64)));
+   uint32_t hash12[8] __attribute__ ((aligned (64)));
+   uint32_t hash13[8] __attribute__ ((aligned (64)));
+   uint32_t hash14[8] __attribute__ ((aligned (64)));
+   uint32_t hash15[8] __attribute__ ((aligned (64)));
+   allium_16way_ctx_holder ctx __attribute__ ((aligned (64)));

-   memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) );
-   blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
-   blake256_8way_close( &ctx.blake, vhash );
+   memcpy( &ctx, &allium_16way_ctx, sizeof(allium_16way_ctx) );
+   blake256_16way_update( &ctx.blake, input + (64<<4), 16 );
+   blake256_16way_close( &ctx.blake, vhash );

-   rintrlv_8x32_8x64( vhashA, vhash, 256 );
+   dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                  hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
+                  vhash, 256 );
+   intrlv_8x64( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                256 );
+   intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
+                hash15, 256 );
+   
+//   rintrlv_8x32_8x64( vhashA, vhash, 256 );
   keccak256_8way_update( &ctx.keccak, vhashA, 32 );
-   keccak256_8way_close( &ctx.keccak, vhash );
+   keccak256_8way_close( &ctx.keccak, vhashA);
+   keccak256_8way_init( &ctx.keccak );
+   keccak256_8way_update( &ctx.keccak, vhashB, 32 );
+   keccak256_8way_close( &ctx.keccak, vhashB);

   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
-                 vhash, 256 );
+                 vhashA, 256 );
+   dintrlv_8x64( hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
+                 vhashB, 256 );

   intrlv_2x256( vhash, hash0, hash1, 256 );
   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
@@ -67,6 +99,18 @@ void allium_8way_hash( void *state, const void *input )
   intrlv_2x256( vhash, hash6, hash7, 256 );
   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
   dintrlv_2x256( hash6, hash7, vhash, 256 );
+   intrlv_2x256( vhash, hash8, hash9, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash8, hash9, vhash, 256 );
+   intrlv_2x256( vhash, hash10, hash11, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash10, hash11, vhash, 256 );
+   intrlv_2x256( vhash, hash12, hash13, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash12, hash13, vhash, 256 );
+   intrlv_2x256( vhash, hash14, hash15, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash14, hash15, vhash, 256 );
  
   intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, 256 );
   intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, 256 );
@@ -78,6 +122,17 @@ void allium_8way_hash( void *state, const void *input )
   dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
   dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );

+   intrlv_4x128( vhashA, hash8, hash9, hash10, hash11, 256 );
+   intrlv_4x128( vhashB, hash12, hash13, hash14, hash15, 256 );
+
+   cube_4way_init( &ctx.cube, 256, 16, 32 );
+   cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
+   cube_4way_init( &ctx.cube, 256, 16, 32 );
+   cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
+
+   dintrlv_4x128( hash8, hash9, hash10, hash11, vhashA, 256 );
+   dintrlv_4x128( hash12, hash13, hash14, hash15, vhashB, 256 );
+
   intrlv_2x256( vhash, hash0, hash1, 256 );
   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
   dintrlv_2x256( hash0, hash1, vhash, 256 );
@@ -90,15 +145,258 @@ void allium_8way_hash( void *state, const void *input )
   intrlv_2x256( vhash, hash6, hash7, 256 );
   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
   dintrlv_2x256( hash6, hash7, vhash, 256 );
+   intrlv_2x256( vhash, hash8, hash9, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash8, hash9, vhash, 256 );
+   intrlv_2x256( vhash, hash10, hash11, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash10, hash11, vhash, 256 );
+   intrlv_2x256( vhash, hash12, hash13, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash12, hash13, vhash, 256 );
+   intrlv_2x256( vhash, hash14, hash15, 256 );
+   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
+   dintrlv_2x256( hash14, hash15, vhash, 256 );

-   intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
+   intrlv_8x64( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                hash7, 256 );
+   intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
+                hash15, 256 );
+
+   skein256_8way_update( &ctx.skein, vhashA, 32 );
+   skein256_8way_close( &ctx.skein, vhashA );
+   skein256_8way_init( &ctx.skein );
+   skein256_8way_update( &ctx.skein, vhashB, 32 );
+   skein256_8way_close( &ctx.skein, vhashB );

-   skein256_8way_update( &ctx.skein, vhash, 32 );
-   skein256_8way_close( &ctx.skein, vhash );

   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
-                 vhash, 256 );
+                 vhashA, 256 );
+   dintrlv_8x64( hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
+                 vhashB, 256 );
+
+#if defined(__VAES__)
+
+   intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 256 );
+
+   groestl256_4way_update_close( &ctx.groestl, vhash, vhash, 256 );
+
+   dintrlv_4x128( state, state+32, state+64, state+96, vhash, 256 );
+   intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 256 );
+
+   groestl256_4way_init( &ctx.groestl, 32 );
+   groestl256_4way_update_close( &ctx.groestl, vhash, vhash, 256 );
+   
+   dintrlv_4x128( state+128, state+160, state+192, state+224, vhash, 256 );
+   intrlv_4x128( vhash, hash8, hash9, hash10, hash11, 256 );
+
+   groestl256_4way_init( &ctx.groestl, 32 );
+   groestl256_4way_update_close( &ctx.groestl, vhash, vhash, 256 );
+
+   dintrlv_4x128( state+256, state+288, state+320, state+352, vhash, 256 );
+   intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 );
+
+   groestl256_4way_init( &ctx.groestl, 32 );
+   groestl256_4way_update_close( &ctx.groestl, vhash, vhash, 256 );
+ 
+   dintrlv_4x128( state+384, state+416, state+448, state+480, vhash, 256 );
+   
+#else
+
+   update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+128, hash4, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+160, hash5, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+192, hash6, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+224, hash7, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+256, hash8, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+288, hash9, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+320, hash10, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+352, hash11, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+384, hash12, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+416, hash13, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+448, hash14, 256 );
+   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, state+480, hash15, 256 );
+
+#endif
+}
+
+int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
+                             uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t hash[8*16] __attribute__ ((aligned (128)));   // 8 words per lane
+   uint32_t vdata[20*16] __attribute__ ((aligned (64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   const uint32_t last_nonce = max_nonce - 16;  // each pass consumes 16 nonces
+   const uint32_t Htarg = ptarget[7];
+   __m512i  *noncev = (__m512i*)vdata + 19;   // aligned
+   int thr_id = mythr->id;  // thr_id arg is deprecated
+   // Scans nonces 16 per pass starting at pdata[19]; always returns 0.
+   if ( opt_benchmark )
+      ( (uint32_t*)ptarget )[7] = 0x0000ff;
+   // Hash the first 64 header bytes once; allium_16way_hash adds the last 16.
+   mm512_bswap32_intrlv80_16x32( vdata, pdata );
+   blake256_16way_init( &allium_16way_ctx.blake );
+   blake256_16way_update( &allium_16way_ctx.blake, vdata, 64 );
+   // Stop on work restart or when a full pass no longer fits below max_nonce.
+   do {
+     *noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12,
+                                                 n+11, n+10, n+ 9, n+ 8,
+                                                 n+ 7, n+ 6, n+ 5, n+ 4,
+                                                 n+ 3, n+ 2, n +1, n ) );
+     // One call hashes lanes n .. n+15; lane i's result is at hash + 8*i.
+     allium_16way_hash( hash, vdata );
+     pdata[19] = n;
+     // Cheap high-word pre-filter before the full target comparison.
+     for ( int lane = 0; lane < 16; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
+     {
+        if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
+        {
+           pdata[19] = n + lane;
+           submit_lane_solution( work, hash+(lane<<3), mythr, lane );
+        }
+     }
+     n += 16;
+   } while ( (n < last_nonce) && !work_restart[thr_id].restart);
+   // Report how many nonces were consumed by this call.
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+#elif defined (ALLIUM_8WAY)  
+
+typedef struct {
+   blake256_8way_context     blake;    // all 8 lanes in a single pass
+   keccak256_4way_context    keccak;   // 4 lanes; used twice per 8-way hash
+   cubehashParam             cube;     // one lane; re-initialised between lanes
+   skein256_4way_context     skein;    // 4 lanes; used twice per 8-way hash
+   hashState_groestl256      groestl;  // one lane; re-copied from the template per lane
+
+} allium_8way_ctx_holder;
+
+static __thread allium_8way_ctx_holder allium_8way_ctx;
+
+bool init_allium_8way_ctx()   // prepares the allium_8way_ctx template; always true
+{
+   keccak256_4way_init( &allium_8way_ctx.keccak );
+   cubehashInit( &allium_8way_ctx.cube, 256, 16, 32 );
+   skein256_4way_init( &allium_8way_ctx.skein );
+   init_groestl256( &allium_8way_ctx.groestl, 32 );
+   return true;   // NOTE(review): blake is not set up here; presumably done by the scanhash caller - confirm
+}
+
+void allium_8way_hash( void *state, const void *input )
+{
+   uint32_t vhashA[8*8] __attribute__ ((aligned (64)));
+   uint32_t vhashB[8*8] __attribute__ ((aligned (64)));
+   uint32_t hash0[8] __attribute__ ((aligned (32)));
+   uint32_t hash1[8] __attribute__ ((aligned (32)));
+   uint32_t hash2[8] __attribute__ ((aligned (32)));
+   uint32_t hash3[8] __attribute__ ((aligned (32)));
+   uint32_t hash4[8] __attribute__ ((aligned (64)));
+   uint32_t hash5[8] __attribute__ ((aligned (32)));
+   uint32_t hash6[8] __attribute__ ((aligned (32)));
+   uint32_t hash7[8] __attribute__ ((aligned (32)));
+   allium_8way_ctx_holder ctx __attribute__ ((aligned (64))); 
+
+   memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) );
+   blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
+   blake256_8way_close( &ctx.blake, vhashA );
+
+   dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+                     vhashA, 256 );
+   intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
+   intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
+
+   keccak256_4way_update( &ctx.keccak, vhashA, 32 );
+   keccak256_4way_close( &ctx.keccak, vhashA );
+   keccak256_4way_init( &ctx.keccak );
+   keccak256_4way_update( &ctx.keccak, vhashB, 32 );
+   keccak256_4way_close( &ctx.keccak, vhashB );
+
+   dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 );
+   dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 );
+
+   LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
+   LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
+   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
+   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
+   LYRA2RE( hash4, 32, hash4, 32, hash4, 32, 1, 8, 8 );
+   LYRA2RE( hash5, 32, hash5, 32, hash5, 32, 1, 8, 8 );
+   LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
+   LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
+
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*)hash1, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash4, (const byte*)hash4, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash5, (const byte*)hash5, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash6, (const byte*)hash6, 32 );
+   cubehashInit( &ctx.cube, 256, 16, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash7, (const byte*)hash7, 32 );
+
+   LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
+   LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
+   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
+   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
+   LYRA2RE( hash4, 32, hash4, 32, hash4, 32, 1, 8, 8 );
+   LYRA2RE( hash5, 32, hash5, 32, hash5, 32, 1, 8, 8 );
+   LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
+   LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
+
+   intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
+   intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
+
+   skein256_4way_update( &ctx.skein, vhashA, 32 );
+   skein256_4way_close( &ctx.skein, vhashA );
+   skein256_4way_init( &ctx.skein );
+   skein256_4way_update( &ctx.skein, vhashB, 32 );
+   skein256_4way_close( &ctx.skein, vhashB );
+
+   dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 );
+   dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 );

   update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
@@ -122,23 +420,21 @@ void allium_8way_hash( void *state, const void *input )
   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
           sizeof(hashState_groestl256) );
   update_and_final_groestl256( &ctx.groestl, state+224, hash7, 256 );
-   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
-           sizeof(hashState_groestl256) );
 }

 int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t hash[8*8] __attribute__ ((aligned (128)));
+   uint32_t hash[8*8] __attribute__ ((aligned (64)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
-   uint32_t n = first_nonce;
   const uint32_t last_nonce = max_nonce - 8;
+   uint32_t n = first_nonce;
   const uint32_t Htarg = ptarget[7];
   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   int thr_id = mythr->id;  

   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;
@@ -169,126 +465,4 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
   return 0;
 }

-
-#elif defined (ALLIUM_4WAY)  
-
-
-typedef struct {
-   blake256_4way_context     blake;
-   keccak256_4way_context    keccak;
-   cubehashParam             cube;
-   skein256_4way_context     skein;
-   hashState_groestl256      groestl;
-
-} allium_4way_ctx_holder;
-
-static __thread allium_4way_ctx_holder allium_4way_ctx;
-
-bool init_allium_4way_ctx()
-{
-   keccak256_4way_init( &allium_4way_ctx.keccak );
-   cubehashInit( &allium_4way_ctx.cube, 256, 16, 32 );
-   skein256_4way_init( &allium_4way_ctx.skein );
-   init_groestl256( &allium_4way_ctx.groestl, 32 );
-   return true;
-}
-
-void allium_4way_hash( void *state, const void *input )
-{
-   uint32_t hash0[8] __attribute__ ((aligned (64)));
-   uint32_t hash1[8] __attribute__ ((aligned (32)));
-   uint32_t hash2[8] __attribute__ ((aligned (32)));
-   uint32_t hash3[8] __attribute__ ((aligned (32)));
-   uint32_t vhash32[8*4] __attribute__ ((aligned (64)));
-   uint32_t vhash64[8*4] __attribute__ ((aligned (64)));
-   allium_4way_ctx_holder ctx __attribute__ ((aligned (64))); 
-
-   memcpy( &ctx, &allium_4way_ctx, sizeof(allium_4way_ctx) );
-   blake256_4way_update( &ctx.blake, input + (64<<2), 16 );
-   blake256_4way_close( &ctx.blake, vhash32 );
-
-   rintrlv_4x32_4x64( vhash64, vhash32, 256 );
-   keccak256_4way_update( &ctx.keccak, vhash64, 32 );
-   keccak256_4way_close( &ctx.keccak, vhash64 );
-
-   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
-
-   LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
-   LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
-   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
-   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
-
-   cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 );
-   cubehashInit( &ctx.cube, 256, 16, 32 );
-   cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*)hash1, 32 );
-   cubehashInit( &ctx.cube, 256, 16, 32 );
-   cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 );
-   cubehashInit( &ctx.cube, 256, 16, 32 );
-   cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 );
-
-   LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
-   LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
-   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
-   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
-
-   intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
-
-   skein256_4way_update( &ctx.skein, vhash64, 32 );
-   skein256_4way_close( &ctx.skein, vhash64 );
-
-   dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
-
-   update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
-   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
-   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
-   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
-}
-
-int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
-                             uint64_t *hashes_done, struct thr_info *mythr )
-{
-   uint32_t hash[8*4] __attribute__ ((aligned (64)));
-   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   const uint32_t first_nonce = pdata[19];
-   uint32_t n = first_nonce;
-   const uint32_t Htarg = ptarget[7];
-   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
-   int thr_id = mythr->id;  // thr_id arg is deprecated
-
-   if ( opt_benchmark )
-      ( (uint32_t*)ptarget )[7] = 0x0000ff;
-
-   mm128_bswap32_intrlv80_4x32( vdata, pdata );
-   blake256_4way_init( &allium_4way_ctx.blake );
-   blake256_4way( &allium_4way_ctx.blake, vdata, 64 );
-
-   do {
-     *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
-
-     allium_4way_hash( hash, vdata );
-     pdata[19] = n;
-
-     for ( int lane = 0; lane < 4; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
-     {
-        if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
-        {
-           pdata[19] = n + lane;
-           submit_lane_solution( work, hash+(lane<<3), mythr, lane );
-         }
-     }
-     n += 4;
-   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
-
-   *hashes_done = n - first_nonce + 1;
-   return 0;
-}
-
 #endif
--- a/algo/lyra2/lyra2-gate.c
+++ b/algo/lyra2/lyra2-gate.c
@@ -78,7 +78,7 @@ bool register_lyra2rev3_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_lyra2rev3;
  gate->hash      = (void*)&lyra2rev3_hash;
 #endif
-  gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
  gate->miner_thread_init = (void*)&lyra2rev3_thread_init;
  opt_target_factor = 256.0;
  return true;
@@ -119,7 +119,7 @@ bool register_lyra2rev2_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_lyra2rev2;
  gate->hash      = (void*)&lyra2rev2_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
  opt_target_factor = 256.0;
  return true;
@@ -146,7 +146,7 @@ bool register_lyra2z_algo( algo_gate_t* gate )
  gate->scanhash   = (void*)&scanhash_lyra2z;
  gate->hash       = (void*)&lyra2z_hash;
 #endif
-  gate->optimizations = SSE42_OPT | AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
  opt_target_factor = 256.0;
  return true;
 };
@@ -165,7 +165,7 @@ bool register_lyra2h_algo( algo_gate_t* gate )
  gate->scanhash   = (void*)&scanhash_lyra2h;
  gate->hash       = (void*)&lyra2h_hash;
 #endif
-  gate->optimizations = SSE42_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT;
  opt_target_factor = 256.0;
  return true;
 };
@@ -174,20 +174,20 @@ bool register_lyra2h_algo( algo_gate_t* gate )

 bool register_allium_algo( algo_gate_t* gate )
 {
-#if defined (ALLIUM_8WAY)
+#if defined (ALLIUM_16WAY)
+  gate->miner_thread_init = (void*)&init_allium_16way_ctx;
+  gate->scanhash  = (void*)&scanhash_allium_16way;
+  gate->hash      = (void*)&allium_16way_hash;
+#elif defined (ALLIUM_8WAY)
  gate->miner_thread_init = (void*)&init_allium_8way_ctx;
  gate->scanhash  = (void*)&scanhash_allium_8way;
  gate->hash      = (void*)&allium_8way_hash;
-#elif defined (ALLIUM_4WAY)
-  gate->miner_thread_init = (void*)&init_allium_4way_ctx;
-  gate->scanhash  = (void*)&scanhash_allium_4way;
-  gate->hash      = (void*)&allium_4way_hash;
 #else
  gate->miner_thread_init = (void*)&init_allium_ctx;
  gate->scanhash  = (void*)&scanhash_allium;
  gate->hash      = (void*)&allium_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
  opt_target_factor = 256.0;
  return true;
 };
@@ -229,7 +229,7 @@ void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
 bool register_phi2_algo( algo_gate_t* gate )
 {
 //   init_phi2_ctx();
-   gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
+   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
   gate->get_work_data_size = (void*)&phi2_get_work_data_size;
   gate->decode_extra_data  = (void*)&phi2_decode_extra_data;
   gate->build_extraheader  = (void*)&phi2_build_extraheader;
--- a/algo/lyra2/lyra2-gate.h
+++ b/algo/lyra2/lyra2-gate.h
@@ -153,27 +153,27 @@ bool lyra2h_thread_init();
 //////////////////////////////////

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-  #define ALLIUM_8WAY 1
+  #define ALLIUM_16WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
-  #define ALLIUM_4WAY 1
+  #define ALLIUM_8WAY 1
 #endif

 bool register_allium_algo( algo_gate_t* gate );

-#if defined(ALLIUM_8WAY)
+#if defined(ALLIUM_16WAY)
+
+void allium_16way_hash( void *state, const void *input );
+int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr );
+bool init_allium_16way_ctx();
+
+#elif defined(ALLIUM_8WAY)

 void allium_8way_hash( void *state, const void *input );
 int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr );
 bool init_allium_8way_ctx();

-#elif defined(ALLIUM_4WAY)
-
-void allium_4way_hash( void *state, const void *input );
-int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done, struct thr_info *mythr );
-bool init_allium_4way_ctx();
-
 #else

 void allium_hash( void *state, const void *input );
--- a/algo/lyra2/lyra2-hash-2way.c
+++ b/algo/lyra2/lyra2-hash-2way.c
@@ -575,4 +575,138 @@ int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd,
   return 0;
 }

+int LYRA2X_2WAY( void *K, uint64_t kLen, const void *pwd,
+                  const uint64_t pwdlen, const uint64_t timeCost,
+                  const uint64_t nRows, const uint64_t nCols )
+{
+   //====================== Basic variables ============================//
+   uint64_t _ALIGN(256) state[32];
+   int64_t row = 2; //index of row to be processed
+   int64_t prev = 1; //index of prev (last row ever computed/modified)
+   int64_t rowa0 = 0;
+   int64_t rowa1 = 0;
+   int64_t tau; //Time Loop iterator
+   int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
+   int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
+   int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
+   int64_t i; //auxiliary iteration counter
+   //====================================================================/
+
+   //=== Initializing the Memory Matrix and pointers to it =============//
+   //Tries to allocate enough space for the whole memory matrix
+
+   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
+   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+   // for Lyra2REv2, nCols = 4, v1 was using 8
+   const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
+                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;
+
+   i = (int64_t)ROW_LEN_BYTES * nRows;
+   uint64_t *wholeMatrix = _mm_malloc( 2*i, 64 );
+   if (wholeMatrix == NULL)
+      return -1;
+
+   memset_zero_512( (__m512i*)wholeMatrix, i>>5 );
+
+   uint64_t *ptrWord = wholeMatrix;
+   uint64_t *pw = (uint64_t*)pwd;
+
+   //First, we clean enough blocks for the password, salt, basil and padding
+   int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
+                              / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
+
+   uint64_t *ptr = wholeMatrix;
+
+   memcpy( ptr, pw, 2*pwdlen ); // password 
+   ptr += pwdlen>>2;
+   memcpy( ptr, pw, 2*pwdlen ); // password lane 1
+   ptr += pwdlen>>2;
+
+   // now build the rest interleaving on the fly.
+
+   ptr[0] = ptr[ 4] = kLen;
+   ptr[1] = ptr[ 5] = pwdlen;
+   ptr[2] = ptr[ 6] = pwdlen;   // saltlen
+   ptr[3] = ptr[ 7] = timeCost;
+   ptr[8] = ptr[12] = nRows;
+   ptr[9] = ptr[13] = nCols;
+   ptr[10] = ptr[14] = 0x80;
+   ptr[11] = ptr[15] = 0x0100000000000000;
+
+   absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );
+
+   //Initializes M[0] and M[1]
+   reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
+
+   reducedDuplexRow1_2way( state, &wholeMatrix[0],
+                                  &wholeMatrix[ 2 * ROW_LEN_INT64], nCols );
+
+   do
+   {
+      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+
+      reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev*ROW_LEN_INT64 ],
+                                         &wholeMatrix[ 2* rowa0*ROW_LEN_INT64 ],
+                                         &wholeMatrix[ 2* row*ROW_LEN_INT64 ],
+                                         nCols );
+
+      //updates the value of row* (deterministically picked during Setup))
+      rowa0 = (rowa0 + step) & (window - 1);
+      //update prev: it now points to the last row ever computed
+
+      prev = row;
+      //updates row: goes to the next row to be computed
+      row++;
+
+      //Checks if all rows in the window were visited.
+      if (rowa0 == 0)
+      {
+         step = window + gap; //changes the step: approximately doubles its value
+         window *= 2; //doubles the size of the re-visitation window
+         gap = -gap; //inverts the modifier to the step
+      }
+
+   } while (row < nRows);
+
+   //===================== Wandering Phase =============================//
+   row = 0; //Resets the visitation to the first row of the memory matrix
+   for (tau = 1; tau <= timeCost; tau++)
+   {
+      step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1;
+      do
+      {
+        rowa0 = state[ 0 ] & (unsigned int)(nRows-1);
+        rowa1 = state[ 4 ] & (unsigned int)(nRows-1);
+
+        reducedDuplexRow_2way_X( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
+                                      &wholeMatrix[ 2* row *ROW_LEN_INT64 ],
+                                      nCols );
+
+           //update prev: it now points to the last row ever computed
+           prev = row;
+
+           //updates row: goes to the next row to be computed
+           //----------------------------------------------------
+           row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+           //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+           //----------------------------------------------------
+
+       } while (row != 0);
+   }
+
+   //===================== Wrap-up Phase ===============================//
+   //Absorbs the last block of the memory matrix
+   absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64],
+                            &wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64] );
+   //Squeezes the key
+   squeeze_2way( state, K, (unsigned int) kLen );
+
+   //================== Freeing the memory =============================//
+   _mm_free(wholeMatrix);
+
+   return 0;
+}
+      
 #endif
--- a/algo/lyra2/lyra2.h
+++ b/algo/lyra2/lyra2.h
@@ -74,6 +74,9 @@ int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
 int LYRA2Z_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
          uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );

+int LYRA2X_2WAY( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
+                  uint64_t timeCost, uint64_t nRows, uint64_t nCols );
+
 #endif

 #endif /* LYRA2_H_ */
--- a/algo/lyra2/sponge-2way.c
+++ b/algo/lyra2/sponge-2way.c
@@ -246,15 +246,32 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
    _mm512_store_si512( (__m512i*)State + 3, state3 );
 }

-inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
-                            uint64_t *rowInOut0, uint64_t *rowInOut1,
+// Reduced duplex row has three versions depending on the inout rows.
+// If they are the same the fastest version can be used, equivalent to
+// the linear version.
+// If either rowinout overlaps with rowout the slowest version is used,
+// to refresh local data after overwriting rowout.
+// Otherwise the normal version is used, slower than unified, faster than
+// overlap.
+//
+// The likelihood of each case depends on the number of rows. More rows
+// means unified and overlap are both less likely.
+// Unified has a 1 in Nrows chance.
+// Overlap has a 2 in Nrows chance, reduced to 1 in Nrows because if both
+// overlap it's unified.
+// As a result normal is (Nrows-2) / Nrows.
+// For 4 rows: 1 unified, 1 overlap, 2 normal.
+// For 8 rows: 1 unified, 1 overlap, 6 normal.
+
+static inline void reducedDuplexRow_2way_normal( uint64_t *State,
+                   uint64_t *rowIn, uint64_t *rowInOut0, uint64_t *rowInOut1,
                            uint64_t *rowOut, uint64_t nCols)
 {
   int i;
   register __m512i state0, state1, state2, state3;
   __m512i *in = (__m512i*)rowIn;
-   __m256i *inout0 = (__m256i*)rowInOut0;
-   __m256i *inout1 = (__m256i*)rowInOut1;
+   __m512i *inout0 = (__m512i*)rowInOut0;
+   __m512i *inout1 = (__m512i*)rowInOut1;
   __m512i *out = (__m512i*)rowOut;
   register __m512i io0, io1, io2;

@@ -262,19 +279,19 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
   state1 = _mm512_load_si512( (__m512i*)State + 1 );
   state2 = _mm512_load_si512( (__m512i*)State + 2 );
   state3 = _mm512_load_si512( (__m512i*)State + 3 );
-    
+
   for ( i = 0; i < nCols; i++ )
   {
     //Absorbing "M[prev] [+] M[row*]"
     io0 = _mm512_mask_blend_epi64( 0xf0,
-                 _mm512_load_si512( (__m512i*)inout0 ),
-                 _mm512_load_si512( (__m512i*)inout1 ) );
+                                    _mm512_load_si512( (__m512i*)inout0 ),
+                                    _mm512_load_si512( (__m512i*)inout1 ) );
     io1 = _mm512_mask_blend_epi64( 0xf0,
-                 _mm512_load_si512( (__m512i*)inout0 +1 ),
-                 _mm512_load_si512( (__m512i*)inout1 +1 ) );
+                                    _mm512_load_si512( (__m512i*)inout0 +1 ),
+                                    _mm512_load_si512( (__m512i*)inout1 +1 ) );
     io2 = _mm512_mask_blend_epi64( 0xf0,
-                 _mm512_load_si512( (__m512i*)inout0 +2 ),
-                 _mm512_load_si512( (__m512i*)inout1 +2 ) );
+                                    _mm512_load_si512( (__m512i*)inout0 +2 ),
+                                    _mm512_load_si512( (__m512i*)inout1 +2 ) );

     state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io0 ) );
     state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io1 ) );
@@ -286,29 +303,6 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
     {
       register __m512i t0, t1, t2;

-       //M[rowOut][col] = M[rowOut][col] XOR rand
-       t0 = _mm512_xor_si512( out[0], state0 );
-       t1 = _mm512_xor_si512( out[1], state1 );
-       t2 = _mm512_xor_si512( out[2], state2 );
-
-       // if out is the same row as inout, update with new data.
-       if ( rowOut == rowInOut0 )
-       {
-          io0 = _mm512_mask_blend_epi64( 0x0f, io0, t0 );
-          io1 = _mm512_mask_blend_epi64( 0x0f, io1, t1 );
-          io2 = _mm512_mask_blend_epi64( 0x0f, io2, t2 );
-       }
-       if ( rowOut == rowInOut1 )
-       {
-          io0 = _mm512_mask_blend_epi64( 0xf0, io0, t0 );
-          io1 = _mm512_mask_blend_epi64( 0xf0, io1, t1 );
-          io2 = _mm512_mask_blend_epi64( 0xf0, io2, t2 );
-       }
-
-       out[0] = t0;
-       out[1] = t1;
-       out[2] = t2;
-
       //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
       t0 = _mm512_permutex_epi64( state0, 0x93 );
       t1 = _mm512_permutex_epi64( state1, 0x93 );
@@ -317,19 +311,24 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
       io0 = _mm512_xor_si512( io0, _mm512_mask_blend_epi64( 0x11, t0, t2 ) );
       io1 = _mm512_xor_si512( io1, _mm512_mask_blend_epi64( 0x11, t1, t0 ) );
       io2 = _mm512_xor_si512( io2, _mm512_mask_blend_epi64( 0x11, t2, t1 ) );
+
+       //M[rowOut][col] = M[rowOut][col] XOR rand
+       out[0] = _mm512_xor_si512( out[0], state0 );
+       out[1] = _mm512_xor_si512( out[1], state1 );
+       out[2] = _mm512_xor_si512( out[2], state2 );
     }

-     _mm512_mask_store_epi64( (__m512i*)inout0,    0x0f, io0 );
-     _mm512_mask_store_epi64( (__m512i*)inout1,    0xf0, io0 );
-     _mm512_mask_store_epi64( (__m512i*)inout0 +1, 0x0f, io1 );
-     _mm512_mask_store_epi64( (__m512i*)inout1 +1, 0xf0, io1 );
-     _mm512_mask_store_epi64( (__m512i*)inout0 +2, 0x0f, io2 );
-     _mm512_mask_store_epi64( (__m512i*)inout1 +2, 0xf0, io2 );
+     _mm512_mask_store_epi64( inout0,    0x0f, io0 );
+     _mm512_mask_store_epi64( inout1,    0xf0, io0 );
+     _mm512_mask_store_epi64( inout0 +1, 0x0f, io1 );
+     _mm512_mask_store_epi64( inout1 +1, 0xf0, io1 );
+     _mm512_mask_store_epi64( inout0 +2, 0x0f, io2 );
+     _mm512_mask_store_epi64( inout1 +2, 0xf0, io2 );

      //Goes to next block
      in     += BLOCK_LEN_M256I;
-      inout0 += BLOCK_LEN_M256I * 2;
-      inout1 += BLOCK_LEN_M256I * 2;
+      inout0 += BLOCK_LEN_M256I;
+      inout1 += BLOCK_LEN_M256I;
      out    += BLOCK_LEN_M256I;
   }

@@ -339,4 +338,297 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
   _mm512_store_si512( (__m512i*)State + 3, state3 );
 }

+
+
+// rowInOut0 != rowInOut1 and one of them == rowOut: slowest, unlikely
+static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
+                   uint64_t *rowIn, uint64_t *rowInOut0, uint64_t *rowInOut1,
+                            uint64_t *rowOut, uint64_t nCols)
+{
+
+   int i;
+   register __m512i state0, state1, state2, state3;
+   __m512i *in = (__m512i*)rowIn;
+   __m512i *inout0 = (__m512i*)rowInOut0;
+   __m512i *inout1 = (__m512i*)rowInOut1;
+   __m512i *out = (__m512i*)rowOut;
+   inout_ovly io;
+
+   state0 = _mm512_load_si512( (__m512i*)State     );
+   state1 = _mm512_load_si512( (__m512i*)State + 1 );
+   state2 = _mm512_load_si512( (__m512i*)State + 2 );
+   state3 = _mm512_load_si512( (__m512i*)State + 3 );
+    
+   for ( i = 0; i < nCols; i++ )
+   {
+     //Absorbing "M[prev] [+] M[row*]"
+     io.v512[0] = _mm512_mask_blend_epi64( 0xf0,
+                                  _mm512_load_si512( (__m512i*)inout0 ),
+                                  _mm512_load_si512( (__m512i*)inout1 ) );
+     io.v512[1] = _mm512_mask_blend_epi64( 0xf0,
+                                  _mm512_load_si512( (__m512i*)inout0 +1 ),
+                                  _mm512_load_si512( (__m512i*)inout1 +1 ) );
+     io.v512[2] = _mm512_mask_blend_epi64( 0xf0,
+                                  _mm512_load_si512( (__m512i*)inout0 +2 ),
+                                  _mm512_load_si512( (__m512i*)inout1 +2 ) );
+
+     state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io.v512[0] ) );
+     state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io.v512[1] ) );
+     state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], io.v512[2] ) );
+
+     //Applies the reduced-round transformation f to the sponge's state
+     LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
+
+     {
+       __m512i t0, t1, t2;
+
+       //M[rowOut][col] = M[rowOut][col] XOR rand
+       out[0] = _mm512_xor_si512( out[0], state0 );
+       out[1] = _mm512_xor_si512( out[1], state1 );
+       out[2] = _mm512_xor_si512( out[2], state2 );
+
+       // if out is the same row as inout, update with new data.
+       if ( rowOut == rowInOut0 )
+       {
+          io.v512[0] = _mm512_mask_blend_epi64( 0x0f, io.v512[0], out[0] );
+          io.v512[1] = _mm512_mask_blend_epi64( 0x0f, io.v512[1], out[1] );
+          io.v512[2] = _mm512_mask_blend_epi64( 0x0f, io.v512[2], out[2] );
+
+       }
+       if ( rowOut == rowInOut1 )
+       {
+          io.v512[0] = _mm512_mask_blend_epi64( 0xf0, io.v512[0], out[0] );
+          io.v512[1] = _mm512_mask_blend_epi64( 0xf0, io.v512[1], out[1] );
+          io.v512[2] = _mm512_mask_blend_epi64( 0xf0, io.v512[2], out[2] );
+       }
+
+       //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
+       t0 = _mm512_permutex_epi64( state0, 0x93 );
+       t1 = _mm512_permutex_epi64( state1, 0x93 );
+       t2 = _mm512_permutex_epi64( state2, 0x93 );
+
+       io.v512[0] = _mm512_xor_si512( io.v512[0],
+                                 _mm512_mask_blend_epi64( 0x11, t0, t2 ) );
+       io.v512[1] = _mm512_xor_si512( io.v512[1],
+                                 _mm512_mask_blend_epi64( 0x11, t1, t0 ) );
+       io.v512[2] = _mm512_xor_si512( io.v512[2],
+                                 _mm512_mask_blend_epi64( 0x11, t2, t1 ) );
+     }
+
+     _mm512_mask_store_epi64( inout0,    0x0f, io.v512[0] );
+     _mm512_mask_store_epi64( inout1,    0xf0, io.v512[0] );
+     _mm512_mask_store_epi64( inout0 +1, 0x0f, io.v512[1] );
+     _mm512_mask_store_epi64( inout1 +1, 0xf0, io.v512[1] );
+     _mm512_mask_store_epi64( inout0 +2, 0x0f, io.v512[2] );
+     _mm512_mask_store_epi64( inout1 +2, 0xf0, io.v512[2] );
+
+      //Goes to next block
+      in     += BLOCK_LEN_M256I;
+      inout0 += BLOCK_LEN_M256I;
+      inout1 += BLOCK_LEN_M256I;
+      out    += BLOCK_LEN_M256I;
+   }
+
+   _mm512_store_si512( (__m512i*)State,     state0 );
+   _mm512_store_si512( (__m512i*)State + 1, state1 );
+   _mm512_store_si512( (__m512i*)State + 2, state2 );
+   _mm512_store_si512( (__m512i*)State + 3, state3 );
+
+}
+
+static inline void reducedDuplexRow_2way_overlap_X( uint64_t *State,
+                    uint64_t *rowIn, uint64_t *rowInOut0, uint64_t *rowInOut1,
+                    uint64_t *rowOut, uint64_t nCols)
+{
+   int i;
+   register __m512i state0, state1, state2, state3;
+   __m512i *in = (__m512i*)rowIn;
+   __m256i *inout0 = (__m256i*)rowInOut0;
+   __m256i *inout1 = (__m256i*)rowInOut1;
+   __m512i *out = (__m512i*)rowOut;
+   inout_ovly inout;
+   __m512i t0, t1, t2;
+
+   state0 = _mm512_load_si512( (__m512i*)State     );
+   state1 = _mm512_load_si512( (__m512i*)State + 1 );
+   state2 = _mm512_load_si512( (__m512i*)State + 2 );
+   state3 = _mm512_load_si512( (__m512i*)State + 3 );
+
+    for ( i = 0; i < nCols; i++ )
+    {
+
+      //Absorbing "M[prev] [+] M[row*]"
+      inout.v256[0] = inout0[0];
+      inout.v256[1] = inout1[1];
+      inout.v256[2] = inout0[2];
+      inout.v256[3] = inout1[3];
+      inout.v256[4] = inout0[4];
+      inout.v256[5] = inout1[5];
+
+      state0 = _mm512_xor_si512( state0,
+                                 _mm512_add_epi64( in[0], inout.v512[0] ) );
+      state1 = _mm512_xor_si512( state1,
+                                 _mm512_add_epi64( in[1], inout.v512[1] ) );
+      state2 = _mm512_xor_si512( state2,
+                                 _mm512_add_epi64( in[2], inout.v512[2] ) );
+
+
+      //Applies the reduced-round transformation f to the sponge's state
+      LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
+
+      //M[rowOut][col] = M[rowOut][col] XOR rand
+      out[0] = _mm512_xor_si512( out[0], state0 );
+      out[1] = _mm512_xor_si512( out[1], state1 );
+      out[2] = _mm512_xor_si512( out[2], state2 );
+
+      // if inout is the same row as out it was just overwritten, reload.
+      if ( rowOut == rowInOut0 )
+      {
+         inout.v256[0] = ( (__m256i*)out )[0];
+         inout.v256[2] = ( (__m256i*)out )[2];
+         inout.v256[4] = ( (__m256i*)out )[4];
+      }
+      if ( rowOut == rowInOut1 )
+      {
+         inout.v256[1] = ( (__m256i*)out )[1];
+         inout.v256[3] = ( (__m256i*)out )[3];
+         inout.v256[5] = ( (__m256i*)out )[5];
+      }
+
+      //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
+      t0 = _mm512_permutex_epi64( state0, 0x93 );
+      t1 = _mm512_permutex_epi64( state1, 0x93 );
+      t2 = _mm512_permutex_epi64( state2, 0x93 );
+
+      inout.v512[0] = _mm512_xor_si512( inout.v512[0],
+                                   _mm512_mask_blend_epi64( 0x11, t0, t2 ) );
+      inout.v512[1] = _mm512_xor_si512( inout.v512[1],
+                                   _mm512_mask_blend_epi64( 0x11, t1, t0 ) );
+      inout.v512[2] = _mm512_xor_si512( inout.v512[2],
+                                   _mm512_mask_blend_epi64( 0x11, t2, t1 ) );
+
+      inout0[0] = inout.v256[0];
+      inout1[1] = inout.v256[1];
+      inout0[2] = inout.v256[2];
+      inout1[3] = inout.v256[3];
+      inout0[4] = inout.v256[4];
+      inout1[5] = inout.v256[5];
+
+       //Goes to next block
+       in     += BLOCK_LEN_M256I;
+       inout0 += BLOCK_LEN_M256I * 2;
+       inout1 += BLOCK_LEN_M256I * 2;
+       out    += BLOCK_LEN_M256I;
+   }
+
+   _mm512_store_si512( (__m512i*)State,     state0 );
+   _mm512_store_si512( (__m512i*)State + 1, state1 );
+   _mm512_store_si512( (__m512i*)State + 2, state2 );
+   _mm512_store_si512( (__m512i*)State + 3, state3 );
+}
+
+// rowInOut0 == rowInOut1, fastest, least likely: 1 / nrows
+static inline void reducedDuplexRow_2way_unified( uint64_t *State,
+                   uint64_t *rowIn, uint64_t *rowInOut0,
+                            uint64_t *rowOut, uint64_t nCols)
+{
+   int i;
+   register __m512i state0, state1, state2, state3;
+   __m512i *in = (__m512i*)rowIn;
+   __m512i *inout = (__m512i*)rowInOut0;
+   __m512i *out = (__m512i*)rowOut;
+
+   state0 = _mm512_load_si512( (__m512i*)State     );
+   state1 = _mm512_load_si512( (__m512i*)State + 1 );
+   state2 = _mm512_load_si512( (__m512i*)State + 2 );
+   state3 = _mm512_load_si512( (__m512i*)State + 3 );
+
+   for ( i = 0; i < nCols; i++ )
+   {
+     //Absorbing "M[prev] [+] M[row*]"
+     state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], inout[0] ) );
+     state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], inout[1] ) );
+     state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], inout[2] ) );
+
+     //Applies the reduced-round transformation f to the sponge's state
+     LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
+
+     {
+       register __m512i t0, t1, t2;
+
+       //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
+       t0 = _mm512_permutex_epi64( state0, 0x93 );
+       t1 = _mm512_permutex_epi64( state1, 0x93 );
+       t2 = _mm512_permutex_epi64( state2, 0x93 );
+
+       inout[0] = _mm512_xor_si512( inout[0],
+                                    _mm512_mask_blend_epi64( 0x11, t0, t2 ) );
+       inout[1] = _mm512_xor_si512( inout[1],
+                                    _mm512_mask_blend_epi64( 0x11, t1, t0 ) );
+       inout[2] = _mm512_xor_si512( inout[2],
+                                     _mm512_mask_blend_epi64( 0x11, t2, t1 ) );
+
+       out[0] = _mm512_xor_si512( out[0], state0 );
+       out[1] = _mm512_xor_si512( out[1], state1 );
+       out[2] = _mm512_xor_si512( out[2], state2 );
+
+     }
+
+     //Goes to next block
+     in     += BLOCK_LEN_M256I;
+     inout += BLOCK_LEN_M256I;
+     out    += BLOCK_LEN_M256I;
+   }
+
+   _mm512_store_si512( (__m512i*)State,     state0 );
+   _mm512_store_si512( (__m512i*)State + 1, state1 );
+   _mm512_store_si512( (__m512i*)State + 2, state2 );
+   _mm512_store_si512( (__m512i*)State + 3, state3 );
+}
+
+// Multi-level specialization.
+// There are three cases that need to be handled:
+// unified: inout data is contiguous, fastest, unlikely.
+// normal: inout data is not contiguous with no overlap with out, likely.
+// overlap: inout data is not contiguous and one lane overlaps with out,
+//          slowest, unlikely.
+//
+// In addition, different algos prefer different coding. x25x and x22i prefer
+// 256 bit memory accesses to handle the diverged data while all other
+// algos prefer 512 bit memory accesses with masking and blending.
+
+ 
+//  Wrapper
+inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
+                            uint64_t *rowInOut0, uint64_t *rowInOut1,
+                            uint64_t *rowOut, uint64_t nCols )
+{
+  if ( rowInOut0 == rowInOut1 )
+     reducedDuplexRow_2way_unified( State, rowIn, rowInOut0, rowOut, nCols );
+  else if ( ( rowInOut0 == rowOut ) || ( rowInOut1 == rowOut ) )
+     reducedDuplexRow_2way_overlap( State, rowIn, rowInOut0, rowInOut1,
+                                    rowOut, nCols );
+  else
+     reducedDuplexRow_2way_normal( State, rowIn, rowInOut0, rowInOut1,
+                                   rowOut, nCols );
+}
+
+inline void reducedDuplexRow_2way_X( uint64_t *State, uint64_t *rowIn,
+                            uint64_t *rowInOut0, uint64_t *rowInOut1,
+                            uint64_t *rowOut, uint64_t nCols )
+{
+    if ( rowInOut0 == rowInOut1 )
+      reducedDuplexRow_2way_unified( State, rowIn, rowInOut0, rowOut, nCols );
+    else if ( ( rowInOut0 == rowOut ) || ( rowInOut1 == rowOut ) )
+    {
+       asm ( "nop" );  // This prevents GCC from merging with previous function
+       reducedDuplexRow_2way_overlap_X( State, rowIn, rowInOut0, rowInOut1,
+                                      rowOut, nCols );
+    }
+    else
+      reducedDuplexRow_2way_normal( State, rowIn, rowInOut0, rowInOut1,
+                                    rowOut, nCols );
+}
+
+
 #endif // AVX512
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -203,13 +203,12 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

-union _povly
+union _inout_ovly   // One buffer viewable as either 512 bit or 256 bit vectors.
 {
-   __m512i *v512;
-   __m256i *v256;
-   uint64_t *u64;
+   __m512i v512[3];   // 3 x 512 bit view
+   __m256i v256[6];   // 6 x 256 bit view of the same storage as v512
 };
-typedef union _povly povly;
+typedef union _inout_ovly inout_ovly;

 //---- Housekeeping
 void initState_2way( uint64_t State[/*16*/] );
@@ -234,6 +233,10 @@ void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
                            uint64_t *rowInOut0, uint64_t *rowInOut1,
                            uint64_t *rowOut, uint64_t nCols);

+void reducedDuplexRow_2way_X( uint64_t *State, uint64_t *rowIn,
+                              uint64_t *rowInOut0, uint64_t *rowInOut1,
+                              uint64_t *rowOut, uint64_t nCols);
+
 #endif


--- a/algo/m7m.c
+++ b/algo/m7m.c
@@ -267,6 +267,7 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
            SHA256_Final( (unsigned char*) hash, &ctxf_sha256 );
        }

+// TODO: rewrite the byte-wise hash/target compare below to use a 64 bit test.
        const unsigned char *hash_ = (const unsigned char *)hash;
        const unsigned char *target_ = (const unsigned char *)ptarget;
        for ( i = 31; i >= 0; i-- )
--- a/algo/nist5/nist5-4way.c
+++ b/algo/nist5/nist5-4way.c
@@ -102,7 +102,7 @@ int scanhash_nist5_8way( struct work *work, uint32_t max_nonce,
        nist5hash_8way( hash, vdata );

        for ( int lane = 0; lane < 8; lane++ )
-        if ( hash7[ lane<<1 ] < Htarg )
+        if ( hash7[ lane<<1 ] <= Htarg )
        {
           extr_lane_8x64( lane_hash, hash, lane, 256 );
           if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
@@ -190,7 +190,7 @@ int scanhash_nist5_4way( struct work *work, uint32_t max_nonce,
        nist5hash_4way( hash, vdata );

        for ( int lane = 0; lane < 4; lane++ )
-        if ( hash7[ lane<<1 ] < Htarg )
+        if ( hash7[ lane<<1 ] <= Htarg )
        {
           extr_lane_4x64( lane_hash, hash, lane, 256 );
           if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
--- a/algo/quark/hmq1725-4way.c
+++ b/algo/quark/hmq1725-4way.c
@@ -1513,10 +1513,10 @@ int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
       hmq1725_4way_hash( hash, vdata );

       for ( int lane = 0; lane < 4; lane++ )
-       if ( hash7[ lane<<1 ] <= Htarg )
+       if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
       {
          extr_lane_4x64( lane_hash, hash, lane, 256 );
-          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+          if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
          {
             pdata[19] = n + lane;
             submit_lane_solution( work, lane_hash, mythr, lane );
--- a/algo/quark/quark-4way.c
+++ b/algo/quark/quark-4way.c
@@ -248,11 +248,11 @@ void quark_8way_hash( void *state, const void *input )
       jh512_8way_close( &ctx.jh, vhashB );
    }

-    // Final blend, directly to state, only need 32 bytes.
-    casti_m512i( state,0 ) = _mm512_mask_blend_epi64( vh_mask, vhA[0], vhB[0] );
-    casti_m512i( state,1 ) = _mm512_mask_blend_epi64( vh_mask, vhA[1], vhB[1] );
-    casti_m512i( state,2 ) = _mm512_mask_blend_epi64( vh_mask, vhA[2], vhB[2] );
-    casti_m512i( state,3 ) = _mm512_mask_blend_epi64( vh_mask, vhA[3], vhB[3] );
+   // Final blend, directly to state, only need 32 bytes.
+   casti_m512i( state,0 ) = _mm512_mask_blend_epi64( vh_mask, vhA[0], vhB[0] );
+   casti_m512i( state,1 ) = _mm512_mask_blend_epi64( vh_mask, vhA[1], vhB[1] );
+   casti_m512i( state,2 ) = _mm512_mask_blend_epi64( vh_mask, vhA[2], vhB[2] );
+   casti_m512i( state,3 ) = _mm512_mask_blend_epi64( vh_mask, vhA[3], vhB[3] );
 }

 int scanhash_quark_8way( struct work *work, uint32_t max_nonce,
@@ -267,23 +267,24 @@ int scanhash_quark_8way( struct work *work, uint32_t max_nonce,
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
    __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
-    int thr_id = mythr->id;  // thr_id arg is deprecated
+    int thr_id = mythr->id; 
+    const uint32_t Htarg = ptarget[7];

    mm512_bswap32_intrlv80_8x64( vdata, pdata );
    do
    {
       *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
-                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
-                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );
+              _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
+                                n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );

       quark_8way_hash( hash, vdata );
       pdata[19] = n;

       for ( int i = 0; i < 8; i++ )
-       if ( ( hash7[ i<<1 ] & 0xFFFFFF00 ) == 0 )
+       if ( unlikely( hash7[ i<<1 ] <= Htarg ) )
       {
          extr_lane_8x64( lane_hash, hash, i, 256 );
-          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark  )
+          if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
          {
            pdata[19] = n+i;
            submit_lane_solution( work, lane_hash, mythr, i );
@@ -296,7 +297,6 @@ int scanhash_quark_8way( struct work *work, uint32_t max_nonce,
    return 0;
 }

-
 #elif defined (QUARK_4WAY)

 typedef struct {
@@ -460,8 +460,9 @@ int scanhash_quark_4way( struct work *work, uint32_t max_nonce,
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-    int thr_id = mythr->id;  // thr_id arg is deprecated
-
+    int thr_id = mythr->id;
+    const uint32_t Htarg = ptarget[7];
+ 
    mm256_bswap32_intrlv80_4x64( vdata, pdata );
    do
    {
@@ -472,10 +473,10 @@ int scanhash_quark_4way( struct work *work, uint32_t max_nonce,
       pdata[19] = n;

       for ( int i = 0; i < 4; i++ )
-       if ( ( hash7[ i<<1 ] & 0xFFFFFF00 ) == 0 )
+       if ( unlikely( hash7[ i<<1 ] <= Htarg ) )
       {
          extr_lane_4x64( lane_hash, hash, i, 256 );
-          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark  )
+          if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
          {
            pdata[19] = n+i;
            submit_lane_solution( work, lane_hash, mythr, i );
--- a/algo/qubit/deep.c
+++ b/algo/qubit/deep.c
@@ -5,7 +5,7 @@
 #include <stdio.h>
 #include "algo/luffa/luffa_for_sse2.h" 
 #include "algo/cubehash/cubehash_sse2.h" 
-#ifndef NO_AES_NI
+#ifdef __AES__
 #include "algo/echo/aes_ni/hash_api.h"
 #else
 #include "algo/echo/sph_echo.h"
@@ -15,10 +15,10 @@ typedef struct
 {
        hashState_luffa         luffa;
        cubehashParam           cubehash;
-#ifdef NO_AES_NI
-        sph_echo512_context echo;
-#else
+#ifdef __AES__
        hashState_echo          echo;
+#else
+        sph_echo512_context echo;
 #endif
 } deep_ctx_holder;

@@ -29,10 +29,10 @@ void init_deep_ctx()
 {
        init_luffa( &deep_ctx.luffa, 512 );
        cubehashInit( &deep_ctx.cubehash, 512, 16, 32 );
-#ifdef NO_AES_NI
-        sph_echo512_init( &deep_ctx.echo );
-#else
+#ifdef __AES__
        init_echo( &deep_ctx.echo, 512 );
+#else
+        sph_echo512_init( &deep_ctx.echo );
 #endif
 };

@@ -59,12 +59,12 @@ void deep_hash(void *output, const void *input)
        cubehashUpdateDigest( &ctx.cubehash, (byte*)hash, 
                              (const byte*) hash,64);

-#ifdef NO_AES_NI
-        sph_echo512 (&ctx.echo, (const void*) hash, 64);
-        sph_echo512_close(&ctx.echo, (void*) hash);
-#else
+#ifdef __AES__
        update_final_echo ( &ctx.echo, (BitSequence *) hash,
                          (const BitSequence *) hash, 512);
+#else
+        sph_echo512 (&ctx.echo, (const void*) hash, 64);
+        sph_echo512_close(&ctx.echo, (void*) hash);
 #endif

        asm volatile ("emms");
--- a/algo/qubit/qubit-2way.c
+++ b/algo/qubit/qubit-2way.c
@@ -149,8 +149,8 @@ int scanhash_qubit_4way( struct work *work,uint32_t max_nonce,
        pdata[19] = n;

        for ( int lane = 0; lane < 4; lane++ )
-        if ( ( hash+(lane<<3) )[7] < Htarg )
-        if ( fulltest( hash+(lane<<3), ptarget) && !opt_benchmark )
+        if ( unlikely( ( hash+(lane<<3) )[7] <= Htarg ) )
+        if ( likely( fulltest( hash+(lane<<3), ptarget) && !opt_benchmark ) )
        {
           pdata[19] = n + lane;
           submit_lane_solution( work, hash+(lane<<3), mythr, lane );
@@ -233,10 +233,6 @@ int scanhash_qubit_2way( struct work *work,uint32_t max_nonce,
     uint32_t *noncep = vdata + 32+3;   // 4*8 + 3
     int thr_id = mythr->id;  // thr_id arg is deprecated
     const uint32_t Htarg = ptarget[7];
-     uint64_t htmax[] = {          0,        0xF,       0xFF,
-                               0xFFF,     0xFFFF, 0x10000000  };
-     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
-                          0xFFFFF000, 0xFFFF0000,          0  };

     casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
     casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
@@ -248,32 +244,27 @@ int scanhash_qubit_2way( struct work *work,uint32_t max_nonce,
     luffa_2way_init( &qubit_2way_ctx.luffa, 512 );
     luffa_2way_update( &qubit_2way_ctx.luffa, vdata, 64 );

-     for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
+     do
     {
-        uint32_t mask = masks[m];
-        do
-        {
-            be32enc( noncep,   n   );
-            be32enc( noncep+4, n+1 );
-            qubit_2way_hash( hash, vdata );
-            pdata[19] = n;
+         be32enc( noncep,   n   );
+         be32enc( noncep+4, n+1 );
+         qubit_2way_hash( hash, vdata );
+         pdata[19] = n;

-            if ( !( hash[7] & mask ) )
-            if ( fulltest( hash, ptarget) && !opt_benchmark )
-            {
-                pdata[19] = n;
-                submit_lane_solution( work, hash, mythr, 0 );
-            }
-            if ( !( (hash+8)[7] & mask ) )
-            if ( fulltest( hash+8, ptarget) && !opt_benchmark )
-            {
-               pdata[19] = n+1;
-               submit_lane_solution( work, hash+8, mythr, 1 );
-            }
-            n += 2;
-         } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
-         break;
-     }
+         if ( unlikely( hash[7] <= Htarg ) )
+         if ( likely( fulltest( hash, ptarget) && !opt_benchmark ) )
+         {
+            pdata[19] = n;
+            submit_lane_solution( work, hash, mythr, 0 );
+         }
+         if ( unlikely( ( (hash+8))[7] <= Htarg ) )
+         if ( likely( fulltest( hash+8, ptarget) && !opt_benchmark ) )
+         {
+            pdata[19] = n+1;
+            submit_lane_solution( work, hash+8, mythr, 1 );
+         }
+         n += 2;
+     } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
     *hashes_done = n - first_nonce + 1;
     return 0;
 }
--- a/algo/qubit/qubit.c
+++ b/algo/qubit/qubit.c
@@ -7,7 +7,7 @@
 #include "algo/cubehash/cubehash_sse2.h" 
 #include "algo/simd/nist.h"
 #include "algo/shavite/sph_shavite.h"
-#ifndef NO_AES_NI
+#ifdef __AES__
 #include "algo/echo/aes_ni/hash_api.h"
 #else
 #include "algo/echo/sph_echo.h"
@@ -19,10 +19,10 @@ typedef struct
        cubehashParam           cubehash;
        sph_shavite512_context  shavite;
        hashState_sd            simd;
-#ifdef NO_AES_NI
-        sph_echo512_context echo;
-#else
+#ifdef __AES__
        hashState_echo          echo;
+#else
+        sph_echo512_context echo;
 #endif
 } qubit_ctx_holder;

@@ -35,10 +35,10 @@ void init_qubit_ctx()
        cubehashInit(&qubit_ctx.cubehash,512,16,32);
        sph_shavite512_init(&qubit_ctx.shavite);
        init_sd(&qubit_ctx.simd,512);
-#ifdef NO_AES_NI
-        sph_echo512_init(&qubit_ctx.echo);
-#else
+#ifdef __AES__
        init_echo(&qubit_ctx.echo, 512);
+#else
+        sph_echo512_init(&qubit_ctx.echo);
 #endif
 };

@@ -71,12 +71,12 @@ void qubit_hash(void *output, const void *input)
        update_final_sd( &ctx.simd, (BitSequence *)hash,
                         (const BitSequence*)hash,  512 );

-#ifdef NO_AES_NI
-        sph_echo512 (&ctx.echo, (const void*) hash, 64);
-        sph_echo512_close(&ctx.echo, (void*) hash);
-#else
+#ifdef __AES__
        update_final_echo( &ctx.echo, (BitSequence *) hash,
                     (const BitSequence *) hash, 512 );
+#else
+        sph_echo512 (&ctx.echo, (const void*) hash, 64);
+        sph_echo512_close(&ctx.echo, (void*) hash);
 #endif

        asm volatile ("emms");
--- a/algo/x11/c11-4way.c
+++ b/algo/x11/c11-4way.c
@@ -275,7 +275,7 @@ int scanhash_c11_8way( struct work *work, uint32_t max_nonce,
        pdata[19] = n;

        for ( int i = 0; i < 8; i++ )
-        if ( ( ( hash+(i<<3) )[7] < Htarg )
+        if ( ( ( hash+(i<<3) )[7] <= Htarg )
             && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
        {
           pdata[19] = n+i;
@@ -443,37 +443,26 @@ int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
     int thr_id = mythr->id;  // thr_id arg is deprecated
     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
     const uint32_t Htarg = ptarget[7];
-     uint64_t htmax[] = {          0,        0xF,       0xFF,
-                               0xFFF,     0xFFFF, 0x10000000  };
-     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
-                          0xFFFFF000, 0xFFFF0000,          0  };

     mm256_bswap32_intrlv80_4x64( vdata, pdata );

-     for (int m=0; m < 6; m++) 
-       if (Htarg <= htmax[m])
-       {
-         uint32_t mask = masks[m];
-         do
-         {
-           *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
+     do
+     {
+        *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+             _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

-            c11_4way_hash( hash, vdata );
-            pdata[19] = n;
-
-            for ( int i = 0; i < 4; i++ )
-            if ( ( ( (hash+(i<<3))[7] & mask ) == 0 )
-                 && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
-            {
-               pdata[19] = n+i;
-               submit_lane_solution( work, hash+(i<<3), mythr, i );
-            }
-            n += 4;
-         } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
-         break;
-       }
+        c11_4way_hash( hash, vdata );
+        pdata[19] = n;

+        for ( int i = 0; i < 4; i++ )
+        if ( ( ( hash+(i<<3) )[7] <= Htarg )
+            && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
+        {
+           pdata[19] = n+i;
+           submit_lane_solution( work, hash+(i<<3), mythr, i );
+        }
+        n += 4;
+     } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
     *hashes_done = n - first_nonce + 1;
     return 0;
 }
--- a/algo/x11/c11.c
+++ b/algo/x11/c11.c
@@ -78,11 +78,9 @@ void c11_hash( void *output, const void *input )
    sph_bmw512_close( &ctx.bmw, hash );

 #if defined(__AES__)
-    init_groestl( &ctx.groestl, 64 );
    update_and_final_groestl( &ctx.groestl, (char*)hash,
                                      (const char*)hash, 512 );
 #else
-    sph_groestl512_init( &ctx.groestl );
    sph_groestl512( &ctx.groestl, hash, 64 );
    sph_groestl512_close( &ctx.groestl, hash );
 #endif
@@ -108,12 +106,12 @@ void c11_hash( void *output, const void *input )
     update_final_sd( &ctx.simd, (BitSequence *)hash,
                      (const BitSequence *)hash, 512 );

-#ifdef NO_AES_NI
-     sph_echo512( &ctx.echo, hash, 64 );
-     sph_echo512_close( &ctx.echo, hash );
-#else
+#if defined(__AES__)
     update_final_echo ( &ctx.echo, (BitSequence *)hash,
                         (const BitSequence *)hash, 512 );
+#else
+     sph_echo512( &ctx.echo, hash, 64 );
+     sph_echo512_close( &ctx.echo, hash );
 #endif

        memcpy(output, hash, 32);
--- a/algo/x11/timetravel.c
+++ b/algo/x11/timetravel.c
@@ -11,10 +11,10 @@
 #include "algo/skein/sph_skein.h"
 #include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/cubehash_sse2.h"
-#ifdef NO_AES_NI
-  #include "algo/groestl/sph_groestl.h"
-#else
+#ifdef __AES__
  #include "algo/groestl/aes_ni/hash-groestl.h"
+#else
+  #include "algo/groestl/sph_groestl.h"
 #endif

 static __thread uint32_t s_ntime = UINT32_MAX;
@@ -28,10 +28,10 @@ typedef struct {
        sph_keccak512_context   keccak;
        hashState_luffa         luffa;
        cubehashParam           cube;
-#ifdef NO_AES_NI
-        sph_groestl512_context  groestl;
-#else
+#ifdef __AES__
        hashState_groestl       groestl;
+#else
+        sph_groestl512_context  groestl;
 #endif
 } tt_ctx_holder;

@@ -47,10 +47,10 @@ void init_tt8_ctx()
        sph_keccak512_init( &tt_ctx.keccak );
        init_luffa( &tt_ctx.luffa, 512 );
        cubehashInit( &tt_ctx.cube, 512, 16, 32 );
-#ifdef NO_AES_NI
-        sph_groestl512_init( &tt_ctx.groestl );
-#else
+#ifdef __AES__
        init_groestl( &tt_ctx.groestl, 64 );
+#else
+        sph_groestl512_init( &tt_ctx.groestl );
 #endif
 };

@@ -110,7 +110,10 @@ void timetravel_hash(void *output, const void *input)
        }
        break;
     case 2:
-#ifdef NO_AES_NI
+#ifdef __AES__
+           update_and_final_groestl( &ctx.groestl, (char*)hashB,
+                                    (char*)hashA, dataLen*8 );
+#else
        if ( i == 0 )
        {
           memcpy( &ctx.groestl, &tt_mid.groestl, sizeof tt_mid.groestl );
@@ -122,19 +125,6 @@ void timetravel_hash(void *output, const void *input)
           sph_groestl512( &ctx.groestl, hashA, dataLen );
           sph_groestl512_close( &ctx.groestl, hashB );
        }
-#else
-// groestl midstate is slower
-//        if ( i == 0 )
-//        {
-//           memcpy( &ctx.groestl, &tt_mid.groestl, sizeof tt_mid.groestl );
-//           update_and_final_groestl( &ctx.groestl, (char*)hashB,
-//                                    (char*)input + midlen, tail*8 );
-//        }
-//        else
-//        {
-           update_and_final_groestl( &ctx.groestl, (char*)hashB,
-                                    (char*)hashA, dataLen*8 );
-//        }
 #endif
        break;
     case 3:
@@ -253,13 +243,9 @@ int scanhash_timetravel( struct work *work, uint32_t max_nonce,
           sph_bmw512( &tt_mid.bmw, endiandata, 64 );
           break;
        case 2:
-#ifdef NO_AES_NI
+#ifndef __AES__
           memcpy( &tt_mid.groestl, &tt_ctx.groestl, sizeof(tt_mid.groestl ) );
           sph_groestl512( &tt_mid.groestl, endiandata, 64 );
-#else
-// groestl midstate is slower
-//         memcpy( &tt_mid.groestl, &tt_ctx.groestl, sizeof(tt_mid.groestl ) );
-//         update_groestl( &tt_mid.groestl, (char*)endiandata, 64*8 );
 #endif
           break;
        case 3:
--- a/algo/x11/timetravel10.c
+++ b/algo/x11/timetravel10.c
@@ -12,11 +12,10 @@
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/nist.h"
-
-#ifdef NO_AES_NI
-  #include "algo/groestl/sph_groestl.h"
-#else
+#ifdef __AES__
  #include "algo/groestl/aes_ni/hash-groestl.h"
+#else
+  #include "algo/groestl/sph_groestl.h"
 #endif

 static __thread uint32_t s_ntime = UINT32_MAX;
@@ -32,10 +31,10 @@ typedef struct {
        cubehashParam           cube;
        sph_shavite512_context  shavite;
        hashState_sd            simd;
-#ifdef NO_AES_NI
-        sph_groestl512_context  groestl;
-#else
+#ifdef __AES__
        hashState_groestl       groestl;
+#else
+        sph_groestl512_context  groestl;
 #endif
 } tt10_ctx_holder;

@@ -53,10 +52,10 @@ void init_tt10_ctx()
        cubehashInit( &tt10_ctx.cube, 512, 16, 32 );
        sph_shavite512_init( &tt10_ctx.shavite );
        init_sd( &tt10_ctx.simd, 512 );
-#ifdef NO_AES_NI
-        sph_groestl512_init( &tt10_ctx.groestl );
-#else
+#ifdef __AES__
        init_groestl( &tt10_ctx.groestl, 64 );
+#else
+        sph_groestl512_init( &tt10_ctx.groestl );
 #endif
 };

@@ -116,7 +115,10 @@ void timetravel10_hash(void *output, const void *input)
        }
        break;
     case 2:
-#ifdef NO_AES_NI
+#ifdef __AES__
+           update_and_final_groestl( &ctx.groestl, (char*)hashB,
+                                    (char*)hashA, dataLen*8 );
+#else
        if ( i == 0 )
        {
           memcpy( &ctx.groestl, &tt10_mid.groestl, sizeof tt10_mid.groestl );
@@ -128,19 +130,6 @@ void timetravel10_hash(void *output, const void *input)
           sph_groestl512( &ctx.groestl, hashA, dataLen );
           sph_groestl512_close( &ctx.groestl, hashB );
        }
-#else
-// groestl midstate is slower
-//        if ( i == 0 )
-//        {
-//           memcpy( &ctx.groestl, &tt10_mid.groestl, sizeof tt10_mid.groestl );
-//           update_and_final_groestl( &ctx.groestl, (char*)hashB,
-//                                    (char*)input + midlen, tail*8 );
-//        }
-//        else
-//        {
-           update_and_final_groestl( &ctx.groestl, (char*)hashB,
-                                    (char*)hashA, dataLen*8 );
-//        }
 #endif
        break;
     case 3:
@@ -286,13 +275,9 @@ int scanhash_timetravel10( struct work *work, uint32_t max_nonce,
           sph_bmw512( &tt10_mid.bmw, endiandata, 64 );
           break;
        case 2:
-#ifdef NO_AES_NI
+#ifndef __AES__
           memcpy( &tt10_mid.groestl, &tt10_ctx.groestl, sizeof(tt10_mid.groestl ) );
           sph_groestl512( &tt10_mid.groestl, endiandata, 64 );
-#else
-// groestl midstate is slower
-//         memcpy( &tt10_mid.groestl, &tt10_ctx.groestl, sizeof(tt10_mid.groestl ) );
-//         update_groestl( &tt10_mid.groestl, (char*)endiandata, 64*8 );
 #endif
           break;
        case 3:
--- a/algo/x11/tribus-4way.c
+++ b/algo/x11/tribus-4way.c
@@ -124,7 +124,7 @@ int scanhash_tribus_8way( struct work *work, uint32_t max_nonce,
     pdata[19] = n;

     for ( int i = 0; i < 8; i++ )
-     if ( (hash+(i<<3))[7] < Htarg )
+     if ( (hash+(i<<3))[7] <= Htarg )
     if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
     {
          pdata[19] = n+i;
@@ -209,7 +209,7 @@ int scanhash_tribus_4way( struct work *work, uint32_t max_nonce,
     pdata[19] = n;

     for ( int i = 0; i < 4; i++ )
-     if ( (hash+(i<<3))[7] < Htarg )
+     if ( (hash+(i<<3))[7] <= Htarg )
     if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
     {
          pdata[19] = n+i;
--- a/algo/x11/tribus.c
+++ b/algo/x11/tribus.c
@@ -7,19 +7,19 @@
 #include "algo/jh//sph_jh.h"
 #include "algo/keccak/sph_keccak.h"

-#ifdef NO_AES_NI
-  #include "algo/echo/sph_echo.h"
-#else
+#ifdef __AES__
  #include "algo/echo/aes_ni/hash_api.h"
+#else
+  #include "algo/echo/sph_echo.h"
 #endif

 typedef struct {
    sph_jh512_context     jh;
    sph_keccak512_context keccak;
-#ifdef NO_AES_NI
-    sph_echo512_context   echo;
-#else
+#ifdef __AES__
    hashState_echo        echo;
+#else
+    sph_echo512_context   echo;
 #endif
 } tribus_ctx_holder;

@@ -29,10 +29,10 @@ bool tribus_thread_init()
 {
   sph_jh512_init( &tribus_ctx.jh );
   sph_keccak512_init( &tribus_ctx.keccak );
-#ifdef NO_AES_NI
-   sph_echo512_init( &tribus_ctx.echo );
-#else
+#ifdef __AES__
   init_echo( &tribus_ctx.echo, 512 );
+#else
+   sph_echo512_init( &tribus_ctx.echo );
 #endif
  return true;
 }
@@ -49,12 +49,12 @@ void tribus_hash(void *state, const void *input)
     sph_keccak512( &ctx.keccak, (const void*) hash, 64 );
     sph_keccak512_close( &ctx.keccak, (void*) hash );

-#ifdef NO_AES_NI
-     sph_echo512( &ctx.echo, hash, 64 );
-     sph_echo512_close (&ctx.echo, hash );
-#else
+#ifdef __AES__
     update_final_echo( &ctx.echo, (BitSequence *) hash,
                        (const BitSequence *) hash, 512 );
+#else
+     sph_echo512( &ctx.echo, hash, 64 );
+     sph_echo512_close (&ctx.echo, hash );
 #endif

     memcpy(state, hash, 32);
--- a/algo/x11/x11-4way.c
+++ b/algo/x11/x11-4way.c
@@ -275,7 +275,7 @@ int scanhash_x11_8way( struct work *work, uint32_t max_nonce,
         pdata[19] = n;

         for ( int i = 0; i < 8; i++ )
-         if ( ( hash+(i<<3) )[7] < Htarg
+         if ( ( hash+(i<<3) )[7] <= Htarg
              && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
         {
             pdata[19] = n+i;
--- a/algo/x11/x11evo.c
+++ b/algo/x11/x11evo.c
@@ -1,13 +1,10 @@
 #include "cpuminer-config.h"
 #include "x11evo-gate.h"
-
 #include <string.h>
 #include <stdint.h>
 #include <compat/portable_endian.h>
-
 #include "algo/blake/sph_blake.h"
 #include "algo/bmw/sph_bmw.h"
-#include "algo/groestl/sph_groestl.h"
 #include "algo/jh/sph_jh.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
@@ -15,24 +12,24 @@
 #include "algo/cubehash/sph_cubehash.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/sph_simd.h"
-#include "algo/echo/sph_echo.h"
-
-#ifndef NO_AES_NI
+#ifdef __AES__
  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/echo/aes_ni/hash_api.h"
+#else
+  #include "algo/groestl/sph_groestl.h"
+  #include "algo/echo/sph_echo.h"
 #endif
-
 #include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/nist.h"

 typedef struct {
-#ifdef NO_AES_NI
-    sph_groestl512_context  groestl;
-    sph_echo512_context     echo;
-#else
+#ifdef __AES__
    hashState_echo          echo;
    hashState_groestl       groestl;
+#else
+    sph_groestl512_context  groestl;
+    sph_echo512_context     echo;
 #endif
    hashState_luffa         luffa;
    cubehashParam           cube;
@@ -49,12 +46,12 @@ static x11evo_ctx_holder x11evo_ctx __attribute__ ((aligned (64)));

 void init_x11evo_ctx()
 {
-#ifdef NO_AES_NI
-     sph_groestl512_init( &x11evo_ctx.groestl );
-     sph_echo512_init( &x11evo_ctx.echo );
-#else
+#ifdef __AES__
     init_echo( &x11evo_ctx.echo, 512 );
     init_groestl( &x11evo_ctx.groestl, 64 );
+#else
+     sph_groestl512_init( &x11evo_ctx.groestl );
+     sph_echo512_init( &x11evo_ctx.echo );
 #endif
     init_luffa( &x11evo_ctx.luffa, 512 );
     cubehashInit( &x11evo_ctx.cube, 512, 16, 32 );
@@ -106,12 +103,12 @@ void x11evo_hash( void *state, const void *input )
 	      sph_bmw512_close( &ctx.bmw, (char*)hash );
 	      break;
 	   case 2:
-#ifdef NO_AES_NI
+#ifdef __AES__
+         update_and_final_groestl( &ctx.groestl, (char*)hash,
+                                        (const char*)hash, 512 );
+#else
 	      sph_groestl512( &ctx.groestl, (char*)hash, size );
 	      sph_groestl512_close( &ctx.groestl, (char*)hash );
-#else
-              update_and_final_groestl( &ctx.groestl, (char*)hash,
-                                        (const char*)hash, 512 );
 #endif
 	      break;
 	    case 3:
@@ -142,12 +139,12 @@ void x11evo_hash( void *state, const void *input )
              update_final_sd( &ctx.simd, (char*)hash, (const char*)hash, 512 );
 	      break;
 	    case 10:
-#ifdef NO_AES_NI
+#ifdef __AES__
+         update_final_echo( &ctx.echo, (char*)hash,
+                                 (const char*)hash, 512 );
+#else
 	      sph_echo512( &ctx.echo, (char*)hash, size );
 	      sph_echo512_close( &ctx.echo, (char*)hash );
-#else
-              update_final_echo( &ctx.echo, (char*)hash,
-                                 (const char*)hash, 512 );
 #endif
 	      break;
 	}
--- a/algo/x11/x11gost-4way.c
+++ b/algo/x11/x11gost-4way.c
@@ -308,7 +308,7 @@ int scanhash_x11gost_8way( struct work *work, uint32_t max_nonce,
         pdata[19] = n;

         for ( int i = 0; i < 8; i++ )
-         if ( ( hash+(i<<3) )[7] < Htarg 
+         if ( ( hash+(i<<3) )[7] <= Htarg 
              && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
         {
             pdata[19] = n+i;
@@ -482,37 +482,26 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
     int thr_id = mythr->id;
     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
     const uint32_t Htarg = ptarget[7];
-     uint64_t htmax[] = {          0,        0xF,       0xFF,
-                               0xFFF,     0xFFFF, 0x10000000  };
-     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
-                          0xFFFFF000, 0xFFFF0000,          0  };

     mm256_bswap32_intrlv80_4x64( vdata, pdata );

-     for (int m=0; m < 6; m++)
-       if (Htarg <= htmax[m])
-       {
-         uint32_t mask = masks[m];
-         do
-         {
-           *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
+     do
+     {
+        *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+              _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

-            x11gost_4way_hash( hash, vdata );
-            pdata[19] = n;
-
-            for ( int i = 0; i < 4; i++ )
-            if ( ( ( (hash+(i<<3))[7] & mask ) == 0 )
-                 && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
-            {
-               pdata[19] = n+i;
-               submit_lane_solution( work, hash+(i<<3), mythr, i );
-            }
-            n += 4;
-         } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
-         break;
-       }
+        x11gost_4way_hash( hash, vdata );
+        pdata[19] = n;

+        for ( int i = 0; i < 4; i++ )
+        if ( ( hash+(i<<3) )[7] <= Htarg
+             && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
+        {
+           pdata[19] = n+i;
+           submit_lane_solution( work, hash+(i<<3), mythr, i );
+        }
+        n += 4;
+     } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
     *hashes_done = n - first_nonce + 1;
     return 0;
 }
--- a/algo/x11/x11gost.c
+++ b/algo/x11/x11gost.c
@@ -114,12 +114,12 @@ void x11gost_hash(void *output, const void *input)
    update_final_sd( &ctx.simd, (BitSequence *)hash,
                          (const BitSequence *)hash, 512 );

-#ifdef NO_AES_NI
-     sph_echo512(&ctx.echo, hash, 64);
-     sph_echo512_close(&ctx.echo, hash);
-#else
+#if defined(__AES__)
     update_final_echo ( &ctx.echo, (BitSequence *)hash,
                         (const BitSequence *)hash, 512 );
+#else
+     sph_echo512(&ctx.echo, hash, 64);
+     sph_echo512_close(&ctx.echo, hash);
 #endif

     memcpy( output, hash, 32 );
--- a/algo/x12/x12-4way.c
+++ b/algo/x12/x12-4way.c
@@ -24,7 +24,6 @@

 #if defined(X12_8WAY)

-
 typedef struct {
    blake512_8way_context   blake;
    bmw512_8way_context     bmw;
@@ -96,7 +95,6 @@ void x12_8way_hash( void *state, const void *input )

 #if defined(__VAES__)

-     shavite512_4way_init( &ctx.shavite );
     shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
     shavite512_4way_init( &ctx.shavite );
     shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
@@ -151,19 +149,16 @@ void x12_8way_hash( void *state, const void *input )

 #endif

-     simd_4way_init( &ctx.simd, 512 );
     simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 );
     simd_4way_init( &ctx.simd, 512 );
     simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 );

 #if defined(__VAES__)

-     echo_4way_init( &ctx.echo, 512 );
     echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 );
     echo_4way_init( &ctx.echo, 512 );
     echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 );

-     groestl512_4way_init( &ctx.groestl, 64 );
     groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 );
     groestl512_4way_init( &ctx.groestl, 64 );
     groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 );
@@ -174,7 +169,7 @@ void x12_8way_hash( void *state, const void *input )

     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
-     
+
     update_final_echo( &ctx.echo, (BitSequence *)hash1,
                       (const BitSequence *) hash1, 512 );
     memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
--- a/algo/x13/phi1612.c
+++ b/algo/x13/phi1612.c
@@ -1,18 +1,15 @@
 #include "phi1612-gate.h"
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-
 #include "algo/gost/sph_gost.h"
 #include "algo/echo/sph_echo.h"
 #include "algo/fugue/sph_fugue.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/skein/sph_skein.h"
 #include "algo/jh/sph_jh.h"
-
-#ifndef NO_AES_NI
+#ifdef __AES__
  #include "algo/echo/aes_ni/hash_api.h"
 #endif

@@ -22,10 +19,10 @@ typedef struct {
     cubehashParam           cube;
     sph_fugue512_context    fugue;
     sph_gost512_context     gost;
-#ifdef NO_AES_NI
-     sph_echo512_context     echo;
-#else
+#ifdef __AES__
     hashState_echo          echo;
+#else
+     sph_echo512_context     echo;
 #endif
 } phi_ctx_holder;

@@ -40,10 +37,10 @@ void init_phi1612_ctx()
     cubehashInit( &phi_ctx.cube, 512, 16, 32 );
     sph_fugue512_init( &phi_ctx.fugue );
     sph_gost512_init( &phi_ctx.gost );
-#ifdef NO_AES_NI
-     sph_echo512_init( &phi_ctx.echo );
-#else
+#ifdef __AES__
     init_echo( &phi_ctx.echo, 512 );
+#else
+     sph_echo512_init( &phi_ctx.echo );
 #endif
 }

@@ -64,9 +61,6 @@ void phi1612_hash(void *output, const void *input)
     sph_skein512( &ctx.skein, input + 64, 16 );
     sph_skein512_close( &ctx.skein, hash );

-//     sph_skein512( &ctx.skein, input, 80 );
-//     sph_skein512_close( &ctx.skein, (void*)hash );
-
     sph_jh512( &ctx.jh, (const void*)hash, 64 );
     sph_jh512_close( &ctx.jh, (void*)hash );

@@ -78,12 +72,12 @@ void phi1612_hash(void *output, const void *input)
     sph_gost512( &ctx.gost, hash, 64 );
     sph_gost512_close( &ctx.gost, hash );

-#ifdef NO_AES_NI
-     sph_echo512( &ctx.echo, hash, 64 );
-     sph_echo512_close( &ctx.echo, hash );
-#else
+#ifdef __AES__
     update_final_echo ( &ctx.echo, (BitSequence *)hash,
                         (const BitSequence *)hash, 512 );
+#else
+     sph_echo512( &ctx.echo, hash, 64 );
+     sph_echo512_close( &ctx.echo, hash );
 #endif

     memcpy(output, hash, 32);
--- a/algo/x13/x13-4way.c
+++ b/algo/x13/x13-4way.c
@@ -315,7 +315,7 @@ int scanhash_x13_8way( struct work *work, uint32_t max_nonce,
         pdata[19] = n;

         for ( int i = 0; i < 8; i++ )
-         if ( ( hash+(i<<3) )[7] < Htarg
+         if ( ( hash+(i<<3) )[7] <= Htarg
              && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
         {
             pdata[19] = n+i;
--- a/algo/x13/x13sm3.c
+++ b/algo/x13/x13sm3.c
@@ -121,12 +121,12 @@ void x13sm3_hash(void *output, const void *input)
                         (const BitSequence *)hash, 512 );

        //11---echo---
-#ifdef NO_AES_NI
-        sph_echo512(&ctx.echo, hash, 64);
-        sph_echo512_close(&ctx.echo, hash);
-#else
+#ifdef __AES__
        update_final_echo ( &ctx.echo, (BitSequence *)hash,
                            (const BitSequence *)hash, 512 );
+#else
+        sph_echo512(&ctx.echo, hash, 64);
+        sph_echo512_close(&ctx.echo, hash);
 #endif

        uint32_t sm3_hash[32] __attribute__ ((aligned (32)));
--- a/algo/x14/polytimos.c
+++ b/algo/x14/polytimos.c
@@ -1,29 +1,27 @@
 #include "polytimos-gate.h"
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-
 #include "algo/skein/sph_skein.h"
 #include "algo/echo/sph_echo.h"
 #include "algo/fugue//sph_fugue.h"
 #include "algo/luffa/luffa_for_sse2.h"
 #include "algo/shabal/sph_shabal.h"
 #include "algo/gost/sph_gost.h"
-#ifndef NO_AES_NI
+#ifdef __AES__
  #include "algo/echo/aes_ni/hash_api.h"
 #endif

 typedef struct {
 	sph_skein512_context    skein;
-        sph_shabal512_context   shabal;
-#ifdef NO_AES_NI
-	sph_echo512_context		echo;
+   sph_shabal512_context   shabal;
+#ifdef __AES__
+   hashState_echo          echo;
 #else
-        hashState_echo          echo;
+	sph_echo512_context		echo;
 #endif
-        hashState_luffa         luffa;
+   hashState_luffa         luffa;
 	sph_fugue512_context    fugue;
 	sph_gost512_context     gost;
 } poly_ctx_holder;
@@ -33,15 +31,15 @@ poly_ctx_holder poly_ctx;
 void init_polytimos_ctx()
 {
 	sph_skein512_init(&poly_ctx.skein);
-        sph_shabal512_init(&poly_ctx.shabal);
-#ifdef NO_AES_NI
-        sph_echo512_init(&poly_ctx.echo);
+   sph_shabal512_init(&poly_ctx.shabal);
+#ifdef __AES__
+   init_echo( &poly_ctx.echo, 512 );
 #else
-        init_echo( &poly_ctx.echo, 512 );
+   sph_echo512_init(&poly_ctx.echo);
 #endif
-        init_luffa( &poly_ctx.luffa, 512 );
-        sph_fugue512_init(&poly_ctx.fugue);
-        sph_gost512_init(&poly_ctx.gost);
+   init_luffa( &poly_ctx.luffa, 512 );
+   sph_fugue512_init(&poly_ctx.fugue);
+   sph_gost512_init(&poly_ctx.gost);
 }

 void polytimos_hash(void *output, const void *input)
@@ -56,12 +54,12 @@ void polytimos_hash(void *output, const void *input)
 	sph_shabal512(&ctx.shabal, hashA, 64);
 	sph_shabal512_close(&ctx.shabal, hashA);

-#ifdef NO_AES_NI
+#ifdef __AES__
+    update_final_echo ( &ctx.echo, (BitSequence *)hashA,
+                             (const BitSequence *)hashA, 512 );
+#else
 	sph_echo512(&ctx.echo, hashA, 64);
 	sph_echo512_close(&ctx.echo, hashA);
-#else
-        update_final_echo ( &ctx.echo, (BitSequence *)hashA,
-                            (const BitSequence *)hashA, 512 );
 #endif

        update_and_final_luffa( &ctx.luffa, (BitSequence*)hashA,
--- a/algo/x14/x14-4way.c
+++ b/algo/x14/x14-4way.c
@@ -317,7 +317,7 @@ int scanhash_x14_8way( struct work *work, uint32_t max_nonce,

        uint32_t *hash7 = &(hash[7<<3]);
        for ( int lane = 0; lane < 8; lane++ )
-        if ( hash7[ lane ] < Htarg )
+        if ( hash7[ lane ] <= Htarg )
        {
            uint32_t lane_hash[8] __attribute__ ((aligned (64)));
            extr_lane_8x32( lane_hash, hash, lane, 256 );
@@ -526,7 +526,7 @@ int scanhash_x14_4way( struct work *work, uint32_t max_nonce,

       uint32_t *hash7 = &(hash[7<<2]);
       for ( int lane = 0; lane < 4; lane++ )
-       if ( hash7[ lane ] < Htarg )
+       if ( hash7[ lane ] <= Htarg )
       {
           uint32_t lane_hash[8];
           extr_lane_4x32( lane_hash, hash, lane, 256 );
--- a/algo/x14/x14.c
+++ b/algo/x14/x14.c
@@ -88,11 +88,9 @@ void x14hash(void *output, const void *input)
    sph_bmw512_close( &ctx.bmw, hash );

 #if defined(__AES__)
-    init_groestl( &ctx.groestl, 64 );
    update_and_final_groestl( &ctx.groestl, (char*)hash,
                                      (const char*)hash, 512 );
 #else
-    sph_groestl512_init( &ctx.groestl );
    sph_groestl512( &ctx.groestl, hash, 64 );
    sph_groestl512_close( &ctx.groestl, hash );
 #endif
@@ -118,12 +116,12 @@ void x14hash(void *output, const void *input)
    update_final_sd( &ctx.simd, (BitSequence *)hash,
                          (const BitSequence *)hash, 512 );

-#ifdef NO_AES_NI
-    sph_echo512(&ctx.echo, hash, 64);
-    sph_echo512_close(&ctx.echo, hash);
-#else
+#if defined(__AES__)
    update_final_echo ( &ctx.echo, (BitSequence *)hash,
                             (const BitSequence *)hash, 512 );
+#else
+    sph_echo512(&ctx.echo, hash, 64);
+    sph_echo512_close(&ctx.echo, hash);
 #endif

    sph_hamsi512(&ctx.hamsi, hash, 64);
--- a/algo/x15/x15-4way.c
+++ b/algo/x15/x15-4way.c
@@ -360,7 +360,7 @@ int scanhash_x15_8way( struct work *work, uint32_t max_nonce,
        pdata[19] = n;

        for ( int i = 0; i < 8; i++ )
-        if ( ( hash+(i<<3) )[7] < Htarg )
+        if ( ( hash+(i<<3) )[7] <= Htarg )
        if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
        {
           pdata[19] = n+i;
@@ -588,7 +588,7 @@ int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
         pdata[19] = n;

         for ( int i = 0; i < 4; i++ )
-         if ( ( hash+(i<<3) )[7] < Htarg )
+         if ( ( hash+(i<<3) )[7] <= Htarg )
         if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
         {
            pdata[19] = n+i;
--- a/algo/x22/x22i-4way.c
+++ b/algo/x22/x22i-4way.c
@@ -398,16 +398,16 @@ void x22i_8way_hash( void *output, const void *input )
   memset( hash7, 0, 64 );

   intrlv_2x256( vhash, hashA0, hashA1, 256 );
-   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
+   LYRA2X_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
   dintrlv_2x256( hash0, hash1, vhash, 256 );
   intrlv_2x256( vhash, hashA2, hashA3, 256 );
-   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
+   LYRA2X_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
   dintrlv_2x256( hash2, hash3, vhash, 256 );
   intrlv_2x256( vhash, hashA4, hashA5, 256 );
-   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
+   LYRA2X_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
   dintrlv_2x256( hash4, hash5, vhash, 256 );
   intrlv_2x256( vhash, hashA6, hashA7, 256 );
-   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
+   LYRA2X_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
   dintrlv_2x256( hash6, hash7, vhash, 256 );

   sph_gost512_init( &ctx.gost );
--- a/algo/x22/x25x-4way.c
+++ b/algo/x22/x25x-4way.c
@@ -408,16 +408,16 @@ void x25x_8way_hash( void *output, const void *input )
   sph_tiger_close(&ctx.tiger, (void*) hash7[18]);

   intrlv_2x256( vhash, hash0[18], hash1[18], 256 );
-   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
+   LYRA2X_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
   dintrlv_2x256( hash0[19], hash1[19], vhash, 256 );
   intrlv_2x256( vhash, hash2[18], hash3[18], 256 );
-   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
+   LYRA2X_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
   dintrlv_2x256( hash2[19], hash3[19], vhash, 256 );
   intrlv_2x256( vhash, hash4[18], hash5[18], 256 );
-   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
+   LYRA2X_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
   dintrlv_2x256( hash4[19], hash5[19], vhash, 256 );
   intrlv_2x256( vhash, hash6[18], hash7[18], 256 );
-   LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
+   LYRA2X_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
   dintrlv_2x256( hash6[19], hash7[19], vhash, 256 );

 	sph_gost512_init(&ctx.gost);
--- a/algo/yescrypt/yescrypt.c
+++ b/algo/yescrypt/yescrypt.c
@@ -401,7 +401,7 @@ int scanhash_yescrypt( struct work *work, uint32_t max_nonce,
        do {
           be32enc(&endiandata[19], n);
           yescrypt_hash((char*) endiandata, (char*) vhash, 80);
-           if (vhash[7] < Htarg && fulltest(vhash, ptarget ) 
+           if (vhash[7] <= Htarg && fulltest(vhash, ptarget ) 
               && !opt_benchmark )
           {
               pdata[19] = n;
--- a/algo/yespower/yespower-blake2b.c
+++ b/algo/yespower/yespower-blake2b.c
@@ -615,7 +615,7 @@ static volatile uint64_t Smask2var = Smask2;
 /* 64-bit without AVX.  This relies on out-of-order execution and register
 * renaming.  It may actually be fastest on CPUs with AVX(2) as well - e.g.,
 * it runs great on Haswell. */
-#warning "Note: using x86-64 inline assembly for pwxform.  That's great."
+//#warning "Note: using x86-64 inline assembly for pwxform.  That's great."
 #undef MAYBE_MEMORY_BARRIER
 #define MAYBE_MEMORY_BARRIER \
    __asm__("" : : : "memory");
--- a/algo/yespower/yespower-gate.c
+++ b/algo/yespower/yespower-gate.c
@@ -55,7 +55,7 @@ int scanhash_yespower( struct work *work, uint32_t max_nonce,
        do {
           be32enc(&endiandata[19], n);
           yespower_hash((char*) endiandata, (char*) vhash, 80);
-           if ( vhash[7] < Htarg && fulltest( vhash, ptarget )
+           if ( vhash[7] <= Htarg && fulltest( vhash, ptarget )
              && !opt_benchmark )
           {
               pdata[19] = n;
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.11.2.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.11.3.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.11.2'
-PACKAGE_STRING='cpuminer-opt 3.11.2'
+PACKAGE_VERSION='3.11.3'
+PACKAGE_STRING='cpuminer-opt 3.11.3'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.11.2 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.11.3 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1404,7 +1404,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.11.2:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.11.3:";;
   esac
  cat <<\_ACEOF

@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.11.2
+cpuminer-opt configure 3.11.3
 generated by GNU Autoconf 2.69

 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 3.11.2, which was
+It was created by cpuminer-opt $as_me 3.11.3, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  $ $0 $@
@@ -2993,7 +2993,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.11.2'
+ VERSION='3.11.3'


 cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.11.2, which was
+This file was extended by cpuminer-opt $as_me 3.11.3, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.11.2
+cpuminer-opt config.status 3.11.3
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"

--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.11.2])
+AC_INIT([cpuminer-opt], [3.11.3])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/miner.h
+++ b/miner.h
@@ -6,18 +6,6 @@
 #define USER_AGENT PACKAGE_NAME "/" PACKAGE_VERSION
 #define MAX_CPUS 16

-//#ifndef NO_AES_NI
- #ifndef __AES__
-  #define NO_AES_NI
- #endif
-//#endif
-
-//#if defined(FOUR_WAY) && defined(__AVX2__)
-// keep this until all algos remove reference to HASH_4WAY
-//#if defined(__AVX2__)
-//  #define HASH_4WAY
-//#endif
-
 #ifdef _MSC_VER

 #undef USE_ASM  /* to fix */
--- a/simd-utils/intrlv.h
+++ b/simd-utils/intrlv.h
@@ -2857,6 +2857,8 @@ static inline void rintrlv_2x256_8x64( void *dst, const void *src0,
 //#define mm256_intrlv_blend_64( hi, lo )   _mm256_blend_epi32( hi, lo, 0x33 )
 #define mm256_intrlv_blend_32( hi, lo )   _mm256_blend_epi32( hi, lo, 0x55 )

+// TODO: change to _mm256_blend_epi32
+//
 // Select lanes of 32 byte hash from 2 sources according to control mask.
 // macro due to 256 bit value arg.
 #define mm256_blend_hash_4x64( dst, a, b, mask ) \