Compare commits

...

1 Commits

Author SHA1 Message Date
Jay D Dee
45ecd0de14 v3.15.2 2020-11-15 17:57:06 -05:00
42 changed files with 2490 additions and 445 deletions

View File

@@ -1,6 +1,10 @@
This file is included in the Windows binary package. Compile instructions This file is included in the Windows binary package. Compile instructions
for Linux and Windows can be found in RELEASE_NOTES. for Linux and Windows can be found in RELEASE_NOTES.
This package is officially available only from:
https://github.com/JayDDee/cpuminer-opt
No other sources should be trusted.
cpuminer is a console program that is executed from a DOS or Powershell cpuminer is a console program that is executed from a DOS or Powershell
prompt. There is no GUI and no mouse support. prompt. There is no GUI and no mouse support.
@@ -31,20 +35,22 @@ https://en.wikipedia.org/wiki/List_of_Intel_CPU_microarchitectures
https://en.wikipedia.org/wiki/List_of_AMD_CPU_microarchitectures https://en.wikipedia.org/wiki/List_of_AMD_CPU_microarchitectures
Exe file name Compile flags Arch name Exe file name Compile flags Arch name
cpuminer-sse2.exe "-msse2" Core2, Nehalem cpuminer-sse2.exe "-msse2" Core2, Nehalem
cpuminer-aes-sse42.exe "-march=westmere" Westmere cpuminer-aes-sse42.exe "-march=westmere" Westmere
cpuminer-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge cpuminer-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge
cpuminer-avx2.exe "-march=core-avx2 -maes" Haswell* cpuminer-avx2.exe "-march=core-avx2 -maes" Haswell(1)
cpuminer-avx512.exe "-march=skylake-avx512" Skylake-X, Cascadelake-X cpuminer-avx512.exe "-march=skylake-avx512" Skylake-X, Cascadelake-X
cpuminer-zen.exe "-march=znver1" AMD Ryzen, Threadripper cpuminer-zen.exe "-march=znver1" Zen1, Zen2
cpuminer-avx512-sha-vaes.exe "-march=icelake-client" Icelake* cpuminer-zen3.exe "-march=znver2 -mvaes" Zen3(2)
cpuminer-avx512-sha-vaes.exe "-march=icelake-client" Icelake(3)
* Haswell includes Broadwell, Skylake, Kabylake, Coffeelake & Cometlake. (1) Haswell includes Broadwell, Skylake, Kabylake, Coffeelake & Cometlake.
Icelake is only available on some laptops. Mining with a laptop is not (2) Zen3 build uses Zen2+VAES as workaround until Zen3 compiler support is
recommended. The icelake build is included in anticipation of Intel eventually available. Zen2 CPUs should use Zen build.
releasing a desktop CPU with a microarchitecture newer than Skylake. (3) Icelake is only available on some laptops. Mining with a laptop is not
recommended.
Notes about included DLL files: Notes about included DLL files:

View File

@@ -65,6 +65,12 @@ If not what makes it happen or not happen?
Change Log Change Log
---------- ----------
v3.15.2
Zen3 AVX2+VAES optimization for x16*, x17, sonoa, xevan, x21s, x22i, x25x,
allium.
Zen3 build added to Windows binary package.
v3.15.1 v3.15.1
Fix compile on AMD Zen3 CPUs with VAES. Fix compile on AMD Zen3 CPUs with VAES.

View File

@@ -90,10 +90,11 @@ typedef uint32_t set_t;
#define AES_OPT 2 #define AES_OPT 2
#define SSE42_OPT 4 #define SSE42_OPT 4
#define AVX_OPT 8 // Sandybridge #define AVX_OPT 8 // Sandybridge
#define AVX2_OPT 0x10 // Haswell #define AVX2_OPT 0x10 // Haswell, Zen1
#define SHA_OPT 0x20 // sha256 (Ryzen, Ice Lake) #define SHA_OPT 0x20 // Zen1, Icelake (sha256)
#define AVX512_OPT 0x40 // AVX512- F, VL, DQ, BW (Skylake-X) #define AVX512_OPT 0x40 // Skylake-X (AVX512[F,VL,DQ,BW])
#define VAES_OPT 0x80 // VAES (Ice Lake) #define VAES_OPT 0x80 // Icelake (VAES & AVX512)
#define VAES256_OPT 0x100 // Zen3 (VAES without AVX512)
// return set containing all elements from sets a & b // return set containing all elements from sets a & b
@@ -111,9 +112,9 @@ inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }
typedef struct typedef struct
{ {
// Mandatory functions, one of these is mandatory. If a generic scanhash // Mandatory functions, one of these is mandatory. If a generic scanhash
// is used a custom hash function must be registered, with a custom scanhash // is used a custom target hash function must be registered, with a custom
// the custom hash function can be called directly and doesn't need to be // scanhash the target hash function can be called directly and doesn't need
// registered in the gate. // to be registered in the gate.
int ( *scanhash ) ( struct work*, uint32_t, uint64_t*, struct thr_info* ); int ( *scanhash ) ( struct work*, uint32_t, uint64_t*, struct thr_info* );
int ( *hash ) ( void*, const void*, int ); int ( *hash ) ( void*, const void*, int );

View File

@@ -1,5 +1,4 @@
//#if 0 #if defined(__VAES__)
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#include "simd-utils.h" #include "simd-utils.h"
#include "echo-hash-4way.h" #include "echo-hash-4way.h"
@@ -13,8 +12,12 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
*/ */
// do these need to be reversed? // do these need to be reversed?
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define mul2mask \ #define mul2mask \
_mm512_set4_epi32( 0, 0, 0, 0x00001b00 ) m512_const2_64( 0, 0x00001b00 )
//_mm512_set4_epi32( 0, 0, 0, 0x00001b00 )
// _mm512_set4_epi32( 0x00001b00, 0, 0, 0 ) // _mm512_set4_epi32( 0x00001b00, 0, 0, 0 )
#define lsbmask m512_const1_32( 0x01010101 ) #define lsbmask m512_const1_32( 0x01010101 )
@@ -30,87 +33,87 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
const int j2 = ( (j)+2 ) & 3; \ const int j2 = ( (j)+2 ) & 3; \
const int j3 = ( (j)+3 ) & 3; \ const int j3 = ( (j)+3 ) & 3; \
s2 = _mm512_add_epi8( state1[ 0 ] [j ], state1[ 0 ][ j ] ); \ s2 = _mm512_add_epi8( state1[ 0 ] [j ], state1[ 0 ][ j ] ); \
t1 = _mm512_srli_epi16( state1[ 0 ][ j ], 7 ); \ t1 = _mm512_srli_epi16( state1[ 0 ][ j ], 7 ); \
t1 = _mm512_and_si512( t1, lsbmask );\ t1 = _mm512_and_si512( t1, lsbmask );\
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \ t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
s2 = _mm512_xor_si512( s2, t2 ); \ s2 = _mm512_xor_si512( s2, t2 ); \
state2[ 0 ] [j ] = s2; \ state2[ 0 ] [j ] = s2; \
state2[ 1 ] [j ] = state1[ 0 ][ j ]; \ state2[ 1 ] [j ] = state1[ 0 ][ j ]; \
state2[ 2 ] [j ] = state1[ 0 ][ j ]; \ state2[ 2 ] [j ] = state1[ 0 ][ j ]; \
state2[ 3 ] [j ] = _mm512_xor_si512( s2, state1[ 0 ][ j ] );\ state2[ 3 ] [j ] = _mm512_xor_si512( s2, state1[ 0 ][ j ] );\
s2 = _mm512_add_epi8( state1[ 1 ][ j1 ], state1[ 1 ][ j1 ] ); \ s2 = _mm512_add_epi8( state1[ 1 ][ j1 ], state1[ 1 ][ j1 ] ); \
t1 = _mm512_srli_epi16( state1[ 1 ][ j1 ], 7 ); \ t1 = _mm512_srli_epi16( state1[ 1 ][ j1 ], 7 ); \
t1 = _mm512_and_si512( t1, lsbmask ); \ t1 = _mm512_and_si512( t1, lsbmask ); \
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \ t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
s2 = _mm512_xor_si512( s2, t2 );\ s2 = _mm512_xor_si512( s2, t2 );\
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], \ state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], \
_mm512_xor_si512( s2, state1[ 1 ][ j1 ] ) ); \ _mm512_xor_si512( s2, state1[ 1 ][ j1 ] ) ); \
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], s2 ); \ state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], s2 ); \
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \ state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \ state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \
s2 = _mm512_add_epi8( state1[ 2 ][ j2 ], state1[ 2 ][ j2 ] ); \ s2 = _mm512_add_epi8( state1[ 2 ][ j2 ], state1[ 2 ][ j2 ] ); \
t1 = _mm512_srli_epi16( state1[ 2 ][ j2 ], 7 ); \ t1 = _mm512_srli_epi16( state1[ 2 ][ j2 ], 7 ); \
t1 = _mm512_and_si512( t1, lsbmask ); \ t1 = _mm512_and_si512( t1, lsbmask ); \
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \ t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
s2 = _mm512_xor_si512( s2, t2 ); \ s2 = _mm512_xor_si512( s2, t2 ); \
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \ state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \ state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \
_mm512_xor_si512( s2, state1[ 2 ][ j2 ] ) ); \ _mm512_xor_si512( s2, state1[ 2 ][ j2 ] ) ); \
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \ state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \ state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \
s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \ s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \
t1 = _mm512_srli_epi16( state1[ 3 ][ j3 ], 7 ); \ t1 = _mm512_srli_epi16( state1[ 3 ][ j3 ], 7 ); \
t1 = _mm512_and_si512( t1, lsbmask ); \ t1 = _mm512_and_si512( t1, lsbmask ); \
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \ t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
s2 = _mm512_xor_si512( s2, t2 ); \ s2 = _mm512_xor_si512( s2, t2 ); \
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \ state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \ state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \ state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \
_mm512_xor_si512( s2, state1[ 3 ][ j3] ) ); \ _mm512_xor_si512( s2, state1[ 3 ][ j3] ) ); \
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \ state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \
} while(0) } while(0)
#define ECHO_ROUND_UNROLL2 \ #define ECHO_ROUND_UNROLL2 \
ECHO_SUBBYTES(_state, 0, 0);\ ECHO_SUBBYTES(_state, 0, 0);\
ECHO_SUBBYTES(_state, 1, 0);\ ECHO_SUBBYTES(_state, 1, 0);\
ECHO_SUBBYTES(_state, 2, 0);\ ECHO_SUBBYTES(_state, 2, 0);\
ECHO_SUBBYTES(_state, 3, 0);\ ECHO_SUBBYTES(_state, 3, 0);\
ECHO_SUBBYTES(_state, 0, 1);\ ECHO_SUBBYTES(_state, 0, 1);\
ECHO_SUBBYTES(_state, 1, 1);\ ECHO_SUBBYTES(_state, 1, 1);\
ECHO_SUBBYTES(_state, 2, 1);\ ECHO_SUBBYTES(_state, 2, 1);\
ECHO_SUBBYTES(_state, 3, 1);\ ECHO_SUBBYTES(_state, 3, 1);\
ECHO_SUBBYTES(_state, 0, 2);\ ECHO_SUBBYTES(_state, 0, 2);\
ECHO_SUBBYTES(_state, 1, 2);\ ECHO_SUBBYTES(_state, 1, 2);\
ECHO_SUBBYTES(_state, 2, 2);\ ECHO_SUBBYTES(_state, 2, 2);\
ECHO_SUBBYTES(_state, 3, 2);\ ECHO_SUBBYTES(_state, 3, 2);\
ECHO_SUBBYTES(_state, 0, 3);\ ECHO_SUBBYTES(_state, 0, 3);\
ECHO_SUBBYTES(_state, 1, 3);\ ECHO_SUBBYTES(_state, 1, 3);\
ECHO_SUBBYTES(_state, 2, 3);\ ECHO_SUBBYTES(_state, 2, 3);\
ECHO_SUBBYTES(_state, 3, 3);\ ECHO_SUBBYTES(_state, 3, 3);\
ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\ ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\ ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\ ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\ ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
ECHO_SUBBYTES(_state2, 0, 0);\ ECHO_SUBBYTES(_state2, 0, 0);\
ECHO_SUBBYTES(_state2, 1, 0);\ ECHO_SUBBYTES(_state2, 1, 0);\
ECHO_SUBBYTES(_state2, 2, 0);\ ECHO_SUBBYTES(_state2, 2, 0);\
ECHO_SUBBYTES(_state2, 3, 0);\ ECHO_SUBBYTES(_state2, 3, 0);\
ECHO_SUBBYTES(_state2, 0, 1);\ ECHO_SUBBYTES(_state2, 0, 1);\
ECHO_SUBBYTES(_state2, 1, 1);\ ECHO_SUBBYTES(_state2, 1, 1);\
ECHO_SUBBYTES(_state2, 2, 1);\ ECHO_SUBBYTES(_state2, 2, 1);\
ECHO_SUBBYTES(_state2, 3, 1);\ ECHO_SUBBYTES(_state2, 3, 1);\
ECHO_SUBBYTES(_state2, 0, 2);\ ECHO_SUBBYTES(_state2, 0, 2);\
ECHO_SUBBYTES(_state2, 1, 2);\ ECHO_SUBBYTES(_state2, 1, 2);\
ECHO_SUBBYTES(_state2, 2, 2);\ ECHO_SUBBYTES(_state2, 2, 2);\
ECHO_SUBBYTES(_state2, 3, 2);\ ECHO_SUBBYTES(_state2, 3, 2);\
ECHO_SUBBYTES(_state2, 0, 3);\ ECHO_SUBBYTES(_state2, 0, 3);\
ECHO_SUBBYTES(_state2, 1, 3);\ ECHO_SUBBYTES(_state2, 1, 3);\
ECHO_SUBBYTES(_state2, 2, 3);\ ECHO_SUBBYTES(_state2, 2, 3);\
ECHO_SUBBYTES(_state2, 3, 3);\ ECHO_SUBBYTES(_state2, 3, 3);\
ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\ ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\ ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\ ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2) ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
#define SAVESTATE(dst, src)\ #define SAVESTATE(dst, src)\
dst[0][0] = src[0][0];\ dst[0][0] = src[0][0];\
@@ -224,43 +227,43 @@ void echo_4way_compress( echo_4way_context *ctx, const __m512i *pmsg,
int echo_4way_init( echo_4way_context *ctx, int nHashSize ) int echo_4way_init( echo_4way_context *ctx, int nHashSize )
{ {
int i, j; int i, j;
ctx->k = m512_zero; ctx->k = m512_zero;
ctx->processed_bits = 0; ctx->processed_bits = 0;
ctx->uBufferBytes = 0; ctx->uBufferBytes = 0;
switch( nHashSize ) switch( nHashSize )
{ {
case 256: case 256:
ctx->uHashSize = 256; ctx->uHashSize = 256;
ctx->uBlockLength = 192; ctx->uBlockLength = 192;
ctx->uRounds = 8; ctx->uRounds = 8;
ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x100 ); ctx->hashsize = m512_const2_64( 0, 0x100 );
ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x600 ); ctx->const1536 = m512_const2_64( 0, 0x600 );
break; break;
case 512: case 512:
ctx->uHashSize = 512; ctx->uHashSize = 512;
ctx->uBlockLength = 128; ctx->uBlockLength = 128;
ctx->uRounds = 10; ctx->uRounds = 10;
ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x200 ); ctx->hashsize = m512_const2_64( 0, 0x200 );
ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x400); ctx->const1536 = m512_const2_64( 0, 0x400);
break; break;
default: default:
return 1; return 1;
} }
for( i = 0; i < 4; i++ ) for( i = 0; i < 4; i++ )
for( j = 0; j < nHashSize / 256; j++ ) for( j = 0; j < nHashSize / 256; j++ )
ctx->state[ i ][ j ] = ctx->hashsize; ctx->state[ i ][ j ] = ctx->hashsize;
for( i = 0; i < 4; i++ ) for( i = 0; i < 4; i++ )
for( j = nHashSize / 256; j < 4; j++ ) for( j = nHashSize / 256; j < 4; j++ )
ctx->state[ i ][ j ] = m512_zero; ctx->state[ i ][ j ] = m512_zero;
return 0; return 0;
} }
int echo_4way_update_close( echo_4way_context *state, void *hashval, int echo_4way_update_close( echo_4way_context *state, void *hashval,
@@ -285,17 +288,13 @@ int echo_4way_update_close( echo_4way_context *state, void *hashval,
vlen = databitlen / 128; // * 4 lanes / 128 bits per lane vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
memcpy_512( state->buffer, data, vlen ); memcpy_512( state->buffer, data, vlen );
state->processed_bits += (unsigned int)( databitlen ); state->processed_bits += (unsigned int)( databitlen );
remainingbits = _mm512_set4_epi32( 0, 0, 0, databitlen ); remainingbits = m512_const2_64( 0, (uint64_t)databitlen );
} }
state->buffer[ vlen ] = _mm512_set4_epi32( 0, 0, 0, 0x80 ); state->buffer[ vlen ] = m512_const2_64( 0, 0x80 );
memset_zero_512( state->buffer + vlen + 1, vblen - vlen - 2 ); memset_zero_512( state->buffer + vlen + 1, vblen - vlen - 2 );
state->buffer[ vblen-2 ] = state->buffer[ vblen-2 ] = m512_const2_64( (uint64_t)state->uHashSize << 48, 0 );
_mm512_set4_epi32( (uint32_t)state->uHashSize << 16, 0, 0, 0 ); state->buffer[ vblen-1 ] = m512_const2_64( 0, state->processed_bits);
state->buffer[ vblen-1 ] =
_mm512_set4_epi64( 0, state->processed_bits,
0, state->processed_bits );
state->k = _mm512_add_epi64( state->k, remainingbits ); state->k = _mm512_add_epi64( state->k, remainingbits );
state->k = _mm512_sub_epi64( state->k, state->const1536 ); state->k = _mm512_sub_epi64( state->k, state->const1536 );
@@ -328,16 +327,16 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
ctx->uHashSize = 256; ctx->uHashSize = 256;
ctx->uBlockLength = 192; ctx->uBlockLength = 192;
ctx->uRounds = 8; ctx->uRounds = 8;
ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x100 ); ctx->hashsize = m512_const2_64( 0, 0x100 );
ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x600 ); ctx->const1536 = m512_const2_64( 0, 0x600 );
break; break;
case 512: case 512:
ctx->uHashSize = 512; ctx->uHashSize = 512;
ctx->uBlockLength = 128; ctx->uBlockLength = 128;
ctx->uRounds = 10; ctx->uRounds = 10;
ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x200 ); ctx->hashsize = m512_const2_64( 0, 0x200 );
ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x400); ctx->const1536 = m512_const2_64( 0, 0x400 );
break; break;
default: default:
@@ -372,17 +371,14 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
vlen = databitlen / 128; // * 4 lanes / 128 bits per lane vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
memcpy_512( ctx->buffer, data, vlen ); memcpy_512( ctx->buffer, data, vlen );
ctx->processed_bits += (unsigned int)( databitlen ); ctx->processed_bits += (unsigned int)( databitlen );
remainingbits = _mm512_set4_epi32( 0, 0, 0, databitlen ); remainingbits = m512_const2_64( 0, databitlen );
} }
ctx->buffer[ vlen ] = _mm512_set4_epi32( 0, 0, 0, 0x80 ); ctx->buffer[ vlen ] = m512_const2_64( 0, 0x80 );
memset_zero_512( ctx->buffer + vlen + 1, vblen - vlen - 2 ); memset_zero_512( ctx->buffer + vlen + 1, vblen - vlen - 2 );
ctx->buffer[ vblen-2 ] = ctx->buffer[ vblen-2 ] =
_mm512_set4_epi32( (uint32_t)ctx->uHashSize << 16, 0, 0, 0 ); m512_const2_64( (uint64_t)ctx->uHashSize << 48, 0 );
ctx->buffer[ vblen-1 ] = ctx->buffer[ vblen-1 ] = m512_const2_64( 0, ctx->processed_bits);
_mm512_set4_epi64( 0, ctx->processed_bits,
0, ctx->processed_bits );
ctx->k = _mm512_add_epi64( ctx->k, remainingbits ); ctx->k = _mm512_add_epi64( ctx->k, remainingbits );
ctx->k = _mm512_sub_epi64( ctx->k, ctx->const1536 ); ctx->k = _mm512_sub_epi64( ctx->k, ctx->const1536 );
@@ -400,5 +396,380 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
return 0; return 0;
} }
#endif // AVX512
#endif // AVX2 + VAES
#define mul2mask_2way m256_const2_64( 0, 0x0000000000001b00 )
#define lsbmask_2way m256_const1_32( 0x01010101 )
#define ECHO_SUBBYTES_2WAY( state, i, j ) \
state[i][j] = _mm256_aesenc_epi128( state[i][j], k1 ); \
state[i][j] = _mm256_aesenc_epi128( state[i][j], m256_zero ); \
k1 = _mm256_add_epi32( k1, m256_one_128 );
#define ECHO_MIXBYTES_2WAY( state1, state2, j, t1, t2, s2 ) do \
{ \
const int j1 = ( (j)+1 ) & 3; \
const int j2 = ( (j)+2 ) & 3; \
const int j3 = ( (j)+3 ) & 3; \
s2 = _mm256_add_epi8( state1[ 0 ] [j ], state1[ 0 ][ j ] ); \
t1 = _mm256_srli_epi16( state1[ 0 ][ j ], 7 ); \
t1 = _mm256_and_si256( t1, lsbmask_2way );\
t2 = _mm256_shuffle_epi8( mul2mask_2way, t1 ); \
s2 = _mm256_xor_si256( s2, t2 ); \
state2[ 0 ] [j ] = s2; \
state2[ 1 ] [j ] = state1[ 0 ][ j ]; \
state2[ 2 ] [j ] = state1[ 0 ][ j ]; \
state2[ 3 ] [j ] = _mm256_xor_si256( s2, state1[ 0 ][ j ] );\
s2 = _mm256_add_epi8( state1[ 1 ][ j1 ], state1[ 1 ][ j1 ] ); \
t1 = _mm256_srli_epi16( state1[ 1 ][ j1 ], 7 ); \
t1 = _mm256_and_si256( t1, lsbmask_2way ); \
t2 = _mm256_shuffle_epi8( mul2mask_2way, t1 ); \
s2 = _mm256_xor_si256( s2, t2 );\
state2[ 0 ][ j ] = _mm256_xor_si256( state2[ 0 ][ j ], \
_mm256_xor_si256( s2, state1[ 1 ][ j1 ] ) ); \
state2[ 1 ][ j ] = _mm256_xor_si256( state2[ 1 ][ j ], s2 ); \
state2[ 2 ][ j ] = _mm256_xor_si256( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \
state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \
s2 = _mm256_add_epi8( state1[ 2 ][ j2 ], state1[ 2 ][ j2 ] ); \
t1 = _mm256_srli_epi16( state1[ 2 ][ j2 ], 7 ); \
t1 = _mm256_and_si256( t1, lsbmask_2way ); \
t2 = _mm256_shuffle_epi8( mul2mask_2way, t1 ); \
s2 = _mm256_xor_si256( s2, t2 ); \
state2[ 0 ][ j ] = _mm256_xor_si256( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \
state2[ 1 ][ j ] = _mm256_xor_si256( state2[ 1 ][ j ], \
_mm256_xor_si256( s2, state1[ 2 ][ j2 ] ) ); \
state2[ 2 ][ j ] = _mm256_xor_si256( state2[ 2 ][ j ], s2 ); \
state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \
s2 = _mm256_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \
t1 = _mm256_srli_epi16( state1[ 3 ][ j3 ], 7 ); \
t1 = _mm256_and_si256( t1, lsbmask_2way ); \
t2 = _mm256_shuffle_epi8( mul2mask_2way, t1 ); \
s2 = _mm256_xor_si256( s2, t2 ); \
state2[ 0 ][ j ] = _mm256_xor_si256( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \
state2[ 1 ][ j ] = _mm256_xor_si256( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \
state2[ 2 ][ j ] = _mm256_xor_si256( state2[ 2 ][ j ], \
_mm256_xor_si256( s2, state1[ 3 ][ j3] ) ); \
state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], s2 ); \
} while(0)
#define ECHO_ROUND_UNROLL2_2WAY \
ECHO_SUBBYTES_2WAY(_state, 0, 0);\
ECHO_SUBBYTES_2WAY(_state, 1, 0);\
ECHO_SUBBYTES_2WAY(_state, 2, 0);\
ECHO_SUBBYTES_2WAY(_state, 3, 0);\
ECHO_SUBBYTES_2WAY(_state, 0, 1);\
ECHO_SUBBYTES_2WAY(_state, 1, 1);\
ECHO_SUBBYTES_2WAY(_state, 2, 1);\
ECHO_SUBBYTES_2WAY(_state, 3, 1);\
ECHO_SUBBYTES_2WAY(_state, 0, 2);\
ECHO_SUBBYTES_2WAY(_state, 1, 2);\
ECHO_SUBBYTES_2WAY(_state, 2, 2);\
ECHO_SUBBYTES_2WAY(_state, 3, 2);\
ECHO_SUBBYTES_2WAY(_state, 0, 3);\
ECHO_SUBBYTES_2WAY(_state, 1, 3);\
ECHO_SUBBYTES_2WAY(_state, 2, 3);\
ECHO_SUBBYTES_2WAY(_state, 3, 3);\
ECHO_MIXBYTES_2WAY(_state, _state2, 0, t1, t2, s2);\
ECHO_MIXBYTES_2WAY(_state, _state2, 1, t1, t2, s2);\
ECHO_MIXBYTES_2WAY(_state, _state2, 2, t1, t2, s2);\
ECHO_MIXBYTES_2WAY(_state, _state2, 3, t1, t2, s2);\
ECHO_SUBBYTES_2WAY(_state2, 0, 0);\
ECHO_SUBBYTES_2WAY(_state2, 1, 0);\
ECHO_SUBBYTES_2WAY(_state2, 2, 0);\
ECHO_SUBBYTES_2WAY(_state2, 3, 0);\
ECHO_SUBBYTES_2WAY(_state2, 0, 1);\
ECHO_SUBBYTES_2WAY(_state2, 1, 1);\
ECHO_SUBBYTES_2WAY(_state2, 2, 1);\
ECHO_SUBBYTES_2WAY(_state2, 3, 1);\
ECHO_SUBBYTES_2WAY(_state2, 0, 2);\
ECHO_SUBBYTES_2WAY(_state2, 1, 2);\
ECHO_SUBBYTES_2WAY(_state2, 2, 2);\
ECHO_SUBBYTES_2WAY(_state2, 3, 2);\
ECHO_SUBBYTES_2WAY(_state2, 0, 3);\
ECHO_SUBBYTES_2WAY(_state2, 1, 3);\
ECHO_SUBBYTES_2WAY(_state2, 2, 3);\
ECHO_SUBBYTES_2WAY(_state2, 3, 3);\
ECHO_MIXBYTES_2WAY(_state2, _state, 0, t1, t2, s2);\
ECHO_MIXBYTES_2WAY(_state2, _state, 1, t1, t2, s2);\
ECHO_MIXBYTES_2WAY(_state2, _state, 2, t1, t2, s2);\
ECHO_MIXBYTES_2WAY(_state2, _state, 3, t1, t2, s2)
#define SAVESTATE_2WAY(dst, src)\
dst[0][0] = src[0][0];\
dst[0][1] = src[0][1];\
dst[0][2] = src[0][2];\
dst[0][3] = src[0][3];\
dst[1][0] = src[1][0];\
dst[1][1] = src[1][1];\
dst[1][2] = src[1][2];\
dst[1][3] = src[1][3];\
dst[2][0] = src[2][0];\
dst[2][1] = src[2][1];\
dst[2][2] = src[2][2];\
dst[2][3] = src[2][3];\
dst[3][0] = src[3][0];\
dst[3][1] = src[3][1];\
dst[3][2] = src[3][2];\
dst[3][3] = src[3][3]
// blockcount always 1
void echo_2way_compress( echo_2way_context *ctx, const __m256i *pmsg,
unsigned int uBlockCount )
{
unsigned int r, b, i, j;
__m256i t1, t2, s2, k1;
__m256i _state[4][4], _state2[4][4], _statebackup[4][4];
_state[ 0 ][ 0 ] = ctx->state[ 0 ][ 0 ];
_state[ 0 ][ 1 ] = ctx->state[ 0 ][ 1 ];
_state[ 0 ][ 2 ] = ctx->state[ 0 ][ 2 ];
_state[ 0 ][ 3 ] = ctx->state[ 0 ][ 3 ];
_state[ 1 ][ 0 ] = ctx->state[ 1 ][ 0 ];
_state[ 1 ][ 1 ] = ctx->state[ 1 ][ 1 ];
_state[ 1 ][ 2 ] = ctx->state[ 1 ][ 2 ];
_state[ 1 ][ 3 ] = ctx->state[ 1 ][ 3 ];
_state[ 2 ][ 0 ] = ctx->state[ 2 ][ 0 ];
_state[ 2 ][ 1 ] = ctx->state[ 2 ][ 1 ];
_state[ 2 ][ 2 ] = ctx->state[ 2 ][ 2 ];
_state[ 2 ][ 3 ] = ctx->state[ 2 ][ 3 ];
_state[ 3 ][ 0 ] = ctx->state[ 3 ][ 0 ];
_state[ 3 ][ 1 ] = ctx->state[ 3 ][ 1 ];
_state[ 3 ][ 2 ] = ctx->state[ 3 ][ 2 ];
_state[ 3 ][ 3 ] = ctx->state[ 3 ][ 3 ];
for ( b = 0; b < uBlockCount; b++ )
{
ctx->k = _mm256_add_epi64( ctx->k, ctx->const1536 );
for( j = ctx->uHashSize / 256; j < 4; j++ )
{
for ( i = 0; i < 4; i++ )
{
_state[ i ][ j ] = _mm256_load_si256(
pmsg + 4 * (j - (ctx->uHashSize / 256)) + i );
}
}
// save state
SAVESTATE_2WAY( _statebackup, _state );
k1 = ctx->k;
for ( r = 0; r < ctx->uRounds / 2; r++ )
{
ECHO_ROUND_UNROLL2_2WAY;
}
if ( ctx->uHashSize == 256 )
{
for ( i = 0; i < 4; i++ )
{
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ],
_state[ i ][ 1 ] );
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ],
_state[ i ][ 2 ] );
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ],
_state[ i ][ 3 ] );
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ],
_statebackup[ i ][ 0 ] );
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ],
_statebackup[ i ][ 1 ] );
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ],
_statebackup[ i ][ 2 ] ) ;
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ],
_statebackup[ i ][ 3 ] );
}
}
else
{
for ( i = 0; i < 4; i++ )
{
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ],
_state[ i ][ 2 ] );
_state[ i ][ 1 ] = _mm256_xor_si256( _state[ i ][ 1 ],
_state[ i ][ 3 ] );
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ],
_statebackup[ i ][ 0 ] );
_state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ] [0 ],
_statebackup[ i ][ 2 ] );
_state[ i ][ 1 ] = _mm256_xor_si256( _state[ i ][ 1 ],
_statebackup[ i ][ 1 ] );
_state[ i ][ 1 ] = _mm256_xor_si256( _state[ i ][ 1 ],
_statebackup[ i ][ 3 ] );
}
}
pmsg += ctx->uBlockLength;
}
SAVESTATE_2WAY(ctx->state, _state);
}
int echo_2way_init( echo_2way_context *ctx, int nHashSize )
{
int i, j;
ctx->k = m256_zero;
ctx->processed_bits = 0;
ctx->uBufferBytes = 0;
switch( nHashSize )
{
case 256:
ctx->uHashSize = 256;
ctx->uBlockLength = 192;
ctx->uRounds = 8;
ctx->hashsize = m256_const2_64( 0, 0x100 );
ctx->const1536 = m256_const2_64( 0, 0x600 );
break;
case 512:
ctx->uHashSize = 512;
ctx->uBlockLength = 128;
ctx->uRounds = 10;
ctx->hashsize = m256_const2_64( 0, 0x200 );
ctx->const1536 = m256_const2_64( 0, 0x400 );
break;
default:
return 1;
}
for( i = 0; i < 4; i++ )
for( j = 0; j < nHashSize / 256; j++ )
ctx->state[ i ][ j ] = ctx->hashsize;
for( i = 0; i < 4; i++ )
for( j = nHashSize / 256; j < 4; j++ )
ctx->state[ i ][ j ] = m256_zero;
return 0;
}
int echo_2way_update_close( echo_2way_context *state, void *hashval,
const void *data, int databitlen )
{
// bytelen is either 32 (maybe), 64 or 80 or 128!
// all are less than full block.
int vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
const int vblen = state->uBlockLength / 16; // 16 bytes per lane
__m256i remainingbits;
if ( databitlen == 1024 )
{
echo_2way_compress( state, data, 1 );
state->processed_bits = 1024;
remainingbits = m256_const2_64( 0, -1024 );
vlen = 0;
}
else
{
memcpy_256( state->buffer, data, vlen );
state->processed_bits += (unsigned int)( databitlen );
remainingbits = m256_const2_64( 0, databitlen );
}
state->buffer[ vlen ] = m256_const2_64( 0, 0x80 );
memset_zero_256( state->buffer + vlen + 1, vblen - vlen - 2 );
state->buffer[ vblen-2 ] = m256_const2_64( (uint64_t)state->uHashSize << 48, 0 );
state->buffer[ vblen-1 ] = m256_const2_64( 0, state->processed_bits );
state->k = _mm256_add_epi64( state->k, remainingbits );
state->k = _mm256_sub_epi64( state->k, state->const1536 );
echo_2way_compress( state, state->buffer, 1 );
_mm256_store_si256( (__m256i*)hashval + 0, state->state[ 0 ][ 0] );
_mm256_store_si256( (__m256i*)hashval + 1, state->state[ 1 ][ 0] );
if ( state->uHashSize == 512 )
{
_mm256_store_si256( (__m256i*)hashval + 2, state->state[ 2 ][ 0 ] );
_mm256_store_si256( (__m256i*)hashval + 3, state->state[ 3 ][ 0 ] );
}
return 0;
}
int echo_2way_full( echo_2way_context *ctx, void *hashval, int nHashSize,
const void *data, int datalen )
{
int i, j;
int databitlen = datalen * 8;
ctx->k = m256_zero;
ctx->processed_bits = 0;
ctx->uBufferBytes = 0;
switch( nHashSize )
{
case 256:
ctx->uHashSize = 256;
ctx->uBlockLength = 192;
ctx->uRounds = 8;
ctx->hashsize = m256_const2_64( 0, 0x100 );
ctx->const1536 = m256_const2_64( 0, 0x600 );
break;
case 512:
ctx->uHashSize = 512;
ctx->uBlockLength = 128;
ctx->uRounds = 10;
ctx->hashsize = m256_const2_64( 0, 0x200 );
ctx->const1536 = m256_const2_64( 0, 0x400 );
break;
default:
return 1;
}
for( i = 0; i < 4; i++ )
for( j = 0; j < nHashSize / 256; j++ )
ctx->state[ i ][ j ] = ctx->hashsize;
for( i = 0; i < 4; i++ )
for( j = nHashSize / 256; j < 4; j++ )
ctx->state[ i ][ j ] = m256_zero;
int vlen = datalen / 32;
const int vblen = ctx->uBlockLength / 16; // 16 bytes per lane
__m256i remainingbits;
if ( databitlen == 1024 )
{
echo_2way_compress( ctx, data, 1 );
ctx->processed_bits = 1024;
remainingbits = m256_const2_64( 0, -1024 );
vlen = 0;
}
else
{
vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
memcpy_256( ctx->buffer, data, vlen );
ctx->processed_bits += (unsigned int)( databitlen );
remainingbits = m256_const2_64( 0, databitlen );
}
ctx->buffer[ vlen ] = m256_const2_64( 0, 0x80 );
memset_zero_256( ctx->buffer + vlen + 1, vblen - vlen - 2 );
ctx->buffer[ vblen-2 ] = m256_const2_64( (uint64_t)ctx->uHashSize << 48, 0 );
ctx->buffer[ vblen-1 ] = m256_const2_64( 0, ctx->processed_bits );
ctx->k = _mm256_add_epi64( ctx->k, remainingbits );
ctx->k = _mm256_sub_epi64( ctx->k, ctx->const1536 );
echo_2way_compress( ctx, ctx->buffer, 1 );
_mm256_store_si256( (__m256i*)hashval + 0, ctx->state[ 0 ][ 0] );
_mm256_store_si256( (__m256i*)hashval + 1, ctx->state[ 1 ][ 0] );
if ( ctx->uHashSize == 512 )
{
_mm256_store_si256( (__m256i*)hashval + 2, ctx->state[ 2 ][ 0 ] );
_mm256_store_si256( (__m256i*)hashval + 3, ctx->state[ 3 ][ 0 ] );
}
return 0;
}
#endif // VAES

View File

@@ -1,10 +1,12 @@
#if !defined(ECHO_HASH_4WAY_H__) #if !defined(ECHO_HASH_4WAY_H__)
#define ECHO_HASH_4WAY_H__ 1 #define ECHO_HASH_4WAY_H__ 1
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #if defined(__VAES__)
#include "simd-utils.h" #include "simd-utils.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
typedef struct typedef struct
{ {
__m512i state[4][4]; __m512i state[4][4];
@@ -20,6 +22,7 @@ typedef struct
unsigned int processed_bits; unsigned int processed_bits;
} echo_4way_context __attribute__ ((aligned (64))); } echo_4way_context __attribute__ ((aligned (64)));
#define echo512_4way_context echo_4way_context
int echo_4way_init( echo_4way_context *state, int hashbitlen ); int echo_4way_init( echo_4way_context *state, int hashbitlen );
#define echo512_4way_init( state ) echo_4way_init( state, 512 ) #define echo512_4way_init( state ) echo_4way_init( state, 512 )
@@ -29,8 +32,8 @@ int echo_4way_update( echo_4way_context *state, const void *data,
unsigned int databitlen); unsigned int databitlen);
#define echo512_4way_update echo_4way_update #define echo512_4way_update echo_4way_update
int echo_close( echo_4way_context *state, void *hashval ); // int echo_4way_close( echo_4way_context *state, void *hashval );
#define echo512_4way_close echo_4way_close // #define echo512_4way_close echo_4way_close
int echo_4way_update_close( echo_4way_context *state, void *hashval, int echo_4way_update_close( echo_4way_context *state, void *hashval,
const void *data, int databitlen ); const void *data, int databitlen );
@@ -43,5 +46,45 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
#define echo256_4way_full( state, hashval, data, datalen ) \ #define echo256_4way_full( state, hashval, data, datalen ) \
echo_4way_full( state, hashval, 256, data, datalen ) echo_4way_full( state, hashval, 256, data, datalen )
#endif #endif // AVX512
#endif
typedef struct
{
__m256i state[4][4];
__m256i buffer[ 4 * 192 / 16 ]; // 4x128 interleaved 192 bytes
__m256i k;
__m256i hashsize;
__m256i const1536;
unsigned int uRounds;
unsigned int uHashSize;
unsigned int uBlockLength;
unsigned int uBufferBytes;
unsigned int processed_bits;
} echo_2way_context __attribute__ ((aligned (64)));
// Public 2-way Echo API. The echo512_* / echo256_* names are thin aliases
// that fix the digest size; all work is done by the echo_2way_* functions.
#define echo512_2way_context echo_2way_context
// Initialize for a digest of hashbitlen bits (256 or 512). Returns 0 on success.
int echo_2way_init( echo_2way_context *state, int hashbitlen );
#define echo512_2way_init( state ) echo_2way_init( state, 512 )
#define echo256_2way_init( state ) echo_2way_init( state, 256 )
// Absorb databitlen bits of (2-way interleaved) message data.
int echo_2way_update( echo_2way_context *state, const void *data,
unsigned int databitlen);
#define echo512_2way_update echo_2way_update
// Absorb a final chunk, pad, and write the digest to hashval.
int echo_2way_update_close( echo_2way_context *state, void *hashval,
const void *data, int databitlen );
#define echo512_2way_update_close echo_2way_update_close
// One-shot init+update+close. nHashSize is the digest size in bits.
int echo_2way_full( echo_2way_context *ctx, void *hashval, int nHashSize,
const void *data, int datalen );
#define echo512_2way_full( state, hashval, data, datalen ) \
echo_2way_full( state, hashval, 512, data, datalen )
#define echo256_2way_full( state, hashval, data, datalen ) \
echo_2way_full( state, hashval, 256, data, datalen )
#endif // VAES
#endif // ECHO_HASH_4WAY_H__

View File

@@ -15,7 +15,9 @@
#include "miner.h" #include "miner.h"
#include "simd-utils.h" #include "simd-utils.h"
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #if defined(__AVX2__) && defined(__VAES__)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen ) int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
@@ -43,10 +45,10 @@ int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
} }
int groestl256_4way_full( groestl256_4way_context* ctx, void* output, int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
const void* input, uint64_t databitlen ) const void* input, uint64_t datalen )
{ {
const int len = (int)databitlen / 128; const int len = (int)datalen >> 4;
const int hashlen_m128i = 32 / 16; // bytes to __m128i const int hashlen_m128i = 32 >> 4; // bytes to __m128i
const int hash_offset = SIZE256 - hashlen_m128i; const int hash_offset = SIZE256 - hashlen_m128i;
int rem = ctx->rem_ptr; int rem = ctx->rem_ptr;
int blocks = len / SIZE256; int blocks = len / SIZE256;
@@ -172,5 +174,161 @@ int groestl256_4way_update_close( groestl256_4way_context* ctx, void* output,
return 0; return 0;
} }
#endif // VAES #endif // AVX512
// AVX2 + VAES
// Initialise a 2-way (AVX2/VAES) Groestl-256 context: zero the chaining
// state and block buffer, install the IV, and reset the buffer offsets.
// hashlen is the digest length in bytes (stored for update_close).
// Returns 0 on success, 1 if ctx is NULL.
int groestl256_2way_init( groestl256_2way_context* ctx, uint64_t hashlen )
{
   int i;

   // Guard the pointer before any dereference. The original tested
   // ctx->chaining / ctx->buffer against NULL, but those are arrays
   // embedded in the struct, so that test could never be true.
   if ( ctx == NULL )
      return 1;

   ctx->hashlen = hashlen;

   for ( i = 0; i < SIZE256; i++ )
   {
      ctx->chaining[i] = m256_zero;
      ctx->buffer[i] = m256_zero;
   }

   // The only non-zero in the IV is len. It can be hard coded.
   ctx->chaining[ 3 ] = m256_const2_64( 0, 0x0100000000000000 );

   ctx->buf_ptr = 0;
   ctx->rem_ptr = 0;

   return 0;
}
// One-shot 2-way Groestl-256: re-initialises the context, absorbs datalen
// bytes of 2-way interleaved input, applies Groestl padding, and writes
// the (truncated) 32-byte digests to output.
// datalen is in BYTES here (cf. update_close, which takes bits).
int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
const void* input, uint64_t datalen )
{
const int len = (int)datalen >> 4;   // length in __m128i units (16 bytes each)
const int hashlen_m128i = 32 >> 4; // bytes to __m128i
const int hash_offset = SIZE256 - hashlen_m128i;
int rem = ctx->rem_ptr;
int blocks = len / SIZE256;
__m256i* in = (__m256i*)input;
int i;
// NOTE(review): chaining/buffer are embedded arrays, so this comparison
// is always false; kept byte-identical here.
if (ctx->chaining == NULL || ctx->buffer == NULL)
return 1;
// --- init: zero state and set IV (the IV's only non-zero word is len) ---
for ( i = 0; i < SIZE256; i++ )
{
ctx->chaining[i] = m256_zero;
ctx->buffer[i] = m256_zero;
}
// The only non-zero in the IV is len. It can be hard coded.
ctx->chaining[ 3 ] = m256_const2_64( 0, 0x0100000000000000 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
// --- update ---
// digest any full blocks, process directly from input
for ( i = 0; i < blocks; i++ )
TF512_2way( ctx->chaining, &in[ i * SIZE256 ] );
ctx->buf_ptr = blocks * SIZE256;
// copy any remaining data to buffer, it may already contain data
// from a previous update for a midstate precalc
for ( i = 0; i < len % SIZE256; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
//--- final ---
blocks++; // adjust for final block
if ( i == SIZE256 - 1 )
{
// only 1 vector left in buffer, all padding at once
// NOTE(review): blocks << 56 encodes only the low 8 bits of the block
// count; presumably fine because full() is used on short inputs — verify.
ctx->buffer[i] = m256_const2_64( (uint64_t)blocks << 56, 0x80 );
}
else
{
// add first padding
ctx->buffer[i] = m256_const2_64( 0, 0x80 );
// add zero padding
for ( i += 1; i < SIZE256 - 1; i++ )
ctx->buffer[i] = m256_zero;
// add length padding, second last byte is zero unless blocks > 255
ctx->buffer[i] = m256_const2_64( (uint64_t)blocks << 56, 0 );
}
// digest final padding block and do output transform
TF512_2way( ctx->chaining, ctx->buffer );
OF512_2way( ctx->chaining );
// store hash result in output
for ( i = 0; i < hashlen_m128i; i++ )
casti_m256i( output, i ) = ctx->chaining[ hash_offset + i ];
return 0;
}
// Absorb a final chunk of input (length given in BITS), apply Groestl
// padding, and write ctx->hashlen bytes of digest per lane to output.
// Assumes init (and any midstate updates) already happened.
int groestl256_2way_update_close( groestl256_2way_context* ctx, void* output,
const void* input, uint64_t databitlen )
{
const int len = (int)databitlen / 128;   // length in __m128i units
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE256 - hashlen_m128i;
int rem = ctx->rem_ptr;
int blocks = len / SIZE256;
__m256i* in = (__m256i*)input;
int i;
// --- update ---
// digest any full blocks, process directly from input
for ( i = 0; i < blocks; i++ )
TF512_2way( ctx->chaining, &in[ i * SIZE256 ] );
ctx->buf_ptr = blocks * SIZE256;
// copy any remaining data to buffer, it may already contain data
// from a previous update for a midstate precalc
for ( i = 0; i < len % SIZE256; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
//--- final ---
blocks++; // adjust for final block
if ( i == SIZE256 - 1 )
{
// only 1 vector left in buffer, all padding at once
// (0x80 terminator in byte 0, 16-bit block count in the top bytes)
ctx->buffer[i] = m256_const1_128( _mm_set_epi8(
blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
}
else
{
// add first padding
ctx->buffer[i] = m256_const2_64( 0, 0x80 );
// add zero padding
for ( i += 1; i < SIZE256 - 1; i++ )
ctx->buffer[i] = m256_zero;
// add length padding, second last byte is zero unless blocks > 255
ctx->buffer[i] = m256_const1_128( _mm_set_epi8(
blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) );
}
// digest final padding block and do output transform
TF512_2way( ctx->chaining, ctx->buffer );
OF512_2way( ctx->chaining );
// store hash result in output
for ( i = 0; i < hashlen_m128i; i++ )
casti_m256i( output, i ) = ctx->chaining[ hash_offset + i ];
return 0;
}
#endif // VAES

View File

@@ -18,8 +18,8 @@
#endif #endif
#include <stdlib.h> #include <stdlib.h>
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #if defined(__AVX2__) && defined(__VAES__)
#define LENGTH (256) #define LENGTH (256)
//#include "brg_endian.h" //#include "brg_endian.h"
@@ -48,6 +48,8 @@
#define SIZE256 (SIZE_512/16) #define SIZE256 (SIZE_512/16)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
typedef struct { typedef struct {
__attribute__ ((aligned (128))) __m512i chaining[SIZE256]; __attribute__ ((aligned (128))) __m512i chaining[SIZE256];
__attribute__ ((aligned (64))) __m512i buffer[SIZE256]; __attribute__ ((aligned (64))) __m512i buffer[SIZE256];
@@ -55,7 +57,7 @@ typedef struct {
int blk_count; // SIZE_m128i int blk_count; // SIZE_m128i
int buf_ptr; // __m128i offset int buf_ptr; // __m128i offset
int rem_ptr; int rem_ptr;
int databitlen; // bits // int databitlen; // bits
} groestl256_4way_context; } groestl256_4way_context;
@@ -74,5 +76,25 @@ int groestl256_4way_update_close( groestl256_4way_context*, void*,
int groestl256_4way_full( groestl256_4way_context*, void*, int groestl256_4way_full( groestl256_4way_context*, void*,
const void*, uint64_t ); const void*, uint64_t );
#endif #endif // AVX512
#endif
// 2-way (AVX2/VAES) Groestl-256 context: two independent 512-bit states,
// one per 128-bit lane of each __m256i. Alignment supports aligned
// vector access to both arrays.
typedef struct {
__attribute__ ((aligned (128))) __m256i chaining[SIZE256]; // chaining value CV
__attribute__ ((aligned (64))) __m256i buffer[SIZE256];    // partial-block buffer
int hashlen; // byte
int blk_count; // SIZE_m128i
int buf_ptr; // __m128i offset
int rem_ptr;
// int databitlen; // bits
} groestl256_2way_context;
// hashlen is the digest length in bytes.
int groestl256_2way_init( groestl256_2way_context*, uint64_t );
// Final length argument is in BITS.
int groestl256_2way_update_close( groestl256_2way_context*, void*,
const void*, uint64_t );
// Final length argument is in BYTES; re-initialises the context.
int groestl256_2way_full( groestl256_2way_context*, void*,
const void*, uint64_t );
#endif // VAES
#endif // GROESTL256_HASH_4WAY_H__

View File

@@ -12,7 +12,7 @@
#include "groestl256-hash-4way.h" #include "groestl256-hash-4way.h"
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #if defined(__AVX2__) && defined(__VAES__)
static const __m128i round_const_l0[] __attribute__ ((aligned (64))) = static const __m128i round_const_l0[] __attribute__ ((aligned (64))) =
{ {
@@ -42,6 +42,8 @@ static const __m128i round_const_l7[] __attribute__ ((aligned (64))) =
{ 0x0000000000000000, 0x8696a6b6c6d6e6f6 } { 0x0000000000000000, 0x8696a6b6c6d6e6f6 }
}; };
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02, static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
0x1d1519111c141810, 0x1f171b131e161a12, 0x1d1519111c141810, 0x1f171b131e161a12,
0x2d2529212c242820, 0x2f272b232e262a22, 0x2d2529212c242820, 0x2f272b232e262a22,
@@ -499,5 +501,398 @@ void OF512_4way( __m512i* chaining )
chaining[3] = xmm11; chaining[3] = xmm11;
} }
#endif // AVX512
// Byte-shuffle masks for the 2-way (AVX2) Groestl-256 transpose and
// ShiftBytes steps. The low 128-bit lane processes stream 0 and the high
// lane processes stream 1, so every high-lane byte index equals the
// corresponding low-lane index + 0x10.
static const __m256i TRANSP_MASK_2WAY =
{ 0x0d0509010c040800, 0x0f070b030e060a02,
0x1d1519111c141810, 0x1f171b131e161a12 };
static const __m256i SUBSH_MASK0_2WAY =
{ 0x0c0f0104070b0e00, 0x03060a0d08020509,
0x1c1f1114171b1e10, 0x13161a1d18121519 };
static const __m256i SUBSH_MASK1_2WAY =
{ 0x0e090205000d0801, 0x04070c0f0a03060b,
// FIX: was 0x1e191215101d1801 — the high lane must mirror the low lane
// with every byte offset by +0x10, so the last byte is 0x11, not 0x01.
0x1e191215101d1811, 0x14171c1f1a13161b };
static const __m256i SUBSH_MASK2_2WAY =
{ 0x080b0306010f0a02, 0x05000e090c04070d,
0x181b1316111f1a12, 0x15101e191c14171d };
static const __m256i SUBSH_MASK3_2WAY =
{ 0x0a0d040702090c03, 0x0601080b0e05000f,
0x1a1d141712191c13, 0x1611181b1e15101f };
static const __m256i SUBSH_MASK4_2WAY =
{ 0x0b0e0500030a0d04, 0x0702090c0f060108,
0x1b1e1510131a1d14, 0x1712191c1f161118 };
static const __m256i SUBSH_MASK5_2WAY =
{ 0x0d080601040c0f05, 0x00030b0e0907020a,
0x1d181611141c1f15, 0x10131b1e1917121a };
static const __m256i SUBSH_MASK6_2WAY =
{ 0x0f0a0702050e0906, 0x01040d080b00030c,
0x1f1a1712151e1916, 0x11141d181b10131c };
static const __m256i SUBSH_MASK7_2WAY =
{ 0x090c000306080b07, 0x02050f0a0d01040e,
0x191c101316181b17, 0x12151f1a1d11141e, };
// Stringification helpers (retained from the original asm-derived code).
#define tos(a) #a
#define tostr(a) tos(a)
// GF(2^8) doubling of every byte in i, reduced modulo the AES polynomial:
// a compare-against-zero builds a mask of bytes with the top bit set, the
// add doubles, and the masked 0x1b is xor-ed in as the reduction.
/* xmm[i] will be multiplied by 2
* xmm[j] will be lost
* xmm[k] has to be all 0x1b */
#define MUL2_2WAY(i, j, k){\
j = _mm256_xor_si256(j, j);\
j = _mm256_cmpgt_epi8(j, i );\
i = _mm256_add_epi8(i, i);\
j = _mm256_and_si256(j, k);\
i = _mm256_xor_si256(i, j);\
}
// Groestl MixBytes step for the 2-way state: multiplies each state column
// by the circulant MDS matrix over GF(2^8), computed with xors and the
// MUL2_2WAY doubling only. Register choreography follows the original
// Groestl AES-NI reference implementation; statement order is
// significant, do not reorder. Uses TEMP0/TEMP1/TEMP2 from the caller.
#define MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
b6 = a0;\
b7 = a1;\
a0 = _mm256_xor_si256(a0, a1);\
b0 = a2;\
a1 = _mm256_xor_si256(a1, a2);\
b1 = a3;\
a2 = _mm256_xor_si256(a2, a3);\
b2 = a4;\
a3 = _mm256_xor_si256(a3, a4);\
b3 = a5;\
a4 = _mm256_xor_si256(a4, a5);\
b4 = a6;\
a5 = _mm256_xor_si256(a5, a6);\
b5 = a7;\
a6 = _mm256_xor_si256(a6, a7);\
a7 = _mm256_xor_si256(a7, b6);\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
b0 = _mm256_xor_si256(b0, a4);\
b6 = _mm256_xor_si256(b6, a4);\
b1 = _mm256_xor_si256(b1, a5);\
b7 = _mm256_xor_si256(b7, a5);\
b2 = _mm256_xor_si256(b2, a6);\
b0 = _mm256_xor_si256(b0, a6);\
/* spill values y_4, y_5 to memory */\
TEMP0 = b0;\
b3 = _mm256_xor_si256(b3, a7);\
b1 = _mm256_xor_si256(b1, a7);\
TEMP1 = b1;\
b4 = _mm256_xor_si256(b4, a0);\
b2 = _mm256_xor_si256(b2, a0);\
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0;\
b5 = _mm256_xor_si256(b5, a1);\
b3 = _mm256_xor_si256(b3, a1);\
b1 = a1;\
b6 = _mm256_xor_si256(b6, a2);\
b4 = _mm256_xor_si256(b4, a2);\
TEMP2 = a2;\
b7 = _mm256_xor_si256(b7, a3);\
b5 = _mm256_xor_si256(b5, a3);\
\
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm256_xor_si256(a0, a3);\
a1 = _mm256_xor_si256(a1, a4);\
a2 = _mm256_xor_si256(a2, a5);\
a3 = _mm256_xor_si256(a3, a6);\
a4 = _mm256_xor_si256(a4, a7);\
a5 = _mm256_xor_si256(a5, b0);\
a6 = _mm256_xor_si256(a6, b1);\
a7 = _mm256_xor_si256(a7, TEMP2);\
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m256_const1_64( 0x1b1b1b1b1b1b1b1b );\
MUL2_2WAY(a0, b0, b1);\
a0 = _mm256_xor_si256(a0, TEMP0);\
MUL2_2WAY(a1, b0, b1);\
a1 = _mm256_xor_si256(a1, TEMP1);\
MUL2_2WAY(a2, b0, b1);\
a2 = _mm256_xor_si256(a2, b2);\
MUL2_2WAY(a3, b0, b1);\
a3 = _mm256_xor_si256(a3, b3);\
MUL2_2WAY(a4, b0, b1);\
a4 = _mm256_xor_si256(a4, b4);\
MUL2_2WAY(a5, b0, b1);\
a5 = _mm256_xor_si256(a5, b5);\
MUL2_2WAY(a6, b0, b1);\
a6 = _mm256_xor_si256(a6, b6);\
MUL2_2WAY(a7, b0, b1);\
a7 = _mm256_xor_si256(a7, b7);\
\
/* compute v_i : double w_i */\
/* add to y_4 y_5 .. v3, v4, ... */\
MUL2_2WAY(a0, b0, b1);\
b5 = _mm256_xor_si256(b5, a0);\
MUL2_2WAY(a1, b0, b1);\
b6 = _mm256_xor_si256(b6, a1);\
MUL2_2WAY(a2, b0, b1);\
b7 = _mm256_xor_si256(b7, a2);\
MUL2_2WAY(a5, b0, b1);\
b2 = _mm256_xor_si256(b2, a5);\
MUL2_2WAY(a6, b0, b1);\
b3 = _mm256_xor_si256(b3, a6);\
MUL2_2WAY(a7, b0, b1);\
b4 = _mm256_xor_si256(b4, a7);\
MUL2_2WAY(a3, b0, b1);\
MUL2_2WAY(a4, b0, b1);\
b0 = TEMP0;\
b1 = TEMP1;\
b0 = _mm256_xor_si256(b0, a3);\
b1 = _mm256_xor_si256(b1, a4);\
}/*MixBytes*/
// One round of both P and Q permutations (P in the low 64 bits of each
// row, Q in the high 64 bits): AddRoundConstant (row 0 from
// round_const_l0, row 7 from round_const_l7, rows 1-6 get the all-ones /
// zero split constant), then ShiftBytes via byte shuffle, SubBytes via
// AESENCLAST with a zero round key (AES S-box only), then MixBytes.
#define ROUND_2WAY(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant */\
b1 = m256_const2_64( 0xffffffffffffffff, 0 ); \
a0 = _mm256_xor_si256( a0, m256_const1_128( round_const_l0[i] ) );\
a1 = _mm256_xor_si256( a1, b1 );\
a2 = _mm256_xor_si256( a2, b1 );\
a3 = _mm256_xor_si256( a3, b1 );\
a4 = _mm256_xor_si256( a4, b1 );\
a5 = _mm256_xor_si256( a5, b1 );\
a6 = _mm256_xor_si256( a6, b1 );\
a7 = _mm256_xor_si256( a7, m256_const1_128( round_const_l7[i] ) );\
\
/* ShiftBytes + SubBytes (interleaved) */\
b0 = _mm256_xor_si256( b0, b0 );\
a0 = _mm256_shuffle_epi8( a0, SUBSH_MASK0_2WAY );\
a0 = _mm256_aesenclast_epi128(a0, b0 );\
a1 = _mm256_shuffle_epi8( a1, SUBSH_MASK1_2WAY );\
a1 = _mm256_aesenclast_epi128(a1, b0 );\
a2 = _mm256_shuffle_epi8( a2, SUBSH_MASK2_2WAY );\
a2 = _mm256_aesenclast_epi128(a2, b0 );\
a3 = _mm256_shuffle_epi8( a3, SUBSH_MASK3_2WAY );\
a3 = _mm256_aesenclast_epi128(a3, b0 );\
a4 = _mm256_shuffle_epi8( a4, SUBSH_MASK4_2WAY );\
a4 = _mm256_aesenclast_epi128(a4, b0 );\
a5 = _mm256_shuffle_epi8( a5, SUBSH_MASK5_2WAY );\
a5 = _mm256_aesenclast_epi128(a5, b0 );\
a6 = _mm256_shuffle_epi8( a6, SUBSH_MASK6_2WAY );\
a6 = _mm256_aesenclast_epi128(a6, b0 );\
a7 = _mm256_shuffle_epi8( a7, SUBSH_MASK7_2WAY );\
a7 = _mm256_aesenclast_epi128( a7, b0 );\
\
/* MixBytes */\
MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
\
}
/* 10 rounds, P and Q in parallel */
// Rounds are unrolled in pairs so the state ping-pongs between the
// xmm8-15 and xmm0-7 register groups without extra moves.
#define ROUNDS_P_Q_2WAY(){\
ROUND_2WAY(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND_2WAY(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND_2WAY(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND_2WAY(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND_2WAY(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND_2WAY(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND_2WAY(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND_2WAY(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND_2WAY(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND_2WAY(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}
// Transpose the message from column ordering to row ordering within each
// 128-bit lane: byte shuffle to group 16-bit pairs, then 16-bit and
// 32-bit unpacks. Outputs land in i0, o1, o2, o3; t0 is scratch.
#define Matrix_Transpose_A_2way(i0, i1, i2, i3, o1, o2, o3, t0){\
t0 = TRANSP_MASK_2WAY;\
\
i0 = _mm256_shuffle_epi8( i0, t0 );\
i1 = _mm256_shuffle_epi8( i1, t0 );\
i2 = _mm256_shuffle_epi8( i2, t0 );\
i3 = _mm256_shuffle_epi8( i3, t0 );\
\
o1 = i0;\
t0 = i2;\
\
i0 = _mm256_unpacklo_epi16( i0, i1 );\
o1 = _mm256_unpackhi_epi16( o1, i1 );\
i2 = _mm256_unpacklo_epi16( i2, i3 );\
t0 = _mm256_unpackhi_epi16( t0, i3 );\
\
i0 = _mm256_shuffle_epi32( i0, 216 );\
o1 = _mm256_shuffle_epi32( o1, 216 );\
i2 = _mm256_shuffle_epi32( i2, 216 );\
t0 = _mm256_shuffle_epi32( t0, 216 );\
\
o2 = i0;\
o3 = o1;\
\
i0 = _mm256_unpacklo_epi32( i0, i2 );\
o1 = _mm256_unpacklo_epi32( o1, t0 );\
o2 = _mm256_unpackhi_epi32( o2, i2 );\
o3 = _mm256_unpackhi_epi32( o3, t0 );\
}/**/
// Interleave the P-input rows (i0-i3) with the Q-input rows (i4-i7) so
// each register holds one 64-bit row of P and one of Q per lane.
// Outputs: i0 plus o1-o7.
#define Matrix_Transpose_B_2way(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
o1 = i0;\
o2 = i1;\
i0 = _mm256_unpacklo_epi64( i0, i4 );\
o1 = _mm256_unpackhi_epi64( o1, i4 );\
o3 = i1;\
o4 = i2;\
o2 = _mm256_unpacklo_epi64( o2, i5 );\
o3 = _mm256_unpackhi_epi64( o3, i5 );\
o5 = i2;\
o6 = i3;\
o4 = _mm256_unpacklo_epi64( o4, i6 );\
o5 = _mm256_unpackhi_epi64( o5, i6 );\
o7 = i3;\
o6 = _mm256_unpacklo_epi64( o6, i7 );\
o7 = _mm256_unpackhi_epi64( o7, i7 );\
}/**/
// Inverse of Matrix_Transpose_B_2way: de-interleave the P/Q rows back so
// P rows end up in i0, i2, i4, i6 and Q rows in o0-o3.
#define Matrix_Transpose_B_INV_2way(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
o0 = i0;\
i0 = _mm256_unpacklo_epi64( i0, i1 );\
o0 = _mm256_unpackhi_epi64( o0, i1 );\
o1 = i2;\
i2 = _mm256_unpacklo_epi64( i2, i3 );\
o1 = _mm256_unpackhi_epi64( o1, i3 );\
o2 = i4;\
i4 = _mm256_unpacklo_epi64( i4, i5 );\
o2 = _mm256_unpackhi_epi64( o2, i5 );\
o3 = i6;\
i6 = _mm256_unpacklo_epi64( i6, i7 );\
o3 = _mm256_unpackhi_epi64( o3, i7 );\
}/**/
// Expand the CV for the output transform: split each register's two
// 64-bit rows into separate registers, zero-filling the Q half (t0 is
// zeroed and used as the interleave partner).
#define Matrix_Transpose_O_B_2way(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
t0 = _mm256_xor_si256( t0, t0 );\
i1 = i0;\
i3 = i2;\
i5 = i4;\
i7 = i6;\
i0 = _mm256_unpacklo_epi64( i0, t0 );\
i1 = _mm256_unpackhi_epi64( i1, t0 );\
i2 = _mm256_unpacklo_epi64( i2, t0 );\
i3 = _mm256_unpackhi_epi64( i3, t0 );\
i4 = _mm256_unpacklo_epi64( i4, t0 );\
i5 = _mm256_unpackhi_epi64( i5, t0 );\
i6 = _mm256_unpacklo_epi64( i6, t0 );\
i7 = _mm256_unpackhi_epi64( i7, t0 );\
}/**/
// Inverse of Matrix_Transpose_O_B_2way: re-pack row pairs, discarding the
// zero halves; results land in i0, i2, i4, i6.
#define Matrix_Transpose_O_B_INV_2way(i0, i1, i2, i3, i4, i5, i6, i7){\
i0 = _mm256_unpacklo_epi64( i0, i1 );\
i2 = _mm256_unpacklo_epi64( i2, i3 );\
i4 = _mm256_unpacklo_epi64( i4, i5 );\
i6 = _mm256_unpacklo_epi64( i6, i7 );\
}/**/
/* Groestl-256 compression function (512-bit state) for two independent
 * streams, one per 128-bit lane of each __m256i.
 * Computes chaining = P(CV ^ M) ^ Q(M) ^ CV.
 * FIX: the temporaries were declared 'static', which made this function
 * non-reentrant — concurrent miner threads would race on shared storage
 * and could silently corrupt hashes. They are now automatic locals
 * (ROUNDS_P_Q_2WAY / MixBytes_2way reference them by these exact names).
 */
void TF512_2way( __m256i* chaining, __m256i* message )
{
   __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
   __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
   __m256i TEMP0;
   __m256i TEMP1;
   __m256i TEMP2;

   /* load message into registers xmm12 - xmm15 */
   xmm12 = message[0];
   xmm13 = message[1];
   xmm14 = message[2];
   xmm15 = message[3];

   /* transform message M from column ordering into row ordering */
   /* we first put two rows (64 bit) of the message into one 128-bit xmm register */
   Matrix_Transpose_A_2way(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);

   /* load previous chaining value */
   /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
   xmm8 = chaining[0];
   xmm0 = chaining[1];
   xmm4 = chaining[2];
   xmm5 = chaining[3];

   /* xor message to CV get input of P */
   /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
   xmm8 = _mm256_xor_si256( xmm8, xmm12 );
   xmm0 = _mm256_xor_si256( xmm0, xmm2 );
   xmm4 = _mm256_xor_si256( xmm4, xmm6 );
   xmm5 = _mm256_xor_si256( xmm5, xmm7 );

   /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
   /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
   /* result: the 8 rows of P and Q in xmm8 - xmm12 */
   Matrix_Transpose_B_2way(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);

   /* compute the two permutations P and Q in parallel */
   ROUNDS_P_Q_2WAY();

   /* unpack again to get two rows of P or two rows of Q in one xmm register */
   Matrix_Transpose_B_INV_2way(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);

   /* xor output of P and Q */
   /* result: P(CV+M)+Q(M) in xmm0...xmm3 */
   xmm0 = _mm256_xor_si256( xmm0, xmm8 );
   xmm1 = _mm256_xor_si256( xmm1, xmm10 );
   xmm2 = _mm256_xor_si256( xmm2, xmm12 );
   xmm3 = _mm256_xor_si256( xmm3, xmm14 );

   /* xor CV (feed-forward) */
   /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
   xmm0 = _mm256_xor_si256( xmm0, (chaining[0]) );
   xmm1 = _mm256_xor_si256( xmm1, (chaining[1]) );
   xmm2 = _mm256_xor_si256( xmm2, (chaining[2]) );
   xmm3 = _mm256_xor_si256( xmm3, (chaining[3]) );

   /* store CV */
   chaining[0] = xmm0;
   chaining[1] = xmm1;
   chaining[2] = xmm2;
   chaining[3] = xmm3;

   return;
}
/* Groestl-256 output transform for two independent streams:
 * CV = P(CV) ^ CV, then transpose back to column order; the truncated
 * result (the digest half of the state) is left in chaining[2..3].
 * FIX: temporaries were 'static' (non-reentrant, racy across miner
 * threads); they are now automatic locals. Names must match those used
 * inside ROUNDS_P_Q_2WAY / MixBytes_2way.
 */
void OF512_2way( __m256i* chaining )
{
   __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
   __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
   __m256i TEMP0;
   __m256i TEMP1;
   __m256i TEMP2;

   /* load CV into registers xmm8, xmm10, xmm12, xmm14 */
   xmm8 = chaining[0];
   xmm10 = chaining[1];
   xmm12 = chaining[2];
   xmm14 = chaining[3];

   /* there are now 2 rows of the CV in one xmm register */
   /* unpack to get 1 row of P (64 bit) into one half of an xmm register */
   /* result: the 8 input rows of P in xmm8 - xmm15 */
   Matrix_Transpose_O_B_2way(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);

   /* compute the permutation P */
   /* result: the output of P(CV) in xmm8 - xmm15 */
   ROUNDS_P_Q_2WAY();

   /* unpack again to get two rows of P in one xmm register */
   /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
   Matrix_Transpose_O_B_INV_2way(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);

   /* xor CV to P output (feed-forward) */
   /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
   xmm8 = _mm256_xor_si256( xmm8, (chaining[0]) );
   xmm10 = _mm256_xor_si256( xmm10, (chaining[1]) );
   xmm12 = _mm256_xor_si256( xmm12, (chaining[2]) );
   xmm14 = _mm256_xor_si256( xmm14, (chaining[3]) );

   /* transform state back from row ordering into column ordering */
   /* result: final hash value in xmm9, xmm11 */
   Matrix_Transpose_A_2way(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);

   /* we only need to return the truncated half of the state */
   chaining[2] = xmm9;
   chaining[3] = xmm11;
}
#endif // VAES #endif // VAES
#endif // GROESTL512_INTR_4WAY_H__ #endif // GROESTL256_INTR_4WAY_H__

View File

@@ -15,7 +15,9 @@
#include "miner.h" #include "miner.h"
#include "simd-utils.h" #include "simd-utils.h"
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #if defined(__AVX2__) && defined(__VAES__)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen ) int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
{ {
@@ -137,5 +139,130 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
return 0; return 0;
} }
#endif // AVX512
// AVX2 + VAES
// Initialise a 2-way (AVX2/VAES) Groestl-512 context: zero state and
// buffer, install the IV, reset offsets. hashlen is accepted for
// interface symmetry with the 4-way version but is not stored; the
// digest length of this context is fixed at 512 bits.
// Returns 0 on success, 1 if ctx is NULL.
int groestl512_2way_init( groestl512_2way_context* ctx, uint64_t hashlen )
{
   // Guard the pointer itself; ctx->chaining / ctx->buffer are arrays
   // embedded in the struct, so the original "member == NULL" test could
   // never be true.
   if ( ctx == NULL )
      return 1;

   memset_zero_256( ctx->chaining, SIZE512 );
   memset_zero_256( ctx->buffer, SIZE512 );

   // The only non-zero in the IV is len. It can be hard coded.
   ctx->chaining[ 6 ] = m256_const2_64( 0x0200000000000000, 0 );

   ctx->buf_ptr = 0;
   ctx->rem_ptr = 0;

   return 0;
}
// Absorb a final chunk of input (length in BITS), apply Groestl padding
// (0x80 terminator plus 16-bit big-endian block count in the last two
// bytes), and write the 64-byte digest per lane to output.
int groestl512_2way_update_close( groestl512_2way_context* ctx, void* output,
const void* input, uint64_t databitlen )
{
const int len = (int)databitlen / 128;   // length in __m128i units
const int hashlen_m128i = 64 / 16; // bytes to __m128i
const int hash_offset = SIZE512 - hashlen_m128i;
int rem = ctx->rem_ptr;
int blocks = len / SIZE512;
__m256i* in = (__m256i*)input;
int i;
// --- update ---
// digest full blocks directly from input
for ( i = 0; i < blocks; i++ )
TF1024_2way( ctx->chaining, &in[ i * SIZE512 ] );
ctx->buf_ptr = blocks * SIZE512;
// buffer the tail; rem may carry data from a midstate precalc
for ( i = 0; i < len % SIZE512; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem;
//--- final ---
blocks++; // adjust for final block
if ( i == SIZE512 - 1 )
{
// only 1 vector left in buffer, all padding at once
ctx->buffer[i] = m256_const1_128( _mm_set_epi8(
blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
}
else
{
// first padding vector, then zero fill, then the block count
ctx->buffer[i] = m256_const2_64( 0, 0x80 );
for ( i += 1; i < SIZE512 - 1; i++ )
ctx->buffer[i] = m256_zero;
ctx->buffer[i] = m256_const1_128( _mm_set_epi8(
blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) );
}
// digest final padding block, then output transform and store
TF1024_2way( ctx->chaining, ctx->buffer );
OF1024_2way( ctx->chaining );
for ( i = 0; i < hashlen_m128i; i++ )
casti_m256i( output, i ) = ctx->chaining[ hash_offset + i ];
return 0;
}
// One-shot 2-way Groestl-512: re-initialises the context, absorbs
// datalen BYTES of 2-way interleaved input, pads, and writes the 64-byte
// digests to output.
int groestl512_2way_full( groestl512_2way_context* ctx, void* output,
const void* input, uint64_t datalen )
{
const int len = (int)datalen >> 4;   // length in __m128i units (16 bytes each)
const int hashlen_m128i = 64 >> 4; // bytes to __m128i
const int hash_offset = SIZE512 - hashlen_m128i;
// NOTE(review): blocks is uint64_t while loop index i is int — the
// comparison promotes i; fine for realistic sizes but worth confirming.
uint64_t blocks = len / SIZE512;
__m256i* in = (__m256i*)input;
int i;
// --- init ---
memset_zero_256( ctx->chaining, SIZE512 );
memset_zero_256( ctx->buffer, SIZE512 );
// The only non-zero word of the IV is the output length (512 bits).
ctx->chaining[ 6 ] = m256_const2_64( 0x0200000000000000, 0 );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
// --- update ---
for ( i = 0; i < blocks; i++ )
TF1024_2way( ctx->chaining, &in[ i * SIZE512 ] );
ctx->buf_ptr = blocks * SIZE512;
for ( i = 0; i < len % SIZE512; i++ )
ctx->buffer[ ctx->rem_ptr + i ] = in[ ctx->buf_ptr + i ];
i += ctx->rem_ptr;
// --- close ---
blocks++;
if ( i == SIZE512 - 1 )
{
// only 1 vector left in buffer, all padding at once
// NOTE(review): blocks << 56 encodes only the low 8 bits of the block
// count (update_close also stores blocks>>8); presumably full() is only
// used for short inputs — verify against callers.
ctx->buffer[i] = m256_const2_64( blocks << 56, 0x80 );
}
else
{
ctx->buffer[i] = m256_const2_64( 0, 0x80 );
for ( i += 1; i < SIZE512 - 1; i++ )
ctx->buffer[i] = m256_zero;
ctx->buffer[i] = m256_const2_64( blocks << 56, 0 );
}
// digest final padding block, then output transform and store
TF1024_2way( ctx->chaining, ctx->buffer );
OF1024_2way( ctx->chaining );
for ( i = 0; i < hashlen_m128i; i++ )
casti_m256i( output, i ) = ctx->chaining[ hash_offset + i ];
return 0;
}
#endif // VAES #endif // VAES

View File

@@ -10,7 +10,7 @@
#endif #endif
#include <stdlib.h> #include <stdlib.h>
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #if defined(__AVX2__) && defined(__VAES__)
#define LENGTH (512) #define LENGTH (512)
@@ -36,20 +36,19 @@
#define SIZE512 (SIZE_1024/16) #define SIZE512 (SIZE_1024/16)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
typedef struct { typedef struct {
__attribute__ ((aligned (128))) __m512i chaining[SIZE512]; __attribute__ ((aligned (128))) __m512i chaining[SIZE512];
__attribute__ ((aligned (64))) __m512i buffer[SIZE512]; __attribute__ ((aligned (64))) __m512i buffer[SIZE512];
int blk_count; // SIZE_m128i int blk_count; // SIZE_m128i
int buf_ptr; // __m128i offset int buf_ptr; // __m128i offset
int rem_ptr; int rem_ptr;
int databitlen; // bits
} groestl512_4way_context; } groestl512_4way_context;
int groestl512_4way_init( groestl512_4way_context*, uint64_t ); int groestl512_4way_init( groestl512_4way_context*, uint64_t );
//int reinit_groestl( hashState_groestl* );
int groestl512_4way_update( groestl512_4way_context*, const void*, int groestl512_4way_update( groestl512_4way_context*, const void*,
uint64_t ); uint64_t );
int groestl512_4way_close( groestl512_4way_context*, void* ); int groestl512_4way_close( groestl512_4way_context*, void* );
@@ -58,5 +57,29 @@ int groestl512_4way_update_close( groestl512_4way_context*, void*,
int groestl512_4way_full( groestl512_4way_context*, void*, int groestl512_4way_full( groestl512_4way_context*, void*,
const void*, uint64_t ); const void*, uint64_t );
#endif // AVX512
// AVX2 + VAES
// 2-way (AVX2/VAES) Groestl-512 context: two independent 1024-bit states,
// one per 128-bit lane of each __m256i.
typedef struct {
__attribute__ ((aligned (128))) __m256i chaining[SIZE512]; // chaining value CV
__attribute__ ((aligned (64))) __m256i buffer[SIZE512];    // partial-block buffer
int blk_count; // SIZE_m128i
int buf_ptr; // __m128i offset
int rem_ptr;
} groestl512_2way_context;
// hashlen parameter is kept for interface symmetry with the 4-way API.
int groestl512_2way_init( groestl512_2way_context*, uint64_t );
int groestl512_2way_update( groestl512_2way_context*, const void*,
uint64_t );
int groestl512_2way_close( groestl512_2way_context*, void* );
// Final length argument is in BITS.
int groestl512_2way_update_close( groestl512_2way_context*, void*,
const void*, uint64_t );
// Final length argument is in BYTES; re-initialises the context.
int groestl512_2way_full( groestl512_2way_context*, void*,
const void*, uint64_t );
#endif // VAES #endif // VAES
#endif // GROESTL512_HASH_4WAY_H__ #endif // GROESTL512_HASH_4WAY_H__

View File

@@ -12,7 +12,7 @@
#include "groestl512-hash-4way.h" #include "groestl512-hash-4way.h"
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #if defined(__AVX2__) && defined(__VAES__)
static const __m128i round_const_p[] __attribute__ ((aligned (64))) = static const __m128i round_const_p[] __attribute__ ((aligned (64))) =
{ {
@@ -50,6 +50,8 @@ static const __m128i round_const_q[] __attribute__ ((aligned (64))) =
{ 0x8292a2b2c2d2e2f2, 0x0212223242526272 } { 0x8292a2b2c2d2e2f2, 0x0212223242526272 }
}; };
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02, static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
0x1d1519111c141810, 0x1f171b131e161a12, 0x1d1519111c141810, 0x1f171b131e161a12,
0x2d2529212c242820, 0x2f272b232e262a22, 0x2d2529212c242820, 0x2f272b232e262a22,
@@ -660,5 +662,578 @@ void OF1024_4way( __m512i* chaining )
return; return;
} }
#endif // AVX512
// AVX2 + VAES
// Byte-shuffle masks for the 2-way (AVX2) Groestl-512 transpose and
// ShiftBytes steps. The low 128-bit lane processes stream 0 and the high
// lane stream 1: every high-lane byte index is the low-lane index + 0x10.
static const __m256i TRANSP_MASK_2WAY =
{ 0x0d0509010c040800, 0x0f070b030e060a02,
0x1d1519111c141810, 0x1f171b131e161a12 };
static const __m256i SUBSH_MASK0_2WAY =
{ 0x0b0e0104070a0d00, 0x0306090c0f020508,
0x1b1e1114171a1d10, 0x1316191c1f121518 };
static const __m256i SUBSH_MASK1_2WAY =
{ 0x0c0f0205080b0e01, 0x04070a0d00030609,
0x1c1f1215181b1e11, 0x14171a1d10131619 };
static const __m256i SUBSH_MASK2_2WAY =
{ 0x0d000306090c0f02, 0x05080b0e0104070a,
0x1d101316191c1f12, 0x15181b1e1114171a };
static const __m256i SUBSH_MASK3_2WAY =
{ 0x0e0104070a0d0003, 0x06090c0f0205080b,
0x1e1114171a1d1013, 0x16191c1f1215181b };
static const __m256i SUBSH_MASK4_2WAY =
{ 0x0f0205080b0e0104, 0x070a0d000306090c,
0x1f1215181b1e1114, 0x171a1d101316191c };
static const __m256i SUBSH_MASK5_2WAY =
{ 0x000306090c0f0205, 0x080b0e0104070a0d,
0x101316191c1f1215, 0x181b1e1114171a1d };
static const __m256i SUBSH_MASK6_2WAY =
{ 0x0104070a0d000306, 0x090c0f0205080b0e,
0x1114171a1d101316, 0x191c1f1215181b1e };
static const __m256i SUBSH_MASK7_2WAY =
{ 0x06090c0f0205080b, 0x0e0104070a0d0003,
0x16191c1f1215181b, 0x1e1114171a1d1013 };
// Stringification helpers (retained from the original asm-derived code).
#define tos(a) #a
#define tostr(a) tos(a)
// GF(2^8) doubling of every byte in i, reduced modulo the AES polynomial:
// the compare builds a mask of bytes with the top bit set, the add
// doubles, and the masked 0x1b is xor-ed in as the reduction.
/* xmm[i] will be multiplied by 2
* xmm[j] will be lost
* xmm[k] has to be all 0x1b */
#define MUL2_2WAY(i, j, k){\
j = _mm256_xor_si256(j, j);\
j = _mm256_cmpgt_epi8(j, i );\
i = _mm256_add_epi8(i, i);\
j = _mm256_and_si256(j, k);\
i = _mm256_xor_si256(i, j);\
}
// Groestl MixBytes step for the 2-way 1024-bit state: multiplies each
// state column by the circulant MDS matrix over GF(2^8), using only xors
// and MUL2_2WAY doubling. Statement order mirrors the original AES-NI
// reference implementation and is significant; do not reorder. Uses
// TEMP0/TEMP1/TEMP2 from the enclosing function.
#define MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
b6 = a0;\
b7 = a1;\
a0 = _mm256_xor_si256(a0, a1);\
b0 = a2;\
a1 = _mm256_xor_si256(a1, a2);\
b1 = a3;\
a2 = _mm256_xor_si256(a2, a3);\
b2 = a4;\
a3 = _mm256_xor_si256(a3, a4);\
b3 = a5;\
a4 = _mm256_xor_si256(a4, a5);\
b4 = a6;\
a5 = _mm256_xor_si256(a5, a6);\
b5 = a7;\
a6 = _mm256_xor_si256(a6, a7);\
a7 = _mm256_xor_si256(a7, b6);\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
b0 = _mm256_xor_si256(b0, a4);\
b6 = _mm256_xor_si256(b6, a4);\
b1 = _mm256_xor_si256(b1, a5);\
b7 = _mm256_xor_si256(b7, a5);\
b2 = _mm256_xor_si256(b2, a6);\
b0 = _mm256_xor_si256(b0, a6);\
/* spill values y_4, y_5 to memory */\
TEMP0 = b0;\
b3 = _mm256_xor_si256(b3, a7);\
b1 = _mm256_xor_si256(b1, a7);\
TEMP1 = b1;\
b4 = _mm256_xor_si256(b4, a0);\
b2 = _mm256_xor_si256(b2, a0);\
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0;\
b5 = _mm256_xor_si256(b5, a1);\
b3 = _mm256_xor_si256(b3, a1);\
b1 = a1;\
b6 = _mm256_xor_si256(b6, a2);\
b4 = _mm256_xor_si256(b4, a2);\
TEMP2 = a2;\
b7 = _mm256_xor_si256(b7, a3);\
b5 = _mm256_xor_si256(b5, a3);\
\
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm256_xor_si256(a0, a3);\
a1 = _mm256_xor_si256(a1, a4);\
a2 = _mm256_xor_si256(a2, a5);\
a3 = _mm256_xor_si256(a3, a6);\
a4 = _mm256_xor_si256(a4, a7);\
a5 = _mm256_xor_si256(a5, b0);\
a6 = _mm256_xor_si256(a6, b1);\
a7 = _mm256_xor_si256(a7, TEMP2);\
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = m256_const1_64( 0x1b1b1b1b1b1b1b1b );\
MUL2_2WAY(a0, b0, b1);\
a0 = _mm256_xor_si256(a0, TEMP0);\
MUL2_2WAY(a1, b0, b1);\
a1 = _mm256_xor_si256(a1, TEMP1);\
MUL2_2WAY(a2, b0, b1);\
a2 = _mm256_xor_si256(a2, b2);\
MUL2_2WAY(a3, b0, b1);\
a3 = _mm256_xor_si256(a3, b3);\
MUL2_2WAY(a4, b0, b1);\
a4 = _mm256_xor_si256(a4, b4);\
MUL2_2WAY(a5, b0, b1);\
a5 = _mm256_xor_si256(a5, b5);\
MUL2_2WAY(a6, b0, b1);\
a6 = _mm256_xor_si256(a6, b6);\
MUL2_2WAY(a7, b0, b1);\
a7 = _mm256_xor_si256(a7, b7);\
\
/* compute v_i : double w_i */\
/* add to y_4 y_5 .. v3, v4, ... */\
MUL2_2WAY(a0, b0, b1);\
b5 = _mm256_xor_si256(b5, a0);\
MUL2_2WAY(a1, b0, b1);\
b6 = _mm256_xor_si256(b6, a1);\
MUL2_2WAY(a2, b0, b1);\
b7 = _mm256_xor_si256(b7, a2);\
MUL2_2WAY(a5, b0, b1);\
b2 = _mm256_xor_si256(b2, a5);\
MUL2_2WAY(a6, b0, b1);\
b3 = _mm256_xor_si256(b3, a6);\
MUL2_2WAY(a7, b0, b1);\
b4 = _mm256_xor_si256(b4, a7);\
MUL2_2WAY(a3, b0, b1);\
MUL2_2WAY(a4, b0, b1);\
b0 = TEMP0;\
b1 = TEMP1;\
b0 = _mm256_xor_si256(b0, a3);\
b1 = _mm256_xor_si256(b1, a4);\
}/*MixBytes*/
/* one round
* a0-a7 = input rows
* b0-b7 = output rows
*/
// SubBytes is done with AESENCLAST against a zero round key, which
// applies only the AES S-box (ShiftRows inside AESENCLAST is already
// compensated for by the caller's SUBSH shuffle); then MixBytes.
#define SUBMIX_2WAY(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* SubBytes */\
b0 = _mm256_xor_si256( b0, b0 );\
a0 = _mm256_aesenclast_epi128( a0, b0 );\
a1 = _mm256_aesenclast_epi128( a1, b0 );\
a2 = _mm256_aesenclast_epi128( a2, b0 );\
a3 = _mm256_aesenclast_epi128( a3, b0 );\
a4 = _mm256_aesenclast_epi128( a4, b0 );\
a5 = _mm256_aesenclast_epi128( a5, b0 );\
a6 = _mm256_aesenclast_epi128( a6, b0 );\
a7 = _mm256_aesenclast_epi128( a7, b0 );\
/* MixBytes */\
MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
}
/* Permutation P1024: 14 rounds, two per loop iteration, ping-ponging the
 * state between xmm8-15 and xmm0-7. For P the round constant (from
 * round_const_p[round]) is xored into the first state row only; the
 * per-row SUBSH_MASK* shuffles implement ShiftBytes and also pre-undo the
 * AES ShiftRows performed by AESENCLAST inside SUBMIX_2WAY.
 * Expects xmm0-xmm15 (and the TEMP0-2 used by MixBytes_2way) to be
 * declared in the enclosing scope. */
#define ROUNDS_P_2WAY(){\
uint8_t round_counter = 0;\
for ( round_counter = 0; round_counter < 14; round_counter += 2 ) \
{ \
/* AddRoundConstant P1024 */\
xmm8 = _mm256_xor_si256( xmm8, m256_const1_128( \
           casti_m128i( round_const_p, round_counter ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm8  = _mm256_shuffle_epi8( xmm8,  SUBSH_MASK0_2WAY ); \
xmm9  = _mm256_shuffle_epi8( xmm9,  SUBSH_MASK1_2WAY );\
xmm10 = _mm256_shuffle_epi8( xmm10, SUBSH_MASK2_2WAY );\
xmm11 = _mm256_shuffle_epi8( xmm11, SUBSH_MASK3_2WAY );\
xmm12 = _mm256_shuffle_epi8( xmm12, SUBSH_MASK4_2WAY );\
xmm13 = _mm256_shuffle_epi8( xmm13, SUBSH_MASK5_2WAY );\
xmm14 = _mm256_shuffle_epi8( xmm14, SUBSH_MASK6_2WAY );\
xmm15 = _mm256_shuffle_epi8( xmm15, SUBSH_MASK7_2WAY );\
/* SubBytes + MixBytes */\
SUBMIX_2WAY(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
\
/* AddRoundConstant P1024 (second round of the pair) */\
xmm0 = _mm256_xor_si256( xmm0, m256_const1_128( \
           casti_m128i( round_const_p, round_counter+1 ) ) ); \
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK0_2WAY );\
xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK1_2WAY );\
xmm2 = _mm256_shuffle_epi8( xmm2, SUBSH_MASK2_2WAY );\
xmm3 = _mm256_shuffle_epi8( xmm3, SUBSH_MASK3_2WAY );\
xmm4 = _mm256_shuffle_epi8( xmm4, SUBSH_MASK4_2WAY );\
xmm5 = _mm256_shuffle_epi8( xmm5, SUBSH_MASK5_2WAY );\
xmm6 = _mm256_shuffle_epi8( xmm6, SUBSH_MASK6_2WAY );\
xmm7 = _mm256_shuffle_epi8( xmm7, SUBSH_MASK7_2WAY );\
/* SubBytes + MixBytes: state returns to xmm8-15 */\
SUBMIX_2WAY(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}\
}
/* Permutation Q1024: 14 rounds, two per loop iteration, ping-ponging the
 * state between xmm8-15 and xmm0-7. For Q every row is xored with
 * all-ones (m256_neg1) and the last row additionally receives the round
 * constant from round_const_q[round]. Note Q's ShiftBytes uses a
 * different (odd) SUBSH_MASK* ordering than P.
 * Expects xmm0-xmm15 (and the TEMP0-2 used by MixBytes_2way) to be
 * declared in the enclosing scope. */
#define ROUNDS_Q_2WAY(){\
uint8_t round_counter = 0;\
for ( round_counter = 0; round_counter < 14; round_counter += 2) \
{ \
/* AddRoundConstant Q1024: ~0 into rows 0-6, constant into row 7 */\
xmm1 = m256_neg1;\
xmm8 = _mm256_xor_si256( xmm8, xmm1 );\
xmm9 = _mm256_xor_si256( xmm9, xmm1 );\
xmm10 = _mm256_xor_si256( xmm10, xmm1 );\
xmm11 = _mm256_xor_si256( xmm11, xmm1 );\
xmm12 = _mm256_xor_si256( xmm12, xmm1 );\
xmm13 = _mm256_xor_si256( xmm13, xmm1 );\
xmm14 = _mm256_xor_si256( xmm14, xmm1 );\
xmm15 = _mm256_xor_si256( xmm15, m256_const1_128( \
           casti_m128i( round_const_q, round_counter ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK1_2WAY );\
xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK3_2WAY );\
xmm10 = _mm256_shuffle_epi8( xmm10, SUBSH_MASK5_2WAY );\
xmm11 = _mm256_shuffle_epi8( xmm11, SUBSH_MASK7_2WAY );\
xmm12 = _mm256_shuffle_epi8( xmm12, SUBSH_MASK0_2WAY );\
xmm13 = _mm256_shuffle_epi8( xmm13, SUBSH_MASK2_2WAY );\
xmm14 = _mm256_shuffle_epi8( xmm14, SUBSH_MASK4_2WAY );\
xmm15 = _mm256_shuffle_epi8( xmm15, SUBSH_MASK6_2WAY );\
/* SubBytes + MixBytes */\
SUBMIX_2WAY(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
\
/* AddRoundConstant Q1024 (second round of the pair) */\
xmm9 = m256_neg1;\
xmm0 = _mm256_xor_si256( xmm0, xmm9 );\
xmm1 = _mm256_xor_si256( xmm1, xmm9 );\
xmm2 = _mm256_xor_si256( xmm2, xmm9 );\
xmm3 = _mm256_xor_si256( xmm3, xmm9 );\
xmm4 = _mm256_xor_si256( xmm4, xmm9 );\
xmm5 = _mm256_xor_si256( xmm5, xmm9 );\
xmm6 = _mm256_xor_si256( xmm6, xmm9 );\
xmm7 = _mm256_xor_si256( xmm7, m256_const1_128( \
           casti_m128i( round_const_q, round_counter+1 ) ) ); \
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK1_2WAY );\
xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK3_2WAY );\
xmm2 = _mm256_shuffle_epi8( xmm2, SUBSH_MASK5_2WAY );\
xmm3 = _mm256_shuffle_epi8( xmm3, SUBSH_MASK7_2WAY );\
xmm4 = _mm256_shuffle_epi8( xmm4, SUBSH_MASK0_2WAY );\
xmm5 = _mm256_shuffle_epi8( xmm5, SUBSH_MASK2_2WAY );\
xmm6 = _mm256_shuffle_epi8( xmm6, SUBSH_MASK4_2WAY );\
xmm7 = _mm256_shuffle_epi8( xmm7, SUBSH_MASK6_2WAY );\
/* SubBytes + MixBytes: state returns to xmm8-15 */\
SUBMIX_2WAY(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}\
}
#define Matrix_Transpose_2way(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\
t0 = TRANSP_MASK_2WAY;\
\
i6 = _mm256_shuffle_epi8(i6, t0);\
i0 = _mm256_shuffle_epi8(i0, t0);\
i1 = _mm256_shuffle_epi8(i1, t0);\
i2 = _mm256_shuffle_epi8(i2, t0);\
i3 = _mm256_shuffle_epi8(i3, t0);\
t1 = i2;\
i4 = _mm256_shuffle_epi8(i4, t0);\
i5 = _mm256_shuffle_epi8(i5, t0);\
t2 = i4;\
t3 = i6;\
i7 = _mm256_shuffle_epi8(i7, t0);\
\
/* continue with unpack using 4 temp registers */\
t0 = i0;\
t2 = _mm256_unpackhi_epi16(t2, i5);\
i4 = _mm256_unpacklo_epi16(i4, i5);\
t3 = _mm256_unpackhi_epi16(t3, i7);\
i6 = _mm256_unpacklo_epi16(i6, i7);\
t0 = _mm256_unpackhi_epi16(t0, i1);\
t1 = _mm256_unpackhi_epi16(t1, i3);\
i2 = _mm256_unpacklo_epi16(i2, i3);\
i0 = _mm256_unpacklo_epi16(i0, i1);\
\
/* shuffle with immediate */\
t0 = _mm256_shuffle_epi32(t0, 216);\
t1 = _mm256_shuffle_epi32(t1, 216);\
t2 = _mm256_shuffle_epi32(t2, 216);\
t3 = _mm256_shuffle_epi32(t3, 216);\
i0 = _mm256_shuffle_epi32(i0, 216);\
i2 = _mm256_shuffle_epi32(i2, 216);\
i4 = _mm256_shuffle_epi32(i4, 216);\
i6 = _mm256_shuffle_epi32(i6, 216);\
\
/* continue with unpack */\
t4 = i0;\
i0 = _mm256_unpacklo_epi32(i0, i2);\
t4 = _mm256_unpackhi_epi32(t4, i2);\
t5 = t0;\
t0 = _mm256_unpacklo_epi32(t0, t1);\
t5 = _mm256_unpackhi_epi32(t5, t1);\
t6 = i4;\
i4 = _mm256_unpacklo_epi32(i4, i6);\
t7 = t2;\
t6 = _mm256_unpackhi_epi32(t6, i6);\
i2 = t0;\
t2 = _mm256_unpacklo_epi32(t2, t3);\
i3 = t0;\
t7 = _mm256_unpackhi_epi32(t7, t3);\
\
/* there are now 2 rows in each xmm */\
/* unpack to get 1 row of CV in each xmm */\
i1 = i0;\
i1 = _mm256_unpackhi_epi64(i1, i4);\
i0 = _mm256_unpacklo_epi64(i0, i4);\
i4 = t4;\
i3 = _mm256_unpackhi_epi64(i3, t2);\
i5 = t4;\
i2 = _mm256_unpacklo_epi64(i2, t2);\
i6 = t5;\
i5 = _mm256_unpackhi_epi64(i5, t6);\
i7 = t5;\
i4 = _mm256_unpacklo_epi64(i4, t6);\
i7 = _mm256_unpackhi_epi64(i7, t7);\
i6 = _mm256_unpacklo_epi64(i6, t7);\
/* transpose done */\
}/**/
#define Matrix_Transpose_INV_2way(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\
/* transpose matrix to get output format */\
o1 = i0;\
i0 = _mm256_unpacklo_epi64(i0, i1);\
o1 = _mm256_unpackhi_epi64(o1, i1);\
t0 = i2;\
i2 = _mm256_unpacklo_epi64(i2, i3);\
t0 = _mm256_unpackhi_epi64(t0, i3);\
t1 = i4;\
i4 = _mm256_unpacklo_epi64(i4, i5);\
t1 = _mm256_unpackhi_epi64(t1, i5);\
t2 = i6;\
o0 = TRANSP_MASK_2WAY;\
i6 = _mm256_unpacklo_epi64(i6, i7);\
t2 = _mm256_unpackhi_epi64(t2, i7);\
/* load transpose mask into a register, because it will be used 8 times */\
i0 = _mm256_shuffle_epi8(i0, o0);\
i2 = _mm256_shuffle_epi8(i2, o0);\
i4 = _mm256_shuffle_epi8(i4, o0);\
i6 = _mm256_shuffle_epi8(i6, o0);\
o1 = _mm256_shuffle_epi8(o1, o0);\
t0 = _mm256_shuffle_epi8(t0, o0);\
t1 = _mm256_shuffle_epi8(t1, o0);\
t2 = _mm256_shuffle_epi8(t2, o0);\
/* continue with unpack using 4 temp registers */\
t3 = i4;\
o2 = o1;\
o0 = i0;\
t4 = t1;\
\
t3 = _mm256_unpackhi_epi16(t3, i6);\
i4 = _mm256_unpacklo_epi16(i4, i6);\
o0 = _mm256_unpackhi_epi16(o0, i2);\
i0 = _mm256_unpacklo_epi16(i0, i2);\
o2 = _mm256_unpackhi_epi16(o2, t0);\
o1 = _mm256_unpacklo_epi16(o1, t0);\
t4 = _mm256_unpackhi_epi16(t4, t2);\
t1 = _mm256_unpacklo_epi16(t1, t2);\
/* shuffle with immediate */\
i4 = _mm256_shuffle_epi32(i4, 216);\
t3 = _mm256_shuffle_epi32(t3, 216);\
o1 = _mm256_shuffle_epi32(o1, 216);\
o2 = _mm256_shuffle_epi32(o2, 216);\
i0 = _mm256_shuffle_epi32(i0, 216);\
o0 = _mm256_shuffle_epi32(o0, 216);\
t1 = _mm256_shuffle_epi32(t1, 216);\
t4 = _mm256_shuffle_epi32(t4, 216);\
/* continue with unpack */\
i1 = i0;\
i3 = o0;\
i5 = o1;\
i7 = o2;\
i0 = _mm256_unpacklo_epi32(i0, i4);\
i1 = _mm256_unpackhi_epi32(i1, i4);\
o0 = _mm256_unpacklo_epi32(o0, t3);\
i3 = _mm256_unpackhi_epi32(i3, t3);\
o1 = _mm256_unpacklo_epi32(o1, t1);\
i5 = _mm256_unpackhi_epi32(i5, t1);\
o2 = _mm256_unpacklo_epi32(o2, t4);\
i7 = _mm256_unpackhi_epi32(i7, t4);\
/* transpose done */\
}/**/
/* Convert the chaining value from column ordering into the row ordering
 * used internally by the P/Q permutations, in place.
 *
 * chaining: in/out, 8 x __m256i; each vector holds one state row for the
 *           two interleaved 128-bit lanes.
 *
 * Fix: the vector variables were declared static, which races when
 * multiple threads hash concurrently (each call clobbers shared storage).
 * They are ordinary locals now; inside Matrix_Transpose_2way every temp
 * is written before it is read, so no initialization is required.
 */
void INIT_2way( __m256i *chaining )
{
   __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
   __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;

   /* load IV into registers xmm8 - xmm15 */
   xmm8 = chaining[0];
   xmm9 = chaining[1];
   xmm10 = chaining[2];
   xmm11 = chaining[3];
   xmm12 = chaining[4];
   xmm13 = chaining[5];
   xmm14 = chaining[6];
   xmm15 = chaining[7];

   /* transform chaining value from column ordering into row ordering */
   Matrix_Transpose_2way(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);

   /* store transposed IV */
   chaining[0] = xmm8;
   chaining[1] = xmm9;
   chaining[2] = xmm10;
   chaining[3] = xmm11;
   chaining[4] = xmm12;
   chaining[5] = xmm13;
   chaining[6] = xmm14;
   chaining[7] = xmm15;
}
/* Groestl-1024 compression function (2-way):
 *    CV' = P(CV ^ M) ^ Q(M) ^ CV
 *
 * chaining: in/out chaining value, 8 x __m256i, row-ordered.
 * message:  one 1024-bit message block per lane, 8 x __m256i,
 *           column-ordered (transposed here before use).
 *
 * Fix: all working vectors (state, QTEMP, and the TEMP0-2 scratch used
 * by the MixBytes_2way macro) were declared static, making concurrent
 * calls from different miner threads race on shared storage. They are
 * plain locals now; every one is written before it is read.
 */
void TF1024_2way( __m256i *chaining, const __m256i *message )
{
   __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
   __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
   __m256i QTEMP[8];
   /* scratch referenced by name inside MixBytes_2way */
   __m256i TEMP0;
   __m256i TEMP1;
   __m256i TEMP2;

   /* load message into registers xmm8 - xmm15 (Q = message) */
   xmm8 = message[0];
   xmm9 = message[1];
   xmm10 = message[2];
   xmm11 = message[3];
   xmm12 = message[4];
   xmm13 = message[5];
   xmm14 = message[6];
   xmm15 = message[7];

   /* transform message M from column ordering into row ordering */
   Matrix_Transpose_2way(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);

   /* store message M (Q input) for later */
   QTEMP[0] = xmm8;
   QTEMP[1] = xmm9;
   QTEMP[2] = xmm10;
   QTEMP[3] = xmm11;
   QTEMP[4] = xmm12;
   QTEMP[5] = xmm13;
   QTEMP[6] = xmm14;
   QTEMP[7] = xmm15;

   /* xor CV to message to get P input */
   /* result: CV+M in xmm8...xmm15 */
   xmm8 = _mm256_xor_si256( xmm8, (chaining[0]) );
   xmm9 = _mm256_xor_si256( xmm9, (chaining[1]) );
   xmm10 = _mm256_xor_si256( xmm10, (chaining[2]) );
   xmm11 = _mm256_xor_si256( xmm11, (chaining[3]) );
   xmm12 = _mm256_xor_si256( xmm12, (chaining[4]) );
   xmm13 = _mm256_xor_si256( xmm13, (chaining[5]) );
   xmm14 = _mm256_xor_si256( xmm14, (chaining[6]) );
   xmm15 = _mm256_xor_si256( xmm15, (chaining[7]) );

   /* compute permutation P */
   /* result: P(CV+M) in xmm8...xmm15 */
   ROUNDS_P_2WAY();

   /* xor CV to P output (feed-forward) */
   /* result: P(CV+M)+CV in xmm8...xmm15 */
   xmm8 = _mm256_xor_si256( xmm8, (chaining[0]) );
   xmm9 = _mm256_xor_si256( xmm9, (chaining[1]) );
   xmm10 = _mm256_xor_si256( xmm10, (chaining[2]) );
   xmm11 = _mm256_xor_si256( xmm11, (chaining[3]) );
   xmm12 = _mm256_xor_si256( xmm12, (chaining[4]) );
   xmm13 = _mm256_xor_si256( xmm13, (chaining[5]) );
   xmm14 = _mm256_xor_si256( xmm14, (chaining[6]) );
   xmm15 = _mm256_xor_si256( xmm15, (chaining[7]) );

   /* store P(CV+M)+CV */
   chaining[0] = xmm8;
   chaining[1] = xmm9;
   chaining[2] = xmm10;
   chaining[3] = xmm11;
   chaining[4] = xmm12;
   chaining[5] = xmm13;
   chaining[6] = xmm14;
   chaining[7] = xmm15;

   /* load message M (Q input) into xmm8-15 */
   xmm8 = QTEMP[0];
   xmm9 = QTEMP[1];
   xmm10 = QTEMP[2];
   xmm11 = QTEMP[3];
   xmm12 = QTEMP[4];
   xmm13 = QTEMP[5];
   xmm14 = QTEMP[6];
   xmm15 = QTEMP[7];

   /* compute permutation Q */
   /* result: Q(M) in xmm8...xmm15 */
   ROUNDS_Q_2WAY();

   /* xor Q output */
   /* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */
   xmm8 = _mm256_xor_si256( xmm8, (chaining[0]) );
   xmm9 = _mm256_xor_si256( xmm9, (chaining[1]) );
   xmm10 = _mm256_xor_si256( xmm10, (chaining[2]) );
   xmm11 = _mm256_xor_si256( xmm11, (chaining[3]) );
   xmm12 = _mm256_xor_si256( xmm12, (chaining[4]) );
   xmm13 = _mm256_xor_si256( xmm13, (chaining[5]) );
   xmm14 = _mm256_xor_si256( xmm14, (chaining[6]) );
   xmm15 = _mm256_xor_si256( xmm15, (chaining[7]) );

   /* store new CV */
   chaining[0] = xmm8;
   chaining[1] = xmm9;
   chaining[2] = xmm10;
   chaining[3] = xmm11;
   chaining[4] = xmm12;
   chaining[5] = xmm13;
   chaining[6] = xmm14;
   chaining[7] = xmm15;
}
/* Groestl-1024 output transformation (2-way):
 *    CV' = truncate( P(CV) ^ CV ), transposed back to column ordering.
 * Only the truncated half of the state is stored, into chaining[4..7].
 *
 * chaining: in/out, 8 x __m256i, row-ordered on entry.
 *
 * Fix: the vector variables (including the TEMP0-2 scratch used by the
 * MixBytes_2way macro) were declared static, racing across threads.
 * They are plain locals now; all are written before read.
 */
void OF1024_2way( __m256i* chaining )
{
   __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
   __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
   /* scratch referenced by name inside MixBytes_2way */
   __m256i TEMP0;
   __m256i TEMP1;
   __m256i TEMP2;

   /* load CV into registers xmm8 - xmm15 */
   xmm8 = chaining[0];
   xmm9 = chaining[1];
   xmm10 = chaining[2];
   xmm11 = chaining[3];
   xmm12 = chaining[4];
   xmm13 = chaining[5];
   xmm14 = chaining[6];
   xmm15 = chaining[7];

   /* compute permutation P */
   /* result: P(CV) in xmm8...xmm15 */
   ROUNDS_P_2WAY();

   /* xor CV to P output (feed-forward) */
   /* result: P(CV)+CV in xmm8...xmm15 */
   xmm8 = _mm256_xor_si256( xmm8, (chaining[0]) );
   xmm9 = _mm256_xor_si256( xmm9, (chaining[1]) );
   xmm10 = _mm256_xor_si256( xmm10, (chaining[2]) );
   xmm11 = _mm256_xor_si256( xmm11, (chaining[3]) );
   xmm12 = _mm256_xor_si256( xmm12, (chaining[4]) );
   xmm13 = _mm256_xor_si256( xmm13, (chaining[5]) );
   xmm14 = _mm256_xor_si256( xmm14, (chaining[6]) );
   xmm15 = _mm256_xor_si256( xmm15, (chaining[7]) );

   /* transpose CV back from row ordering to column ordering */
   /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */
   Matrix_Transpose_INV_2way(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm4, xmm0, xmm6, xmm1, xmm2, xmm3, xmm5, xmm7);

   /* we only need to return the truncated half of the state */
   chaining[4] = xmm0;
   chaining[5] = xmm6;
   chaining[6] = xmm13;
   chaining[7] = xmm15;
}
#endif   // VAES
#endif   // GROESTL512_INTR_4WAY_H__

View File

@@ -174,24 +174,19 @@ void allium_16way_hash( void *state, const void *input )
#if defined(__VAES__) #if defined(__VAES__)
intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 256 ); intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 256 );
groestl256_4way_full( &ctx.groestl, vhash, vhash, 32 );
groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 );
dintrlv_4x128( state, state+32, state+64, state+96, vhash, 256 ); dintrlv_4x128( state, state+32, state+64, state+96, vhash, 256 );
intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 256 ); intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 256 );
groestl256_4way_full( &ctx.groestl, vhash, vhash, 32 );
groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 );
dintrlv_4x128( state+128, state+160, state+192, state+224, vhash, 256 ); dintrlv_4x128( state+128, state+160, state+192, state+224, vhash, 256 );
intrlv_4x128( vhash, hash8, hash9, hash10, hash11, 256 ); intrlv_4x128( vhash, hash8, hash9, hash10, hash11, 256 );
groestl256_4way_full( &ctx.groestl, vhash, vhash, 32 );
groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 );
dintrlv_4x128( state+256, state+288, state+320, state+352, vhash, 256 ); dintrlv_4x128( state+256, state+288, state+320, state+352, vhash, 256 );
intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 );
groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 ); intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 );
groestl256_4way_full( &ctx.groestl, vhash, vhash, 32 );
dintrlv_4x128( state+384, state+416, state+448, state+480, vhash, 256 ); dintrlv_4x128( state+384, state+416, state+448, state+480, vhash, 256 );
#else #else
@@ -262,8 +257,11 @@ typedef struct {
keccak256_4way_context keccak; keccak256_4way_context keccak;
cubehashParam cube; cubehashParam cube;
skein256_4way_context skein; skein256_4way_context skein;
#if defined(__VAES__)
groestl256_2way_context groestl;
#else
hashState_groestl256 groestl; hashState_groestl256 groestl;
#endif
} allium_8way_ctx_holder; } allium_8way_ctx_holder;
static __thread allium_8way_ctx_holder allium_8way_ctx; static __thread allium_8way_ctx_holder allium_8way_ctx;
@@ -273,7 +271,11 @@ bool init_allium_8way_ctx()
keccak256_4way_init( &allium_8way_ctx.keccak ); keccak256_4way_init( &allium_8way_ctx.keccak );
cubehashInit( &allium_8way_ctx.cube, 256, 16, 32 ); cubehashInit( &allium_8way_ctx.cube, 256, 16, 32 );
skein256_4way_init( &allium_8way_ctx.skein ); skein256_4way_init( &allium_8way_ctx.skein );
#if defined(__VAES__)
groestl256_2way_init( &allium_8way_ctx.groestl, 32 );
#else
init_groestl256( &allium_8way_ctx.groestl, 32 ); init_groestl256( &allium_8way_ctx.groestl, 32 );
#endif
return true; return true;
} }
@@ -352,9 +354,28 @@ void allium_8way_hash( void *hash, const void *input )
skein256_4way_update( &ctx.skein, vhashB, 32 ); skein256_4way_update( &ctx.skein, vhashB, 32 );
skein256_4way_close( &ctx.skein, vhashB ); skein256_4way_close( &ctx.skein, vhashB );
#if defined(__VAES__)
uint64_t vhashC[4*2] __attribute__ ((aligned (64)));
uint64_t vhashD[4*2] __attribute__ ((aligned (64)));
rintrlv_4x64_2x128( vhashC, vhashD, vhashA, 256 );
groestl256_2way_full( &ctx.groestl, vhashC, vhashC, 32 );
groestl256_2way_full( &ctx.groestl, vhashD, vhashD, 32 );
dintrlv_2x128( hash0, hash1, vhashC, 256 );
dintrlv_2x128( hash2, hash3, vhashD, 256 );
rintrlv_4x64_2x128( vhashC, vhashD, vhashB, 256 );
groestl256_2way_full( &ctx.groestl, vhashC, vhashC, 32 );
groestl256_2way_full( &ctx.groestl, vhashD, vhashD, 32 );
dintrlv_2x128( hash4, hash5, vhashC, 256 );
dintrlv_2x128( hash6, hash7, vhashD, 256 );
#else
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 );
dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 ); dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 );
groestl256_full( &ctx.groestl, hash0, hash0, 256 ); groestl256_full( &ctx.groestl, hash0, hash0, 256 );
groestl256_full( &ctx.groestl, hash1, hash1, 256 ); groestl256_full( &ctx.groestl, hash1, hash1, 256 );
groestl256_full( &ctx.groestl, hash2, hash2, 256 ); groestl256_full( &ctx.groestl, hash2, hash2, 256 );
@@ -363,6 +384,8 @@ void allium_8way_hash( void *hash, const void *input )
groestl256_full( &ctx.groestl, hash5, hash5, 256 ); groestl256_full( &ctx.groestl, hash5, hash5, 256 );
groestl256_full( &ctx.groestl, hash6, hash6, 256 ); groestl256_full( &ctx.groestl, hash6, hash6, 256 );
groestl256_full( &ctx.groestl, hash7, hash7, 256 ); groestl256_full( &ctx.groestl, hash7, hash7, 256 );
#endif
} }
int scanhash_allium_8way( struct work *work, uint32_t max_nonce, int scanhash_allium_8way( struct work *work, uint32_t max_nonce,

View File

@@ -187,7 +187,8 @@ bool register_allium_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_allium; gate->scanhash = (void*)&scanhash_allium;
gate->hash = (void*)&allium_hash; gate->hash = (void*)&allium_hash;
#endif #endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT
| VAES_OPT | VAES256_OPT;
opt_target_factor = 256.0; opt_target_factor = 256.0;
return true; return true;
}; };

View File

@@ -3,36 +3,38 @@
bool register_sha256t_algo( algo_gate_t* gate ) bool register_sha256t_algo( algo_gate_t* gate )
{ {
#if defined(SHA256T_8WAY) #if defined(SHA256T_8WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256t_8way; gate->scanhash = (void*)&scanhash_sha256t_8way;
gate->hash = (void*)&sha256t_8way_hash; gate->hash = (void*)&sha256t_8way_hash;
#elif defined(SHA256T_4WAY) #else
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256t_4way; gate->scanhash = (void*)&scanhash_sha256t_4way;
gate->hash = (void*)&sha256t_4way_hash; gate->hash = (void*)&sha256t_4way_hash;
/*
#else #else
gate->optimizations = SHA_OPT; gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256t; gate->scanhash = (void*)&scanhash_sha256t;
gate->hash = (void*)&sha256t_hash; gate->hash = (void*)&sha256t_hash;
*/
#endif #endif
gate->optimizations = SSE2_OPT | AVX2_OPT;
return true; return true;
} }
bool register_sha256q_algo( algo_gate_t* gate ) bool register_sha256q_algo( algo_gate_t* gate )
{ {
#if defined(SHA256T_8WAY) #if defined(SHA256T_8WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256q_8way; gate->scanhash = (void*)&scanhash_sha256q_8way;
gate->hash = (void*)&sha256q_8way_hash; gate->hash = (void*)&sha256q_8way_hash;
#elif defined(SHA256T_4WAY) #else
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256q_4way; gate->scanhash = (void*)&scanhash_sha256q_4way;
gate->hash = (void*)&sha256q_4way_hash; gate->hash = (void*)&sha256q_4way_hash;
/*
#else #else
gate->optimizations = SHA_OPT; gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256q; gate->scanhash = (void*)&scanhash_sha256q;
gate->hash = (void*)&sha256q_hash; gate->hash = (void*)&sha256q_hash;
*/
#endif #endif
gate->optimizations = SSE2_OPT | AVX2_OPT;
return true; return true;
} }

View File

@@ -4,13 +4,10 @@
#include <stdint.h> #include <stdint.h>
#include "algo-gate-api.h" #include "algo-gate-api.h"
// Override multi way on ryzen, SHA is better. #if defined(__AVX2__)
#if !defined(__SHA__)
#if defined(__AVX2__)
#define SHA256T_8WAY #define SHA256T_8WAY
#elif defined(__SSE2__) #else
#define SHA256T_4WAY #define SHA256T_4WAY
#endif
#endif #endif
bool register_sha256t_algo( algo_gate_t* gate ); bool register_sha256t_algo( algo_gate_t* gate );
@@ -36,12 +33,13 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr ); uint64_t *hashes_done, struct thr_info *mythr );
#endif #endif
/*
void sha256t_hash( void *output, const void *input ); void sha256t_hash( void *output, const void *input );
int scanhash_sha256t( struct work *work, uint32_t max_nonce, int scanhash_sha256t( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr ); uint64_t *hashes_done, struct thr_info *mythr );
void sha256q_hash( void *output, const void *input ); void sha256q_hash( void *output, const void *input );
int scanhash_sha256q( struct work *work, uint32_t max_nonce, int scanhash_sha256q( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr ); uint64_t *hashes_done, struct thr_info *mythr );
*/
#endif #endif

View File

@@ -1,5 +1,7 @@
#include "sha256t-gate.h" #include "sha256t-gate.h"
// Obsolete
#if !defined(SHA256T_16WAY) && !defined(SHA256T_8WAY) && !defined(SHA256T_4WAY) #if !defined(SHA256T_16WAY) && !defined(SHA256T_8WAY) && !defined(SHA256T_4WAY)
#include <stdlib.h> #include <stdlib.h>

View File

@@ -26,7 +26,11 @@ static const uint32_t IV512[] =
static void static void
c512_2way( shavite512_2way_context *ctx, const void *msg ) c512_2way( shavite512_2way_context *ctx, const void *msg )
{ {
#if defined(__VAES__)
const __m256i zero = _mm256_setzero_si256();
#else
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
#endif
__m256i p0, p1, p2, p3, x; __m256i p0, p1, p2, p3, x;
__m256i k00, k01, k02, k03, k10, k11, k12, k13; __m256i k00, k01, k02, k03, k10, k11, k12, k13;
__m256i *m = (__m256i*)msg; __m256i *m = (__m256i*)msg;

View File

@@ -619,11 +619,20 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break; break;
case GROESTL: case GROESTL:
#if defined(__VAES__)
intrlv_2x128( vhash, in0, in1, size<<3 );
groestl512_2way_full( &ctx.groestl, vhash, vhash, size );
dintrlv_2x128_512( hash0, hash1, vhash );
intrlv_2x128( vhash, in2, in3, size<<3 );
groestl512_2way_full( &ctx.groestl, vhash, vhash, size );
dintrlv_2x128_512( hash2, hash3, vhash );
#else
groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 );
groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 ); groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 );
groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 ); groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 );
groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 ); groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 );
break; #endif
break;
case JH: case JH:
if ( i == 0 ) if ( i == 0 )
jh512_4way_update( &ctx.jh, input + (64<<2), 16 ); jh512_4way_update( &ctx.jh, input + (64<<2), 16 );
@@ -711,11 +720,20 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
} }
break; break;
case SHAVITE: case SHAVITE:
#if defined(__VAES__)
intrlv_2x128( vhash, in0, in1, size<<3 );
shavite512_2way_full( &ctx.shavite, vhash, vhash, size );
dintrlv_2x128_512( hash0, hash1, vhash );
intrlv_2x128( vhash, in2, in3, size<<3 );
shavite512_2way_full( &ctx.shavite, vhash, vhash, size );
dintrlv_2x128_512( hash2, hash3, vhash );
#else
shavite512_full( &ctx.shavite, hash0, in0, size ); shavite512_full( &ctx.shavite, hash0, in0, size );
shavite512_full( &ctx.shavite, hash1, in1, size ); shavite512_full( &ctx.shavite, hash1, in1, size );
shavite512_full( &ctx.shavite, hash2, in2, size ); shavite512_full( &ctx.shavite, hash2, in2, size );
shavite512_full( &ctx.shavite, hash3, in3, size ); shavite512_full( &ctx.shavite, hash3, in3, size );
break; #endif
break;
case SIMD: case SIMD:
intrlv_2x128( vhash, in0, in1, size<<3 ); intrlv_2x128( vhash, in0, in1, size<<3 );
simd512_2way_full( &ctx.simd, vhash, vhash, size ); simd512_2way_full( &ctx.simd, vhash, vhash, size );
@@ -725,6 +743,14 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
dintrlv_2x128_512( hash2, hash3, vhash ); dintrlv_2x128_512( hash2, hash3, vhash );
break; break;
case ECHO: case ECHO:
#if defined(__VAES__)
intrlv_2x128( vhash, in0, in1, size<<3 );
echo_2way_full( &ctx.echo, vhash, 512, vhash, size );
dintrlv_2x128_512( hash0, hash1, vhash );
intrlv_2x128( vhash, in2, in3, size<<3 );
echo_2way_full( &ctx.echo, vhash, 512, vhash, size );
dintrlv_2x128_512( hash2, hash3, vhash );
#else
echo_full( &ctx.echo, (BitSequence *)hash0, 512, echo_full( &ctx.echo, (BitSequence *)hash0, 512,
(const BitSequence *)in0, size ); (const BitSequence *)in0, size );
echo_full( &ctx.echo, (BitSequence *)hash1, 512, echo_full( &ctx.echo, (BitSequence *)hash1, 512,
@@ -733,7 +759,8 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid )
(const BitSequence *)in2, size ); (const BitSequence *)in2, size );
echo_full( &ctx.echo, (BitSequence *)hash3, 512, echo_full( &ctx.echo, (BitSequence *)hash3, 512,
(const BitSequence *)in3, size ); (const BitSequence *)in3, size );
break; #endif
break;
case HAMSI: case HAMSI:
if ( i == 0 ) if ( i == 0 )
hamsi512_4way_update( &ctx.hamsi, input + (64<<2), 16 ); hamsi512_4way_update( &ctx.hamsi, input + (64<<2), 16 );

View File

@@ -61,7 +61,8 @@ bool register_x16r_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_x16r; gate->scanhash = (void*)&scanhash_x16r;
gate->hash = (void*)&x16r_hash; gate->hash = (void*)&x16r_hash;
#endif #endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT |
VAES_OPT | VAES256_OPT;
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString; x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
opt_target_factor = 256.0; opt_target_factor = 256.0;
return true; return true;
@@ -79,7 +80,8 @@ bool register_x16rv2_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_x16rv2; gate->scanhash = (void*)&scanhash_x16rv2;
gate->hash = (void*)&x16rv2_hash; gate->hash = (void*)&x16rv2_hash;
#endif #endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT |
VAES_OPT | VAES256_OPT;
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString; x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
opt_target_factor = 256.0; opt_target_factor = 256.0;
return true; return true;
@@ -97,7 +99,8 @@ bool register_x16s_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_x16r; gate->scanhash = (void*)&scanhash_x16r;
gate->hash = (void*)&x16r_hash; gate->hash = (void*)&x16r_hash;
#endif #endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT |
VAES_OPT | VAES256_OPT;
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString; x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
opt_target_factor = 256.0; opt_target_factor = 256.0;
return true; return true;
@@ -230,7 +233,8 @@ bool register_x16rt_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_x16rt; gate->scanhash = (void*)&scanhash_x16rt;
gate->hash = (void*)&x16r_hash; gate->hash = (void*)&x16r_hash;
#endif #endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT |
VAES_OPT | VAES256_OPT;
opt_target_factor = 256.0; opt_target_factor = 256.0;
return true; return true;
}; };
@@ -247,7 +251,8 @@ bool register_x16rt_veil_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_x16rt; gate->scanhash = (void*)&scanhash_x16rt;
gate->hash = (void*)&x16r_hash; gate->hash = (void*)&x16r_hash;
#endif #endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT |
VAES_OPT | VAES256_OPT;
gate->build_extraheader = (void*)&veil_build_extraheader; gate->build_extraheader = (void*)&veil_build_extraheader;
opt_target_factor = 256.0; opt_target_factor = 256.0;
return true; return true;
@@ -277,22 +282,17 @@ bool register_x21s_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_x21s_8way; gate->scanhash = (void*)&scanhash_x21s_8way;
gate->hash = (void*)&x21s_8way_hash; gate->hash = (void*)&x21s_8way_hash;
gate->miner_thread_init = (void*)&x21s_8way_thread_init; gate->miner_thread_init = (void*)&x21s_8way_thread_init;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT
| VAES_OPT;
#elif defined (X16R_4WAY) #elif defined (X16R_4WAY)
gate->scanhash = (void*)&scanhash_x21s_4way; gate->scanhash = (void*)&scanhash_x21s_4way;
gate->hash = (void*)&x21s_4way_hash; gate->hash = (void*)&x21s_4way_hash;
gate->miner_thread_init = (void*)&x21s_4way_thread_init; gate->miner_thread_init = (void*)&x21s_4way_thread_init;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT
| AVX512_OPT | VAES_OPT;
#else #else
gate->scanhash = (void*)&scanhash_x21s; gate->scanhash = (void*)&scanhash_x21s;
gate->hash = (void*)&x21s_hash; gate->hash = (void*)&x21s_hash;
gate->miner_thread_init = (void*)&x21s_thread_init; gate->miner_thread_init = (void*)&x21s_thread_init;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT
| AVX512_OPT | VAES_OPT;
#endif #endif
// gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT; gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT |
VAES_OPT | VAES256_OPT;
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString; x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
opt_target_factor = 256.0; opt_target_factor = 256.0;
return true; return true;

View File

@@ -41,6 +41,7 @@
#include "algo/sha/sha-hash-4way.h" #include "algo/sha/sha-hash-4way.h"
#if defined(__VAES__) #if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h" #include "algo/groestl/groestl512-hash-4way.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/shavite/shavite-hash-4way.h" #include "algo/shavite/shavite-hash-4way.h"
#include "algo/echo/echo-hash-4way.h" #include "algo/echo/echo-hash-4way.h"
#endif #endif
@@ -145,15 +146,21 @@ union _x16r_4way_context_overlay
{ {
blake512_4way_context blake; blake512_4way_context blake;
bmw512_4way_context bmw; bmw512_4way_context bmw;
hashState_echo echo; #if defined(__VAES__)
groestl512_2way_context groestl;
shavite512_2way_context shavite;
echo_2way_context echo;
#else
hashState_groestl groestl; hashState_groestl groestl;
shavite512_context shavite;
hashState_echo echo;
#endif
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
luffa_2way_context luffa; luffa_2way_context luffa;
hashState_luffa luffa1; hashState_luffa luffa1;
cubehashParam cube; cubehashParam cube;
shavite512_context shavite;
simd_2way_context simd; simd_2way_context simd;
hamsi512_4way_context hamsi; hamsi512_4way_context hamsi;
hashState_fugue fugue; hashState_fugue fugue;

View File

@@ -672,14 +672,20 @@ union _x16rv2_4way_context_overlay
{ {
blake512_4way_context blake; blake512_4way_context blake;
bmw512_4way_context bmw; bmw512_4way_context bmw;
hashState_echo echo; #if defined(__VAES__)
groestl512_2way_context groestl;
shavite512_2way_context shavite;
echo_2way_context echo;
#else
hashState_groestl groestl; hashState_groestl groestl;
shavite512_context shavite;
hashState_echo echo;
#endif
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
luffa_2way_context luffa; luffa_2way_context luffa;
cubehashParam cube; cubehashParam cube;
shavite512_context shavite;
simd_2way_context simd; simd_2way_context simd;
hamsi512_4way_context hamsi; hamsi512_4way_context hamsi;
hashState_fugue fugue; hashState_fugue fugue;
@@ -745,10 +751,19 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break; break;
case GROESTL: case GROESTL:
#if defined(__VAES__)
intrlv_2x128( vhash, in0, in1, size<<3 );
groestl512_2way_full( &ctx.groestl, vhash, vhash, size );
dintrlv_2x128_512( hash0, hash1, vhash );
intrlv_2x128( vhash, in2, in3, size<<3 );
groestl512_2way_full( &ctx.groestl, vhash, vhash, size );
dintrlv_2x128_512( hash2, hash3, vhash );
#else
groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 );
groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 ); groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 );
groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 ); groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 );
groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 ); groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 );
#endif
break; break;
case JH: case JH:
if ( i == 0 ) if ( i == 0 )
@@ -887,10 +902,19 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
} }
break; break;
case SHAVITE: case SHAVITE:
#if defined(__VAES__)
intrlv_2x128( vhash, in0, in1, size<<3 );
shavite512_2way_full( &ctx.shavite, vhash, vhash, size );
dintrlv_2x128_512( hash0, hash1, vhash );
intrlv_2x128( vhash, in2, in3, size<<3 );
shavite512_2way_full( &ctx.shavite, vhash, vhash, size );
dintrlv_2x128_512( hash2, hash3, vhash );
#else
shavite512_full( &ctx.shavite, hash0, in0, size ); shavite512_full( &ctx.shavite, hash0, in0, size );
shavite512_full( &ctx.shavite, hash1, in1, size ); shavite512_full( &ctx.shavite, hash1, in1, size );
shavite512_full( &ctx.shavite, hash2, in2, size ); shavite512_full( &ctx.shavite, hash2, in2, size );
shavite512_full( &ctx.shavite, hash3, in3, size ); shavite512_full( &ctx.shavite, hash3, in3, size );
#endif
break; break;
case SIMD: case SIMD:
intrlv_2x128( vhash, in0, in1, size<<3 ); intrlv_2x128( vhash, in0, in1, size<<3 );
@@ -901,6 +925,14 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
dintrlv_2x128_512( hash2, hash3, vhash ); dintrlv_2x128_512( hash2, hash3, vhash );
break; break;
case ECHO: case ECHO:
#if defined(__VAES__)
intrlv_2x128( vhash, in0, in1, size<<3 );
echo_2way_full( &ctx.echo, vhash, 512, vhash, size );
dintrlv_2x128_512( hash0, hash1, vhash );
intrlv_2x128( vhash, in2, in3, size<<3 );
echo_2way_full( &ctx.echo, vhash, 512, vhash, size );
dintrlv_2x128_512( hash2, hash3, vhash );
#else
echo_full( &ctx.echo, (BitSequence *)hash0, 512, echo_full( &ctx.echo, (BitSequence *)hash0, 512,
(const BitSequence *)in0, size ); (const BitSequence *)in0, size );
echo_full( &ctx.echo, (BitSequence *)hash1, 512, echo_full( &ctx.echo, (BitSequence *)hash1, 512,
@@ -909,6 +941,7 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid )
(const BitSequence *)in2, size ); (const BitSequence *)in2, size );
echo_full( &ctx.echo, (BitSequence *)hash3, 512, echo_full( &ctx.echo, (BitSequence *)hash3, 512,
(const BitSequence *)in3, size ); (const BitSequence *)in3, size );
#endif
break; break;
case HAMSI: case HAMSI:
if ( i == 0 ) if ( i == 0 )

View File

@@ -1124,7 +1124,13 @@ union _sonoa_4way_context_overlay
{ {
blake512_4way_context blake; blake512_4way_context blake;
bmw512_4way_context bmw; bmw512_4way_context bmw;
#if defined(__VAES__)
groestl512_2way_context groestl;
echo512_2way_context echo;
#else
hashState_groestl groestl; hashState_groestl groestl;
hashState_echo echo;
#endif
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
@@ -1132,7 +1138,6 @@ union _sonoa_4way_context_overlay
cube_2way_context cube; cube_2way_context cube;
shavite512_2way_context shavite; shavite512_2way_context shavite;
simd_2way_context simd; simd_2way_context simd;
hashState_echo echo;
hamsi512_4way_context hamsi; hamsi512_4way_context hamsi;
hashState_fugue fugue; hashState_fugue fugue;
shabal512_4way_context shabal; shabal512_4way_context shabal;
@@ -1162,6 +1167,17 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash ); bmw512_4way_close( &ctx.bmw, vhash );
#if defined(__VAES__)
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 );
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -1171,6 +1187,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
jh512_4way_init( &ctx.jh ); jh512_4way_init( &ctx.jh );
@@ -1195,6 +1213,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
#if defined(__VAES__)
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_2x128_512( hash0, hash1, vhashA ); dintrlv_2x128_512( hash0, hash1, vhashA );
dintrlv_2x128_512( hash2, hash3, vhashB ); dintrlv_2x128_512( hash2, hash3, vhashB );
@@ -1206,16 +1233,29 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
(const BitSequence *)hash2, 64 ); (const BitSequence *)hash2, 64 );
echo_full( &ctx.echo, (BitSequence *)hash3, 512, echo_full( &ctx.echo, (BitSequence *)hash3, 512,
(const BitSequence *)hash3, 64 ); (const BitSequence *)hash3, 64 );
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
#endif
if ( work_restart[thr_id].restart ) return 0; if ( work_restart[thr_id].restart ) return 0;
// 2 // 2
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
bmw512_4way_init( &ctx.bmw ); bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash ); bmw512_4way_close( &ctx.bmw, vhash );
#if defined(__VAES__)
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 );
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -1225,6 +1265,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
jh512_4way_init( &ctx.jh ); jh512_4way_init( &ctx.jh );
@@ -1249,6 +1291,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
#if defined(__VAES__)
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_2x128_512( hash0, hash1, vhashA ); dintrlv_2x128_512( hash0, hash1, vhashA );
dintrlv_2x128_512( hash2, hash3, vhashB ); dintrlv_2x128_512( hash2, hash3, vhashB );
@@ -1263,6 +1314,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
#endif
hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash ); hamsi512_4way_close( &ctx.hamsi, vhash );
@@ -1274,6 +1327,17 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash ); bmw512_4way_close( &ctx.bmw, vhash );
#if defined(__VAES__)
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 );
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -1283,6 +1347,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
jh512_4way_init( &ctx.jh ); jh512_4way_init( &ctx.jh );
@@ -1307,6 +1373,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
#if defined(__VAES__)
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_2x128_512( hash0, hash1, vhashA ); dintrlv_2x128_512( hash0, hash1, vhashA );
dintrlv_2x128_512( hash2, hash3, vhashB ); dintrlv_2x128_512( hash2, hash3, vhashB );
@@ -1321,6 +1396,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
#endif
hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash ); hamsi512_4way_close( &ctx.hamsi, vhash );
@@ -1340,6 +1417,17 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash ); bmw512_4way_close( &ctx.bmw, vhash );
#if defined(__VAES__)
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 );
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -1349,6 +1437,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
jh512_4way_init( &ctx.jh ); jh512_4way_init( &ctx.jh );
@@ -1373,6 +1463,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
#if defined(__VAES__)
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_2x128_512( hash0, hash1, vhashA ); dintrlv_2x128_512( hash0, hash1, vhashA );
dintrlv_2x128_512( hash2, hash3, vhashB ); dintrlv_2x128_512( hash2, hash3, vhashB );
@@ -1387,6 +1486,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
#endif
hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash ); hamsi512_4way_close( &ctx.hamsi, vhash );
@@ -1410,6 +1511,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
hamsi512_4way_update( &ctx.hamsi, vhashB, 64 ); hamsi512_4way_update( &ctx.hamsi, vhashB, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash ); hamsi512_4way_close( &ctx.hamsi, vhash );
#if defined(__VAES__)
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
#else
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
echo_full( &ctx.echo, (BitSequence *)hash0, 512, echo_full( &ctx.echo, (BitSequence *)hash0, 512,
@@ -1424,6 +1534,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_2x128_512( vhashA, hash0, hash1 ); intrlv_2x128_512( vhashA, hash0, hash1 );
intrlv_2x128_512( vhashB, hash2, hash3 ); intrlv_2x128_512( vhashB, hash2, hash3 );
#endif
shavite512_2way_init( &ctx.shavite ); shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_init( &ctx.shavite ); shavite512_2way_init( &ctx.shavite );
@@ -1443,6 +1555,20 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
shabal512_4way_update( &ctx.shabal, vhashB, 64 ); shabal512_4way_update( &ctx.shabal, vhashB, 64 );
shabal512_4way_close( &ctx.shabal, vhash ); shabal512_4way_close( &ctx.shabal, vhash );
#if defined(__VAES__)
// rintrlv_4x32_2x128( vhashA, vhashB, vhash, 512 );
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
intrlv_2x128_512( vhashA, hash0, hash1 );
intrlv_2x128_512( vhashB, hash2, hash3 );
groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 );
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -1452,6 +1578,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
jh512_4way_init( &ctx.jh ); jh512_4way_init( &ctx.jh );
@@ -1476,6 +1604,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
#if defined(__VAES__)
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_2x128_512( hash0, hash1, vhashA ); dintrlv_2x128_512( hash0, hash1, vhashA );
dintrlv_2x128_512( hash2, hash3, vhashB ); dintrlv_2x128_512( hash2, hash3, vhashB );
@@ -1490,6 +1627,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
#endif
hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash ); hamsi512_4way_close( &ctx.hamsi, vhash );
@@ -1523,6 +1662,17 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash ); bmw512_4way_close( &ctx.bmw, vhash );
#if defined(__VAES__)
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 );
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -1532,6 +1682,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
jh512_4way_init( &ctx.jh ); jh512_4way_init( &ctx.jh );
@@ -1556,6 +1708,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
#if defined(__VAES__)
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_2x128_512( hash0, hash1, vhashA ); dintrlv_2x128_512( hash0, hash1, vhashA );
dintrlv_2x128_512( hash2, hash3, vhashB ); dintrlv_2x128_512( hash2, hash3, vhashB );
@@ -1570,6 +1731,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
#endif
hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash ); hamsi512_4way_close( &ctx.hamsi, vhash );
@@ -1616,6 +1779,17 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash ); bmw512_4way_close( &ctx.bmw, vhash );
#if defined(__VAES__)
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 );
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -1625,6 +1799,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
jh512_4way_init( &ctx.jh ); jh512_4way_init( &ctx.jh );
@@ -1649,6 +1825,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
#if defined(__VAES__)
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_2x128_512( hash0, hash1, vhashA ); dintrlv_2x128_512( hash0, hash1, vhashA );
dintrlv_2x128_512( hash2, hash3, vhashB ); dintrlv_2x128_512( hash2, hash3, vhashB );
@@ -1663,6 +1848,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
#endif
hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash ); hamsi512_4way_close( &ctx.hamsi, vhash );

View File

@@ -12,7 +12,7 @@ bool register_sonoa_algo( algo_gate_t* gate )
init_sonoa_ctx(); init_sonoa_ctx();
gate->hash = (void*)&sonoa_hash; gate->hash = (void*)&sonoa_hash;
#endif #endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT | VAES256_OPT;
return true; return true;
}; };

View File

@@ -240,7 +240,13 @@ union _x17_4way_context_overlay
{ {
blake512_4way_context blake; blake512_4way_context blake;
bmw512_4way_context bmw; bmw512_4way_context bmw;
#if defined(__VAES__)
groestl512_2way_context groestl;
echo512_2way_context echo;
#else
hashState_groestl groestl; hashState_groestl groestl;
hashState_echo echo;
#endif
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
@@ -248,7 +254,6 @@ union _x17_4way_context_overlay
cube_2way_context cube; cube_2way_context cube;
shavite512_2way_context shavite; shavite512_2way_context shavite;
simd_2way_context simd; simd_2way_context simd;
hashState_echo echo;
hamsi512_4way_context hamsi; hamsi512_4way_context hamsi;
hashState_fugue fugue; hashState_fugue fugue;
shabal512_4way_context shabal; shabal512_4way_context shabal;
@@ -275,6 +280,17 @@ int x17_4way_hash( void *state, const void *input, int thr_id )
bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash ); bmw512_4way_close( &ctx.bmw, vhash );
#if defined(__VAES__)
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 );
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -284,6 +300,8 @@ int x17_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
jh512_4way_init( &ctx.jh ); jh512_4way_init( &ctx.jh );
@@ -308,6 +326,15 @@ int x17_4way_hash( void *state, const void *input, int thr_id )
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
#if defined(__VAES__)
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_2x128_512( hash0, hash1, vhashA ); dintrlv_2x128_512( hash0, hash1, vhashA );
dintrlv_2x128_512( hash2, hash3, vhashB ); dintrlv_2x128_512( hash2, hash3, vhashB );
@@ -322,6 +349,8 @@ int x17_4way_hash( void *state, const void *input, int thr_id )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
#endif
hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash ); hamsi512_4way_close( &ctx.hamsi, vhash );

View File

@@ -11,7 +11,7 @@ bool register_x17_algo( algo_gate_t* gate )
#else #else
gate->hash = (void*)&x17_hash; gate->hash = (void*)&x17_hash;
#endif #endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT | VAES256_OPT;
return true; return true;
}; };

View File

@@ -405,15 +405,20 @@ union _xevan_4way_context_overlay
{ {
blake512_4way_context blake; blake512_4way_context blake;
bmw512_4way_context bmw; bmw512_4way_context bmw;
hashState_groestl groestl; #if defined(__VAES__)
skein512_4way_context skein; groestl512_2way_context groestl;
echo_2way_context echo;
#else
hashState_groestl groestl;
hashState_echo echo;
#endif
skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
luffa_2way_context luffa; luffa_2way_context luffa;
cube_2way_context cube; cube_2way_context cube;
shavite512_2way_context shavite; shavite512_2way_context shavite;
simd_2way_context simd; simd_2way_context simd;
hashState_echo echo;
hamsi512_4way_context hamsi; hamsi512_4way_context hamsi;
hashState_fugue fugue; hashState_fugue fugue;
shabal512_4way_context shabal; shabal512_4way_context shabal;
@@ -442,7 +447,17 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
bmw512_4way_update( &ctx.bmw, vhash, dataLen ); bmw512_4way_update( &ctx.bmw, vhash, dataLen );
bmw512_4way_close( &ctx.bmw, vhash ); bmw512_4way_close( &ctx.bmw, vhash );
// Serial #if defined(__VAES__)
rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 );
groestl512_2way_full( &ctx.groestl, vhashA, vhashA, dataLen );
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, dataLen );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, dataLen<<3 );
#else
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, dataLen<<3 ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, dataLen<<3 );
@@ -450,9 +465,10 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, dataLen<<3 ); groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, dataLen<<3 );
groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, dataLen<<3 ); groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, dataLen<<3 );
// Parallel 4way
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, dataLen ); skein512_4way_full( &ctx.skein, vhash, vhash, dataLen );
jh512_4way_init( &ctx.jh ); jh512_4way_init( &ctx.jh );
@@ -477,6 +493,15 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
simd512_2way_full( &ctx.simd, vhashA, vhashA, dataLen ); simd512_2way_full( &ctx.simd, vhashA, vhashA, dataLen );
simd512_2way_full( &ctx.simd, vhashB, vhashB, dataLen ); simd512_2way_full( &ctx.simd, vhashB, vhashB, dataLen );
#if defined(__VAES__)
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, dataLen );
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, dataLen );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, dataLen<<3 );
#else
dintrlv_2x128( hash0, hash1, vhashA, dataLen<<3 ); dintrlv_2x128( hash0, hash1, vhashA, dataLen<<3 );
dintrlv_2x128( hash2, hash3, vhashB, dataLen<<3 ); dintrlv_2x128( hash2, hash3, vhashB, dataLen<<3 );
@@ -489,9 +514,10 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
echo_full( &ctx.echo, (BitSequence *)hash3, 512, echo_full( &ctx.echo, (BitSequence *)hash3, 512,
(const BitSequence *)hash3, dataLen ); (const BitSequence *)hash3, dataLen );
// Parallel
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
#endif
hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, dataLen ); hamsi512_4way_update( &ctx.hamsi, vhash, dataLen );
hamsi512_4way_close( &ctx.hamsi, vhash ); hamsi512_4way_close( &ctx.hamsi, vhash );
@@ -542,6 +568,17 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
bmw512_4way_update( &ctx.bmw, vhash, dataLen ); bmw512_4way_update( &ctx.bmw, vhash, dataLen );
bmw512_4way_close( &ctx.bmw, vhash ); bmw512_4way_close( &ctx.bmw, vhash );
#if defined(__VAES__)
rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 );
groestl512_2way_full( &ctx.groestl, vhashA, vhashA, dataLen );
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, dataLen );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, dataLen<<3 );
#else
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, dataLen<<3 ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, dataLen<<3 );
@@ -551,6 +588,8 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, dataLen ); skein512_4way_full( &ctx.skein, vhash, vhash, dataLen );
jh512_4way_init( &ctx.jh ); jh512_4way_init( &ctx.jh );
@@ -575,6 +614,15 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
simd512_2way_full( &ctx.simd, vhashA, vhashA, dataLen ); simd512_2way_full( &ctx.simd, vhashA, vhashA, dataLen );
simd512_2way_full( &ctx.simd, vhashB, vhashB, dataLen ); simd512_2way_full( &ctx.simd, vhashB, vhashB, dataLen );
#if defined(__VAES__)
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, dataLen );
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, dataLen );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, dataLen<<3 );
#else
dintrlv_2x128( hash0, hash1, vhashA, dataLen<<3 ); dintrlv_2x128( hash0, hash1, vhashA, dataLen<<3 );
dintrlv_2x128( hash2, hash3, vhashB, dataLen<<3 ); dintrlv_2x128( hash2, hash3, vhashB, dataLen<<3 );
@@ -589,6 +637,8 @@ int xevan_4way_hash( void *output, const void *input, int thr_id )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
#endif
hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way_update( &ctx.hamsi, vhash, dataLen ); hamsi512_4way_update( &ctx.hamsi, vhash, dataLen );
hamsi512_4way_close( &ctx.hamsi, vhash ); hamsi512_4way_close( &ctx.hamsi, vhash );

View File

@@ -12,7 +12,7 @@ bool register_xevan_algo( algo_gate_t* gate )
init_xevan_ctx(); init_xevan_ctx();
gate->hash = (void*)&xevan_hash; gate->hash = (void*)&xevan_hash;
#endif #endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT | VAES256_OPT;
opt_target_factor = 256.0; opt_target_factor = 256.0;
return true; return true;
}; };

View File

@@ -11,7 +11,7 @@
#include "algo/shavite/shavite-hash-2way.h" #include "algo/shavite/shavite-hash-2way.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h" #include "algo/simd/simd-hash-2way.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/shavite-hash-2way.h"
#include "algo/hamsi/hamsi-hash-4way.h" #include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/fugue/fugue-aesni.h" #include "algo/fugue/fugue-aesni.h"
#include "algo/shabal/shabal-hash-4way.h" #include "algo/shabal/shabal-hash-4way.h"
@@ -494,14 +494,19 @@ union _x22i_4way_ctx_overlay
{ {
blake512_4way_context blake; blake512_4way_context blake;
bmw512_4way_context bmw; bmw512_4way_context bmw;
#if defined(__VAES__)
groestl512_2way_context groestl;
echo_2way_context echo;
#else
hashState_groestl groestl; hashState_groestl groestl;
hashState_echo echo; hashState_echo echo;
#endif
shavite512_2way_context shavite;
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
luffa_2way_context luffa; luffa_2way_context luffa;
cube_2way_context cube; cube_2way_context cube;
shavite512_2way_context shavite;
simd_2way_context simd; simd_2way_context simd;
hamsi512_4way_context hamsi; hamsi512_4way_context hamsi;
hashState_fugue fugue; hashState_fugue fugue;
@@ -535,14 +540,28 @@ int x22i_4way_hash( void *output, const void *input, int thrid )
bmw512_4way_init( &ctx.bmw ); bmw512_4way_init( &ctx.bmw );
bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash ); bmw512_4way_close( &ctx.bmw, vhash );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
groestl512_full( &ctx.groestl, (char*)hash0, (const char*)hash0, 512 ); #if defined(__VAES__)
groestl512_full( &ctx.groestl, (char*)hash1, (const char*)hash1, 512 );
groestl512_full( &ctx.groestl, (char*)hash2, (const char*)hash2, 512 );
groestl512_full( &ctx.groestl, (char*)hash3, (const char*)hash3, 512 );
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 );
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
#endif
skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
@@ -570,6 +589,15 @@ int x22i_4way_hash( void *output, const void *input, int thrid )
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
#if defined(__VAES__)
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_2x128_512( hash0, hash1, vhashA ); dintrlv_2x128_512( hash0, hash1, vhashA );
dintrlv_2x128_512( hash2, hash3, vhashB ); dintrlv_2x128_512( hash2, hash3, vhashB );
@@ -584,6 +612,8 @@ int x22i_4way_hash( void *output, const void *input, int thrid )
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
#endif
if ( work_restart[thrid].restart ) return false; if ( work_restart[thrid].restart ) return false;
hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_init( &ctx.hamsi );

View File

@@ -20,7 +20,7 @@ bool register_x22i_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_x22i; gate->scanhash = (void*)&scanhash_x22i;
gate->hash = (void*)&x22i_hash; gate->hash = (void*)&x22i_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT
| AVX512_OPT | VAES_OPT; | AVX512_OPT | VAES_OPT | VAES256_OPT;
#endif #endif
return true; return true;
}; };
@@ -30,20 +30,15 @@ bool register_x25x_algo( algo_gate_t* gate )
#if defined (X25X_8WAY) #if defined (X25X_8WAY)
gate->scanhash = (void*)&scanhash_x25x_8way; gate->scanhash = (void*)&scanhash_x25x_8way;
gate->hash = (void*)&x25x_8way_hash; gate->hash = (void*)&x25x_8way_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT
| AVX512_OPT | VAES_OPT;
#elif defined (X25X_4WAY) #elif defined (X25X_4WAY)
gate->scanhash = (void*)&scanhash_x25x_4way; gate->scanhash = (void*)&scanhash_x25x_4way;
gate->hash = (void*)&x25x_4way_hash; gate->hash = (void*)&x25x_4way_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT
| AVX512_OPT | VAES_OPT;
#else #else
gate->scanhash = (void*)&scanhash_x25x; gate->scanhash = (void*)&scanhash_x25x;
gate->hash = (void*)&x25x_hash; gate->hash = (void*)&x25x_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT
| AVX512_OPT | VAES_OPT;
#endif #endif
// gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT; gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT |
VAES_OPT | VAES256_OPT;
return true; return true;
}; };

View File

@@ -15,6 +15,7 @@
#include "algo/cubehash/cubehash_sse2.h" #include "algo/cubehash/cubehash_sse2.h"
#include "algo/luffa/luffa-hash-2way.h" #include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h" #include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/nist.h" #include "algo/simd/nist.h"
#include "algo/simd/simd-hash-2way.h" #include "algo/simd/simd-hash-2way.h"
@@ -412,9 +413,9 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
LYRA2X_2WAY( vhash, 32, vhash, 32, 1, 4, 4 ); LYRA2X_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash6[19], hash7[19], vhash, 256 ); dintrlv_2x256( hash6[19], hash7[19], vhash, 256 );
sph_gost512_init(&ctx.gost); sph_gost512_init(&ctx.gost);
sph_gost512 (&ctx.gost, (const void*) hash0[19], 64); sph_gost512 (&ctx.gost, (const void*) hash0[19], 64);
sph_gost512_close(&ctx.gost, (void*) hash0[20]); sph_gost512_close(&ctx.gost, (void*) hash0[20]);
sph_gost512_init(&ctx.gost); sph_gost512_init(&ctx.gost);
sph_gost512 (&ctx.gost, (const void*) hash1[19], 64); sph_gost512 (&ctx.gost, (const void*) hash1[19], 64);
sph_gost512_close(&ctx.gost, (void*) hash1[20]); sph_gost512_close(&ctx.gost, (void*) hash1[20]);
@@ -574,68 +575,26 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
return 0; return 0;
} }
/*
int scanhash_x25x_8way( struct work* work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
__m512i *noncev = (__m512i*)vdata + 9; // aligned
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 4;
const int thr_id = mythr->id;
const uint32_t Htarg = ptarget[7];
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x08ff;
InitializeSWIFFTX();
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x25x_8way_hash( hash, vdata );
for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg )
{
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 8;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce;
return 0;
}
*/
#elif defined(X25X_4WAY) #elif defined(X25X_4WAY)
union _x25x_4way_ctx_overlay union _x25x_4way_ctx_overlay
{ {
blake512_4way_context blake; blake512_4way_context blake;
bmw512_4way_context bmw; bmw512_4way_context bmw;
#if defined(__VAES__)
groestl512_2way_context groestl;
echo_2way_context echo;
#else
hashState_groestl groestl; hashState_groestl groestl;
hashState_echo echo; hashState_echo echo;
#endif
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
hashState_luffa luffa; luffa_2way_context luffa;
cubehashParam cube; cube_2way_context cube;
sph_shavite512_context shavite; shavite512_2way_context shavite;
hashState_sd simd; simd_2way_context simd;
hamsi512_4way_context hamsi; hamsi512_4way_context hamsi;
hashState_fugue fugue; hashState_fugue fugue;
shabal512_4way_context shabal; shabal512_4way_context shabal;
@@ -658,6 +617,8 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
unsigned char hash2[25][64] __attribute__((aligned(64))) = {0}; unsigned char hash2[25][64] __attribute__((aligned(64))) = {0};
unsigned char hash3[25][64] __attribute__((aligned(64))) = {0}; unsigned char hash3[25][64] __attribute__((aligned(64))) = {0};
unsigned char vhashX[24][64*4] __attribute__ ((aligned (64))); unsigned char vhashX[24][64*4] __attribute__ ((aligned (64)));
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
x25x_4way_ctx_overlay ctx __attribute__ ((aligned (64))); x25x_4way_ctx_overlay ctx __attribute__ ((aligned (64)));
blake512_4way_full( &ctx.blake, vhash, input, 80 ); blake512_4way_full( &ctx.blake, vhash, input, 80 );
@@ -668,11 +629,25 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
bmw512_4way_close( &ctx.bmw, vhash ); bmw512_4way_close( &ctx.bmw, vhash );
dintrlv_4x64_512( hash0[1], hash1[1], hash2[1], hash3[1], vhash ); dintrlv_4x64_512( hash0[1], hash1[1], hash2[1], hash3[1], vhash );
#if defined(__VAES__)
rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 );
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 );
dintrlv_2x128_512( hash0[2], hash1[2], vhashA );
dintrlv_2x128_512( hash2[2], hash3[2], vhashB );
#else
groestl512_full( &ctx.groestl, (char*)hash0[2], (const char*)hash0[1], 512 ); groestl512_full( &ctx.groestl, (char*)hash0[2], (const char*)hash0[1], 512 );
groestl512_full( &ctx.groestl, (char*)hash1[2], (const char*)hash1[1], 512 ); groestl512_full( &ctx.groestl, (char*)hash1[2], (const char*)hash1[1], 512 );
groestl512_full( &ctx.groestl, (char*)hash2[2], (const char*)hash2[1], 512 ); groestl512_full( &ctx.groestl, (char*)hash2[2], (const char*)hash2[1], 512 );
groestl512_full( &ctx.groestl, (char*)hash3[2], (const char*)hash3[1], 512 ); groestl512_full( &ctx.groestl, (char*)hash3[2], (const char*)hash3[1], 512 );
#endif
intrlv_4x64_512( vhash, hash0[2], hash1[2], hash2[2], hash3[2] ); intrlv_4x64_512( vhash, hash0[2], hash1[2], hash2[2], hash3[2] );
skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
dintrlv_4x64_512( hash0[3], hash1[3], hash2[3], hash3[3], vhash ); dintrlv_4x64_512( hash0[3], hash1[3], hash2[3], hash3[3], vhash );
@@ -689,41 +664,38 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
keccak512_4way_close( &ctx.keccak, vhash ); keccak512_4way_close( &ctx.keccak, vhash );
dintrlv_4x64_512( hash0[5], hash1[5], hash2[5], hash3[5], vhash ); dintrlv_4x64_512( hash0[5], hash1[5], hash2[5], hash3[5], vhash );
luffa_full( &ctx.luffa, (BitSequence*)hash0[6], 512, rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
(const BitSequence*)hash0[5], 64 );
luffa_full( &ctx.luffa, (BitSequence*)hash1[6], 512,
(const BitSequence*)hash1[5], 64 );
luffa_full( &ctx.luffa, (BitSequence*)hash2[6], 512,
(const BitSequence*)hash2[5], 64 );
luffa_full( &ctx.luffa, (BitSequence*)hash3[6], 512,
(const BitSequence*)hash3[5], 64 );
cubehash_full( &ctx.cube, (byte*)hash0[7], 512, (const byte*)hash0[6], 64 ); luffa512_2way_full( &ctx.luffa, vhashA, vhashA, 64 );
cubehash_full( &ctx.cube, (byte*)hash1[7], 512, (const byte*)hash1[6], 64 ); luffa512_2way_full( &ctx.luffa, vhashB, vhashB, 64 );
cubehash_full( &ctx.cube, (byte*)hash2[7], 512, (const byte*)hash2[6], 64 ); dintrlv_2x128_512( hash0[6], hash1[6], vhashA );
cubehash_full( &ctx.cube, (byte*)hash3[7], 512, (const byte*)hash3[6], 64 ); dintrlv_2x128_512( hash2[6], hash3[6], vhashB );
sph_shavite512_init(&ctx.shavite); cube_2way_full( &ctx.cube, vhashA, 512, vhashA, 64 );
sph_shavite512(&ctx.shavite, (const void*) hash0[7], 64); cube_2way_full( &ctx.cube, vhashB, 512, vhashB, 64 );
sph_shavite512_close(&ctx.shavite, hash0[8]); dintrlv_2x128_512( hash0[7], hash1[7], vhashA );
sph_shavite512_init(&ctx.shavite); dintrlv_2x128_512( hash2[7], hash3[7], vhashB );
sph_shavite512(&ctx.shavite, (const void*) hash1[7], 64);
sph_shavite512_close(&ctx.shavite, hash1[8]);
sph_shavite512_init(&ctx.shavite);
sph_shavite512(&ctx.shavite, (const void*) hash2[7], 64);
sph_shavite512_close(&ctx.shavite, hash2[8]);
sph_shavite512_init(&ctx.shavite);
sph_shavite512(&ctx.shavite, (const void*) hash3[7], 64);
sph_shavite512_close(&ctx.shavite, hash3[8]);
simd_full( &ctx.simd, (BitSequence*)hash0[9], shavite512_2way_full( &ctx.shavite, vhashA, vhashA, 64 );
(const BitSequence*)hash0[8], 512 ); shavite512_2way_full( &ctx.shavite, vhashB, vhashB, 64 );
simd_full( &ctx.simd, (BitSequence*)hash1[9], dintrlv_2x128_512( hash0[8], hash1[8], vhashA );
(const BitSequence*)hash1[8], 512 ); dintrlv_2x128_512( hash2[8], hash3[8], vhashB );
simd_full( &ctx.simd, (BitSequence*)hash2[9],
(const BitSequence*)hash2[8], 512 ); simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
simd_full( &ctx.simd, (BitSequence*)hash3[9], simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
(const BitSequence*)hash3[8], 512 ); dintrlv_2x128_512( hash0[9], hash1[9], vhashA );
dintrlv_2x128_512( hash2[9], hash3[9], vhashB );
#if defined(__VAES__)
echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
dintrlv_2x128_512( hash0[10], hash1[10], vhashA );
dintrlv_2x128_512( hash2[10], hash3[10], vhashB );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
#else
echo_full( &ctx.echo, (BitSequence *)hash0[10], 512, echo_full( &ctx.echo, (BitSequence *)hash0[10], 512,
(const BitSequence *)hash0[ 9], 64 ); (const BitSequence *)hash0[ 9], 64 );
@@ -736,6 +708,8 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
intrlv_4x64_512( vhash, hash0[10], hash1[10], hash2[10], hash3[10] ); intrlv_4x64_512( vhash, hash0[10], hash1[10], hash2[10], hash3[10] );
#endif
if ( work_restart[thrid].restart ) return 0; if ( work_restart[thrid].restart ) return 0;
hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_init( &ctx.hamsi );

View File

@@ -4,8 +4,9 @@
# during develpment. However the information contained may provide compilation # during develpment. However the information contained may provide compilation
# tips to users. # tips to users.
rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 > /dev/null rm cpuminer-avx512-sha-vaes cpuminer-avx512-sha cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 > /dev/null
# Icelake AVX512 SHA VAES
make distclean || echo clean make distclean || echo clean
rm -f config.status rm -f config.status
./autogen.sh || echo done ./autogen.sh || echo done
@@ -16,6 +17,20 @@ mv cpuminer.exe cpuminer-avx512-sha-vaes.exe
strip -s cpuminer strip -s cpuminer
mv cpuminer cpuminer-avx512-sha-vaes mv cpuminer cpuminer-avx512-sha-vaes
# Rocketlake AVX512 AES SHA
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=skylake-avx512 -msha -Wall -fno-common" ./configure --with-curl
# CFLAGS="-O3 -march=rocketlake -Wall -fno-common" ./configure --with-curl
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-avx512-sha.exe
strip -s cpuminer
mv cpuminer cpuminer-avx512-sha
# Slylake-X AVX512 AES
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=skylake-avx512 -Wall -fno-common" ./configure --with-curl CFLAGS="-O3 -march=skylake-avx512 -Wall -fno-common" ./configure --with-curl
make -j 8 make -j 8
strip -s cpuminer.exe strip -s cpuminer.exe
@@ -23,6 +38,7 @@ mv cpuminer.exe cpuminer-avx512.exe
strip -s cpuminer strip -s cpuminer
mv cpuminer cpuminer-avx512 mv cpuminer cpuminer-avx512
# Haswell AVX2 AES
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
# GCC 9 doesn't include AES with core-avx2 # GCC 9 doesn't include AES with core-avx2
@@ -33,6 +49,7 @@ mv cpuminer.exe cpuminer-avx2.exe
strip -s cpuminer strip -s cpuminer
mv cpuminer cpuminer-avx2 mv cpuminer cpuminer-avx2
# Sandybridge AVX AES
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
CFLAGS="-O3 -march=corei7-avx -maes -Wall -fno-common" ./configure --with-curl CFLAGS="-O3 -march=corei7-avx -maes -Wall -fno-common" ./configure --with-curl
@@ -42,15 +59,17 @@ mv cpuminer.exe cpuminer-avx.exe
strip -s cpuminer strip -s cpuminer
mv cpuminer cpuminer-avx mv cpuminer cpuminer-avx
# Westmere SSE4.2 AES
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
CFLAGS="-O3 -maes -msse4.2 -Wall -fno-common" ./configure --with-curl CFLAGS="-O3 -march=westmere -Wall -fno-common" ./configure --with-curl
make -j 8 make -j 8
strip -s cpuminer.exe strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-sse42.exe mv cpuminer.exe cpuminer-aes-sse42.exe
strip -s cpuminer strip -s cpuminer
mv cpuminer cpuminer-aes-sse42 mv cpuminer cpuminer-aes-sse42
# Nehalem SSE4.2
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
CFLAGS="-O3 -march=corei7 -Wall -fno-common" ./configure --with-curl CFLAGS="-O3 -march=corei7 -Wall -fno-common" ./configure --with-curl
@@ -60,6 +79,7 @@ mv cpuminer.exe cpuminer-sse42.exe
strip -s cpuminer strip -s cpuminer
mv cpuminer cpuminer-sse42 mv cpuminer cpuminer-sse42
# Core2 SSSE3
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
CFLAGS="-O3 -march=core2 -Wall -fno-common" ./configure --with-curl CFLAGS="-O3 -march=core2 -Wall -fno-common" ./configure --with-curl
@@ -69,6 +89,7 @@ mv cpuminer.exe cpuminer-ssse3.exe
strip -s cpuminer strip -s cpuminer
mv cpuminer cpuminer-ssse3 mv cpuminer cpuminer-ssse3
# Generic SSE2
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
CFLAGS="-O3 -msse2 -Wall -fno-common" ./configure --with-curl CFLAGS="-O3 -msse2 -Wall -fno-common" ./configure --with-curl
@@ -78,6 +99,7 @@ mv cpuminer.exe cpuminer-sse2.exe
strip -s cpuminer strip -s cpuminer
mv cpuminer cpuminer-sse2 mv cpuminer cpuminer-sse2
# Zen1 AVX2 SHA
make clean || echo done make clean || echo done
rm -f config.status rm -f config.status
CFLAGS="-O3 -march=znver1 -Wall -fno-common" ./configure --with-curl CFLAGS="-O3 -march=znver1 -Wall -fno-common" ./configure --with-curl
@@ -87,6 +109,7 @@ mv cpuminer.exe cpuminer-zen.exe
strip -s cpuminer strip -s cpuminer
mv cpuminer cpuminer-zen mv cpuminer cpuminer-zen
# Zen3 AVX2 SHA VAES
make clean || echo done make clean || echo done
rm -f config.status rm -f config.status
CFLAGS="-O3 -march=znver2 -mvaes -Wall -fno-common" ./configure --with-curl CFLAGS="-O3 -march=znver2 -mvaes -Wall -fno-common" ./configure --with-curl
@@ -97,6 +120,7 @@ mv cpuminer.exe cpuminer-zen3.exe
strip -s cpuminer strip -s cpuminer
mv cpuminer cpuminer-zen3 mv cpuminer cpuminer-zen3
# Native to current CPU
make clean || echo done make clean || echo done
rm -f config.status rm -f config.status
CFLAGS="-O3 -march=native -Wall -fno-common" ./configure --with-curl CFLAGS="-O3 -march=native -Wall -fno-common" ./configure --with-curl

View File

@@ -1,27 +0,0 @@
#!/bin/bash
#if [ "$OS" = "Windows_NT" ]; then
# ./mingw64.sh
# exit 0
#fi
# Linux build
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
# Ubuntu 10.04 (gcc 4.4)
# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
CFLAGS="-O3 -march=haswell -maes -Wall" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
make -j 4
strip -s cpuminer

View File

@@ -1,27 +0,0 @@
#!/bin/bash
#if [ "$OS" = "Windows_NT" ]; then
# ./mingw64.sh
# exit 0
#fi
# Linux build
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
# Ubuntu 10.04 (gcc 4.4)
# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
CFLAGS="-O3 -march=native -Wall -fno-common" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
make -j 4
strip -s cpuminer

View File

@@ -12,15 +12,8 @@ make distclean || echo clean
rm -f config.status rm -f config.status
./autogen.sh || echo done ./autogen.sh || echo done
# Ubuntu 10.04 (gcc 4.4)
# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr #CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
make -j 4 make -j 4

View File

@@ -1,27 +0,0 @@
#!/bin/bash
#if [ "$OS" = "Windows_NT" ]; then
# ./mingw64.sh
# exit 0
#fi
# Linux build
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
# Ubuntu 10.04 (gcc 4.4)
# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
CFLAGS="-O3 -march=corei7-avx -msha -Wall" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
make -j 4
strip -s cpuminer

View File

@@ -2,8 +2,8 @@
# #
# make clean and rm all the targetted executables. # make clean and rm all the targetted executables.
rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-zen cpuminer-sse42 cpuminer-ssse3 cpuminer-zen3 > /dev/null rm cpuminer-avx512-sha-vaes cpuminer-avx512-sha cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-zen cpuminer-sse42 cpuminer-ssse3 cpuminer-zen3 > /dev/null
rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-zen.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-zen3.exe > /dev/null rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-zen.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-zen3.exe > /dev/null
make distclean > /dev/null make distclean > /dev/null

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh #! /bin/sh
# Guess values for system-dependent variables and create Makefiles. # Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.15.1. # Generated by GNU Autoconf 2.69 for cpuminer-opt 3.15.2.
# #
# #
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package. # Identity of this package.
PACKAGE_NAME='cpuminer-opt' PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.15.1' PACKAGE_VERSION='3.15.2'
PACKAGE_STRING='cpuminer-opt 3.15.1' PACKAGE_STRING='cpuminer-opt 3.15.2'
PACKAGE_BUGREPORT='' PACKAGE_BUGREPORT=''
PACKAGE_URL='' PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing. # Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh. # This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF cat <<_ACEOF
\`configure' configures cpuminer-opt 3.15.1 to adapt to many kinds of systems. \`configure' configures cpuminer-opt 3.15.2 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]... Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then if test -n "$ac_init_help"; then
case $ac_init_help in case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.15.1:";; short | recursive ) echo "Configuration of cpuminer-opt 3.15.2:";;
esac esac
cat <<\_ACEOF cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then if $ac_init_version; then
cat <<\_ACEOF cat <<\_ACEOF
cpuminer-opt configure 3.15.1 cpuminer-opt configure 3.15.2
generated by GNU Autoconf 2.69 generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc. Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake. running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.15.1, which was It was created by cpuminer-opt $as_me 3.15.2, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@ $ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package. # Define the identity of the package.
PACKAGE='cpuminer-opt' PACKAGE='cpuminer-opt'
VERSION='3.15.1' VERSION='3.15.2'
cat >>confdefs.h <<_ACEOF cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their # report actual input values of CONFIG_FILES etc. instead of their
# values after options handling. # values after options handling.
ac_log=" ac_log="
This file was extended by cpuminer-opt $as_me 3.15.1, which was This file was extended by cpuminer-opt $as_me 3.15.2, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\ ac_cs_version="\\
cpuminer-opt config.status 3.15.1 cpuminer-opt config.status 3.15.2
configured by $0, generated by GNU Autoconf 2.69, configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\" with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.15.1]) AC_INIT([cpuminer-opt], [3.15.2])
AC_PREREQ([2.59c]) AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM AC_CANONICAL_SYSTEM

View File

@@ -3383,13 +3383,14 @@ bool check_cpu_capability ()
bool sw_has_sha = false; bool sw_has_sha = false;
bool sw_has_vaes = false; bool sw_has_vaes = false;
set_t algo_features = algo_gate.optimizations; set_t algo_features = algo_gate.optimizations;
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features ); bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
bool algo_has_aes = set_incl( AES_OPT, algo_features ); bool algo_has_aes = set_incl( AES_OPT, algo_features );
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features ); bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features ); bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features ); bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
bool algo_has_sha = set_incl( SHA_OPT, algo_features ); bool algo_has_sha = set_incl( SHA_OPT, algo_features );
bool algo_has_vaes = set_incl( VAES_OPT, algo_features ); bool algo_has_vaes = set_incl( VAES_OPT, algo_features );
bool algo_has_vaes256 = set_incl( VAES256_OPT, algo_features );
bool use_aes; bool use_aes;
bool use_sse2; bool use_sse2;
bool use_sse42; bool use_sse42;
@@ -3510,7 +3511,8 @@ bool check_cpu_capability ()
use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2; use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2;
use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512; use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
use_sha = cpu_has_sha && sw_has_sha && algo_has_sha; use_sha = cpu_has_sha && sw_has_sha && algo_has_sha;
use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes && use_avx512; use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes
&& ( use_avx512 || algo_has_vaes256 );
use_none = !( use_sse2 || use_aes || use_sse42 || use_avx512 || use_avx2 || use_none = !( use_sse2 || use_aes || use_sse42 || use_avx512 || use_avx2 ||
use_sha || use_vaes ); use_sha || use_vaes );

View File

@@ -143,8 +143,8 @@ do { \
// Parallel AES, for when x is expected to be in a 256 bit register. // Parallel AES, for when x is expected to be in a 256 bit register.
// Use same 128 bit key. // Use same 128 bit key.
//#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #if defined(__VAES__)
#if 0
#define mm256_aesenc_2x128( x, k ) \ #define mm256_aesenc_2x128( x, k ) \
_mm256_aesenc_epi128( x, k ) _mm256_aesenc_epi128( x, k )

View File

@@ -483,11 +483,13 @@ static inline bool has_avx512()
// AMD Zen3 added support for 256 bit VAES without requiring AVX512. // AMD Zen3 added support for 256 bit VAES without requiring AVX512.
// The original Intel spec requires AVX512F to support 512 bit VAES and // The original Intel spec requires AVX512F to support 512 bit VAES and
// requires AVX512VL to support 256 bit VAES. // requires AVX512VL to support 256 bit VAES.
// cpuminer-opt only uses VAES512, simply testing the VAES bit is sufficient. // The CPUID VAES bit alone can't distinguish 256 vs 512 bit.
// However, proper detection of VAES512 and VAES256 requires more work: // If necessary:
// VAES512 = VAES && AVX512F (may not support VAES256) // VAES 256 & 512 = VAES && AVX512VL
// VAES256 = AVX512VL ? VAES : ( AVX && VAES ) (may not support VAES512) // VAES 512 = VAES && AVX512F
// VAES = VAES && AVX512F && AVX512VL (supports both) // VAES 256 = ( VAES && AVX512VL ) || ( VAES && !AVX512F )
// VAES 512 only = VAES && AVX512F && !AVX512VL
// VAES 256 only = VAES && !AVX512F
static inline bool has_vaes() static inline bool has_vaes()
{ {

View File

@@ -40,6 +40,7 @@ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
# Start building... # Start building...
# Icelake AVX512 SHA VAES
./clean-all.sh || echo clean ./clean-all.sh || echo clean
rm -f config.status rm -f config.status
./autogen.sh || echo done ./autogen.sh || echo done
@@ -48,6 +49,7 @@ make -j 8
strip -s cpuminer.exe strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe
# Zen1 AVX2 SHA
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS
@@ -55,6 +57,16 @@ make -j 8
strip -s cpuminer.exe strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-zen.exe mv cpuminer.exe release/cpuminer-zen.exe
# Zen3 AVX2 SHA VAES
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -march=znver2 -mvaes -Wall" ./configure $CONFIGURE_ARGS
# CFLAGS="-O3 -march=znver3 -Wall" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-zen3.exe
# Skylake-X AVX512 AES
# mingw won't compile avx512 without -fno-asynchronous-unwind-tables # mingw won't compile avx512 without -fno-asynchronous-unwind-tables
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
@@ -64,6 +76,7 @@ make -j 8
strip -s cpuminer.exe strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx512.exe mv cpuminer.exe release/cpuminer-avx512.exe
# Haswell AVX2 AES
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
# GCC 9 doesn't include AES in -march=core-avx2 # GCC 9 doesn't include AES in -march=core-avx2
@@ -72,6 +85,7 @@ make -j 8
strip -s cpuminer.exe strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx2.exe mv cpuminer.exe release/cpuminer-avx2.exe
# Sandybridge AVX AES
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
# -march=corei7-avx still includes aes, but just in case # -march=corei7-avx still includes aes, but just in case
@@ -80,6 +94,7 @@ make -j 8
strip -s cpuminer.exe strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-avx.exe mv cpuminer.exe release/cpuminer-avx.exe
# Westmere SSE4.2 AES
# -march=westmere is supported in gcc5 # -march=westmere is supported in gcc5
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
@@ -104,6 +119,7 @@ mv cpuminer.exe release/cpuminer-aes-sse42.exe
#mv cpuminer.exe release/cpuminer-ssse3.exe #mv cpuminer.exe release/cpuminer-ssse3.exe
#make clean || echo clean #make clean || echo clean
# Generic SSE2
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
CFLAGS="-O3 -msse2 -Wall" ./configure $CONFIGURE_ARGS CFLAGS="-O3 -msse2 -Wall" ./configure $CONFIGURE_ARGS