diff --git a/README.txt b/README.txt
index edaff7d..5d50a87 100644
--- a/README.txt
+++ b/README.txt
@@ -1,6 +1,10 @@
 This file is included in the Windows binary package. Compile instructions
 for Linux and Windows can be found in RELEASE_NOTES.
 
+This package is officially available only from:
+   https://github.com/JayDDee/cpuminer-opt
+No other sources should be trusted.
+
 cpuminer is a console program that is executed from a DOS or PowerShell
 prompt. There is no GUI and no mouse support.
 
@@ -31,20 +35,22 @@ https://en.wikipedia.org/wiki/List_of_Intel_CPU_microarchitectures
 https://en.wikipedia.org/wiki/List_of_AMD_CPU_microarchitectures
 
-Exe file name                 Compile flags              Arch name
+Exe file name                 Compile flags              Arch name
 
 cpuminer-sse2.exe             "-msse2"                   Core2, Nehalem
-cpuminer-aes-sse42.exe        "-march=westmere"          Westmere
+cpuminer-aes-sse42.exe        "-march=westmere"          Westmere
 cpuminer-avx.exe              "-march=corei7-avx"        Sandybridge, Ivybridge
-cpuminer-avx2.exe             "-march=core-avx2 -maes"   Haswell*
+cpuminer-avx2.exe             "-march=core-avx2 -maes"   Haswell(1)
 cpuminer-avx512.exe           "-march=skylake-avx512"    Skylake-X, Cascadelake-X
-cpuminer-zen.exe              "-march=znver1"            AMD Ryzen, Threadripper
-cpuminer-avx512-sha-vaes.exe  "-march=icelake-client"    Icelake*
+cpuminer-zen.exe              "-march=znver1"            Zen1, Zen2
+cpuminer-zen3.exe             "-march=znver2 -mvaes"     Zen3(2)
+cpuminer-avx512-sha-vaes.exe  "-march=icelake-client"    Icelake(3)
 
-* Haswell includes Broadwell, Skylake, Kabylake, Coffeelake & Cometlake.
-Icelake is only available on some laptops. Mining with a laptop is not
-recommended. The icelake build is included in anticipation of Intel eventually
-releasing a desktop CPU with a microarchitecture newer than Skylake.
+(1) Haswell includes Broadwell, Skylake, Kabylake, Coffeelake & Cometlake.
+(2) The Zen3 build uses Zen2+VAES as a workaround until Zen3 compiler
+    support is available. Zen2 CPUs should use the Zen build.
+(3) Icelake is only available on some laptops. Mining with a laptop is not
+recommended.
 
 Notes about included DLL files:

diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index 1c4d406..106d499 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -65,6 +65,12 @@ If not what makes it happen or not happen?
 Change Log
 ----------
 
+v3.15.2
+
+Zen3 AVX2+VAES optimization for x16*, x17, sonoa, xevan, x21s, x22i, x25x,
+allium.
+Zen3 build added to Windows binary package.
+
 v3.15.1
 
 Fix compile on AMD Zen3 CPUs with VAES.

diff --git a/algo-gate-api.h b/algo-gate-api.h
index 80aa3b4..af29ecb 100644
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -90,10 +90,11 @@ typedef uint32_t set_t;
 #define AES_OPT       2
 #define SSE42_OPT     4
 #define AVX_OPT       8     // Sandybridge
-#define AVX2_OPT    0x10    // Haswell
-#define SHA_OPT     0x20    // sha256 (Ryzen, Ice Lake)
-#define AVX512_OPT  0x40    // AVX512- F, VL, DQ, BW (Skylake-X)
-#define VAES_OPT    0x80    // VAES (Ice Lake)
+#define AVX2_OPT     0x10   // Haswell, Zen1
+#define SHA_OPT      0x20   // Zen1, Icelake (sha256)
+#define AVX512_OPT   0x40   // Skylake-X (AVX512[F,VL,DQ,BW])
+#define VAES_OPT     0x80   // Icelake (VAES & AVX512)
+#define VAES256_OPT  0x100  // Zen3 (VAES without AVX512)
 
 // return set containing all elements from sets a & b
 
@@ -111,9 +112,9 @@ inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; }
 typedef struct
 {
 // Mandatory functions, one of these is mandatory. If a generic scanhash
-// is used a custom hash function must be registered, with a custom scanhash
-// the custom hash function can be called directly and doesn't need to be
-// registered in the gate.
+// is used a custom target hash function must be registered, with a custom +// scanhash the target hash function can be called directly and doesn't need +// to be registered in the gate. int ( *scanhash ) ( struct work*, uint32_t, uint64_t*, struct thr_info* ); int ( *hash ) ( void*, const void*, int ); diff --git a/algo/echo/echo-hash-4way.c b/algo/echo/echo-hash-4way.c index 57c0a94..eb3c41c 100644 --- a/algo/echo/echo-hash-4way.c +++ b/algo/echo/echo-hash-4way.c @@ -1,5 +1,4 @@ -//#if 0 -#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(__VAES__) #include "simd-utils.h" #include "echo-hash-4way.h" @@ -13,8 +12,12 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) = */ // do these need to be reversed? +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + + #define mul2mask \ - _mm512_set4_epi32( 0, 0, 0, 0x00001b00 ) + m512_const2_64( 0, 0x00001b00 ) +//_mm512_set4_epi32( 0, 0, 0, 0x00001b00 ) // _mm512_set4_epi32( 0x00001b00, 0, 0, 0 ) #define lsbmask m512_const1_32( 0x01010101 ) @@ -30,87 +33,87 @@ static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) = const int j2 = ( (j)+2 ) & 3; \ const int j3 = ( (j)+3 ) & 3; \ s2 = _mm512_add_epi8( state1[ 0 ] [j ], state1[ 0 ][ j ] ); \ - t1 = _mm512_srli_epi16( state1[ 0 ][ j ], 7 ); \ - t1 = _mm512_and_si512( t1, lsbmask );\ - t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \ - s2 = _mm512_xor_si512( s2, t2 ); \ - state2[ 0 ] [j ] = s2; \ - state2[ 1 ] [j ] = state1[ 0 ][ j ]; \ - state2[ 2 ] [j ] = state1[ 0 ][ j ]; \ - state2[ 3 ] [j ] = _mm512_xor_si512( s2, state1[ 0 ][ j ] );\ - s2 = _mm512_add_epi8( state1[ 1 ][ j1 ], state1[ 1 ][ j1 ] ); \ - t1 = _mm512_srli_epi16( state1[ 1 ][ j1 ], 7 ); \ - t1 = _mm512_and_si512( t1, lsbmask ); \ - t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \ - s2 = _mm512_xor_si512( s2, t2 );\ - state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], \ - _mm512_xor_si512( s2, state1[ 1 ][ j1 ] ) ); \ - state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], s2 ); \ - state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \ - state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \ - s2 = _mm512_add_epi8( state1[ 2 ][ j2 ], state1[ 2 ][ j2 ] ); \ - t1 = _mm512_srli_epi16( state1[ 2 ][ j2 ], 7 ); \ - t1 = _mm512_and_si512( t1, lsbmask ); \ - t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \ - s2 = _mm512_xor_si512( s2, t2 ); \ - state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \ - state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \ + t1 = _mm512_srli_epi16( state1[ 0 ][ j ], 7 ); \ + t1 = _mm512_and_si512( t1, lsbmask );\ + t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \ + s2 = _mm512_xor_si512( s2, t2 ); \ + state2[ 0 ] [j ] = s2; \ + state2[ 1 ] [j ] = state1[ 0 ][ j ]; \ + state2[ 2 ] [j ] = state1[ 0 ][ j ]; \ + state2[ 3 ] [j ] = _mm512_xor_si512( s2, state1[ 0 ][ j ] );\ + s2 = _mm512_add_epi8( state1[ 1 ][ j1 ], state1[ 1 ][ j1 ] ); \ + t1 = _mm512_srli_epi16( state1[ 1 ][ j1 ], 7 ); \ + t1 = _mm512_and_si512( t1, lsbmask ); \ + t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \ + s2 = _mm512_xor_si512( s2, t2 );\ + state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], \ + _mm512_xor_si512( s2, state1[ 1 ][ j1 ] ) ); \ + state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], s2 ); \ + state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \ + state2[ 3 ][ j ] = _mm512_xor_si512( 
state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \ + s2 = _mm512_add_epi8( state1[ 2 ][ j2 ], state1[ 2 ][ j2 ] ); \ + t1 = _mm512_srli_epi16( state1[ 2 ][ j2 ], 7 ); \ + t1 = _mm512_and_si512( t1, lsbmask ); \ + t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \ + s2 = _mm512_xor_si512( s2, t2 ); \ + state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \ + state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \ _mm512_xor_si512( s2, state1[ 2 ][ j2 ] ) ); \ - state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \ - state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \ - s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \ - t1 = _mm512_srli_epi16( state1[ 3 ][ j3 ], 7 ); \ - t1 = _mm512_and_si512( t1, lsbmask ); \ - t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \ - s2 = _mm512_xor_si512( s2, t2 ); \ - state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \ - state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \ - state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \ + state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \ + state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \ + s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \ + t1 = _mm512_srli_epi16( state1[ 3 ][ j3 ], 7 ); \ + t1 = _mm512_and_si512( t1, lsbmask ); \ + t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \ + s2 = _mm512_xor_si512( s2, t2 ); \ + state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \ + state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \ + state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \ _mm512_xor_si512( s2, state1[ 3 ][ j3] ) ); \ - state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \ + state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \ } while(0) #define ECHO_ROUND_UNROLL2 \ - ECHO_SUBBYTES(_state, 0, 0);\ + ECHO_SUBBYTES(_state, 0, 0);\ ECHO_SUBBYTES(_state, 1, 0);\ - ECHO_SUBBYTES(_state, 2, 0);\ - ECHO_SUBBYTES(_state, 3, 0);\ - ECHO_SUBBYTES(_state, 0, 1);\ - ECHO_SUBBYTES(_state, 1, 1);\ - ECHO_SUBBYTES(_state, 2, 1);\ - ECHO_SUBBYTES(_state, 3, 1);\ - ECHO_SUBBYTES(_state, 0, 2);\ - ECHO_SUBBYTES(_state, 1, 2);\ - ECHO_SUBBYTES(_state, 2, 2);\ - ECHO_SUBBYTES(_state, 3, 2);\ - ECHO_SUBBYTES(_state, 0, 3);\ - ECHO_SUBBYTES(_state, 1, 3);\ - ECHO_SUBBYTES(_state, 2, 3);\ - ECHO_SUBBYTES(_state, 3, 3);\ - ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\ - ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\ - ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\ - ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\ - ECHO_SUBBYTES(_state2, 0, 0);\ - ECHO_SUBBYTES(_state2, 1, 0);\ - ECHO_SUBBYTES(_state2, 2, 0);\ - ECHO_SUBBYTES(_state2, 3, 0);\ - ECHO_SUBBYTES(_state2, 0, 1);\ - ECHO_SUBBYTES(_state2, 1, 1);\ - ECHO_SUBBYTES(_state2, 2, 1);\ - ECHO_SUBBYTES(_state2, 3, 1);\ - ECHO_SUBBYTES(_state2, 0, 2);\ - ECHO_SUBBYTES(_state2, 1, 2);\ - ECHO_SUBBYTES(_state2, 2, 2);\ - ECHO_SUBBYTES(_state2, 3, 2);\ - ECHO_SUBBYTES(_state2, 0, 3);\ - ECHO_SUBBYTES(_state2, 1, 3);\ - ECHO_SUBBYTES(_state2, 2, 3);\ - ECHO_SUBBYTES(_state2, 3, 3);\ - ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\ - ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\ - ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\ - ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2) + ECHO_SUBBYTES(_state, 2, 0);\ + ECHO_SUBBYTES(_state, 3, 0);\ + ECHO_SUBBYTES(_state, 0, 1);\ + ECHO_SUBBYTES(_state, 1, 1);\ + ECHO_SUBBYTES(_state, 2, 1);\ + ECHO_SUBBYTES(_state, 3, 
1);\ + ECHO_SUBBYTES(_state, 0, 2);\ + ECHO_SUBBYTES(_state, 1, 2);\ + ECHO_SUBBYTES(_state, 2, 2);\ + ECHO_SUBBYTES(_state, 3, 2);\ + ECHO_SUBBYTES(_state, 0, 3);\ + ECHO_SUBBYTES(_state, 1, 3);\ + ECHO_SUBBYTES(_state, 2, 3);\ + ECHO_SUBBYTES(_state, 3, 3);\ + ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\ + ECHO_SUBBYTES(_state2, 0, 0);\ + ECHO_SUBBYTES(_state2, 1, 0);\ + ECHO_SUBBYTES(_state2, 2, 0);\ + ECHO_SUBBYTES(_state2, 3, 0);\ + ECHO_SUBBYTES(_state2, 0, 1);\ + ECHO_SUBBYTES(_state2, 1, 1);\ + ECHO_SUBBYTES(_state2, 2, 1);\ + ECHO_SUBBYTES(_state2, 3, 1);\ + ECHO_SUBBYTES(_state2, 0, 2);\ + ECHO_SUBBYTES(_state2, 1, 2);\ + ECHO_SUBBYTES(_state2, 2, 2);\ + ECHO_SUBBYTES(_state2, 3, 2);\ + ECHO_SUBBYTES(_state2, 0, 3);\ + ECHO_SUBBYTES(_state2, 1, 3);\ + ECHO_SUBBYTES(_state2, 2, 3);\ + ECHO_SUBBYTES(_state2, 3, 3);\ + ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2) #define SAVESTATE(dst, src)\ dst[0][0] = src[0][0];\ @@ -224,43 +227,43 @@ void echo_4way_compress( echo_4way_context *ctx, const __m512i *pmsg, int echo_4way_init( echo_4way_context *ctx, int nHashSize ) { - int i, j; + int i, j; ctx->k = m512_zero; - ctx->processed_bits = 0; - ctx->uBufferBytes = 0; + ctx->processed_bits = 0; + ctx->uBufferBytes = 0; - switch( nHashSize ) - { - case 256: - ctx->uHashSize = 256; - ctx->uBlockLength = 192; - ctx->uRounds = 8; - ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x100 ); - ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x600 ); - break; + switch( nHashSize ) + { + case 256: + ctx->uHashSize = 256; + ctx->uBlockLength = 192; + ctx->uRounds = 8; + ctx->hashsize = m512_const2_64( 0, 0x100 ); + ctx->const1536 = m512_const2_64( 0, 0x600 ); + break; - case 512: - ctx->uHashSize = 512; - ctx->uBlockLength = 128; - ctx->uRounds = 10; - ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x200 ); - ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x400); - break; + case 512: + ctx->uHashSize = 512; + ctx->uBlockLength = 128; + ctx->uRounds = 10; + ctx->hashsize = m512_const2_64( 0, 0x200 ); + ctx->const1536 = m512_const2_64( 0, 0x400); + break; - default: - return 1; - } + default: + return 1; + } - for( i = 0; i < 4; i++ ) - for( j = 0; j < nHashSize / 256; j++ ) - ctx->state[ i ][ j ] = ctx->hashsize; + for( i = 0; i < 4; i++ ) + for( j = 0; j < nHashSize / 256; j++ ) + ctx->state[ i ][ j ] = ctx->hashsize; - for( i = 0; i < 4; i++ ) - for( j = nHashSize / 256; j < 4; j++ ) - ctx->state[ i ][ j ] = m512_zero; + for( i = 0; i < 4; i++ ) + for( j = nHashSize / 256; j < 4; j++ ) + ctx->state[ i ][ j ] = m512_zero; - return 0; + return 0; } int echo_4way_update_close( echo_4way_context *state, void *hashval, @@ -285,17 +288,13 @@ int echo_4way_update_close( echo_4way_context *state, void *hashval, vlen = databitlen / 128; // * 4 lanes / 128 bits per lane memcpy_512( state->buffer, data, vlen ); state->processed_bits += (unsigned int)( databitlen ); - remainingbits = _mm512_set4_epi32( 0, 0, 0, databitlen ); - + remainingbits = m512_const2_64( 0, (uint64_t)databitlen ); } - state->buffer[ vlen ] = _mm512_set4_epi32( 0, 0, 0, 0x80 ); + state->buffer[ vlen ] = m512_const2_64( 0, 0x80 ); memset_zero_512( state->buffer + vlen + 1, vblen - vlen - 2 ); - state->buffer[ vblen-2 ] = - _mm512_set4_epi32( 
(uint32_t)state->uHashSize << 16, 0, 0, 0 ); - state->buffer[ vblen-1 ] = - _mm512_set4_epi64( 0, state->processed_bits, - 0, state->processed_bits ); + state->buffer[ vblen-2 ] = m512_const2_64( (uint64_t)state->uHashSize << 48, 0 ); + state->buffer[ vblen-1 ] = m512_const2_64( 0, state->processed_bits); state->k = _mm512_add_epi64( state->k, remainingbits ); state->k = _mm512_sub_epi64( state->k, state->const1536 ); @@ -328,16 +327,16 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize, ctx->uHashSize = 256; ctx->uBlockLength = 192; ctx->uRounds = 8; - ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x100 ); - ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x600 ); + ctx->hashsize = m512_const2_64( 0, 0x100 ); + ctx->const1536 = m512_const2_64( 0, 0x600 ); break; case 512: ctx->uHashSize = 512; ctx->uBlockLength = 128; ctx->uRounds = 10; - ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x200 ); - ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x400); + ctx->hashsize = m512_const2_64( 0, 0x200 ); + ctx->const1536 = m512_const2_64( 0, 0x400 ); break; default: @@ -372,17 +371,14 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize, vlen = databitlen / 128; // * 4 lanes / 128 bits per lane memcpy_512( ctx->buffer, data, vlen ); ctx->processed_bits += (unsigned int)( databitlen ); - remainingbits = _mm512_set4_epi32( 0, 0, 0, databitlen ); - + remainingbits = m512_const2_64( 0, databitlen ); } - ctx->buffer[ vlen ] = _mm512_set4_epi32( 0, 0, 0, 0x80 ); + ctx->buffer[ vlen ] = m512_const2_64( 0, 0x80 ); memset_zero_512( ctx->buffer + vlen + 1, vblen - vlen - 2 ); ctx->buffer[ vblen-2 ] = - _mm512_set4_epi32( (uint32_t)ctx->uHashSize << 16, 0, 0, 0 ); - ctx->buffer[ vblen-1 ] = - _mm512_set4_epi64( 0, ctx->processed_bits, - 0, ctx->processed_bits ); + m512_const2_64( (uint64_t)ctx->uHashSize << 48, 0 ); + ctx->buffer[ vblen-1 ] = m512_const2_64( 0, ctx->processed_bits); ctx->k = _mm512_add_epi64( ctx->k, remainingbits ); ctx->k = _mm512_sub_epi64( ctx->k, ctx->const1536 ); @@ -400,5 +396,380 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize, return 0; } +#endif // AVX512 -#endif +// AVX2 + VAES + +#define mul2mask_2way m256_const2_64( 0, 0x0000000000001b00 ) + +#define lsbmask_2way m256_const1_32( 0x01010101 ) + +#define ECHO_SUBBYTES_2WAY( state, i, j ) \ + state[i][j] = _mm256_aesenc_epi128( state[i][j], k1 ); \ + state[i][j] = _mm256_aesenc_epi128( state[i][j], m256_zero ); \ + k1 = _mm256_add_epi32( k1, m256_one_128 ); + +#define ECHO_MIXBYTES_2WAY( state1, state2, j, t1, t2, s2 ) do \ +{ \ + const int j1 = ( (j)+1 ) & 3; \ + const int j2 = ( (j)+2 ) & 3; \ + const int j3 = ( (j)+3 ) & 3; \ + s2 = _mm256_add_epi8( state1[ 0 ] [j ], state1[ 0 ][ j ] ); \ + t1 = _mm256_srli_epi16( state1[ 0 ][ j ], 7 ); \ + t1 = _mm256_and_si256( t1, lsbmask_2way );\ + t2 = _mm256_shuffle_epi8( mul2mask_2way, t1 ); \ + s2 = _mm256_xor_si256( s2, t2 ); \ + state2[ 0 ] [j ] = s2; \ + state2[ 1 ] [j ] = state1[ 0 ][ j ]; \ + state2[ 2 ] [j ] = state1[ 0 ][ j ]; \ + state2[ 3 ] [j ] = _mm256_xor_si256( s2, state1[ 0 ][ j ] );\ + s2 = _mm256_add_epi8( state1[ 1 ][ j1 ], state1[ 1 ][ j1 ] ); \ + t1 = _mm256_srli_epi16( state1[ 1 ][ j1 ], 7 ); \ + t1 = _mm256_and_si256( t1, lsbmask_2way ); \ + t2 = _mm256_shuffle_epi8( mul2mask_2way, t1 ); \ + s2 = _mm256_xor_si256( s2, t2 );\ + state2[ 0 ][ j ] = _mm256_xor_si256( state2[ 0 ][ j ], \ + _mm256_xor_si256( s2, state1[ 1 ][ j1 ] ) ); \ + state2[ 1 ][ j ] = _mm256_xor_si256( state2[ 1 ][ j ], s2 ); \ + 
state2[ 2 ][ j ] = _mm256_xor_si256( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \ + state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \ + s2 = _mm256_add_epi8( state1[ 2 ][ j2 ], state1[ 2 ][ j2 ] ); \ + t1 = _mm256_srli_epi16( state1[ 2 ][ j2 ], 7 ); \ + t1 = _mm256_and_si256( t1, lsbmask_2way ); \ + t2 = _mm256_shuffle_epi8( mul2mask_2way, t1 ); \ + s2 = _mm256_xor_si256( s2, t2 ); \ + state2[ 0 ][ j ] = _mm256_xor_si256( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \ + state2[ 1 ][ j ] = _mm256_xor_si256( state2[ 1 ][ j ], \ + _mm256_xor_si256( s2, state1[ 2 ][ j2 ] ) ); \ + state2[ 2 ][ j ] = _mm256_xor_si256( state2[ 2 ][ j ], s2 ); \ + state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \ + s2 = _mm256_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \ + t1 = _mm256_srli_epi16( state1[ 3 ][ j3 ], 7 ); \ + t1 = _mm256_and_si256( t1, lsbmask_2way ); \ + t2 = _mm256_shuffle_epi8( mul2mask_2way, t1 ); \ + s2 = _mm256_xor_si256( s2, t2 ); \ + state2[ 0 ][ j ] = _mm256_xor_si256( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \ + state2[ 1 ][ j ] = _mm256_xor_si256( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \ + state2[ 2 ][ j ] = _mm256_xor_si256( state2[ 2 ][ j ], \ + _mm256_xor_si256( s2, state1[ 3 ][ j3] ) ); \ + state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], s2 ); \ +} while(0) + +#define ECHO_ROUND_UNROLL2_2WAY \ + ECHO_SUBBYTES_2WAY(_state, 0, 0);\ + ECHO_SUBBYTES_2WAY(_state, 1, 0);\ + ECHO_SUBBYTES_2WAY(_state, 2, 0);\ + ECHO_SUBBYTES_2WAY(_state, 3, 0);\ + ECHO_SUBBYTES_2WAY(_state, 0, 1);\ + ECHO_SUBBYTES_2WAY(_state, 1, 1);\ + ECHO_SUBBYTES_2WAY(_state, 2, 1);\ + ECHO_SUBBYTES_2WAY(_state, 3, 1);\ + ECHO_SUBBYTES_2WAY(_state, 0, 2);\ + ECHO_SUBBYTES_2WAY(_state, 1, 2);\ + ECHO_SUBBYTES_2WAY(_state, 2, 2);\ + ECHO_SUBBYTES_2WAY(_state, 3, 2);\ + ECHO_SUBBYTES_2WAY(_state, 0, 3);\ + ECHO_SUBBYTES_2WAY(_state, 1, 3);\ + ECHO_SUBBYTES_2WAY(_state, 2, 3);\ + ECHO_SUBBYTES_2WAY(_state, 3, 3);\ + ECHO_MIXBYTES_2WAY(_state, _state2, 0, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state, _state2, 1, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state, _state2, 2, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state, _state2, 3, t1, t2, s2);\ + ECHO_SUBBYTES_2WAY(_state2, 0, 0);\ + ECHO_SUBBYTES_2WAY(_state2, 1, 0);\ + ECHO_SUBBYTES_2WAY(_state2, 2, 0);\ + ECHO_SUBBYTES_2WAY(_state2, 3, 0);\ + ECHO_SUBBYTES_2WAY(_state2, 0, 1);\ + ECHO_SUBBYTES_2WAY(_state2, 1, 1);\ + ECHO_SUBBYTES_2WAY(_state2, 2, 1);\ + ECHO_SUBBYTES_2WAY(_state2, 3, 1);\ + ECHO_SUBBYTES_2WAY(_state2, 0, 2);\ + ECHO_SUBBYTES_2WAY(_state2, 1, 2);\ + ECHO_SUBBYTES_2WAY(_state2, 2, 2);\ + ECHO_SUBBYTES_2WAY(_state2, 3, 2);\ + ECHO_SUBBYTES_2WAY(_state2, 0, 3);\ + ECHO_SUBBYTES_2WAY(_state2, 1, 3);\ + ECHO_SUBBYTES_2WAY(_state2, 2, 3);\ + ECHO_SUBBYTES_2WAY(_state2, 3, 3);\ + ECHO_MIXBYTES_2WAY(_state2, _state, 0, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state2, _state, 1, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state2, _state, 2, t1, t2, s2);\ + ECHO_MIXBYTES_2WAY(_state2, _state, 3, t1, t2, s2) + +#define SAVESTATE_2WAY(dst, src)\ + dst[0][0] = src[0][0];\ + dst[0][1] = src[0][1];\ + dst[0][2] = src[0][2];\ + dst[0][3] = src[0][3];\ + dst[1][0] = src[1][0];\ + dst[1][1] = src[1][1];\ + dst[1][2] = src[1][2];\ + dst[1][3] = src[1][3];\ + dst[2][0] = src[2][0];\ + dst[2][1] = src[2][1];\ + dst[2][2] = src[2][2];\ + dst[2][3] = src[2][3];\ + dst[3][0] = src[3][0];\ + dst[3][1] = src[3][1];\ + dst[3][2] = src[3][2];\ + dst[3][3] = src[3][3] + +// blockcount always 1 +void echo_2way_compress( echo_2way_context *ctx, const 
__m256i *pmsg, + unsigned int uBlockCount ) +{ + unsigned int r, b, i, j; + __m256i t1, t2, s2, k1; + __m256i _state[4][4], _state2[4][4], _statebackup[4][4]; + + _state[ 0 ][ 0 ] = ctx->state[ 0 ][ 0 ]; + _state[ 0 ][ 1 ] = ctx->state[ 0 ][ 1 ]; + _state[ 0 ][ 2 ] = ctx->state[ 0 ][ 2 ]; + _state[ 0 ][ 3 ] = ctx->state[ 0 ][ 3 ]; + _state[ 1 ][ 0 ] = ctx->state[ 1 ][ 0 ]; + _state[ 1 ][ 1 ] = ctx->state[ 1 ][ 1 ]; + _state[ 1 ][ 2 ] = ctx->state[ 1 ][ 2 ]; + _state[ 1 ][ 3 ] = ctx->state[ 1 ][ 3 ]; + _state[ 2 ][ 0 ] = ctx->state[ 2 ][ 0 ]; + _state[ 2 ][ 1 ] = ctx->state[ 2 ][ 1 ]; + _state[ 2 ][ 2 ] = ctx->state[ 2 ][ 2 ]; + _state[ 2 ][ 3 ] = ctx->state[ 2 ][ 3 ]; + _state[ 3 ][ 0 ] = ctx->state[ 3 ][ 0 ]; + _state[ 3 ][ 1 ] = ctx->state[ 3 ][ 1 ]; + _state[ 3 ][ 2 ] = ctx->state[ 3 ][ 2 ]; + _state[ 3 ][ 3 ] = ctx->state[ 3 ][ 3 ]; + + for ( b = 0; b < uBlockCount; b++ ) + { + ctx->k = _mm256_add_epi64( ctx->k, ctx->const1536 ); + + for( j = ctx->uHashSize / 256; j < 4; j++ ) + { + for ( i = 0; i < 4; i++ ) + { + _state[ i ][ j ] = _mm256_load_si256( + pmsg + 4 * (j - (ctx->uHashSize / 256)) + i ); + } + } + + // save state + SAVESTATE_2WAY( _statebackup, _state ); + + k1 = ctx->k; + + for ( r = 0; r < ctx->uRounds / 2; r++ ) + { + ECHO_ROUND_UNROLL2_2WAY; + } + + if ( ctx->uHashSize == 256 ) + { + for ( i = 0; i < 4; i++ ) + { + _state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ], + _state[ i ][ 1 ] ); + _state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ], + _state[ i ][ 2 ] ); + _state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ], + _state[ i ][ 3 ] ); + _state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ], + _statebackup[ i ][ 0 ] ); + _state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ], + _statebackup[ i ][ 1 ] ); + _state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ], + _statebackup[ i ][ 2 ] ) ; + _state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ], + _statebackup[ i ][ 3 ] ); + } + } + else + { + for ( i = 0; i < 4; i++ ) + { + _state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ], + _state[ i ][ 2 ] ); + _state[ i ][ 1 ] = _mm256_xor_si256( _state[ i ][ 1 ], + _state[ i ][ 3 ] ); + _state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ][ 0 ], + _statebackup[ i ][ 0 ] ); + _state[ i ][ 0 ] = _mm256_xor_si256( _state[ i ] [0 ], + _statebackup[ i ][ 2 ] ); + _state[ i ][ 1 ] = _mm256_xor_si256( _state[ i ][ 1 ], + _statebackup[ i ][ 1 ] ); + _state[ i ][ 1 ] = _mm256_xor_si256( _state[ i ][ 1 ], + _statebackup[ i ][ 3 ] ); + } + } + pmsg += ctx->uBlockLength; + } + SAVESTATE_2WAY(ctx->state, _state); + +} +int echo_2way_init( echo_2way_context *ctx, int nHashSize ) +{ + int i, j; + + ctx->k = m256_zero; + ctx->processed_bits = 0; + ctx->uBufferBytes = 0; + + switch( nHashSize ) + { + case 256: + ctx->uHashSize = 256; + ctx->uBlockLength = 192; + ctx->uRounds = 8; + ctx->hashsize = m256_const2_64( 0, 0x100 ); + ctx->const1536 = m256_const2_64( 0, 0x600 ); + break; + + case 512: + ctx->uHashSize = 512; + ctx->uBlockLength = 128; + ctx->uRounds = 10; + ctx->hashsize = m256_const2_64( 0, 0x200 ); + ctx->const1536 = m256_const2_64( 0, 0x400 ); + break; + + default: + return 1; + } + + for( i = 0; i < 4; i++ ) + for( j = 0; j < nHashSize / 256; j++ ) + ctx->state[ i ][ j ] = ctx->hashsize; + + for( i = 0; i < 4; i++ ) + for( j = nHashSize / 256; j < 4; j++ ) + ctx->state[ i ][ j ] = m256_zero; + + return 0; +} + +int echo_2way_update_close( echo_2way_context *state, void *hashval, + const void *data, int databitlen ) +{ +// bytelen is either 32 (maybe), 64 or 80 or 128! 
+// all are less than full block. + + int vlen = databitlen / 128; // * 4 lanes / 128 bits per lane + const int vblen = state->uBlockLength / 16; // 16 bytes per lane + __m256i remainingbits; + + if ( databitlen == 1024 ) + { + echo_2way_compress( state, data, 1 ); + state->processed_bits = 1024; + remainingbits = m256_const2_64( 0, -1024 ); + vlen = 0; + } + else + { + memcpy_256( state->buffer, data, vlen ); + state->processed_bits += (unsigned int)( databitlen ); + remainingbits = m256_const2_64( 0, databitlen ); + } + + state->buffer[ vlen ] = m256_const2_64( 0, 0x80 ); + memset_zero_256( state->buffer + vlen + 1, vblen - vlen - 2 ); + state->buffer[ vblen-2 ] = m256_const2_64( (uint64_t)state->uHashSize << 48, 0 ); + state->buffer[ vblen-1 ] = m256_const2_64( 0, state->processed_bits ); + + state->k = _mm256_add_epi64( state->k, remainingbits ); + state->k = _mm256_sub_epi64( state->k, state->const1536 ); + + echo_2way_compress( state, state->buffer, 1 ); + + _mm256_store_si256( (__m256i*)hashval + 0, state->state[ 0 ][ 0] ); + _mm256_store_si256( (__m256i*)hashval + 1, state->state[ 1 ][ 0] ); + + if ( state->uHashSize == 512 ) + { + _mm256_store_si256( (__m256i*)hashval + 2, state->state[ 2 ][ 0 ] ); + _mm256_store_si256( (__m256i*)hashval + 3, state->state[ 3 ][ 0 ] ); + } + return 0; +} + +int echo_2way_full( echo_2way_context *ctx, void *hashval, int nHashSize, + const void *data, int datalen ) +{ + int i, j; + int databitlen = datalen * 8; + ctx->k = m256_zero; + ctx->processed_bits = 0; + ctx->uBufferBytes = 0; + + switch( nHashSize ) + { + case 256: + ctx->uHashSize = 256; + ctx->uBlockLength = 192; + ctx->uRounds = 8; + ctx->hashsize = m256_const2_64( 0, 0x100 ); + ctx->const1536 = m256_const2_64( 0, 0x600 ); + break; + + case 512: + ctx->uHashSize = 512; + ctx->uBlockLength = 128; + ctx->uRounds = 10; + ctx->hashsize = m256_const2_64( 0, 0x200 ); + ctx->const1536 = m256_const2_64( 0, 0x400 ); + break; + + default: + return 1; + } + + for( i = 0; i < 4; i++ ) + for( j = 0; j < nHashSize / 256; j++ ) + ctx->state[ i ][ j ] = ctx->hashsize; + + for( i = 0; i < 4; i++ ) + for( j = nHashSize / 256; j < 4; j++ ) + ctx->state[ i ][ j ] = m256_zero; + + int vlen = datalen / 32; + const int vblen = ctx->uBlockLength / 16; // 16 bytes per lane + __m256i remainingbits; + + if ( databitlen == 1024 ) + { + echo_2way_compress( ctx, data, 1 ); + ctx->processed_bits = 1024; + remainingbits = m256_const2_64( 0, -1024 ); + vlen = 0; + } + else + { + vlen = databitlen / 128; // * 4 lanes / 128 bits per lane + memcpy_256( ctx->buffer, data, vlen ); + ctx->processed_bits += (unsigned int)( databitlen ); + remainingbits = m256_const2_64( 0, databitlen ); + } + + ctx->buffer[ vlen ] = m256_const2_64( 0, 0x80 ); + memset_zero_256( ctx->buffer + vlen + 1, vblen - vlen - 2 ); + ctx->buffer[ vblen-2 ] = m256_const2_64( (uint64_t)ctx->uHashSize << 48, 0 ); + ctx->buffer[ vblen-1 ] = m256_const2_64( 0, ctx->processed_bits ); + + ctx->k = _mm256_add_epi64( ctx->k, remainingbits ); + ctx->k = _mm256_sub_epi64( ctx->k, ctx->const1536 ); + + echo_2way_compress( ctx, ctx->buffer, 1 ); + + _mm256_store_si256( (__m256i*)hashval + 0, ctx->state[ 0 ][ 0] ); + _mm256_store_si256( (__m256i*)hashval + 1, ctx->state[ 1 ][ 0] ); + + if ( ctx->uHashSize == 512 ) + { + _mm256_store_si256( (__m256i*)hashval + 2, ctx->state[ 2 ][ 0 ] ); + _mm256_store_si256( (__m256i*)hashval + 3, ctx->state[ 3 ][ 0 ] ); + } + return 0; +} + + +#endif // VAES diff --git a/algo/echo/echo-hash-4way.h b/algo/echo/echo-hash-4way.h index 
f9e906f..5808685 100644 --- a/algo/echo/echo-hash-4way.h +++ b/algo/echo/echo-hash-4way.h @@ -1,10 +1,12 @@ #if !defined(ECHO_HASH_4WAY_H__) #define ECHO_HASH_4WAY_H__ 1 -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(__VAES__) #include "simd-utils.h" +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + typedef struct { __m512i state[4][4]; @@ -20,6 +22,7 @@ typedef struct unsigned int processed_bits; } echo_4way_context __attribute__ ((aligned (64))); +#define echo512_4way_context echo_4way_context int echo_4way_init( echo_4way_context *state, int hashbitlen ); #define echo512_4way_init( state ) echo_4way_init( state, 512 ) @@ -29,8 +32,8 @@ int echo_4way_update( echo_4way_context *state, const void *data, unsigned int databitlen); #define echo512_4way_update echo_4way_update -int echo_close( echo_4way_context *state, void *hashval ); -#define echo512_4way_close echo_4way_close +// int echo_4way_close( echo_4way_context *state, void *hashval ); +// #define echo512_4way_close echo_4way_close int echo_4way_update_close( echo_4way_context *state, void *hashval, const void *data, int databitlen ); @@ -43,5 +46,45 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize, #define echo256_4way_full( state, hashval, data, datalen ) \ echo_4way_full( state, hashval, 256, data, datalen ) -#endif -#endif +#endif // AVX512 + +typedef struct +{ + __m256i state[4][4]; + __m256i buffer[ 4 * 192 / 16 ]; // 4x128 interleaved 192 bytes + __m256i k; + __m256i hashsize; + __m256i const1536; + + unsigned int uRounds; + unsigned int uHashSize; + unsigned int uBlockLength; + unsigned int uBufferBytes; + unsigned int processed_bits; + +} echo_2way_context __attribute__ ((aligned (64))); +#define echo512_2way_context echo_2way_context + +int echo_2way_init( echo_2way_context *state, int hashbitlen ); +#define echo512_2way_init( state ) echo_2way_init( state, 512 ) +#define echo256_2way_init( state ) echo_2way_init( state, 256 ) + +int echo_2way_update( echo_2way_context *state, const void *data, + unsigned int databitlen); +#define echo512_2way_update echo_2way_update + +int echo_2way_update_close( echo_2way_context *state, void *hashval, + const void *data, int databitlen ); +#define echo512_2way_update_close echo_2way_update_close + +int echo_2way_full( echo_2way_context *ctx, void *hashval, int nHashSize, + const void *data, int datalen ); +#define echo512_2way_full( state, hashval, data, datalen ) \ + echo_2way_full( state, hashval, 512, data, datalen ) +#define echo256_2way_full( state, hashval, data, datalen ) \ + echo_2way_full( state, hashval, 256, data, datalen ) + + +#endif // VAES + +#endif // ECHO_HASH_4WAY_H__ diff --git a/algo/groestl/groestl256-hash-4way.c b/algo/groestl/groestl256-hash-4way.c index ef296fd..dd82a86 100644 --- a/algo/groestl/groestl256-hash-4way.c +++ b/algo/groestl/groestl256-hash-4way.c @@ -15,7 +15,9 @@ #include "miner.h" #include "simd-utils.h" -#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(__AVX2__) && defined(__VAES__) + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen ) @@ -43,10 +45,10 @@ int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen ) } int groestl256_4way_full( groestl256_4way_context* ctx, void* output, 
- const void* input, uint64_t databitlen ) + const void* input, uint64_t datalen ) { - const int len = (int)databitlen / 128; - const int hashlen_m128i = 32 / 16; // bytes to __m128i + const int len = (int)datalen >> 4; + const int hashlen_m128i = 32 >> 4; // bytes to __m128i const int hash_offset = SIZE256 - hashlen_m128i; int rem = ctx->rem_ptr; int blocks = len / SIZE256; @@ -172,5 +174,161 @@ int groestl256_4way_update_close( groestl256_4way_context* ctx, void* output, return 0; } -#endif // VAES +#endif // AVX512 +// AVX2 + VAES + +int groestl256_2way_init( groestl256_2way_context* ctx, uint64_t hashlen ) +{ + int i; + + ctx->hashlen = hashlen; + + if (ctx->chaining == NULL || ctx->buffer == NULL) + return 1; + + for ( i = 0; i < SIZE256; i++ ) + { + ctx->chaining[i] = m256_zero; + ctx->buffer[i] = m256_zero; + } + + // The only non-zero in the IV is len. It can be hard coded. + ctx->chaining[ 3 ] = m256_const2_64( 0, 0x0100000000000000 ); + + ctx->buf_ptr = 0; + ctx->rem_ptr = 0; + + return 0; +} + +int groestl256_2way_full( groestl256_2way_context* ctx, void* output, + const void* input, uint64_t datalen ) +{ + const int len = (int)datalen >> 4; + const int hashlen_m128i = 32 >> 4; // bytes to __m128i + const int hash_offset = SIZE256 - hashlen_m128i; + int rem = ctx->rem_ptr; + int blocks = len / SIZE256; + __m256i* in = (__m256i*)input; + int i; + + if (ctx->chaining == NULL || ctx->buffer == NULL) + return 1; + + for ( i = 0; i < SIZE256; i++ ) + { + ctx->chaining[i] = m256_zero; + ctx->buffer[i] = m256_zero; + } + + // The only non-zero in the IV is len. It can be hard coded. + ctx->chaining[ 3 ] = m256_const2_64( 0, 0x0100000000000000 ); + ctx->buf_ptr = 0; + ctx->rem_ptr = 0; + + // --- update --- + + // digest any full blocks, process directly from input + for ( i = 0; i < blocks; i++ ) + TF512_2way( ctx->chaining, &in[ i * SIZE256 ] ); + ctx->buf_ptr = blocks * SIZE256; + + // copy any remaining data to buffer, it may already contain data + // from a previous update for a midstate precalc + for ( i = 0; i < len % SIZE256; i++ ) + ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ]; + i += rem; // use i as rem_ptr in final + + //--- final --- + + blocks++; // adjust for final block + + if ( i == SIZE256 - 1 ) + { + // only 1 vector left in buffer, all padding at once + ctx->buffer[i] = m256_const2_64( (uint64_t)blocks << 56, 0x80 ); + } + else + { + // add first padding + ctx->buffer[i] = m256_const2_64( 0, 0x80 ); + // add zero padding + for ( i += 1; i < SIZE256 - 1; i++ ) + ctx->buffer[i] = m256_zero; + + // add length padding, second last byte is zero unless blocks > 255 + ctx->buffer[i] = m256_const2_64( (uint64_t)blocks << 56, 0 ); + } + +// digest final padding block and do output transform + TF512_2way( ctx->chaining, ctx->buffer ); + + OF512_2way( ctx->chaining ); + + // store hash result in output + for ( i = 0; i < hashlen_m128i; i++ ) + casti_m256i( output, i ) = ctx->chaining[ hash_offset + i ]; + + return 0; +} +int groestl256_2way_update_close( groestl256_2way_context* ctx, void* output, + const void* input, uint64_t databitlen ) +{ + const int len = (int)databitlen / 128; + const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i + const int hash_offset = SIZE256 - hashlen_m128i; + int rem = ctx->rem_ptr; + int blocks = len / SIZE256; + __m256i* in = (__m256i*)input; + int i; + + // --- update --- + + // digest any full blocks, process directly from input + for ( i = 0; i < blocks; i++ ) + TF512_2way( ctx->chaining, &in[ i * SIZE256 ] ); + ctx->buf_ptr = 
blocks * SIZE256; + + // copy any remaining data to buffer, it may already contain data + // from a previous update for a midstate precalc + for ( i = 0; i < len % SIZE256; i++ ) + ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ]; + i += rem; // use i as rem_ptr in final + + //--- final --- + + blocks++; // adjust for final block + + if ( i == SIZE256 - 1 ) + { + // only 1 vector left in buffer, all padding at once + ctx->buffer[i] = m256_const1_128( _mm_set_epi8( + blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) ); + } + else + { + // add first padding + ctx->buffer[i] = m256_const2_64( 0, 0x80 ); + // add zero padding + for ( i += 1; i < SIZE256 - 1; i++ ) + ctx->buffer[i] = m256_zero; + + // add length padding, second last byte is zero unless blocks > 255 + ctx->buffer[i] = m256_const1_128( _mm_set_epi8( + blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) ); + } + +// digest final padding block and do output transform + TF512_2way( ctx->chaining, ctx->buffer ); + + OF512_2way( ctx->chaining ); + + // store hash result in output + for ( i = 0; i < hashlen_m128i; i++ ) + casti_m256i( output, i ) = ctx->chaining[ hash_offset + i ]; + + return 0; +} + +#endif // VAES diff --git a/algo/groestl/groestl256-hash-4way.h b/algo/groestl/groestl256-hash-4way.h index 907a64b..59c6270 100644 --- a/algo/groestl/groestl256-hash-4way.h +++ b/algo/groestl/groestl256-hash-4way.h @@ -18,8 +18,8 @@ #endif #include -#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - +#if defined(__AVX2__) && defined(__VAES__) + #define LENGTH (256) //#include "brg_endian.h" @@ -48,6 +48,8 @@ #define SIZE256 (SIZE_512/16) +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + typedef struct { __attribute__ ((aligned (128))) __m512i chaining[SIZE256]; __attribute__ ((aligned (64))) __m512i buffer[SIZE256]; @@ -55,7 +57,7 @@ typedef struct { int blk_count; // SIZE_m128i int buf_ptr; // __m128i offset int rem_ptr; - int databitlen; // bits +// int databitlen; // bits } groestl256_4way_context; @@ -74,5 +76,25 @@ int groestl256_4way_update_close( groestl256_4way_context*, void*, int groestl256_4way_full( groestl256_4way_context*, void*, const void*, uint64_t ); -#endif -#endif +#endif // AVX512 + +typedef struct { + __attribute__ ((aligned (128))) __m256i chaining[SIZE256]; + __attribute__ ((aligned (64))) __m256i buffer[SIZE256]; + int hashlen; // byte + int blk_count; // SIZE_m128i + int buf_ptr; // __m128i offset + int rem_ptr; +// int databitlen; // bits +} groestl256_2way_context; + +int groestl256_2way_init( groestl256_2way_context*, uint64_t ); + +int groestl256_2way_update_close( groestl256_2way_context*, void*, + const void*, uint64_t ); + +int groestl256_2way_full( groestl256_2way_context*, void*, + const void*, uint64_t ); + +#endif // VAES +#endif // GROESTL256_HASH_4WAY_H__ diff --git a/algo/groestl/groestl256-intr-4way.h b/algo/groestl/groestl256-intr-4way.h index 8175f74..25d9171 100644 --- a/algo/groestl/groestl256-intr-4way.h +++ b/algo/groestl/groestl256-intr-4way.h @@ -12,7 +12,7 @@ #include "groestl256-hash-4way.h" -#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(__AVX2__) && defined(__VAES__) static const __m128i round_const_l0[] __attribute__ ((aligned (64))) = { @@ -42,6 +42,8 @@ static const __m128i round_const_l7[] __attribute__ ((aligned (64))) = { 0x0000000000000000, 0x8696a6b6c6d6e6f6 } }; 
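/* Background for the MUL2 / MUL2_2WAY macros defined below: they multiply
   every byte of a vector by 2 in GF(2^8), i.e. the AES xtime operation,
   reducing by the polynomial 0x11b whenever a byte's top bit is set.
   A minimal scalar sketch for illustration only; gf256_mul2 is a
   hypothetical helper, not part of this patch (uint8_t from <stdint.h>). */
static inline uint8_t gf256_mul2( uint8_t x )
{
   /* Shift left one bit, then conditionally xor in the reduction byte 0x1b. */
   return (uint8_t)( ( x << 1 ) ^ ( ( x & 0x80 ) ? 0x1b : 0 ) );
}
/* The vector macros get the same result branchlessly: cmpgt_epi8 against
   zero turns each byte's sign bit into an all-ones mask, which is ANDed with
   a register of 0x1b bytes and xored into the doubled bytes. */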
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02, 0x1d1519111c141810, 0x1f171b131e161a12, 0x2d2529212c242820, 0x2f272b232e262a22, @@ -499,5 +501,398 @@ void OF512_4way( __m512i* chaining ) chaining[3] = xmm11; } +#endif // AVX512 + +static const __m256i TRANSP_MASK_2WAY = + { 0x0d0509010c040800, 0x0f070b030e060a02, + 0x1d1519111c141810, 0x1f171b131e161a12 }; + +static const __m256i SUBSH_MASK0_2WAY = + { 0x0c0f0104070b0e00, 0x03060a0d08020509, + 0x1c1f1114171b1e10, 0x13161a1d18121519 }; + +static const __m256i SUBSH_MASK1_2WAY = + { 0x0e090205000d0801, 0x04070c0f0a03060b, + 0x1e191215101d1801, 0x14171c1f1a13161b }; + +static const __m256i SUBSH_MASK2_2WAY = + { 0x080b0306010f0a02, 0x05000e090c04070d, + 0x181b1316111f1a12, 0x15101e191c14171d }; + +static const __m256i SUBSH_MASK3_2WAY = + { 0x0a0d040702090c03, 0x0601080b0e05000f, + 0x1a1d141712191c13, 0x1611181b1e15101f }; + +static const __m256i SUBSH_MASK4_2WAY = + { 0x0b0e0500030a0d04, 0x0702090c0f060108, + 0x1b1e1510131a1d14, 0x1712191c1f161118 }; + +static const __m256i SUBSH_MASK5_2WAY = + { 0x0d080601040c0f05, 0x00030b0e0907020a, + 0x1d181611141c1f15, 0x10131b1e1917121a }; + +static const __m256i SUBSH_MASK6_2WAY = + { 0x0f0a0702050e0906, 0x01040d080b00030c, + 0x1f1a1712151e1916, 0x11141d181b10131c }; + +static const __m256i SUBSH_MASK7_2WAY = + { 0x090c000306080b07, 0x02050f0a0d01040e, + 0x191c101316181b17, 0x12151f1a1d11141e, }; + +#define tos(a) #a +#define tostr(a) tos(a) + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b */ +#define MUL2_2WAY(i, j, k){\ + j = _mm256_xor_si256(j, j);\ + j = _mm256_cmpgt_epi8(j, i );\ + i = _mm256_add_epi8(i, i);\ + j = _mm256_and_si256(j, k);\ + i = _mm256_xor_si256(i, j);\ +} + +#define MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* t_i = a_i + a_{i+1} */\ + b6 = a0;\ + b7 = a1;\ + a0 = _mm256_xor_si256(a0, a1);\ + b0 = a2;\ + a1 = _mm256_xor_si256(a1, a2);\ + b1 = a3;\ + a2 = _mm256_xor_si256(a2, a3);\ + b2 = a4;\ + a3 = _mm256_xor_si256(a3, a4);\ + b3 = a5;\ + a4 = _mm256_xor_si256(a4, a5);\ + b4 = a6;\ + a5 = _mm256_xor_si256(a5, a6);\ + b5 = a7;\ + a6 = _mm256_xor_si256(a6, a7);\ + a7 = _mm256_xor_si256(a7, b6);\ + \ + /* build y4 y5 y6 ... 
in regs xmm8, xmm9, xmm10 by adding t_i*/\ + b0 = _mm256_xor_si256(b0, a4);\ + b6 = _mm256_xor_si256(b6, a4);\ + b1 = _mm256_xor_si256(b1, a5);\ + b7 = _mm256_xor_si256(b7, a5);\ + b2 = _mm256_xor_si256(b2, a6);\ + b0 = _mm256_xor_si256(b0, a6);\ + /* spill values y_4, y_5 to memory */\ + TEMP0 = b0;\ + b3 = _mm256_xor_si256(b3, a7);\ + b1 = _mm256_xor_si256(b1, a7);\ + TEMP1 = b1;\ + b4 = _mm256_xor_si256(b4, a0);\ + b2 = _mm256_xor_si256(b2, a0);\ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0;\ + b5 = _mm256_xor_si256(b5, a1);\ + b3 = _mm256_xor_si256(b3, a1);\ + b1 = a1;\ + b6 = _mm256_xor_si256(b6, a2);\ + b4 = _mm256_xor_si256(b4, a2);\ + TEMP2 = a2;\ + b7 = _mm256_xor_si256(b7, a3);\ + b5 = _mm256_xor_si256(b5, a3);\ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm256_xor_si256(a0, a3);\ + a1 = _mm256_xor_si256(a1, a4);\ + a2 = _mm256_xor_si256(a2, a5);\ + a3 = _mm256_xor_si256(a3, a6);\ + a4 = _mm256_xor_si256(a4, a7);\ + a5 = _mm256_xor_si256(a5, b0);\ + a6 = _mm256_xor_si256(a6, b1);\ + a7 = _mm256_xor_si256(a7, TEMP2);\ + \ + /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + /* compute w_i : add y_{i+4} */\ + b1 = m256_const1_64( 0x1b1b1b1b1b1b1b1b );\ + MUL2_2WAY(a0, b0, b1);\ + a0 = _mm256_xor_si256(a0, TEMP0);\ + MUL2_2WAY(a1, b0, b1);\ + a1 = _mm256_xor_si256(a1, TEMP1);\ + MUL2_2WAY(a2, b0, b1);\ + a2 = _mm256_xor_si256(a2, b2);\ + MUL2_2WAY(a3, b0, b1);\ + a3 = _mm256_xor_si256(a3, b3);\ + MUL2_2WAY(a4, b0, b1);\ + a4 = _mm256_xor_si256(a4, b4);\ + MUL2_2WAY(a5, b0, b1);\ + a5 = _mm256_xor_si256(a5, b5);\ + MUL2_2WAY(a6, b0, b1);\ + a6 = _mm256_xor_si256(a6, b6);\ + MUL2_2WAY(a7, b0, b1);\ + a7 = _mm256_xor_si256(a7, b7);\ + \ + /* compute v_i : double w_i */\ + /* add to y_4 y_5 .. v3, v4, ... */\ + MUL2_2WAY(a0, b0, b1);\ + b5 = _mm256_xor_si256(b5, a0);\ + MUL2_2WAY(a1, b0, b1);\ + b6 = _mm256_xor_si256(b6, a1);\ + MUL2_2WAY(a2, b0, b1);\ + b7 = _mm256_xor_si256(b7, a2);\ + MUL2_2WAY(a5, b0, b1);\ + b2 = _mm256_xor_si256(b2, a5);\ + MUL2_2WAY(a6, b0, b1);\ + b3 = _mm256_xor_si256(b3, a6);\ + MUL2_2WAY(a7, b0, b1);\ + b4 = _mm256_xor_si256(b4, a7);\ + MUL2_2WAY(a3, b0, b1);\ + MUL2_2WAY(a4, b0, b1);\ + b0 = TEMP0;\ + b1 = TEMP1;\ + b0 = _mm256_xor_si256(b0, a3);\ + b1 = _mm256_xor_si256(b1, a4);\ +}/*MixBytes*/ + +#define ROUND_2WAY(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* AddRoundConstant */\ + b1 = m256_const2_64( 0xffffffffffffffff, 0 ); \ + a0 = _mm256_xor_si256( a0, m256_const1_128( round_const_l0[i] ) );\ + a1 = _mm256_xor_si256( a1, b1 );\ + a2 = _mm256_xor_si256( a2, b1 );\ + a3 = _mm256_xor_si256( a3, b1 );\ + a4 = _mm256_xor_si256( a4, b1 );\ + a5 = _mm256_xor_si256( a5, b1 );\ + a6 = _mm256_xor_si256( a6, b1 );\ + a7 = _mm256_xor_si256( a7, m256_const1_128( round_const_l7[i] ) );\ + \ + /* ShiftBytes + SubBytes (interleaved) */\ + b0 = _mm256_xor_si256( b0, b0 );\ + a0 = _mm256_shuffle_epi8( a0, SUBSH_MASK0_2WAY );\ + a0 = _mm256_aesenclast_epi128(a0, b0 );\ + a1 = _mm256_shuffle_epi8( a1, SUBSH_MASK1_2WAY );\ + a1 = _mm256_aesenclast_epi128(a1, b0 );\ + a2 = _mm256_shuffle_epi8( a2, SUBSH_MASK2_2WAY );\ + a2 = _mm256_aesenclast_epi128(a2, b0 );\ + a3 = _mm256_shuffle_epi8( a3, SUBSH_MASK3_2WAY );\ + a3 = _mm256_aesenclast_epi128(a3, b0 );\ + a4 = _mm256_shuffle_epi8( a4, SUBSH_MASK4_2WAY );\ + a4 = _mm256_aesenclast_epi128(a4, b0 );\ + a5 = _mm256_shuffle_epi8( a5, SUBSH_MASK5_2WAY );\ + a5 = _mm256_aesenclast_epi128(a5, b0 );\ + a6 = _mm256_shuffle_epi8( a6, SUBSH_MASK6_2WAY );\ + a6 = 
_mm256_aesenclast_epi128(a6, b0 );\ + a7 = _mm256_shuffle_epi8( a7, SUBSH_MASK7_2WAY );\ + a7 = _mm256_aesenclast_epi128( a7, b0 );\ + \ + /* MixBytes */\ + MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +\ +} + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q_2WAY(){\ + ROUND_2WAY(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND_2WAY(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND_2WAY(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND_2WAY(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND_2WAY(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND_2WAY(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND_2WAY(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND_2WAY(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND_2WAY(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND_2WAY(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ +} + +#define Matrix_Transpose_A_2way(i0, i1, i2, i3, o1, o2, o3, t0){\ + t0 = TRANSP_MASK_2WAY;\ + \ + i0 = _mm256_shuffle_epi8( i0, t0 );\ + i1 = _mm256_shuffle_epi8( i1, t0 );\ + i2 = _mm256_shuffle_epi8( i2, t0 );\ + i3 = _mm256_shuffle_epi8( i3, t0 );\ + \ + o1 = i0;\ + t0 = i2;\ + \ + i0 = _mm256_unpacklo_epi16( i0, i1 );\ + o1 = _mm256_unpackhi_epi16( o1, i1 );\ + i2 = _mm256_unpacklo_epi16( i2, i3 );\ + t0 = _mm256_unpackhi_epi16( t0, i3 );\ + \ + i0 = _mm256_shuffle_epi32( i0, 216 );\ + o1 = _mm256_shuffle_epi32( o1, 216 );\ + i2 = _mm256_shuffle_epi32( i2, 216 );\ + t0 = _mm256_shuffle_epi32( t0, 216 );\ + \ + o2 = i0;\ + o3 = o1;\ + \ + i0 = _mm256_unpacklo_epi32( i0, i2 );\ + o1 = _mm256_unpacklo_epi32( o1, t0 );\ + o2 = _mm256_unpackhi_epi32( o2, i2 );\ + o3 = _mm256_unpackhi_epi32( o3, t0 );\ +}/**/ + +#define Matrix_Transpose_B_2way(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + o1 = i0;\ + o2 = i1;\ + i0 = _mm256_unpacklo_epi64( i0, i4 );\ + o1 = _mm256_unpackhi_epi64( o1, i4 );\ + o3 = i1;\ + o4 = i2;\ + o2 = _mm256_unpacklo_epi64( o2, i5 );\ + o3 = _mm256_unpackhi_epi64( o3, i5 );\ + o5 = i2;\ + o6 = i3;\ + o4 = _mm256_unpacklo_epi64( o4, i6 );\ + o5 = _mm256_unpackhi_epi64( o5, i6 );\ + o7 = i3;\ + o6 = _mm256_unpacklo_epi64( o6, i7 );\ + o7 = _mm256_unpackhi_epi64( o7, i7 );\ +}/**/ + +#define Matrix_Transpose_B_INV_2way(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + o0 = i0;\ + i0 = _mm256_unpacklo_epi64( i0, i1 );\ + o0 = _mm256_unpackhi_epi64( o0, i1 );\ + o1 = i2;\ + i2 = _mm256_unpacklo_epi64( i2, i3 );\ + o1 = _mm256_unpackhi_epi64( o1, i3 );\ + o2 = i4;\ + i4 = _mm256_unpacklo_epi64( i4, i5 );\ + o2 = _mm256_unpackhi_epi64( o2, i5 );\ + o3 = i6;\ + i6 = _mm256_unpacklo_epi64( i6, i7 );\ + o3 = _mm256_unpackhi_epi64( o3, i7 );\ +}/**/ + +#define Matrix_Transpose_O_B_2way(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + t0 = _mm256_xor_si256( t0, t0 );\ + i1 = i0;\ + i3 = i2;\ + i5 = i4;\ + i7 = i6;\ + i0 = _mm256_unpacklo_epi64( i0, t0 );\ + i1 = _mm256_unpackhi_epi64( 
i1, t0 );\ + i2 = _mm256_unpacklo_epi64( i2, t0 );\ + i3 = _mm256_unpackhi_epi64( i3, t0 );\ + i4 = _mm256_unpacklo_epi64( i4, t0 );\ + i5 = _mm256_unpackhi_epi64( i5, t0 );\ + i6 = _mm256_unpacklo_epi64( i6, t0 );\ + i7 = _mm256_unpackhi_epi64( i7, t0 );\ +}/**/ + +#define Matrix_Transpose_O_B_INV_2way(i0, i1, i2, i3, i4, i5, i6, i7){\ + i0 = _mm256_unpacklo_epi64( i0, i1 );\ + i2 = _mm256_unpacklo_epi64( i2, i3 );\ + i4 = _mm256_unpacklo_epi64( i4, i5 );\ + i6 = _mm256_unpacklo_epi64( i6, i7 );\ +}/**/ + +void TF512_2way( __m256i* chaining, __m256i* message ) +{ + static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m256i TEMP0; + static __m256i TEMP1; + static __m256i TEMP2; + + /* load message into registers xmm12 - xmm15 */ + xmm12 = message[0]; + xmm13 = message[1]; + xmm14 = message[2]; + xmm15 = message[3]; + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ + Matrix_Transpose_A_2way(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* load previous chaining value */ + /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ + xmm8 = chaining[0]; + xmm0 = chaining[1]; + xmm4 = chaining[2]; + xmm5 = chaining[3]; + + /* xor message to CV get input of P */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + xmm8 = _mm256_xor_si256( xmm8, xmm12 ); + xmm0 = _mm256_xor_si256( xmm0, xmm2 ); + xmm4 = _mm256_xor_si256( xmm4, xmm6 ); + xmm5 = _mm256_xor_si256( xmm5, xmm7 ); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B_2way(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q_2WAY(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV_2way(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + xmm0 = _mm256_xor_si256( xmm0, xmm8 ); + xmm1 = _mm256_xor_si256( xmm1, xmm10 ); + xmm2 = _mm256_xor_si256( xmm2, xmm12 ); + xmm3 = _mm256_xor_si256( xmm3, xmm14 ); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + xmm0 = _mm256_xor_si256( xmm0, (chaining[0]) ); + xmm1 = _mm256_xor_si256( xmm1, (chaining[1]) ); + xmm2 = _mm256_xor_si256( xmm2, (chaining[2]) ); + xmm3 = _mm256_xor_si256( xmm3, (chaining[3]) ); + + /* store CV */ + chaining[0] = xmm0; + chaining[1] = xmm1; + chaining[2] = xmm2; + chaining[3] = xmm3; + + return; +} + +void OF512_2way( __m256i* chaining ) +{ + static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m256i TEMP0; + static __m256i TEMP1; + static __m256i TEMP2; + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + xmm8 = chaining[0]; + xmm10 = chaining[1]; + xmm12 = chaining[2]; + xmm14 = chaining[3]; + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B_2way(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0); + + /* 
compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q_2WAY(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV_2way(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + xmm8 = _mm256_xor_si256( xmm8, (chaining[0]) ); + xmm10 = _mm256_xor_si256( xmm10, (chaining[1]) ); + xmm12 = _mm256_xor_si256( xmm12, (chaining[2]) ); + xmm14 = _mm256_xor_si256( xmm14, (chaining[3]) ); + + /* transform state back from row ordering into column ordering */ + /* result: final hash value in xmm9, xmm11 */ + Matrix_Transpose_A_2way(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0); + + /* we only need to return the truncated half of the state */ + chaining[2] = xmm9; + chaining[3] = xmm11; +} + #endif // VAES -#endif // GROESTL512_INTR_4WAY_H__ +#endif // GROESTL256_INTR_4WAY_H__ diff --git a/algo/groestl/groestl512-hash-4way.c b/algo/groestl/groestl512-hash-4way.c index 8e5e139..bff6af5 100644 --- a/algo/groestl/groestl512-hash-4way.c +++ b/algo/groestl/groestl512-hash-4way.c @@ -15,7 +15,9 @@ #include "miner.h" #include "simd-utils.h" -#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(__AVX2__) && defined(__VAES__) + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen ) { @@ -137,5 +139,130 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output, return 0; } +#endif // AVX512 + + +// AVX2 + VAES + +int groestl512_2way_init( groestl512_2way_context* ctx, uint64_t hashlen ) +{ + if (ctx->chaining == NULL || ctx->buffer == NULL) + return 1; + + memset_zero_256( ctx->chaining, SIZE512 ); + memset_zero_256( ctx->buffer, SIZE512 ); + + // The only non-zero in the IV is len. It can be hard coded. 
+ ctx->chaining[ 6 ] = m256_const2_64( 0x0200000000000000, 0 ); + + ctx->buf_ptr = 0; + ctx->rem_ptr = 0; + + return 0; +} + +int groestl512_2way_update_close( groestl512_2way_context* ctx, void* output, + const void* input, uint64_t databitlen ) +{ + const int len = (int)databitlen / 128; + const int hashlen_m128i = 64 / 16; // bytes to __m128i + const int hash_offset = SIZE512 - hashlen_m128i; + int rem = ctx->rem_ptr; + int blocks = len / SIZE512; + __m256i* in = (__m256i*)input; + int i; + + // --- update --- + + for ( i = 0; i < blocks; i++ ) + TF1024_2way( ctx->chaining, &in[ i * SIZE512 ] ); + ctx->buf_ptr = blocks * SIZE512; + + for ( i = 0; i < len % SIZE512; i++ ) + ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ]; + i += rem; + + //--- final --- + + blocks++; // adjust for final block + + if ( i == SIZE512 - 1 ) + { + // only 1 vector left in buffer, all padding at once + ctx->buffer[i] = m256_const1_128( _mm_set_epi8( + blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) ); + } + else + { + ctx->buffer[i] = m256_const2_64( 0, 0x80 ); + for ( i += 1; i < SIZE512 - 1; i++ ) + ctx->buffer[i] = m256_zero; + ctx->buffer[i] = m256_const1_128( _mm_set_epi8( + blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) ); + } + + TF1024_2way( ctx->chaining, ctx->buffer ); + OF1024_2way( ctx->chaining ); + + for ( i = 0; i < hashlen_m128i; i++ ) + casti_m256i( output, i ) = ctx->chaining[ hash_offset + i ]; + + return 0; +} + +int groestl512_2way_full( groestl512_2way_context* ctx, void* output, + const void* input, uint64_t datalen ) +{ + const int len = (int)datalen >> 4; + const int hashlen_m128i = 64 >> 4; // bytes to __m128i + const int hash_offset = SIZE512 - hashlen_m128i; + uint64_t blocks = len / SIZE512; + __m256i* in = (__m256i*)input; + int i; + + // --- init --- + + memset_zero_256( ctx->chaining, SIZE512 ); + memset_zero_256( ctx->buffer, SIZE512 ); + ctx->chaining[ 6 ] = m256_const2_64( 0x0200000000000000, 0 ); + ctx->buf_ptr = 0; + ctx->rem_ptr = 0; + + // --- update --- + + for ( i = 0; i < blocks; i++ ) + TF1024_2way( ctx->chaining, &in[ i * SIZE512 ] ); + ctx->buf_ptr = blocks * SIZE512; + + for ( i = 0; i < len % SIZE512; i++ ) + ctx->buffer[ ctx->rem_ptr + i ] = in[ ctx->buf_ptr + i ]; + i += ctx->rem_ptr; + + // --- close --- + + blocks++; + + if ( i == SIZE512 - 1 ) + { + // only 1 vector left in buffer, all padding at once + ctx->buffer[i] = m256_const2_64( blocks << 56, 0x80 ); + } + else + { + ctx->buffer[i] = m256_const2_64( 0, 0x80 ); + for ( i += 1; i < SIZE512 - 1; i++ ) + ctx->buffer[i] = m256_zero; + ctx->buffer[i] = m256_const2_64( blocks << 56, 0 ); + } + + TF1024_2way( ctx->chaining, ctx->buffer ); + OF1024_2way( ctx->chaining ); + + for ( i = 0; i < hashlen_m128i; i++ ) + casti_m256i( output, i ) = ctx->chaining[ hash_offset + i ]; + + return 0; +} + #endif // VAES diff --git a/algo/groestl/groestl512-hash-4way.h b/algo/groestl/groestl512-hash-4way.h index 68ac7e5..7025428 100644 --- a/algo/groestl/groestl512-hash-4way.h +++ b/algo/groestl/groestl512-hash-4way.h @@ -10,7 +10,7 @@ #endif #include -#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(__AVX2__) && defined(__VAES__) #define LENGTH (512) @@ -36,20 +36,19 @@ #define SIZE512 (SIZE_1024/16) +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + typedef struct { __attribute__ ((aligned (128))) __m512i chaining[SIZE512]; __attribute__ ((aligned (64))) __m512i 
buffer[SIZE512]; int blk_count; // SIZE_m128i int buf_ptr; // __m128i offset int rem_ptr; - int databitlen; // bits } groestl512_4way_context; int groestl512_4way_init( groestl512_4way_context*, uint64_t ); -//int reinit_groestl( hashState_groestl* ); - int groestl512_4way_update( groestl512_4way_context*, const void*, uint64_t ); int groestl512_4way_close( groestl512_4way_context*, void* ); @@ -58,5 +57,29 @@ int groestl512_4way_update_close( groestl512_4way_context*, void*, int groestl512_4way_full( groestl512_4way_context*, void*, const void*, uint64_t ); +#endif // AVX512 + +// AVX2 + VAES + +typedef struct { + __attribute__ ((aligned (128))) __m256i chaining[SIZE512]; + __attribute__ ((aligned (64))) __m256i buffer[SIZE512]; + int blk_count; // SIZE_m128i + int buf_ptr; // __m128i offset + int rem_ptr; +} groestl512_2way_context; + + +int groestl512_2way_init( groestl512_2way_context*, uint64_t ); + +int groestl512_2way_update( groestl512_2way_context*, const void*, + uint64_t ); +int groestl512_2way_close( groestl512_2way_context*, void* ); +int groestl512_2way_update_close( groestl512_2way_context*, void*, + const void*, uint64_t ); +int groestl512_2way_full( groestl512_2way_context*, void*, + const void*, uint64_t ); + + #endif // VAES #endif // GROESTL512_HASH_4WAY_H__ diff --git a/algo/groestl/groestl512-intr-4way.h b/algo/groestl/groestl512-intr-4way.h index 96788f4..5d8d715 100644 --- a/algo/groestl/groestl512-intr-4way.h +++ b/algo/groestl/groestl512-intr-4way.h @@ -12,7 +12,7 @@ #include "groestl512-hash-4way.h" -#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(__AVX2__) && defined(__VAES__) static const __m128i round_const_p[] __attribute__ ((aligned (64))) = { @@ -50,6 +50,8 @@ static const __m128i round_const_q[] __attribute__ ((aligned (64))) = { 0x8292a2b2c2d2e2f2, 0x0212223242526272 } }; +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02, 0x1d1519111c141810, 0x1f171b131e161a12, 0x2d2529212c242820, 0x2f272b232e262a22, @@ -660,5 +662,578 @@ void OF1024_4way( __m512i* chaining ) return; } +#endif // AVX512 + +// AVX2 + VAES + +static const __m256i TRANSP_MASK_2WAY = + { 0x0d0509010c040800, 0x0f070b030e060a02, + 0x1d1519111c141810, 0x1f171b131e161a12 }; + +static const __m256i SUBSH_MASK0_2WAY = + { 0x0b0e0104070a0d00, 0x0306090c0f020508, + 0x1b1e1114171a1d10, 0x1316191c1f121518 }; + +static const __m256i SUBSH_MASK1_2WAY = + { 0x0c0f0205080b0e01, 0x04070a0d00030609, + 0x1c1f1215181b1e11, 0x14171a1d10131619 }; + +static const __m256i SUBSH_MASK2_2WAY = + { 0x0d000306090c0f02, 0x05080b0e0104070a, + 0x1d101316191c1f12, 0x15181b1e1114171a }; + +static const __m256i SUBSH_MASK3_2WAY = + { 0x0e0104070a0d0003, 0x06090c0f0205080b, + 0x1e1114171a1d1013, 0x16191c1f1215181b }; + +static const __m256i SUBSH_MASK4_2WAY = + { 0x0f0205080b0e0104, 0x070a0d000306090c, + 0x1f1215181b1e1114, 0x171a1d101316191c }; + +static const __m256i SUBSH_MASK5_2WAY = + { 0x000306090c0f0205, 0x080b0e0104070a0d, + 0x101316191c1f1215, 0x181b1e1114171a1d }; + +static const __m256i SUBSH_MASK6_2WAY = + { 0x0104070a0d000306, 0x090c0f0205080b0e, + 0x1114171a1d101316, 0x191c1f1215181b1e }; + +static const __m256i SUBSH_MASK7_2WAY = + { 0x06090c0f0205080b, 0x0e0104070a0d0003, + 0x16191c1f1215181b, 0x1e1114171a1d1013 }; + +#define tos(a) #a +#define tostr(a) tos(a) + +/* xmm[i] will be multiplied by 2 + 
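* in GF(2^8): each byte is shifted left one bit and any byte whose msb
+ * was set picks up an extra xor with the AES reduction polynomial 0x1b;
+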
* xmm[j] will be lost + * xmm[k] has to be all 0x1b */ +#define MUL2_2WAY(i, j, k){\ + j = _mm256_xor_si256(j, j);\ + j = _mm256_cmpgt_epi8(j, i );\ + i = _mm256_add_epi8(i, i);\ + j = _mm256_and_si256(j, k);\ + i = _mm256_xor_si256(i, j);\ +} + +#define MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* t_i = a_i + a_{i+1} */\ + b6 = a0;\ + b7 = a1;\ + a0 = _mm256_xor_si256(a0, a1);\ + b0 = a2;\ + a1 = _mm256_xor_si256(a1, a2);\ + b1 = a3;\ + a2 = _mm256_xor_si256(a2, a3);\ + b2 = a4;\ + a3 = _mm256_xor_si256(a3, a4);\ + b3 = a5;\ + a4 = _mm256_xor_si256(a4, a5);\ + b4 = a6;\ + a5 = _mm256_xor_si256(a5, a6);\ + b5 = a7;\ + a6 = _mm256_xor_si256(a6, a7);\ + a7 = _mm256_xor_si256(a7, b6);\ + \ + /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ + b0 = _mm256_xor_si256(b0, a4);\ + b6 = _mm256_xor_si256(b6, a4);\ + b1 = _mm256_xor_si256(b1, a5);\ + b7 = _mm256_xor_si256(b7, a5);\ + b2 = _mm256_xor_si256(b2, a6);\ + b0 = _mm256_xor_si256(b0, a6);\ + /* spill values y_4, y_5 to memory */\ + TEMP0 = b0;\ + b3 = _mm256_xor_si256(b3, a7);\ + b1 = _mm256_xor_si256(b1, a7);\ + TEMP1 = b1;\ + b4 = _mm256_xor_si256(b4, a0);\ + b2 = _mm256_xor_si256(b2, a0);\ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0;\ + b5 = _mm256_xor_si256(b5, a1);\ + b3 = _mm256_xor_si256(b3, a1);\ + b1 = a1;\ + b6 = _mm256_xor_si256(b6, a2);\ + b4 = _mm256_xor_si256(b4, a2);\ + TEMP2 = a2;\ + b7 = _mm256_xor_si256(b7, a3);\ + b5 = _mm256_xor_si256(b5, a3);\ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm256_xor_si256(a0, a3);\ + a1 = _mm256_xor_si256(a1, a4);\ + a2 = _mm256_xor_si256(a2, a5);\ + a3 = _mm256_xor_si256(a3, a6);\ + a4 = _mm256_xor_si256(a4, a7);\ + a5 = _mm256_xor_si256(a5, b0);\ + a6 = _mm256_xor_si256(a6, b1);\ + a7 = _mm256_xor_si256(a7, TEMP2);\ + \ + /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + /* compute w_i : add y_{i+4} */\ + b1 = m256_const1_64( 0x1b1b1b1b1b1b1b1b );\ + MUL2_2WAY(a0, b0, b1);\ + a0 = _mm256_xor_si256(a0, TEMP0);\ + MUL2_2WAY(a1, b0, b1);\ + a1 = _mm256_xor_si256(a1, TEMP1);\ + MUL2_2WAY(a2, b0, b1);\ + a2 = _mm256_xor_si256(a2, b2);\ + MUL2_2WAY(a3, b0, b1);\ + a3 = _mm256_xor_si256(a3, b3);\ + MUL2_2WAY(a4, b0, b1);\ + a4 = _mm256_xor_si256(a4, b4);\ + MUL2_2WAY(a5, b0, b1);\ + a5 = _mm256_xor_si256(a5, b5);\ + MUL2_2WAY(a6, b0, b1);\ + a6 = _mm256_xor_si256(a6, b6);\ + MUL2_2WAY(a7, b0, b1);\ + a7 = _mm256_xor_si256(a7, b7);\ + \ + /* compute v_i : double w_i */\ + /* add to y_4 y_5 .. v3, v4, ... 
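(b2..b7 pick up the doubled values; b0 and b1 are rebuilt at the end from the spilled TEMP0/TEMP1)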
*/\ + MUL2_2WAY(a0, b0, b1);\ + b5 = _mm256_xor_si256(b5, a0);\ + MUL2_2WAY(a1, b0, b1);\ + b6 = _mm256_xor_si256(b6, a1);\ + MUL2_2WAY(a2, b0, b1);\ + b7 = _mm256_xor_si256(b7, a2);\ + MUL2_2WAY(a5, b0, b1);\ + b2 = _mm256_xor_si256(b2, a5);\ + MUL2_2WAY(a6, b0, b1);\ + b3 = _mm256_xor_si256(b3, a6);\ + MUL2_2WAY(a7, b0, b1);\ + b4 = _mm256_xor_si256(b4, a7);\ + MUL2_2WAY(a3, b0, b1);\ + MUL2_2WAY(a4, b0, b1);\ + b0 = TEMP0;\ + b1 = TEMP1;\ + b0 = _mm256_xor_si256(b0, a3);\ + b1 = _mm256_xor_si256(b1, a4);\ +}/*MixBytes*/ + +/* one round + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define SUBMIX_2WAY(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* SubBytes */\ + b0 = _mm256_xor_si256( b0, b0 );\ + a0 = _mm256_aesenclast_epi128( a0, b0 );\ + a1 = _mm256_aesenclast_epi128( a1, b0 );\ + a2 = _mm256_aesenclast_epi128( a2, b0 );\ + a3 = _mm256_aesenclast_epi128( a3, b0 );\ + a4 = _mm256_aesenclast_epi128( a4, b0 );\ + a5 = _mm256_aesenclast_epi128( a5, b0 );\ + a6 = _mm256_aesenclast_epi128( a6, b0 );\ + a7 = _mm256_aesenclast_epi128( a7, b0 );\ + /* MixBytes */\ + MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +} + +#define ROUNDS_P_2WAY(){\ + uint8_t round_counter = 0;\ + for ( round_counter = 0; round_counter < 14; round_counter += 2 ) \ + { \ + /* AddRoundConstant P1024 */\ + xmm8 = _mm256_xor_si256( xmm8, m256_const1_128( \ + casti_m128i( round_const_p, round_counter ) ) ); \ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK0_2WAY ); \ + xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK1_2WAY );\ + xmm10 = _mm256_shuffle_epi8( xmm10, SUBSH_MASK2_2WAY );\ + xmm11 = _mm256_shuffle_epi8( xmm11, SUBSH_MASK3_2WAY );\ + xmm12 = _mm256_shuffle_epi8( xmm12, SUBSH_MASK4_2WAY );\ + xmm13 = _mm256_shuffle_epi8( xmm13, SUBSH_MASK5_2WAY );\ + xmm14 = _mm256_shuffle_epi8( xmm14, SUBSH_MASK6_2WAY );\ + xmm15 = _mm256_shuffle_epi8( xmm15, SUBSH_MASK7_2WAY );\ + /* SubBytes + MixBytes */\ + SUBMIX_2WAY(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + \ + /* AddRoundConstant P1024 */\ + xmm0 = _mm256_xor_si256( xmm0, m256_const1_128( \ + casti_m128i( round_const_p, round_counter+1 ) ) ); \ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK0_2WAY );\ + xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK1_2WAY );\ + xmm2 = _mm256_shuffle_epi8( xmm2, SUBSH_MASK2_2WAY );\ + xmm3 = _mm256_shuffle_epi8( xmm3, SUBSH_MASK3_2WAY );\ + xmm4 = _mm256_shuffle_epi8( xmm4, SUBSH_MASK4_2WAY );\ + xmm5 = _mm256_shuffle_epi8( xmm5, SUBSH_MASK5_2WAY );\ + xmm6 = _mm256_shuffle_epi8( xmm6, SUBSH_MASK6_2WAY );\ + xmm7 = _mm256_shuffle_epi8( xmm7, SUBSH_MASK7_2WAY );\ + /* SubBytes + MixBytes */\ + SUBMIX_2WAY(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + }\ +} + +#define ROUNDS_Q_2WAY(){\ + uint8_t round_counter = 0;\ + for ( round_counter = 0; round_counter < 14; round_counter += 2) \ + { \ + /* AddRoundConstant Q1024 */\ + xmm1 = m256_neg1;\ + xmm8 = _mm256_xor_si256( xmm8, xmm1 );\ + xmm9 = _mm256_xor_si256( xmm9, xmm1 );\ + xmm10 = _mm256_xor_si256( xmm10, xmm1 );\ + xmm11 = _mm256_xor_si256( xmm11, xmm1 );\ + xmm12 = _mm256_xor_si256( xmm12, xmm1 );\ + xmm13 = _mm256_xor_si256( xmm13, xmm1 );\ + xmm14 = _mm256_xor_si256( xmm14, xmm1 );\ + xmm15 = _mm256_xor_si256( xmm15, m256_const1_128( \ + casti_m128i( round_const_q, round_counter ) ) ); \ + /* ShiftBytes Q1024 + 
pre-AESENCLAST */\ + xmm8 = _mm256_shuffle_epi8( xmm8, SUBSH_MASK1_2WAY );\ + xmm9 = _mm256_shuffle_epi8( xmm9, SUBSH_MASK3_2WAY );\ + xmm10 = _mm256_shuffle_epi8( xmm10, SUBSH_MASK5_2WAY );\ + xmm11 = _mm256_shuffle_epi8( xmm11, SUBSH_MASK7_2WAY );\ + xmm12 = _mm256_shuffle_epi8( xmm12, SUBSH_MASK0_2WAY );\ + xmm13 = _mm256_shuffle_epi8( xmm13, SUBSH_MASK2_2WAY );\ + xmm14 = _mm256_shuffle_epi8( xmm14, SUBSH_MASK4_2WAY );\ + xmm15 = _mm256_shuffle_epi8( xmm15, SUBSH_MASK6_2WAY );\ + /* SubBytes + MixBytes */\ + SUBMIX_2WAY(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + \ + /* AddRoundConstant Q1024 */\ + xmm9 = m256_neg1;\ + xmm0 = _mm256_xor_si256( xmm0, xmm9 );\ + xmm1 = _mm256_xor_si256( xmm1, xmm9 );\ + xmm2 = _mm256_xor_si256( xmm2, xmm9 );\ + xmm3 = _mm256_xor_si256( xmm3, xmm9 );\ + xmm4 = _mm256_xor_si256( xmm4, xmm9 );\ + xmm5 = _mm256_xor_si256( xmm5, xmm9 );\ + xmm6 = _mm256_xor_si256( xmm6, xmm9 );\ + xmm7 = _mm256_xor_si256( xmm7, m256_const1_128( \ + casti_m128i( round_const_q, round_counter+1 ) ) ); \ + /* ShiftBytes Q1024 + pre-AESENCLAST */\ + xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK1_2WAY );\ + xmm1 = _mm256_shuffle_epi8( xmm1, SUBSH_MASK3_2WAY );\ + xmm2 = _mm256_shuffle_epi8( xmm2, SUBSH_MASK5_2WAY );\ + xmm3 = _mm256_shuffle_epi8( xmm3, SUBSH_MASK7_2WAY );\ + xmm4 = _mm256_shuffle_epi8( xmm4, SUBSH_MASK0_2WAY );\ + xmm5 = _mm256_shuffle_epi8( xmm5, SUBSH_MASK2_2WAY );\ + xmm6 = _mm256_shuffle_epi8( xmm6, SUBSH_MASK4_2WAY );\ + xmm7 = _mm256_shuffle_epi8( xmm7, SUBSH_MASK6_2WAY );\ + /* SubBytes + MixBytes */\ + SUBMIX_2WAY(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + }\ +} + +#define Matrix_Transpose_2way(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ + t0 = TRANSP_MASK_2WAY;\ +\ + i6 = _mm256_shuffle_epi8(i6, t0);\ + i0 = _mm256_shuffle_epi8(i0, t0);\ + i1 = _mm256_shuffle_epi8(i1, t0);\ + i2 = _mm256_shuffle_epi8(i2, t0);\ + i3 = _mm256_shuffle_epi8(i3, t0);\ + t1 = i2;\ + i4 = _mm256_shuffle_epi8(i4, t0);\ + i5 = _mm256_shuffle_epi8(i5, t0);\ + t2 = i4;\ + t3 = i6;\ + i7 = _mm256_shuffle_epi8(i7, t0);\ +\ + /* continue with unpack using 4 temp registers */\ + t0 = i0;\ + t2 = _mm256_unpackhi_epi16(t2, i5);\ + i4 = _mm256_unpacklo_epi16(i4, i5);\ + t3 = _mm256_unpackhi_epi16(t3, i7);\ + i6 = _mm256_unpacklo_epi16(i6, i7);\ + t0 = _mm256_unpackhi_epi16(t0, i1);\ + t1 = _mm256_unpackhi_epi16(t1, i3);\ + i2 = _mm256_unpacklo_epi16(i2, i3);\ + i0 = _mm256_unpacklo_epi16(i0, i1);\ +\ + /* shuffle with immediate */\ + t0 = _mm256_shuffle_epi32(t0, 216);\ + t1 = _mm256_shuffle_epi32(t1, 216);\ + t2 = _mm256_shuffle_epi32(t2, 216);\ + t3 = _mm256_shuffle_epi32(t3, 216);\ + i0 = _mm256_shuffle_epi32(i0, 216);\ + i2 = _mm256_shuffle_epi32(i2, 216);\ + i4 = _mm256_shuffle_epi32(i4, 216);\ + i6 = _mm256_shuffle_epi32(i6, 216);\ +\ + /* continue with unpack */\ + t4 = i0;\ + i0 = _mm256_unpacklo_epi32(i0, i2);\ + t4 = _mm256_unpackhi_epi32(t4, i2);\ + t5 = t0;\ + t0 = _mm256_unpacklo_epi32(t0, t1);\ + t5 = _mm256_unpackhi_epi32(t5, t1);\ + t6 = i4;\ + i4 = _mm256_unpacklo_epi32(i4, i6);\ + t7 = t2;\ + t6 = _mm256_unpackhi_epi32(t6, i6);\ + i2 = t0;\ + t2 = _mm256_unpacklo_epi32(t2, t3);\ + i3 = t0;\ + t7 = _mm256_unpackhi_epi32(t7, t3);\ +\ + /* there are now 2 rows in each xmm */\ + /* unpack to get 1 row of CV in each xmm */\ + i1 = i0;\ + i1 = _mm256_unpackhi_epi64(i1, i4);\ + i0 = _mm256_unpacklo_epi64(i0, i4);\ + i4 = t4;\ + i3 = 
_mm256_unpackhi_epi64(i3, t2);\ + i5 = t4;\ + i2 = _mm256_unpacklo_epi64(i2, t2);\ + i6 = t5;\ + i5 = _mm256_unpackhi_epi64(i5, t6);\ + i7 = t5;\ + i4 = _mm256_unpacklo_epi64(i4, t6);\ + i7 = _mm256_unpackhi_epi64(i7, t7);\ + i6 = _mm256_unpacklo_epi64(i6, t7);\ + /* transpose done */\ +}/**/ + +#define Matrix_Transpose_INV_2way(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ + /* transpose matrix to get output format */\ + o1 = i0;\ + i0 = _mm256_unpacklo_epi64(i0, i1);\ + o1 = _mm256_unpackhi_epi64(o1, i1);\ + t0 = i2;\ + i2 = _mm256_unpacklo_epi64(i2, i3);\ + t0 = _mm256_unpackhi_epi64(t0, i3);\ + t1 = i4;\ + i4 = _mm256_unpacklo_epi64(i4, i5);\ + t1 = _mm256_unpackhi_epi64(t1, i5);\ + t2 = i6;\ + o0 = TRANSP_MASK_2WAY;\ + i6 = _mm256_unpacklo_epi64(i6, i7);\ + t2 = _mm256_unpackhi_epi64(t2, i7);\ + /* load transpose mask into a register, because it will be used 8 times */\ + i0 = _mm256_shuffle_epi8(i0, o0);\ + i2 = _mm256_shuffle_epi8(i2, o0);\ + i4 = _mm256_shuffle_epi8(i4, o0);\ + i6 = _mm256_shuffle_epi8(i6, o0);\ + o1 = _mm256_shuffle_epi8(o1, o0);\ + t0 = _mm256_shuffle_epi8(t0, o0);\ + t1 = _mm256_shuffle_epi8(t1, o0);\ + t2 = _mm256_shuffle_epi8(t2, o0);\ + /* continue with unpack using 4 temp registers */\ + t3 = i4;\ + o2 = o1;\ + o0 = i0;\ + t4 = t1;\ + \ + t3 = _mm256_unpackhi_epi16(t3, i6);\ + i4 = _mm256_unpacklo_epi16(i4, i6);\ + o0 = _mm256_unpackhi_epi16(o0, i2);\ + i0 = _mm256_unpacklo_epi16(i0, i2);\ + o2 = _mm256_unpackhi_epi16(o2, t0);\ + o1 = _mm256_unpacklo_epi16(o1, t0);\ + t4 = _mm256_unpackhi_epi16(t4, t2);\ + t1 = _mm256_unpacklo_epi16(t1, t2);\ + /* shuffle with immediate */\ + i4 = _mm256_shuffle_epi32(i4, 216);\ + t3 = _mm256_shuffle_epi32(t3, 216);\ + o1 = _mm256_shuffle_epi32(o1, 216);\ + o2 = _mm256_shuffle_epi32(o2, 216);\ + i0 = _mm256_shuffle_epi32(i0, 216);\ + o0 = _mm256_shuffle_epi32(o0, 216);\ + t1 = _mm256_shuffle_epi32(t1, 216);\ + t4 = _mm256_shuffle_epi32(t4, 216);\ + /* continue with unpack */\ + i1 = i0;\ + i3 = o0;\ + i5 = o1;\ + i7 = o2;\ + i0 = _mm256_unpacklo_epi32(i0, i4);\ + i1 = _mm256_unpackhi_epi32(i1, i4);\ + o0 = _mm256_unpacklo_epi32(o0, t3);\ + i3 = _mm256_unpackhi_epi32(i3, t3);\ + o1 = _mm256_unpacklo_epi32(o1, t1);\ + i5 = _mm256_unpackhi_epi32(i5, t1);\ + o2 = _mm256_unpacklo_epi32(o2, t4);\ + i7 = _mm256_unpackhi_epi32(i7, t4);\ + /* transpose done */\ +}/**/ + +void INIT_2way( __m256i *chaining ) +{ + static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + + /* load IV into registers xmm8 - xmm15 */ + xmm8 = chaining[0]; + xmm9 = chaining[1]; + xmm10 = chaining[2]; + xmm11 = chaining[3]; + xmm12 = chaining[4]; + xmm13 = chaining[5]; + xmm14 = chaining[6]; + xmm15 = chaining[7]; + + /* transform chaining value from column ordering into row ordering */ + Matrix_Transpose_2way(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + + /* store transposed IV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; +} + +void TF1024_2way( __m256i *chaining, const __m256i *message ) +{ + static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m256i QTEMP[8]; + static __m256i TEMP0; + static __m256i TEMP1; + static __m256i TEMP2; + + /* load message into registers xmm8 - 
xmm15 (Q = message) */ + xmm8 = message[0]; + xmm9 = message[1]; + xmm10 = message[2]; + xmm11 = message[3]; + xmm12 = message[4]; + xmm13 = message[5]; + xmm14 = message[6]; + xmm15 = message[7]; + + /* transform message M from column ordering into row ordering */ + Matrix_Transpose_2way(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + + /* store message M (Q input) for later */ + QTEMP[0] = xmm8; + QTEMP[1] = xmm9; + QTEMP[2] = xmm10; + QTEMP[3] = xmm11; + QTEMP[4] = xmm12; + QTEMP[5] = xmm13; + QTEMP[6] = xmm14; + QTEMP[7] = xmm15; + + /* xor CV to message to get P input */ + /* result: CV+M in xmm8...xmm15 */ + xmm8 = _mm256_xor_si256( xmm8, (chaining[0]) ); + xmm9 = _mm256_xor_si256( xmm9, (chaining[1]) ); + xmm10 = _mm256_xor_si256( xmm10, (chaining[2]) ); + xmm11 = _mm256_xor_si256( xmm11, (chaining[3]) ); + xmm12 = _mm256_xor_si256( xmm12, (chaining[4]) ); + xmm13 = _mm256_xor_si256( xmm13, (chaining[5]) ); + xmm14 = _mm256_xor_si256( xmm14, (chaining[6]) ); + xmm15 = _mm256_xor_si256( xmm15, (chaining[7]) ); + + /* compute permutation P */ + /* result: P(CV+M) in xmm8...xmm15 */ + ROUNDS_P_2WAY(); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV+M)+CV in xmm8...xmm15 */ + xmm8 = _mm256_xor_si256( xmm8, (chaining[0]) ); + xmm9 = _mm256_xor_si256( xmm9, (chaining[1]) ); + xmm10 = _mm256_xor_si256( xmm10, (chaining[2]) ); + xmm11 = _mm256_xor_si256( xmm11, (chaining[3]) ); + xmm12 = _mm256_xor_si256( xmm12, (chaining[4]) ); + xmm13 = _mm256_xor_si256( xmm13, (chaining[5]) ); + xmm14 = _mm256_xor_si256( xmm14, (chaining[6]) ); + xmm15 = _mm256_xor_si256( xmm15, (chaining[7]) ); + + /* store P(CV+M)+CV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; + + /* load message M (Q input) into xmm8-15 */ + xmm8 = QTEMP[0]; + xmm9 = QTEMP[1]; + xmm10 = QTEMP[2]; + xmm11 = QTEMP[3]; + xmm12 = QTEMP[4]; + xmm13 = QTEMP[5]; + xmm14 = QTEMP[6]; + xmm15 = QTEMP[7]; + + /* compute permutation Q */ + /* result: Q(M) in xmm8...xmm15 */ + ROUNDS_Q_2WAY(); + + /* xor Q output */ + /* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */ + xmm8 = _mm256_xor_si256( xmm8, (chaining[0]) ); + xmm9 = _mm256_xor_si256( xmm9, (chaining[1]) ); + xmm10 = _mm256_xor_si256( xmm10, (chaining[2]) ); + xmm11 = _mm256_xor_si256( xmm11, (chaining[3]) ); + xmm12 = _mm256_xor_si256( xmm12, (chaining[4]) ); + xmm13 = _mm256_xor_si256( xmm13, (chaining[5]) ); + xmm14 = _mm256_xor_si256( xmm14, (chaining[6]) ); + xmm15 = _mm256_xor_si256( xmm15, (chaining[7]) ); + + /* store CV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; + + return; +} + +void OF1024_2way( __m256i* chaining ) +{ + static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m256i TEMP0; + static __m256i TEMP1; + static __m256i TEMP2; + + /* load CV into registers xmm8 - xmm15 */ + xmm8 = chaining[0]; + xmm9 = chaining[1]; + xmm10 = chaining[2]; + xmm11 = chaining[3]; + xmm12 = chaining[4]; + xmm13 = chaining[5]; + xmm14 = chaining[6]; + xmm15 = chaining[7]; + + /* compute permutation P */ + /* result: P(CV) in xmm8...xmm15 */ + ROUNDS_P_2WAY(); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8...xmm15 */ + xmm8 = 
_mm256_xor_si256( xmm8, (chaining[0]) ); + xmm9 = _mm256_xor_si256( xmm9, (chaining[1]) ); + xmm10 = _mm256_xor_si256( xmm10, (chaining[2]) ); + xmm11 = _mm256_xor_si256( xmm11, (chaining[3]) ); + xmm12 = _mm256_xor_si256( xmm12, (chaining[4]) ); + xmm13 = _mm256_xor_si256( xmm13, (chaining[5]) ); + xmm14 = _mm256_xor_si256( xmm14, (chaining[6]) ); + xmm15 = _mm256_xor_si256( xmm15, (chaining[7]) ); + + /* transpose CV back from row ordering to column ordering */ + /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */ + Matrix_Transpose_INV_2way(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm4, xmm0, xmm6, xmm1, xmm2, xmm3, xmm5, xmm7); + + /* we only need to return the truncated half of the state */ + chaining[4] = xmm0; + chaining[5] = xmm6; + chaining[6] = xmm13; + chaining[7] = xmm15; + + return; +} + + + #endif // VAES #endif // GROESTL512_INTR_4WAY_H__ diff --git a/algo/lyra2/allium-4way.c b/algo/lyra2/allium-4way.c index de8ca72..833b87e 100644 --- a/algo/lyra2/allium-4way.c +++ b/algo/lyra2/allium-4way.c @@ -174,24 +174,19 @@ void allium_16way_hash( void *state, const void *input ) #if defined(__VAES__) intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 256 ); - - groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 ); - + groestl256_4way_full( &ctx.groestl, vhash, vhash, 32 ); dintrlv_4x128( state, state+32, state+64, state+96, vhash, 256 ); + intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 256 ); - - groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 ); - + groestl256_4way_full( &ctx.groestl, vhash, vhash, 32 ); dintrlv_4x128( state+128, state+160, state+192, state+224, vhash, 256 ); + intrlv_4x128( vhash, hash8, hash9, hash10, hash11, 256 ); - - groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 ); - + groestl256_4way_full( &ctx.groestl, vhash, vhash, 32 ); dintrlv_4x128( state+256, state+288, state+320, state+352, vhash, 256 ); - intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 ); - groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 ); - + intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 ); + groestl256_4way_full( &ctx.groestl, vhash, vhash, 32 ); dintrlv_4x128( state+384, state+416, state+448, state+480, vhash, 256 ); #else @@ -262,8 +257,11 @@ typedef struct { keccak256_4way_context keccak; cubehashParam cube; skein256_4way_context skein; +#if defined(__VAES__) + groestl256_2way_context groestl; +#else hashState_groestl256 groestl; - +#endif } allium_8way_ctx_holder; static __thread allium_8way_ctx_holder allium_8way_ctx; @@ -273,7 +271,11 @@ bool init_allium_8way_ctx() keccak256_4way_init( &allium_8way_ctx.keccak ); cubehashInit( &allium_8way_ctx.cube, 256, 16, 32 ); skein256_4way_init( &allium_8way_ctx.skein ); +#if defined(__VAES__) + groestl256_2way_init( &allium_8way_ctx.groestl, 32 ); +#else init_groestl256( &allium_8way_ctx.groestl, 32 ); +#endif return true; } @@ -352,9 +354,28 @@ void allium_8way_hash( void *hash, const void *input ) skein256_4way_update( &ctx.skein, vhashB, 32 ); skein256_4way_close( &ctx.skein, vhashB ); +#if defined(__VAES__) + + uint64_t vhashC[4*2] __attribute__ ((aligned (64))); + uint64_t vhashD[4*2] __attribute__ ((aligned (64))); + + rintrlv_4x64_2x128( vhashC, vhashD, vhashA, 256 ); + groestl256_2way_full( &ctx.groestl, vhashC, vhashC, 32 ); + groestl256_2way_full( &ctx.groestl, vhashD, vhashD, 32 ); + dintrlv_2x128( hash0, hash1, vhashC, 256 ); + dintrlv_2x128( hash2, hash3, vhashD, 256 ); + + rintrlv_4x64_2x128( vhashC, vhashD, vhashB, 256 ); + groestl256_2way_full( &ctx.groestl, vhashC, vhashC, 32 
); + groestl256_2way_full( &ctx.groestl, vhashD, vhashD, 32 ); + dintrlv_2x128( hash4, hash5, vhashC, 256 ); + dintrlv_2x128( hash6, hash7, vhashD, 256 ); + +#else + dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 ); dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 ); - + groestl256_full( &ctx.groestl, hash0, hash0, 256 ); groestl256_full( &ctx.groestl, hash1, hash1, 256 ); groestl256_full( &ctx.groestl, hash2, hash2, 256 ); @@ -363,6 +384,8 @@ void allium_8way_hash( void *hash, const void *input ) groestl256_full( &ctx.groestl, hash5, hash5, 256 ); groestl256_full( &ctx.groestl, hash6, hash6, 256 ); groestl256_full( &ctx.groestl, hash7, hash7, 256 ); + +#endif } int scanhash_allium_8way( struct work *work, uint32_t max_nonce, diff --git a/algo/lyra2/lyra2-gate.c b/algo/lyra2/lyra2-gate.c index ad62d05..c1d70e7 100644 --- a/algo/lyra2/lyra2-gate.c +++ b/algo/lyra2/lyra2-gate.c @@ -187,7 +187,8 @@ bool register_allium_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_allium; gate->hash = (void*)&allium_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT + | VAES_OPT | VAES256_OPT; opt_target_factor = 256.0; return true; }; diff --git a/algo/sha/sha256t-gate.c b/algo/sha/sha256t-gate.c index ba7f95d..15ce7db 100644 --- a/algo/sha/sha256t-gate.c +++ b/algo/sha/sha256t-gate.c @@ -3,36 +3,38 @@ bool register_sha256t_algo( algo_gate_t* gate ) { #if defined(SHA256T_8WAY) - gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_sha256t_8way; gate->hash = (void*)&sha256t_8way_hash; -#elif defined(SHA256T_4WAY) - gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT; +#else gate->scanhash = (void*)&scanhash_sha256t_4way; gate->hash = (void*)&sha256t_4way_hash; +/* #else gate->optimizations = SHA_OPT; gate->scanhash = (void*)&scanhash_sha256t; gate->hash = (void*)&sha256t_hash; +*/ #endif + gate->optimizations = SSE2_OPT | AVX2_OPT; return true; } bool register_sha256q_algo( algo_gate_t* gate ) { #if defined(SHA256T_8WAY) - gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_sha256q_8way; gate->hash = (void*)&sha256q_8way_hash; -#elif defined(SHA256T_4WAY) - gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT; +#else gate->scanhash = (void*)&scanhash_sha256q_4way; gate->hash = (void*)&sha256q_4way_hash; +/* #else gate->optimizations = SHA_OPT; gate->scanhash = (void*)&scanhash_sha256q; gate->hash = (void*)&sha256q_hash; +*/ #endif + gate->optimizations = SSE2_OPT | AVX2_OPT; return true; } diff --git a/algo/sha/sha256t-gate.h b/algo/sha/sha256t-gate.h index 0d519aa..cb06f5a 100644 --- a/algo/sha/sha256t-gate.h +++ b/algo/sha/sha256t-gate.h @@ -4,13 +4,10 @@ #include #include "algo-gate-api.h" -// Override multi way on ryzen, SHA is better. 
-#if !defined(__SHA__) - #if defined(__AVX2__) +#if defined(__AVX2__) #define SHA256T_8WAY - #elif defined(__SSE2__) +#else #define SHA256T_4WAY - #endif #endif bool register_sha256t_algo( algo_gate_t* gate ); @@ -36,12 +33,13 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); #endif +/* void sha256t_hash( void *output, const void *input ); int scanhash_sha256t( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); void sha256q_hash( void *output, const void *input ); int scanhash_sha256q( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); - +*/ #endif diff --git a/algo/sha/sha256t.c b/algo/sha/sha256t.c index 4dbfd33..19f47d6 100644 --- a/algo/sha/sha256t.c +++ b/algo/sha/sha256t.c @@ -1,5 +1,7 @@ #include "sha256t-gate.h" +// Obsolete + #if !defined(SHA256T_16WAY) && !defined(SHA256T_8WAY) && !defined(SHA256T_4WAY) #include diff --git a/algo/shavite/shavite-hash-2way.c b/algo/shavite/shavite-hash-2way.c index ba531ce..83f3e66 100644 --- a/algo/shavite/shavite-hash-2way.c +++ b/algo/shavite/shavite-hash-2way.c @@ -26,7 +26,11 @@ static const uint32_t IV512[] = static void c512_2way( shavite512_2way_context *ctx, const void *msg ) { +#if defined(__VAES__) + const __m256i zero = _mm256_setzero_si256(); +#else const __m128i zero = _mm_setzero_si128(); +#endif __m256i p0, p1, p2, p3, x; __m256i k00, k01, k02, k03, k10, k11, k12, k13; __m256i *m = (__m256i*)msg; diff --git a/algo/x16/x16r-4way.c b/algo/x16/x16r-4way.c index abbe16a..4d12029 100644 --- a/algo/x16/x16r-4way.c +++ b/algo/x16/x16r-4way.c @@ -619,11 +619,20 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case GROESTL: +#if defined(__VAES__) + intrlv_2x128( vhash, in0, in1, size<<3 ); + groestl512_2way_full( &ctx.groestl, vhash, vhash, size ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + groestl512_2way_full( &ctx.groestl, vhash, vhash, size ); + dintrlv_2x128_512( hash2, hash3, vhash ); +#else groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 ); groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 ); groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 ); groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 ); - break; +#endif + break; case JH: if ( i == 0 ) jh512_4way_update( &ctx.jh, input + (64<<2), 16 ); @@ -711,11 +720,20 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid ) } break; case SHAVITE: +#if defined(__VAES__) + intrlv_2x128( vhash, in0, in1, size<<3 ); + shavite512_2way_full( &ctx.shavite, vhash, vhash, size ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + shavite512_2way_full( &ctx.shavite, vhash, vhash, size ); + dintrlv_2x128_512( hash2, hash3, vhash ); +#else shavite512_full( &ctx.shavite, hash0, in0, size ); shavite512_full( &ctx.shavite, hash1, in1, size ); shavite512_full( &ctx.shavite, hash2, in2, size ); shavite512_full( &ctx.shavite, hash3, in3, size ); - break; +#endif + break; case SIMD: intrlv_2x128( vhash, in0, in1, size<<3 ); simd512_2way_full( &ctx.simd, vhash, vhash, size ); @@ -725,6 +743,14 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid ) dintrlv_2x128_512( hash2, hash3, vhash ); break; case ECHO: +#if defined(__VAES__) + intrlv_2x128( vhash, in0, in1, size<<3 ); + echo_2way_full( 
&ctx.echo, vhash, 512, vhash, size ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + echo_2way_full( &ctx.echo, vhash, 512, vhash, size ); + dintrlv_2x128_512( hash2, hash3, vhash ); +#else echo_full( &ctx.echo, (BitSequence *)hash0, 512, (const BitSequence *)in0, size ); echo_full( &ctx.echo, (BitSequence *)hash1, 512, @@ -733,7 +759,8 @@ int x16r_4way_hash_generic( void* output, const void* input, int thrid ) (const BitSequence *)in2, size ); echo_full( &ctx.echo, (BitSequence *)hash3, 512, (const BitSequence *)in3, size ); - break; +#endif + break; case HAMSI: if ( i == 0 ) hamsi512_4way_update( &ctx.hamsi, input + (64<<2), 16 ); diff --git a/algo/x16/x16r-gate.c b/algo/x16/x16r-gate.c index 462e264..09315f6 100644 --- a/algo/x16/x16r-gate.c +++ b/algo/x16/x16r-gate.c @@ -61,7 +61,8 @@ bool register_x16r_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16r; gate->hash = (void*)&x16r_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | + VAES_OPT | VAES256_OPT; x16_r_s_getAlgoString = (void*)&x16r_getAlgoString; opt_target_factor = 256.0; return true; @@ -79,7 +80,8 @@ bool register_x16rv2_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16rv2; gate->hash = (void*)&x16rv2_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | + VAES_OPT | VAES256_OPT; x16_r_s_getAlgoString = (void*)&x16r_getAlgoString; opt_target_factor = 256.0; return true; @@ -97,7 +99,8 @@ bool register_x16s_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16r; gate->hash = (void*)&x16r_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | + VAES_OPT | VAES256_OPT; x16_r_s_getAlgoString = (void*)&x16s_getAlgoString; opt_target_factor = 256.0; return true; @@ -230,7 +233,8 @@ bool register_x16rt_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16rt; gate->hash = (void*)&x16r_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | + VAES_OPT | VAES256_OPT; opt_target_factor = 256.0; return true; }; @@ -247,7 +251,8 @@ bool register_x16rt_veil_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16rt; gate->hash = (void*)&x16r_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | + VAES_OPT | VAES256_OPT; gate->build_extraheader = (void*)&veil_build_extraheader; opt_target_factor = 256.0; return true; @@ -277,22 +282,17 @@ bool register_x21s_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x21s_8way; gate->hash = (void*)&x21s_8way_hash; gate->miner_thread_init = (void*)&x21s_8way_thread_init; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT - | VAES_OPT; #elif defined (X16R_4WAY) gate->scanhash = (void*)&scanhash_x21s_4way; gate->hash = (void*)&x21s_4way_hash; gate->miner_thread_init = (void*)&x21s_4way_thread_init; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT - | AVX512_OPT | VAES_OPT; #else gate->scanhash = (void*)&scanhash_x21s; gate->hash = (void*)&x21s_hash; gate->miner_thread_init = (void*)&x21s_thread_init; - gate->optimizations = SSE2_OPT | 
AES_OPT | AVX2_OPT | SHA_OPT - | AVX512_OPT | VAES_OPT; #endif -// gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | + VAES_OPT | VAES256_OPT; x16_r_s_getAlgoString = (void*)&x16s_getAlgoString; opt_target_factor = 256.0; return true; diff --git a/algo/x16/x16r-gate.h b/algo/x16/x16r-gate.h index cbd3899..ed93599 100644 --- a/algo/x16/x16r-gate.h +++ b/algo/x16/x16r-gate.h @@ -41,6 +41,7 @@ #include "algo/sha/sha-hash-4way.h" #if defined(__VAES__) #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-2way.h" #include "algo/shavite/shavite-hash-4way.h" #include "algo/echo/echo-hash-4way.h" #endif @@ -145,15 +146,21 @@ union _x16r_4way_context_overlay { blake512_4way_context blake; bmw512_4way_context bmw; - hashState_echo echo; +#if defined(__VAES__) + groestl512_2way_context groestl; + shavite512_2way_context shavite; + echo_2way_context echo; +#else hashState_groestl groestl; + shavite512_context shavite; + hashState_echo echo; +#endif skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; luffa_2way_context luffa; hashState_luffa luffa1; cubehashParam cube; - shavite512_context shavite; simd_2way_context simd; hamsi512_4way_context hamsi; hashState_fugue fugue; diff --git a/algo/x16/x16rv2-4way.c b/algo/x16/x16rv2-4way.c index e2d80da..4421636 100644 --- a/algo/x16/x16rv2-4way.c +++ b/algo/x16/x16rv2-4way.c @@ -672,14 +672,20 @@ union _x16rv2_4way_context_overlay { blake512_4way_context blake; bmw512_4way_context bmw; - hashState_echo echo; +#if defined(__VAES__) + groestl512_2way_context groestl; + shavite512_2way_context shavite; + echo_2way_context echo; +#else hashState_groestl groestl; + shavite512_context shavite; + hashState_echo echo; +#endif skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; luffa_2way_context luffa; cubehashParam cube; - shavite512_context shavite; simd_2way_context simd; hamsi512_4way_context hamsi; hashState_fugue fugue; @@ -745,10 +751,19 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid ) dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; case GROESTL: +#if defined(__VAES__) + intrlv_2x128( vhash, in0, in1, size<<3 ); + groestl512_2way_full( &ctx.groestl, vhash, vhash, size ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + groestl512_2way_full( &ctx.groestl, vhash, vhash, size ); + dintrlv_2x128_512( hash2, hash3, vhash ); +#else groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 ); groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 ); groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 ); groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 ); +#endif break; case JH: if ( i == 0 ) @@ -887,10 +902,19 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid ) } break; case SHAVITE: +#if defined(__VAES__) + intrlv_2x128( vhash, in0, in1, size<<3 ); + shavite512_2way_full( &ctx.shavite, vhash, vhash, size ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + shavite512_2way_full( &ctx.shavite, vhash, vhash, size ); + dintrlv_2x128_512( hash2, hash3, vhash ); +#else shavite512_full( &ctx.shavite, hash0, in0, size ); shavite512_full( &ctx.shavite, hash1, in1, size ); shavite512_full( &ctx.shavite, hash2, in2, size ); shavite512_full( &ctx.shavite, hash3, in3, size ); +#endif break; case SIMD: intrlv_2x128( 
vhash, in0, in1, size<<3 ); @@ -901,6 +925,14 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid ) dintrlv_2x128_512( hash2, hash3, vhash ); break; case ECHO: +#if defined(__VAES__) + intrlv_2x128( vhash, in0, in1, size<<3 ); + echo_2way_full( &ctx.echo, vhash, 512, vhash, size ); + dintrlv_2x128_512( hash0, hash1, vhash ); + intrlv_2x128( vhash, in2, in3, size<<3 ); + echo_2way_full( &ctx.echo, vhash, 512, vhash, size ); + dintrlv_2x128_512( hash2, hash3, vhash ); +#else echo_full( &ctx.echo, (BitSequence *)hash0, 512, (const BitSequence *)in0, size ); echo_full( &ctx.echo, (BitSequence *)hash1, 512, @@ -909,6 +941,7 @@ int x16rv2_4way_hash( void* output, const void* input, int thrid ) (const BitSequence *)in2, size ); echo_full( &ctx.echo, (BitSequence *)hash3, 512, (const BitSequence *)in3, size ); +#endif break; case HAMSI: if ( i == 0 ) diff --git a/algo/x17/sonoa-4way.c b/algo/x17/sonoa-4way.c index e4fe98b..92deeb8 100644 --- a/algo/x17/sonoa-4way.c +++ b/algo/x17/sonoa-4way.c @@ -1124,7 +1124,13 @@ union _sonoa_4way_context_overlay { blake512_4way_context blake; bmw512_4way_context bmw; +#if defined(__VAES__) + groestl512_2way_context groestl; + echo512_2way_context echo; +#else hashState_groestl groestl; + hashState_echo echo; +#endif skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; @@ -1132,7 +1138,6 @@ union _sonoa_4way_context_overlay cube_2way_context cube; shavite512_2way_context shavite; simd_2way_context simd; - hashState_echo echo; hamsi512_4way_context hamsi; hashState_fugue fugue; shabal512_4way_context shabal; @@ -1162,6 +1167,17 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); + + groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 ); + groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); @@ -1171,6 +1187,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); jh512_4way_init( &ctx.jh ); @@ -1195,6 +1213,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 ); + echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_2x128_512( hash0, hash1, vhashA ); dintrlv_2x128_512( hash2, hash3, vhashB ); @@ -1206,16 +1233,29 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) (const BitSequence *)hash2, 64 ); echo_full( &ctx.echo, (BitSequence *)hash3, 512, (const BitSequence *)hash3, 64 ); + + intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); + +#endif if ( work_restart[thr_id].restart ) return 0; // 2 - intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); - bmw512_4way_init( &ctx.bmw ); bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); + + groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 ); + groestl512_2way_full( &ctx.groestl, vhashB, 
vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); @@ -1225,6 +1265,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); jh512_4way_init( &ctx.jh ); @@ -1249,6 +1291,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 ); + echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_2x128_512( hash0, hash1, vhashA ); dintrlv_2x128_512( hash2, hash3, vhashB ); @@ -1263,6 +1314,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); @@ -1274,6 +1327,17 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); + + groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 ); + groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); @@ -1283,6 +1347,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); jh512_4way_init( &ctx.jh ); @@ -1307,6 +1373,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 ); + echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_2x128_512( hash0, hash1, vhashA ); dintrlv_2x128_512( hash2, hash3, vhashB ); @@ -1321,6 +1396,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); @@ -1340,6 +1417,17 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); + + groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 ); + groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); @@ -1349,6 +1437,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); jh512_4way_init( &ctx.jh ); @@ -1373,6 +1463,15 @@ int 
sonoa_4way_hash( void *state, const void *input, int thr_id ) simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 ); + echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_2x128_512( hash0, hash1, vhashA ); dintrlv_2x128_512( hash2, hash3, vhashB ); @@ -1387,6 +1486,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); @@ -1410,6 +1511,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) hamsi512_4way_update( &ctx.hamsi, vhashB, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); +#if defined(__VAES__) + + rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); + + echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 ); + echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 ); + +#else + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); echo_full( &ctx.echo, (BitSequence *)hash0, 512, @@ -1424,6 +1534,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_2x128_512( vhashA, hash0, hash1 ); intrlv_2x128_512( vhashB, hash2, hash3 ); +#endif + shavite512_2way_init( &ctx.shavite ); shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); shavite512_2way_init( &ctx.shavite ); @@ -1443,6 +1555,20 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) shabal512_4way_update( &ctx.shabal, vhashB, 64 ); shabal512_4way_close( &ctx.shabal, vhash ); +#if defined(__VAES__) + +// rintrlv_4x32_2x128( vhashA, vhashB, vhash, 512 ); + dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_2x128_512( vhashA, hash0, hash1 ); + intrlv_2x128_512( vhashB, hash2, hash3 ); + + groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 ); + groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); @@ -1452,6 +1578,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); jh512_4way_init( &ctx.jh ); @@ -1476,6 +1604,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 ); + echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_2x128_512( hash0, hash1, vhashA ); dintrlv_2x128_512( hash2, hash3, vhashB ); @@ -1490,6 +1627,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); @@ -1523,6 +1662,17 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); + + groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 ); + 
groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); @@ -1532,6 +1682,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); jh512_4way_init( &ctx.jh ); @@ -1556,6 +1708,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 ); + echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_2x128_512( hash0, hash1, vhashA ); dintrlv_2x128_512( hash2, hash3, vhashB ); @@ -1570,6 +1731,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); @@ -1616,6 +1779,17 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); + + groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 ); + groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); @@ -1625,6 +1799,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); jh512_4way_init( &ctx.jh ); @@ -1649,6 +1825,15 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 ); + echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_2x128_512( hash0, hash1, vhashA ); dintrlv_2x128_512( hash2, hash3, vhashB ); @@ -1663,6 +1848,8 @@ int sonoa_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); diff --git a/algo/x17/sonoa-gate.c b/algo/x17/sonoa-gate.c index 926beb4..d192b0d 100644 --- a/algo/x17/sonoa-gate.c +++ b/algo/x17/sonoa-gate.c @@ -12,7 +12,7 @@ bool register_sonoa_algo( algo_gate_t* gate ) init_sonoa_ctx(); gate->hash = (void*)&sonoa_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT | VAES256_OPT; return true; }; diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c index cce3894..fcff0b6 100644 --- a/algo/x17/x17-4way.c +++ b/algo/x17/x17-4way.c @@ -240,7 +240,13 @@ union _x17_4way_context_overlay { blake512_4way_context blake; bmw512_4way_context bmw; +#if defined(__VAES__) + groestl512_2way_context groestl; + 
echo512_2way_context echo; +#else hashState_groestl groestl; + hashState_echo echo; +#endif skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; @@ -248,7 +254,6 @@ union _x17_4way_context_overlay cube_2way_context cube; shavite512_2way_context shavite; simd_2way_context simd; - hashState_echo echo; hamsi512_4way_context hamsi; hashState_fugue fugue; shabal512_4way_context shabal; @@ -275,6 +280,17 @@ int x17_4way_hash( void *state, const void *input, int thr_id ) bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); + + groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 ); + groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); @@ -284,6 +300,8 @@ int x17_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + skein512_4way_full( &ctx.skein, vhash, vhash, 64 ); jh512_4way_init( &ctx.jh ); @@ -308,6 +326,15 @@ int x17_4way_hash( void *state, const void *input, int thr_id ) simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 ); + echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_2x128_512( hash0, hash1, vhashA ); dintrlv_2x128_512( hash2, hash3, vhashB ); @@ -322,6 +349,8 @@ int x17_4way_hash( void *state, const void *input, int thr_id ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); +#endif + hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); diff --git a/algo/x17/x17-gate.c b/algo/x17/x17-gate.c index eee3d60..6ab09ff 100644 --- a/algo/x17/x17-gate.c +++ b/algo/x17/x17-gate.c @@ -11,7 +11,7 @@ bool register_x17_algo( algo_gate_t* gate ) #else gate->hash = (void*)&x17_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT | VAES256_OPT; return true; }; diff --git a/algo/x17/xevan-4way.c b/algo/x17/xevan-4way.c index beb9df6..e7cd2bf 100644 --- a/algo/x17/xevan-4way.c +++ b/algo/x17/xevan-4way.c @@ -405,15 +405,20 @@ union _xevan_4way_context_overlay { blake512_4way_context blake; bmw512_4way_context bmw; - hashState_groestl groestl; - skein512_4way_context skein; +#if defined(__VAES__) + groestl512_2way_context groestl; + echo_2way_context echo; +#else + hashState_groestl groestl; + hashState_echo echo; +#endif + skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; luffa_2way_context luffa; cube_2way_context cube; shavite512_2way_context shavite; simd_2way_context simd; - hashState_echo echo; hamsi512_4way_context hamsi; hashState_fugue fugue; shabal512_4way_context shabal; @@ -442,7 +447,17 @@ int xevan_4way_hash( void *output, const void *input, int thr_id ) bmw512_4way_update( &ctx.bmw, vhash, dataLen ); bmw512_4way_close( &ctx.bmw, vhash ); - // Serial +#if defined(__VAES__) + + rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 ); + + groestl512_2way_full( &ctx.groestl, vhashA, vhashA, dataLen ); + groestl512_2way_full( &ctx.groestl, vhashB, vhashB, dataLen ); + + 
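// (the rintrlv_* helpers convert directly between the 2-way 128-bit and
+   // 4-way 64-bit interleaved layouts, avoiding a full de-interleave to
+   // per-lane buffers and back)
+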
rintrlv_2x128_4x64( vhash, vhashA, vhashB, dataLen<<3 ); + +#else + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, dataLen<<3 ); @@ -450,9 +465,10 @@ int xevan_4way_hash( void *output, const void *input, int thr_id ) groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, dataLen<<3 ); groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, dataLen<<3 ); - // Parallel 4way intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); +#endif + skein512_4way_full( &ctx.skein, vhash, vhash, dataLen ); jh512_4way_init( &ctx.jh ); @@ -477,6 +493,15 @@ int xevan_4way_hash( void *output, const void *input, int thr_id ) simd512_2way_full( &ctx.simd, vhashA, vhashA, dataLen ); simd512_2way_full( &ctx.simd, vhashB, vhashB, dataLen ); +#if defined(__VAES__) + + echo_2way_full( &ctx.echo, vhashA, 512, vhashA, dataLen ); + echo_2way_full( &ctx.echo, vhashB, 512, vhashB, dataLen ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, dataLen<<3 ); + +#else + dintrlv_2x128( hash0, hash1, vhashA, dataLen<<3 ); dintrlv_2x128( hash2, hash3, vhashB, dataLen<<3 ); @@ -489,9 +514,10 @@ int xevan_4way_hash( void *output, const void *input, int thr_id ) echo_full( &ctx.echo, (BitSequence *)hash3, 512, (const BitSequence *)hash3, dataLen ); - // Parallel intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); +#endif + hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_update( &ctx.hamsi, vhash, dataLen ); hamsi512_4way_close( &ctx.hamsi, vhash ); @@ -542,6 +568,17 @@ int xevan_4way_hash( void *output, const void *input, int thr_id ) bmw512_4way_update( &ctx.bmw, vhash, dataLen ); bmw512_4way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 ); + + groestl512_2way_full( &ctx.groestl, vhashA, vhashA, dataLen ); + groestl512_2way_full( &ctx.groestl, vhashB, vhashB, dataLen ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, dataLen<<3 ); + +#else + dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, dataLen<<3 ); @@ -551,6 +588,8 @@ int xevan_4way_hash( void *output, const void *input, int thr_id ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); +#endif + skein512_4way_full( &ctx.skein, vhash, vhash, dataLen ); jh512_4way_init( &ctx.jh ); @@ -575,6 +614,15 @@ int xevan_4way_hash( void *output, const void *input, int thr_id ) simd512_2way_full( &ctx.simd, vhashA, vhashA, dataLen ); simd512_2way_full( &ctx.simd, vhashB, vhashB, dataLen ); +#if defined(__VAES__) + + echo_2way_full( &ctx.echo, vhashA, 512, vhashA, dataLen ); + echo_2way_full( &ctx.echo, vhashB, 512, vhashB, dataLen ); + + rintrlv_2x128_4x64( vhash, vhashA, vhashB, dataLen<<3 ); + +#else + dintrlv_2x128( hash0, hash1, vhashA, dataLen<<3 ); dintrlv_2x128( hash2, hash3, vhashB, dataLen<<3 ); @@ -589,6 +637,8 @@ int xevan_4way_hash( void *output, const void *input, int thr_id ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); +#endif + hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way_update( &ctx.hamsi, vhash, dataLen ); hamsi512_4way_close( &ctx.hamsi, vhash ); diff --git a/algo/x17/xevan-gate.c b/algo/x17/xevan-gate.c index 184ed2d..545a0aa 100644 --- a/algo/x17/xevan-gate.c +++ b/algo/x17/xevan-gate.c @@ -12,7 +12,7 @@ bool register_xevan_algo( algo_gate_t* gate ) init_xevan_ctx(); gate->hash = (void*)&xevan_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; + gate->optimizations = 
diff --git a/algo/x17/xevan-gate.c b/algo/x17/xevan-gate.c
index 184ed2d..545a0aa 100644
--- a/algo/x17/xevan-gate.c
+++ b/algo/x17/xevan-gate.c
@@ -12,7 +12,7 @@ bool register_xevan_algo( algo_gate_t* gate )
   init_xevan_ctx();
   gate->hash = (void*)&xevan_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT | VAES256_OPT;
   opt_target_factor = 256.0;
   return true;
 };
diff --git a/algo/x22/x22i-4way.c b/algo/x22/x22i-4way.c
index e61d1ad..ba5714b 100644
--- a/algo/x22/x22i-4way.c
+++ b/algo/x22/x22i-4way.c
@@ -11,7 +11,7 @@
 #include "algo/shavite/shavite-hash-2way.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
-#include "algo/shavite/sph_shavite.h"
+#include "algo/shavite/shavite-hash-2way.h"
 #include "algo/hamsi/hamsi-hash-4way.h"
 #include "algo/fugue/fugue-aesni.h"
 #include "algo/shabal/shabal-hash-4way.h"
@@ -494,14 +494,19 @@ union _x22i_4way_ctx_overlay
 {
    blake512_4way_context blake;
    bmw512_4way_context bmw;
+#if defined(__VAES__)
+   groestl512_2way_context groestl;
+   echo_2way_context echo;
+#else
    hashState_groestl groestl;
    hashState_echo echo;
+#endif
+   shavite512_2way_context shavite;
    skein512_4way_context skein;
    jh512_4way_context jh;
    keccak512_4way_context keccak;
    luffa_2way_context luffa;
    cube_2way_context cube;
-   shavite512_2way_context shavite;
    simd_2way_context simd;
    hamsi512_4way_context hamsi;
    hashState_fugue fugue;
@@ -535,14 +540,28 @@ int x22i_4way_hash( void *output, const void *input, int thrid )
    bmw512_4way_init( &ctx.bmw );
    bmw512_4way_update( &ctx.bmw, vhash, 64 );
    bmw512_4way_close( &ctx.bmw, vhash );
-   dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
-   groestl512_full( &ctx.groestl, (char*)hash0, (const char*)hash0, 512 );
-   groestl512_full( &ctx.groestl, (char*)hash1, (const char*)hash1, 512 );
-   groestl512_full( &ctx.groestl, (char*)hash2, (const char*)hash2, 512 );
-   groestl512_full( &ctx.groestl, (char*)hash3, (const char*)hash3, 512 );
+#if defined(__VAES__)
-   intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
+   rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
+
+   groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 );
+   groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 );
+
+   rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
+
+#else
+
+   dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
+
+   groestl512_full( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+   groestl512_full( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+   groestl512_full( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+   groestl512_full( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+
+   intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );
+
+#endif

    skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
@@ -570,6 +589,15 @@ int x22i_4way_hash( void *output, const void *input, int thrid )
    simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
    simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );

+#if defined(__VAES__)
+
+   echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
+   echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
+
+   rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
+
+#else
+
    dintrlv_2x128_512( hash0, hash1, vhashA );
    dintrlv_2x128_512( hash2, hash3, vhashB );
@@ -584,6 +612,8 @@ int x22i_4way_hash( void *output, const void *input, int thrid )

    intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 );

+#endif
+
    if ( work_restart[thrid].restart ) return false;

    hamsi512_4way_init( &ctx.hamsi );
diff --git a/algo/x22/x22i-gate.c b/algo/x22/x22i-gate.c
index 78f23b4..243f69e 100644
--- a/algo/x22/x22i-gate.c
+++ b/algo/x22/x22i-gate.c
@@ -20,7 +20,7 @@ bool register_x22i_algo( algo_gate_t* gate )
   gate->scanhash = (void*)&scanhash_x22i;
   gate->hash = (void*)&x22i_hash;
   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT
-                      | AVX512_OPT | VAES_OPT;
+                      | AVX512_OPT | VAES_OPT | VAES256_OPT;
 #endif
   return true;
 };
@@ -30,20 +30,15 @@ bool register_x25x_algo( algo_gate_t* gate )
 #if defined (X25X_8WAY)
   gate->scanhash = (void*)&scanhash_x25x_8way;
   gate->hash = (void*)&x25x_8way_hash;
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT
-                      | AVX512_OPT | VAES_OPT;
 #elif defined (X25X_4WAY)
   gate->scanhash = (void*)&scanhash_x25x_4way;
   gate->hash = (void*)&x25x_4way_hash;
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT
-                      | AVX512_OPT | VAES_OPT;
 #else
   gate->scanhash = (void*)&scanhash_x25x;
   gate->hash = (void*)&x25x_hash;
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT
-                      | AVX512_OPT | VAES_OPT;
 #endif
-// gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT |
+                        VAES_OPT | VAES256_OPT;
   return true;
 };
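The gate registrations above all change the same way: gate->optimizations is
a plain bitmask with one bit per optimization, and VAES256_OPT is simply a
new bit that check_cpu_capability() (see the cpu-miner.c hunk further down)
tests with set_incl(). A minimal sketch of the set arithmetic, assuming
set_incl is the obvious subset test:

   #include <stdbool.h>
   #include <stdint.h>

   typedef uint32_t set_t;   // one bit per feature, e.g. VAES256_OPT

   // True when every flag set in a is also set in b.
   static inline bool set_incl( set_t a, set_t b ) { return ( a & b ) == a; }

So an algo that ORs VAES256_OPT into its gate opts in to the 256-bit VAES
path even on CPUs with no AVX512 at all, which is exactly the Zen3 case
these hunks target.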
diff --git a/algo/x22/x25x-4way.c b/algo/x22/x25x-4way.c
index 1cdea11..e44a82e 100644
--- a/algo/x22/x25x-4way.c
+++ b/algo/x22/x25x-4way.c
@@ -15,6 +15,7 @@
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cube-hash-2way.h"
+#include "algo/shavite/shavite-hash-2way.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/nist.h"
 #include "algo/simd/simd-hash-2way.h"
@@ -412,9 +413,9 @@ int x25x_8way_hash( void *output, const void *input, int thrid )
    LYRA2X_2WAY( vhash, 32, vhash, 32, 1, 4, 4 );
    dintrlv_2x256( hash6[19], hash7[19], vhash, 256 );
-   sph_gost512_init(&ctx.gost);
-   sph_gost512 (&ctx.gost, (const void*) hash0[19], 64);
-   sph_gost512_close(&ctx.gost, (void*) hash0[20]);
+   sph_gost512_init(&ctx.gost);
+   sph_gost512 (&ctx.gost, (const void*) hash0[19], 64);
+   sph_gost512_close(&ctx.gost, (void*) hash0[20]);
    sph_gost512_init(&ctx.gost);
    sph_gost512 (&ctx.gost, (const void*) hash1[19], 64);
    sph_gost512_close(&ctx.gost, (void*) hash1[20]);
@@ -574,68 +575,26 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
    return 0;
 }

-/*
-int scanhash_x25x_8way( struct work* work, uint32_t max_nonce,
-                        uint64_t *hashes_done, struct thr_info *mythr )
-{
-   uint32_t hash[8*16] __attribute__ ((aligned (128)));
-   uint32_t vdata[24*8] __attribute__ ((aligned (64)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
-   uint32_t *hash7 = &(hash[7<<3]);
-   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
-   const uint32_t first_nonce = pdata[19];
-   __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
-   uint32_t n = first_nonce;
-   const uint32_t last_nonce = max_nonce - 4;
-   const int thr_id = mythr->id;
-   const uint32_t Htarg = ptarget[7];
-
-   if (opt_benchmark)
-      ((uint32_t*)ptarget)[7] = 0x08ff;
-
-   InitializeSWIFFTX();
-
-   mm512_bswap32_intrlv80_8x64( vdata, pdata );
-   do
-   {
-      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
-              _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
-                                n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
-      x25x_8way_hash( hash, vdata );
-
-      for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg )
-      {
-         extr_lane_8x32( lane_hash, hash, lane, 256 );
-         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
-         {
-            pdata[19] = n + lane;
-            submit_solution( work, lane_hash, mythr );
-         }
-      }
-      n += 8;
-   } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
-
-   *hashes_done = n - first_nonce;
-   return 0;
-}
-*/

 #elif defined(X25X_4WAY)

 union _x25x_4way_ctx_overlay
 {
    blake512_4way_context blake;
    bmw512_4way_context bmw;
+#if defined(__VAES__)
+   groestl512_2way_context groestl;
+   echo_2way_context echo;
+#else
    hashState_groestl groestl;
    hashState_echo echo;
+#endif
    skein512_4way_context skein;
    jh512_4way_context jh;
    keccak512_4way_context keccak;
-   hashState_luffa luffa;
-   cubehashParam cube;
-   sph_shavite512_context shavite;
-   hashState_sd simd;
+   luffa_2way_context luffa;
+   cube_2way_context cube;
+   shavite512_2way_context shavite;
+   simd_2way_context simd;
    hamsi512_4way_context hamsi;
    hashState_fugue fugue;
    shabal512_4way_context shabal;
@@ -658,6 +617,8 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
    unsigned char hash2[25][64] __attribute__((aligned(64))) = {0};
    unsigned char hash3[25][64] __attribute__((aligned(64))) = {0};
    unsigned char vhashX[24][64*4] __attribute__ ((aligned (64)));
+   uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
+   uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
    x25x_4way_ctx_overlay ctx __attribute__ ((aligned (64)));

    blake512_4way_full( &ctx.blake, vhash, input, 80 );
@@ -668,11 +629,25 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
    bmw512_4way_close( &ctx.bmw, vhash );
    dintrlv_4x64_512( hash0[1], hash1[1], hash2[1], hash3[1], vhash );

+#if defined(__VAES__)
+
+   rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
+
+   groestl512_2way_full( &ctx.groestl, vhashA, vhashA, 64 );
+   groestl512_2way_full( &ctx.groestl, vhashB, vhashB, 64 );
+
+   dintrlv_2x128_512( hash0[2], hash1[2], vhashA );
+   dintrlv_2x128_512( hash2[2], hash3[2], vhashB );
+
+#else
+
    groestl512_full( &ctx.groestl, (char*)hash0[2], (const char*)hash0[1], 512 );
    groestl512_full( &ctx.groestl, (char*)hash1[2], (const char*)hash1[1], 512 );
    groestl512_full( &ctx.groestl, (char*)hash2[2], (const char*)hash2[1], 512 );
    groestl512_full( &ctx.groestl, (char*)hash3[2], (const char*)hash3[1], 512 );

+#endif
+
    intrlv_4x64_512( vhash, hash0[2], hash1[2], hash2[2], hash3[2] );
    skein512_4way_full( &ctx.skein, vhash, vhash, 64 );
    dintrlv_4x64_512( hash0[3], hash1[3], hash2[3], hash3[3], vhash );
@@ -689,41 +664,38 @@ int x25x_4way_hash( void *output, const void *input, int thrid )
    keccak512_4way_close( &ctx.keccak, vhash );
    dintrlv_4x64_512( hash0[5], hash1[5], hash2[5], hash3[5], vhash );

-   luffa_full( &ctx.luffa, (BitSequence*)hash0[6], 512,
-               (const BitSequence*)hash0[5], 64 );
-   luffa_full( &ctx.luffa, (BitSequence*)hash1[6], 512,
-               (const BitSequence*)hash1[5], 64 );
-   luffa_full( &ctx.luffa, (BitSequence*)hash2[6], 512,
-               (const BitSequence*)hash2[5], 64 );
-   luffa_full( &ctx.luffa, (BitSequence*)hash3[6], 512,
-               (const BitSequence*)hash3[5], 64 );
+   rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );

-   cubehash_full( &ctx.cube, (byte*)hash0[7], 512, (const byte*)hash0[6], 64 );
-   cubehash_full( &ctx.cube, (byte*)hash1[7], 512, (const byte*)hash1[6], 64 );
-   cubehash_full( &ctx.cube, (byte*)hash2[7], 512, (const byte*)hash2[6], 64 );
-   cubehash_full( &ctx.cube, (byte*)hash3[7], 512, (const byte*)hash3[6], 64 );
+   luffa512_2way_full( &ctx.luffa, vhashA, vhashA, 64 );
+   luffa512_2way_full( &ctx.luffa, vhashB, vhashB, 64 );
+   dintrlv_2x128_512( hash0[6], hash1[6], vhashA );
+   dintrlv_2x128_512( hash2[6], hash3[6], vhashB );

-   sph_shavite512_init(&ctx.shavite);
-   sph_shavite512(&ctx.shavite, (const void*) hash0[7], 64);
-   sph_shavite512_close(&ctx.shavite, hash0[8]);
-   sph_shavite512_init(&ctx.shavite);
-   sph_shavite512(&ctx.shavite, (const void*) hash1[7], 64);
-   sph_shavite512_close(&ctx.shavite, hash1[8]);
-   sph_shavite512_init(&ctx.shavite);
-   sph_shavite512(&ctx.shavite, (const void*) hash2[7], 64);
-   sph_shavite512_close(&ctx.shavite, hash2[8]);
-   sph_shavite512_init(&ctx.shavite);
-   sph_shavite512(&ctx.shavite, (const void*) hash3[7], 64);
-   sph_shavite512_close(&ctx.shavite, hash3[8]);
+   cube_2way_full( &ctx.cube, vhashA, 512, vhashA, 64 );
+   cube_2way_full( &ctx.cube, vhashB, 512, vhashB, 64 );
+   dintrlv_2x128_512( hash0[7], hash1[7], vhashA );
+   dintrlv_2x128_512( hash2[7], hash3[7], vhashB );

-   simd_full( &ctx.simd, (BitSequence*)hash0[9],
-              (const BitSequence*)hash0[8], 512 );
-   simd_full( &ctx.simd, (BitSequence*)hash1[9],
-              (const BitSequence*)hash1[8], 512 );
-   simd_full( &ctx.simd, (BitSequence*)hash2[9],
-              (const BitSequence*)hash2[8], 512 );
-   simd_full( &ctx.simd, (BitSequence*)hash3[9],
-              (const BitSequence*)hash3[8], 512 );
+   shavite512_2way_full( &ctx.shavite, vhashA, vhashA, 64 );
+   shavite512_2way_full( &ctx.shavite, vhashB, vhashB, 64 );
+   dintrlv_2x128_512( hash0[8], hash1[8], vhashA );
+   dintrlv_2x128_512( hash2[8], hash3[8], vhashB );
+
+   simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
+   simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
+   dintrlv_2x128_512( hash0[9], hash1[9], vhashA );
+   dintrlv_2x128_512( hash2[9], hash3[9], vhashB );
+
+#if defined(__VAES__)
+
+   echo_2way_full( &ctx.echo, vhashA, 512, vhashA, 64 );
+   echo_2way_full( &ctx.echo, vhashB, 512, vhashB, 64 );
+   dintrlv_2x128_512( hash0[10], hash1[10], vhashA );
+   dintrlv_2x128_512( hash2[10], hash3[10], vhashB );
+
+   rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
+
+#else

    echo_full( &ctx.echo, (BitSequence *)hash0[10], 512,
               (const BitSequence *)hash0[ 9], 64 );
@@ -736,6 +708,8 @@ int x25x_4way_hash( void *output, const void *input, int thrid )

    intrlv_4x64_512( vhash, hash0[10], hash1[10], hash2[10], hash3[10] );

+#endif
+
    if ( work_restart[thrid].restart ) return 0;

    hamsi512_4way_init( &ctx.hamsi );
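All four files changed above follow one template: a union overlays the
per-stage contexts so they share a single aligned allocation, and the
preprocessor selects between the 2-way VAES context types and the serial
AES-NI ones. Distilled to a toy (the two context structs are stand-ins, not
the real types):

   #include <stdint.h>

   typedef struct { uint64_t s[32]; } two_way_ctx;   // stand-in: 2 lanes wide
   typedef struct { uint64_t s[16]; } serial_ctx;    // stand-in: 1 lane

   typedef union
   {
   #if defined(__VAES__)
      two_way_ctx groestl;
      two_way_ctx echo;
   #else
      serial_ctx  groestl;
      serial_ctx  echo;
   #endif
   } ctx_overlay;

   // Aliasing the members is safe because every stage (re)initialises its
   // context before use; peak memory is the largest member, not the sum.

Note that only the Groestl and Echo members are conditional: the other 128-bit
stages (luffa, cube, shavite, simd) use the same 2-way types either way.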
diff --git a/build-allarch.sh b/build-allarch.sh
index 50a5865..eb1f71b 100755
--- a/build-allarch.sh
+++ b/build-allarch.sh
@@ -4,8 +4,9 @@
 # during development. However the information contained may provide compilation
 # tips to users.

-rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 > /dev/null
+rm cpuminer-avx512-sha-vaes cpuminer-avx512-sha cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 > /dev/null

+# Icelake AVX512 SHA VAES
 make distclean || echo clean
 rm -f config.status
 ./autogen.sh || echo done
@@ -16,6 +17,20 @@ mv cpuminer.exe cpuminer-avx512-sha-vaes.exe
 strip -s cpuminer
 mv cpuminer cpuminer-avx512-sha-vaes

+# Rocketlake AVX512 AES SHA
+make clean || echo clean
+rm -f config.status
+CFLAGS="-O3 -march=skylake-avx512 -msha -Wall -fno-common" ./configure --with-curl
+# CFLAGS="-O3 -march=rocketlake -Wall -fno-common" ./configure --with-curl
+make -j 8
+strip -s cpuminer.exe
+mv cpuminer.exe cpuminer-avx512-sha.exe
+strip -s cpuminer
+mv cpuminer cpuminer-avx512-sha
+
+# Skylake-X AVX512 AES
+make clean || echo clean
+rm -f config.status
 CFLAGS="-O3 -march=skylake-avx512 -Wall -fno-common" ./configure --with-curl
 make -j 8
 strip -s cpuminer.exe
@@ -23,6 +38,7 @@ mv cpuminer.exe cpuminer-avx512.exe
 strip -s cpuminer
 mv cpuminer cpuminer-avx512

+# Haswell AVX2 AES
 make clean || echo clean
 rm -f config.status
 # GCC 9 doesn't include AES with core-avx2
@@ -33,6 +49,7 @@ mv cpuminer.exe cpuminer-avx2.exe
 strip -s cpuminer
 mv cpuminer cpuminer-avx2

+# Sandybridge AVX AES
 make clean || echo clean
 rm -f config.status
 CFLAGS="-O3 -march=corei7-avx -maes -Wall -fno-common" ./configure --with-curl
@@ -42,15 +59,17 @@ mv cpuminer.exe cpuminer-avx.exe
 strip -s cpuminer
 mv cpuminer cpuminer-avx

+# Westmere SSE4.2 AES
 make clean || echo clean
 rm -f config.status
-CFLAGS="-O3 -maes -msse4.2 -Wall -fno-common" ./configure --with-curl
+CFLAGS="-O3 -march=westmere -Wall -fno-common" ./configure --with-curl
 make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe cpuminer-aes-sse42.exe
 strip -s cpuminer
 mv cpuminer cpuminer-aes-sse42

+# Nehalem SSE4.2
 make clean || echo clean
 rm -f config.status
 CFLAGS="-O3 -march=corei7 -Wall -fno-common" ./configure --with-curl
@@ -60,6 +79,7 @@ mv cpuminer.exe cpuminer-sse42.exe
 strip -s cpuminer
 mv cpuminer cpuminer-sse42

+# Core2 SSSE3
 make clean || echo clean
 rm -f config.status
 CFLAGS="-O3 -march=core2 -Wall -fno-common" ./configure --with-curl
@@ -69,6 +89,7 @@ mv cpuminer.exe cpuminer-ssse3.exe
 strip -s cpuminer
 mv cpuminer cpuminer-ssse3

+# Generic SSE2
 make clean || echo clean
 rm -f config.status
 CFLAGS="-O3 -msse2 -Wall -fno-common" ./configure --with-curl
@@ -78,6 +99,7 @@ mv cpuminer.exe cpuminer-sse2.exe
 strip -s cpuminer
 mv cpuminer cpuminer-sse2

+# Zen1 AVX2 SHA
 make clean || echo done
 rm -f config.status
 CFLAGS="-O3 -march=znver1 -Wall -fno-common" ./configure --with-curl
@@ -87,6 +109,7 @@ mv cpuminer.exe cpuminer-zen.exe
 strip -s cpuminer
 mv cpuminer cpuminer-zen

+# Zen3 AVX2 SHA VAES
 make clean || echo done
 rm -f config.status
 CFLAGS="-O3 -march=znver2 -mvaes -Wall -fno-common" ./configure --with-curl
@@ -97,6 +120,7 @@ mv cpuminer.exe cpuminer-zen3.exe
 strip -s cpuminer
 mv cpuminer cpuminer-zen3

+# Native to current CPU
 make clean || echo done
 rm -f config.status
 CFLAGS="-O3 -march=native -Wall -fno-common" ./configure --with-curl
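When adding a build variant like the ones above, it is worth confirming which
ISA macros a given CFLAGS combination actually defines, since that is what
the #if defined(__VAES__) dispatch in the source keys on (the core-avx2
comment above exists because GCC 9 omits AES from that arch). One quick check
is gcc -march=znver2 -mvaes -dM -E - </dev/null | grep -i vaes; a tiny probe
compiled with the flags under test works too:

   // probe.c -- build with the CFLAGS being tested, then run it, e.g.
   //    gcc -O3 -march=znver2 -mvaes probe.c -o probe && ./probe
   #include <stdio.h>

   int main(void)
   {
   #if defined(__AES__)
      puts( "AES" );
   #endif
   #if defined(__AVX2__)
      puts( "AVX2" );
   #endif
   #if defined(__SHA__)
      puts( "SHA" );
   #endif
   #if defined(__AVX512F__)
      puts( "AVX512F" );
   #endif
   #if defined(__VAES__)
      puts( "VAES" );
   #endif
      return 0;
   }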
diff --git a/build-avx2.sh b/build-avx2.sh
deleted file mode 100755
index 7a12473..0000000
--- a/build-avx2.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-#if [ "$OS" = "Windows_NT" ]; then
-# ./mingw64.sh
-# exit 0
-#fi
-
-# Linux build
-
-make distclean || echo clean
-
-rm -f config.status
-./autogen.sh || echo done
-
-# Ubuntu 10.04 (gcc 4.4)
-# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
-
-# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
-#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
-
-#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
-CFLAGS="-O3 -march=haswell -maes -Wall" ./configure --with-curl
-#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
-
-make -j 4
-
-strip -s cpuminer
diff --git a/build-no-common.sh b/build-no-common.sh
deleted file mode 100755
index 60b6da5..0000000
--- a/build-no-common.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-#if [ "$OS" = "Windows_NT" ]; then
-# ./mingw64.sh
-# exit 0
-#fi
-
-# Linux build
-
-make distclean || echo clean
-
-rm -f config.status
-./autogen.sh || echo done
-
-# Ubuntu 10.04 (gcc 4.4)
-# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
-
-# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
-#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
-
-#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
-CFLAGS="-O3 -march=native -Wall -fno-common" ./configure --with-curl
-#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
-
-make -j 4
-
-strip -s cpuminer
diff --git a/build.sh b/build.sh
index bf713ea..39bf5f6 100755
--- a/build.sh
+++ b/build.sh
@@ -12,15 +12,8 @@ make distclean || echo clean
 rm -f config.status
 ./autogen.sh || echo done

-# Ubuntu 10.04 (gcc 4.4)
-# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
-
-# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
-#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
-
 #CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
 CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
-#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl

 make -j 4
diff --git a/buildjdd.sh b/buildjdd.sh
deleted file mode 100755
index df17f22..0000000
--- a/buildjdd.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-#if [ "$OS" = "Windows_NT" ]; then
-# ./mingw64.sh
-# exit 0
-#fi
-
-# Linux build
-
-make distclean || echo clean
-
-rm -f config.status
-./autogen.sh || echo done
-
-# Ubuntu 10.04 (gcc 4.4)
-# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
-
-# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
-#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
-
-CFLAGS="-O3 -march=corei7-avx -msha -Wall" ./configure --with-curl
-#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
-#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
-
-make -j 4
-
-strip -s cpuminer
diff --git a/clean-all.sh b/clean-all.sh
index 2ca980e..e91bbb5 100755
--- a/clean-all.sh
+++ b/clean-all.sh
@@ -2,8 +2,8 @@
 #
 # make clean and rm all the targeted executables.

-rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-zen cpuminer-sse42 cpuminer-ssse3 cpuminer-zen3 > /dev/null
+rm cpuminer-avx512-sha-vaes cpuminer-avx512-sha cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-zen cpuminer-sse42 cpuminer-ssse3 cpuminer-zen3 > /dev/null

-rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-zen.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-zen3.exe > /dev/null
+rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-zen.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-zen3.exe > /dev/null

 make distclean > /dev/null
diff --git a/configure b/configure
index fcbefb8..5955782 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.15.1.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.15.2.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.15.1'
-PACKAGE_STRING='cpuminer-opt 3.15.1'
+PACKAGE_VERSION='3.15.2'
+PACKAGE_STRING='cpuminer-opt 3.15.2'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.15.1 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.15.2 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.15.1:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.15.2:";;
    esac
   cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-cpuminer-opt configure 3.15.1
+cpuminer-opt configure 3.15.2
 generated by GNU Autoconf 2.69

 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 3.15.1, which was
+It was created by cpuminer-opt $as_me 3.15.2, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

   $ $0 $@
@@ -2993,7 +2993,7 @@ fi

 # Define the identity of the package.
  PACKAGE='cpuminer-opt'
- VERSION='3.15.1'
+ VERSION='3.15.2'

 cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.15.1, which was
+This file was extended by cpuminer-opt $as_me 3.15.2, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

   CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.15.1
+cpuminer-opt config.status 3.15.2
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"
diff --git a/configure.ac b/configure.ac
index c0b3c5f..b5c82d2 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.15.1])
+AC_INIT([cpuminer-opt], [3.15.2])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
diff --git a/cpu-miner.c b/cpu-miner.c
index ebfdb7d..e459e62 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -3383,13 +3383,14 @@ bool check_cpu_capability ()
      bool sw_has_sha = false;
      bool sw_has_vaes = false;
      set_t algo_features = algo_gate.optimizations;
-     bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
-     bool algo_has_aes = set_incl( AES_OPT, algo_features );
-     bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
-     bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
-     bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
-     bool algo_has_sha = set_incl( SHA_OPT, algo_features );
-     bool algo_has_vaes = set_incl( VAES_OPT, algo_features );
+     bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
+     bool algo_has_aes = set_incl( AES_OPT, algo_features );
+     bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
+     bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
+     bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
+     bool algo_has_sha = set_incl( SHA_OPT, algo_features );
+     bool algo_has_vaes = set_incl( VAES_OPT, algo_features );
+     bool algo_has_vaes256 = set_incl( VAES256_OPT, algo_features );
      bool use_aes;
      bool use_sse2;
      bool use_sse42;
@@ -3510,7 +3511,8 @@ bool check_cpu_capability ()
      use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2;
      use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
      use_sha = cpu_has_sha && sw_has_sha && algo_has_sha;
-     use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes && use_avx512;
+     use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes
+             && ( use_avx512 || algo_has_vaes256 );
      use_none = !( use_sse2 || use_aes || use_sse42 || use_avx512 ||
                    use_avx2 || use_sha || use_vaes );
diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h
index 155293a..5f94cbc 100644
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -143,8 +143,8 @@ do { \
 // Parallel AES, for when x is expected to be in a 256 bit register.
 // Use same 128 bit key.

-//#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-#if 0
+#if defined(__VAES__)
+
 #define mm256_aesenc_2x128( x, k ) \
    _mm256_aesenc_epi128( x, k )
diff --git a/sysinfos.c b/sysinfos.c
index 1d5cdf3..010c78f 100644
--- a/sysinfos.c
+++ b/sysinfos.c
@@ -483,11 +483,13 @@ static inline bool has_avx512()
 // AMD Zen3 added support for 256 bit VAES without requiring AVX512.
 // The original Intel spec requires AVX512F to support 512 bit VAES and
 // requires AVX512VL to support 256 bit VAES.
-// cpuminer-opt only uses VAES512, simply testing the VAES bit is sufficient.
-// However, proper detection of VAES512 and VAES256 requires more work:
-// VAES512 = VAES && AVX512F   (may not support VAES256)
-// VAES256 = AVX512VL ? VAES : ( AVX && VAES )   (may not support VAES512)
-// VAES = VAES && AVX512F && AVX512VL   (supports both)
+// The CPUID VAES bit alone can't distinguish 256 vs 512 bit.
+// If necessary:
+// VAES 256 & 512 = VAES && AVX512VL
+// VAES 512 = VAES && AVX512F
+// VAES 256 = ( VAES && AVX512VL ) || ( VAES && !AVX512F )
+// VAES 512 only = VAES && AVX512F && !AVX512VL
+// VAES 256 only = VAES && !AVX512F

 static inline bool has_vaes()
 {
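The detection rules in the sysinfos.c comment above map directly onto CPUID
leaf 7. A sketch using GCC's <cpuid.h> helper, assuming the standard bit
positions (EBX bit 16 = AVX512F, EBX bit 31 = AVX512VL, ECX bit 9 = VAES):

   #include <cpuid.h>
   #include <stdbool.h>

   static void vaes_flavors( bool *vaes256, bool *vaes512 )
   {
      unsigned int eax, ebx, ecx, edx;
      *vaes256 = *vaes512 = false;
      if ( !__get_cpuid_count( 7, 0, &eax, &ebx, &ecx, &edx ) ) return;

      const bool vaes     = ecx & ( 1u << 9 );
      const bool avx512f  = ebx & ( 1u << 16 );
      const bool avx512vl = ebx & ( 1u << 31 );

      *vaes512 = vaes && avx512f;                   // VAES 512
      *vaes256 = vaes && ( avx512vl || !avx512f );  // VAES 256, incl. Zen3
   }

This mirrors the relaxed use_vaes test in the cpu-miner.c hunk: once an algo
registers VAES256_OPT, the VAES path no longer requires use_avx512.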
diff --git a/winbuild-cross.sh b/winbuild-cross.sh
index f19bbb6..5e3542b 100755
--- a/winbuild-cross.sh
+++ b/winbuild-cross.sh
@@ -40,6 +40,7 @@ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/

 # Start building...

+# Icelake AVX512 SHA VAES
 ./clean-all.sh || echo clean
 rm -f config.status
 ./autogen.sh || echo done
@@ -48,6 +49,7 @@ make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe

+# Zen1 AVX2 SHA
 make clean || echo clean
 rm -f config.status
 CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS
@@ -55,6 +57,16 @@ make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-zen.exe

+# Zen3 AVX2 SHA VAES
+make clean || echo clean
+rm -f config.status
+CFLAGS="-O3 -march=znver2 -mvaes -Wall" ./configure $CONFIGURE_ARGS
+# CFLAGS="-O3 -march=znver3 -Wall" ./configure $CONFIGURE_ARGS
+make -j 8
+strip -s cpuminer.exe
+mv cpuminer.exe release/cpuminer-zen3.exe
+
+# Skylake-X AVX512 AES
 # mingw won't compile avx512 without -fno-asynchronous-unwind-tables
 make clean || echo clean
 rm -f config.status
@@ -64,6 +76,7 @@ make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx512.exe

+# Haswell AVX2 AES
 make clean || echo clean
 rm -f config.status
 # GCC 9 doesn't include AES in -march=core-avx2
@@ -72,6 +85,7 @@ make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx2.exe

+# Sandybridge AVX AES
 make clean || echo clean
 rm -f config.status
 # -march=corei7-avx still includes aes, but just in case
@@ -80,6 +94,7 @@ make -j 8
 strip -s cpuminer.exe
 mv cpuminer.exe release/cpuminer-avx.exe

+# Westmere SSE4.2 AES
 # -march=westmere is supported in gcc5
 make clean || echo clean
 rm -f config.status
@@ -104,6 +119,7 @@ mv cpuminer.exe release/cpuminer-aes-sse42.exe
 #mv cpuminer.exe release/cpuminer-ssse3.exe
 #make clean || echo clean

+# Generic SSE2
 make clean || echo clean
 rm -f config.status
 CFLAGS="-O3 -msse2 -Wall" ./configure $CONFIGURE_ARGS
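For completeness, the primitive everything above funnels into is the one
wrapped by mm256_aesenc_2x128 in the simd-256.h hunk: a single VAES
instruction performing one AES round on each 128-bit half of a 256-bit
register. In a -mvaes build:

   #include <immintrin.h>

   // One AES encryption round per 128-bit lane, two lanes per instruction;
   // this is what lets a Zen3 or Icelake build hash two lanes at once.
   static inline __m256i aes_round_2x128( __m256i state, __m256i round_key )
   {
      return _mm256_aesenc_epi128( state, round_key );
   }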