From 3572cb53c49d650cdc6407fd605f2d6e20609c1b Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Thu, 2 Jan 2020 23:54:08 -0500 Subject: [PATCH] v3.11.0 --- Makefile.am | 4 + RELEASE_NOTES | 17 +- algo-gate-api.c | 1 + algo/blake/blake-hash-4way.h | 2 +- algo/blake/blake256-hash-4way.c | 5 +- algo/blake/blake2s-hash-4way.h | 3 +- algo/bmw/bmw512-4way.c | 3 +- algo/cubehash/cubehash_sse2.c | 22 +- algo/echo/aes_ni/hash.c | 18 +- algo/echo/aes_ni/hash.c.test | 620 ++++++++++++++++++++++ algo/echo/echo-hash-4way.c | 398 +++----------- algo/groestl/aes_ni/groestl-intr-aes.h | 6 +- algo/groestl/aes_ni/hash-groestl.c | 1 + algo/groestl/groestl-4way.c | 64 +++ algo/groestl/groestl-gate.c | 23 + algo/groestl/groestl-gate.h | 31 ++ algo/groestl/groestl.c | 31 +- algo/groestl/groestl256-hash-4way.c | 280 ++++++++++ algo/groestl/groestl256-hash-4way.h | 121 +++++ algo/groestl/groestl256-intr-4way.h | 492 ++++++++++++++++++ algo/groestl/groestl512-hash-4way.c | 114 ++++ algo/groestl/groestl512-hash-4way.h | 94 ++++ algo/groestl/groestl512-intr-4way.h | 654 +++++++++++++++++++++++ algo/groestl/myrgr-4way.c | 155 +++++- algo/groestl/myrgr-gate.c | 10 +- algo/groestl/myrgr-gate.h | 25 +- algo/hamsi/hamsi-hash-4way.c | 3 +- algo/hamsi/hamsi-hash-4way.h | 2 +- algo/haval/haval-4way-helper.c | 2 +- algo/haval/haval-hash-4way.c | 4 +- algo/haval/haval-hash-4way.h | 2 +- algo/jh/jh-hash-4way.h | 2 - algo/jh/jha-4way.c | 6 +- algo/keccak/keccak-hash-4way.h | 2 - algo/lyra2/allium-4way.c | 34 +- algo/lyra2/lyra2-gate.c | 2 +- algo/lyra2/lyra2h-4way.c | 2 +- algo/lyra2/lyra2rev2-4way.c | 12 +- algo/lyra2/lyra2rev3-4way.c | 13 +- algo/lyra2/lyra2z-4way.c | 8 +- algo/nist5/nist5-4way.c | 8 +- algo/quark/anime-4way.c | 20 +- algo/quark/hmq1725-4way.c | 217 ++++++-- algo/quark/hmq1725-gate.c | 2 +- algo/quark/quark-4way.c | 77 ++- algo/quark/quark-gate.c | 2 +- algo/qubit/qubit-2way.c | 60 ++- algo/qubit/qubit-gate.c | 2 +- algo/ripemd/lbry-4way.c | 54 +- algo/ripemd/lbry-gate.c | 2 +- algo/ripemd/lbry-gate.h | 13 +- algo/ripemd/ripemd-hash-4way.c | 8 +- algo/ripemd/ripemd-hash-4way.h | 10 +- algo/sha/sha-hash-4way.h | 38 +- algo/sha/sha256-hash-4way.c | 298 +++++++++-- algo/sha/sha256q-4way.c | 20 +- algo/sha/sha256t-4way.c | 16 +- algo/sha/sha512-hash-4way.c | 94 ++-- algo/shabal/shabal-hash-4way.h | 2 +- algo/shavite/shavite-hash-4way.c | 399 ++++++++++++++ algo/shavite/shavite-hash-4way.h | 25 + algo/skein/skein-4way.c | 62 +-- algo/skein/skein-hash-4way.h | 4 +- algo/skein/skein2-4way.c | 4 +- algo/sm3/sm3-hash-4way.c | 258 +++++++-- algo/sm3/sm3-hash-4way.h | 27 +- algo/x11/c11-4way.c | 112 ++-- algo/x11/c11-gate.c | 2 +- algo/x11/timetravel-4way.c | 10 +- algo/x11/timetravel10-4way.c | 10 +- algo/x11/tribus-4way.c | 26 +- algo/x11/tribus-gate.c | 2 +- algo/x11/x11-4way.c | 107 +++- algo/x11/x11-gate.c | 2 +- algo/x11/x11evo-4way.c | 10 +- algo/x11/x11gost-4way.c | 114 ++-- algo/x12/x12-4way.c | 122 +++-- algo/x12/x12-gate.c | 2 +- algo/x13/phi1612-4way.c | 30 +- algo/x13/phi1612-gate.c | 2 +- algo/x13/skunk-4way.c | 2 +- algo/x13/x13-4way.c | 114 ++-- algo/x13/x13-gate.c | 2 +- algo/x13/x13bcd-4way.c | 389 ++++++++++++-- algo/x13/x13sm3-4way.c | 16 +- algo/x13/x13sm3-gate.c | 8 +- algo/x13/x13sm3-gate.h | 38 +- algo/x14/polytimos-4way.c | 4 +- algo/x14/veltor-4way.c | 4 +- algo/x14/x14-4way.c | 116 +++-- algo/x14/x14-gate.c | 2 +- algo/x15/x15-4way.c | 122 +++-- algo/x15/x15-gate.c | 2 +- algo/x16/x16r-4way.c | 178 ++++--- algo/x16/x16r-gate.c | 19 +- algo/x16/x16rt-4way.c | 82 ++- algo/x16/x16rv2-4way.c | 83 ++- 
algo/x16/x21s-4way.c | 183 ++--- algo/x17/sonoa-4way.c | 417 +++++++++++++-- algo/x17/sonoa-gate.c | 2 +- algo/x17/x17-4way.c | 142 +++-- algo/x17/x17-gate.c | 2 +- algo/x17/xevan-4way.c | 135 ++++- algo/x17/xevan-gate.c | 2 +- algo/x22/x22i-4way.c | 89 +++- algo/x22/x22i-gate.c | 24 +- algo/x22/x22i-gate.h | 10 +- algo/x22/x25x-4way.c | 691 ++++++++++++++++++++++--- build-allarch.sh | 2 +- build-avx2.sh | 27 + clean-all.sh | 10 + configure | 20 +- configure.ac | 2 +- cpu-miner.c | 2 +- simd-utils/intrlv.h | 39 ++ simd-utils/simd-256.h | 21 +- simd-utils/simd-512.h | 36 +- winbuild-cross.sh | 7 +- 118 files changed, 7030 insertions(+), 1575 deletions(-) create mode 100644 algo/echo/aes_ni/hash.c.test create mode 100644 algo/groestl/groestl-4way.c create mode 100644 algo/groestl/groestl-gate.c create mode 100644 algo/groestl/groestl-gate.h create mode 100644 algo/groestl/groestl256-hash-4way.c create mode 100644 algo/groestl/groestl256-hash-4way.h create mode 100644 algo/groestl/groestl256-intr-4way.h create mode 100644 algo/groestl/groestl512-hash-4way.c create mode 100644 algo/groestl/groestl512-hash-4way.h create mode 100644 algo/groestl/groestl512-intr-4way.h create mode 100644 algo/shavite/shavite-hash-4way.c create mode 100644 algo/shavite/shavite-hash-4way.h create mode 100755 build-avx2.sh create mode 100755 clean-all.sh diff --git a/Makefile.am b/Makefile.am index 916bccb..fe5bcf7 100644 --- a/Makefile.am +++ b/Makefile.am @@ -87,8 +87,11 @@ cpuminer_SOURCES = \ algo/echo/echo-hash-4way.c \ algo/echo/aes_ni/hash.c\ algo/gost/sph_gost.c \ + algo/groestl/groestl-gate.c \ + algo/groestl/groestl512-hash-4way.c \ algo/groestl/sph_groestl.c \ algo/groestl/groestl.c \ + algo/groestl/groestl-4way.c \ algo/groestl/myrgr-gate.c \ algo/groestl/myrgr-4way.c \ algo/groestl/myr-groestl.c \ @@ -188,6 +191,7 @@ cpuminer_SOURCES = \ algo/shavite/sph_shavite.c \ algo/shavite/sph-shavite-aesni.c \ algo/shavite/shavite-hash-2way.c \ + algo/shavite/shavite-hash-4way.c \ algo/shavite/shavite.c \ algo/simd/sph_simd.c \ algo/simd/nist.c \ diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 51ded93..0228c77 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -21,7 +21,7 @@ required. Compile Instructions -------------------- -See INSTALL_LINUX or INSTALL_WINDOWS fror compile instruuctions +See INSTALL_LINUX or INSTALL_WINDOWS for compile instructions Requirements ------------ @@ -35,13 +35,26 @@ not supported. FreeBSD YMMV. Change Log ---------- +v3.11.0 + +Fixed x25x AVX512 lane 4 invalid shares. + +AVX512 for hex, phi2. + +VAES optimization for Intel Icelake CPUs for most algos recently optimized +with AVX512, source code only. + +v3.10.7 + +AVX512 for x25x, lbry, x13bcd (bcd). + +v3.10.6 Added support for SSL stratum: stratum+tcps:// Added job id reporting again, but leaner, suppressed with --quiet. -AVX512 for x21s, x22i, lyra2z, allium +AVX512 for x21s, x22i, lyra2z, allium. Fixed share overflow warnings mining lbry with Ryzen (SHA). 
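The "source code only" note on the VAES work above means these paths are selected at compile time, not by runtime CPU dispatch. The guard pattern is the one used by the new gate headers in this patch (shown here as it appears in algo/groestl/groestl-gate.h further below):

#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define GROESTL_4WAY_VAES 1
#endif

so a build has to target a CPU with both AVX-512 and vector AES for the 4-way VAES code to be compiled in at all; with gcc that would be something like -march=icelake-client, an illustrative flag, not one quoted from this patch's build scripts.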
diff --git a/algo-gate-api.c b/algo-gate-api.c index cebfc8b..f77ee29 100644 --- a/algo-gate-api.c +++ b/algo-gate-api.c @@ -317,6 +317,7 @@ const char* const algo_alias_map[][2] = { "argon2d-crds", "argon2d250" }, { "argon2d-dyn", "argon2d500" }, { "argon2d-uis", "argon2d4096" }, + { "bcd", "x13bcd" }, { "bitcore", "timetravel10" }, { "bitzeny", "yescryptr8" }, { "blake256r8", "blakecoin" }, diff --git a/algo/blake/blake-hash-4way.h b/algo/blake/blake-hash-4way.h index 9f389f6..091a537 100644 --- a/algo/blake/blake-hash-4way.h +++ b/algo/blake/blake-hash-4way.h @@ -104,7 +104,7 @@ typedef struct { typedef blake_8way_small_context blake256_8way_context; void blake256_8way_init(void *cc); void blake256_8way_update(void *cc, const void *data, size_t len); -#define blake256_8way blake256_8way_update +//#define blake256_8way blake256_8way_update void blake256_8way_close(void *cc, void *dst); // 14 rounds, blake, decred diff --git a/algo/blake/blake256-hash-4way.c b/algo/blake/blake256-hash-4way.c index f958659..3de0363 100644 --- a/algo/blake/blake256-hash-4way.c +++ b/algo/blake/blake256-hash-4way.c @@ -842,7 +842,8 @@ blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv, } static void -blake32_4way( blake_4way_small_context *ctx, const void *data, size_t len ) +blake32_4way( blake_4way_small_context *ctx, const void *data, + size_t len ) { __m128i *buf = (__m128i*)ctx->buf; size_t bptr = ctx->ptr<<2; @@ -1237,7 +1238,7 @@ blake256_4way_init(void *ctx) } void -blake256_4way(void *ctx, const void *data, size_t len) +blake256_4way_update(void *ctx, const void *data, size_t len) { blake32_4way(ctx, data, len); } diff --git a/algo/blake/blake2s-hash-4way.h b/algo/blake/blake2s-hash-4way.h index 953841f..baf2865 100644 --- a/algo/blake/blake2s-hash-4way.h +++ b/algo/blake/blake2s-hash-4way.h @@ -14,7 +14,6 @@ #ifndef __BLAKE2S_HASH_4WAY_H__ #define __BLAKE2S_HASH_4WAY_H__ 1 -//#if defined(__SSE4_2__) #if defined(__SSE2__) #include "simd-utils.h" @@ -132,6 +131,6 @@ int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen ); } #endif -#endif // __SSE4_2__ +#endif // __SSE2__ #endif diff --git a/algo/bmw/bmw512-4way.c b/algo/bmw/bmw512-4way.c index 2757fdd..795be11 100644 --- a/algo/bmw/bmw512-4way.c +++ b/algo/bmw/bmw512-4way.c @@ -41,7 +41,6 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce, for ( int lane = 0; lane < 8; lane++ ) if ( unlikely( hash7[ lane<<1 ] < Htarg ) ) -// if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) ) { extr_lane_8x64( lane_hash, hash, lane, 256 ); if ( fulltest( lane_hash, ptarget ) ) @@ -66,7 +65,7 @@ void bmw512hash_4way(void *state, const void *input) { bmw512_4way_context ctx; bmw512_4way_init( &ctx ); - bmw512_4way( &ctx, input, 80 ); + bmw512_4way_update( &ctx, input, 80 ); bmw512_4way_close( &ctx, state ); } diff --git a/algo/cubehash/cubehash_sse2.c b/algo/cubehash/cubehash_sse2.c index 8b9d010..c508248 100644 --- a/algo/cubehash/cubehash_sse2.c +++ b/algo/cubehash/cubehash_sse2.c @@ -21,7 +21,27 @@ static void transform( cubehashParam *sp ) int r; const int rounds = sp->rounds; -#ifdef __AVX2__ +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + + register __m512i x0, x1; + + x0 = _mm512_load_si512( (__m512i*)sp->x ); + x1 = _mm512_load_si512( (__m512i*)sp->x + 1 ); + + for ( r = 0; r < rounds; ++r ) + { + x1 = _mm512_add_epi32( x0, x1 ); + x0 = _mm512_xor_si512( mm512_rol_32( mm512_swap_256( x0 ), 7 ), x1 ); + x1 = _mm512_add_epi32( x0, mm512_swap128_64( x1 ) ); 
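/* A reading aid for this AVX-512 round: the 32 CubeHash state words live
   in two ZMM registers, x0 = x[0..15] and x1 = x[16..31], and the
   mm512_swap* calls are CubeHash's word exchanges done as lane permutes:
   mm512_swap_256 exchanges the two 256-bit halves of the register,
   mm512_swap128_64 the 64-bit halves within each 128-bit lane,
   mm512_swap256_128 the 128-bit halves within each 256-bit lane, and
   mm512_swap64_32 the 32-bit halves within each 64-bit lane. They come
   from simd-utils; a plausible definition of the last one, assuming the
   usual shuffle immediate encoding (an illustration, not copied from
   simd-utils/simd-512.h):

   #define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 )
*/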
+ x0 = _mm512_xor_si512( mm512_rol_32( + mm512_swap256_128( x0 ), 11 ), x1 ); + x1 = mm512_swap64_32( x1 ); + } + + _mm512_store_si512( (__m512i*)sp->x, x0 ); + _mm512_store_si512( (__m512i*)sp->x + 1, x1 ); + +#elif defined(__AVX2__) register __m256i x0, x1, x2, x3, y0, y1; diff --git a/algo/echo/aes_ni/hash.c b/algo/echo/aes_ni/hash.c index 7dd48e4..f736697 100644 --- a/algo/echo/aes_ni/hash.c +++ b/algo/echo/aes_ni/hash.c @@ -186,7 +186,7 @@ void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBloc { for(i = 0; i < 4; i++) { - _state[i][j] = _mm_loadu_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i); + _state[i][j] = _mm_load_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i); } } @@ -390,13 +390,13 @@ HashReturn final_echo(hashState_echo *state, BitSequence *hashval) } // Store the hash value - _mm_storeu_si128((__m128i*)hashval + 0, state->state[0][0]); - _mm_storeu_si128((__m128i*)hashval + 1, state->state[1][0]); + _mm_store_si128((__m128i*)hashval + 0, state->state[0][0]); + _mm_store_si128((__m128i*)hashval + 1, state->state[1][0]); if(state->uHashSize == 512) { - _mm_storeu_si128((__m128i*)hashval + 2, state->state[2][0]); - _mm_storeu_si128((__m128i*)hashval + 3, state->state[3][0]); + _mm_store_si128((__m128i*)hashval + 2, state->state[2][0]); + _mm_store_si128((__m128i*)hashval + 3, state->state[3][0]); } return SUCCESS; @@ -513,13 +513,13 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval, } // Store the hash value - _mm_storeu_si128( (__m128i*)hashval + 0, state->state[0][0] ); - _mm_storeu_si128( (__m128i*)hashval + 1, state->state[1][0] ); + _mm_store_si128( (__m128i*)hashval + 0, state->state[0][0] ); + _mm_store_si128( (__m128i*)hashval + 1, state->state[1][0] ); if( state->uHashSize == 512 ) { - _mm_storeu_si128( (__m128i*)hashval + 2, state->state[2][0] ); - _mm_storeu_si128( (__m128i*)hashval + 3, state->state[3][0] ); + _mm_store_si128( (__m128i*)hashval + 2, state->state[2][0] ); + _mm_store_si128( (__m128i*)hashval + 3, state->state[3][0] ); } return SUCCESS; diff --git a/algo/echo/aes_ni/hash.c.test b/algo/echo/aes_ni/hash.c.test new file mode 100644 index 0000000..41d5a57 --- /dev/null +++ b/algo/echo/aes_ni/hash.c.test @@ -0,0 +1,620 @@ +/* + * file : echo_vperm.c + * version : 1.0.208 + * date : 14.12.2010 + * + * - vperm and aes_ni implementations of hash function ECHO + * - implements NIST hash api + * - assumes that message length is a multiple of 8 bits + * - _ECHO_VPERM_ must be defined if compiling with ../main.c + * - define NO_AES_NI for aes_ni version + * + * Cagdas Calik + * ccalik@metu.edu.tr + * Institute of Applied Mathematics, Middle East Technical University, Turkey. 
+ * + */ +#if defined(__AES__) + +#include +#include "miner.h" +#include "hash_api.h" +//#include "vperm.h" +#include +/* +#ifndef NO_AES_NI +#include +#else +#include +#endif +*/ + +MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F}; +MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC}; +MYALIGN const unsigned int _k_opt[] = {0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121, 0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1}; +MYALIGN const unsigned int _k_inv[] = {0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309, 0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C}; +MYALIGN const unsigned int _k_sb1[] = {0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E, 0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1}; +MYALIGN const unsigned int _k_sb2[] = {0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955, 0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8}; +MYALIGN const unsigned int _k_sb3[] = {0xC0211A00, 0x53E17249, 0xA8B2DA89, 0xFB68933B, 0xF0030A00, 0x5FF35C55, 0xA6ACFAA5, 0xF956AF09}; +MYALIGN const unsigned int _k_sb4[] = {0x3FD64100, 0xE1E937A0, 0x49087E9F, 0xA876DE97, 0xC393EA00, 0x3D50AED7, 0x876D2914, 0xBA44FE79}; +MYALIGN const unsigned int _k_sb5[] = {0xF4867F00, 0x5072D62F, 0x5D228BDB, 0x0DA9A4F9, 0x3971C900, 0x0B487AC2, 0x8A43F0FB, 0x81B332B8}; +MYALIGN const unsigned int _k_sb7[] = {0xFFF75B00, 0xB20845E9, 0xE1BAA416, 0x531E4DAC, 0x3390E000, 0x62A3F282, 0x21C1D3B1, 0x43125170}; +MYALIGN const unsigned int _k_sbo[] = {0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A, 0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1}; +MYALIGN const unsigned int _k_h63[] = {0x63636363, 0x63636363, 0x63636363, 0x63636363}; +MYALIGN const unsigned int _k_hc6[] = {0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6}; +MYALIGN const unsigned int _k_h5b[] = {0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b}; +MYALIGN const unsigned int _k_h4e[] = {0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e}; +MYALIGN const unsigned int _k_h0e[] = {0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e}; +MYALIGN const unsigned int _k_h15[] = {0x15151515, 0x15151515, 0x15151515, 0x15151515}; +MYALIGN const unsigned int _k_aesmix1[] = {0x0f0a0500, 0x030e0904, 0x07020d08, 0x0b06010c}; +MYALIGN const unsigned int _k_aesmix2[] = {0x000f0a05, 0x04030e09, 0x0807020d, 0x0c0b0601}; +MYALIGN const unsigned int _k_aesmix3[] = {0x05000f0a, 0x0904030e, 0x0d080702, 0x010c0b06}; +MYALIGN const unsigned int _k_aesmix4[] = {0x0a05000f, 0x0e090403, 0x020d0807, 0x06010c0b}; + + +MYALIGN const unsigned int const1[] = {0x00000001, 0x00000000, 0x00000000, 0x00000000}; +MYALIGN const unsigned int mul2mask[] = {0x00001b00, 0x00000000, 0x00000000, 0x00000000}; +MYALIGN const unsigned int lsbmask[] = {0x01010101, 0x01010101, 0x01010101, 0x01010101}; +MYALIGN const unsigned int invshiftrows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c}; +MYALIGN const unsigned int zero[] = {0x00000000, 0x00000000, 0x00000000, 0x00000000}; +MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234}; + + +#define ECHO_SUBBYTES(state, i, j) \ + state[i][j] = _mm_aesenc_si128(state[i][j], k1);\ + state[i][j] = _mm_aesenc_si128(state[i][j], M128(zero));\ + k1 = _mm_add_epi32(k1, M128(const1)) + +#define ECHO_MIXBYTES(state1, state2, j, t1, t2, s2) \ + s2 = _mm_add_epi8(state1[0][j], state1[0][j]);\ + t1 = _mm_srli_epi16(state1[0][j], 7);\ + t1 = _mm_and_si128(t1, M128(lsbmask));\ + t2 = 
_mm_shuffle_epi8(M128(mul2mask), t1);\ + s2 = _mm_xor_si128(s2, t2);\ + state2[0][j] = s2;\ + state2[1][j] = state1[0][j];\ + state2[2][j] = state1[0][j];\ + state2[3][j] = _mm_xor_si128(s2, state1[0][j]);\ + s2 = _mm_add_epi8(state1[1][(j + 1) & 3], state1[1][(j + 1) & 3]);\ + t1 = _mm_srli_epi16(state1[1][(j + 1) & 3], 7);\ + t1 = _mm_and_si128(t1, M128(lsbmask));\ + t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\ + s2 = _mm_xor_si128(s2, t2);\ + state2[0][j] = _mm_xor_si128(state2[0][j], _mm_xor_si128(s2, state1[1][(j + 1) & 3]));\ + state2[1][j] = _mm_xor_si128(state2[1][j], s2);\ + state2[2][j] = _mm_xor_si128(state2[2][j], state1[1][(j + 1) & 3]);\ + state2[3][j] = _mm_xor_si128(state2[3][j], state1[1][(j + 1) & 3]);\ + s2 = _mm_add_epi8(state1[2][(j + 2) & 3], state1[2][(j + 2) & 3]);\ + t1 = _mm_srli_epi16(state1[2][(j + 2) & 3], 7);\ + t1 = _mm_and_si128(t1, M128(lsbmask));\ + t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\ + s2 = _mm_xor_si128(s2, t2);\ + state2[0][j] = _mm_xor_si128(state2[0][j], state1[2][(j + 2) & 3]);\ + state2[1][j] = _mm_xor_si128(state2[1][j], _mm_xor_si128(s2, state1[2][(j + 2) & 3]));\ + state2[2][j] = _mm_xor_si128(state2[2][j], s2);\ + state2[3][j] = _mm_xor_si128(state2[3][j], state1[2][(j + 2) & 3]);\ + s2 = _mm_add_epi8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\ + t1 = _mm_srli_epi16(state1[3][(j + 3) & 3], 7);\ + t1 = _mm_and_si128(t1, M128(lsbmask));\ + t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\ + s2 = _mm_xor_si128(s2, t2);\ + state2[0][j] = _mm_xor_si128(state2[0][j], state1[3][(j + 3) & 3]);\ + state2[1][j] = _mm_xor_si128(state2[1][j], state1[3][(j + 3) & 3]);\ + state2[2][j] = _mm_xor_si128(state2[2][j], _mm_xor_si128(s2, state1[3][(j + 3) & 3]));\ + state2[3][j] = _mm_xor_si128(state2[3][j], s2) + + +#define ECHO_ROUND_UNROLL2 \ + ECHO_SUBBYTES(_state, 0, 0);\ + ECHO_SUBBYTES(_state, 1, 0);\ + ECHO_SUBBYTES(_state, 2, 0);\ + ECHO_SUBBYTES(_state, 3, 0);\ + ECHO_SUBBYTES(_state, 0, 1);\ + ECHO_SUBBYTES(_state, 1, 1);\ + ECHO_SUBBYTES(_state, 2, 1);\ + ECHO_SUBBYTES(_state, 3, 1);\ + ECHO_SUBBYTES(_state, 0, 2);\ + ECHO_SUBBYTES(_state, 1, 2);\ + ECHO_SUBBYTES(_state, 2, 2);\ + ECHO_SUBBYTES(_state, 3, 2);\ + ECHO_SUBBYTES(_state, 0, 3);\ + ECHO_SUBBYTES(_state, 1, 3);\ + ECHO_SUBBYTES(_state, 2, 3);\ + ECHO_SUBBYTES(_state, 3, 3);\ + ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\ + ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\ + ECHO_SUBBYTES(_state2, 0, 0);\ + ECHO_SUBBYTES(_state2, 1, 0);\ + ECHO_SUBBYTES(_state2, 2, 0);\ + ECHO_SUBBYTES(_state2, 3, 0);\ + ECHO_SUBBYTES(_state2, 0, 1);\ + ECHO_SUBBYTES(_state2, 1, 1);\ + ECHO_SUBBYTES(_state2, 2, 1);\ + ECHO_SUBBYTES(_state2, 3, 1);\ + ECHO_SUBBYTES(_state2, 0, 2);\ + ECHO_SUBBYTES(_state2, 1, 2);\ + ECHO_SUBBYTES(_state2, 2, 2);\ + ECHO_SUBBYTES(_state2, 3, 2);\ + ECHO_SUBBYTES(_state2, 0, 3);\ + ECHO_SUBBYTES(_state2, 1, 3);\ + ECHO_SUBBYTES(_state2, 2, 3);\ + ECHO_SUBBYTES(_state2, 3, 3);\ + ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\ + ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2) + + + +#define SAVESTATE(dst, src)\ + dst[0][0] = src[0][0];\ + dst[0][1] = src[0][1];\ + dst[0][2] = src[0][2];\ + dst[0][3] = src[0][3];\ + dst[1][0] = src[1][0];\ + dst[1][1] = src[1][1];\ + dst[1][2] = src[1][2];\ + dst[1][3] = src[1][3];\ + dst[2][0] = src[2][0];\ + dst[2][1] = src[2][1];\ + dst[2][2] 
= src[2][2];\ + dst[2][3] = src[2][3];\ + dst[3][0] = src[3][0];\ + dst[3][1] = src[3][1];\ + dst[3][2] = src[3][2];\ + dst[3][3] = src[3][3] + + +void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBlockCount) +{ + unsigned int r, b, i, j; + __m128i t1, t2, s2, k1; + __m128i _state[4][4], _state2[4][4], _statebackup[4][4]; + + for(i = 0; i < 4; i++) + for(j = 0; j < ctx->uHashSize / 256; j++) + _state[i][j] = ctx->state[i][j]; + + for(b = 0; b < uBlockCount; b++) + { + ctx->k = _mm_add_epi64(ctx->k, ctx->const1536); + + // load message + for(j = ctx->uHashSize / 256; j < 4; j++) + { + for(i = 0; i < 4; i++) + { + _state[i][j] = _mm_loadu_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i); + } + } + +uint64_t *b = (uint64_t*)_state; +//printf("Ss3: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]); + + // save state + SAVESTATE(_statebackup, _state); + + k1 = ctx->k; + + for(r = 0; r < ctx->uRounds / 2; r++) + { + ECHO_ROUND_UNROLL2; + } + +//printf("Ss4: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]); + + + if(ctx->uHashSize == 256) + { + for(i = 0; i < 4; i++) + { + _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]); + _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]); + _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]); + _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]); + _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]); + _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]); + _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][3]); + } + } + else + { + for(i = 0; i < 4; i++) + { + _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]); + _state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]); + _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]); + _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]); + _state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]); + _state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]); + } + } + pmsg += ctx->uBlockLength; + } + SAVESTATE(ctx->state, _state); + +} + + + +HashReturn init_echo(hashState_echo *ctx, int nHashSize) +{ + int i, j; + + ctx->k = _mm_setzero_si128(); + ctx->processed_bits = 0; + ctx->uBufferBytes = 0; + + switch(nHashSize) + { + case 256: + ctx->uHashSize = 256; + ctx->uBlockLength = 192; + ctx->uRounds = 8; + ctx->hashsize = _mm_set_epi32(0, 0, 0, 0x00000100); + ctx->const1536 = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000600); + break; + + case 512: + ctx->uHashSize = 512; + ctx->uBlockLength = 128; + ctx->uRounds = 10; + ctx->hashsize = _mm_set_epi32(0, 0, 0, 0x00000200); + ctx->const1536 = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000400); + break; + + default: + return BAD_HASHBITLEN; + } + + + for(i = 0; i < 4; i++) + for(j = 0; j < nHashSize / 256; j++) + ctx->state[i][j] = ctx->hashsize; + + for(i = 0; i < 4; i++) + for(j = nHashSize / 256; j < 4; j++) + ctx->state[i][j] = _mm_set_epi32(0, 0, 0, 0); + + return SUCCESS; +} + +HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen) +{ + unsigned int uByteLength, uBlockCount, uRemainingBytes; + + uByteLength = (unsigned int)(databitlen / 8); + + if((state->uBufferBytes + uByteLength) >= state->uBlockLength) + { + if(state->uBufferBytes != 0) + { + // Fill the buffer + memcpy(state->buffer + state->uBufferBytes, (void*)data, state->uBlockLength - state->uBufferBytes); + + // Process buffer + Compress(state, state->buffer, 1); + state->processed_bits += 
state->uBlockLength * 8; + + data += state->uBlockLength - state->uBufferBytes; + uByteLength -= state->uBlockLength - state->uBufferBytes; + } + + // buffer now does not contain any unprocessed bytes + + uBlockCount = uByteLength / state->uBlockLength; + uRemainingBytes = uByteLength % state->uBlockLength; + + if(uBlockCount > 0) + { + Compress(state, data, uBlockCount); + + state->processed_bits += uBlockCount * state->uBlockLength * 8; + data += uBlockCount * state->uBlockLength; + } + + if(uRemainingBytes > 0) + { + memcpy(state->buffer, (void*)data, uRemainingBytes); + } + + state->uBufferBytes = uRemainingBytes; + } + else + { + memcpy(state->buffer + state->uBufferBytes, (void*)data, uByteLength); + state->uBufferBytes += uByteLength; + } + + return SUCCESS; +} + +HashReturn final_echo(hashState_echo *state, BitSequence *hashval) +{ + __m128i remainingbits; + + // Add remaining bytes in the buffer + state->processed_bits += state->uBufferBytes * 8; + + remainingbits = _mm_set_epi32(0, 0, 0, state->uBufferBytes * 8); + + // Pad with 0x80 + state->buffer[state->uBufferBytes++] = 0x80; + + // Enough buffer space for padding in this block? + if((state->uBlockLength - state->uBufferBytes) >= 18) + { + // Pad with zeros + memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18)); + + // Hash size + *((unsigned short*)(state->buffer + state->uBlockLength - 18)) = state->uHashSize; + + // Processed bits + *((DataLength*)(state->buffer + state->uBlockLength - 16)) = state->processed_bits; + *((DataLength*)(state->buffer + state->uBlockLength - 8)) = 0; + + // Last block contains message bits? + if(state->uBufferBytes == 1) + { + state->k = _mm_xor_si128(state->k, state->k); + state->k = _mm_sub_epi64(state->k, state->const1536); + } + else + { + state->k = _mm_add_epi64(state->k, remainingbits); + state->k = _mm_sub_epi64(state->k, state->const1536); + } + + // Compress + Compress(state, state->buffer, 1); + } + else + { + // Fill with zero and compress + memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - state->uBufferBytes); + state->k = _mm_add_epi64(state->k, remainingbits); + state->k = _mm_sub_epi64(state->k, state->const1536); + Compress(state, state->buffer, 1); + + // Last block + memset(state->buffer, 0, state->uBlockLength - 18); + + // Hash size + *((unsigned short*)(state->buffer + state->uBlockLength - 18)) = state->uHashSize; + + // Processed bits + *((DataLength*)(state->buffer + state->uBlockLength - 16)) = state->processed_bits; + *((DataLength*)(state->buffer + state->uBlockLength - 8)) = 0; + + // Compress the last block + state->k = _mm_xor_si128(state->k, state->k); + state->k = _mm_sub_epi64(state->k, state->const1536); + Compress(state, state->buffer, 1); + } + + // Store the hash value + _mm_storeu_si128((__m128i*)hashval + 0, state->state[0][0]); + _mm_storeu_si128((__m128i*)hashval + 1, state->state[1][0]); + + if(state->uHashSize == 512) + { + _mm_storeu_si128((__m128i*)hashval + 2, state->state[2][0]); + _mm_storeu_si128((__m128i*)hashval + 3, state->state[3][0]); + } + + return SUCCESS; +} + +HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval, + const BitSequence *data, DataLength databitlen ) +{ + unsigned int uByteLength, uBlockCount, uRemainingBytes; + + uByteLength = (unsigned int)(databitlen / 8); + +/* + if( (state->uBufferBytes + uByteLength) >= state->uBlockLength ) + { +printf("full block\n"); + if( state->uBufferBytes != 0 ) + { + // Fill the buffer + memcpy( state->buffer 
+ state->uBufferBytes, + (void*)data, state->uBlockLength - state->uBufferBytes ); + + // Process buffer + Compress( state, state->buffer, 1 ); + state->processed_bits += state->uBlockLength * 8; + + data += state->uBlockLength - state->uBufferBytes; + uByteLength -= state->uBlockLength - state->uBufferBytes; + } + + // buffer now does not contain any unprocessed bytes + + uBlockCount = uByteLength / state->uBlockLength; + uRemainingBytes = uByteLength % state->uBlockLength; + + if( uBlockCount > 0 ) + { + Compress( state, data, uBlockCount ); + state->processed_bits += uBlockCount * state->uBlockLength * 8; + data += uBlockCount * state->uBlockLength; + } + + if( uRemainingBytes > 0 ) + memcpy(state->buffer, (void*)data, uRemainingBytes); + + state->uBufferBytes = uRemainingBytes; + } + else + { +*/ + memcpy( state->buffer + state->uBufferBytes, (void*)data, uByteLength ); + state->uBufferBytes += uByteLength; +// } + + __m128i remainingbits; + + // Add remaining bytes in the buffer + state->processed_bits += state->uBufferBytes * 8; + + remainingbits = _mm_set_epi32( 0, 0, 0, state->uBufferBytes * 8 ); + + // Pad with 0x80 + state->buffer[state->uBufferBytes++] = 0x80; + + // Enough buffer space for padding in this block? + +// if( (state->uBlockLength - state->uBufferBytes) >= 18 ) +// { + // Pad with zeros + + memset( state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18) ); + + // Hash size + *( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) = state->uHashSize; + + // Processed bits + *( (DataLength*)(state->buffer + state->uBlockLength - 16) ) = + state->processed_bits; + *( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0; + + + // Last block contains message bits? + if( state->uBufferBytes == 1 ) + { + state->k = _mm_xor_si128( state->k, state->k ); + state->k = _mm_sub_epi64( state->k, state->const1536 ); + } + else + { + state->k = _mm_add_epi64( state->k, remainingbits ); + state->k = _mm_sub_epi64( state->k, state->const1536 ); + } + +uint64_t *b = (uint64_t*)&state->k; +/* +printf("Sk: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]); +b = (uint64_t*)state->buffer; +printf("Sb: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]); +printf("Sb: %016lx %016lx %016lx %016lx\n",b[4],b[5],b[6],b[7]); +printf("Sb: %016lx %016lx %016lx %016lx\n",b[8],b[9],b[10],b[11]); +printf("Sb: %016lx %016lx %016lx %016lx\n",b[12],b[13],b[14],b[15]); + +b = (uint64_t*)state->state; +printf("Ss1: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]); +printf("Ss1: %016lx %016lx %016lx %016lx\n",b[4],b[5],b[6],b[7]); +printf("Ss1: %016lx %016lx %016lx %016lx\n",b[8],b[9],b[10],b[11]); +printf("Ss1: %016lx %016lx %016lx %016lx\n",b[12],b[13],b[14],b[15]); +*/ + // Compress + Compress( state, state->buffer, 1 ); + +//printf("Ss2: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]); + + +/* + } + else + { + // Fill with zero and compress + memset( state->buffer + state->uBufferBytes, 0, + state->uBlockLength - state->uBufferBytes ); + state->k = _mm_add_epi64( state->k, remainingbits ); + state->k = _mm_sub_epi64( state->k, state->const1536 ); + Compress( state, state->buffer, 1 ); + + // Last block + memset( state->buffer, 0, state->uBlockLength - 18 ); + + // Hash size + *( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) = + state->uHashSize; + + // Processed bits + *( (DataLength*)(state->buffer + state->uBlockLength - 16) ) = + state->processed_bits; + *( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0; + // 
Compress the last block + state->k = _mm_xor_si128( state->k, state->k ); + state->k = _mm_sub_epi64( state->k, state->const1536 ); + Compress( state, state->buffer, 1) ; + } +*/ + + // Store the hash value + _mm_storeu_si128( (__m128i*)hashval + 0, state->state[0][0] ); + _mm_storeu_si128( (__m128i*)hashval + 1, state->state[1][0] ); + + if( state->uHashSize == 512 ) + { + _mm_storeu_si128( (__m128i*)hashval + 2, state->state[2][0] ); + _mm_storeu_si128( (__m128i*)hashval + 3, state->state[3][0] ); + + } + return SUCCESS; +} + + +HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval) +{ + HashReturn hRet; + hashState_echo hs; + + ///// + /* + __m128i a, b, c, d, t[4], u[4], v[4]; + + a = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100); + b = _mm_set_epi32(0x1f1e1d1c, 0x1b1a1918, 0x17161514, 0x13121110); + c = _mm_set_epi32(0x2f2e2d2c, 0x2b2a2928, 0x27262524, 0x23222120); + d = _mm_set_epi32(0x3f3e3d3c, 0x3b3a3938, 0x37363534, 0x33323130); + + t[0] = _mm_unpacklo_epi8(a, b); + t[1] = _mm_unpackhi_epi8(a, b); + t[2] = _mm_unpacklo_epi8(c, d); + t[3] = _mm_unpackhi_epi8(c, d); + + u[0] = _mm_unpacklo_epi16(t[0], t[2]); + u[1] = _mm_unpackhi_epi16(t[0], t[2]); + u[2] = _mm_unpacklo_epi16(t[1], t[3]); + u[3] = _mm_unpackhi_epi16(t[1], t[3]); + + + t[0] = _mm_unpacklo_epi16(u[0], u[1]); + t[1] = _mm_unpackhi_epi16(u[0], u[1]); + t[2] = _mm_unpacklo_epi16(u[2], u[3]); + t[3] = _mm_unpackhi_epi16(u[2], u[3]); + + u[0] = _mm_unpacklo_epi8(t[0], t[1]); + u[1] = _mm_unpackhi_epi8(t[0], t[1]); + u[2] = _mm_unpacklo_epi8(t[2], t[3]); + u[3] = _mm_unpackhi_epi8(t[2], t[3]); + + a = _mm_unpacklo_epi8(u[0], u[1]); + b = _mm_unpackhi_epi8(u[0], u[1]); + c = _mm_unpacklo_epi8(u[2], u[3]); + d = _mm_unpackhi_epi8(u[2], u[3]); + */ + ///// + + hRet = init_echo(&hs, hashbitlen); + if(hRet != SUCCESS) + return hRet; + + hRet = update_echo(&hs, data, databitlen); + if(hRet != SUCCESS) + return hRet; + + hRet = final_echo(&hs, hashval); + if(hRet != SUCCESS) + return hRet; + + return SUCCESS; +} + +#endif diff --git a/algo/echo/echo-hash-4way.c b/algo/echo/echo-hash-4way.c index f8f408a..10a4f71 100644 --- a/algo/echo/echo-hash-4way.c +++ b/algo/echo/echo-hash-4way.c @@ -1,78 +1,37 @@ -#if defined(__AVX512VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +//#if 0 +#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #include "simd-utils.h" #include "echo-hash-4way.h" /* -#include -#include "miner.h" -#include "hash_api.h" -//#include "vperm.h" -#include +static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) = +{ + 0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, + 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234 +}; */ -/* -#ifndef NO_AES_NI -#include -#else -#include -#endif -*/ - -// not used -/* -const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F}; -const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC}; -const unsigned int _k_opt[] = {0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121, 0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1}; -const unsigned int _k_inv[] = {0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309, 0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C}; -const unsigned int _k_sb1[] = {0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E, 0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1}; -const unsigned int _k_sb2[] = 
{0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955, 0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8}; -const unsigned int _k_sb3[] = {0xC0211A00, 0x53E17249, 0xA8B2DA89, 0xFB68933B, 0xF0030A00, 0x5FF35C55, 0xA6ACFAA5, 0xF956AF09}; -const unsigned int _k_sb4[] = {0x3FD64100, 0xE1E937A0, 0x49087E9F, 0xA876DE97, 0xC393EA00, 0x3D50AED7, 0x876D2914, 0xBA44FE79}; -const unsigned int _k_sb5[] = {0xF4867F00, 0x5072D62F, 0x5D228BDB, 0x0DA9A4F9, 0x3971C900, 0x0B487AC2, 0x8A43F0FB, 0x81B332B8}; -const unsigned int _k_sb7[] = {0xFFF75B00, 0xB20845E9, 0xE1BAA416, 0x531E4DAC, 0x3390E000, 0x62A3F282, 0x21C1D3B1, 0x43125170}; -const unsigned int _k_sbo[] = {0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A, 0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1}; -const unsigned int _k_h63[] = {0x63636363, 0x63636363, 0x63636363, 0x63636363}; -const unsigned int _k_hc6[] = {0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6}; -const unsigned int _k_h5b[] = {0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b}; -const unsigned int _k_h4e[] = {0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e}; -const unsigned int _k_h0e[] = {0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e}; -const unsigned int _k_h15[] = {0x15151515, 0x15151515, 0x15151515, 0x15151515}; -const unsigned int _k_aesmix1[] = {0x0f0a0500, 0x030e0904, 0x07020d08, 0x0b06010c}; -const unsigned int _k_aesmix2[] = {0x000f0a05, 0x04030e09, 0x0807020d, 0x0c0b0601}; -const unsigned int _k_aesmix3[] = {0x05000f0a, 0x0904030e, 0x0d080702, 0x010c0b06}; -const unsigned int _k_aesmix4[] = {0x0a05000f, 0x0e090403, 0x020d0807, 0x06010c0b}; -*/ - -/* -MYALIGN const unsigned int const1[] = {0x00000001, 0x00000000, 0x00000000, 0x00000000}; -MYALIGN const unsigned int mul2mask[] = {0x00001b00, 0x00000000, 0x00000000, 0x00000000}; -MYALIGN const unsigned int lsbmask[] = {0x01010101, 0x01010101, 0x01010101, 0x01010101}; -MYALIGN const unsigned int invshiftrows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c}; -MYALIGN const unsigned int zero[] = {0x00000000, 0x00000000, 0x00000000, 0x00000000}; -*/ - -MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234}; - // do these need to be reversed? 
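/* mul2mask and lsbmask below implement byte-wise GF(2^8) doubling -- the
   AES xtime operation -- inside ECHO_MIXBYTES: _mm512_add_epi8( x, x )
   shifts every byte left one bit, the srli/and pair turns each byte's old
   msb into 0 or 1, and _mm512_shuffle_epi8( mul2mask, t1 ) uses that 0/1
   as a byte index, so byte index 1 of each 128-bit lane must hold the
   reduction constant 0x1b. The _mm512_set4_epi32 ordering chosen below
   provides exactly that: element 0 is the last argument, so every lane
   gets 0x00001b00 in its low 32-bit word. A scalar equivalent for a
   single byte, illustrative only and not part of the patch: */

static inline unsigned char echo_xtime_sketch( unsigned char b )
{
   /* double b in GF(2^8), reducing by the AES polynomial x^8+x^4+x^3+x+1 */
   return (unsigned char)( ( b << 1 ) ^ ( ( b >> 7 ) ? 0x1b : 0 ) );
}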
#define mul2mask \ - m512_const4_32( 0x00001b00, 0, 0, 0 ) + _mm512_set4_epi32( 0, 0, 0, 0x00001b00 ) +// _mm512_set4_epi32( 0x00001b00, 0, 0, 0 ) #define lsbmask m512_const1_32( 0x01010101 ) #define ECHO_SUBBYTES( state, i, j ) \ state[i][j] = _mm512_aesenc_epi128( state[i][j], k1 ); \ state[i][j] = _mm512_aesenc_epi128( state[i][j], m512_zero ); \ - k1 = _mm512_add_epi32( k1, m512_one_32 ) + k1 = _mm512_add_epi32( k1, m512_one_128 ); #define ECHO_MIXBYTES( state1, state2, j, t1, t2, s2 ) do \ { \ - const int j1 = ( j+1 ) & 3; \ - const int j2 = ( j+2 ) & 3; \ - const int j3 = ( j+3 ) & 3; \ + const int j1 = ( (j)+1 ) & 3; \ + const int j2 = ( (j)+2 ) & 3; \ + const int j3 = ( (j)+3 ) & 3; \ s2 = _mm512_add_epi8( state1[ 0 ] [j ], state1[ 0 ][ j ] ); \ t1 = _mm512_srli_epi16( state1[ 0 ][ j ], 7 ); \ - t1 = _mm512_and_si128( t1, lsbmask );\ + t1 = _mm512_and_si512( t1, lsbmask );\ t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \ s2 = _mm512_xor_si512( s2, t2 ); \ state2[ 0 ] [j ] = s2; \ @@ -97,7 +56,7 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2 state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \ state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \ _mm512_xor_si512( s2, state1[ 2 ][ j2 ] ) ); \ - state2[ 2 ][ j ] = _mm512_xor_si512128( state2[ 2 ][ j ], s2 ); \ + state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \ state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \ s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \ t1 = _mm512_srli_epi16( state1[ 3 ][ j3 ], 7 ); \ @@ -108,12 +67,12 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2 state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \ state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \ _mm512_xor_si512( s2, state1[ 3 ][ j3] ) ); \ - state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ) + state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \ } while(0) #define ECHO_ROUND_UNROLL2 \ ECHO_SUBBYTES(_state, 0, 0);\ - ECHO_SUBBYTES(_state, 1, 0);\ + ECHO_SUBBYTES(_state, 1, 0);\ ECHO_SUBBYTES(_state, 2, 0);\ ECHO_SUBBYTES(_state, 3, 0);\ ECHO_SUBBYTES(_state, 0, 1);\ @@ -153,8 +112,6 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2 ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\ ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2) - - #define SAVESTATE(dst, src)\ dst[0][0] = src[0][0];\ dst[0][1] = src[0][1];\ @@ -173,33 +130,44 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2 dst[3][2] = src[3][2];\ dst[3][3] = src[3][3] - -void echo_4way_compress( echo_4way_context *ctx, const unsigned char *pmsg, +// blockcount always 1 +void echo_4way_compress( echo_4way_context *ctx, const __m512i *pmsg, unsigned int uBlockCount ) { unsigned int r, b, i, j; __m512i t1, t2, s2, k1; __m512i _state[4][4], _state2[4][4], _statebackup[4][4]; -// unroll - for ( i = 0; i < 4; i++ ) - for ( j = 0; j < ctx->uHashSize / 256; j++ ) - _state[ i ][ j ] = ctx->state[ i ][ j ]; + _state[ 0 ][ 0 ] = ctx->state[ 0 ][ 0 ]; + _state[ 0 ][ 1 ] = ctx->state[ 0 ][ 1 ]; + _state[ 0 ][ 2 ] = ctx->state[ 0 ][ 2 ]; + _state[ 0 ][ 3 ] = ctx->state[ 0 ][ 3 ]; + _state[ 1 ][ 0 ] = ctx->state[ 1 ][ 0 ]; + _state[ 1 ][ 1 ] = ctx->state[ 1 ][ 1 ]; + _state[ 1 ][ 2 ] = ctx->state[ 1 ][ 2 ]; + _state[ 1 ][ 3 ] = ctx->state[ 1 ][ 3 ]; + _state[ 2 ][ 0 ] = ctx->state[ 2 ][ 0 ]; + _state[ 2 ][ 1 ] = ctx->state[ 2 ][ 1 ]; + _state[ 2 ][ 2 ] = 
ctx->state[ 2 ][ 2 ]; + _state[ 2 ][ 3 ] = ctx->state[ 2 ][ 3 ]; + _state[ 3 ][ 0 ] = ctx->state[ 3 ][ 0 ]; + _state[ 3 ][ 1 ] = ctx->state[ 3 ][ 1 ]; + _state[ 3 ][ 2 ] = ctx->state[ 3 ][ 2 ]; + _state[ 3 ][ 3 ] = ctx->state[ 3 ][ 3 ]; for ( b = 0; b < uBlockCount; b++ ) { ctx->k = _mm512_add_epi64( ctx->k, ctx->const1536 ); - // load message, make aligned, remove loadu for( j = ctx->uHashSize / 256; j < 4; j++ ) { for ( i = 0; i < 4; i++ ) { - _state[ i ][ j ] = _mm512_loadu_si512( - (__m512i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i ); + _state[ i ][ j ] = _mm512_load_si512( + pmsg + 4 * (j - (ctx->uHashSize / 256)) + i ); } } - + // save state SAVESTATE( _statebackup, _state ); @@ -254,8 +222,6 @@ void echo_4way_compress( echo_4way_context *ctx, const unsigned char *pmsg, } - - int echo_4way_init( echo_4way_context *ctx, int nHashSize ) { int i, j; @@ -270,23 +236,22 @@ int echo_4way_init( echo_4way_context *ctx, int nHashSize ) ctx->uHashSize = 256; ctx->uBlockLength = 192; ctx->uRounds = 8; - ctx->hashsize = _mm512_const4_32( 0, 0, 0, 0x100 ); - ctx->const1536 = _mm512_const4_32( 0, 0, 0, 0x600 ); + ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x100 ); + ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x600 ); break; case 512: ctx->uHashSize = 512; ctx->uBlockLength = 128; ctx->uRounds = 10; - ctx->hashsize = _mm512_const4_32( 0, 0, 0, 0x200 ); - ctx->const1536 = _mm512_const4_32( 0, 0, 0, 0x400); + ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x200 ); + ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x400); break; default: - return BAD_HASHBITLEN; + return 1; } - for( i = 0; i < 4; i++ ) for( j = 0; j < nHashSize / 256; j++ ) ctx->state[ i ][ j ] = ctx->hashsize; @@ -295,263 +260,56 @@ int echo_4way_init( echo_4way_context *ctx, int nHashSize ) for( j = nHashSize / 256; j < 4; j++ ) ctx->state[ i ][ j ] = m512_zero; - return SUCCESS; -} - -int echo_4way_update( echo_4way_context *state, const BitSequence *data, DataLength databitlen ) -{ - unsigned int uByteLength, uBlockCount, uRemainingBytes; - - uByteLength = (unsigned int)(databitlen / 8); - - if ( ( state->uBufferBytes + uByteLength ) >= state->uBlockLength ) - { - if ( state->uBufferBytes != 0 ) - { - // Fill the buffer - memcpy( state->buffer + state->uBufferBytes, - (void*)data, state->uBlockLength - state->uBufferBytes ); - - // Process buffer - echo_4way_compress( state, state->buffer, 1 ); - state->processed_bits += state->uBlockLength * 8; - - data += state->uBlockLength - state->uBufferBytes; - uByteLength -= state->uBlockLength - state->uBufferBytes; - } - - // buffer now does not contain any unprocessed bytes - - uBlockCount = uByteLength / state->uBlockLength; - uRemainingBytes = uByteLength % state->uBlockLength; - - if ( uBlockCount > 0 ) - { - echo_4way_compress( state, data, uBlockCount ); - - state->processed_bits += uBlockCount * state->uBlockLength * 8; - data += uBlockCount * state->uBlockLength; - } - - if ( uRemainingBytes > 0 ) - { - memcpy( state->buffer, (void*)data, uRemainingBytes ); - } - - state->uBufferBytes = uRemainingBytes; - } - else - { - memcpy( state->buffer + state->uBufferBytes, (void*)data, uByteLength ); - state->uBufferBytes += uByteLength; - } - return 0; } -echo_4way_close( echo_4way_context *state, BitSequence *hashval ) +int echo_4way_update_close( echo_4way_context *state, void *hashval, + const void *data, int databitlen ) { - __m512i remainingbits; +// bytelen is either 32 (maybe), 64 or 80 or 128! +// all are less than full block. 
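/* Length bookkeeping in the 4-way code below is in 128-bit words per lane:
   each __m512i of the interleaved buffer carries one 16-byte word from
   each of the four lanes, so vlen = databitlen/128 is the number of
   message words per lane and vblen = uBlockLength/16 the block size in
   words (8 for the 1024-bit ECHO-512 block). An 80-byte block header, for
   example, gives vlen = 640/128 = 5 of 8 words, leaving room for the 0x80
   pad byte, the hash size and the bit count written below. */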
- // Add remaining bytes in the buffer - state->processed_bits += state->uBufferBytes * 8; + int vlen = databitlen / 128; // * 4 lanes / 128 bits per lane + const int vblen = state->uBlockLength / 16; // 16 bytes per lane + __m512i remainingbits; - remainingbits = _mm512_set4_epi32( 0, 0, 0, state->uBufferBytes * 8 ); - - // Pad with 0x80 - state->buffer[ state->uBufferBytes++ ] = 0x80; - - // Enough buffer space for padding in this block? - if ( ( state->uBlockLength - state->uBufferBytes ) >= 18) - { - // Pad with zeros - memset( state->buffer + state->uBufferBytes, 0, - state->uBlockLength - ( state->uBufferBytes + 18 ) ); - - // Hash size - *( (unsigned short*)( state->buffer + state->uBlockLength - 18 ) ) - = state->uHashSize; - - // Processed bits - *( ( DataLength*)( state->buffer + state->uBlockLength - 16 ) ) - = state->processed_bits; - *( ( DataLength*)( state->buffer + state->uBlockLength - 8 ) ) = 0; - - // Last block contains message bits? - if ( state->uBufferBytes == 1 ) - { - state->k = _mm512_xor_si512( state->k, state->k ); - state->k = _mm512_sub_epi64( state->k, state->const1536 ); - } - else - { - state->k = _mm512_add_epi64( state->k, remainingbits ); - state->k = _mm512_sub_epi64( state->k, state->const1536 ); - } - - // Compress - echo_4way_compress( state, state->buffer, 1 ); - } - else - { - // Fill with zero and compress - memset( state->buffer + state->uBufferBytes, 0, - state->uBlockLength - state->uBufferBytes ); - state->k = _mm512_add_epi64( state->k, remainingbits ); - state->k = _mm512_sub_epi64( state->k, state->const1536 ); - echo_4way_compress( state, state->buffer, 1 ); - - // Last block - memset( state->buffer, 0, state->uBlockLength - 18 ); - - // Hash size - *( (unsigned short*)( state->buffer + state->uBlockLength - 18 ) ) - = state->uHashSize; - - // Processed bits - *( (DataLength*)( state->buffer + state->uBlockLength - 16 ) ) - = state->processed_bits; - *( (DataLength*)( state->buffer + state->uBlockLength - 8 ) ) = 0; - - // Compress the last block - state->k = _mm512_xor_si512(state->k, state->k); - state->k = _mm512_sub_epi64(state->k, state->const1536); - echo_4way_compress(state, state->buffer, 1); - } - - // Store the hash value - _mm512_storeu_si512( (__m512i*)hashval + 0, state->state[ 0][ 0 ]); - _mm512_storeu_si512( (__m512i*)hashval + 1, state->state[ 1][ 0 ]); - - if ( state->uHashSize == 512 ) - { - _mm512_storeu_si512((__m512i*)hashval + 2, state->state[ 2 ][ 0 ]); - _mm512_storeu_si512((__m512i*)hashval + 3, state->state[ 3 ][ 0 ]); - } - - return 0; -} - -int echo_4way_update_close( echo_4way_context *state, BitSequence *hashval, - const BitSequence *data, DataLength databitlen ) -{ - unsigned int uByteLength, uBlockCount, uRemainingBytes; - - uByteLength = (unsigned int)(databitlen / 8); - - if ( (state->uBufferBytes + uByteLength) >= state->uBlockLength ) - { - if ( state->uBufferBytes != 0 ) - { - // Fill the buffer - memcpy( state->buffer + state->uBufferBytes, - (void*)data, state->uBlockLength - state->uBufferBytes ); - - // Process buffer - echo_4way_compress( state, state->buffer, 1 ); - state->processed_bits += state->uBlockLength * 8; - - data += state->uBlockLength - state->uBufferBytes; - uByteLength -= state->uBlockLength - state->uBufferBytes; - } - - // buffer now does not contain any unprocessed bytes - - uBlockCount = uByteLength / state->uBlockLength; - uRemainingBytes = uByteLength % state->uBlockLength; - - if ( uBlockCount > 0 ) - { - echo_4way_compress( state, data, uBlockCount ); - state->processed_bits 
+= uBlockCount * state->uBlockLength * 8; - data += uBlockCount * state->uBlockLength; - } - - if ( uRemainingBytes > 0 ) - memcpy(state->buffer, (void*)data, uRemainingBytes); - state->uBufferBytes = uRemainingBytes; - } - else - { - memcpy( state->buffer + state->uBufferBytes, (void*)data, uByteLength ); - state->uBufferBytes += uByteLength; - } - - __m512i remainingbits; - - // Add remaining bytes in the buffer - state->processed_bits += state->uBufferBytes * 8; - - remainingbits = _mm512_set4_epi32( 0, 0, 0, state->uBufferBytes * 8 ); - - // Pad with 0x80 - state->buffer[ state->uBufferBytes++ ] = 0x80; - // Enough buffer space for padding in this block? - if ( (state->uBlockLength - state->uBufferBytes) >= 18 ) + if ( databitlen == 1024 ) { - // Pad with zeros - memset( state->buffer + state->uBufferBytes, 0,i - state->uBlockLength - (state->uBufferBytes + 18) ); + echo_4way_compress( state, data, 1 ); + state->processed_bits = 1024; + remainingbits = m512_zero; + vlen = 0; + } + else + { + vlen = databitlen / 128; // * 4 lanes / 128 bits per lane + memcpy_512( state->buffer, data, vlen ); + + state->processed_bits += (unsigned int)( databitlen ); + remainingbits = _mm512_set4_epi32( 0, 0, 0, databitlen ); - // Hash size - *( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) - = state->uHashSize; + } - // Processed bits - *( (DataLength*)(state->buffer + state->uBlockLength - 16) ) = - state->processed_bits; - *( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0; + state->buffer[ vlen ] = _mm512_set4_epi32( 0, 0, 0, 0x80 ); + memset_zero_512( state->buffer + vlen + 1, vblen - vlen - 2 ); + state->buffer[ vblen-2 ] = + _mm512_set4_epi32( (uint32_t)state->uHashSize << 16, 0, 0, 0 ); + state->buffer[ vblen-1 ] = + _mm512_set4_epi64( 0, state->processed_bits, + 0, state->processed_bits ); - // Last block contains message bits? 
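/* On the k arithmetic in the replacement code below: echo_4way_compress()
   begins every block with k += const1536 (the block size in bits), so
   finalization first adds the message bits actually present
   (remainingbits) and then subtracts const1536 to cancel the increment the
   final compress will apply, leaving k at the true message bit count for
   ECHO's last block. */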
- if( state->uBufferBytes == 1 ) - { - state->k = _mm512_xor_si512( state->k, state->k ); - state->k = _mm512_sub_epi64( state->k, state->const1536 ); - } - else - { - state->k = _mm_add_epi64( state->k, remainingbits ); - state->k = _mm_sub_epi64( state->k, state->const1536 ); - } + state->k = _mm512_add_epi64( state->k, remainingbits ); + state->k = _mm512_sub_epi64( state->k, state->const1536 ); - // Compress - echo_4way_compress( state, state->buffer, 1 ); - } - else - { - // Fill with zero and compress - memset( state->buffer + state->uBufferBytes, 0, - state->uBlockLength - state->uBufferBytes ); - state->k = _mm512_add_epi64( state->k, remainingbits ); - state->k = _mm512_sub_epi64( state->k, state->const1536 ); - echo_4way_compress( state, state->buffer, 1 ); + echo_4way_compress( state, state->buffer, 1 ); - // Last block - memset( state->buffer, 0, state->uBlockLength - 18 ); - - // Hash size - *( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) = - state->uHashSize; - - // Processed bits - *( (DataLength*)(state->buffer + state->uBlockLength - 16) ) = - state->processed_bits; - *( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0; - // Compress the last block - state->k = _mm512_xor_si512( state->k, state->k ); - state->k = _mm512_sub_epi64( state->k, state->const1536 ); - echo_4way_compress( state, state->buffer, 1) ; - } - - // Store the hash value - _mm512_storeu_si512( (__m512i*)hashval + 0, state->state[ 0 ][ 0] ); - _mm512_storeu_si512( (__m512i*)hashval + 1, state->state[ 1 ][ 0] ); + _mm512_store_si512( (__m512i*)hashval + 0, state->state[ 0 ][ 0] ); + _mm512_store_si512( (__m512i*)hashval + 1, state->state[ 1 ][ 0] ); if ( state->uHashSize == 512 ) { - _mm512_storeu_si512( (__m512i*)hashval + 2, state->state[ 2 ][ 0 ] ); - _mm512_storeu_si512( (__m512i*)hashval + 3, state->state[ 3 ][ 0 ] ); - + _mm512_store_si512( (__m512i*)hashval + 2, state->state[ 2 ][ 0 ] ); + _mm512_store_si512( (__m512i*)hashval + 3, state->state[ 3 ][ 0 ] ); } return 0; } diff --git a/algo/groestl/aes_ni/groestl-intr-aes.h b/algo/groestl/aes_ni/groestl-intr-aes.h index 2a56aad..10092da 100644 --- a/algo/groestl/aes_ni/groestl-intr-aes.h +++ b/algo/groestl/aes_ni/groestl-intr-aes.h @@ -73,7 +73,7 @@ __m128i ALL_FF; b5 = a7;\ a6 = _mm_xor_si128(a6, a7);\ a7 = _mm_xor_si128(a7, b6);\ - \ + \ /* build y4 y5 y6 ... 
in regs xmm8, xmm9, xmm10 by adding t_i*/\ b0 = _mm_xor_si128(b0, a4);\ b6 = _mm_xor_si128(b6, a4);\ @@ -195,7 +195,7 @@ __m128i ALL_FF; for(round_counter = 0; round_counter < 14; round_counter+=2) {\ /* AddRoundConstant P1024 */\ xmm8 = _mm_xor_si128(xmm8, (ROUND_CONST_P[round_counter]));\ - /* ShiftBytes P1024 + pre-AESENCLAST */\ + /* ShiftBytes P1024 + pre-AESENCLAST */\ xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[0]));\ xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[1]));\ xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[2]));\ @@ -209,7 +209,6 @@ __m128i ALL_FF; \ /* AddRoundConstant P1024 */\ xmm0 = _mm_xor_si128(xmm0, (ROUND_CONST_P[round_counter+1]));\ - /* ShiftBytes P1024 + pre-AESENCLAST */\ xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[0]));\ xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[1]));\ xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[2]));\ @@ -218,7 +217,6 @@ __m128i ALL_FF; xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[5]));\ xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[6]));\ xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[7]));\ - /* SubBytes + MixBytes */\ SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ }\ } diff --git a/algo/groestl/aes_ni/hash-groestl.c b/algo/groestl/aes_ni/hash-groestl.c index e77aab9..2adffd9 100644 --- a/algo/groestl/aes_ni/hash-groestl.c +++ b/algo/groestl/aes_ni/hash-groestl.c @@ -230,6 +230,7 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output, // digest final padding block and do output transform TF1024( ctx->chaining, ctx->buffer ); + OF1024( ctx->chaining ); // store hash result in output diff --git a/algo/groestl/groestl-4way.c b/algo/groestl/groestl-4way.c new file mode 100644 index 0000000..b545146 --- /dev/null +++ b/algo/groestl/groestl-4way.c @@ -0,0 +1,64 @@ +#include "groestl-gate.h" +#include +#include +#include +#include + +#if defined(GROESTL_4WAY_VAES) + +#include "groestl512-hash-4way.h" + +void groestl_4way_hash( void *output, const void *input ) +{ + uint32_t hash[16*4] __attribute__ ((aligned (128))); + groestl512_4way_context ctx; + + groestl512_4way_init( &ctx, 64 ); + groestl512_4way_update_close( &ctx, hash, input, 640 ); + + groestl512_4way_init( &ctx, 64 ); + groestl512_4way_update_close( &ctx, hash, hash, 512 ); + + dintrlv_4x128( output, output+32, output+64, output+96, hash, 256 ); + } + +int scanhash_groestl_4way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*4] __attribute__ ((aligned (128))); + uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 4; + uint32_t *noncep = vdata + 64+3; // 4*16 + 3 + int thr_id = mythr->id; + const uint32_t Htarg = ptarget[7]; + + mm512_bswap32_intrlv80_4x128( vdata, pdata ); + + do + { + be32enc( noncep, n ); + be32enc( noncep+ 4, n+1 ); + be32enc( noncep+ 8, n+2 ); + be32enc( noncep+12, n+3 ); + + groestl_4way_hash( hash, vdata ); + pdata[19] = n; + + for ( int lane = 0; lane < 4; lane++ ) + if ( ( hash+(lane<<3) )[7] < Htarg ) + if ( fulltest( hash+(lane<<3), ptarget) && !opt_benchmark ) + { + pdata[19] = n + lane; + submit_lane_solution( work, hash+(lane<<3), mythr, lane ); + } + n += 4; + } while ( ( n < last_nonce ) && !work_restart[thr_id].restart ); + *hashes_done = n - first_nonce; + return 0; +} + +#endif diff --git a/algo/groestl/groestl-gate.c b/algo/groestl/groestl-gate.c new 
file mode 100644 index 0000000..92c79bc --- /dev/null +++ b/algo/groestl/groestl-gate.c @@ -0,0 +1,23 @@ +#include "groestl-gate.h" + +bool register_dmd_gr_algo( algo_gate_t *gate ) +{ +#if defined (GROESTL_4WAY_VAES) + gate->scanhash = (void*)&scanhash_groestl_4way; + gate->hash = (void*)&groestl_4way_hash; +#else + init_groestl_ctx(); + gate->scanhash = (void*)&scanhash_groestl; + gate->hash = (void*)&groestlhash; +#endif + gate->optimizations = AES_OPT | VAES_OPT; + return true; +}; + +bool register_groestl_algo( algo_gate_t* gate ) +{ + register_dmd_gr_algo( gate ); + gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root; + return true; +}; + diff --git a/algo/groestl/groestl-gate.h b/algo/groestl/groestl-gate.h new file mode 100644 index 0000000..25551e6 --- /dev/null +++ b/algo/groestl/groestl-gate.h @@ -0,0 +1,31 @@ +#ifndef GROESTL_GATE_H__ +#define GROESTL_GATE_H__ 1 + +#include "algo-gate-api.h" +#include + +#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define GROESTL_4WAY_VAES 1 +#endif + +bool register_dmd_gr_algo( algo_gate_t* gate ); + +bool register_groestl_algo( algo_gate_t* gate ); + +#if defined(GROESTL_4WAY_VAES) + +void groestl_4way_hash( void *state, const void *input ); +int scanhash_groestl_4way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#else + +void groestlhash( void *state, const void *input ); +int scanhash_groestl( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +void init_groestl_ctx(); + +#endif + +#endif + diff --git a/algo/groestl/groestl.c b/algo/groestl/groestl.c index df1c2c3..517dfb8 100644 --- a/algo/groestl/groestl.c +++ b/algo/groestl/groestl.c @@ -1,5 +1,4 @@ -#include "algo-gate-api.h" - +#include "groestl-gate.h" #include #include #include @@ -78,15 +77,12 @@ int scanhash_groestl( struct work *work, uint32_t max_nonce, groestlhash(hash, endiandata); if (hash[7] <= Htarg ) - if ( fulltest(hash, ptarget)) - { + if ( fulltest(hash, ptarget) && !opt_benchmark ) + { pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce; - return 1; - } - + submit_solution( work, hash, mythr ); + } nonce++; - } while (nonce < max_nonce && !work_restart[thr_id].restart); pdata[19] = nonce; @@ -94,20 +90,3 @@ int scanhash_groestl( struct work *work, uint32_t max_nonce, return 0; } -bool register_dmd_gr_algo( algo_gate_t* gate ) -{ - init_groestl_ctx(); - gate->optimizations = SSE2_OPT | AES_OPT; - gate->scanhash = (void*)&scanhash_groestl; - gate->hash = (void*)&groestlhash; - opt_target_factor = 256.0; - return true; -}; - -bool register_groestl_algo( algo_gate_t* gate ) -{ - register_dmd_gr_algo( gate ); - gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root; - return true; -}; - diff --git a/algo/groestl/groestl256-hash-4way.c b/algo/groestl/groestl256-hash-4way.c new file mode 100644 index 0000000..cee3eac --- /dev/null +++ b/algo/groestl/groestl256-hash-4way.c @@ -0,0 +1,280 @@ +/* hash.c Aug 2011 + * + * Groestl implementation for different versions. + * Author: Krystian Matusiewicz, Günther A. 
Roland, Martin Schläffer + * + * This code is placed in the public domain + */ + +#include +#include "hash-groestl256.h" +#include "miner.h" +#include "simd-utils.h" + +#ifndef NO_AES_NI + +#include "groestl-version.h" + +#ifdef TASM + #ifdef VAES + #include "groestl256-asm-aes.h" + #else + #ifdef VAVX + #include "groestl256-asm-avx.h" + #else + #ifdef VVPERM + #include "groestl256-asm-vperm.h" + #else + #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM]) + #endif + #endif + #endif +#else + #ifdef TINTR + #ifdef VAES + #include "groestl256-intr-aes.h" + #else + #ifdef VAVX + #include "groestl256-intr-avx.h" + #else + #ifdef VVPERM + #include "groestl256-intr-vperm.h" + #else + #error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM]) + #endif + #endif + #endif + #else + #error NO TYPE SPECIFIED (-DT[ASM/INTR]) + #endif +#endif + +/* initialise context */ +HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen ) +{ + int i; + + ctx->hashlen = hashlen; + SET_CONSTANTS(); + + if (ctx->chaining == NULL || ctx->buffer == NULL) + return FAIL_GR; + + for ( i = 0; i < SIZE256; i++ ) + { + ctx->chaining[i] = _mm_setzero_si128(); + ctx->buffer[i] = _mm_setzero_si128(); + } + ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH); + INIT256( ctx->chaining ); + ctx->buf_ptr = 0; + ctx->rem_ptr = 0; + + return SUCCESS_GR; +} + + +HashReturn_gr reinit_groestl256(hashState_groestl256* ctx) + { + int i; + + if (ctx->chaining == NULL || ctx->buffer == NULL) + return FAIL_GR; + + for ( i = 0; i < SIZE256; i++ ) + { + ctx->chaining[i] = _mm_setzero_si128(); + ctx->buffer[i] = _mm_setzero_si128(); + } + ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH); + INIT256(ctx->chaining); + ctx->buf_ptr = 0; + ctx->rem_ptr = 0; + + return SUCCESS_GR; +} + +// Use this only for midstate and never for cryptonight +HashReturn_gr update_groestl256( hashState_groestl256* ctx, const void* input, + DataLength_gr databitlen ) +{ + __m128i* in = (__m128i*)input; + const int len = (int)databitlen / 128; // bits to __m128i + const int blocks = len / SIZE256; // __M128i to blocks + int rem = ctx->rem_ptr; + int i; + + ctx->blk_count = blocks; + ctx->databitlen = databitlen; + + // digest any full blocks + for ( i = 0; i < blocks; i++ ) + TF512( ctx->chaining, &in[ i * SIZE256 ] ); + // adjust buf_ptr to last block + ctx->buf_ptr = blocks * SIZE256; + + // Copy any remainder to buffer + for ( i = 0; i < len % SIZE256; i++ ) + ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ]; + // adjust rem_ptr for new data + ctx->rem_ptr += i; + + return SUCCESS_GR; +} + +// don't use this at all +HashReturn_gr final_groestl256( hashState_groestl256* ctx, void* output ) +{ + const int len = (int)ctx->databitlen / 128; // bits to __m128i + const int blocks = ctx->blk_count + 1; // adjust for final block + const int rem_ptr = ctx->rem_ptr; // end of data start of padding + const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i + const int hash_offset = SIZE256 - hashlen_m128i; // where in buffer + int i; + + // first pad byte = 0x80, last pad byte = block count + // everything in between is zero + + if ( rem_ptr == len - 1 ) + { + // all padding at once + ctx->buffer[rem_ptr] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0x80 ); + } + else + { + // add first padding + ctx->buffer[rem_ptr] = _mm_set_epi8( 0,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0x80 ); + // add zero padding + for ( i = rem_ptr + 1; i < SIZE256 - 1; i++ ) + ctx->buffer[i] = _mm_setzero_si128(); + // add length padding + // cheat since we know the block count is 
trivial, good if block < 256 + ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0 ); + } + + // digest final padding block and do output transform + TF512( ctx->chaining, ctx->buffer ); + OF512( ctx->chaining ); + + // store hash result in output + for ( i = 0; i < hashlen_m128i; i++ ) + casti_m128i( output, i ) = ctx->chaining[ hash_offset + i]; + + return SUCCESS_GR; +} + +HashReturn_gr update_and_final_groestl256( hashState_groestl256* ctx, + void* output, const void* input, DataLength_gr databitlen ) +{ + const int len = (int)databitlen / 128; + const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i + const int hash_offset = SIZE256 - hashlen_m128i; + int rem = ctx->rem_ptr; + int blocks = len / SIZE256; + __m128i* in = (__m128i*)input; + int i; + + // --- update --- + + // digest any full blocks, process directly from input + for ( i = 0; i < blocks; i++ ) + TF512( ctx->chaining, &in[ i * SIZE256 ] ); + ctx->buf_ptr = blocks * SIZE256; + + // cryptonight has 200 byte input, an odd number of __m128i + // remainder is only 8 bytes, ie u64. + if ( databitlen % 128 !=0 ) + { + // must be cryptonight, copy 64 bits of data + *(uint64_t*)(ctx->buffer) = *(uint64_t*)(&in[ ctx->buf_ptr ] ); + i = -1; // signal for odd length + } + else + { + // Copy any remaining data to buffer for final transform + for ( i = 0; i < len % SIZE256; i++ ) + ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ]; + i += rem; // use i as rem_ptr in final + } + + //--- final --- + + // adjust for final block + blocks++; + + if ( i == len - 1 ) + { + // all padding at once + ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0, + 0, 0,0,0, 0,0,0,0x80 ); + } + else + { + if ( i == -1 ) + { + // cryptonight odd length + ((uint64_t*)ctx->buffer)[ 1 ] = 0x80ull; + // finish the block with zero and length padding as normal + i = 0; + } + else + { + // add first padding + ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0x80 ); + } + // add zero padding + for ( i += 1; i < SIZE256 - 1; i++ ) + ctx->buffer[i] = _mm_setzero_si128(); + // add length padding + // cheat since we know the block count is trivial, good if block < 256 + ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0, + 0, 0,0,0, 0,0,0,0 ); + } + + // digest final padding block and do output transform + TF512( ctx->chaining, ctx->buffer ); + OF512( ctx->chaining ); + + // store hash result in output + for ( i = 0; i < hashlen_m128i; i++ ) + casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ]; + + return SUCCESS_GR; +} + +/* hash bit sequence */ +HashReturn_gr hash_groestl256(int hashbitlen, + const BitSequence_gr* data, + DataLength_gr databitlen, + BitSequence_gr* hashval) { + HashReturn_gr ret; + hashState_groestl256 context; + + /* initialise */ + if ((ret = init_groestl256(&context, hashbitlen/8)) != SUCCESS_GR) + return ret; + + /* process message */ + if ((ret = update_groestl256(&context, data, databitlen)) != SUCCESS_GR) + return ret; + + /* finalise */ + ret = final_groestl256(&context, hashval); + + return ret; +} + +/* eBash API */ +//#ifdef crypto_hash_BYTES +//int crypto_hash(unsigned char *out, const unsigned char *in, unsigned long long inlen) +//{ +// if (hash_groestl(crypto_hash_BYTES * 8, in, inlen * 8,out) == SUCCESS_GR) return 0; +// return -1; +//} +//#endif + +#endif diff --git a/algo/groestl/groestl256-hash-4way.h b/algo/groestl/groestl256-hash-4way.h new file mode 100644 index 0000000..f82c1de --- /dev/null +++ b/algo/groestl/groestl256-hash-4way.h @@ -0,0 +1,121 @@ +/* 
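A short scalar model of the padding that final_groestl256 and update_and_final_groestl256 build above may help here. Groestl pads with a single 0x80 byte, a zero fill, and a 64-bit big-endian block counter in the last bytes of the final 64-byte block; the code "cheats" by writing only the low bytes of that counter, which holds while the block count stays small. The sketch below is illustrative only, under those assumptions; groestl256_pad_block is not a function in the patch.

#include <stdint.h>
#include <string.h>

/* Illustrative scalar padding for one 64-byte Groestl-256 block.
   rem = message bytes already in blk (assumed <= 61 here so the pad
   byte and the counter do not collide); blocks = total block count
   including this final block. Like update_and_final_groestl256, only
   the low 16 bits of the big-endian counter are written, which is
   valid while blocks < 65536. */
static void groestl256_pad_block( uint8_t blk[64], size_t rem, uint32_t blocks )
{
   blk[rem] = 0x80;                       /* first pad byte */
   memset( blk + rem + 1, 0, 63 - rem );  /* zero fill to end of block */
   blk[62] = (uint8_t)( blocks >> 8 );    /* big-endian block counter, */
   blk[63] = (uint8_t)blocks;             /* low bytes only */
}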
hash.h Aug 2011 + * + * Groestl implementation for different versions. + * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer + * + * This code is placed in the public domain + */ + +#ifndef __hash_h +#define __hash_h + +#include +#include +#if defined(_WIN64) || defined(__WINDOWS__) +#include +#endif +#include + +/* eBash API begin */ +/* +#include "crypto_hash.h" +#ifdef crypto_hash_BYTES + +#include +#include +#include +typedef crypto_uint8 u8; +typedef crypto_uint32 u32; +typedef crypto_uint64 u64; +#endif + */ +/* eBash API end */ + +//#define LENGTH (512) + +#include "brg_endian.h" +#define NEED_UINT_64T +#include "algo/sha/brg_types.h" + +#ifdef IACA_TRACE + #include IACA_MARKS +#endif + +#define LENGTH (256) + +/* some sizes (number of bytes) */ +#define ROWS (8) +#define LENGTHFIELDLEN (ROWS) +#define COLS512 (8) +//#define COLS1024 (16) +#define SIZE_512 ((ROWS)*(COLS512)) +//#define SIZE1024 ((ROWS)*(COLS1024)) +#define ROUNDS512 (10) +//#define ROUNDS1024 (14) + +//#if LENGTH<=256 +#define COLS (COLS512) +//#define SIZE (SIZE512) +#define ROUNDS (ROUNDS512) +//#else +//#define COLS (COLS1024) +//#define SIZE (SIZE1024) +//#define ROUNDS (ROUNDS1024) +//#endif + +#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) + +#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) +#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n))))) +#define U64BIG(a) (a) +#endif /* IS_BIG_ENDIAN */ + +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) +#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n))) +#define U64BIG(a) \ + ((ROTL64(a, 8) & li_64(000000FF000000FF)) | \ + (ROTL64(a,24) & li_64(0000FF000000FF00)) | \ + (ROTL64(a,40) & li_64(00FF000000FF0000)) | \ + (ROTL64(a,56) & li_64(FF000000FF000000))) +#endif /* IS_LITTLE_ENDIAN */ + +typedef unsigned char BitSequence_gr; +typedef unsigned long long DataLength_gr; +typedef enum +{ + SUCCESS_GR = 0, + FAIL_GR = 1, + BAD_HASHBITLEN_GR = 2 +} HashReturn_gr; + +#define SIZE256 (SIZE_512/16) + +typedef struct { + __attribute__ ((aligned (32))) __m128i chaining[SIZE256]; + __attribute__ ((aligned (32))) __m128i buffer[SIZE256]; +// __attribute__ ((aligned (32))) u64 chaining[SIZE/8]; /* actual state */ +// __attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE]; /* data buffer */ +// u64 block_counter; /* message block counter */ + int hashlen; // bytes + int blk_count; + int buf_ptr; /* data buffer pointer */ + int rem_ptr; + int databitlen; +} hashState_groestl256; + +HashReturn_gr init_groestl256( hashState_groestl256*, int ); + +HashReturn_gr reinit_groestl256( hashState_groestl256* ); + +HashReturn_gr update_groestl256( hashState_groestl256*, const void*, + DataLength_gr ); + +HashReturn_gr final_groestl256( hashState_groestl256*, void* ); + +HashReturn_gr hash_groestl256( int, const BitSequence_gr*, DataLength_gr, + BitSequence_gr* ); + +HashReturn_gr update_and_final_groestl256( hashState_groestl256*, void*, + const void*, DataLength_gr ); + +#endif /* __hash_h */ diff --git a/algo/groestl/groestl256-intr-4way.h b/algo/groestl/groestl256-intr-4way.h new file mode 100644 index 0000000..57dd930 --- /dev/null +++ b/algo/groestl/groestl256-intr-4way.h @@ -0,0 +1,492 @@ +/* groestl-intr-aes.h Aug 2011 + * + * Groestl implementation with intrinsics using ssse3, sse4.1, and aes + * instructions. + * Author: Günther A.
Roland, Martin Schläffer, Krystian Matusiewicz + * + * This code is placed in the public domain + */ + +#include +#include +#include "hash-groestl256.h" + +/* global constants */ +__m128i ROUND_CONST_Lx; +__m128i ROUND_CONST_L0[ROUNDS512]; +__m128i ROUND_CONST_L7[ROUNDS512]; +//__m128i ROUND_CONST_P[ROUNDS1024]; +//__m128i ROUND_CONST_Q[ROUNDS1024]; +__m128i TRANSP_MASK; +__m128i SUBSH_MASK[8]; +__m128i ALL_1B; +__m128i ALL_FF; + + +#define tos(a) #a +#define tostr(a) tos(a) + + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b */ +#define MUL2(i, j, k){\ + j = _mm_xor_si128(j, j);\ + j = _mm_cmpgt_epi8(j, i);\ + i = _mm_add_epi8(i, i);\ + j = _mm_and_si128(j, k);\ + i = _mm_xor_si128(i, j);\ +} + + /**/ + +/* Yet another implementation of MixBytes. + This time we use the formulae (3) from the paper "Byte Slicing Groestl". + Input: a0, ..., a7 + Output: b0, ..., b7 = MixBytes(a0,...,a7). + but we use the relations: + t_i = a_i + a_{i+3} + x_i = t_i + t_{i+3} + y_i = t_i + t+{i+2} + a_{i+6} + z_i = 2*x_i + w_i = z_i + y_{i+4} + v_i = 2*w_i + b_i = v_{i+3} + y_{i+4} + We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there + and then adding v_i computed in the meantime in registers xmm0..xmm7. + We almost fit into 16 registers, need only 3 spills to memory. + This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. + K. Matusiewicz, 2011/05/29 */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* t_i = a_i + a_{i+1} */\ + b6 = a0;\ + b7 = a1;\ + a0 = _mm_xor_si128(a0, a1);\ + b0 = a2;\ + a1 = _mm_xor_si128(a1, a2);\ + b1 = a3;\ + a2 = _mm_xor_si128(a2, a3);\ + b2 = a4;\ + a3 = _mm_xor_si128(a3, a4);\ + b3 = a5;\ + a4 = _mm_xor_si128(a4, a5);\ + b4 = a6;\ + a5 = _mm_xor_si128(a5, a6);\ + b5 = a7;\ + a6 = _mm_xor_si128(a6, a7);\ + a7 = _mm_xor_si128(a7, b6);\ + \ + /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\ + b0 = _mm_xor_si128(b0, a4);\ + b6 = _mm_xor_si128(b6, a4);\ + b1 = _mm_xor_si128(b1, a5);\ + b7 = _mm_xor_si128(b7, a5);\ + b2 = _mm_xor_si128(b2, a6);\ + b0 = _mm_xor_si128(b0, a6);\ + /* spill values y_4, y_5 to memory */\ + TEMP0 = b0;\ + b3 = _mm_xor_si128(b3, a7);\ + b1 = _mm_xor_si128(b1, a7);\ + TEMP1 = b1;\ + b4 = _mm_xor_si128(b4, a0);\ + b2 = _mm_xor_si128(b2, a0);\ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0;\ + b5 = _mm_xor_si128(b5, a1);\ + b3 = _mm_xor_si128(b3, a1);\ + b1 = a1;\ + b6 = _mm_xor_si128(b6, a2);\ + b4 = _mm_xor_si128(b4, a2);\ + TEMP2 = a2;\ + b7 = _mm_xor_si128(b7, a3);\ + b5 = _mm_xor_si128(b5, a3);\ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm_xor_si128(a0, a3);\ + a1 = _mm_xor_si128(a1, a4);\ + a2 = _mm_xor_si128(a2, a5);\ + a3 = _mm_xor_si128(a3, a6);\ + a4 = _mm_xor_si128(a4, a7);\ + a5 = _mm_xor_si128(a5, b0);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, TEMP2);\ + \ + /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + /* compute w_i : add y_{i+4} */\ + b1 = ALL_1B;\ + MUL2(a0, b0, b1);\ + a0 = _mm_xor_si128(a0, TEMP0);\ + MUL2(a1, b0, b1);\ + a1 = _mm_xor_si128(a1, TEMP1);\ + MUL2(a2, b0, b1);\ + a2 = _mm_xor_si128(a2, b2);\ + MUL2(a3, b0, b1);\ + a3 = _mm_xor_si128(a3, b3);\ + MUL2(a4, b0, b1);\ + a4 = _mm_xor_si128(a4, b4);\ + MUL2(a5, b0, b1);\ + a5 = _mm_xor_si128(a5, b5);\ + MUL2(a6, b0, b1);\ + a6 = _mm_xor_si128(a6, b6);\ + MUL2(a7, b0, b1);\ + a7 = _mm_xor_si128(a7, b7);\ + \ + /* compute v_i : double w_i */\ + /* add to y_4 y_5 .. v3, v4, ... 
*/\ + MUL2(a0, b0, b1);\ + b5 = _mm_xor_si128(b5, a0);\ + MUL2(a1, b0, b1);\ + b6 = _mm_xor_si128(b6, a1);\ + MUL2(a2, b0, b1);\ + b7 = _mm_xor_si128(b7, a2);\ + MUL2(a5, b0, b1);\ + b2 = _mm_xor_si128(b2, a5);\ + MUL2(a6, b0, b1);\ + b3 = _mm_xor_si128(b3, a6);\ + MUL2(a7, b0, b1);\ + b4 = _mm_xor_si128(b4, a7);\ + MUL2(a3, b0, b1);\ + MUL2(a4, b0, b1);\ + b0 = TEMP0;\ + b1 = TEMP1;\ + b0 = _mm_xor_si128(b0, a3);\ + b1 = _mm_xor_si128(b1, a4);\ +}/*MixBytes*/ + +#define SET_CONSTANTS(){\ + ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ + TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ + SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\ + SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\ + SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\ + SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\ + SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\ + SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\ + SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\ + SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\ + for(i = 0; i < ROUNDS512; i++)\ + {\ + ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ + ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\ + }\ + ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\ +}while(0); \ + +/* one round + * i = round number + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* AddRoundConstant */\ + b1 = ROUND_CONST_Lx;\ + a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\ + a1 = _mm_xor_si128(a1, b1);\ + a2 = _mm_xor_si128(a2, b1);\ + a3 = _mm_xor_si128(a3, b1);\ + a4 = _mm_xor_si128(a4, b1);\ + a5 = _mm_xor_si128(a5, b1);\ + a6 = _mm_xor_si128(a6, b1);\ + a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\ + \ + /* ShiftBytes + SubBytes (interleaved) */\ + b0 = _mm_xor_si128(b0, b0);\ + a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\ + a0 = _mm_aesenclast_si128(a0, b0);\ + a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\ + a1 = _mm_aesenclast_si128(a1, b0);\ + a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\ + a2 = _mm_aesenclast_si128(a2, b0);\ + a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\ + a3 = _mm_aesenclast_si128(a3, b0);\ + a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\ + a4 = _mm_aesenclast_si128(a4, b0);\ + a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\ + a5 = _mm_aesenclast_si128(a5, b0);\ + a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\ + a6 = _mm_aesenclast_si128(a6, b0);\ + a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\ + a7 = _mm_aesenclast_si128(a7, b0);\ + \ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +\ +} + +/* 10 rounds, P and Q in parallel */ +#define ROUNDS_P_Q(){\ + ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, 
xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ +} + +/* Matrix Transpose Step 1 + * input is a 512-bit state with two columns in one xmm + * output is a 512-bit state with two rows in one xmm + * inputs: i0-i3 + * outputs: i0, o1-o3 + * clobbers: t0 + */ +#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ + t0 = TRANSP_MASK;\ + \ + i0 = _mm_shuffle_epi8(i0, t0);\ + i1 = _mm_shuffle_epi8(i1, t0);\ + i2 = _mm_shuffle_epi8(i2, t0);\ + i3 = _mm_shuffle_epi8(i3, t0);\ + \ + o1 = i0;\ + t0 = i2;\ + \ + i0 = _mm_unpacklo_epi16(i0, i1);\ + o1 = _mm_unpackhi_epi16(o1, i1);\ + i2 = _mm_unpacklo_epi16(i2, i3);\ + t0 = _mm_unpackhi_epi16(t0, i3);\ + \ + i0 = _mm_shuffle_epi32(i0, 216);\ + o1 = _mm_shuffle_epi32(o1, 216);\ + i2 = _mm_shuffle_epi32(i2, 216);\ + t0 = _mm_shuffle_epi32(t0, 216);\ + \ + o2 = i0;\ + o3 = o1;\ + \ + i0 = _mm_unpacklo_epi32(i0, i2);\ + o1 = _mm_unpacklo_epi32(o1, t0);\ + o2 = _mm_unpackhi_epi32(o2, i2);\ + o3 = _mm_unpackhi_epi32(o3, t0);\ +}/**/ + +/* Matrix Transpose Step 2 + * input are two 512-bit states with two rows in one xmm + * output are two 512-bit states with one row of each state in one xmm + * inputs: i0-i3 = P, i4-i7 = Q + * outputs: (i0, o1-o7) = (P|Q) + * possible reassignments: (output reg = input reg) + * * i1 -> o3-7 + * * i2 -> o5-7 + * * i3 -> o7 + * * i4 -> o3-7 + * * i5 -> o6-7 + */ +#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ + o1 = i0;\ + o2 = i1;\ + i0 = _mm_unpacklo_epi64(i0, i4);\ + o1 = _mm_unpackhi_epi64(o1, i4);\ + o3 = i1;\ + o4 = i2;\ + o2 = _mm_unpacklo_epi64(o2, i5);\ + o3 = _mm_unpackhi_epi64(o3, i5);\ + o5 = i2;\ + o6 = i3;\ + o4 = _mm_unpacklo_epi64(o4, i6);\ + o5 = _mm_unpackhi_epi64(o5, i6);\ + o7 = i3;\ + o6 = _mm_unpacklo_epi64(o6, i7);\ + o7 = _mm_unpackhi_epi64(o7, i7);\ +}/**/ + +/* Matrix Transpose Inverse Step 2 + * input are two 512-bit states with one row of each state in one xmm + * output are two 512-bit states with two rows in one xmm + * inputs: i0-i7 = (P|Q) + * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q + */ +#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ + o0 = i0;\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + o0 = _mm_unpackhi_epi64(o0, i1);\ + o1 = i2;\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + o1 = _mm_unpackhi_epi64(o1, i3);\ + o2 = i4;\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + o2 = _mm_unpackhi_epi64(o2, i5);\ + o3 = i6;\ + i6 = _mm_unpacklo_epi64(i6, i7);\ + o3 = _mm_unpackhi_epi64(o3, i7);\ +}/**/ + +/* Matrix Transpose Output Step 2 + * input is one 512-bit state with two rows in one xmm + * output is one 512-bit state with one row in the low 64-bits of one xmm + * inputs: i0,i2,i4,i6 = S + * outputs: (i0-7) = (0|S) + */ +#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ + t0 = _mm_xor_si128(t0, t0);\ + i1 = i0;\ + i3 = i2;\ + i5 = i4;\ + i7 = i6;\ + i0 = 
_mm_unpacklo_epi64(i0, t0);\ + i1 = _mm_unpackhi_epi64(i1, t0);\ + i2 = _mm_unpacklo_epi64(i2, t0);\ + i3 = _mm_unpackhi_epi64(i3, t0);\ + i4 = _mm_unpacklo_epi64(i4, t0);\ + i5 = _mm_unpackhi_epi64(i5, t0);\ + i6 = _mm_unpacklo_epi64(i6, t0);\ + i7 = _mm_unpackhi_epi64(i7, t0);\ +}/**/ + +/* Matrix Transpose Output Inverse Step 2 + * input is one 512-bit state with one row in the low 64-bits of one xmm + * output is one 512-bit state with two rows in one xmm + * inputs: i0-i7 = (0|S) + * outputs: (i0, i2, i4, i6) = S + */ +#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ + i0 = _mm_unpacklo_epi64(i0, i1);\ + i2 = _mm_unpacklo_epi64(i2, i3);\ + i4 = _mm_unpacklo_epi64(i4, i5);\ + i6 = _mm_unpacklo_epi64(i6, i7);\ +}/**/ + + +void INIT256( __m128i* chaining ) +{ + static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7; + static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15; + + /* load IV into registers xmm12 - xmm15 */ + xmm12 = chaining[0]; + xmm13 = chaining[1]; + xmm14 = chaining[2]; + xmm15 = chaining[3]; + + /* transform chaining value from column ordering into row ordering */ + /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* store transposed IV */ + chaining[0] = xmm12; + chaining[1] = xmm2; + chaining[2] = xmm6; + chaining[3] = xmm7; +} + +void TF512( __m128i* chaining, __m128i* message ) +{ + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + +#ifdef IACA_TRACE + IACA_START; +#endif + + /* load message into registers xmm12 - xmm15 */ + xmm12 = message[0]; + xmm13 = message[1]; + xmm14 = message[2]; + xmm15 = message[3]; + + /* transform message M from column ordering into row ordering */ + /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ + Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0); + + /* load previous chaining value */ + /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ + xmm8 = chaining[0]; + xmm0 = chaining[1]; + xmm4 = chaining[2]; + xmm5 = chaining[3]; + + /* xor message to CV get input of P */ + /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ + xmm8 = _mm_xor_si128(xmm8, xmm12); + xmm0 = _mm_xor_si128(xmm0, xmm2); + xmm4 = _mm_xor_si128(xmm4, xmm6); + xmm5 = _mm_xor_si128(xmm5, xmm7); + + /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ + /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ + /* result: the 8 rows of P and Q in xmm8 - xmm12 */ + Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* compute the two permutations P and Q in parallel */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P or two rows of Q in one xmm register */ + Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3); + + /* xor output of P and Q */ + /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, xmm8); + xmm1 = _mm_xor_si128(xmm1, xmm10); + xmm2 = _mm_xor_si128(xmm2, xmm12); + xmm3 = _mm_xor_si128(xmm3, xmm14); + + /* xor CV (feed-forward) */ + /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ + xmm0 = _mm_xor_si128(xmm0, (chaining[0])); + xmm1 = _mm_xor_si128(xmm1, (chaining[1])); + xmm2 = _mm_xor_si128(xmm2, 
(chaining[2])); + xmm3 = _mm_xor_si128(xmm3, (chaining[3])); + + /* store CV */ + chaining[0] = xmm0; + chaining[1] = xmm1; + chaining[2] = xmm2; + chaining[3] = xmm3; + +#ifdef IACA_TRACE + IACA_END; +#endif + return; +} + +void OF512( __m128i* chaining ) +{ + static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m128i TEMP0; + static __m128i TEMP1; + static __m128i TEMP2; + + /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ + xmm8 = chaining[0]; + xmm10 = chaining[1]; + xmm12 = chaining[2]; + xmm14 = chaining[3]; + + /* there are now 2 rows of the CV in one xmm register */ + /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ + /* result: the 8 input rows of P in xmm8 - xmm15 */ + Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0); + + /* compute the permutation P */ + /* result: the output of P(CV) in xmm8 - xmm15 */ + ROUNDS_P_Q(); + + /* unpack again to get two rows of P in one xmm register */ + /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ + Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ + xmm8 = _mm_xor_si128(xmm8, (chaining[0])); + xmm10 = _mm_xor_si128(xmm10, (chaining[1])); + xmm12 = _mm_xor_si128(xmm12, (chaining[2])); + xmm14 = _mm_xor_si128(xmm14, (chaining[3])); + + /* transform state back from row ordering into column ordering */ + /* result: final hash value in xmm9, xmm11 */ + Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0); + + /* we only need to return the truncated half of the state */ + chaining[2] = xmm9; + chaining[3] = xmm11; +} + + diff --git a/algo/groestl/groestl512-hash-4way.c b/algo/groestl/groestl512-hash-4way.c new file mode 100644 index 0000000..96389f8 --- /dev/null +++ b/algo/groestl/groestl512-hash-4way.c @@ -0,0 +1,114 @@ +/* hash.c Aug 2011 + * groestl512-hash-4way https://github.com/JayDDee/cpuminer-opt 2019-12. + * + * Groestl implementation for different versions. + * Author: Krystian Matusiewicz, Günther A. 
Roland, Martin Schläffer + * + * This code is placed in the public domain + */ + +// Optimized for hash and data length that are integral multiples of __m128i + + +#include +#include "groestl512-intr-4way.h" +#include "miner.h" +#include "simd-utils.h" + +#if defined(__VAES__) + +#define ROTL64(a,n) \ + ( ( ( (a)<<(n) ) | ( (a) >> (64-(n)) ) ) & 0xffffffffffffffff ) + +#define U64BIG(a) \ + ( ( ROTL64(a, 8) & 0x000000FF000000FF ) | \ + ( ROTL64(a,24) & 0x0000FF000000FF00 ) | \ + ( ROTL64(a,40) & 0x00FF000000FF0000 ) | \ + ( ROTL64(a,56) & 0xFF000000FF000000 ) ) + +int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen ) +{ + int i; + + ctx->hashlen = hashlen; + SET_CONSTANTS(); + + if (ctx->chaining == NULL || ctx->buffer == NULL) + return 1; + + for ( i = 0; i < SIZE512; i++ ) + { + ctx->chaining[i] = m512_zero; + ctx->buffer[i] = m512_zero; + } + + uint64_t len = U64BIG((uint64_t)LENGTH); + ctx->chaining[ COLS/2 -1 ] = _mm512_set4_epi64( len, 0, len, 0 ); + INIT_4way(ctx->chaining); + ctx->buf_ptr = 0; + ctx->rem_ptr = 0; + + return 0; +} + +int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output, + const void* input, uint64_t databitlen ) +{ + const int len = (int)databitlen / 128; + const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i + const int hash_offset = SIZE512 - hashlen_m128i; + int rem = ctx->rem_ptr; + int blocks = len / SIZE512; + __m512i* in = (__m512i*)input; + int i; + + // --- update --- + + // digest any full blocks, process directly from input + for ( i = 0; i < blocks; i++ ) + TF1024_4way( ctx->chaining, &in[ i * SIZE512 ] ); + ctx->buf_ptr = blocks * SIZE512; + + // copy any remaining data to buffer, it may already contain data + // from a previous update for a midstate precalc + for ( i = 0; i < len % SIZE512; i++ ) + ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ]; + i += rem; // use i as rem_ptr in final + + //--- final --- + + blocks++; // adjust for final block + + if ( i == SIZE512 - 1 ) + { + // only 1 vector left in buffer, all padding at once + ctx->buffer[i] = m512_const1_128( _mm_set_epi8( + blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) ); + } + else + { + // add first padding + ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 ); + // add zero padding + for ( i += 1; i < SIZE512 - 1; i++ ) + ctx->buffer[i] = m512_zero; + + // add length padding, second last byte is zero unless blocks > 255 + ctx->buffer[i] = m512_const1_128( _mm_set_epi8( + blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) ); + } + + // digest final padding block and do output transform + TF1024_4way( ctx->chaining, ctx->buffer ); + + OF1024_4way( ctx->chaining ); + + // store hash result in output + for ( i = 0; i < hashlen_m128i; i++ ) + casti_m512i( output, i ) = ctx->chaining[ hash_offset + i ]; + + return 0; +} + +#endif // VAES + diff --git a/algo/groestl/groestl512-hash-4way.h b/algo/groestl/groestl512-hash-4way.h new file mode 100644 index 0000000..ab3acc6 --- /dev/null +++ b/algo/groestl/groestl512-hash-4way.h @@ -0,0 +1,94 @@ +/* hash.h Aug 2011 + * + * Groestl implementation for different versions. + * Author: Krystian Matusiewicz, Günther A.
Roland, Martin Schläffer + * + * This code is placed in the public domain + */ + +#if !defined(GROESTL512_HASH_4WAY_H__) +#define GROESTL512_HASH_4WAY_H__ 1 + +#include "simd-utils.h" +#include +#include +#include +#if defined(_WIN64) || defined(__WINDOWS__) +#include +#endif +#include + +#define LENGTH (512) + +//#include "brg_endian.h" +//#define NEED_UINT_64T +//#include "algo/sha/brg_types.h" + +/* some sizes (number of bytes) */ +#define ROWS (8) +#define LENGTHFIELDLEN (ROWS) +//#define COLS512 (8) +#define COLS1024 (16) +//#define SIZE512 ((ROWS)*(COLS512)) +#define SIZE_1024 ((ROWS)*(COLS1024)) +//#define ROUNDS512 (10) +#define ROUNDS1024 (14) + +//#if LENGTH<=256 +//#define COLS (COLS512) +//#define SIZE (SIZE512) +//#define ROUNDS (ROUNDS512) +//#else +#define COLS (COLS1024) +//#define SIZE (SIZE1024) +#define ROUNDS (ROUNDS1024) +//#endif + +/* +#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) + +#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) +#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n))))) +#define U64BIG(a) (a) +#endif // IS_BIG_ENDIAN + +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) +#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n))) +#define U64BIG(a) \ + ((ROTL64(a, 8) & li_64(000000FF000000FF)) | \ + (ROTL64(a,24) & li_64(0000FF000000FF00)) | \ + (ROTL64(a,40) & li_64(00FF000000FF0000)) | \ + (ROTL64(a,56) & li_64(FF000000FF000000))) +#endif // IS_LITTLE_ENDIAN + +typedef unsigned char BitSequence_gr; +typedef unsigned long long DataLength_gr; +typedef enum { SUCCESS_GR = 0, FAIL_GR = 1, BAD_HASHBITLEN_GR = 2} HashReturn_gr; +*/ + +#define SIZE512 (SIZE_1024/16) + +typedef struct { + __attribute__ ((aligned (128))) __m512i chaining[SIZE512]; + __attribute__ ((aligned (64))) __m512i buffer[SIZE512]; + int hashlen; // byte + int blk_count; // SIZE_m128i + int buf_ptr; // __m128i offset + int rem_ptr; + int databitlen; // bits +} groestl512_4way_context; + + +int groestl512_4way_init( groestl512_4way_context*, uint64_t ); + +//int reinit_groestl( hashState_groestl* ); + +int groestl512_4way_update( groestl512_4way_context*, const void*, + uint64_t ); + +int groestl512_4way_close( groestl512_4way_context*, void* ); + +int groestl512_4way_update_close( groestl512_4way_context*, void*, + const void*, uint64_t ); + +#endif /* __hash_h */ diff --git a/algo/groestl/groestl512-intr-4way.h b/algo/groestl/groestl512-intr-4way.h new file mode 100644 index 0000000..e8c243a --- /dev/null +++ b/algo/groestl/groestl512-intr-4way.h @@ -0,0 +1,654 @@ +/* groestl-intr-aes.h Aug 2011 + * + * Groestl implementation with intrinsics using ssse3, sse4.1, and aes + * instructions. + * Author: Günther A. 
Roland, Martin Schläffer, Krystian Matusiewicz + * + * This code is placed in the public domain + */ + + +#if !defined(GROESTL512_INTR_4WAY_H__) +#define GROESTL512_INTR_4WAY_H__ 1 + +#include "groestl512-hash-4way.h" + +#if defined(__VAES__) + +/* global constants */ +__m512i ROUND_CONST_Lx; +//__m128i ROUND_CONST_L0[ROUNDS512]; +//__m128i ROUND_CONST_L7[ROUNDS512]; +__m512i ROUND_CONST_P[ROUNDS1024]; +__m512i ROUND_CONST_Q[ROUNDS1024]; +__m512i TRANSP_MASK; +__m512i SUBSH_MASK[8]; +__m512i ALL_1B; +__m512i ALL_FF; + +#define tos(a) #a +#define tostr(a) tos(a) + +/* xmm[i] will be multiplied by 2 + * xmm[j] will be lost + * xmm[k] has to be all 0x1b */ +#define MUL2(i, j, k){\ + j = _mm512_xor_si512(j, j);\ + j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\ + i = _mm512_add_epi8(i, i);\ + j = _mm512_and_si512(j, k);\ + i = _mm512_xor_si512(i, j);\ +} + + /**/ + +/* Yet another implementation of MixBytes. + This time we use the formulae (3) from the paper "Byte Slicing Groestl". + Input: a0, ..., a7 + Output: b0, ..., b7 = MixBytes(a0,...,a7). + but we use the relations: + t_i = a_i + a_{i+3} + x_i = t_i + t_{i+3} + y_i = t_i + t+{i+2} + a_{i+6} + z_i = 2*x_i + w_i = z_i + y_{i+4} + v_i = 2*w_i + b_i = v_{i+3} + y_{i+4} + We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there + and then adding v_i computed in the meantime in registers xmm0..xmm7. + We almost fit into 16 registers, need only 3 spills to memory. + This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. + K. Matusiewicz, 2011/05/29 */ +#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* t_i = a_i + a_{i+1} */\ + b6 = a0;\ + b7 = a1;\ + a0 = _mm512_xor_si512(a0, a1);\ + b0 = a2;\ + a1 = _mm512_xor_si512(a1, a2);\ + b1 = a3;\ + a2 = _mm512_xor_si512(a2, a3);\ + b2 = a4;\ + a3 = _mm512_xor_si512(a3, a4);\ + b3 = a5;\ + a4 = _mm512_xor_si512(a4, a5);\ + b4 = a6;\ + a5 = _mm512_xor_si512(a5, a6);\ + b5 = a7;\ + a6 = _mm512_xor_si512(a6, a7);\ + a7 = _mm512_xor_si512(a7, b6);\ + \ + /* build y4 y5 y6 ... 
in regs xmm8, xmm9, xmm10 by adding t_i*/\ + b0 = _mm512_xor_si512(b0, a4);\ + b6 = _mm512_xor_si512(b6, a4);\ + b1 = _mm512_xor_si512(b1, a5);\ + b7 = _mm512_xor_si512(b7, a5);\ + b2 = _mm512_xor_si512(b2, a6);\ + b0 = _mm512_xor_si512(b0, a6);\ + /* spill values y_4, y_5 to memory */\ + TEMP0 = b0;\ + b3 = _mm512_xor_si512(b3, a7);\ + b1 = _mm512_xor_si512(b1, a7);\ + TEMP1 = b1;\ + b4 = _mm512_xor_si512(b4, a0);\ + b2 = _mm512_xor_si512(b2, a0);\ + /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ + b0 = a0;\ + b5 = _mm512_xor_si512(b5, a1);\ + b3 = _mm512_xor_si512(b3, a1);\ + b1 = a1;\ + b6 = _mm512_xor_si512(b6, a2);\ + b4 = _mm512_xor_si512(b4, a2);\ + TEMP2 = a2;\ + b7 = _mm512_xor_si512(b7, a3);\ + b5 = _mm512_xor_si512(b5, a3);\ + \ + /* compute x_i = t_i + t_{i+3} */\ + a0 = _mm512_xor_si512(a0, a3);\ + a1 = _mm512_xor_si512(a1, a4);\ + a2 = _mm512_xor_si512(a2, a5);\ + a3 = _mm512_xor_si512(a3, a6);\ + a4 = _mm512_xor_si512(a4, a7);\ + a5 = _mm512_xor_si512(a5, b0);\ + a6 = _mm512_xor_si512(a6, b1);\ + a7 = _mm512_xor_si512(a7, TEMP2);\ + \ + /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ + /* compute w_i : add y_{i+4} */\ + b1 = ALL_1B;\ + MUL2(a0, b0, b1);\ + a0 = _mm512_xor_si512(a0, TEMP0);\ + MUL2(a1, b0, b1);\ + a1 = _mm512_xor_si512(a1, TEMP1);\ + MUL2(a2, b0, b1);\ + a2 = _mm512_xor_si512(a2, b2);\ + MUL2(a3, b0, b1);\ + a3 = _mm512_xor_si512(a3, b3);\ + MUL2(a4, b0, b1);\ + a4 = _mm512_xor_si512(a4, b4);\ + MUL2(a5, b0, b1);\ + a5 = _mm512_xor_si512(a5, b5);\ + MUL2(a6, b0, b1);\ + a6 = _mm512_xor_si512(a6, b6);\ + MUL2(a7, b0, b1);\ + a7 = _mm512_xor_si512(a7, b7);\ + \ + /* compute v_i : double w_i */\ + /* add to y_4 y_5 .. v3, v4, ... */\ + MUL2(a0, b0, b1);\ + b5 = _mm512_xor_si512(b5, a0);\ + MUL2(a1, b0, b1);\ + b6 = _mm512_xor_si512(b6, a1);\ + MUL2(a2, b0, b1);\ + b7 = _mm512_xor_si512(b7, a2);\ + MUL2(a5, b0, b1);\ + b2 = _mm512_xor_si512(b2, a5);\ + MUL2(a6, b0, b1);\ + b3 = _mm512_xor_si512(b3, a6);\ + MUL2(a7, b0, b1);\ + b4 = _mm512_xor_si512(b4, a7);\ + MUL2(a3, b0, b1);\ + MUL2(a4, b0, b1);\ + b0 = TEMP0;\ + b1 = TEMP1;\ + b0 = _mm512_xor_si512(b0, a3);\ + b1 = _mm512_xor_si512(b1, a4);\ +}/*MixBytes*/ + +// calculate the round constants seperately and load at startup + +#define SET_CONSTANTS(){\ + ALL_FF = _mm512_set1_epi32( 0xffffffff );\ + ALL_1B = _mm512_set1_epi32( 0x1b1b1b1b );\ + TRANSP_MASK = _mm512_set_epi32( \ + 0x3f373b33, 0x3e363a32, 0x3d353931, 0x3c343830, \ + 0x2f272b23, 0x2e262a22, 0x2d252921, 0x2c242820, \ + 0x1f171b13, 0x1e161a12, 0x1d151911, 0x1c141810, \ + 0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800 ); \ + SUBSH_MASK[0] = _mm512_set_epi32( \ + 0x3336393c, 0x3f323538, 0x3b3e3134, 0x373a3d30, \ + 0x2326292c, 0x2f222528, 0x2b2e2124, 0x272a2d20, \ + 0x1316191c, 0x1f121518, 0x1b1e1114, 0x171a1d10, \ + 0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00 ); \ + SUBSH_MASK[1] = _mm512_set_epi32( \ + 0x34373a3d, 0x30333639, 0x3c3f3235, 0x383b3e31, \ + 0x24272a2d, 0x20232629, 0x2c2f2225, 0x282b2e21, \ + 0x14171a1d, 0x10131619, 0x1c1f1215, 0x181b1e11, \ + 0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01 ); \ + SUBSH_MASK[2] = _mm512_set_epi32( \ + 0x35383b3e, 0x3134373a, 0x3d303336, 0x393c3f32, \ + 0x25282b2e, 0x2124272a, 0x2d202326, 0x292c2f22, \ + 0x15181b1e, 0x1114171a, 0x1d101316, 0x191c1f12, \ + 0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02 ); \ + SUBSH_MASK[3] = _mm512_set_epi32( \ + 0x36393c3f, 0x3235383b, 0x3e313437, 0x3a3d3033, \ + 0x26292c2f, 0x2225282b, 0x2e212427, 0x2a2d2023, \ + 0x16191c1f, 0x1215181b, 0x1e111417, 
0x1a1d1013, \ + 0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003 ); \ + SUBSH_MASK[4] = _mm512_set_epi32( \ + 0x373a3d30, 0x3336393c, 0x3f323538, 0x3b3e3134, \ + 0x272a2d20, 0x2326292c, 0x2f222528, 0x2b2e2124, \ + 0x171a1d10, 0x1316191c, 0x1f121518, 0x1b1e1114, \ + 0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104 ); \ + SUBSH_MASK[5] = _mm512_set_epi32( \ + 0x383b3e31, 0x34373a3d, 0x30333639, 0x3c3f3235, \ + 0x282b2e21, 0x24272a2d, 0x20232629, 0x2c2f2225, \ + 0x181b1e11, 0x14171a1d, 0x10131619, 0x1c1f1215, \ + 0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205 ); \ + SUBSH_MASK[6] = _mm512_set_epi32( \ + 0x393c3f32, 0x35383b3e, 0x3134373a, 0x3d303336, \ + 0x292c2f22, 0x25282b2e, 0x2124272a, 0x2d202326, \ + 0x191c1f12, 0x15181b1e, 0x1114171a, 0x1d101316, \ + 0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306 ); \ + SUBSH_MASK[7] = _mm512_set_epi32( \ + 0x3e313437, 0x3a3d3033, 0x36393c3f, 0x3235383b, \ + 0x2e212427, 0x2a2d2023, 0x26292c2f, 0x2225282b, \ + 0x1e111417, 0x1a1d1013, 0x16191c1f, 0x1215181b, \ + 0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b ); \ + for( i = 0; i < ROUNDS1024; i++ ) \ + { \ + ROUND_CONST_P[i] = _mm512_set4_epi32( 0xf0e0d0c0 ^ (i * 0x01010101), \ + 0xb0a09080 ^ (i * 0x01010101), \ + 0x70605040 ^ (i * 0x01010101), \ + 0x30201000 ^ (i * 0x01010101) ); \ + ROUND_CONST_Q[i] = _mm512_set4_epi32( 0x0f1f2f3f ^ (i * 0x01010101), \ + 0x4f5f6f7f ^ (i * 0x01010101), \ + 0x8f9fafbf ^ (i * 0x01010101), \ + 0xcfdfefff ^ (i * 0x01010101));\ + } \ +}while(0);\ + +/* one round + * a0-a7 = input rows + * b0-b7 = output rows + */ +#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ + /* SubBytes */\ + b0 = _mm512_xor_si512( b0, b0 );\ + a0 = _mm512_aesenclast_epi128( a0, b0 );\ + a1 = _mm512_aesenclast_epi128( a1, b0 );\ + a2 = _mm512_aesenclast_epi128( a2, b0 );\ + a3 = _mm512_aesenclast_epi128( a3, b0 );\ + a4 = _mm512_aesenclast_epi128( a4, b0 );\ + a5 = _mm512_aesenclast_epi128( a5, b0 );\ + a6 = _mm512_aesenclast_epi128( a6, b0 );\ + a7 = _mm512_aesenclast_epi128( a7, b0 );\ + /* MixBytes */\ + MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ +} + +#define ROUNDS_P(){\ + uint8_t round_counter = 0;\ + for ( round_counter = 0; round_counter < 14; round_counter += 2 ) \ + { \ + /* AddRoundConstant P1024 */\ + xmm8 = _mm512_xor_si512( xmm8, ( ROUND_CONST_P[ round_counter ] ) );\ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + xmm8 = _mm512_shuffle_epi8( xmm8, ( SUBSH_MASK[0] ) );\ + xmm9 = _mm512_shuffle_epi8( xmm9, ( SUBSH_MASK[1] ) );\ + xmm10 = _mm512_shuffle_epi8( xmm10, ( SUBSH_MASK[2] ) );\ + xmm11 = _mm512_shuffle_epi8( xmm11, ( SUBSH_MASK[3] ) );\ + xmm12 = _mm512_shuffle_epi8( xmm12, ( SUBSH_MASK[4] ) );\ + xmm13 = _mm512_shuffle_epi8( xmm13, ( SUBSH_MASK[5] ) );\ + xmm14 = _mm512_shuffle_epi8( xmm14, ( SUBSH_MASK[6] ) );\ + xmm15 = _mm512_shuffle_epi8( xmm15, ( SUBSH_MASK[7] ) );\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + \ + /* AddRoundConstant P1024 */\ + xmm0 = _mm512_xor_si512( xmm0, ( ROUND_CONST_P[ round_counter+1 ] ) );\ + /* ShiftBytes P1024 + pre-AESENCLAST */\ + xmm0 = _mm512_shuffle_epi8( xmm0, ( SUBSH_MASK[0] ) );\ + xmm1 = _mm512_shuffle_epi8( xmm1, ( SUBSH_MASK[1] ) );\ + xmm2 = _mm512_shuffle_epi8( xmm2, ( SUBSH_MASK[2] ) );\ + xmm3 = _mm512_shuffle_epi8( xmm3, ( SUBSH_MASK[3] ) );\ + xmm4 = _mm512_shuffle_epi8( xmm4, ( SUBSH_MASK[4] ) );\ + xmm5 = _mm512_shuffle_epi8( xmm5, ( SUBSH_MASK[5] ) );\ + xmm6 = 
_mm512_shuffle_epi8( xmm6, ( SUBSH_MASK[6] ) );\ + xmm7 = _mm512_shuffle_epi8( xmm7, ( SUBSH_MASK[7] ) );\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + }\ +} + +#define ROUNDS_Q(){\ + uint8_t round_counter = 0;\ + for ( round_counter = 0; round_counter < 14; round_counter += 2) \ + { \ + /* AddRoundConstant Q1024 */\ + xmm1 = ALL_FF;\ + xmm8 = _mm512_xor_si512( xmm8, xmm1 );\ + xmm9 = _mm512_xor_si512( xmm9, xmm1 );\ + xmm10 = _mm512_xor_si512( xmm10, xmm1 );\ + xmm11 = _mm512_xor_si512( xmm11, xmm1 );\ + xmm12 = _mm512_xor_si512( xmm12, xmm1 );\ + xmm13 = _mm512_xor_si512( xmm13, xmm1 );\ + xmm14 = _mm512_xor_si512( xmm14, xmm1 );\ + xmm15 = _mm512_xor_si512( xmm15, ( ROUND_CONST_Q[ round_counter ] ) );\ + /* ShiftBytes Q1024 + pre-AESENCLAST */\ + xmm8 = _mm512_shuffle_epi8( xmm8, ( SUBSH_MASK[1] ) );\ + xmm9 = _mm512_shuffle_epi8( xmm9, ( SUBSH_MASK[3] ) );\ + xmm10 = _mm512_shuffle_epi8( xmm10, ( SUBSH_MASK[5] ) );\ + xmm11 = _mm512_shuffle_epi8( xmm11, ( SUBSH_MASK[7] ) );\ + xmm12 = _mm512_shuffle_epi8( xmm12, ( SUBSH_MASK[0] ) );\ + xmm13 = _mm512_shuffle_epi8( xmm13, ( SUBSH_MASK[2] ) );\ + xmm14 = _mm512_shuffle_epi8( xmm14, ( SUBSH_MASK[4] ) );\ + xmm15 = _mm512_shuffle_epi8( xmm15, ( SUBSH_MASK[6] ) );\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ + \ + /* AddRoundConstant Q1024 */\ + xmm9 = ALL_FF;\ + xmm0 = _mm512_xor_si512( xmm0, xmm9 );\ + xmm1 = _mm512_xor_si512( xmm1, xmm9 );\ + xmm2 = _mm512_xor_si512( xmm2, xmm9 );\ + xmm3 = _mm512_xor_si512( xmm3, xmm9 );\ + xmm4 = _mm512_xor_si512( xmm4, xmm9 );\ + xmm5 = _mm512_xor_si512( xmm5, xmm9 );\ + xmm6 = _mm512_xor_si512( xmm6, xmm9 );\ + xmm7 = _mm512_xor_si512( xmm7, ( ROUND_CONST_Q[ round_counter+1 ] ) );\ + /* ShiftBytes Q1024 + pre-AESENCLAST */\ + xmm0 = _mm512_shuffle_epi8( xmm0, ( SUBSH_MASK[1] ) );\ + xmm1 = _mm512_shuffle_epi8( xmm1, ( SUBSH_MASK[3] ) );\ + xmm2 = _mm512_shuffle_epi8( xmm2, ( SUBSH_MASK[5] ) );\ + xmm3 = _mm512_shuffle_epi8( xmm3, ( SUBSH_MASK[7] ) );\ + xmm4 = _mm512_shuffle_epi8( xmm4, ( SUBSH_MASK[0] ) );\ + xmm5 = _mm512_shuffle_epi8( xmm5, ( SUBSH_MASK[2] ) );\ + xmm6 = _mm512_shuffle_epi8( xmm6, ( SUBSH_MASK[4] ) );\ + xmm7 = _mm512_shuffle_epi8( xmm7, ( SUBSH_MASK[6] ) );\ + /* SubBytes + MixBytes */\ + SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ + }\ +} + +/* Matrix Transpose + * input is a 1024-bit state with two columns in one xmm + * output is a 1024-bit state with two rows in one xmm + * inputs: i0-i7 + * outputs: i0-i7 + * clobbers: t0-t7 + */ +#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ + t0 = TRANSP_MASK;\ +\ + i6 = _mm512_shuffle_epi8(i6, t0);\ + i0 = _mm512_shuffle_epi8(i0, t0);\ + i1 = _mm512_shuffle_epi8(i1, t0);\ + i2 = _mm512_shuffle_epi8(i2, t0);\ + i3 = _mm512_shuffle_epi8(i3, t0);\ + t1 = i2;\ + i4 = _mm512_shuffle_epi8(i4, t0);\ + i5 = _mm512_shuffle_epi8(i5, t0);\ + t2 = i4;\ + t3 = i6;\ + i7 = _mm512_shuffle_epi8(i7, t0);\ +\ + /* continue with unpack using 4 temp registers */\ + t0 = i0;\ + t2 = _mm512_unpackhi_epi16(t2, i5);\ + i4 = _mm512_unpacklo_epi16(i4, i5);\ + t3 = _mm512_unpackhi_epi16(t3, i7);\ + i6 = _mm512_unpacklo_epi16(i6, i7);\ + t0 = _mm512_unpackhi_epi16(t0, i1);\ + t1 = _mm512_unpackhi_epi16(t1, i3);\ + i2 = _mm512_unpacklo_epi16(i2, i3);\ + i0 = 
_mm512_unpacklo_epi16(i0, i1);\ +\ + /* shuffle with immediate */\ + t0 = _mm512_shuffle_epi32(t0, 216);\ + t1 = _mm512_shuffle_epi32(t1, 216);\ + t2 = _mm512_shuffle_epi32(t2, 216);\ + t3 = _mm512_shuffle_epi32(t3, 216);\ + i0 = _mm512_shuffle_epi32(i0, 216);\ + i2 = _mm512_shuffle_epi32(i2, 216);\ + i4 = _mm512_shuffle_epi32(i4, 216);\ + i6 = _mm512_shuffle_epi32(i6, 216);\ +\ + /* continue with unpack */\ + t4 = i0;\ + i0 = _mm512_unpacklo_epi32(i0, i2);\ + t4 = _mm512_unpackhi_epi32(t4, i2);\ + t5 = t0;\ + t0 = _mm512_unpacklo_epi32(t0, t1);\ + t5 = _mm512_unpackhi_epi32(t5, t1);\ + t6 = i4;\ + i4 = _mm512_unpacklo_epi32(i4, i6);\ + t7 = t2;\ + t6 = _mm512_unpackhi_epi32(t6, i6);\ + i2 = t0;\ + t2 = _mm512_unpacklo_epi32(t2, t3);\ + i3 = t0;\ + t7 = _mm512_unpackhi_epi32(t7, t3);\ +\ + /* there are now 2 rows in each xmm */\ + /* unpack to get 1 row of CV in each xmm */\ + i1 = i0;\ + i1 = _mm512_unpackhi_epi64(i1, i4);\ + i0 = _mm512_unpacklo_epi64(i0, i4);\ + i4 = t4;\ + i3 = _mm512_unpackhi_epi64(i3, t2);\ + i5 = t4;\ + i2 = _mm512_unpacklo_epi64(i2, t2);\ + i6 = t5;\ + i5 = _mm512_unpackhi_epi64(i5, t6);\ + i7 = t5;\ + i4 = _mm512_unpacklo_epi64(i4, t6);\ + i7 = _mm512_unpackhi_epi64(i7, t7);\ + i6 = _mm512_unpacklo_epi64(i6, t7);\ + /* transpose done */\ +}/**/ + +/* Matrix Transpose Inverse + * input is a 1024-bit state with two rows in one xmm + * output is a 1024-bit state with two columns in one xmm + * inputs: i0-i7 + * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) + * clobbers: t0-t4 + */ +#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ + /* transpose matrix to get output format */\ + o1 = i0;\ + i0 = _mm512_unpacklo_epi64(i0, i1);\ + o1 = _mm512_unpackhi_epi64(o1, i1);\ + t0 = i2;\ + i2 = _mm512_unpacklo_epi64(i2, i3);\ + t0 = _mm512_unpackhi_epi64(t0, i3);\ + t1 = i4;\ + i4 = _mm512_unpacklo_epi64(i4, i5);\ + t1 = _mm512_unpackhi_epi64(t1, i5);\ + t2 = i6;\ + o0 = TRANSP_MASK;\ + i6 = _mm512_unpacklo_epi64(i6, i7);\ + t2 = _mm512_unpackhi_epi64(t2, i7);\ + /* load transpose mask into a register, because it will be used 8 times */\ + i0 = _mm512_shuffle_epi8(i0, o0);\ + i2 = _mm512_shuffle_epi8(i2, o0);\ + i4 = _mm512_shuffle_epi8(i4, o0);\ + i6 = _mm512_shuffle_epi8(i6, o0);\ + o1 = _mm512_shuffle_epi8(o1, o0);\ + t0 = _mm512_shuffle_epi8(t0, o0);\ + t1 = _mm512_shuffle_epi8(t1, o0);\ + t2 = _mm512_shuffle_epi8(t2, o0);\ + /* continue with unpack using 4 temp registers */\ + t3 = i4;\ + o2 = o1;\ + o0 = i0;\ + t4 = t1;\ + \ + t3 = _mm512_unpackhi_epi16(t3, i6);\ + i4 = _mm512_unpacklo_epi16(i4, i6);\ + o0 = _mm512_unpackhi_epi16(o0, i2);\ + i0 = _mm512_unpacklo_epi16(i0, i2);\ + o2 = _mm512_unpackhi_epi16(o2, t0);\ + o1 = _mm512_unpacklo_epi16(o1, t0);\ + t4 = _mm512_unpackhi_epi16(t4, t2);\ + t1 = _mm512_unpacklo_epi16(t1, t2);\ + /* shuffle with immediate */\ + i4 = _mm512_shuffle_epi32(i4, 216);\ + t3 = _mm512_shuffle_epi32(t3, 216);\ + o1 = _mm512_shuffle_epi32(o1, 216);\ + o2 = _mm512_shuffle_epi32(o2, 216);\ + i0 = _mm512_shuffle_epi32(i0, 216);\ + o0 = _mm512_shuffle_epi32(o0, 216);\ + t1 = _mm512_shuffle_epi32(t1, 216);\ + t4 = _mm512_shuffle_epi32(t4, 216);\ + /* continue with unpack */\ + i1 = i0;\ + i3 = o0;\ + i5 = o1;\ + i7 = o2;\ + i0 = _mm512_unpacklo_epi32(i0, i4);\ + i1 = _mm512_unpackhi_epi32(i1, i4);\ + o0 = _mm512_unpacklo_epi32(o0, t3);\ + i3 = _mm512_unpackhi_epi32(i3, t3);\ + o1 = _mm512_unpacklo_epi32(o1, t1);\ + i5 = _mm512_unpackhi_epi32(i5, t1);\ + o2 = _mm512_unpacklo_epi32(o2, t4);\ + i7 = 
_mm512_unpackhi_epi32(i7, t4);\ + /* transpose done */\ +}/**/ + + +void INIT_4way( __m512i* chaining ) +{ + static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + + /* load IV into registers xmm8 - xmm15 */ + xmm8 = chaining[0]; + xmm9 = chaining[1]; + xmm10 = chaining[2]; + xmm11 = chaining[3]; + xmm12 = chaining[4]; + xmm13 = chaining[5]; + xmm14 = chaining[6]; + xmm15 = chaining[7]; + + /* transform chaining value from column ordering into row ordering */ + Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + + /* store transposed IV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; +} + +void TF1024_4way( __m512i* chaining, const __m512i* message ) +{ + static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m512i QTEMP[8]; + static __m512i TEMP0; + static __m512i TEMP1; + static __m512i TEMP2; + + /* load message into registers xmm8 - xmm15 (Q = message) */ + xmm8 = message[0]; + xmm9 = message[1]; + xmm10 = message[2]; + xmm11 = message[3]; + xmm12 = message[4]; + xmm13 = message[5]; + xmm14 = message[6]; + xmm15 = message[7]; + + /* transform message M from column ordering into row ordering */ + Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); + + /* store message M (Q input) for later */ + QTEMP[0] = xmm8; + QTEMP[1] = xmm9; + QTEMP[2] = xmm10; + QTEMP[3] = xmm11; + QTEMP[4] = xmm12; + QTEMP[5] = xmm13; + QTEMP[6] = xmm14; + QTEMP[7] = xmm15; + + /* xor CV to message to get P input */ + /* result: CV+M in xmm8...xmm15 */ + xmm8 = _mm512_xor_si512( xmm8, (chaining[0]) ); + xmm9 = _mm512_xor_si512( xmm9, (chaining[1]) ); + xmm10 = _mm512_xor_si512( xmm10, (chaining[2]) ); + xmm11 = _mm512_xor_si512( xmm11, (chaining[3]) ); + xmm12 = _mm512_xor_si512( xmm12, (chaining[4]) ); + xmm13 = _mm512_xor_si512( xmm13, (chaining[5]) ); + xmm14 = _mm512_xor_si512( xmm14, (chaining[6]) ); + xmm15 = _mm512_xor_si512( xmm15, (chaining[7]) ); + + /* compute permutation P */ + /* result: P(CV+M) in xmm8...xmm15 */ + ROUNDS_P(); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV+M)+CV in xmm8...xmm15 */ + xmm8 = _mm512_xor_si512( xmm8, (chaining[0]) ); + xmm9 = _mm512_xor_si512( xmm9, (chaining[1]) ); + xmm10 = _mm512_xor_si512( xmm10, (chaining[2]) ); + xmm11 = _mm512_xor_si512( xmm11, (chaining[3]) ); + xmm12 = _mm512_xor_si512( xmm12, (chaining[4]) ); + xmm13 = _mm512_xor_si512( xmm13, (chaining[5]) ); + xmm14 = _mm512_xor_si512( xmm14, (chaining[6]) ); + xmm15 = _mm512_xor_si512( xmm15, (chaining[7]) ); + + /* store P(CV+M)+CV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; + + /* load message M (Q input) into xmm8-15 */ + xmm8 = QTEMP[0]; + xmm9 = QTEMP[1]; + xmm10 = QTEMP[2]; + xmm11 = QTEMP[3]; + xmm12 = QTEMP[4]; + xmm13 = QTEMP[5]; + xmm14 = QTEMP[6]; + xmm15 = QTEMP[7]; + + /* compute permutation Q */ + /* result: Q(M) in xmm8...xmm15 */ + ROUNDS_Q(); + + /* xor Q output */ + /* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */ + xmm8 = _mm512_xor_si512( xmm8, (chaining[0]) ); + xmm9 = _mm512_xor_si512( xmm9, 
(chaining[1]) ); + xmm10 = _mm512_xor_si512( xmm10, (chaining[2]) ); + xmm11 = _mm512_xor_si512( xmm11, (chaining[3]) ); + xmm12 = _mm512_xor_si512( xmm12, (chaining[4]) ); + xmm13 = _mm512_xor_si512( xmm13, (chaining[5]) ); + xmm14 = _mm512_xor_si512( xmm14, (chaining[6]) ); + xmm15 = _mm512_xor_si512( xmm15, (chaining[7]) ); + + /* store CV */ + chaining[0] = xmm8; + chaining[1] = xmm9; + chaining[2] = xmm10; + chaining[3] = xmm11; + chaining[4] = xmm12; + chaining[5] = xmm13; + chaining[6] = xmm14; + chaining[7] = xmm15; + + return; +} + +void OF1024_4way( __m512i* chaining ) +{ + static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; + static __m512i TEMP0; + static __m512i TEMP1; + static __m512i TEMP2; + + /* load CV into registers xmm8 - xmm15 */ + xmm8 = chaining[0]; + xmm9 = chaining[1]; + xmm10 = chaining[2]; + xmm11 = chaining[3]; + xmm12 = chaining[4]; + xmm13 = chaining[5]; + xmm14 = chaining[6]; + xmm15 = chaining[7]; + + /* compute permutation P */ + /* result: P(CV) in xmm8...xmm15 */ + ROUNDS_P(); + + /* xor CV to P output (feed-forward) */ + /* result: P(CV)+CV in xmm8...xmm15 */ + xmm8 = _mm512_xor_si512( xmm8, (chaining[0]) ); + xmm9 = _mm512_xor_si512( xmm9, (chaining[1]) ); + xmm10 = _mm512_xor_si512( xmm10, (chaining[2]) ); + xmm11 = _mm512_xor_si512( xmm11, (chaining[3]) ); + xmm12 = _mm512_xor_si512( xmm12, (chaining[4]) ); + xmm13 = _mm512_xor_si512( xmm13, (chaining[5]) ); + xmm14 = _mm512_xor_si512( xmm14, (chaining[6]) ); + xmm15 = _mm512_xor_si512( xmm15, (chaining[7]) ); + + /* transpose CV back from row ordering to column ordering */ + /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */ + Matrix_Transpose_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm4, xmm0, xmm6, xmm1, xmm2, xmm3, xmm5, xmm7); + + /* we only need to return the truncated half of the state */ + chaining[4] = xmm0; + chaining[5] = xmm6; + chaining[6] = xmm13; + chaining[7] = xmm15; + + return; +} + +#endif // VAES +#endif // GROESTL512_INTR_4WAY_H__ diff --git a/algo/groestl/myrgr-4way.c b/algo/groestl/myrgr-4way.c index 17f0cf1..7f8d3ba 100644 --- a/algo/groestl/myrgr-4way.c +++ b/algo/groestl/myrgr-4way.c @@ -1,14 +1,159 @@ #include "myrgr-gate.h" - -#if defined(MYRGR_4WAY) - #include #include #include #include - #include "aes_ni/hash-groestl.h" #include "algo/sha/sha-hash-4way.h" +#if defined(__VAES__) + #include "groestl512-hash-4way.h" +#endif + +#if defined(MYRGR_8WAY) + +typedef struct { +#if defined(__VAES__) + groestl512_4way_context groestl; +#else + hashState_groestl groestl; +#endif + sha256_8way_context sha; +} myrgr_8way_ctx_holder; + +myrgr_8way_ctx_holder myrgr_8way_ctx; + +void init_myrgr_8way_ctx() +{ +#if defined(__VAES__) + groestl512_4way_init( &myrgr_8way_ctx.groestl, 64 ); +#else + init_groestl( &myrgr_8way_ctx.groestl, 64 ); +#endif + sha256_8way_init( &myrgr_8way_ctx.sha ); +} + +void myriad_8way_hash( void *output, const void *input ) +{ + uint32_t vhash[16*8] __attribute__ ((aligned (128))); + uint32_t vhashA[20*8] __attribute__ ((aligned (64))); + uint32_t vhashB[20*8] __attribute__ ((aligned (64))); + myrgr_8way_ctx_holder ctx; + memcpy( &ctx, &myrgr_8way_ctx, sizeof(myrgr_8way_ctx) ); + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, input, 640 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 640 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 640 ); + + uint32_t hash0[20] __attribute__ ((aligned (64))); + uint32_t 
hash1[20] __attribute__ ((aligned (64))); + uint32_t hash2[20] __attribute__ ((aligned (64))); + uint32_t hash3[20] __attribute__ ((aligned (64))); + uint32_t hash4[20] __attribute__ ((aligned (64))); + uint32_t hash5[20] __attribute__ ((aligned (64))); + uint32_t hash6[20] __attribute__ ((aligned (64))); + uint32_t hash7[20] __attribute__ ((aligned (64))); + +// rintrlv_4x128_8x32( vhash, vhashA, vhashB, 512 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, + hash6, hash7 ); + +#else + + uint32_t hash0[20] __attribute__ ((aligned (64))); + uint32_t hash1[20] __attribute__ ((aligned (64))); + uint32_t hash2[20] __attribute__ ((aligned (64))); + uint32_t hash3[20] __attribute__ ((aligned (64))); + uint32_t hash4[20] __attribute__ ((aligned (64))); + uint32_t hash5[20] __attribute__ ((aligned (64))); + uint32_t hash6[20] __attribute__ ((aligned (64))); + uint32_t hash7[20] __attribute__ ((aligned (64))); + + dintrlv_8x64( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, input, 640 ); + + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 ); + memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 640 ); + memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 640 ); + memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 ); + memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 640 ); + memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 640 ); + memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 640 ); + memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 640 ); + memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) ); + + intrlv_8x32( vhash, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, 512 ); + +#endif + + sha256_8way_update( &ctx.sha, vhash, 64 ); + sha256_8way_close( &ctx.sha, output ); +} + +int scanhash_myriad_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*8] __attribute__ ((aligned (128))); + uint32_t vdata[20*8] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *hash7 = &(hash[7<<3]); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t Htarg = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + uint32_t n = first_nonce; + uint32_t *noncep = vdata + 64+3; // 4*16 + 3 + int thr_id = mythr->id; // thr_id arg is deprecated + + if ( opt_benchmark ) + ( (uint32_t*)ptarget )[7] = 0x0000ff; + + mm512_bswap32_intrlv80_4x128( vdata, pdata ); + + do + { + be32enc( noncep, n ); + be32enc( noncep+ 8, n+1 ); + be32enc( noncep+16, n+2 ); + be32enc( noncep+24, n+3 ); + be32enc( noncep+32, n+4 ); + be32enc( noncep+40, n+5 ); + be32enc( noncep+48, n+6 ); + be32enc( 
noncep+56, n+7 ); + + myriad_8way_hash( hash, vdata ); + pdata[19] = n; + + for ( int lane = 0; lane < 8; lane++ ) + if ( hash7[ lane ] <= Htarg ) + { + extr_lane_8x32( lane_hash, hash, lane, 256 ); + if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 8; + } while ( (n < last_nonce) && !work_restart[thr_id].restart); + + *hashes_done = n - first_nonce; + return 0; +} + +#elif defined(MYRGR_4WAY) typedef struct { hashState_groestl groestl; @@ -45,7 +190,7 @@ void myriad_4way_hash( void *output, const void *input ) intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); - sha256_4way( &ctx.sha, vhash, 64 ); + sha256_4way_update( &ctx.sha, vhash, 64 ); sha256_4way_close( &ctx.sha, output ); } diff --git a/algo/groestl/myrgr-gate.c b/algo/groestl/myrgr-gate.c index 7f8e185..f82aafb 100644 --- a/algo/groestl/myrgr-gate.c +++ b/algo/groestl/myrgr-gate.c @@ -2,16 +2,22 @@ bool register_myriad_algo( algo_gate_t* gate ) { -#if defined (MYRGR_4WAY) +#if defined (MYRGR_8WAY) + init_myrgr_8way_ctx(); + gate->scanhash = (void*)&scanhash_myriad_8way; + gate->hash = (void*)&myriad_8way_hash; + gate->optimizations = AES_OPT | AVX2_OPT | VAES_OPT; +#elif defined (MYRGR_4WAY) init_myrgr_4way_ctx(); gate->scanhash = (void*)&scanhash_myriad_4way; gate->hash = (void*)&myriad_4way_hash; + gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | VAES_OPT; #else init_myrgr_ctx(); gate->scanhash = (void*)&scanhash_myriad; gate->hash = (void*)&myriad_hash; + gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA_OPT | VAES_OPT; #endif - gate->optimizations = AES_OPT | AVX2_OPT; return true; }; diff --git a/algo/groestl/myrgr-gate.h b/algo/groestl/myrgr-gate.h index 706bdb7..80cc3fd 100644 --- a/algo/groestl/myrgr-gate.h +++ b/algo/groestl/myrgr-gate.h @@ -1,30 +1,35 @@ #ifndef MYRGR_GATE_H__ -#define MYRGR_GATE_H__ +#define MYRGR_GATE_H__ 1 #include "algo-gate-api.h" #include <stdint.h> -#if defined(__AVX2__) && defined(__AES__) && !defined(__SHA__) - #define MYRGR_4WAY +#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define MYRGR_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) && !defined(__SHA__) + #define MYRGR_4WAY 1 #endif -#if defined(MYRGR_4WAY) +#if defined(MYRGR_8WAY) + +void myriad_8way_hash( void *state, const void *input ); +int scanhash_myriad_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +void init_myrgr_8way_ctx(); + +#elif defined(MYRGR_4WAY) void myriad_4way_hash( void *state, const void *input ); - int scanhash_myriad_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); - void init_myrgr_4way_ctx(); -#endif +#else void myriad_hash( void *state, const void *input ); - int scanhash_myriad( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); - void init_myrgr_ctx(); #endif - +#endif diff --git a/algo/hamsi/hamsi-hash-4way.c b/algo/hamsi/hamsi-hash-4way.c index 0a1e6e2..d86bd42 100644 --- a/algo/hamsi/hamsi-hash-4way.c +++ b/algo/hamsi/hamsi-hash-4way.c @@ -1171,7 +1171,8 @@ void hamsi512_4way_init( hamsi_4way_big_context *sc ) sc->h[7] = m256_const1_64( 0x6769756d2042656c ); } -void hamsi512_4way( hamsi_4way_big_context *sc, const void *data, size_t len ) +void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data, + size_t len ) { __m256i *vdata = (__m256i*)data; diff --git
a/algo/hamsi/hamsi-hash-4way.h b/algo/hamsi/hamsi-hash-4way.h index 4e57f10..60e33b2 100644 --- a/algo/hamsi/hamsi-hash-4way.h +++ b/algo/hamsi/hamsi-hash-4way.h @@ -62,7 +62,7 @@ typedef hamsi_4way_big_context hamsi512_4way_context; void hamsi512_4way_init( hamsi512_4way_context *sc ); void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data, size_t len ); -#define hamsi512_4way hamsi512_4way_update +//#define hamsi512_4way hamsi512_4way_update void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst ); #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) diff --git a/algo/haval/haval-4way-helper.c b/algo/haval/haval-4way-helper.c index c9e7ad8..313b23f 100644 --- a/algo/haval/haval-4way-helper.c +++ b/algo/haval/haval-4way-helper.c @@ -38,7 +38,7 @@ #define SPH_XCAT_(a, b) a ## b static void -SPH_XCAT(SPH_XCAT(haval, PASSES), _4way) +SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update) ( haval_4way_context *sc, const void *data, size_t len ) { __m128i *vdata = (__m128i*)data; diff --git a/algo/haval/haval-hash-4way.c b/algo/haval/haval-hash-4way.c index 02df40f..6b45e10 100644 --- a/algo/haval/haval-hash-4way.c +++ b/algo/haval/haval-hash-4way.c @@ -479,9 +479,9 @@ haval ## xxx ## _ ## y ## _4way_init(void *cc) \ } \ \ void \ -haval ## xxx ## _ ## y ## _4way (void *cc, const void *data, size_t len) \ +haval ## xxx ## _ ## y ## _4way_update (void *cc, const void *data, size_t len) \ { \ - haval ## y ## _4way(cc, data, len); \ + haval ## y ## _4way_update(cc, data, len); \ } \ \ void \ diff --git a/algo/haval/haval-hash-4way.h b/algo/haval/haval-hash-4way.h index 9bd37ba..9164d2f 100644 --- a/algo/haval/haval-hash-4way.h +++ b/algo/haval/haval-hash-4way.h @@ -85,7 +85,7 @@ typedef haval_4way_context haval256_5_4way_context; void haval256_5_4way_init( void *cc ); void haval256_5_4way_update( void *cc, const void *data, size_t len ); -#define haval256_5_4way haval256_5_4way_update +//#define haval256_5_4way haval256_5_4way_update void haval256_5_4way_close( void *cc, void *dst ); diff --git a/algo/jh/jh-hash-4way.h b/algo/jh/jh-hash-4way.h index 5cccebd..562fd5e 100644 --- a/algo/jh/jh-hash-4way.h +++ b/algo/jh/jh-hash-4way.h @@ -103,14 +103,12 @@ typedef jh_4way_context jh512_4way_context; void jh256_4way_init( jh_4way_context *sc); void jh256_4way_update(void *cc, const void *data, size_t len); -#define jh256_4way jh256_4way_update void jh256_4way_close(void *cc, void *dst); void jh512_4way_init( jh_4way_context *sc ); void jh512_4way_update(void *cc, const void *data, size_t len); -#define jh512_4way jh512_4way_update void jh512_4way_close(void *cc, void *dst); diff --git a/algo/jh/jha-4way.c b/algo/jh/jha-4way.c index 2c76a33..68ffe7f 100644 --- a/algo/jh/jha-4way.c +++ b/algo/jh/jha-4way.c @@ -33,7 +33,7 @@ void jha_hash_4way( void *out, const void *input ) keccak512_4way_context ctx_keccak; keccak512_4way_init( &ctx_keccak ); - keccak512_4way( &ctx_keccak, input, 80 ); + keccak512_4way_update( &ctx_keccak, input, 80 ); keccak512_4way_close( &ctx_keccak, vhash ); // Heavy & Light Pair Loop @@ -58,7 +58,7 @@ void jha_hash_4way( void *out, const void *input ) intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 ); skein512_4way_init( &ctx_skein ); - skein512_4way( &ctx_skein, vhash, 64 ); + skein512_4way_update( &ctx_skein, vhash, 64 ); skein512_4way_close( &ctx_skein, vhashB ); for ( int i = 0; i < 8; i++ ) @@ -69,7 +69,7 @@ void jha_hash_4way( void *out, const void *input ) blake512_4way_close( &ctx_blake, vhashA ); 
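// Editor's note: the hamsi, haval, jh, and keccak hunks around here are part
// of this patch's tree-wide rename of the two-argument entry points from
// xxx_4way() to xxx_4way_update(), with the old #define aliases removed from
// the headers. A minimal sketch of the resulting init/update/close calling
// convention for the jh512 4-way API declared in this diff (the buffer and
// its contents are illustrative only):
//
//    jh512_4way_context ctx;
//    uint64_t vhash[8*4] __attribute__ ((aligned (64))); // 4 interleaved lanes
//    jh512_4way_init( &ctx );
//    jh512_4way_update( &ctx, vhash, 64 );   // length is bytes per lane
//    jh512_4way_close( &ctx, vhash );        // 64-byte digest per lane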
jh512_4way_init( &ctx_jh ); - jh512_4way( &ctx_jh, vhash, 64 ); + jh512_4way_update( &ctx_jh, vhash, 64 ); jh512_4way_close( &ctx_jh, vhashB ); for ( int i = 0; i < 8; i++ ) diff --git a/algo/keccak/keccak-hash-4way.h b/algo/keccak/keccak-hash-4way.h index d8500a6..a353856 100644 --- a/algo/keccak/keccak-hash-4way.h +++ b/algo/keccak/keccak-hash-4way.h @@ -99,14 +99,12 @@ typedef keccak64_ctx_m256i keccak512_4way_context; void keccak256_4way_init(void *cc); void keccak256_4way_update(void *cc, const void *data, size_t len); void keccak256_4way_close(void *cc, void *dst); -#define keccak256_4way keccak256_4way_update void keccak512_4way_init(void *cc); void keccak512_4way_update(void *cc, const void *data, size_t len); void keccak512_4way_close(void *cc, void *dst); void keccak512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst); -#define keccak512_4way keccak512_4way_update #endif diff --git a/algo/lyra2/allium-4way.c b/algo/lyra2/allium-4way.c index a02d0f1..c06f813 100644 --- a/algo/lyra2/allium-4way.c +++ b/algo/lyra2/allium-4way.c @@ -55,7 +55,6 @@ void allium_8way_hash( void *state, const void *input ) dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, 256 ); - intrlv_2x256( vhash, hash0, hash1, 256 ); LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 ); dintrlv_2x256( hash0, hash1, vhash, 256 ); @@ -69,19 +68,6 @@ void allium_8way_hash( void *state, const void *input ) LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 ); dintrlv_2x256( hash6, hash7, vhash, 256 ); -/* - LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 ); - LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 ); - LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 ); - LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 ); - LYRA2RE( hash4, 32, hash4, 32, hash4, 32, 1, 8, 8 ); - LYRA2RE( hash5, 32, hash5, 32, hash5, 32, 1, 8, 8 ); - LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 ); - LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 ); -*/ - - - intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, 256 ); intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, 256 ); @@ -105,20 +91,6 @@ void allium_8way_hash( void *state, const void *input ) LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 ); dintrlv_2x256( hash6, hash7, vhash, 256 ); - -/* - LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 ); - LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 ); - LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 ); - LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 ); - LYRA2RE( hash4, 32, hash4, 32, hash4, 32, 1, 8, 8 ); - LYRA2RE( hash5, 32, hash5, 32, hash5, 32, 1, 8, 8 ); - LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 ); - LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 ); -*/ - - - intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, 256 ); @@ -232,11 +204,11 @@ void allium_4way_hash( void *state, const void *input ) allium_4way_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, &allium_4way_ctx, sizeof(allium_4way_ctx) ); - blake256_4way( &ctx.blake, input + (64<<2), 16 ); + blake256_4way_update( &ctx.blake, input + (64<<2), 16 ); blake256_4way_close( &ctx.blake, vhash32 ); rintrlv_4x32_4x64( vhash64, vhash32, 256 ); - keccak256_4way( &ctx.keccak, vhash64, 32 ); + keccak256_4way_update( &ctx.keccak, vhash64, 32 ); keccak256_4way_close( &ctx.keccak, vhash64 ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 ); @@ -261,7 +233,7 @@ void allium_4way_hash( void *state, const void *input ) intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 ); - skein256_4way( 
&ctx.skein, vhash64, 32 ); + skein256_4way_update( &ctx.skein, vhash64, 32 ); skein256_4way_close( &ctx.skein, vhash64 ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 ); diff --git a/algo/lyra2/lyra2-gate.c b/algo/lyra2/lyra2-gate.c index 4f9cc1a..caa6fb0 100644 --- a/algo/lyra2/lyra2-gate.c +++ b/algo/lyra2/lyra2-gate.c @@ -229,7 +229,7 @@ void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) bool register_phi2_algo( algo_gate_t* gate ) { // init_phi2_ctx(); - gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT; gate->get_work_data_size = (void*)&phi2_get_work_data_size; gate->decode_extra_data = (void*)&phi2_decode_extra_data; gate->build_extraheader = (void*)&phi2_build_extraheader; diff --git a/algo/lyra2/lyra2h-4way.c b/algo/lyra2/lyra2h-4way.c index a76e68c..b86f514 100644 --- a/algo/lyra2/lyra2h-4way.c +++ b/algo/lyra2/lyra2h-4way.c @@ -20,7 +20,7 @@ static __thread blake256_4way_context l2h_4way_blake_mid; void lyra2h_4way_midstate( const void* input ) { blake256_4way_init( &l2h_4way_blake_mid ); - blake256_4way( &l2h_4way_blake_mid, input, 64 ); + blake256_4way_update( &l2h_4way_blake_mid, input, 64 ); } void lyra2h_4way_hash( void *state, const void *input ) diff --git a/algo/lyra2/lyra2rev2-4way.c b/algo/lyra2/lyra2rev2-4way.c index f2954c3..0ed53c5 100644 --- a/algo/lyra2/lyra2rev2-4way.c +++ b/algo/lyra2/lyra2rev2-4way.c @@ -44,7 +44,7 @@ void lyra2rev2_8way_hash( void *state, const void *input ) lyra2v2_8way_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, &l2v2_8way_ctx, sizeof(l2v2_8way_ctx) ); - blake256_8way( &ctx.blake, input + (64<<3), 16 ); + blake256_8way_update( &ctx.blake, input + (64<<3), 16 ); blake256_8way_close( &ctx.blake, vhash ); rintrlv_8x32_8x64( vhashA, vhash, 256 ); @@ -176,12 +176,12 @@ void lyra2rev2_4way_hash( void *state, const void *input ) lyra2v2_4way_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) ); - blake256_4way( &ctx.blake, input + (64<<2), 16 ); + blake256_4way_update( &ctx.blake, input + (64<<2), 16 ); blake256_4way_close( &ctx.blake, vhash ); rintrlv_4x32_4x64( vhash64, vhash, 256 ); - keccak256_4way( &ctx.keccak, vhash64, 32 ); + keccak256_4way_update( &ctx.keccak, vhash64, 32 ); keccak256_4way_close( &ctx.keccak, vhash64 ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 ); @@ -201,7 +201,7 @@ void lyra2rev2_4way_hash( void *state, const void *input ) intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 ); - skein256_4way( &ctx.skein, vhash64, 32 ); + skein256_4way_update( &ctx.skein, vhash64, 32 ); skein256_4way_close( &ctx.skein, vhash64 ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 ); @@ -217,7 +217,7 @@ void lyra2rev2_4way_hash( void *state, const void *input ) intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 ); - bmw256_4way( &ctx.bmw, vhash, 32 ); + bmw256_4way_update( &ctx.bmw, vhash, 32 ); bmw256_4way_close( &ctx.bmw, state ); } @@ -242,7 +242,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce, mm128_bswap32_intrlv80_4x32( vdata, pdata ); blake256_4way_init( &l2v2_4way_ctx.blake ); - blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 ); + blake256_4way_update( &l2v2_4way_ctx.blake, vdata, 64 ); do { diff --git a/algo/lyra2/lyra2rev3-4way.c b/algo/lyra2/lyra2rev3-4way.c index 6e560be..a7a9a3c 100644 --- a/algo/lyra2/lyra2rev3-4way.c +++ b/algo/lyra2/lyra2rev3-4way.c @@ -209,7 +209,7 @@ void 
lyra2rev3_8way_hash( void *state, const void *input ) lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) ); - blake256_8way( &ctx.blake, input + (64*8), 16 ); + blake256_8way_update( &ctx.blake, input + (64*8), 16 ); blake256_8way_close( &ctx.blake, vhash ); dintrlv_8x32( hash0, hash1, hash2, hash3, @@ -252,7 +252,7 @@ void lyra2rev3_8way_hash( void *state, const void *input ) intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, 256 ); - bmw256_8way( &ctx.bmw, vhash, 32 ); + bmw256_8way_update( &ctx.bmw, vhash, 32 ); bmw256_8way_close( &ctx.bmw, state ); } @@ -277,7 +277,7 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce, mm256_bswap32_intrlv80_8x32( vdata, pdata ); blake256_8way_init( &l2v3_8way_ctx.blake ); - blake256_8way( &l2v3_8way_ctx.blake, vdata, 64 ); + blake256_8way_update( &l2v3_8way_ctx.blake, vdata, 64 ); do { @@ -334,8 +334,7 @@ void lyra2rev3_4way_hash( void *state, const void *input ) lyra2v3_4way_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, &l2v3_4way_ctx, sizeof(l2v3_4way_ctx) ); -// blake256_4way( &ctx.blake, input, 80 ); - blake256_4way( &ctx.blake, input + (64*4), 16 ); + blake256_4way_update( &ctx.blake, input + (64*4), 16 ); blake256_4way_close( &ctx.blake, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 ); @@ -358,7 +357,7 @@ void lyra2rev3_4way_hash( void *state, const void *input ) LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 ); intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 ); - bmw256_4way( &ctx.bmw, vhash, 32 ); + bmw256_4way_update( &ctx.bmw, vhash, 32 ); bmw256_4way_close( &ctx.bmw, state ); } @@ -383,7 +382,7 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce, mm128_bswap32_intrlv80_4x32( vdata, pdata ); blake256_4way_init( &l2v3_4way_ctx.blake ); - blake256_4way( &l2v3_4way_ctx.blake, vdata, 64 ); + blake256_4way_update( &l2v3_4way_ctx.blake, vdata, 64 ); do { diff --git a/algo/lyra2/lyra2z-4way.c b/algo/lyra2/lyra2z-4way.c index 3f5e56e..7273ebe 100644 --- a/algo/lyra2/lyra2z-4way.c +++ b/algo/lyra2/lyra2z-4way.c @@ -149,7 +149,7 @@ static __thread blake256_8way_context l2z_8way_blake_mid; void lyra2z_8way_midstate( const void* input ) { blake256_8way_init( &l2z_8way_blake_mid ); - blake256_8way( &l2z_8way_blake_mid, input, 64 ); + blake256_8way_update( &l2z_8way_blake_mid, input, 64 ); } void lyra2z_8way_hash( void *state, const void *input ) @@ -166,7 +166,7 @@ void lyra2z_8way_hash( void *state, const void *input ) blake256_8way_context ctx_blake __attribute__ ((aligned (64))); memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid ); - blake256_8way( &ctx_blake, input + (64*8), 16 ); + blake256_8way_update( &ctx_blake, input + (64*8), 16 ); blake256_8way_close( &ctx_blake, vhash ); dintrlv_8x32( hash0, hash1, hash2, hash3, @@ -247,7 +247,7 @@ static __thread blake256_4way_context l2z_4way_blake_mid; void lyra2z_4way_midstate( const void* input ) { blake256_4way_init( &l2z_4way_blake_mid ); - blake256_4way( &l2z_4way_blake_mid, input, 64 ); + blake256_4way_update( &l2z_4way_blake_mid, input, 64 ); } void lyra2z_4way_hash( void *state, const void *input ) @@ -260,7 +260,7 @@ void lyra2z_4way_hash( void *state, const void *input ) blake256_4way_context ctx_blake __attribute__ ((aligned (64))); memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid ); - blake256_4way( &ctx_blake, input + (64*4), 16 ); + blake256_4way_update( &ctx_blake, 
input + (64*4), 16 ); blake256_4way_close( &ctx_blake, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 ); diff --git a/algo/nist5/nist5-4way.c b/algo/nist5/nist5-4way.c index c4aa73d..9b8687b 100644 --- a/algo/nist5/nist5-4way.c +++ b/algo/nist5/nist5-4way.c @@ -133,7 +133,7 @@ void nist5hash_4way( void *out, const void *input ) keccak512_4way_context ctx_keccak; blake512_4way_init( &ctx_blake ); - blake512_4way( &ctx_blake, input, 80 ); + blake512_4way_update( &ctx_blake, input, 80 ); blake512_4way_close( &ctx_blake, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -154,15 +154,15 @@ void nist5hash_4way( void *out, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); jh512_4way_init( &ctx_jh ); - jh512_4way( &ctx_jh, vhash, 64 ); + jh512_4way_update( &ctx_jh, vhash, 64 ); jh512_4way_close( &ctx_jh, vhash ); keccak512_4way_init( &ctx_keccak ); - keccak512_4way( &ctx_keccak, vhash, 64 ); + keccak512_4way_update( &ctx_keccak, vhash, 64 ); keccak512_4way_close( &ctx_keccak, vhash ); skein512_4way_init( &ctx_skein ); - skein512_4way( &ctx_skein, vhash, 64 ); + skein512_4way_update( &ctx_skein, vhash, 64 ); skein512_4way_close( &ctx_skein, out ); } diff --git a/algo/quark/anime-4way.c b/algo/quark/anime-4way.c index 2c5d561..be1c19f 100644 --- a/algo/quark/anime-4way.c +++ b/algo/quark/anime-4way.c @@ -54,10 +54,10 @@ void anime_4way_hash( void *state, const void *input ) anime_4way_ctx_holder ctx; memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) ); - bmw512_4way( &ctx.bmw, input, 80 ); + bmw512_4way_update( &ctx.bmw, input, 80 ); bmw512_4way_close( &ctx.bmw, vhash ); - blake512_4way( &ctx.blake, vhash, 64 ); + blake512_4way_update( &ctx.blake, vhash, 64 ); blake512_4way_close( &ctx.blake, vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); @@ -92,7 +92,7 @@ void anime_4way_hash( void *state, const void *input ) if ( mm256_anybits0( vh_mask ) ) { - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhashB ); } @@ -111,7 +111,7 @@ void anime_4way_hash( void *state, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); @@ -119,23 +119,23 @@ void anime_4way_hash( void *state, const void *input ) if ( mm256_anybits1( vh_mask ) ) { blake512_4way_init( &ctx.blake ); - blake512_4way( &ctx.blake, vhash, 64 ); + blake512_4way_update( &ctx.blake, vhash, 64 ); blake512_4way_close( &ctx.blake, vhashA ); } if ( mm256_anybits0( vh_mask ) ) { bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhashB ); } mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); @@ -143,13 +143,13 @@ void anime_4way_hash( void *state, const void *input ) if ( mm256_anybits1( vh_mask ) ) { keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, 
vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhashA ); } if ( mm256_anybits0( vh_mask ) ) { jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhashB ); } diff --git a/algo/quark/hmq1725-4way.c b/algo/quark/hmq1725-4way.c index 9f22d29..4eac923 100644 --- a/algo/quark/hmq1725-4way.c +++ b/algo/quark/hmq1725-4way.c @@ -21,6 +21,11 @@ #include "algo/whirlpool/sph_whirlpool.h" #include "algo/haval/haval-hash-4way.h" #include "algo/sha/sha-hash-4way.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif #if defined(HMQ1725_8WAY) @@ -28,21 +33,27 @@ union _hmq1725_8way_context_overlay { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; - hashState_echo echo; hamsi512_8way_context hamsi; sph_fugue512_context fugue; shabal512_8way_context shabal; sph_whirlpool_context whirlpool; sha512_8way_context sha512; haval256_5_8way_context haval; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; + hashState_echo echo; +#endif } __attribute__ ((aligned (64))); typedef union _hmq1725_8way_context_overlay hmq1725_8way_context_overlay; @@ -52,6 +63,7 @@ extern void hmq1725_8way_hash(void *state, const void *input) uint32_t vhash [16<<3] __attribute__ ((aligned (128))); uint32_t vhashA[16<<3] __attribute__ ((aligned (64))); uint32_t vhashB[16<<3] __attribute__ ((aligned (64))); + uint32_t vhashC[16<<3] __attribute__ ((aligned (64))); uint32_t hash0 [16] __attribute__ ((aligned (64))); uint32_t hash1 [16] __attribute__ ((aligned (64))); uint32_t hash2 [16] __attribute__ ((aligned (64))); @@ -67,6 +79,7 @@ extern void hmq1725_8way_hash(void *state, const void *input) __m512i* vh = (__m512i*)vhash; __m512i* vhA = (__m512i*)vhashA; __m512i* vhB = (__m512i*)vhashB; + __m512i* vhC = (__m512i*)vhashC; bmw512_8way_init( &ctx.bmw ); bmw512_8way_update( &ctx.bmw, input, 80 ); @@ -106,6 +119,28 @@ extern void hmq1725_8way_hash(void *state, const void *input) m512_zero ); // A + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + if ( likely( ( vh_mask & 0x0f ) != 0x0f ) ) + { + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + } + if ( likely( ( vh_mask & 0xf0 ) != 0xf0 ) ) + { + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + } + rintrlv_4x128_8x64( vhashC, vhashA, vhashB, 512 ); + +#else + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + if ( hash0[0] & mask ) { init_groestl( &ctx.groestl, 64 ); @@ -140,13 +175,13 @@ extern void hmq1725_8way_hash(void *state, const void *input) { init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash5, - (char*)hash5, 512 ); + (char*)hash5, 512 ); } if ( hash6[0] & mask ) { init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash6, - (char*)hash6, 512 ); + (char*)hash6, 512 ); } if ( hash7[0] & mask ) { @@ -155,9 +190,11 @@ extern void hmq1725_8way_hash(void *state, const void 
*input) (char*)hash7, 512 ); } - intrlv_8x64_512( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + intrlv_8x64_512( vhashC, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + // B if ( likely( vh_mask & 0xff ) ) { @@ -166,7 +203,7 @@ extern void hmq1725_8way_hash(void *state, const void *input) skein512_8way_close( &ctx.skein, vhashB ); } - mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask ); jh512_8way_init( &ctx.jh ); jh512_8way_update( &ctx.jh, vhash, 64 ); @@ -225,6 +262,20 @@ extern void hmq1725_8way_hash(void *state, const void *input) } mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -256,6 +307,8 @@ extern void hmq1725_8way_hash(void *state, const void *input) intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); +#endif + simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); simd_4way_init( &ctx.simd, 512 ); @@ -334,6 +387,20 @@ extern void hmq1725_8way_hash(void *state, const void *input) } mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -365,17 +432,38 @@ extern void hmq1725_8way_hash(void *state, const void *input) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + blake512_8way_init( &ctx.blake ); blake512_8way_update( &ctx.blake, vhash, 64 ); blake512_8way_close( &ctx.blake, vhash ); vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), m512_zero ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); - rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); // A +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + if ( likely( ( vh_mask & 0x0f ) != 0x0f ) ) + { + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + } + if ( likely( ( vh_mask & 0xf0 ) != 0xf0 ) ) + { + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + } + + rintrlv_4x128_8x64( vhashC, vhashA, vhashB, 512 ); + +#else + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, + vhash ); + if ( hash0[0] & mask ) { sph_shavite512_init( &ctx.shavite ); @@ -425,19 +513,28 @@ extern void hmq1725_8way_hash(void *state, const void *input) sph_shavite512_close( &ctx.shavite, hash7 ); //8 } + intrlv_8x64_512( vhashC, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + +#endif + // B - if ( likely( vh_mask & 0xff ) ) + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + if ( likely( vh_mask & 0x0f ) ) { luffa_4way_init( &ctx.luffa, 512 ); 
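// (Editor's note: after the preceding rintrlv_8x64_4x128 call, lanes 0-3 sit
// in vhashA and lanes 4-7 in vhashB, so the 0x0f nibble of vh_mask gates the
// luffa pass over this 4x128 half and 0xf0 gates the vhashB half below.)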
luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); + } + if ( likely( vh_mask & 0xf0 ) ) + { luffa_4way_init( &ctx.luffa, 512 ); luffa_4way_update_close( &ctx.luffa, vhash, vhashB, 64 ); - rintrlv_4x128_8x64( vhashB, vhashA, vhash, 512 ); } - intrlv_8x64_512( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7 ); - mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + rintrlv_4x128_8x64( vhashB, vhashA, vhash, 512 ); + + mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask ); hamsi512_8way_init( &ctx.hamsi ); hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); @@ -475,8 +572,27 @@ extern void hmq1725_8way_hash(void *state, const void *input) hash7 ); vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], vmask ), m512_zero ); - rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + // A +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + if ( likely( ( vh_mask & 0x0f ) != 0x0f ) ) + { + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + } + if ( likely( ( vh_mask & 0xf0 ) != 0xf0 ) ) + { + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + } + + rintrlv_4x128_8x64( vhashC, vhashA, vhashB, 512 ); + +#else + if ( hash0[0] & mask ) //4 { init_echo( &ctx.echo, 512 ); @@ -526,19 +642,29 @@ extern void hmq1725_8way_hash(void *state, const void *input) (const BitSequence *)hash7, 512 ); } + intrlv_8x64_512( vhashC, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + hash7 ); + +#endif + // B - if ( likely( vh_mask & 0xff ) ) + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + if ( likely( vh_mask & 0x0f ) ) { simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + } + if ( likely( vh_mask & 0xf0 ) ) + { simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhash, vhashB, 512 ); - rintrlv_4x128_8x64( vhashB, vhashA, vhash, 512 ); } - intrlv_8x64_512( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7 ); - mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + rintrlv_4x128_8x64( vhashB, vhashA, vhash, 512 ); + + mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask ); + rintrlv_8x64_8x32( vhashA, vhash, 512 ); shabal512_8way_init( &ctx.shabal ); @@ -641,6 +767,20 @@ extern void hmq1725_8way_hash(void *state, const void *input) } mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -664,6 +804,8 @@ extern void hmq1725_8way_hash(void *state, const void *input) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + sha512_8way_init( &ctx.sha512 ); sha512_8way_update( &ctx.sha512, vhash, 64 ); sha512_8way_close( &ctx.sha512, vhash ); @@ -830,7 +972,7 @@ extern void hmq1725_4way_hash(void *state, const void *input) __m256i* vhB = (__m256i*)vhashB; bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, input, 80 ); + bmw512_4way_update( &ctx.bmw, input, 80 ); bmw512_4way_close( &ctx.bmw, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -889,18 +1031,18 @@ extern void hmq1725_4way_hash(void *state, const void *input) if ( 
mm256_anybits1( vh_mask ) ) { skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhashB ); } mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); // second fork, A = blake parallel, B= bmw parallel. @@ -911,14 +1053,14 @@ extern void hmq1725_4way_hash(void *state, const void *input) if ( mm256_anybits0( vh_mask ) ) { blake512_4way_init( &ctx.blake ); - blake512_4way( &ctx.blake, vhash, 64 ); + blake512_4way_update( &ctx.blake, vhash, 64 ); blake512_4way_close( &ctx.blake, vhashA ); } if ( mm256_anybits1( vh_mask ) ) { bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhashB ); } @@ -962,14 +1104,14 @@ extern void hmq1725_4way_hash(void *state, const void *input) if ( mm256_anybits0( vh_mask ) ) { keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhashA ); } if ( mm256_anybits1( vh_mask ) ) { jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhashB ); } @@ -990,7 +1132,6 @@ extern void hmq1725_4way_hash(void *state, const void *input) sph_shavite512 ( &ctx.shavite, hash3, 64 ); sph_shavite512_close( &ctx.shavite, hash3 ); - intrlv_2x128_512( vhashA, hash0, hash1 ); intrlv_2x128_512( vhashB, hash2, hash3 ); @@ -1042,7 +1183,7 @@ extern void hmq1725_4way_hash(void *state, const void *input) if ( mm256_anybits1( vh_mask ) ) { haval256_5_4way_init( &ctx.haval ); - haval256_5_4way( &ctx.haval, vhash, 64 ); + haval256_5_4way_update( &ctx.haval, vhash, 64 ); haval256_5_4way_close( &ctx.haval, vhash ); memset( &vhash[8<<2], 0, 32<<2 ); rintrlv_4x32_4x64( vhashB, vhash, 512 ); @@ -1068,7 +1209,7 @@ extern void hmq1725_4way_hash(void *state, const void *input) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); blake512_4way_init( &ctx.blake ); - blake512_4way( &ctx.blake, vhash, 64 ); + blake512_4way_update( &ctx.blake, vhash, 64 ); blake512_4way_close( &ctx.blake, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -1130,7 +1271,7 @@ extern void hmq1725_4way_hash(void *state, const void *input) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -1214,7 +1355,7 @@ extern void hmq1725_4way_hash(void *state, const void *input) intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, 64 ); + shabal512_4way_update( &ctx.shabal, vhash, 64 ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -1269,7 +1410,7 @@ extern void hmq1725_4way_hash(void *state, const void *input) if ( mm256_anybits1( vh_mask ) ) { sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, 64 ); + sha512_4way_update( &ctx.sha512, vhash, 64 ); 
sha512_4way_close( &ctx.sha512, vhashB ); } @@ -1289,7 +1430,7 @@ extern void hmq1725_4way_hash(void *state, const void *input) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, 64 ); + sha512_4way_update( &ctx.sha512, vhash, 64 ); sha512_4way_close( &ctx.sha512, vhash ); // A = haval parallel, B = Whirlpool serial @@ -1305,7 +1446,7 @@ extern void hmq1725_4way_hash(void *state, const void *input) if ( mm256_anybits0( vh_mask ) ) { haval256_5_4way_init( &ctx.haval ); - haval256_5_4way( &ctx.haval, vhash, 64 ); + haval256_5_4way_update( &ctx.haval, vhash, 64 ); haval256_5_4way_close( &ctx.haval, vhash ); memset( &vhash[8<<2], 0, 32<<2 ); rintrlv_4x32_4x64( vhashA, vhash, 512 ); @@ -1341,7 +1482,7 @@ extern void hmq1725_4way_hash(void *state, const void *input) mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); memcpy(state, vhash, 32<<2 ); diff --git a/algo/quark/hmq1725-gate.c b/algo/quark/hmq1725-gate.c index 9cc2784..4c64bff 100644 --- a/algo/quark/hmq1725-gate.c +++ b/algo/quark/hmq1725-gate.c @@ -13,7 +13,7 @@ bool register_hmq1725_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_hmq1725; gate->hash = (void*)&hmq1725hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; opt_target_factor = 65536.0; return true; }; diff --git a/algo/quark/quark-4way.c b/algo/quark/quark-4way.c index 180d636..3181866 100644 --- a/algo/quark/quark-4way.c +++ b/algo/quark/quark-4way.c @@ -9,16 +9,23 @@ #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" #include "algo/groestl/aes_ni/hash-groestl.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" +#endif #if defined (QUARK_8WAY) typedef struct { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; jh512_8way_context jh; skein512_8way_context skein; keccak512_8way_context keccak; +#if defined(__VAES__) + groestl512_4way_context groestl; +#else + hashState_groestl groestl; +#endif } quark_8way_ctx_holder; quark_8way_ctx_holder quark_8way_ctx __attribute__ ((aligned (128))); @@ -27,10 +34,14 @@ void init_quark_8way_ctx() { blake512_8way_init( &quark_8way_ctx.blake ); bmw512_8way_init( &quark_8way_ctx.bmw ); - init_groestl( &quark_8way_ctx.groestl, 64 ); skein512_8way_init( &quark_8way_ctx.skein ); jh512_8way_init( &quark_8way_ctx.jh ); keccak512_8way_init( &quark_8way_ctx.keccak ); +#if defined(__VAES__) + groestl512_4way_init( &quark_8way_ctx.groestl, 64 ); +#else + init_groestl( &quark_8way_ctx.groestl, 64 ); +#endif } void quark_8way_hash( void *state, const void *input ) @@ -38,6 +49,7 @@ void quark_8way_hash( void *state, const void *input ) uint64_t vhash[8*8] __attribute__ ((aligned (128))); uint64_t vhashA[8*8] __attribute__ ((aligned (64))); uint64_t vhashB[8*8] __attribute__ ((aligned (64))); + uint64_t vhashC[8*8] __attribute__ ((aligned (64))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); @@ -49,6 +61,7 @@ void quark_8way_hash( void *state, const void *input ) __m512i* vh = (__m512i*)vhash; __m512i* vhA = (__m512i*)vhashA; __m512i* vhB = (__m512i*)vhashB; + __m512i* vhC = (__m512i*)vhashC; __mmask8 vh_mask; quark_8way_ctx_holder ctx; 
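// Editor's note: vh_mask carries one bit per lane; the
// _mm512_cmpeq_epi64_mask() below sets bit i when lane i has bit 3 of its
// first hash word clear (the skein branch). Groestl then runs on the 4x128
// half vhashA (lanes 0-3) unless that whole nibble is set
// ((vh_mask & 0x0f) == 0x0f), likewise on vhashB (lanes 4-7) against 0xf0,
// and mm512_blend_hash_8x64() selects the groestl result (vhC) or the skein
// result (vhB) per lane. Scalar equivalent for one lane (illustrative):
//
//    if ( hash[0] & 8 ) groestl512( hash ); else skein512( hash );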
const uint32_t mask = 8; @@ -66,6 +79,25 @@ void quark_8way_hash( void *state, const void *input ) vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ), zero ); + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + if ( ( vh_mask & 0x0f ) != 0x0f ) + { + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + } + if ( ( vh_mask & 0xf0 ) != 0xf0 ) + { + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + } + rintrlv_4x128_8x64( vhashC, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, 512 ); @@ -117,16 +149,31 @@ void quark_8way_hash( void *state, const void *input ) (char*)hash7, 512 ); } - intrlv_8x64( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6, + intrlv_8x64( vhashC, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, 512 ); +#endif + if ( vh_mask & 0xff ) { skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhashB ); } - mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask ); + mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask ); + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, 512 ); @@ -151,6 +198,8 @@ void quark_8way_hash( void *state, const void *input ) intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, 512 ); +#endif + jh512_8way_update( &ctx.jh, vhash, 64 ); jh512_8way_close( &ctx.jh, vhash ); @@ -289,10 +338,10 @@ void quark_4way_hash( void *state, const void *input ) memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) ); - blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); @@ -327,7 +376,7 @@ void quark_4way_hash( void *state, const void *input ) if ( mm256_anybits1( vh_mask ) ) { - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhashB ); } @@ -346,7 +395,7 @@ void quark_4way_hash( void *state, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); @@ -354,24 +403,24 @@ void quark_4way_hash( void *state, const void *input ) if ( mm256_anybits0( vh_mask ) ) { blake512_4way_init( &ctx.blake ); - blake512_4way( &ctx.blake, vhash, 64 ); + blake512_4way_update( &ctx.blake, vhash, 64 ); blake512_4way_close( &ctx.blake, vhashA ); } if ( mm256_anybits1( vh_mask ) ) { bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhashB ); } mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); 
keccak512_4way_close( &ctx.keccak, vhash ); skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero ); @@ -379,14 +428,14 @@ void quark_4way_hash( void *state, const void *input ) if ( mm256_anybits0( vh_mask ) ) { keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhashA ); } if ( mm256_anybits1( vh_mask ) ) { jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhashB ); } diff --git a/algo/quark/quark-gate.c b/algo/quark/quark-gate.c index ee4842f..0c26473 100644 --- a/algo/quark/quark-gate.c +++ b/algo/quark/quark-gate.c @@ -15,7 +15,7 @@ bool register_quark_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_quark; gate->hash = (void*)&quark_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/qubit/qubit-2way.c b/algo/qubit/qubit-2way.c index 2b5d603..630c1ee 100644 --- a/algo/qubit/qubit-2way.c +++ b/algo/qubit/qubit-2way.c @@ -9,6 +9,10 @@ #include "algo/simd/simd-hash-2way.h" #include "algo/shavite/sph_shavite.h" #include "algo/echo/aes_ni/hash_api.h" +#if defined(__VAES__) + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif #if defined(QUBIT_4WAY) @@ -16,10 +20,14 @@ typedef struct { luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; - simd_2way_context simd2; +#if defined(__VAES__) + shavite512_4way_context shavite; + echo_4way_context echo; +#else + sph_shavite512_context shavite; hashState_echo echo; +#endif } qubit_4way_ctx_holder; qubit_4way_ctx_holder qubit_4way_ctx; @@ -27,10 +35,14 @@ qubit_4way_ctx_holder qubit_4way_ctx; void init_qubit_4way_ctx() { cube_4way_init( &qubit_4way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init(&qubit_4way_ctx.shavite); simd_4way_init( &qubit_4way_ctx.simd, 512 ); - simd_2way_init( &qubit_4way_ctx.simd2, 512 ); - init_echo(&qubit_4way_ctx.echo, 512); +#if defined(__VAES__) + shavite512_4way_init( &qubit_4way_ctx.shavite ); + echo_4way_init( &qubit_4way_ctx.echo, 512 ); +#else + sph_shavite512_init( &qubit_4way_ctx.shavite ); + init_echo( &qubit_4way_ctx.echo, 512 ); +#endif }; void qubit_4way_hash( void *output, const void *input ) @@ -48,6 +60,13 @@ void qubit_4way_hash( void *output, const void *input ) luffa_4way_close( &ctx.luffa, vhash ); cube_4way_update_close( &ctx.cube, vhash, vhash, 64 ); + +#if defined(__VAES__) + + shavite512_4way_update_close( &ctx.shavite, vhash, vhash, 64 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); sph_shavite512( &ctx.shavite, hash0, 64 ); @@ -66,31 +85,44 @@ void qubit_4way_hash( void *output, const void *input ) sph_shavite512_close( &ctx.shavite, hash3 ); intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); + +#endif + simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); + +#if defined(__VAES__) + + echo_4way_update_close( &ctx.echo, vhash, vhash, 512 ); + + dintrlv_4x128( output, output+32, output+64, output+96, vhash, 256 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - update_final_echo( &ctx.echo, (BitSequence *)hash0, - (const 
BitSequence *) hash0, 512 ); + update_final_echo( &ctx.echo, (BitSequence*)hash0, + (const BitSequence*)hash0, 512 ); memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash1, - (const BitSequence *) hash1, 512 ); + update_final_echo( &ctx.echo, (BitSequence*)hash1, + (const BitSequence*)hash1, 512 ); memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash2, - (const BitSequence *) hash2, 512 ); + update_final_echo( &ctx.echo, (BitSequence*)hash2, + (const BitSequence*)hash2, 512 ); memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) ); - update_final_echo( &ctx.echo, (BitSequence *)hash3, - (const BitSequence *) hash3, 512 ); + update_final_echo( &ctx.echo, (BitSequence*)hash3, + (const BitSequence*)hash3, 512 ); memcpy( output, hash0, 32 ); memcpy( output+32, hash1, 32 ); memcpy( output+64, hash2, 32 ); memcpy( output+96, hash3, 32 ); +#endif } int scanhash_qubit_4way( struct work *work,uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[4*8] __attribute__ ((aligned (128))); + uint32_t hash[8*4] __attribute__ ((aligned (128))); uint32_t vdata[24*4] __attribute__ ((aligned (64))); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; diff --git a/algo/qubit/qubit-gate.c b/algo/qubit/qubit-gate.c index b3592a5..0d547c8 100644 --- a/algo/qubit/qubit-gate.c +++ b/algo/qubit/qubit-gate.c @@ -16,7 +16,7 @@ bool register_qubit_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_qubit; gate->hash = (void*)&qubit_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/ripemd/lbry-4way.c b/algo/ripemd/lbry-4way.c index 78a6f5a..0228c86 100644 --- a/algo/ripemd/lbry-4way.c +++ b/algo/ripemd/lbry-4way.c @@ -7,7 +7,7 @@ #include "ripemd-hash-4way.h" #define LBRY_INPUT_SIZE 112 -#define LBRY_MIDSTATE 64 +#define LBRY_MIDSTATE 96 #define LBRY_TAIL (LBRY_INPUT_SIZE) - (LBRY_MIDSTATE) #if defined(LBRY_16WAY) @@ -35,9 +35,9 @@ void lbry_16way_hash( void* output, const void* input ) uint32_t _ALIGN(64) h13[32]; uint32_t _ALIGN(64) h14[32]; uint32_t _ALIGN(64) h15[32]; - sha256_16way_context ctx_sha256 __attribute__ ((aligned (64))); + sha256_16way_context ctx_sha256 __attribute__ ((aligned (64))); sha512_8way_context ctx_sha512; - ripemd160_16way_context ctx_ripemd; + ripemd160_16way_context ctx_ripemd; memcpy( &ctx_sha256, &sha256_16w_mid, sizeof(ctx_sha256) ); sha256_16way_update( &ctx_sha256, input + (LBRY_MIDSTATE<<4), LBRY_TAIL ); @@ -62,7 +62,7 @@ void lbry_16way_hash( void* output, const void* input ) sha512_8way_close( &ctx_sha512, vhashB ); // back to 8-way 32 bit - dintrlv_8x64( h0, h1, h2, h3,h4, h5, h6, h7, vhashA, 512 ); + dintrlv_8x64( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 512 ); dintrlv_8x64( h8, h9, h10, h11, h12, h13, h14, h15, vhashB, 512 ); intrlv_16x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, h14, h15, 512 ); @@ -90,14 +90,15 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce, { uint32_t hash[8*16] __attribute__ ((aligned (128))); uint32_t vdata[32*16] __attribute__ ((aligned (64))); - uint32_t lane_hash[8] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t edata[32] __attribute__ ((aligned (64))); uint32_t *hash7 = &(hash[7<<4]); uint32_t *pdata = work->data; uint32_t 
*ptarget = work->target; uint32_t n = pdata[27]; const uint32_t first_nonce = pdata[27]; + const uint32_t last_nonce = max_nonce - 16; const uint32_t Htarg = ptarget[7]; - uint32_t edata[32] __attribute__ ((aligned (64))); __m512i *noncev = (__m512i*)vdata + 27; // aligned int thr_id = mythr->id; // thr_id arg is deprecated @@ -114,14 +115,13 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce, edata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 ); sha256_16way_init( &sha256_16w_mid ); - sha256_16way( &sha256_16w_mid, vdata, LBRY_MIDSTATE ); + sha256_16way_update( &sha256_16w_mid, vdata, LBRY_MIDSTATE ); do { - *noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12, - n+11, n+10, n+ 9, n+ 8, - n+ 7, n+ 6, n+ 5, n+ 4, - n+ 3, n+ 2, n+ 1, n ) ); + *noncev = mm512_bswap_32( _mm512_set_epi32( + n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8, + n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ) ); lbry_16way_hash( hash, vdata ); for ( int i = 0; i < 16; i++ ) @@ -129,27 +129,25 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce, { // deinterleave hash for lane extr_lane_16x32( lane_hash, hash, i, 256 ); - if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) ) { pdata[27] = n + i; submit_lane_solution( work, lane_hash, mythr, i ); } } n += 16; - } while ( (n < max_nonce-16) && !work_restart[thr_id].restart ); - *hashes_done = n - first_nonce + 1; + } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) ); + *hashes_done = n - first_nonce; return 0; } - - #elif defined(LBRY_8WAY) static __thread sha256_8way_context sha256_8w_mid; void lbry_8way_hash( void* output, const void* input ) { - uint32_t _ALIGN(64) vhashA[16<<3]; + uint32_t _ALIGN(128) vhashA[16<<3]; uint32_t _ALIGN(64) vhashB[16<<3]; uint32_t _ALIGN(64) vhashC[16<<3]; uint32_t _ALIGN(32) h0[32]; @@ -165,11 +163,11 @@ void lbry_8way_hash( void* output, const void* input ) ripemd160_8way_context ctx_ripemd; memcpy( &ctx_sha256, &sha256_8w_mid, sizeof(ctx_sha256) ); - sha256_8way( &ctx_sha256, input + (LBRY_MIDSTATE<<3), LBRY_TAIL ); + sha256_8way_update( &ctx_sha256, input + (LBRY_MIDSTATE<<3), LBRY_TAIL ); sha256_8way_close( &ctx_sha256, vhashA ); sha256_8way_init( &ctx_sha256 ); - sha256_8way( &ctx_sha256, vhashA, 32 ); + sha256_8way_update( &ctx_sha256, vhashA, 32 ); sha256_8way_close( &ctx_sha256, vhashA ); // reinterleave to do sha512 4-way 64 bit twice. 
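// (Editor's note: 256-bit AVX2 vectors hold eight 32-bit lanes but only four
// 64-bit lanes, so the 8x32 sha256 state is split into two 4x64 interleaves,
// sha512_4way runs once per half, and the halves are merged back to 8x32
// before ripemd160_8way.)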
@@ -178,11 +176,11 @@ void lbry_8way_hash( void* output, const void* input ) intrlv_4x64( vhashB, h4, h5, h6, h7, 256 ); sha512_4way_init( &ctx_sha512 ); - sha512_4way( &ctx_sha512, vhashA, 32 ); + sha512_4way_update( &ctx_sha512, vhashA, 32 ); sha512_4way_close( &ctx_sha512, vhashA ); sha512_4way_init( &ctx_sha512 ); - sha512_4way( &ctx_sha512, vhashB, 32 ); + sha512_4way_update( &ctx_sha512, vhashB, 32 ); sha512_4way_close( &ctx_sha512, vhashB ); // back to 8-way 32 bit @@ -191,20 +189,20 @@ void lbry_8way_hash( void* output, const void* input ) intrlv_8x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 512 ); ripemd160_8way_init( &ctx_ripemd ); - ripemd160_8way( &ctx_ripemd, vhashA, 32 ); + ripemd160_8way_update( &ctx_ripemd, vhashA, 32 ); ripemd160_8way_close( &ctx_ripemd, vhashB ); ripemd160_8way_init( &ctx_ripemd ); - ripemd160_8way( &ctx_ripemd, vhashA+(8<<3), 32 ); + ripemd160_8way_update( &ctx_ripemd, vhashA+(8<<3), 32 ); ripemd160_8way_close( &ctx_ripemd, vhashC ); sha256_8way_init( &ctx_sha256 ); - sha256_8way( &ctx_sha256, vhashB, 20 ); - sha256_8way( &ctx_sha256, vhashC, 20 ); + sha256_8way_update( &ctx_sha256, vhashB, 20 ); + sha256_8way_update( &ctx_sha256, vhashC, 20 ); + sha256_8way_close( &ctx_sha256, vhashA ); sha256_8way_init( &ctx_sha256 ); - sha256_8way( &ctx_sha256, vhashA, 32 ); + sha256_8way_update( &ctx_sha256, vhashA, 32 ); sha256_8way_close( &ctx_sha256, output ); } @@ -214,13 +212,13 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce, uint32_t hash[8*8] __attribute__ ((aligned (64))); uint32_t vdata[32*8] __attribute__ ((aligned (64))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); + uint32_t edata[32] __attribute__ ((aligned (64))); uint32_t *hash7 = &(hash[7<<3]); uint32_t *pdata = work->data; uint32_t *ptarget = work->target; uint32_t n = pdata[27]; const uint32_t first_nonce = pdata[27]; const uint32_t Htarg = ptarget[7]; - uint32_t edata[32] __attribute__ ((aligned (64))); __m256i *noncev = (__m256i*)vdata + 27; // aligned int thr_id = mythr->id; // thr_id arg is deprecated @@ -237,7 +235,7 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce, edata, edata, edata, edata, 1024 ); sha256_8way_init( &sha256_8w_mid ); - sha256_8way( &sha256_8w_mid, vdata, LBRY_MIDSTATE ); + sha256_8way_update( &sha256_8w_mid, vdata, LBRY_MIDSTATE ); do { diff --git a/algo/ripemd/lbry-gate.c b/algo/ripemd/lbry-gate.c index ac94c64..f4080a8 100644 --- a/algo/ripemd/lbry-gate.c +++ b/algo/ripemd/lbry-gate.c @@ -98,7 +98,7 @@ int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; } bool register_lbry_algo( algo_gate_t* gate ) { - gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT; +// gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT; #if defined (LBRY_16WAY) gate->scanhash = (void*)&scanhash_lbry_16way; gate->hash = (void*)&lbry_16way_hash; diff --git a/algo/ripemd/lbry-gate.h b/algo/ripemd/lbry-gate.h index 603b5b5..2aedd6b 100644 --- a/algo/ripemd/lbry-gate.h +++ b/algo/ripemd/lbry-gate.h @@ -5,11 +5,10 @@ #include <stdint.h> -// 16 way needs sha256 16 way -//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) -// #define LBRY_16WAY -#if defined(__AVX2__) - #define LBRY_8WAY +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define LBRY_16WAY 1 +#elif defined(__AVX2__) + #define LBRY_8WAY 1 #endif /* #if !defined(__SHA__) @@ -37,13 +36,13 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce, void lbry_8way_hash( void *state, const void
*input ); int scanhash_lbry_8way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); -/* + #elif defined(LBRY_4WAY) void lbry_4way_hash( void *state, const void *input ); int scanhash_lbry_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done ); -*/ + #else void lbry_hash( void *state, const void *input ); diff --git a/algo/ripemd/ripemd-hash-4way.c b/algo/ripemd/ripemd-hash-4way.c index 42c0d2d..38de159 100644 --- a/algo/ripemd/ripemd-hash-4way.c +++ b/algo/ripemd/ripemd-hash-4way.c @@ -259,7 +259,8 @@ void ripemd160_4way_init( ripemd160_4way_context *sc ) sc->count_high = sc->count_low = 0; } -void ripemd160_4way( ripemd160_4way_context *sc, const void *data, size_t len ) +void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data, + size_t len ) { __m128i *vdata = (__m128i*)data; size_t ptr; @@ -559,7 +560,8 @@ void ripemd160_8way_init( ripemd160_8way_context *sc ) sc->count_high = sc->count_low = 0; } -void ripemd160_8way( ripemd160_8way_context *sc, const void *data, size_t len ) +void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data, + size_t len ) { __m256i *vdata = (__m256i*)data; size_t ptr; @@ -859,7 +861,7 @@ void ripemd160_16way_init( ripemd160_16way_context *sc ) sc->count_high = sc->count_low = 0; } -void ripemd160_16way( ripemd160_16way_context *sc, const void *data, +void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data, size_t len ) { __m512i *vdata = (__m512i*)data; diff --git a/algo/ripemd/ripemd-hash-4way.h b/algo/ripemd/ripemd-hash-4way.h index c565ad7..71fb3d7 100644 --- a/algo/ripemd/ripemd-hash-4way.h +++ b/algo/ripemd/ripemd-hash-4way.h @@ -16,7 +16,8 @@ typedef struct } __attribute__ ((aligned (64))) ripemd160_4way_context; void ripemd160_4way_init( ripemd160_4way_context *sc ); -void ripemd160_4way( ripemd160_4way_context *sc, const void *data, size_t len ); +void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data, + size_t len ); void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst ); #if defined (__AVX2__) @@ -26,10 +27,11 @@ typedef struct __m256i buf[64>>2]; __m256i val[5]; uint32_t count_high, count_low; -} __attribute__ ((aligned (64))) ripemd160_8way_context; +} __attribute__ ((aligned (128))) ripemd160_8way_context; void ripemd160_8way_init( ripemd160_8way_context *sc ); -void ripemd160_8way( ripemd160_8way_context *sc, const void *data, size_t len ); +void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data, + size_t len ); void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst ); #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) @@ -42,7 +44,7 @@ typedef struct } __attribute__ ((aligned (128))) ripemd160_16way_context; void ripemd160_16way_init( ripemd160_16way_context *sc ); -void ripemd160_16way( ripemd160_16way_context *sc, const void *data, +void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data, size_t len ); void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst ); diff --git a/algo/sha/sha-hash-4way.h b/algo/sha/sha-hash-4way.h index 2ac2a7e..3635dd9 100644 --- a/algo/sha/sha-hash-4way.h +++ b/algo/sha/sha-hash-4way.h @@ -41,13 +41,9 @@ #define SHA2_HASH_4WAY_H__ 1 #include <stddef.h> -#include "sph_types.h" #include "simd-utils.h" #if defined(__SSE2__) -//#if defined(__SSE4_2__) - -//#define SPH_SIZE_sha256 256 // SHA-256 4 way @@ -59,9 +55,12 @@ typedef struct { } sha256_4way_context __attribute__ ((aligned (64)));
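// Editor's note: a minimal usage sketch for the 4-way API declared below,
// hashing four independent 64-byte messages interleaved 32 bits at a time
// (the data layout and buffer contents are illustrative assumptions):
//
//    sha256_4way_context ctx;
//    __m128i block[16] __attribute__ ((aligned (64))); // 64 bytes per lane
//    __m128i digest[8];                                // 32 bytes per lane
//    sha256_4way_init( &ctx );
//    sha256_4way_update( &ctx, block, 64 );            // length is per lane
//    sha256_4way_close( &ctx, digest );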
void sha256_4way_init( sha256_4way_context *sc ); -void sha256_4way( sha256_4way_context *sc, const void *data, size_t len ); +void sha256_4way_update( sha256_4way_context *sc, const void *data, + size_t len ); void sha256_4way_close( sha256_4way_context *sc, void *dst ); +#endif // SSE2 + #if defined (__AVX2__) // SHA-256 8 way @@ -75,10 +74,28 @@ typedef struct { void sha256_8way_init( sha256_8way_context *sc ); void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len ); -#define sha256_8way sha256_8way_update void sha256_8way_close( sha256_8way_context *sc, void *dst ); -//#define SPH_SIZE_sha512 512 +#endif // AVX2 + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +// SHA-256 16 way + +typedef struct { + __m512i buf[64>>2]; + __m512i val[8]; + uint32_t count_high, count_low; + bool initialized; +} sha256_16way_context __attribute__ ((aligned (128))); + +void sha256_16way_init( sha256_16way_context *sc ); +void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len ); +void sha256_16way_close( sha256_16way_context *sc, void *dst ); + +#endif // AVX512 + +#if defined (__AVX2__) // SHA-512 4 way @@ -92,9 +109,10 @@ typedef struct { void sha512_4way_init( sha512_4way_context *sc); void sha512_4way_update( sha512_4way_context *sc, const void *data, size_t len ); -#define sha512_4way sha512_4way_update void sha512_4way_close( sha512_4way_context *sc, void *dst ); +#endif // AVX2 + #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) // SHA-512 8 way @@ -111,8 +129,6 @@ void sha512_8way_update( sha512_8way_context *sc, const void *data, size_t len ); void sha512_8way_close( sha512_8way_context *sc, void *dst ); - #endif // AVX512 -#endif // __AVX2__ -#endif // __SSE2__ + #endif // SHA256_4WAY_H__ diff --git a/algo/sha/sha256-hash-4way.c b/algo/sha/sha256-hash-4way.c index ba6b952..2167407 100644 --- a/algo/sha/sha256-hash-4way.c +++ b/algo/sha/sha256-hash-4way.c @@ -39,47 +39,31 @@ // SHA-256 32 bit /* -static const sph_u32 H256[8] = { - SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85), - SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A), - SPH_C32(0x510E527F), SPH_C32(0x9B05688C), - SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19) +static const uint32_t H256[8] = +{ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 }; */ -static const sph_u32 K256[64] = { - SPH_C32(0x428A2F98), SPH_C32(0x71374491), - SPH_C32(0xB5C0FBCF), SPH_C32(0xE9B5DBA5), - SPH_C32(0x3956C25B), SPH_C32(0x59F111F1), - SPH_C32(0x923F82A4), SPH_C32(0xAB1C5ED5), - SPH_C32(0xD807AA98), SPH_C32(0x12835B01), - SPH_C32(0x243185BE), SPH_C32(0x550C7DC3), - SPH_C32(0x72BE5D74), SPH_C32(0x80DEB1FE), - SPH_C32(0x9BDC06A7), SPH_C32(0xC19BF174), - SPH_C32(0xE49B69C1), SPH_C32(0xEFBE4786), - SPH_C32(0x0FC19DC6), SPH_C32(0x240CA1CC), - SPH_C32(0x2DE92C6F), SPH_C32(0x4A7484AA), - SPH_C32(0x5CB0A9DC), SPH_C32(0x76F988DA), - SPH_C32(0x983E5152), SPH_C32(0xA831C66D), - SPH_C32(0xB00327C8), SPH_C32(0xBF597FC7), - SPH_C32(0xC6E00BF3), SPH_C32(0xD5A79147), - SPH_C32(0x06CA6351), SPH_C32(0x14292967), - SPH_C32(0x27B70A85), SPH_C32(0x2E1B2138), - SPH_C32(0x4D2C6DFC), SPH_C32(0x53380D13), - SPH_C32(0x650A7354), SPH_C32(0x766A0ABB), - SPH_C32(0x81C2C92E), SPH_C32(0x92722C85), - SPH_C32(0xA2BFE8A1), SPH_C32(0xA81A664B), - SPH_C32(0xC24B8B70), SPH_C32(0xC76C51A3), - SPH_C32(0xD192E819), SPH_C32(0xD6990624), - SPH_C32(0xF40E3585), SPH_C32(0x106AA070), - SPH_C32(0x19A4C116), 
SPH_C32(0x1E376C08), - SPH_C32(0x2748774C), SPH_C32(0x34B0BCB5), - SPH_C32(0x391C0CB3), SPH_C32(0x4ED8AA4A), - SPH_C32(0x5B9CCA4F), SPH_C32(0x682E6FF3), - SPH_C32(0x748F82EE), SPH_C32(0x78A5636F), - SPH_C32(0x84C87814), SPH_C32(0x8CC70208), - SPH_C32(0x90BEFFFA), SPH_C32(0xA4506CEB), - SPH_C32(0xBEF9A3F7), SPH_C32(0xC67178F2) +static const uint32_t K256[64] = +{ + 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, + 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5, + 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, + 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174, + 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, + 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA, + 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, + 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967, + 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, + 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85, + 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, + 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070, + 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, + 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3, + 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, + 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2 }; // SHA-256 4 way @@ -248,7 +232,7 @@ void sha256_4way_init( sha256_4way_context *sc ) */ } -void sha256_4way( sha256_4way_context *sc, const void *data, size_t len ) +void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len ) { __m128i *vdata = (__m128i*)data; size_t ptr; @@ -273,7 +257,7 @@ void sha256_4way( sha256_4way_context *sc, const void *data, size_t len ) ptr = 0; } clow = sc->count_low; - clow2 = SPH_T32( clow + clen ); + clow2 = clow + clen; sc->count_low = clow2; if ( clow2 < clow ) sc->count_high++; @@ -306,10 +290,8 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst ) sc->buf[ pad >> 2 ] = mm128_bswap_32( m128_const1_32( high ) ); -// mm128_bswap_32( _mm_set1_epi32( high ) ); sc->buf[ ( pad+4 ) >> 2 ] = mm128_bswap_32( m128_const1_32( low ) ); -// mm128_bswap_32( _mm_set1_epi32( low ) ); sha256_4way_round( sc, sc->buf, sc->val ); mm128_block_bswap_32( dst, sc->val ); @@ -483,7 +465,7 @@ void sha256_8way_init( sha256_8way_context *sc ) */ } -void sha256_8way( sha256_8way_context *sc, const void *data, size_t len ) +void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len ) { __m256i *vdata = (__m256i*)data; size_t ptr; @@ -508,7 +490,7 @@ void sha256_8way( sha256_8way_context *sc, const void *data, size_t len ) ptr = 0; } clow = sc->count_low; - clow2 = SPH_T32( clow + clen ); + clow2 = clow + clen; sc->count_low = clow2; if ( clow2 < clow ) sc->count_high++; @@ -549,5 +531,233 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst ) mm256_block_bswap_32( dst, sc->val ); } + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +// SHA-256 16 way + +#define CHx16(X, Y, Z) \ + _mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z ) + +#define MAJx16(X, Y, Z) \ + _mm512_or_si512( _mm512_and_si512( X, Y ), \ + _mm512_and_si512( _mm512_or_si512( X, Y ), Z ) ) + +#define BSG2_0x16(x) \ + _mm512_xor_si512( _mm512_xor_si512( \ + mm512_ror_32(x, 2), mm512_ror_32(x, 13) ), mm512_ror_32( x, 22) ) + +#define BSG2_1x16(x) \ + _mm512_xor_si512( _mm512_xor_si512( \ + mm512_ror_32(x, 6), mm512_ror_32(x, 11) ), mm512_ror_32( x, 25) ) + +#define SSG2_0x16(x) \ + _mm512_xor_si512( _mm512_xor_si512( \ + mm512_ror_32(x, 7), mm512_ror_32(x, 18) ), _mm512_srli_epi32(x, 3) ) + +#define SSG2_1x16(x) \ + _mm512_xor_si512( 
_mm512_xor_si512( \ + mm512_ror_32(x, 17), mm512_ror_32(x, 19) ), _mm512_srli_epi32(x, 10) ) + +#define SHA2x16_MEXP( a, b, c, d ) \ + mm512_add4_32( SSG2_1x16( W[a] ), W[b], SSG2_0x16( W[c] ), W[d] ); + +#define SHA2s_16WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ +do { \ + __m512i T1, T2; \ + __m512i K = _mm512_set1_epi32( K256[( (j)+(i) )] ); \ + T1 = _mm512_add_epi32( H, mm512_add4_32( BSG2_1x16(E), CHx16(E, F, G), \ + K, W[i] ) ); \ + T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \ + D = _mm512_add_epi32( D, T1 ); \ + H = _mm512_add_epi32( T1, T2 ); \ +} while (0) + +static void +sha256_16way_round( sha256_16way_context *ctx, __m512i *in, __m512i r[8] ) +{ + register __m512i A, B, C, D, E, F, G, H; + __m512i W[16]; + + mm512_block_bswap_32( W , in ); + mm512_block_bswap_32( W+8, in+8 ); + + if ( ctx->initialized ) + { + A = r[0]; + B = r[1]; + C = r[2]; + D = r[3]; + E = r[4]; + F = r[5]; + G = r[6]; + H = r[7]; + } + else + { + A = m512_const1_64( 0x6A09E6676A09E667 ); + B = m512_const1_64( 0xBB67AE85BB67AE85 ); + C = m512_const1_64( 0x3C6EF3723C6EF372 ); + D = m512_const1_64( 0xA54FF53AA54FF53A ); + E = m512_const1_64( 0x510E527F510E527F ); + F = m512_const1_64( 0x9B05688C9B05688C ); + G = m512_const1_64( 0x1F83D9AB1F83D9AB ); + H = m512_const1_64( 0x5BE0CD195BE0CD19 ); + } + + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); + + for ( int j = 16; j < 64; j += 16 ) + { + W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 ); + W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 ); + W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 ); + W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 ); + W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 ); + W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 ); + W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 ); + W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 ); + W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 ); + W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 ); + W[10] = SHA2x16_MEXP( 8, 3, 11, 10 ); + W[11] = SHA2x16_MEXP( 9, 4, 12, 11 ); + W[12] = SHA2x16_MEXP( 10, 5, 13, 12 ); + W[13] = SHA2x16_MEXP( 11, 6, 14, 13 ); + W[14] = SHA2x16_MEXP( 12, 7, 15, 14 ); + W[15] = SHA2x16_MEXP( 13, 8, 0, 15 ); + + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j 
); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); + } + + if ( ctx->initialized ) + { + r[0] = _mm512_add_epi32( r[0], A ); + r[1] = _mm512_add_epi32( r[1], B ); + r[2] = _mm512_add_epi32( r[2], C ); + r[3] = _mm512_add_epi32( r[3], D ); + r[4] = _mm512_add_epi32( r[4], E ); + r[5] = _mm512_add_epi32( r[5], F ); + r[6] = _mm512_add_epi32( r[6], G ); + r[7] = _mm512_add_epi32( r[7], H ); + } + else + { + ctx->initialized = true; + r[0] = _mm512_add_epi32( A, m512_const1_64( 0x6A09E6676A09E667 ) ); + r[1] = _mm512_add_epi32( B, m512_const1_64( 0xBB67AE85BB67AE85 ) ); + r[2] = _mm512_add_epi32( C, m512_const1_64( 0x3C6EF3723C6EF372 ) ); + r[3] = _mm512_add_epi32( D, m512_const1_64( 0xA54FF53AA54FF53A ) ); + r[4] = _mm512_add_epi32( E, m512_const1_64( 0x510E527F510E527F ) ); + r[5] = _mm512_add_epi32( F, m512_const1_64( 0x9B05688C9B05688C ) ); + r[6] = _mm512_add_epi32( G, m512_const1_64( 0x1F83D9AB1F83D9AB ) ); + r[7] = _mm512_add_epi32( H, m512_const1_64( 0x5BE0CD195BE0CD19 ) ); + } +} + +void sha256_16way_init( sha256_16way_context *sc ) +{ + sc->initialized = false; + sc->count_high = sc->count_low = 0; +} + + +void sha256_16way_update( sha256_16way_context *sc, const void *data, + size_t len ) +{ + __m512i *vdata = (__m512i*)data; + size_t ptr; + const int buf_size = 64; + + ptr = (unsigned)sc->count_low & (buf_size - 1U); + while ( len > 0 ) + { + size_t clen; + uint32_t clow, clow2; + + clen = buf_size - ptr; + if ( clen > len ) + clen = len; + memcpy_512( sc->buf + (ptr>>2), vdata, clen>>2 ); + vdata = vdata + (clen>>2); + ptr += clen; + len -= clen; + if ( ptr == buf_size ) + { + sha256_16way_round( sc, sc->buf, sc->val ); + ptr = 0; + } + clow = sc->count_low; + clow2 = clow + clen; + sc->count_low = clow2; + if ( clow2 < clow ) + sc->count_high++; + } +} + +void sha256_16way_close( sha256_16way_context *sc, void *dst ) +{ + unsigned ptr; + uint32_t low, high; + const int buf_size = 64; + const int pad = buf_size - 8; + + ptr = (unsigned)sc->count_low & (buf_size - 1U); + sc->buf[ ptr>>2 ] = m512_const1_64( 0x0000008000000080 ); + ptr += 4; + + if ( ptr > pad ) + { + memset_zero_512( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 ); + sha256_16way_round( sc, sc->buf, sc->val ); + memset_zero_512( sc->buf, pad >> 2 ); + } + else + memset_zero_512( sc->buf + (ptr>>2), (pad - ptr) >> 2 ); + + low = sc->count_low; + high = (sc->count_high << 3) | (low >> 29); + low = low << 3; + + sc->buf[ pad >> 2 ] = + mm512_bswap_32( m512_const1_32( high ) ); + sc->buf[ ( pad+4 ) >> 2 ] = + mm512_bswap_32( m512_const1_32( low ) ); + + sha256_16way_round( sc, sc->buf, sc->val ); + + mm512_block_bswap_32( dst, sc->val ); +} + +#endif // AVX512 #endif // __AVX2__ #endif // __SSE2__ diff --git a/algo/sha/sha256q-4way.c b/algo/sha/sha256q-4way.c index 41c3458..2cecfcc 100644 --- a/algo/sha/sha256q-4way.c +++ b/algo/sha/sha256q-4way.c @@ -15,19 +15,19 @@ void sha256q_8way_hash( void* output, const void* input ) sha256_8way_context ctx; memcpy( &ctx, &sha256_ctx8, sizeof ctx ); - sha256_8way( &ctx, input + (64<<3), 16 ); + sha256_8way_update( &ctx, input + (64<<3), 16 ); sha256_8way_close( &ctx, vhash ); sha256_8way_init( &ctx ); - sha256_8way( &ctx, vhash, 32 ); + sha256_8way_update( &ctx, vhash, 32 ); sha256_8way_close( &ctx, vhash ); sha256_8way_init( &ctx ); - sha256_8way( &ctx, vhash, 32 ); + sha256_8way_update( &ctx, vhash, 32 ); sha256_8way_close( &ctx, vhash ); sha256_8way_init( &ctx ); - sha256_8way( &ctx, vhash, 32 ); + sha256_8way_update( &ctx, 
vhash, 32 ); sha256_8way_close( &ctx, output ); } @@ -61,7 +61,7 @@ int scanhash_sha256q_8way( struct work *work, uint32_t max_nonce, // Need big endian data mm256_bswap32_intrlv80_8x32( vdata, pdata ); sha256_8way_init( &sha256_ctx8 ); - sha256_8way( &sha256_ctx8, vdata, 64 ); + sha256_8way_update( &sha256_ctx8, vdata, 64 ); for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) { @@ -108,19 +108,19 @@ void sha256q_4way_hash( void* output, const void* input ) sha256_4way_context ctx; memcpy( &ctx, &sha256_ctx4, sizeof ctx ); - sha256_4way( &ctx, input + (64<<2), 16 ); + sha256_4way_update( &ctx, input + (64<<2), 16 ); sha256_4way_close( &ctx, vhash ); sha256_4way_init( &ctx ); - sha256_4way( &ctx, vhash, 32 ); + sha256_4way_update( &ctx, vhash, 32 ); sha256_4way_close( &ctx, vhash ); sha256_4way_init( &ctx ); - sha256_4way( &ctx, vhash, 32 ); + sha256_4way_update( &ctx, vhash, 32 ); sha256_4way_close( &ctx, vhash ); sha256_4way_init( &ctx ); - sha256_4way( &ctx, vhash, 32 ); + sha256_4way_update( &ctx, vhash, 32 ); sha256_4way_close( &ctx, output ); } @@ -154,7 +154,7 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce, mm128_bswap32_intrlv80_4x32( vdata, pdata ); sha256_4way_init( &sha256_ctx4 ); - sha256_4way( &sha256_ctx4, vdata, 64 ); + sha256_4way_update( &sha256_ctx4, vdata, 64 ); for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) { diff --git a/algo/sha/sha256t-4way.c b/algo/sha/sha256t-4way.c index 5c4dd68..b48633b 100644 --- a/algo/sha/sha256t-4way.c +++ b/algo/sha/sha256t-4way.c @@ -15,15 +15,15 @@ void sha256t_8way_hash( void* output, const void* input ) sha256_8way_context ctx; memcpy( &ctx, &sha256_ctx8, sizeof ctx ); - sha256_8way( &ctx, input + (64<<3), 16 ); + sha256_8way_update( &ctx, input + (64<<3), 16 ); sha256_8way_close( &ctx, vhash ); sha256_8way_init( &ctx ); - sha256_8way( &ctx, vhash, 32 ); + sha256_8way_update( &ctx, vhash, 32 ); sha256_8way_close( &ctx, vhash ); sha256_8way_init( &ctx ); - sha256_8way( &ctx, vhash, 32 ); + sha256_8way_update( &ctx, vhash, 32 ); sha256_8way_close( &ctx, output ); } @@ -59,7 +59,7 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, // Need big endian data mm256_bswap32_intrlv80_8x32( vdata, pdata ); sha256_8way_init( &sha256_ctx8 ); - sha256_8way( &sha256_ctx8, vdata, 64 ); + sha256_8way_update( &sha256_ctx8, vdata, 64 ); for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) { @@ -101,15 +101,15 @@ void sha256t_4way_hash( void* output, const void* input ) sha256_4way_context ctx; memcpy( &ctx, &sha256_ctx4, sizeof ctx ); - sha256_4way( &ctx, input + (64<<2), 16 ); + sha256_4way_update( &ctx, input + (64<<2), 16 ); sha256_4way_close( &ctx, vhash ); sha256_4way_init( &ctx ); - sha256_4way( &ctx, vhash, 32 ); + sha256_4way_update( &ctx, vhash, 32 ); sha256_4way_close( &ctx, vhash ); sha256_4way_init( &ctx ); - sha256_4way( &ctx, vhash, 32 ); + sha256_4way_update( &ctx, vhash, 32 ); sha256_4way_close( &ctx, output ); } @@ -143,7 +143,7 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, mm128_bswap32_intrlv80_4x32( vdata, pdata ); sha256_4way_init( &sha256_ctx4 ); - sha256_4way( &sha256_ctx4, vdata, 64 ); + sha256_4way_update( &sha256_ctx4, vdata, 64 ); for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) { diff --git a/algo/sha/sha512-hash-4way.c b/algo/sha/sha512-hash-4way.c index 3ee8194..d056da0 100644 --- a/algo/sha/sha512-hash-4way.c +++ b/algo/sha/sha512-hash-4way.c @@ -37,55 +37,57 @@ #include "sha-hash-4way.h" /* -static const sph_u64 H512[8] = { - 
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B), - SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1), - SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F), - SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179) +static const uint64_t H512[8] = +{ + 0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, + 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1, + 0x510E527FADE682D1, 0x9B05688C2B3E6C1F, + 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179 }; */ -static const sph_u64 K512[80] = { - SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD), - SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC), - SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019), - SPH_C64(0x923F82A4AF194F9B), SPH_C64(0xAB1C5ED5DA6D8118), - SPH_C64(0xD807AA98A3030242), SPH_C64(0x12835B0145706FBE), - SPH_C64(0x243185BE4EE4B28C), SPH_C64(0x550C7DC3D5FFB4E2), - SPH_C64(0x72BE5D74F27B896F), SPH_C64(0x80DEB1FE3B1696B1), - SPH_C64(0x9BDC06A725C71235), SPH_C64(0xC19BF174CF692694), - SPH_C64(0xE49B69C19EF14AD2), SPH_C64(0xEFBE4786384F25E3), - SPH_C64(0x0FC19DC68B8CD5B5), SPH_C64(0x240CA1CC77AC9C65), - SPH_C64(0x2DE92C6F592B0275), SPH_C64(0x4A7484AA6EA6E483), - SPH_C64(0x5CB0A9DCBD41FBD4), SPH_C64(0x76F988DA831153B5), - SPH_C64(0x983E5152EE66DFAB), SPH_C64(0xA831C66D2DB43210), - SPH_C64(0xB00327C898FB213F), SPH_C64(0xBF597FC7BEEF0EE4), - SPH_C64(0xC6E00BF33DA88FC2), SPH_C64(0xD5A79147930AA725), - SPH_C64(0x06CA6351E003826F), SPH_C64(0x142929670A0E6E70), - SPH_C64(0x27B70A8546D22FFC), SPH_C64(0x2E1B21385C26C926), - SPH_C64(0x4D2C6DFC5AC42AED), SPH_C64(0x53380D139D95B3DF), - SPH_C64(0x650A73548BAF63DE), SPH_C64(0x766A0ABB3C77B2A8), - SPH_C64(0x81C2C92E47EDAEE6), SPH_C64(0x92722C851482353B), - SPH_C64(0xA2BFE8A14CF10364), SPH_C64(0xA81A664BBC423001), - SPH_C64(0xC24B8B70D0F89791), SPH_C64(0xC76C51A30654BE30), - SPH_C64(0xD192E819D6EF5218), SPH_C64(0xD69906245565A910), - SPH_C64(0xF40E35855771202A), SPH_C64(0x106AA07032BBD1B8), - SPH_C64(0x19A4C116B8D2D0C8), SPH_C64(0x1E376C085141AB53), - SPH_C64(0x2748774CDF8EEB99), SPH_C64(0x34B0BCB5E19B48A8), - SPH_C64(0x391C0CB3C5C95A63), SPH_C64(0x4ED8AA4AE3418ACB), - SPH_C64(0x5B9CCA4F7763E373), SPH_C64(0x682E6FF3D6B2B8A3), - SPH_C64(0x748F82EE5DEFB2FC), SPH_C64(0x78A5636F43172F60), - SPH_C64(0x84C87814A1F0AB72), SPH_C64(0x8CC702081A6439EC), - SPH_C64(0x90BEFFFA23631E28), SPH_C64(0xA4506CEBDE82BDE9), - SPH_C64(0xBEF9A3F7B2C67915), SPH_C64(0xC67178F2E372532B), - SPH_C64(0xCA273ECEEA26619C), SPH_C64(0xD186B8C721C0C207), - SPH_C64(0xEADA7DD6CDE0EB1E), SPH_C64(0xF57D4F7FEE6ED178), - SPH_C64(0x06F067AA72176FBA), SPH_C64(0x0A637DC5A2C898A6), - SPH_C64(0x113F9804BEF90DAE), SPH_C64(0x1B710B35131C471B), - SPH_C64(0x28DB77F523047D84), SPH_C64(0x32CAAB7B40C72493), - SPH_C64(0x3C9EBE0A15C9BEBC), SPH_C64(0x431D67C49C100D4C), - SPH_C64(0x4CC5D4BECB3E42B6), SPH_C64(0x597F299CFC657E2A), - SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817) +static const uint64_t K512[80] = +{ + 0x428A2F98D728AE22, 0x7137449123EF65CD, + 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC, + 0x3956C25BF348B538, 0x59F111F1B605D019, + 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118, + 0xD807AA98A3030242, 0x12835B0145706FBE, + 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2, + 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, + 0x9BDC06A725C71235, 0xC19BF174CF692694, + 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, + 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65, + 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, + 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5, + 0x983E5152EE66DFAB, 0xA831C66D2DB43210, + 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4, + 0xC6E00BF33DA88FC2, 0xD5A79147930AA725, +
0x06CA6351E003826F, 0x142929670A0E6E70, + 0x27B70A8546D22FFC, 0x2E1B21385C26C926, + 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF, + 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, + 0x81C2C92E47EDAEE6, 0x92722C851482353B, + 0xA2BFE8A14CF10364, 0xA81A664BBC423001, + 0xC24B8B70D0F89791, 0xC76C51A30654BE30, + 0xD192E819D6EF5218, 0xD69906245565A910, + 0xF40E35855771202A, 0x106AA07032BBD1B8, + 0x19A4C116B8D2D0C8, 0x1E376C085141AB53, + 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8, + 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, + 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3, + 0x748F82EE5DEFB2FC, 0x78A5636F43172F60, + 0x84C87814A1F0AB72, 0x8CC702081A6439EC, + 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, + 0xBEF9A3F7B2C67915, 0xC67178F2E372532B, + 0xCA273ECEEA26619C, 0xD186B8C721C0C207, + 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178, + 0x06F067AA72176FBA, 0x0A637DC5A2C898A6, + 0x113F9804BEF90DAE, 0x1B710B35131C471B, + 0x28DB77F523047D84, 0x32CAAB7B40C72493, + 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C, + 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, + 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817 }; diff --git a/algo/shabal/shabal-hash-4way.h b/algo/shabal/shabal-hash-4way.h index c296f8c..0efec0b 100644 --- a/algo/shabal/shabal-hash-4way.h +++ b/algo/shabal/shabal-hash-4way.h @@ -97,7 +97,7 @@ void shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n, void shabal512_4way_init( void *cc ); void shabal512_4way_update( void *cc, const void *data, size_t len ); -#define shabal512_4way shabal512_4way_update +//#define shabal512_4way shabal512_4way_update void shabal512_4way_close( void *cc, void *dst ); void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst ); diff --git a/algo/shavite/shavite-hash-4way.c b/algo/shavite/shavite-hash-4way.c new file mode 100644 index 0000000..a8b439c --- /dev/null +++ b/algo/shavite/shavite-hash-4way.c @@ -0,0 +1,399 @@ +#include "shavite-hash-4way.h" +#include + +static const uint32_t IV512[] = +{ + 0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC, + 0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC, + 0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47, + 0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A +}; + +#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +#define mm512_ror2x512hi_1x32( a, b ) \ + _mm512_mask_blend_epi32( 0x8888, mm512_ror128_32( a ), \ + mm512_ror128_32( b ) ) + +static void +c512_4way( shavite512_4way_context *ctx, const void *msg ) +{ + register __m512i X; + register __m512i P0, P1, P2, P3; + register __m512i K0, K1, K2, K3, K4, K5, K6, K7; + __m512i *M = (__m512i*)msg; + __m512i *H = (__m512i*)ctx->h; + int r; + + P0 = H[0]; + P1 = H[1]; + P2 = H[2]; + P3 = H[3]; + + K0 = M[0]; + K1 = M[1]; + K2 = M[2]; + K3 = M[3]; + K4 = M[4]; + K5 = M[5]; + K6 = M[6]; + K7 = M[7]; + + X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); + + P0 = _mm512_xor_si512( P0, X ); + + X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); + + P2 = _mm512_xor_si512( P2, X ); + + // round + for ( r = 0; r < 3; r ++ ) + { + // round 1, 5, 9 + + K0 = _mm512_xor_si512( K7, mm512_ror128_32( + 
_mm512_aesenc_epi128( K0, m512_zero ) ) ); + + if ( r == 0 ) + K0 = _mm512_xor_si512( K0, _mm512_set4_epi32( + ~ctx->count3, ctx->count2, ctx->count1, ctx->count0 ) ); + + X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero ); + K1 = _mm512_xor_si512( K0, + mm512_ror128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) ); + + if ( r == 1 ) + K1 = _mm512_xor_si512( K1, _mm512_set4_epi32( + ~ctx->count0, ctx->count1, ctx->count2, ctx->count3 ) ); + + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); + K2 = _mm512_xor_si512( K1, + mm512_ror128_32( _mm512_aesenc_epi128( K2, m512_zero ) ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); + K3 = _mm512_xor_si512( K2, + mm512_ror128_32( _mm512_aesenc_epi128( K3, m512_zero ) ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); + + P3 = _mm512_xor_si512( P3, X ); + + K4 = _mm512_xor_si512( K3, + mm512_ror128_32( _mm512_aesenc_epi128( K4, m512_zero ) ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero ); + K5 = _mm512_xor_si512( K4, + mm512_ror128_32( _mm512_aesenc_epi128( K5, m512_zero ) ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); + K6 = _mm512_xor_si512( K5, + mm512_ror128_32( _mm512_aesenc_epi128( K6, m512_zero ) ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); + K7 = _mm512_xor_si512( K6, + mm512_ror128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) ); + + if ( r == 2 ) + K7 = _mm512_xor_si512( K7, _mm512_set4_epi32( + ~ctx->count1, ctx->count0, ctx->count3, ctx->count2 ) ); + + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); + P1 = _mm512_xor_si512( P1, X ); + + // round 2, 6, 10 + + K0 = _mm512_xor_si512( K0, mm512_ror2x512hi_1x32( K6, K7 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K0 ), m512_zero ); + K1 = _mm512_xor_si512( K1, mm512_ror2x512hi_1x32( K7, K0 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); + K2 = _mm512_xor_si512( K2, mm512_ror2x512hi_1x32( K0, K1 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); + K3 = _mm512_xor_si512( K3, mm512_ror2x512hi_1x32( K1, K2 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); + + P2 = _mm512_xor_si512( P2, X ); + + K4 = _mm512_xor_si512( K4, mm512_ror2x512hi_1x32( K2, K3 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K4 ), m512_zero ); + K5 = _mm512_xor_si512( K5, mm512_ror2x512hi_1x32( K3, K4 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); + K6 = _mm512_xor_si512( K6, mm512_ror2x512hi_1x32( K4, K5 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); + K7 = _mm512_xor_si512( K7, mm512_ror2x512hi_1x32( K5, K6 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); + + P0 = _mm512_xor_si512( P0, X ); + + // round 3, 7, 11 + + K0 = _mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K0, m512_zero ) ), K7 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K0 ), m512_zero ); + K1 = _mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K1, m512_zero ) ), K0 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); + K2 = _mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K2, m512_zero ) ), K1 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); + K3 = _mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K3, m512_zero ) ), K2 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); + + P1 = _mm512_xor_si512( P1, X ); + + K4 = 
_mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K4, m512_zero ) ), K3 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K4 ), m512_zero ); + K5 = _mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K5, m512_zero ) ), K4 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); + K6 = _mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K6, m512_zero ) ), K5 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); + K7 = _mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K7, m512_zero ) ), K6 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); + + P3 = _mm512_xor_si512( P3, X ); + + // round 4, 8, 12 + + K0 = _mm512_xor_si512( K0, mm512_ror2x512hi_1x32( K6, K7 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero ); + K1 = _mm512_xor_si512( K1, mm512_ror2x512hi_1x32( K7, K0 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); + K2 = _mm512_xor_si512( K2, mm512_ror2x512hi_1x32( K0, K1 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); + K3 = _mm512_xor_si512( K3, mm512_ror2x512hi_1x32( K1, K2 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); + + P0 = _mm512_xor_si512( P0, X ); + + K4 = _mm512_xor_si512( K4, mm512_ror2x512hi_1x32( K2, K3 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero ); + K5 = _mm512_xor_si512( K5, mm512_ror2x512hi_1x32( K3, K4 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); + K6 = _mm512_xor_si512( K6, mm512_ror2x512hi_1x32( K4, K5 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); + K7 = _mm512_xor_si512( K7, mm512_ror2x512hi_1x32( K5, K6 ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); + + P2 = _mm512_xor_si512( P2, X ); + } + + // round 13 + + K0 = _mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K0, m512_zero ) ), K7 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero ); + K1 = _mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K1, m512_zero ) ), K0 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); + K2 = _mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K2, m512_zero ) ), K1 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); + K3 = _mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K3, m512_zero ) ), K2 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); + + P3 = _mm512_xor_si512( P3, X ); + + K4 = _mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K4, m512_zero ) ), K3 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero ); + K5 = _mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K5, m512_zero ) ), K4 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); + + K6 = mm512_ror128_32( _mm512_aesenc_epi128( K6, m512_zero ) ); + K6 = _mm512_xor_si512( K6, _mm512_xor_si512( K5, _mm512_set4_epi32( + ~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) ); + + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); + K7= _mm512_xor_si512( mm512_ror128_32( + _mm512_aesenc_epi128( K7, m512_zero ) ), K6 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); + + P1 = _mm512_xor_si512( P1, X ); + + H[0] = _mm512_xor_si512( H[0], P2 ); + H[1] = _mm512_xor_si512( H[1], P3 ); + H[2] = _mm512_xor_si512( H[2], P0 ); + H[3] = _mm512_xor_si512( H[3], P1 ); +} + +void shavite512_4way_init( shavite512_4way_context *ctx ) +{ + __m512i *h = 
(__m512i*)ctx->h; + __m128i *iv = (__m128i*)IV512; + + h[0] = m512_const1_128( iv[0] ); + h[1] = m512_const1_128( iv[1] ); + h[2] = m512_const1_128( iv[2] ); + h[3] = m512_const1_128( iv[3] ); + + ctx->ptr = 0; + ctx->count0 = 0; + ctx->count1 = 0; + ctx->count2 = 0; + ctx->count3 = 0; +} + +// not tested, use update_close +void shavite512_4way_update( shavite512_4way_context *ctx, const void *data, + size_t len ) +{ + unsigned char *buf = ctx->buf; + size_t ptr = ctx->ptr; + + while ( len > 0 ) + { + size_t clen; + + clen = (sizeof ctx->buf) - ptr; + if ( clen > len << 2 ) + clen = len << 2; + memcpy( buf + ptr, data, clen ); + data = (const unsigned char *)data + clen; + ptr += clen; + len -= clen >> 2; + if ( ptr == sizeof ctx->buf ) + { + if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 ) + { + ctx->count1 = ctx->count1 + 1; + if ( ctx->count1 == 0 ) + { + ctx->count2 = ctx->count2 + 1; + if ( ctx->count2 == 0 ) + ctx->count3 = ctx->count3 + 1; + } + } + c512_4way( ctx, buf ); + ptr = 0; + } + } + ctx->ptr = ptr; +} + +// not tested +void shavite512_4way_close( shavite512_4way_context *ctx, void *dst ) +{ + unsigned char *buf; + union + { + uint32_t u32[4]; + uint16_t u16[8]; + } count; + + buf = ctx->buf; + uint32_t vp = ctx->ptr>>6; + + // Terminating byte then zero pad + casti_m512i( buf, vp++ ) = m512_const2_64( 0, 0x0000000000000080 ); + + // Zero pad full vectors up to count + for ( ; vp < 6; vp++ ) + casti_m512i( buf, vp ) = m512_zero; + + // Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200 + // Count is misaligned to 16 bits and straddles a vector. + // Use u32 overlay to stage then u16 to load buf. + count.u32[0] = ctx->count0 += (ctx->ptr << 1); // ptr/4 * 8 + count.u32[1] = ctx->count1; + count.u32[2] = ctx->count2; + count.u32[3] = ctx->count3; + + casti_m512i( buf, 6 ) = m512_const1_128( + _mm_insert_epi16( m128_zero, count.u16[0], 7 ) ); + casti_m512i( buf, 7 ) = m512_const1_128( _mm_set_epi16( + 0x0200, count.u16[7], count.u16[6], count.u16[5], + count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) ); + + c512_4way( ctx, buf); + + casti_m512i( dst, 0 ) = casti_m512i( ctx->h, 0 ); + casti_m512i( dst, 1 ) = casti_m512i( ctx->h, 1 ); + casti_m512i( dst, 2 ) = casti_m512i( ctx->h, 2 ); + casti_m512i( dst, 3 ) = casti_m512i( ctx->h, 3 ); +} + +void shavite512_4way_update_close( shavite512_4way_context *ctx, void *dst, + const void *data, size_t len ) +{ + unsigned char *buf = ctx->buf; + size_t ptr = ctx->ptr; + + // process full blocks and load buf with remainder. + while ( len > 0 ) + { + size_t clen; + + clen = (sizeof ctx->buf) - ptr; + if ( clen > len << 2 ) + clen = len << 2; + memcpy( buf + ptr, data, clen ); + data = (const unsigned char *)data + clen; + ptr += clen; + len -= (clen >> 2); + if ( ptr == sizeof ctx->buf ) + { + if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 ) + { + ctx->count1 = ctx->count1 + 1; + if ( ctx->count1 == 0 ) + { + ctx->count2 = ctx->count2 + 1; + if ( ctx->count2 == 0 ) + ctx->count3 = ctx->count3 + 1; + } + } + c512_4way( ctx, buf ); + ptr = 0; + } + } + + uint32_t vp = ptr>>6; + // Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200 + // Count is misaligned to 16 bits and straddles 2 vectors. + // Use u32 overlay to stage then u16 to load buf. + union + { + uint32_t u32[4]; + uint16_t u16[8]; + } count; + + count.u32[0] = ctx->count0 += (ptr << 1); // ptr/4 * 8 + count.u32[1] = ctx->count1; + count.u32[2] = ctx->count2; + count.u32[3] = ctx->count3; + + if ( vp == 0 ) // empty buf, xevan. 
+ { + casti_m512i( buf, 0 ) = m512_const2_64( 0, 0x0000000000000080 ); + memset_zero_512( (__m512i*)buf + 1, 5 ); + ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0; + } + else // half full buf, everyone else. + { + casti_m512i( buf, vp++ ) = m512_const2_64( 0, 0x0000000000000080 ); + memset_zero_512( (__m512i*)buf + vp, 6 - vp ); + } + + casti_m512i( buf, 6 ) = m512_const1_128( + _mm_insert_epi16( m128_zero, count.u16[0], 7 ) ); + casti_m512i( buf, 7 ) = m512_const1_128( _mm_set_epi16( + 0x0200, count.u16[7], count.u16[6], count.u16[5], + count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) ); + + c512_4way( ctx, buf); + + casti_m512i( dst, 0 ) = casti_m512i( ctx->h, 0 ); + casti_m512i( dst, 1 ) = casti_m512i( ctx->h, 1 ); + casti_m512i( dst, 2 ) = casti_m512i( ctx->h, 2 ); + casti_m512i( dst, 3 ) = casti_m512i( ctx->h, 3 ); +} + +#endif // VAES diff --git a/algo/shavite/shavite-hash-4way.h b/algo/shavite/shavite-hash-4way.h new file mode 100644 index 0000000..c179566 --- /dev/null +++ b/algo/shavite/shavite-hash-4way.h @@ -0,0 +1,25 @@ +#ifndef SHAVITE_HASH_4WAY_H__ +#define SHAVITE_HASH_4WAY_H__ 1 + +#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + +#include "simd-utils.h" + +typedef struct { + unsigned char buf[128<<2]; + uint32_t h[16<<2]; + size_t ptr; + uint32_t count0, count1, count2, count3; +} shavite512_4way_context __attribute__ ((aligned (64))); + +void shavite512_4way_init( shavite512_4way_context *ctx ); +void shavite512_4way_update( shavite512_4way_context *ctx, const void *data, + size_t len ); +void shavite512_4way_close( shavite512_4way_context *ctx, void *dst ); +void shavite512_4way_update_close( shavite512_4way_context *ctx, void *dst, + const void *data, size_t len ); + +#endif // VAES + +#endif // SHAVITE_HASH_4WAY_H__ + diff --git a/algo/skein/skein-4way.c b/algo/skein/skein-4way.c index a992789..c040e15 100644 --- a/algo/skein/skein-4way.c +++ b/algo/skein/skein-4way.c @@ -18,76 +18,18 @@ void skeinhash_8way( void *state, const void *input ) uint64_t vhash64[8*8] __attribute__ ((aligned (128))); skein512_8way_context ctx_skein; -//#if defined(__SHA__) -// uint32_t hash0[16] __attribute__ ((aligned (64))); -// uint32_t hash1[16] __attribute__ ((aligned (64))); -// uint32_t hash2[16] __attribute__ ((aligned (64))); -// uint32_t hash3[16] __attribute__ ((aligned (64))); -// uint32_t hash4[16] __attribute__ ((aligned (64))); -// uint32_t hash5[16] __attribute__ ((aligned (64))); -// uint32_t hash6[16] __attribute__ ((aligned (64))); -// uint32_t hash7[16] __attribute__ ((aligned (64))); -// SHA256_CTX ctx_sha256; -//#else uint32_t vhash32[16*8] __attribute__ ((aligned (128))); sha256_8way_context ctx_sha256; -//#endif skein512_8way_init( &ctx_skein ); skein512_8way_update( &ctx_skein, input, 80 ); skein512_8way_close( &ctx_skein, vhash64 ); -/* -#if defined(__SHA__) - dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash64, 512 ); - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, (unsigned char*)hash0, 64 ); - SHA256_Final( (unsigned char*)hash0, &ctx_sha256 ); - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, (unsigned char*)hash1, 64 ); - SHA256_Final( (unsigned char*)hash1, &ctx_sha256 ); - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, (unsigned char*)hash2, 64 ); - SHA256_Final( (unsigned char*)hash2, &ctx_sha256 ); - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 ); - 
SHA256_Final( (unsigned char*)hash3, &ctx_sha256 ); - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, (unsigned char*)hash4, 64 ); - SHA256_Final( (unsigned char*)hash4, &ctx_sha256 ); - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, (unsigned char*)hash5, 64 ); - SHA256_Final( (unsigned char*)hash5, &ctx_sha256 ); - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, (unsigned char*)hash6, 64 ); - SHA256_Final( (unsigned char*)hash6, &ctx_sha256 ); - - SHA256_Init( &ctx_sha256 ); - SHA256_Update( &ctx_sha256, (unsigned char*)hash7, 64 ); - SHA256_Final( (unsigned char*)hash7, &ctx_sha256 ); - - intrlv_8x32( state, hash0, hash1, hash2, hash3, hash4, hash5, hash6, - hash7, 256 ); -#else -*/ rintrlv_8x64_8x32( vhash32, vhash64, 512 ); -// dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, -// vhash64, 512 ); -// intrlv_8x32( vhash32, hash0, hash1, hash2, hash3, hash4, hash5, hash6, -// hash7, 512 ); sha256_8way_init( &ctx_sha256 ); - sha256_8way( &ctx_sha256, vhash32, 64 ); + sha256_8way_update( &ctx_sha256, vhash32, 64 ); sha256_8way_close( &ctx_sha256, state ); -//#endif } int scanhash_skein_8way( struct work *work, uint32_t max_nonce, @@ -176,7 +118,7 @@ void skeinhash_4way( void *state, const void *input ) rintrlv_4x64_4x32( vhash32, vhash64, 512 ); sha256_4way_init( &ctx_sha256 ); - sha256_4way( &ctx_sha256, vhash32, 64 ); + sha256_4way_update( &ctx_sha256, vhash32, 64 ); sha256_4way_close( &ctx_sha256, state ); #endif } diff --git a/algo/skein/skein-hash-4way.h b/algo/skein/skein-hash-4way.h index 4f828a1..3f58e95 100644 --- a/algo/skein/skein-hash-4way.h +++ b/algo/skein/skein-hash-4way.h @@ -93,12 +93,12 @@ typedef sph_skein_4way_big_context skein256_4way_context; void skein512_4way_init( skein512_4way_context *sc ); void skein512_4way_update( void *cc, const void *data, size_t len ); void skein512_4way_close( void *cc, void *dst ); -#define skein512_4way skein512_4way_update +//#define skein512_4way skein512_4way_update void skein256_4way_init( skein256_4way_context *sc ); void skein256_4way_update( void *cc, const void *data, size_t len ); void skein256_4way_close( void *cc, void *dst ); -#define skein256_4way skein256_4way_update +//#define skein256_4way skein256_4way_update #ifdef __cplusplus } diff --git a/algo/skein/skein2-4way.c b/algo/skein/skein2-4way.c index a51508b..b2a7962 100644 --- a/algo/skein/skein2-4way.c +++ b/algo/skein/skein2-4way.c @@ -68,11 +68,11 @@ void skein2hash_4way( void *output, const void *input ) uint64_t hash[16*4] __attribute__ ((aligned (64))); skein512_4way_init( &ctx ); - skein512_4way( &ctx, input, 80 ); + skein512_4way_update( &ctx, input, 80 ); skein512_4way_close( &ctx, hash ); skein512_4way_init( &ctx ); - skein512_4way( &ctx, hash, 64 ); + skein512_4way_update( &ctx, hash, 64 ); skein512_4way_close( &ctx, output ); } diff --git a/algo/sm3/sm3-hash-4way.c b/algo/sm3/sm3-hash-4way.c index f900aba..6e17d1b 100644 --- a/algo/sm3/sm3-hash-4way.c +++ b/algo/sm3/sm3-hash-4way.c @@ -50,41 +50,138 @@ #include #include "sm3-hash-4way.h" -#ifdef __SSE4_2__ +#ifdef __AVX2__ -void sm3_4way_init( sm3_4way_ctx_t *ctx ) +#define P0_8W(x) \ + _mm256_xor_si256( x, _mm256_xor_si256( mm256_rol_32( x, 9 ), \ + mm256_rol_32( x, 17 ) ) ) + +#define P1_8W(x) \ + _mm256_xor_si256( x, _mm256_xor_si256( mm256_rol_32( x, 15 ), \ + mm256_rol_32( x, 23 ) ) ) + +#define FF0_8W(x,y,z) \ + _mm256_xor_si256( x, _mm256_xor_si256( y, z ) ) + +#define FF1_8W(x,y,z) \ + _mm256_or_si256( _mm256_or_si256( _mm256_and_si256( x, y ), 
\ + _mm256_and_si256( x, z ) ), \ + _mm256_and_si256( y, z ) ) + +#define GG0_8W(x,y,z) FF0_8W(x,y,z) + +#define GG1_8W(x,y,z) \ + _mm256_or_si256( _mm256_and_si256( x, y ), \ + _mm256_andnot_si256( x, z ) ) + +void sm3_8way_compress( __m256i *digest, __m256i *block ) { - ctx->digest[0] = _mm_set1_epi32( 0x7380166F ); - ctx->digest[1] = _mm_set1_epi32( 0x4914B2B9 ); - ctx->digest[2] = _mm_set1_epi32( 0x172442D7 ); - ctx->digest[3] = _mm_set1_epi32( 0xDA8A0600 ); - ctx->digest[4] = _mm_set1_epi32( 0xA96F30BC ); - ctx->digest[5] = _mm_set1_epi32( 0x163138AA ); - ctx->digest[6] = _mm_set1_epi32( 0xE38DEE4D ); - ctx->digest[7] = _mm_set1_epi32( 0xB0FB0E4E ); - ctx->nblocks = 0; - ctx->num = 0; + __m256i W[68], W1[64]; + __m256i A = digest[ 0 ]; + __m256i B = digest[ 1 ]; + __m256i C = digest[ 2 ]; + __m256i D = digest[ 3 ]; + __m256i E = digest[ 4 ]; + __m256i F = digest[ 5 ]; + __m256i G = digest[ 6 ]; + __m256i H = digest[ 7 ]; + __m256i SS1, SS2, TT1, TT2, T; + int j; + + for ( j = 0; j < 16; j++ ) + W[j] = mm256_bswap_32( block[j] ); + + for ( j = 16; j < 68; j++ ) + W[j] = _mm256_xor_si256( P1_8W( _mm256_xor_si256( + _mm256_xor_si256( W[ j-16 ], W[ j-9 ] ), + mm256_rol_32( W[ j-3 ], 15 ) ) ), + _mm256_xor_si256( mm256_rol_32( W[ j-13 ], 7 ), W[ j-6 ] ) ); + + for( j = 0; j < 64; j++ ) + W1[j] = _mm256_xor_si256( W[j], W[j+4] ); + + T = _mm256_set1_epi32( 0x79CC4519UL ); + for( j =0; j < 16; j++ ) + { + SS1 = mm256_rol_32( _mm256_add_epi32( E, _mm256_add_epi32( + mm256_rol_32( A, 12 ), mm256_rol_var_32( T, j ) ) ), 7 ); + SS2 = _mm256_xor_si256( SS1, mm256_rol_32( A, 12 ) ); + TT1 = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( + FF0_8W( A, B, C ), D ), SS2 ), W1[j] ); + TT2 = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( + GG0_8W( E, F, G ), H ), SS1 ), W[j] ); + D = C; + C = mm256_rol_32( B, 9 ); + B = A; + A = TT1; + H = G; + G = mm256_rol_32( F, 19 ); + F = E; + E = P0_8W( TT2 ); + } + + T = _mm256_set1_epi32( 0x7A879D8AUL ); + for( j =16; j < 64; j++ ) + { + SS1 = mm256_rol_32( _mm256_add_epi32( _mm256_add_epi32( + mm256_rol_32(A,12), E ), mm256_rol_var_32( T, j&31 ) ), 7 ); + SS2 = _mm256_xor_si256( SS1, mm256_rol_32( A, 12 ) ); + TT1 = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( + FF1_8W( A, B, C ), D ), SS2 ), W1[j] ); + TT2 = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( + GG1_8W( E, F, G ), H ), SS1 ), W[j] ); + D = C; + C = mm256_rol_32( B, 9 ); + B = A; + A = TT1; + H = G; + G = mm256_rol_32( F, 19 ); + F = E; + E = P0_8W( TT2 ); + } + + digest[0] = _mm256_xor_si256( digest[0], A ); + digest[1] = _mm256_xor_si256( digest[1], B ); + digest[2] = _mm256_xor_si256( digest[2], C ); + digest[3] = _mm256_xor_si256( digest[3], D ); + digest[4] = _mm256_xor_si256( digest[4], E ); + digest[5] = _mm256_xor_si256( digest[5], F ); + digest[6] = _mm256_xor_si256( digest[6], G ); + digest[7] = _mm256_xor_si256( digest[7], H ); } -void sm3_4way( void *cc, const void *data, size_t len ) +void sm3_8way_init( sm3_8way_ctx_t *ctx ) { - sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc; - __m128i *block = (__m128i*)ctx->block; - __m128i *vdata = (__m128i*)data; + ctx->digest[0] = _mm256_set1_epi32( 0x7380166F ); + ctx->digest[1] = _mm256_set1_epi32( 0x4914B2B9 ); + ctx->digest[2] = _mm256_set1_epi32( 0x172442D7 ); + ctx->digest[3] = _mm256_set1_epi32( 0xDA8A0600 ); + ctx->digest[4] = _mm256_set1_epi32( 0xA96F30BC ); + ctx->digest[5] = _mm256_set1_epi32( 0x163138AA ); + ctx->digest[6] = _mm256_set1_epi32( 0xE38DEE4D ); + ctx->digest[7] = _mm256_set1_epi32( 0xB0FB0E4E ); + 
ctx->nblocks = 0; + ctx->num = 0; +} +void sm3_8way_update( void *cc, const void *data, size_t len ) +{ + sm3_8way_ctx_t *ctx = (sm3_8way_ctx_t*)cc; + __m256i *block = (__m256i*)ctx->block; + __m256i *vdata = (__m256i*)data; if ( ctx->num ) { unsigned int left = SM3_BLOCK_SIZE - ctx->num; if ( len < left ) { - memcpy_128( block + (ctx->num >> 2), vdata , len>>2 ); + memcpy_256( block + (ctx->num >> 2), vdata , len>>2 ); ctx->num += len; return; } else { - memcpy_128( block + (ctx->num >> 2), vdata , left>>2 ); - sm3_4way_compress( ctx->digest, block ); + memcpy_256( block + (ctx->num >> 2), vdata , left>>2 ); + sm3_8way_compress( ctx->digest, block ); ctx->nblocks++; vdata += left>>2; len -= left; @@ -92,49 +189,53 @@ void sm3_4way( void *cc, const void *data, size_t len ) } while ( len >= SM3_BLOCK_SIZE ) { - sm3_4way_compress( ctx->digest, vdata ); + sm3_8way_compress( ctx->digest, vdata ); ctx->nblocks++; vdata += SM3_BLOCK_SIZE>>2; len -= SM3_BLOCK_SIZE; } ctx->num = len; if ( len ) - memcpy_128( block, vdata, len>>2 ); + memcpy_256( block, vdata, len>>2 ); } -void sm3_4way_close( void *cc, void *dst ) +void sm3_8way_close( void *cc, void *dst ) { - sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc; - __m128i *hash = (__m128i*)dst; - __m128i *count = (__m128i*)(ctx->block + ( (SM3_BLOCK_SIZE - 8) >> 2 ) ); - __m128i *block = (__m128i*)ctx->block; + sm3_8way_ctx_t *ctx = (sm3_8way_ctx_t*)cc; + __m256i *hash = (__m256i*)dst; + __m256i *count = (__m256i*)(ctx->block + ( (SM3_BLOCK_SIZE - 8) >> 2 ) ); + __m256i *block = (__m256i*)ctx->block; int i; - block[ctx->num] = _mm_set1_epi32( 0x80 ); + block[ctx->num] = _mm256_set1_epi32( 0x80 ); if ( ctx->num + 8 <= SM3_BLOCK_SIZE ) { - memset_zero_128( block + (ctx->num >> 2) + 1, - ( SM3_BLOCK_SIZE - ctx->num - 8 ) >> 2 ); + memset_zero_256( block + (ctx->num >> 2) + 1, + ( SM3_BLOCK_SIZE - ctx->num - 8 ) >> 2 ); } else { - memset_zero_128( block + (ctx->num >> 2) + 1, + memset_zero_256( block + (ctx->num >> 2) + 1, ( SM3_BLOCK_SIZE - (ctx->num >> 2) - 1 ) ); - sm3_4way_compress( ctx->digest, block ); - memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 ); + sm3_8way_compress( ctx->digest, block ); + memset_zero_256( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 ); } - count[0] = mm128_bswap_32( - _mm_set1_epi32( ctx->nblocks >> 23 ) ); - count[1] = mm128_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) + + count[0] = mm256_bswap_32( + _mm256_set1_epi32( ctx->nblocks >> 23 ) ); + count[1] = mm256_bswap_32( _mm256_set1_epi32( ( ctx->nblocks << 9 ) + ( ctx->num << 3 ) ) ); - sm3_4way_compress( ctx->digest, block ); + sm3_8way_compress( ctx->digest, block ); for ( i = 0; i < 8 ; i++ ) - hash[i] = mm128_bswap_32( ctx->digest[i] ); + hash[i] = mm256_bswap_32( ctx->digest[i] ); } +#endif + +#if defined(__SSE2__) + #define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x, 9 ), \ mm128_rol_32( x, 17 ) ) ) #define P1(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x, 15 ), \ @@ -227,5 +328,88 @@ void sm3_4way_compress( __m128i *digest, __m128i *block ) digest[7] = _mm_xor_si128( digest[7], H ); } +void sm3_4way_init( sm3_4way_ctx_t *ctx ) +{ + ctx->digest[0] = _mm_set1_epi32( 0x7380166F ); + ctx->digest[1] = _mm_set1_epi32( 0x4914B2B9 ); + ctx->digest[2] = _mm_set1_epi32( 0x172442D7 ); + ctx->digest[3] = _mm_set1_epi32( 0xDA8A0600 ); + ctx->digest[4] = _mm_set1_epi32( 0xA96F30BC ); + ctx->digest[5] = _mm_set1_epi32( 0x163138AA ); + ctx->digest[6] = _mm_set1_epi32( 0xE38DEE4D ); + ctx->digest[7] = _mm_set1_epi32( 0xB0FB0E4E ); + ctx->nblocks = 0; + ctx->num = 0; +} 
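+// Note on the 4-way update below: len counts bytes per lane. With 4
+// interleaved lanes, every 4 lane-bytes fill one __m128i, which is why
+// vector copies and buffer indexing shift the byte counts right by 2.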
+ +void sm3_4way_update( void *cc, const void *data, size_t len ) +{ + sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc; + __m128i *block = (__m128i*)ctx->block; + __m128i *vdata = (__m128i*)data; + + if ( ctx->num ) + { + unsigned int left = SM3_BLOCK_SIZE - ctx->num; + if ( len < left ) + { + memcpy_128( block + (ctx->num >> 2), vdata , len>>2 ); + ctx->num += len; + return; + } + else + { + memcpy_128( block + (ctx->num >> 2), vdata , left>>2 ); + sm3_4way_compress( ctx->digest, block ); + ctx->nblocks++; + vdata += left>>2; + len -= left; + } + } + while ( len >= SM3_BLOCK_SIZE ) + { + sm3_4way_compress( ctx->digest, vdata ); + ctx->nblocks++; + vdata += SM3_BLOCK_SIZE>>2; + len -= SM3_BLOCK_SIZE; + } + ctx->num = len; + if ( len ) + memcpy_128( block, vdata, len>>2 ); +} + +void sm3_4way_close( void *cc, void *dst ) +{ + sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc; + __m128i *hash = (__m128i*)dst; + __m128i *count = (__m128i*)(ctx->block + ( (SM3_BLOCK_SIZE - 8) >> 2 ) ); + __m128i *block = (__m128i*)ctx->block; + int i; + + block[ctx->num] = _mm_set1_epi32( 0x80 ); + + if ( ctx->num + 8 <= SM3_BLOCK_SIZE ) + { + memset_zero_128( block + (ctx->num >> 2) + 1, + ( SM3_BLOCK_SIZE - ctx->num - 8 ) >> 2 ); + } + else + { + memset_zero_128( block + (ctx->num >> 2) + 1, + ( SM3_BLOCK_SIZE - (ctx->num >> 2) - 1 ) ); + sm3_4way_compress( ctx->digest, block ); + memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 ); + } + + count[0] = mm128_bswap_32( + _mm_set1_epi32( ctx->nblocks >> 23 ) ); + count[1] = mm128_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) + + ( ctx->num << 3 ) ) ); + sm3_4way_compress( ctx->digest, block ); + + for ( i = 0; i < 8 ; i++ ) + hash[i] = mm128_bswap_32( ctx->digest[i] ); +} + #endif diff --git a/algo/sm3/sm3-hash-4way.h b/algo/sm3/sm3-hash-4way.h index 06159d8..abe1dfd 100644 --- a/algo/sm3/sm3-hash-4way.h +++ b/algo/sm3/sm3-hash-4way.h @@ -48,14 +48,13 @@ */ #ifndef SPH_SM3_HASH_4WAY_H -#define SPH_SM3_HASH_4WAY_H +#define SPH_SM3_HASH_4WAY_H 1 #define SM3_DIGEST_LENGTH 32 #define SM3_BLOCK_SIZE 64 #define SM3_CBLOCK (SM3_BLOCK_SIZE) #define SM3_HMAC_SIZE (SM3_DIGEST_LENGTH) - #include #include #include @@ -65,7 +64,6 @@ extern "C" { #endif - typedef struct { __m128i block[16] __attribute__ ((aligned (64))); __m128i digest[8]; @@ -74,15 +72,24 @@ typedef struct { } sm3_4way_ctx_t; void sm3_4way_init( sm3_4way_ctx_t *ctx ); -//void sm3_4way_update( sm3_4way_ctx_t *ctx, const unsigned char* data, -// size_t data_len ); -//void sm3_4way_final( sm3_4way_ctx_t *ctx, -// unsigned char digest[SM3_DIGEST_LENGTH] ); -void sm3_4way_compress( __m128i *digest, __m128i *block ); - -void sm3_4way(void *cc, const void *data, size_t len); +void sm3_4way_update(void *cc, const void *data, size_t len); void sm3_4way_close(void *cc, void *dst); +#if defined(__AVX2__) + +typedef struct { + __m256i block[16] __attribute__ ((aligned (64))); + __m256i digest[8]; + uint32_t nblocks; + uint32_t num; +} sm3_8way_ctx_t; + +void sm3_8way_init( sm3_8way_ctx_t *ctx ); +void sm3_8way_update(void *cc, const void *data, size_t len); +void sm3_8way_close(void *cc, void *dst); + +#endif + #ifdef __cplusplus } #endif diff --git a/algo/x11/c11-4way.c b/algo/x11/c11-4way.c index fcae00c..529bac4 100644 --- a/algo/x11/c11-4way.c +++ b/algo/x11/c11-4way.c @@ -14,21 +14,32 @@ #include "algo/shavite/sph_shavite.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + 
#include "algo/echo/echo-hash-4way.h" +#endif #if defined (C11_8WAY) typedef struct { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; hashState_echo echo; +#endif } c11_8way_ctx_holder; c11_8way_ctx_holder c11_8way_ctx; @@ -37,22 +48,28 @@ void init_c11_8way_ctx() { blake512_8way_init( &c11_8way_ctx.blake ); bmw512_8way_init( &c11_8way_ctx.bmw ); - init_groestl( &c11_8way_ctx.groestl, 64 ); skein512_8way_init( &c11_8way_ctx.skein ); jh512_8way_init( &c11_8way_ctx.jh ); keccak512_8way_init( &c11_8way_ctx.keccak ); luffa_4way_init( &c11_8way_ctx.luffa, 512 ); cube_4way_init( &c11_8way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &c11_8way_ctx.shavite ); simd_4way_init( &c11_8way_ctx.simd, 512 ); +#if defined(__VAES__) + groestl512_4way_init( &c11_8way_ctx.groestl, 64 ); + shavite512_4way_init( &c11_8way_ctx.shavite ); + echo_4way_init( &c11_8way_ctx.echo, 512 ); +#else + init_groestl( &c11_8way_ctx.groestl, 64 ); + sph_shavite512_init( &c11_8way_ctx.shavite ); init_echo( &c11_8way_ctx.echo, 512 ); +#endif } void c11_8way_hash( void *state, const void *input ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); - uint64_t vhash0[4*8] __attribute__ ((aligned (64))); - uint64_t vhash1[4*8] __attribute__ ((aligned (64))); + uint64_t vhashA[4*8] __attribute__ ((aligned (64))); + uint64_t vhashB[4*8] __attribute__ ((aligned (64))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); @@ -72,11 +89,21 @@ void c11_8way_hash( void *state, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); - // Serial +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - // 3 Groestl update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); @@ -93,10 +120,11 @@ void c11_8way_hash( void *state, const void *input ) memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); - // 4way intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + // 4 JH jh512_8way_update( &ctx.jh, vhash, 64 ); jh512_8way_close( &ctx.jh, vhash ); @@ -109,20 +137,27 @@ void c11_8way_hash( void *state, const void *input ) skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); - rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 ); + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); luffa_4way_init( 
&ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); - cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 ); +#if defined(__VAES__) + + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); - // 9 Shavite sph_shavite512( &ctx.shavite, hash0, 64 ); sph_shavite512_close( &ctx.shavite, hash0 ); memcpy( &ctx.shavite, &c11_8way_ctx.shavite, @@ -154,16 +189,29 @@ void c11_8way_hash( void *state, const void *input ) sph_shavite512( &ctx.shavite, hash7, 64 ); sph_shavite512_close( &ctx.shavite, hash7 ); - // 10 Simd - intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 512 ); - intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 512 ); - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128( hash4, hash5, hash6, hash7, vhash, 512 ); + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); - // 11 Echo +#endif + + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + +#if defined(__VAES__) + + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, 512 ); memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) ); @@ -188,6 +236,8 @@ void c11_8way_hash( void *state, const void *input ) update_final_echo( &ctx.echo, (BitSequence *)hash7, (const BitSequence *) hash7, 512 ); +#endif + memcpy( state, hash0, 32 ); memcpy( state+ 32, hash1, 32 ); memcpy( state+ 64, hash2, 32 ); @@ -282,11 +332,11 @@ void c11_4way_hash( void *state, const void *input ) memcpy( &ctx, &c11_4way_ctx, sizeof(c11_4way_ctx) ); // 1 Blake 4way - blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); // 2 Bmw - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); // Serial @@ -305,15 +355,15 @@ void c11_4way_hash( void *state, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); // 4 JH - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); // 5 Keccak - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); // 6 Skein 
- skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); // Serial diff --git a/algo/x11/c11-gate.c b/algo/x11/c11-gate.c index be0750e..f9d50ce 100644 --- a/algo/x11/c11-gate.c +++ b/algo/x11/c11-gate.c @@ -15,7 +15,7 @@ bool register_c11_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_c11; gate->hash = (void*)&c11_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/x11/timetravel-4way.c b/algo/x11/timetravel-4way.c index d1f51c5..94d36f7 100644 --- a/algo/x11/timetravel-4way.c +++ b/algo/x11/timetravel-4way.c @@ -84,13 +84,13 @@ void timetravel_4way_hash(void *output, const void *input) switch ( permutation[i] ) { case 0: - blake512_4way( &ctx.blake, vhashA, dataLen ); + blake512_4way_update( &ctx.blake, vhashA, dataLen ); blake512_4way_close( &ctx.blake, vhashB ); if ( i == 7 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); break; case 1: - bmw512_4way( &ctx.bmw, vhashA, dataLen ); + bmw512_4way_update( &ctx.bmw, vhashA, dataLen ); bmw512_4way_close( &ctx.bmw, vhashB ); if ( i == 7 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); @@ -112,19 +112,19 @@ void timetravel_4way_hash(void *output, const void *input) intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 ); break; case 3: - skein512_4way( &ctx.skein, vhashA, dataLen ); + skein512_4way_update( &ctx.skein, vhashA, dataLen ); skein512_4way_close( &ctx.skein, vhashB ); if ( i == 7 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); break; case 4: - jh512_4way( &ctx.jh, vhashA, dataLen ); + jh512_4way_update( &ctx.jh, vhashA, dataLen ); jh512_4way_close( &ctx.jh, vhashB ); if ( i == 7 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); break; case 5: - keccak512_4way( &ctx.keccak, vhashA, dataLen ); + keccak512_4way_update( &ctx.keccak, vhashA, dataLen ); keccak512_4way_close( &ctx.keccak, vhashB ); if ( i == 7 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); diff --git a/algo/x11/timetravel10-4way.c b/algo/x11/timetravel10-4way.c index f4c016d..9353124 100644 --- a/algo/x11/timetravel10-4way.c +++ b/algo/x11/timetravel10-4way.c @@ -90,13 +90,13 @@ void timetravel10_4way_hash(void *output, const void *input) switch ( permutation[i] ) { case 0: - blake512_4way( &ctx.blake, vhashA, dataLen ); + blake512_4way_update( &ctx.blake, vhashA, dataLen ); blake512_4way_close( &ctx.blake, vhashB ); if ( i == 9 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); break; case 1: - bmw512_4way( &ctx.bmw, vhashA, dataLen ); + bmw512_4way_update( &ctx.bmw, vhashA, dataLen ); bmw512_4way_close( &ctx.bmw, vhashB ); if ( i == 9 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); @@ -118,19 +118,19 @@ void timetravel10_4way_hash(void *output, const void *input) intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 ); break; case 3: - skein512_4way( &ctx.skein, vhashA, dataLen ); + skein512_4way_update( &ctx.skein, vhashA, dataLen ); skein512_4way_close( &ctx.skein, vhashB ); if ( i == 9 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); break; case 4: - jh512_4way( &ctx.jh, vhashA, dataLen ); + jh512_4way_update( &ctx.jh, vhashA, dataLen ); jh512_4way_close( &ctx.jh, vhashB ); if ( i == 9 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); break; case 5: - keccak512_4way( &ctx.keccak, 
vhashA, dataLen ); + keccak512_4way_update( &ctx.keccak, vhashA, dataLen ); keccak512_4way_close( &ctx.keccak, vhashB ); if ( i == 9 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); diff --git a/algo/x11/tribus-4way.c b/algo/x11/tribus-4way.c index 4be0286..df49600 100644 --- a/algo/x11/tribus-4way.c +++ b/algo/x11/tribus-4way.c @@ -6,6 +6,9 @@ #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" #include "algo/echo/aes_ni/hash_api.h" +#if defined(__VAES__) + #include "algo/echo/echo-hash-4way.h" +#endif #if defined(TRIBUS_8WAY) @@ -14,6 +17,8 @@ static __thread jh512_8way_context ctx_mid; void tribus_hash_8way( void *state, const void *input ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); + uint64_t vhashA[4*8] __attribute__ ((aligned (64))); + uint64_t vhashB[4*8] __attribute__ ((aligned (64))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); @@ -24,7 +29,11 @@ void tribus_hash_8way( void *state, const void *input ) uint64_t hash7[8] __attribute__ ((aligned (64))); jh512_8way_context ctx_jh; keccak512_8way_context ctx_keccak; +#if defined(__VAES__) + echo_4way_context ctx_echo; +#else hashState_echo ctx_echo; +#endif memcpy( &ctx_jh, &ctx_mid, sizeof(ctx_mid) ); jh512_8way_update( &ctx_jh, input + (64<<3), 16 ); @@ -34,10 +43,23 @@ void tribus_hash_8way( void *state, const void *input ) keccak512_8way_update( &ctx_keccak, vhash, 64 ); keccak512_8way_close( &ctx_keccak, vhash ); +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + echo_4way_init( &ctx_echo, 512 ); + echo_4way_update_close( &ctx_echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx_echo, 512 ); + echo_4way_update_close( &ctx_echo, vhashB, vhashB, 512 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + +#else + dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, 512 ); - // hash echo serially init_echo( &ctx_echo, 512 ); update_final_echo( &ctx_echo, (BitSequence *) hash0, (const BitSequence *) hash0, 512 ); @@ -63,6 +85,8 @@ void tribus_hash_8way( void *state, const void *input ) update_final_echo( &ctx_echo, (BitSequence *) hash7, (const BitSequence *) hash7, 512 ); +#endif + memcpy( state, hash0, 32 ); memcpy( state+32, hash1, 32 ); memcpy( state+64, hash2, 32 ); diff --git a/algo/x11/tribus-gate.c b/algo/x11/tribus-gate.c index 794ec31..3d8d171 100644 --- a/algo/x11/tribus-gate.c +++ b/algo/x11/tribus-gate.c @@ -2,7 +2,7 @@ bool register_tribus_algo( algo_gate_t* gate ) { - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; #if defined (TRIBUS_8WAY) gate->scanhash = (void*)&scanhash_tribus_8way; gate->hash = (void*)&tribus_hash_8way; diff --git a/algo/x11/x11-4way.c b/algo/x11/x11-4way.c index a30cbc0..2fe47a7 100644 --- a/algo/x11/x11-4way.c +++ b/algo/x11/x11-4way.c @@ -14,21 +14,32 @@ #include "algo/shavite/sph_shavite.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif #if defined (X11_8WAY) typedef struct { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context 
keccak; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; hashState_echo echo; +#endif } x11_8way_ctx_holder; x11_8way_ctx_holder x11_8way_ctx; @@ -37,22 +48,28 @@ void init_x11_8way_ctx() { blake512_8way_init( &x11_8way_ctx.blake ); bmw512_8way_init( &x11_8way_ctx.bmw ); - init_groestl( &x11_8way_ctx.groestl, 64 ); skein512_8way_init( &x11_8way_ctx.skein ); jh512_8way_init( &x11_8way_ctx.jh ); keccak512_8way_init( &x11_8way_ctx.keccak ); luffa_4way_init( &x11_8way_ctx.luffa, 512 ); cube_4way_init( &x11_8way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &x11_8way_ctx.shavite ); simd_4way_init( &x11_8way_ctx.simd, 512 ); +#if defined(__VAES__) + groestl512_4way_init( &x11_8way_ctx.groestl, 64 ); + shavite512_4way_init( &x11_8way_ctx.shavite ); + echo_4way_init( &x11_8way_ctx.echo, 512 ); +#else + init_groestl( &x11_8way_ctx.groestl, 64 ); + sph_shavite512_init( &x11_8way_ctx.shavite ); init_echo( &x11_8way_ctx.echo, 512 ); +#endif } void x11_8way_hash( void *state, const void *input ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); - uint64_t vhash0[4*8] __attribute__ ((aligned (64))); - uint64_t vhash1[4*8] __attribute__ ((aligned (64))); + uint64_t vhashA[4*8] __attribute__ ((aligned (64))); + uint64_t vhashB[4*8] __attribute__ ((aligned (64))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); @@ -61,7 +78,6 @@ void x11_8way_hash( void *state, const void *input ) uint64_t hash5[8] __attribute__ ((aligned (64))); uint64_t hash6[8] __attribute__ ((aligned (64))); uint64_t hash7[8] __attribute__ ((aligned (64))); - x11_8way_ctx_holder ctx; memcpy( &ctx, &x11_8way_ctx, sizeof(x11_8way_ctx) ); blake512_8way_update( &ctx.blake, input, 80 ); @@ -70,7 +86,18 @@ void x11_8way_hash( void *state, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); - // Serial +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -97,10 +124,11 @@ void x11_8way_hash( void *state, const void *input ) sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); - // 4way intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); @@ -110,18 +138,26 @@ void x11_8way_hash( void *state, const void *input ) keccak512_8way_update( &ctx.keccak, vhash, 64 ); keccak512_8way_close( &ctx.keccak, vhash ); - rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 ); + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); - cube_4way_update_close( &ctx.cube, 
vhash0, vhash0, 64 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 ); +#if defined(__VAES__) + + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); sph_shavite512( &ctx.shavite, hash0, 64 ); sph_shavite512_close( &ctx.shavite, hash0 ); @@ -154,13 +190,28 @@ void x11_8way_hash( void *state, const void *input ) sph_shavite512( &ctx.shavite, hash7, 64 ); sph_shavite512_close( &ctx.shavite, hash7 ); - intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); + +#endif + + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + +#if defined(__VAES__) + + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, 512 ); @@ -186,6 +237,8 @@ void x11_8way_hash( void *state, const void *input ) update_final_echo( &ctx.echo, (BitSequence *)hash7, (const BitSequence *) hash7, 512 ); +#endif + memcpy( state, hash0, 32 ); memcpy( state+ 32, hash1, 32 ); memcpy( state+ 64, hash2, 32 ); @@ -282,11 +335,11 @@ void x11_4way_hash( void *state, const void *input ) memcpy( &ctx, &x11_4way_ctx, sizeof(x11_4way_ctx) ); // 1 Blake 4way - blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); // 2 Bmw - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); // Serial @@ -305,15 +358,15 @@ void x11_4way_hash( void *state, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); // 4 Skein - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); // 5 JH - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); // 6 Keccak - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); diff --git a/algo/x11/x11-gate.c b/algo/x11/x11-gate.c index 132996a..50b5480 100644 --- a/algo/x11/x11-gate.c +++ b/algo/x11/x11-gate.c @@ -15,7 +15,7 @@ bool register_x11_algo( 
algo_gate_t *gate ) gate->scanhash = (void*)&scanhash_x11; gate->hash = (void*)&x11_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT ; return true; }; diff --git a/algo/x11/x11evo-4way.c b/algo/x11/x11evo-4way.c index 8fe1512..11e5366 100644 --- a/algo/x11/x11evo-4way.c +++ b/algo/x11/x11evo-4way.c @@ -85,12 +85,12 @@ void x11evo_4way_hash( void *state, const void *input ) switch ( idx ) { case 0: - blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 ); break; case 1: - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); if ( i >= len-1 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 ); @@ -112,19 +112,19 @@ void x11evo_4way_hash( void *state, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 64<<3 ); break; case 3: - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); if ( i >= len-1 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 ); break; case 4: - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); if ( i >= len-1 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 ); break; case 5: - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); if ( i >= len-1 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 ); diff --git a/algo/x11/x11gost-4way.c b/algo/x11/x11gost-4way.c index f3713d7..3cf5b67 100644 --- a/algo/x11/x11gost-4way.c +++ b/algo/x11/x11gost-4way.c @@ -15,22 +15,33 @@ #include "algo/shavite/sph_shavite.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif #if defined (X11GOST_8WAY) typedef struct { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; sph_gost512_context gost; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; hashState_echo echo; +#endif } x11gost_8way_ctx_holder; x11gost_8way_ctx_holder x11gost_8way_ctx; @@ -39,21 +50,29 @@ void init_x11gost_8way_ctx() { blake512_8way_init( &x11gost_8way_ctx.blake ); bmw512_8way_init( &x11gost_8way_ctx.bmw ); - init_groestl( &x11gost_8way_ctx.groestl, 64 ); skein512_8way_init( &x11gost_8way_ctx.skein ); jh512_8way_init( &x11gost_8way_ctx.jh ); keccak512_8way_init( &x11gost_8way_ctx.keccak ); sph_gost512_init( &x11gost_8way_ctx.gost ); luffa_4way_init( &x11gost_8way_ctx.luffa, 512 ); cube_4way_init( &x11gost_8way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &x11gost_8way_ctx.shavite ); simd_4way_init( &x11gost_8way_ctx.simd, 512 ); +#if defined(__VAES__) + groestl512_4way_init( &x11gost_8way_ctx.groestl, 64 ); + shavite512_4way_init( &x11gost_8way_ctx.shavite ); + echo_4way_init( &x11gost_8way_ctx.echo, 512 ); 
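/*
 * Every converted algorithm swaps its AES-class members at compile
 * time and mirrors the same #if in its init function, as the two
 * branches around this point show. A minimal sketch of the holder
 * shape, using the member types named in this patch:
 */
#if 0
typedef struct {
#if defined(__VAES__)
   groestl512_4way_context groestl;   /* four lanes per call          */
   shavite512_4way_context shavite;
   echo_4way_context       echo;
#else
   hashState_groestl       groestl;   /* one lane, cycled eight times */
   sph_shavite512_context  shavite;
   hashState_echo          echo;
#endif
} aes_stage_contexts;
#endif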
+#else + init_groestl( &x11gost_8way_ctx.groestl, 64 ); + sph_shavite512_init( &x11gost_8way_ctx.shavite ); init_echo( &x11gost_8way_ctx.echo, 512 ); +#endif } void x11gost_8way_hash( void *state, const void *input ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); + uint64_t vhashA[4*8] __attribute__ ((aligned (64))); + uint64_t vhashB[4*8] __attribute__ ((aligned (64))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); @@ -72,7 +91,18 @@ void x11gost_8way_hash( void *state, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); - // Serial +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -99,10 +129,11 @@ void x11gost_8way_hash( void *state, const void *input ) sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); - // 4way intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); @@ -140,20 +171,28 @@ void x11gost_8way_hash( void *state, const void *input ) sph_gost512( &ctx.gost, hash7, 64 ); sph_gost512_close( &ctx.gost, hash7 ); + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); - // Luffa + Cube - intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); - luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 ); - cube_4way_update_close( &ctx.cube, vhash, vhash, 64 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); luffa_4way_init( &ctx.luffa, 512 ); - cube_4way_init( &ctx.cube, 512, 16, 32 ); - luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 ); - cube_4way_update_close( &ctx.cube, vhash, vhash, 64 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); - sph_shavite512( &ctx.shavite, hash0, 64 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); + +#if defined(__VAES__) + + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + sph_shavite512( &ctx.shavite, hash0, 64 ); sph_shavite512_close( &ctx.shavite, hash0 ); memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite, sizeof(sph_shavite512_context) ); @@ -184,14 +223,29 @@ sph_shavite512( &ctx.shavite, hash7, 64 ); sph_shavite512_close( &ctx.shavite, hash7 ); - intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); - simd_4way_init( &ctx.simd, 512 ); -
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); +#endif + + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + +#if defined(__VAES__) + + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, 512 ); memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) ); @@ -216,6 +270,8 @@ void x11gost_8way_hash( void *state, const void *input ) update_final_echo( &ctx.echo, (BitSequence *)hash7, (const BitSequence *) hash7, 512 ); +#endif + memcpy( state, hash0, 32 ); memcpy( state+ 32, hash1, 32 ); memcpy( state+ 64, hash2, 32 ); @@ -310,10 +366,10 @@ void x11gost_4way_hash( void *state, const void *input ) x11gost_4way_ctx_holder ctx; memcpy( &ctx, &x11gost_4way_ctx, sizeof(x11gost_4way_ctx) ); - blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); // Serial @@ -333,13 +389,13 @@ void x11gost_4way_hash( void *state, const void *input ) // 4way intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); // Serial diff --git a/algo/x12/x12-4way.c b/algo/x12/x12-4way.c index ed4d131..8ae7960 100644 --- a/algo/x12/x12-4way.c +++ b/algo/x12/x12-4way.c @@ -16,6 +16,11 @@ #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif #if defined(X12_8WAY) @@ -23,16 +28,22 @@ typedef struct { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; - hashState_echo echo; hamsi512_8way_context hamsi; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; + hashState_echo echo; +#endif } x12_8way_ctx_holder; x12_8way_ctx_holder x12_8way_ctx __attribute__ ((aligned (64))); @@ -41,31 +52,29 @@ void init_x12_8way_ctx() { blake512_8way_init( &x12_8way_ctx.blake ); bmw512_8way_init( &x12_8way_ctx.bmw ); - init_groestl( &x12_8way_ctx.groestl, 64 ); skein512_8way_init( 
&x12_8way_ctx.skein ); jh512_8way_init( &x12_8way_ctx.jh ); keccak512_8way_init( &x12_8way_ctx.keccak ); luffa_4way_init( &x12_8way_ctx.luffa, 512 ); cube_4way_init( &x12_8way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &x12_8way_ctx.shavite ); simd_4way_init( &x12_8way_ctx.simd, 512 ); - init_echo( &x12_8way_ctx.echo, 512 ); hamsi512_8way_init( &x12_8way_ctx.hamsi ); +#if defined(__VAES__) + groestl512_4way_init( &x12_8way_ctx.groestl, 64 ); + shavite512_4way_init( &x12_8way_ctx.shavite ); + echo_4way_init( &x12_8way_ctx.echo, 512 ); +#else + init_groestl( &x12_8way_ctx.groestl, 64 ); + sph_shavite512_init( &x12_8way_ctx.shavite ); + init_echo( &x12_8way_ctx.echo, 512 ); +#endif }; void x12_8way_hash( void *state, const void *input ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); - uint64_t vhash0[4*8] __attribute__ ((aligned (64))); - uint64_t vhash1[4*8] __attribute__ ((aligned (64))); - uint64_t hash0[8] __attribute__ ((aligned (64))); - uint64_t hash1[8] __attribute__ ((aligned (64))); - uint64_t hash2[8] __attribute__ ((aligned (64))); - uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t hash4[8] __attribute__ ((aligned (64))); - uint64_t hash5[8] __attribute__ ((aligned (64))); - uint64_t hash6[8] __attribute__ ((aligned (64))); - uint64_t hash7[8] __attribute__ ((aligned (64))); + uint64_t vhashA[4*8] __attribute__ ((aligned (64))); + uint64_t vhashB[4*8] __attribute__ ((aligned (64))); x12_8way_ctx_holder ctx; memcpy( &ctx, &x12_8way_ctx, sizeof(x12_8way_ctx) ); @@ -75,18 +84,36 @@ void x12_8way_hash( void *state, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); - rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 ); + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); - cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 ); +#if defined(__VAES__) + + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t hash4[8] __attribute__ ((aligned (64))); + uint64_t hash5[8] __attribute__ ((aligned (64))); + uint64_t hash6[8] __attribute__ ((aligned (64))); + uint64_t hash7[8] __attribute__ ((aligned (64))); + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); sph_shavite512( &ctx.shavite, hash0, 64 ); sph_shavite512_close( &ctx.shavite, hash0 ); @@ -119,14 +146,35 @@ void x12_8way_hash( void *state, const void *input ) sph_shavite512( &ctx.shavite, hash7, 64 ); sph_shavite512_close( &ctx.shavite, hash7 ); - intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); - 
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); +#endif + + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + +#if defined(__VAES__) + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, (const BitSequence *) hash1, 512 ); memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) ); @@ -174,6 +222,8 @@ void x12_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); @@ -272,10 +322,10 @@ void x12_4way_hash( void *state, const void *input ) x12_4way_ctx_holder ctx; memcpy( &ctx, &x12_4way_ctx, sizeof(x12_4way_ctx) ); - blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -328,16 +378,16 @@ void x12_4way_hash( void *state, const void *input ) // Parallel 4way 64 bit intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64( state, state+32, state+64, state+96, vhash, 256 ); diff --git a/algo/x12/x12-gate.c b/algo/x12/x12-gate.c index f495747..706cf54 100644 --- a/algo/x12/x12-gate.c +++ b/algo/x12/x12-gate.c @@ -15,7 +15,7 @@ bool register_x12_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x12; gate->hash = (void*)&x12hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/x13/phi1612-4way.c b/algo/x13/phi1612-4way.c index 7750e75..eb143e8 100644 --- a/algo/x13/phi1612-4way.c +++ b/algo/x13/phi1612-4way.c @@ -10,6 +10,9 @@ #include "algo/fugue/sph_fugue.h" #include 
"algo/gost/sph_gost.h" #include "algo/echo/aes_ni/hash_api.h" +#if defined(__VAES__) + #include "algo/echo/echo-hash-4way.h" +#endif #if defined(PHI1612_8WAY) @@ -19,7 +22,11 @@ typedef struct { cube_4way_context cube; sph_fugue512_context fugue; sph_gost512_context gost; +#if defined(__VAES__) + echo_4way_context echo; +#else hashState_echo echo; +#endif } phi1612_8way_ctx_holder; phi1612_8way_ctx_holder phi1612_8way_ctx __attribute__ ((aligned (64))); @@ -31,7 +38,11 @@ void init_phi1612_8way_ctx() cube_4way_init( &phi1612_8way_ctx.cube, 512, 16, 32 ); sph_fugue512_init( &phi1612_8way_ctx.fugue ); sph_gost512_init( &phi1612_8way_ctx.gost ); +#if defined(__VAES__) + echo_4way_init( &phi1612_8way_ctx.echo, 512 ); +#else init_echo( &phi1612_8way_ctx.echo, 512 ); +#endif }; void phi1612_8way_hash( void *state, const void *input ) @@ -118,6 +129,19 @@ void phi1612_8way_hash( void *state, const void *input ) sph_gost512_close( &ctx.gost, hash7 ); // Echo + +#if defined(__VAES__) + + intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); + echo_4way_update_close( &ctx.echo, vhash, vhash, 512 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhash, vhash, 512 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + +#else + update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, 512 ); init_echo( &ctx.echo, 512 ); @@ -142,6 +166,8 @@ void phi1612_8way_hash( void *state, const void *input ) update_final_echo( &ctx.echo, (BitSequence *)hash7, (const BitSequence *) hash7, 512 ); +#endif + memcpy( state, hash0, 32 ); memcpy( state+ 32, hash1, 32 ); memcpy( state+ 64, hash2, 32 ); @@ -225,11 +251,11 @@ void phi1612_4way_hash( void *state, const void *input ) memcpy( &ctx, &phi1612_4way_ctx, sizeof(phi1612_4way_ctx) ); // Skein parallel 4way - skein512_4way( &ctx.skein, input, 80 ); + skein512_4way_update( &ctx.skein, input, 80 ); skein512_4way_close( &ctx.skein, vhash ); // JH - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); // Serial to the end diff --git a/algo/x13/phi1612-gate.c b/algo/x13/phi1612-gate.c index 1cfe3fa..ef3e772 100644 --- a/algo/x13/phi1612-gate.c +++ b/algo/x13/phi1612-gate.c @@ -15,7 +15,7 @@ bool register_phi1612_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_phi1612; gate->hash = (void*)&phi1612_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/x13/skunk-4way.c b/algo/x13/skunk-4way.c index 81899d0..566f545 100644 --- a/algo/x13/skunk-4way.c +++ b/algo/x13/skunk-4way.c @@ -168,7 +168,7 @@ void skunk_4way_hash( void *output, const void *input ) skunk_4way_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, &skunk_4way_ctx, sizeof(skunk_4way_ctx) ); - skein512_4way( &ctx.skein, input, 80 ); + skein512_4way_update( &ctx.skein, input, 80 ); skein512_4way_close( &ctx.skein, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); diff --git a/algo/x13/x13-4way.c b/algo/x13/x13-4way.c index 40b4b5b..2173f01 100644 --- a/algo/x13/x13-4way.c +++ b/algo/x13/x13-4way.c @@ -17,23 +17,34 @@ #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" #include "algo/fugue/sph_fugue.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include 
"algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif #if defined(X13_8WAY) typedef struct { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; - hashState_echo echo; hamsi512_8way_context hamsi; sph_fugue512_context fugue; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; + hashState_echo echo; +#endif } x13_8way_ctx_holder; x13_8way_ctx_holder x13_8way_ctx; @@ -42,24 +53,30 @@ void init_x13_8way_ctx() { blake512_8way_init( &x13_8way_ctx.blake ); bmw512_8way_init( &x13_8way_ctx.bmw ); - init_groestl( &x13_8way_ctx.groestl, 64 ); skein512_8way_init( &x13_8way_ctx.skein ); jh512_8way_init( &x13_8way_ctx.jh ); keccak512_8way_init( &x13_8way_ctx.keccak ); luffa_4way_init( &x13_8way_ctx.luffa, 512 ); cube_4way_init( &x13_8way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &x13_8way_ctx.shavite ); simd_4way_init( &x13_8way_ctx.simd, 512 ); - init_echo( &x13_8way_ctx.echo, 512 ); hamsi512_8way_init( &x13_8way_ctx.hamsi ); sph_fugue512_init( &x13_8way_ctx.fugue ); +#if defined(__VAES__) + groestl512_4way_init( &x13_8way_ctx.groestl, 64 ); + shavite512_4way_init( &x13_8way_ctx.shavite ); + echo_4way_init( &x13_8way_ctx.echo, 512 ); +#else + init_groestl( &x13_8way_ctx.groestl, 64 ); + sph_shavite512_init( &x13_8way_ctx.shavite ); + init_echo( &x13_8way_ctx.echo, 512 ); +#endif } void x13_8way_hash( void *state, const void *input ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); - uint64_t vhash0[4*8] __attribute__ ((aligned (64))); - uint64_t vhash1[4*8] __attribute__ ((aligned (64))); + uint64_t vhashA[4*8] __attribute__ ((aligned (64))); + uint64_t vhashB[4*8] __attribute__ ((aligned (64))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); @@ -76,6 +93,19 @@ void x13_8way_hash( void *state, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -104,6 +134,9 @@ void x13_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); + +#endif + skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); @@ -112,21 +145,27 @@ void x13_8way_hash( void *state, const void *input ) keccak512_8way_update( &ctx.keccak, vhash, 64 ); keccak512_8way_close( &ctx.keccak, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); - rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 ); + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 
64 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); - cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 ); +#if defined(__VAES__) + + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); sph_shavite512( &ctx.shavite, hash0, 64 ); sph_shavite512_close( &ctx.shavite, hash0 ); @@ -159,13 +198,27 @@ void x13_8way_hash( void *state, const void *input ) sph_shavite512( &ctx.shavite, hash7, 64 ); sph_shavite512_close( &ctx.shavite, hash7 ); - intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); + +#endif + + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + +#if defined(__VAES__) + + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, 512 ); @@ -193,6 +246,9 @@ void x13_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); + +#endif + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); hamsi512_8way_close( &ctx.hamsi, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, @@ -321,11 +377,11 @@ void x13_4way_hash( void *state, const void *input ) memcpy( &ctx, &x13_4way_ctx, sizeof(x13_4way_ctx) ); // 1 Blake - blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); // 2 Bmw - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); // Serial @@ -344,15 +400,15 @@ void x13_4way_hash( void *state, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); // 4 Skein - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); // 5 JH - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); // 6 Keccak - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); // Serial @@ -416,7 +472,7 @@ void x13_4way_hash( void *state, const void *input ) // 12 Hamsi parallel 4way 32 bit intrlv_4x64( vhash, 
hash0, hash1, hash2, hash3, 512 ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); diff --git a/algo/x13/x13-gate.c b/algo/x13/x13-gate.c index 366185c..ad8abdb 100644 --- a/algo/x13/x13-gate.c +++ b/algo/x13/x13-gate.c @@ -15,7 +15,7 @@ bool register_x13_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x13; gate->hash = (void*)&x13hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/x13/x13bcd-4way.c b/algo/x13/x13bcd-4way.c index 706ea6f..01fb8a6 100644 --- a/algo/x13/x13bcd-4way.c +++ b/algo/x13/x13bcd-4way.c @@ -1,7 +1,4 @@ #include "x13sm3-gate.h" - -#if defined(X13SM3_4WAY) - #include #include #include @@ -13,12 +10,328 @@ #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" #include "algo/cubehash/cubehash_sse2.h" +#include "algo/cubehash/cube-hash-2way.h" #include "algo/shavite/sph_shavite.h" #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/sm3/sm3-hash-4way.h" #include "algo/hamsi/hamsi-hash-4way.h" #include "algo/fugue/sph_fugue.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif + +#if defined(X13BCD_8WAY) + +typedef struct { + blake512_8way_context blake; + bmw512_8way_context bmw; + skein512_8way_context skein; + jh512_8way_context jh; + keccak512_8way_context keccak; + cube_4way_context cube; + simd_4way_context simd; + sm3_8way_ctx_t sm3; + hamsi512_8way_context hamsi; + sph_fugue512_context fugue; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; + hashState_echo echo; +#endif +} x13bcd_8way_ctx_holder; + +x13bcd_8way_ctx_holder x13bcd_8way_ctx __attribute__ ((aligned (64))); +static __thread blake512_8way_context x13bcd_8way_ctx_mid; + +void init_x13bcd_8way_ctx() +{ + blake512_8way_init( &x13bcd_8way_ctx.blake ); + bmw512_8way_init( &x13bcd_8way_ctx.bmw ); + skein512_8way_init( &x13bcd_8way_ctx.skein ); + jh512_8way_init( &x13bcd_8way_ctx.jh ); + keccak512_8way_init( &x13bcd_8way_ctx.keccak ); + cube_4way_init( &x13bcd_8way_ctx.cube, 512, 16, 32 ); + simd_4way_init( &x13bcd_8way_ctx.simd, 512 ); + sm3_8way_init( &x13bcd_8way_ctx.sm3 ); + hamsi512_8way_init( &x13bcd_8way_ctx.hamsi ); + sph_fugue512_init( &x13bcd_8way_ctx.fugue ); +#if defined(__VAES__) + groestl512_4way_init( &x13bcd_8way_ctx.groestl, 64 ); + shavite512_4way_init( &x13bcd_8way_ctx.shavite ); + echo_4way_init( &x13bcd_8way_ctx.echo, 512 ); +#else + init_groestl( &x13bcd_8way_ctx.groestl, 64 ); + sph_shavite512_init( &x13bcd_8way_ctx.shavite ); + init_echo( &x13bcd_8way_ctx.echo, 512 ); +#endif +}; + +void x13bcd_8way_hash( void *state, const void *input ) +{ + uint64_t vhash[8*8] __attribute__ ((aligned (128))); + uint64_t vhashA[8*8] __attribute__ ((aligned (64))); + uint64_t vhashB[8*8] __attribute__ ((aligned (64))); + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t hash4[8] __attribute__ ((aligned (64))); + uint64_t hash5[8] 
__attribute__ ((aligned (64))); + uint64_t hash6[8] __attribute__ ((aligned (64))); + uint64_t hash7[8] __attribute__ ((aligned (64))); + x13bcd_8way_ctx_holder ctx; + memcpy( &ctx, &x13bcd_8way_ctx, sizeof(x13bcd_8way_ctx) ); + + // Blake + memcpy( &ctx.blake, &x13bcd_8way_ctx_mid, sizeof(x13bcd_8way_ctx_mid) ); + blake512_8way_update( &ctx.blake, input + (64<<3), 16 ); + blake512_8way_close( &ctx.blake, vhash ); + + // Bmw + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, vhash ); + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + + dintrlv_8x64_512( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash ); + + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); + +#endif + + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhash ); + + // JH + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + + // Keccak + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + + // SM3 parallel 32 bit + rintrlv_8x64_8x32( vhashA, vhash, 512 ); + memset( vhash, 0, sizeof vhash ); + sm3_8way_update( &ctx.sm3, vhashA, 64 ); + sm3_8way_close( &ctx.sm3, vhash ); + + rintrlv_8x32_4x128( vhashA, vhashB, vhash, 512 ); + + // Cube + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); + +#if defined(__VAES__) + + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + memcpy( &ctx.shavite, &x13bcd_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + memcpy( &ctx.shavite, &x13bcd_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash2, 64 ); + sph_shavite512_close( &ctx.shavite, hash2 ); + memcpy( &ctx.shavite, &x13bcd_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash3, 64 ); + sph_shavite512_close( &ctx.shavite, hash3 ); 
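/*
 * The serial fallback resets the one-lane SHAvite context between
 * lanes by copying the pre-initialized global holder rather than
 * calling sph_shavite512_init() again -- presumably cheaper than a
 * full re-init. The same pattern, reduced to its generic shape with a
 * stand-in type:
 */
#if 0
typedef struct { unsigned char state[128]; } lane_ctx_t;  /* stand-in */
static lane_ctx_t template_ctx;           /* initialized once at start */

static inline void reset_lane_ctx( lane_ctx_t *c )
{
   memcpy( c, &template_ctx, sizeof *c );  /* restore a fresh state */
}
#endif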
+ memcpy( &ctx.shavite, &x13bcd_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash4, 64 ); + sph_shavite512_close( &ctx.shavite, hash4 ); + memcpy( &ctx.shavite, &x13bcd_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash5, 64 ); + sph_shavite512_close( &ctx.shavite, hash5 ); + memcpy( &ctx.shavite, &x13bcd_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash6, 64 ); + sph_shavite512_close( &ctx.shavite, hash6 ); + memcpy( &ctx.shavite, &x13bcd_8way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash7, 64 ); + sph_shavite512_close( &ctx.shavite, hash7 ); + + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); + +#endif + + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + +#if defined(__VAES__) + + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + memcpy( &ctx.echo, &x13bcd_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + memcpy( &ctx.echo, &x13bcd_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash2, + (const BitSequence *) hash2, 512 ); + memcpy( &ctx.echo, &x13bcd_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash3, + (const BitSequence *) hash3, 512 ); + memcpy( &ctx.echo, &x13bcd_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash4, + (const BitSequence *) hash4, 512 ); + memcpy( &ctx.echo, &x13bcd_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash5, + (const BitSequence *) hash5, 512 ); + memcpy( &ctx.echo, &x13bcd_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash6, + (const BitSequence *) hash6, 512 ); + memcpy( &ctx.echo, &x13bcd_8way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash7, + (const BitSequence *) hash7, 512 ); + + intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7 ); + +#endif + + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + dintrlv_8x64_512( hash0, hash1, hash2, hash3, + hash4, hash5, hash6, hash7, vhash ); + + // Fugue serial + sph_fugue512( &ctx.fugue, hash0, 64 ); + sph_fugue512_close( &ctx.fugue, state ); + memcpy( &ctx.fugue, &x13bcd_8way_ctx.fugue, + sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash1, 64 ); + sph_fugue512_close( &ctx.fugue, state+32 ); + memcpy( &ctx.fugue, &x13bcd_8way_ctx.fugue, + sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash2, 64 ); + sph_fugue512_close( &ctx.fugue, state+64 ); + memcpy( &ctx.fugue, &x13bcd_8way_ctx.fugue, + sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash3, 64 ); + sph_fugue512_close( &ctx.fugue, state+96 ); + memcpy( &ctx.fugue, &x13bcd_8way_ctx.fugue, + sizeof(sph_fugue512_context) 
); + sph_fugue512( &ctx.fugue, hash4, 64 ); + sph_fugue512_close( &ctx.fugue, state+128 ); + memcpy( &ctx.fugue, &x13bcd_8way_ctx.fugue, + sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash5, 64 ); + sph_fugue512_close( &ctx.fugue, state+160 ); + memcpy( &ctx.fugue, &x13bcd_8way_ctx.fugue, + sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash6, 64 ); + sph_fugue512_close( &ctx.fugue, state+192 ); + memcpy( &ctx.fugue, &x13bcd_8way_ctx.fugue, + sizeof(sph_fugue512_context) ); + sph_fugue512( &ctx.fugue, hash7, 64 ); + sph_fugue512_close( &ctx.fugue, state+224 ); +} + +int scanhash_x13bcd_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*8] __attribute__ ((aligned (128))); + uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 8; + __m512i *noncev = (__m512i*)vdata + 9; // aligned + int thr_id = mythr->id; // thr_id arg is deprecated + const uint32_t Htarg = ptarget[7]; + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + + blake512_8way_init( &x13bcd_8way_ctx_mid ); + blake512_8way_update( &x13bcd_8way_ctx_mid, vdata, 64 ); + do + { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + + x13bcd_8way_hash( hash, vdata ); + pdata[19] = n; + + for ( int i = 0; i < 8; i++ ) + if ( (hash+(i<<3))[7] <= Htarg ) + if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) + { + pdata[19] = n+i; + submit_lane_solution( work, hash+(i<<3), mythr, i ); + } + n += 8; + } while ( ( n < last_nonce ) && !work_restart[thr_id].restart ); + + *hashes_done = n - first_nonce; + return 0; +} + + +#elif defined(X13BCD_4WAY) typedef struct { blake512_4way_context blake; @@ -68,11 +381,11 @@ void x13bcd_4way_hash( void *state, const void *input ) // Blake memcpy( &ctx.blake, &x13bcd_ctx_mid, sizeof(x13bcd_ctx_mid) ); - blake512_4way( &ctx.blake, input + (64<<2), 16 ); + blake512_4way_update( &ctx.blake, input + (64<<2), 16 ); blake512_4way_close( &ctx.blake, vhash ); // Bmw - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); // Serial @@ -91,15 +404,15 @@ void x13bcd_4way_hash( void *state, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); // Skein - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); // JH - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); // Keccak - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -118,7 +431,7 @@ void x13bcd_4way_hash( void *state, const void *input ) uint32_t sm3_hash3[32] __attribute__ ((aligned (32))); memset( sm3_hash3, 0, sizeof sm3_hash3 ); - sm3_4way( &ctx.sm3, vhash, 64 ); + sm3_4way_update( &ctx.sm3, vhash, 64 ); sm3_4way_close( &ctx.sm3, sm3_vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 ); @@ -171,20 +484,23 @@ void x13bcd_4way_hash( void *state, const void *input ) // Hamsi parallel 4x32x2 intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( 
&ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); // Fugue serial sph_fugue512( &ctx.fugue, hash0, 64 ); sph_fugue512_close( &ctx.fugue, hash0 ); - memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, + sizeof(sph_fugue512_context) ); sph_fugue512( &ctx.fugue, hash1, 64 ); sph_fugue512_close( &ctx.fugue, hash1 ); - memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, + sizeof(sph_fugue512_context) ); sph_fugue512( &ctx.fugue, hash2, 64 ); sph_fugue512_close( &ctx.fugue, hash2 ); - memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, sizeof(sph_fugue512_context) ); + memcpy( &ctx.fugue, &x13bcd_4way_ctx.fugue, + sizeof(sph_fugue512_context) ); sph_fugue512( &ctx.fugue, hash3, 64 ); sph_fugue512_close( &ctx.fugue, hash3 ); @@ -203,44 +519,33 @@ int scanhash_x13bcd_4way( struct work *work, uint32_t max_nonce, uint32_t *ptarget = work->target; uint32_t n = pdata[19]; const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 4; __m256i *noncev = (__m256i*)vdata + 9; // aligned - int thr_id = mythr->id; // thr_id arg is deprecated + int thr_id = mythr->id; const uint32_t Htarg = ptarget[7]; - uint64_t htmax[] = { 0, 0xF, 0xFF, - 0xFFF, 0xFFFF, 0x10000000 }; - uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, - 0xFFFFF000, 0xFFFF0000, 0 }; mm256_bswap32_intrlv80_4x64( vdata, pdata ); blake512_4way_init( &x13bcd_ctx_mid ); blake512_4way( &x13bcd_ctx_mid, vdata, 64 ); + do + { + *noncev = mm256_intrlv_blend_32( mm256_bswap_32( + _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); - for ( int m=0; m < 6; m++ ) - if ( Htarg <= htmax[m] ) - { - uint32_t mask = masks[m]; - do - { - *noncev = mm256_intrlv_blend_32( mm256_bswap_32( - _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + x13bcd_4way_hash( hash, vdata ); + pdata[19] = n; - x13bcd_4way_hash( hash, vdata ); - pdata[19] = n; - - for ( int i = 0; i < 4; i++ ) - if ( ( ( (hash+(i<<3))[7] & mask ) == 0 ) ) - if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) - { - pdata[19] = n+i; - submit_lane_solution( work, hash+(i<<3), mythr, i ); - } - n += 4; - } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); - break; - } - - *hashes_done = n - first_nonce + 1; + for ( int i = 0; i < 4; i++ ) + if ( (hash+(i<<3))[7] <= Htarg ) + if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) + { + pdata[19] = n+i; + submit_lane_solution( work, hash+(i<<3), mythr, i ); + } + n += 4; + } while ( ( n < last_nonce ) && !work_restart[thr_id].restart ); + *hashes_done = n - first_nonce; return 0; } diff --git a/algo/x13/x13sm3-4way.c b/algo/x13/x13sm3-4way.c index a107627..9cafa76 100644 --- a/algo/x13/x13sm3-4way.c +++ b/algo/x13/x13sm3-4way.c @@ -71,13 +71,11 @@ void x13sm3_4way_hash( void *state, const void *input ) // Blake memcpy( &ctx.blake, &x13sm3_ctx_mid, sizeof(x13sm3_ctx_mid) ); - blake512_4way( &ctx.blake, input + (64<<2), 16 ); - -// blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input + (64<<2), 16 ); blake512_4way_close( &ctx.blake, vhash ); // Bmw - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); // Serial @@ -96,15 +94,15 @@ void x13sm3_4way_hash( void *state, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); // Skein - skein512_4way( &ctx.skein, vhash, 64 
); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); // JH - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); // Keccak - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); // Serial to the end @@ -180,13 +178,13 @@ void x13sm3_4way_hash( void *state, const void *input ) uint32_t sm3_hash3[32] __attribute__ ((aligned (32))); memset( sm3_hash3, 0, sizeof sm3_hash3 ); - sm3_4way( &ctx.sm3, vhash, 64 ); + sm3_4way_update( &ctx.sm3, vhash, 64 ); sm3_4way_close( &ctx.sm3, sm3_vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 ); // Hamsi parallel 4x32x2 intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); diff --git a/algo/x13/x13sm3-gate.c b/algo/x13/x13sm3-gate.c index bc0fb92..fc2f934 100644 --- a/algo/x13/x13sm3-gate.c +++ b/algo/x13/x13sm3-gate.c @@ -17,7 +17,11 @@ bool register_x13sm3_algo( algo_gate_t* gate ) bool register_x13bcd_algo( algo_gate_t* gate ) { -#if defined (X13SM3_4WAY) +#if defined (X13BCD_8WAY) + init_x13bcd_8way_ctx(); + gate->scanhash = (void*)&scanhash_x13bcd_8way; + gate->hash = (void*)&x13bcd_8way_hash; +#elif defined (X13BCD_4WAY) init_x13bcd_4way_ctx(); gate->scanhash = (void*)&scanhash_x13bcd_4way; gate->hash = (void*)&x13bcd_4way_hash; @@ -26,7 +30,7 @@ bool register_x13bcd_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x13bcd; gate->hash = (void*)&x13bcd_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/x13/x13sm3-gate.h b/algo/x13/x13sm3-gate.h index f0047bf..fc6154a 100644 --- a/algo/x13/x13sm3-gate.h +++ b/algo/x13/x13sm3-gate.h @@ -5,13 +5,11 @@ #include #if defined(__AVX2__) && defined(__AES__) - #define X13SM3_4WAY + #define X13SM3_4WAY 1 #endif bool register_x13sm3_algo( algo_gate_t* gate ); -bool register_x13bcd_algo( algo_gate_t* gate ); - #if defined(X13SM3_4WAY) void x13sm3_4way_hash( void *state, const void *input ); @@ -19,18 +17,39 @@ int scanhash_x13sm3_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); void init_x13sm3_4way_ctx(); -void x13bcd_4way_hash( void *state, const void *input ); -int scanhash_x13bcd_4way( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); -void init_x13bcd_4way_ctx(); - -#endif +#else void x13sm3_hash( void *state, const void *input ); int scanhash_x13sm3( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); void init_x13sm3_ctx(); +#endif + +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define X13BCD_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) + #define X13BCD_4WAY 1 +#endif + +bool register_x13bcd_algo( algo_gate_t* gate ); + +#if defined(X13BCD_8WAY) + +void x13bcd_8way_hash( void *state, const void *input ); +int scanhash_x13bcd_8way( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); +void init_x13bcd_8way_ctx(); + +#elif defined(X13BCD_4WAY) + +void x13bcd_4way_hash( void *state, const void *input ); +int scanhash_x13bcd_4way( struct work *work, uint32_t max_nonce, + 
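The x13sm3 path above copies a cached Blake midstate (x13sm3_ctx_mid) instead of rehashing the constant first 64 bytes of the 80-byte header for every nonce. A toy model of that idiom, using a throwaway FNV-style incremental hash rather than the real blake512_4way API:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

typedef struct { uint64_t h; } toy_ctx;   /* stand-in for a hash state */

static void toy_init( toy_ctx *c ) { c->h = 0xcbf29ce484222325ull; }
static void toy_update( toy_ctx *c, const void *p, size_t n )
{
   const uint8_t *b = p;
   while ( n-- ) c->h = ( c->h ^ *b++ ) * 0x100000001b3ull;
}

static toy_ctx midstate;                  /* filled once per work unit */

void set_work( const uint8_t header[80] )
{
   toy_init( &midstate );
   toy_update( &midstate, header, 64 );   /* constant 64-byte prefix */
}

uint64_t hash_nonce( const uint8_t tail[16] )
{
   toy_ctx c = midstate;                  /* copy beats rehashing 64 bytes */
   toy_update( &c, tail, 16 );            /* the nonce lives in the tail */
   return c.h;
}

int main(void)
{
   uint8_t header[80] = {0};
   set_work( header );
   printf( "%016llx\n", (unsigned long long)hash_nonce( header + 64 ) );
}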
uint64_t *hashes_done, struct thr_info *mythr ); +void init_x13bcd_4way_ctx(); + +#else + void x13bcd_hash( void *state, const void *input ); int scanhash_x13bcd( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); @@ -38,3 +57,4 @@ void init_x13bcd_ctx(); #endif +#endif diff --git a/algo/x14/polytimos-4way.c b/algo/x14/polytimos-4way.c index 3e1cc69..09f99b1 100644 --- a/algo/x14/polytimos-4way.c +++ b/algo/x14/polytimos-4way.c @@ -34,14 +34,14 @@ void polytimos_4way_hash( void *output, const void *input ) poly_4way_context_overlay ctx; skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, input, 80 ); + skein512_4way_update( &ctx.skein, input, 80 ); skein512_4way_close( &ctx.skein, vhash ); // Need to convert from 64 bit interleaved to 32 bit interleaved. uint32_t vhash32[16*4]; rintrlv_4x64_4x32( vhash32, vhash, 512 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash32, 64 ); + shabal512_4way_update( &ctx.shabal, vhash32, 64 ); shabal512_4way_close( &ctx.shabal, vhash32 ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash32, 512 ); diff --git a/algo/x14/veltor-4way.c b/algo/x14/veltor-4way.c index 4f35161..1f8ea39 100644 --- a/algo/x14/veltor-4way.c +++ b/algo/x14/veltor-4way.c @@ -38,7 +38,7 @@ void veltor_4way_hash( void *output, const void *input ) veltor_4way_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, &veltor_4way_ctx, sizeof(veltor_4way_ctx) ); - skein512_4way( &ctx.skein, input, 80 ); + skein512_4way_update( &ctx.skein, input, 80 ); skein512_4way_close( &ctx.skein, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -55,7 +55,7 @@ void veltor_4way_hash( void *output, const void *input ) sph_shavite512_close( &ctx.shavite, hash3 ); intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); - shabal512_4way( &ctx.shabal, vhash, 64 ); + shabal512_4way_update( &ctx.shabal, vhash, 64 ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); diff --git a/algo/x14/x14-4way.c b/algo/x14/x14-4way.c index 9de05d3..70d7c06 100644 --- a/algo/x14/x14-4way.c +++ b/algo/x14/x14-4way.c @@ -19,24 +19,35 @@ #include "algo/hamsi/hamsi-hash-4way.h" #include "algo/fugue/sph_fugue.h" #include "algo/shabal/shabal-hash-4way.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif #if defined(X14_8WAY) typedef struct { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; - hashState_echo echo; hamsi512_8way_context hamsi; sph_fugue512_context fugue; shabal512_8way_context shabal; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; + hashState_echo echo; +#endif } x14_8way_ctx_holder; x14_8way_ctx_holder x14_8way_ctx __attribute__ ((aligned (64))); @@ -45,25 +56,31 @@ void init_x14_8way_ctx() { blake512_8way_init( &x14_8way_ctx.blake ); bmw512_8way_init( &x14_8way_ctx.bmw ); - init_groestl( &x14_8way_ctx.groestl, 64 ); skein512_8way_init( &x14_8way_ctx.skein ); jh512_8way_init( &x14_8way_ctx.jh ); keccak512_8way_init( &x14_8way_ctx.keccak ); luffa_4way_init( &x14_8way_ctx.luffa, 512 ); cube_4way_init( 
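The header changes above pick the lane width at compile time (AVX-512 gives 8-way, AVX2 plus AES gives 4-way, otherwise serial) and the gate registers the matching entry points. A minimal model of that selection; scan8/scan4/scan1 and the cut-down gate struct are illustrative stand-ins, not the real algo_gate_t:

#include <stdio.h>

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define LANES 8
#elif defined(__AVX2__) && defined(__AES__)
  #define LANES 4
#else
  #define LANES 1
#endif

typedef struct { int (*scanhash)( void ); } gate_sketch_t;

static int scan8( void ) { return 8; }
static int scan4( void ) { return 4; }
static int scan1( void ) { return 1; }

void register_sketch( gate_sketch_t *gate )
{
#if LANES == 8
   gate->scanhash = scan8;
#elif LANES == 4
   gate->scanhash = scan4;
#else
   gate->scanhash = scan1;
#endif
}

int main(void)
{
   gate_sketch_t g;
   register_sketch( &g );
   printf( "compiled lane width: %d\n", g.scanhash() );
}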
&x14_8way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &x14_8way_ctx.shavite ); simd_4way_init( &x14_8way_ctx.simd, 512 ); - init_echo( &x14_8way_ctx.echo, 512 ); hamsi512_8way_init( &x14_8way_ctx.hamsi ); sph_fugue512_init( &x14_8way_ctx.fugue ); shabal512_8way_init( &x14_8way_ctx.shabal ); +#if defined(__VAES__) + groestl512_4way_init( &x14_8way_ctx.groestl, 64 ); + shavite512_4way_init( &x14_8way_ctx.shavite ); + echo_4way_init( &x14_8way_ctx.echo, 512 ); +#else + init_groestl( &x14_8way_ctx.groestl, 64 ); + sph_shavite512_init( &x14_8way_ctx.shavite ); + init_echo( &x14_8way_ctx.echo, 512 ); +#endif }; void x14_8way_hash( void *state, const void *input ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); - uint64_t vhash0[4*8] __attribute__ ((aligned (64))); - uint64_t vhash1[4*8] __attribute__ ((aligned (64))); + uint64_t vhashA[4*8] __attribute__ ((aligned (64))); + uint64_t vhashB[4*8] __attribute__ ((aligned (64))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); @@ -80,6 +97,19 @@ void x14_8way_hash( void *state, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -108,6 +138,9 @@ void x14_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); + +#endif + skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); @@ -117,18 +150,26 @@ void x14_8way_hash( void *state, const void *input ) keccak512_8way_update( &ctx.keccak, vhash, 64 ); keccak512_8way_close( &ctx.keccak, vhash ); - rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 ); + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); - cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 ); - - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); + +#if defined(__VAES__) + + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); sph_shavite512( &ctx.shavite, hash0, 64 ); sph_shavite512_close( &ctx.shavite, hash0 ); @@ -161,14 +202,28 @@ void x14_8way_hash( void *state, const void *input ) sph_shavite512( &ctx.shavite, hash7, 64 ); sph_shavite512_close( &ctx.shavite, hash7 ); - intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); - simd_4way_update_close( &ctx.simd, vhash, 
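On the VAES path above, rintrlv_8x64_4x128() regroups eight 64-bit-interleaved lanes into two buffers of four 128-bit-interleaved lanes for the 4x128 Groestl kernel, and rintrlv_4x128_8x64() undoes it afterwards. A plain-C model of the forward conversion, assuming the usual word-interleaved layouts (word w of lane k at v[w*8 + k]; 128-bit chunk j of lane k at chunk index j*4 + k); the project's own version is SIMD-optimized:

#include <stdint.h>

#define WORDS 8   /* 64-bit words per lane of a 512-bit state */

void rintrlv_8x64_to_2x_4x128( uint64_t *A, uint64_t *B,
                               const uint64_t *v )
{
   for ( int j = 0; j < WORDS / 2; j++ )      /* j counts 128-bit chunks */
      for ( int k = 0; k < 4; k++ )
      {
         /* lanes 0..3 go to A, lanes 4..7 go to B */
         A[ (j*4 + k)*2     ] = v[ (2*j    )*8 + k     ];
         A[ (j*4 + k)*2 + 1 ] = v[ (2*j + 1)*8 + k     ];
         B[ (j*4 + k)*2     ] = v[ (2*j    )*8 + k + 4 ];
         B[ (j*4 + k)*2 + 1 ] = v[ (2*j + 1)*8 + k + 4 ];
      }
}

int main(void)
{
   uint64_t v[64], A[32], B[32];
   for ( int i = 0; i < 64; i++ ) v[i] = i;
   rintrlv_8x64_to_2x_4x128( A, B, v );
   /* lane 0 word 0 lands at A[0]; lane 4 word 0 lands at B[0] */
   return !( A[0] == 0 && B[0] == 4 );
}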
vhash, 512 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); +#endif + + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + +#if defined(__VAES__) + + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, 512 ); memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) ); @@ -195,6 +250,9 @@ void x14_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); + +#endif + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); hamsi512_8way_close( &ctx.hamsi, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, @@ -325,11 +383,11 @@ void x14_4way_hash( void *state, const void *input ) memcpy( &ctx, &x14_4way_ctx, sizeof(x14_4way_ctx) ); // 1 Blake - blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); // 2 Bmw - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); // Serial @@ -348,15 +406,15 @@ void x14_4way_hash( void *state, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); // 4 Skein - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); // 5 JH - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); // 6 Keccak - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); // Serial @@ -420,7 +478,7 @@ void x14_4way_hash( void *state, const void *input ) // 12 Hamsi parallel 4way 32 bit intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); diff --git a/algo/x14/x14-gate.c b/algo/x14/x14-gate.c index 851b7c3..d454f79 100644 --- a/algo/x14/x14-gate.c +++ b/algo/x14/x14-gate.c @@ -15,7 +15,7 @@ bool register_x14_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x14; gate->hash = (void*)&x14hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/x15/x15-4way.c b/algo/x15/x15-4way.c index a761af0..4af9499 100644 --- a/algo/x15/x15-4way.c +++ b/algo/x15/x15-4way.c @@ -20,26 +20,36 @@ #include "algo/fugue/sph_fugue.h" #include "algo/shabal/shabal-hash-4way.h" #include "algo/whirlpool/sph_whirlpool.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include 
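The recurring #if defined(__VAES__) blocks keep the pipeline order fixed and swap only the stage kernel: one 4-lane vector call on interleaved data versus four serial calls on deinterleaved lanes. A skeleton of that dispatch with trivial stand-in kernels (the real vector path would use VAES intrinsics rather than a loop):

#include <stdint.h>

static void stage_serial( uint64_t h[8] )            /* one lane */
{
   for ( int i = 0; i < 8; i++ ) h[i] ^= 0x9e3779b97f4a7c15ull;
}

static void stage_vec( uint64_t v[4][8] )            /* four lanes at once */
{
   for ( int k = 0; k < 4; k++ ) stage_serial( v[k] );
}

void pipeline_stage( uint64_t lanes[4][8] )
{
#if defined(__VAES__)
   stage_vec( lanes );              /* one vector call covers all lanes */
#else
   for ( int k = 0; k < 4; k++ )    /* fallback: per-lane serial calls  */
      stage_serial( lanes[k] );
#endif
}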
"algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif #if defined(X15_8WAY) - typedef struct { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; - hashState_echo echo; hamsi512_8way_context hamsi; sph_fugue512_context fugue; shabal512_8way_context shabal; sph_whirlpool_context whirlpool; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; + hashState_echo echo; +#endif } x15_8way_ctx_holder; x15_8way_ctx_holder x15_8way_ctx __attribute__ ((aligned (64))); @@ -48,26 +58,32 @@ void init_x15_8way_ctx() { blake512_8way_init( &x15_8way_ctx.blake ); bmw512_8way_init( &x15_8way_ctx.bmw ); - init_groestl( &x15_8way_ctx.groestl, 64 ); skein512_8way_init( &x15_8way_ctx.skein ); jh512_8way_init( &x15_8way_ctx.jh ); keccak512_8way_init( &x15_8way_ctx.keccak ); luffa_4way_init( &x15_8way_ctx.luffa, 512 ); cube_4way_init( &x15_8way_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &x15_8way_ctx.shavite ); simd_4way_init( &x15_8way_ctx.simd, 512 ); - init_echo( &x15_8way_ctx.echo, 512 ); hamsi512_8way_init( &x15_8way_ctx.hamsi ); sph_fugue512_init( &x15_8way_ctx.fugue ); shabal512_8way_init( &x15_8way_ctx.shabal ); sph_whirlpool_init( &x15_8way_ctx.whirlpool ); +#if defined(__VAES__) + groestl512_4way_init( &x15_8way_ctx.groestl, 64 ); + shavite512_4way_init( &x15_8way_ctx.shavite ); + echo_4way_init( &x15_8way_ctx.echo, 512 ); +#else + init_groestl( &x15_8way_ctx.groestl, 64 ); + sph_shavite512_init( &x15_8way_ctx.shavite ); + init_echo( &x15_8way_ctx.echo, 512 ); +#endif }; void x15_8way_hash( void *state, const void *input ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); - uint64_t vhash0[4*8] __attribute__ ((aligned (64))); - uint64_t vhash1[4*8] __attribute__ ((aligned (64))); + uint64_t vhashA[4*8] __attribute__ ((aligned (64))); + uint64_t vhashB[4*8] __attribute__ ((aligned (64))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); @@ -86,10 +102,22 @@ void x15_8way_hash( void *state, const void *input ) // 2 Bmw bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - // 3 Groestl update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); @@ -108,6 +136,9 @@ void x15_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); + +#endif + skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); @@ -118,21 +149,27 @@ void x15_8way_hash( void *state, const void *input ) // 6 Keccak keccak512_8way_update( &ctx.keccak, vhash, 64 
); keccak512_8way_close( &ctx.keccak, vhash ); - dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, - vhash ); - rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 ); + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); - cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 ); +#if defined(__VAES__) + + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); // 9 Shavite sph_shavite512( &ctx.shavite, hash0, 64 ); @@ -166,16 +203,28 @@ void x15_8way_hash( void *state, const void *input ) sph_shavite512( &ctx.shavite, hash7, 64 ); sph_shavite512_close( &ctx.shavite, hash7 ); - // 10 Simd - intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); - simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); - // 11 Echo +#endif + + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + +#if defined(__VAES__) + + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, 512 ); memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) ); @@ -200,10 +249,11 @@ void x15_8way_hash( void *state, const void *input ) update_final_echo( &ctx.echo, (BitSequence *)hash7, (const BitSequence *) hash7, 512 ); - - // 12 Hamsi parallel 4way 64 bit intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); + +#endif + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); hamsi512_8way_close( &ctx.hamsi, vhash ); dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, @@ -374,11 +424,11 @@ void x15_4way_hash( void *state, const void *input ) memcpy( &ctx, &x15_4way_ctx, sizeof(x15_4way_ctx) ); // 1 Blake - blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); // 2 Bmw - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); // Serial @@ -397,15 
+447,15 @@ void x15_4way_hash( void *state, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); // 4 Skein - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); // 5 JH - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); // 6 Keccak - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); // Serial to the end @@ -469,7 +519,7 @@ void x15_4way_hash( void *state, const void *input ) // 12 Hamsi parallel 4way 32 bit intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); diff --git a/algo/x15/x15-gate.c b/algo/x15/x15-gate.c index c148618..5083a36 100644 --- a/algo/x15/x15-gate.c +++ b/algo/x15/x15-gate.c @@ -15,7 +15,7 @@ bool register_x15_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x15; gate->hash = (void*)&x15hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/x16/x16r-4way.c b/algo/x16/x16r-4way.c index d724c78..c4c9dab 100644 --- a/algo/x16/x16r-4way.c +++ b/algo/x16/x16r-4way.c @@ -26,6 +26,11 @@ #include "algo/shabal/shabal-hash-4way.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/sha/sha-hash-4way.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif static __thread uint32_t s_ntime = UINT32_MAX; static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; @@ -36,20 +41,26 @@ union _x16r_8way_context_overlay { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; - hashState_echo echo; hamsi512_8way_context hamsi; sph_fugue512_context fugue; shabal512_8way_context shabal; sph_whirlpool_context whirlpool; sha512_8way_context sha512; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; + hashState_echo echo; +#endif } __attribute__ ((aligned (64))); typedef union _x16r_8way_context_overlay x16r_8way_context_overlay; @@ -115,31 +126,42 @@ void x16r_8way_hash( void* output, const void* input ) hash7, vhash ); break; case GROESTL: - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (const char*)in0, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (const char*)in1, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (const char*)in2, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (const char*)in3, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash4, - (const char*)in4, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash5, - 
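For context on the x16r changes that follow: each character of the per-ntime hashOrder string selects the next stage, so the hash function is one big switch executed sixteen times. A toy version with three stages instead of sixteen, and trivial mixing functions in place of the real algorithms:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

enum { ALG_A, ALG_B, ALG_C };

static uint32_t stage_a( uint32_t h ) { return h * 2654435761u; }
static uint32_t stage_b( uint32_t h ) { return h ^ ( h >> 13 ); }
static uint32_t stage_c( uint32_t h ) { return h + 0x9e3779b9u; }

uint32_t chain_hash( uint32_t seed, const char *order )
{
   uint32_t h = seed;
   for ( size_t i = 0; order[i]; i++ )
   {
      switch ( order[i] - '0' )     /* digit selects the stage, as in */
      {                             /* the hashOrder string above     */
         case ALG_A: h = stage_a( h ); break;
         case ALG_B: h = stage_b( h ); break;
         case ALG_C: h = stage_c( h ); break;
      }
   }
   return h;
}

int main(void) { printf( "%08x\n", chain_hash( 1, "0211020" ) ); }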
(const char*)in5, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash6, - (const char*)in6, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash7, - (const char*)in7, size<<3 ); - break; +#if defined(__VAES__) + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); +#else + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, + (const char*)in0, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, + (const char*)in1, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, + (const char*)in2, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, + (const char*)in3, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, + (const char*)in4, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, + (const char*)in5, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, + (const char*)in6, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, + (const char*)in7, size<<3 ); +#endif + break; case SKEIN: skein512_8way_init( &ctx.skein ); if ( i == 0 ) @@ -203,6 +225,16 @@ void x16r_8way_hash( void* output, const void* input ) dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); break; case SHAVITE: +#if defined(__VAES__) + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhash, vhash, 64 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhash, vhash, 64 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); +#else sph_shavite512_init( &ctx.shavite ); sph_shavite512( &ctx.shavite, in0, size ); sph_shavite512_close( &ctx.shavite, hash0 ); @@ -227,7 +259,8 @@ void x16r_8way_hash( void* output, const void* input ) sph_shavite512_init( &ctx.shavite ); sph_shavite512( &ctx.shavite, in7, size ); sph_shavite512_close( &ctx.shavite, hash7 ); - break; +#endif + break; case SIMD: intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); simd_4way_init( &ctx.simd, 512 ); @@ -239,31 +272,42 @@ void x16r_8way_hash( void* output, const void* input ) dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); break; case ECHO: - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash0, - (const BitSequence*)in0, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash1, - (const BitSequence*)in1, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash2, - (const BitSequence*)in2, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash3, - (const BitSequence*)in3, size<<3 ); - 
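The size<<3 seen throughout these cases converts the byte count the scan loop tracks into the bit count the Groestl and Echo APIs expect; the first stage in the chain sees the 80-byte header while later stages see a 64-byte digest, which is also why every call re-inits its context instead of relying on a fixed midstate. Trivially:

#include <stddef.h>

/* size is tracked in bytes; Groestl/Echo take their length in bits */
static inline size_t bytes_to_bits( size_t nbytes ) { return nbytes << 3; }

int main(void)
{
   /* first chain position: 80-byte header = 640 bits;
      every later position: 64-byte digest = 512 bits */
   return !( bytes_to_bits(80) == 640 && bytes_to_bits(64) == 512 );
}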
init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash4, - (const BitSequence*)in4, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash5, - (const BitSequence*)in5, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash6, - (const BitSequence*)in6, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash7, - (const BitSequence*)in7, size<<3 ); - break; +#if defined(__VAES__) + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhash, vhash, 512 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhash, vhash, 512 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); +#else + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash0, + (const BitSequence*)in0, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash1, + (const BitSequence*)in1, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash2, + (const BitSequence*)in2, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash3, + (const BitSequence*)in3, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash4, + (const BitSequence*)in4, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash5, + (const BitSequence*)in5, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash6, + (const BitSequence*)in6, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash7, + (const BitSequence*)in7, size<<3 ); +#endif + break; case HAMSI: intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); @@ -463,11 +507,11 @@ void x16r_4way_hash( void* output, const void* input ) case BLAKE: blake512_4way_init( &ctx.blake ); if ( i == 0 ) - blake512_4way( &ctx.blake, input, size ); + blake512_4way_update( &ctx.blake, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - blake512_4way( &ctx.blake, vhash, size ); + blake512_4way_update( &ctx.blake, vhash, size ); } blake512_4way_close( &ctx.blake, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -475,11 +519,11 @@ void x16r_4way_hash( void* output, const void* input ) case BMW: bmw512_4way_init( &ctx.bmw ); if ( i == 0 ) - bmw512_4way( &ctx.bmw, input, size ); + bmw512_4way_update( &ctx.bmw, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - bmw512_4way( &ctx.bmw, vhash, size ); + bmw512_4way_update( &ctx.bmw, vhash, size ); } bmw512_4way_close( &ctx.bmw, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -501,11 +545,11 @@ void x16r_4way_hash( void* output, const void* input ) case SKEIN: skein512_4way_init( &ctx.skein ); if ( i == 0 ) - skein512_4way( &ctx.skein, input, size ); + skein512_4way_update( &ctx.skein, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - skein512_4way( &ctx.skein, vhash, size ); + skein512_4way_update( &ctx.skein, vhash, size ); } skein512_4way_close( &ctx.skein, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -513,11 +557,11 @@ void x16r_4way_hash( void* output, const void* input ) case JH: 
jh512_4way_init( &ctx.jh ); if ( i == 0 ) - jh512_4way( &ctx.jh, input, size ); + jh512_4way_update( &ctx.jh, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - jh512_4way( &ctx.jh, vhash, size ); + jh512_4way_update( &ctx.jh, vhash, size ); } jh512_4way_close( &ctx.jh, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -525,11 +569,11 @@ void x16r_4way_hash( void* output, const void* input ) case KECCAK: keccak512_4way_init( &ctx.keccak ); if ( i == 0 ) - keccak512_4way( &ctx.keccak, input, size ); + keccak512_4way_update( &ctx.keccak, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - keccak512_4way( &ctx.keccak, vhash, size ); + keccak512_4way_update( &ctx.keccak, vhash, size ); } keccak512_4way_close( &ctx.keccak, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -599,7 +643,7 @@ void x16r_4way_hash( void* output, const void* input ) case HAMSI: intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, size ); + hamsi512_4way_update( &ctx.hamsi, vhash, size ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; @@ -620,7 +664,7 @@ void x16r_4way_hash( void* output, const void* input ) case SHABAL: intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, size ); + shabal512_4way_update( &ctx.shabal, vhash, size ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); break; @@ -641,7 +685,7 @@ void x16r_4way_hash( void* output, const void* input ) case SHA_512: intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, size ); + sha512_4way_update( &ctx.sha512, vhash, size ); sha512_4way_close( &ctx.sha512, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); break; diff --git a/algo/x16/x16r-gate.c b/algo/x16/x16r-gate.c index 031379a..6323589 100644 --- a/algo/x16/x16r-gate.c +++ b/algo/x16/x16r-gate.c @@ -44,7 +44,7 @@ bool register_x16r_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16r; gate->hash = (void*)&x16r_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; x16_r_s_getAlgoString = (void*)&x16r_getAlgoString; opt_target_factor = 256.0; return true; @@ -62,7 +62,7 @@ bool register_x16rv2_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16rv2; gate->hash = (void*)&x16rv2_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; x16_r_s_getAlgoString = (void*)&x16r_getAlgoString; opt_target_factor = 256.0; return true; @@ -80,7 +80,7 @@ bool register_x16s_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16r; gate->hash = (void*)&x16r_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; x16_r_s_getAlgoString = (void*)&x16s_getAlgoString; opt_target_factor = 256.0; return true; @@ -215,7 +215,7 @@ bool register_x16rt_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16rt; gate->hash = (void*)&x16rt_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | 
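gate->optimizations is a capability bitmask; this release ORs VAES_OPT into it everywhere a VAES path was added. A sketch with hypothetical flag values (the real constants are defined in the algo-gate headers, not here):

#include <stdio.h>

#define SSE2_OPT   (1u << 0)   /* illustrative values only */
#define AES_OPT    (1u << 1)
#define AVX2_OPT   (1u << 2)
#define AVX512_OPT (1u << 3)
#define VAES_OPT   (1u << 4)   /* new in this release */

int main(void)
{
   unsigned opts = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
   if ( opts & VAES_OPT )
      puts( "algo advertises a VAES-accelerated path" );
   return 0;
}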
VAES_OPT; opt_target_factor = 256.0; return true; }; @@ -232,7 +232,7 @@ bool register_x16rt_veil_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x16rt; gate->hash = (void*)&x16rt_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; gate->build_extraheader = (void*)&veil_build_extraheader; opt_target_factor = 256.0; return true; @@ -262,17 +262,20 @@ bool register_x21s_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x21s_8way; gate->hash = (void*)&x21s_8way_hash; gate->miner_thread_init = (void*)&x21s_8way_thread_init; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT + | VAES_OPT; #elif defined (X21S_4WAY) gate->scanhash = (void*)&scanhash_x21s_4way; gate->hash = (void*)&x21s_4way_hash; gate->miner_thread_init = (void*)&x21s_4way_thread_init; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT + | AVX512_OPT | VAES_OPT; #else gate->scanhash = (void*)&scanhash_x21s; gate->hash = (void*)&x21s_hash; gate->miner_thread_init = (void*)&x21s_thread_init; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT + | AVX512_OPT | VAES_OPT; #endif // gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT; x16_r_s_getAlgoString = (void*)&x16s_getAlgoString; diff --git a/algo/x16/x16rt-4way.c b/algo/x16/x16rt-4way.c index 663f61e..56c7b69 100644 --- a/algo/x16/x16rt-4way.c +++ b/algo/x16/x16rt-4way.c @@ -20,6 +20,11 @@ #include "algo/shabal/shabal-hash-4way.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/sha/sha-hash-4way.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif static __thread uint32_t s_ntime = UINT32_MAX; static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; @@ -30,20 +35,26 @@ union _x16rt_8way_context_overlay { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; - hashState_echo echo; hamsi512_8way_context hamsi; sph_fugue512_context fugue; shabal512_8way_context shabal; sph_whirlpool_context whirlpool; sha512_8way_context sha512; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; + hashState_echo echo; +#endif } __attribute__ ((aligned (64))); typedef union _x16rt_8way_context_overlay x16rt_8way_context_overlay; @@ -109,6 +120,16 @@ void x16rt_8way_hash( void* output, const void* input ) hash7, vhash ); break; case GROESTL: +#if defined(__VAES__) + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash 
); +#else init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash0, (const char*)in0, size<<3 ); @@ -133,7 +154,8 @@ void x16rt_8way_hash( void* output, const void* input ) init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash7, (const char*)in7, size<<3 ); - break; +#endif + break; case SKEIN: skein512_8way_init( &ctx.skein ); if ( i == 0 ) @@ -197,6 +219,16 @@ void x16rt_8way_hash( void* output, const void* input ) dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); break; case SHAVITE: +#if defined(__VAES__) + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhash, vhash, 64 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhash, vhash, 64 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); +#else sph_shavite512_init( &ctx.shavite ); sph_shavite512( &ctx.shavite, in0, size ); sph_shavite512_close( &ctx.shavite, hash0 ); @@ -221,7 +253,8 @@ void x16rt_8way_hash( void* output, const void* input ) sph_shavite512_init( &ctx.shavite ); sph_shavite512( &ctx.shavite, in7, size ); sph_shavite512_close( &ctx.shavite, hash7 ); - break; +#endif + break; case SIMD: intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); simd_4way_init( &ctx.simd, 512 ); @@ -233,6 +266,16 @@ void x16rt_8way_hash( void* output, const void* input ) dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); break; case ECHO: +#if defined(__VAES__) + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhash, vhash, 512 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhash, vhash, 512 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); +#else init_echo( &ctx.echo, 512 ); update_final_echo ( &ctx.echo, (BitSequence *)hash0, (const BitSequence*)in0, size<<3 ); @@ -257,7 +300,8 @@ void x16rt_8way_hash( void* output, const void* input ) init_echo( &ctx.echo, 512 ); update_final_echo ( &ctx.echo, (BitSequence *)hash7, (const BitSequence*)in7, size<<3 ); - break; +#endif + break; case HAMSI: intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); @@ -458,11 +502,11 @@ void x16rt_4way_hash( void* output, const void* input ) case BLAKE: blake512_4way_init( &ctx.blake ); if ( i == 0 ) - blake512_4way( &ctx.blake, input, size ); + blake512_4way_update( &ctx.blake, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - blake512_4way( &ctx.blake, vhash, size ); + blake512_4way_update( &ctx.blake, vhash, size ); } blake512_4way_close( &ctx.blake, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -470,11 +514,11 @@ void x16rt_4way_hash( void* output, const void* input ) case BMW: bmw512_4way_init( &ctx.bmw ); if ( i == 0 ) - bmw512_4way( &ctx.bmw, input, size ); + bmw512_4way_update( &ctx.bmw, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - bmw512_4way( &ctx.bmw, vhash, size ); + bmw512_4way_update( &ctx.bmw, vhash, size ); } bmw512_4way_close( &ctx.bmw, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -496,11 +540,11 @@ void x16rt_4way_hash( void* output, const void* input ) case SKEIN: skein512_4way_init( 
&ctx.skein ); if ( i == 0 ) - skein512_4way( &ctx.skein, input, size ); + skein512_4way_update( &ctx.skein, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - skein512_4way( &ctx.skein, vhash, size ); + skein512_4way_update( &ctx.skein, vhash, size ); } skein512_4way_close( &ctx.skein, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -508,11 +552,11 @@ void x16rt_4way_hash( void* output, const void* input ) case JH: jh512_4way_init( &ctx.jh ); if ( i == 0 ) - jh512_4way( &ctx.jh, input, size ); + jh512_4way_update( &ctx.jh, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - jh512_4way( &ctx.jh, vhash, size ); + jh512_4way_update( &ctx.jh, vhash, size ); } jh512_4way_close( &ctx.jh, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -520,11 +564,11 @@ void x16rt_4way_hash( void* output, const void* input ) case KECCAK: keccak512_4way_init( &ctx.keccak ); if ( i == 0 ) - keccak512_4way( &ctx.keccak, input, size ); + keccak512_4way_update( &ctx.keccak, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - keccak512_4way( &ctx.keccak, vhash, size ); + keccak512_4way_update( &ctx.keccak, vhash, size ); } keccak512_4way_close( &ctx.keccak, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -594,7 +638,7 @@ void x16rt_4way_hash( void* output, const void* input ) case HAMSI: intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, size ); + hamsi512_4way_update( &ctx.hamsi, vhash, size ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; @@ -615,7 +659,7 @@ void x16rt_4way_hash( void* output, const void* input ) case SHABAL: intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, size ); + shabal512_4way_update( &ctx.shabal, vhash, size ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); break; @@ -636,7 +680,7 @@ void x16rt_4way_hash( void* output, const void* input ) case SHA_512: intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, size ); + sha512_4way_update( &ctx.sha512, vhash, size ); sha512_4way_close( &ctx.sha512, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; diff --git a/algo/x16/x16rv2-4way.c b/algo/x16/x16rv2-4way.c index 7406138..f945133 100644 --- a/algo/x16/x16rv2-4way.c +++ b/algo/x16/x16rv2-4way.c @@ -27,6 +27,11 @@ #include "algo/whirlpool/sph_whirlpool.h" #include "algo/sha/sha-hash-4way.h" #include "algo/tiger/sph_tiger.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif static __thread uint32_t s_ntime = UINT32_MAX; static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; @@ -37,21 +42,30 @@ union _x16rv2_8way_context_overlay { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; +// hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; +// sph_shavite512_context shavite; simd_4way_context simd; - hashState_echo echo; +// hashState_echo echo; hamsi512_8way_context hamsi; sph_fugue512_context fugue; shabal512_8way_context shabal; 
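The _x16rv2_8way_context_overlay above is a union on purpose: x16r-family stages run strictly one at a time, so every stage context can share the same storage, and the commented-out duplicate members cost nothing. A minimal demonstration of the space saving, with made-up context sizes:

#include <stdio.h>

typedef struct { unsigned long long s[25]; } keccak_like;  /* large  */
typedef struct { unsigned int s[16];       } shabal_like;  /* small  */

union ctx_overlay
{
   keccak_like keccak;
   shabal_like shabal;   /* shares storage with keccak */
};

int main(void)
{
   printf( "separate: %zu bytes, overlaid: %zu bytes\n",
           sizeof(keccak_like) + sizeof(shabal_like),
           sizeof(union ctx_overlay) );
   return 0;
}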
sph_whirlpool_context whirlpool; sha512_8way_context sha512; sph_tiger_context tiger; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; + hashState_echo echo; +#endif } __attribute__ ((aligned (64))); typedef union _x16rv2_8way_context_overlay x16rv2_8way_context_overlay; @@ -117,6 +131,16 @@ void x16rv2_8way_hash( void* output, const void* input ) hash7, vhash ); break; case GROESTL: +#if defined(__VAES__) + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); +#else init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash0, (const char*)in0, size<<3 ); @@ -141,7 +165,8 @@ void x16rv2_8way_hash( void* output, const void* input ) init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash7, (const char*)in7, size<<3 ); - break; +#endif + break; case SKEIN: skein512_8way_init( &ctx.skein ); if ( i == 0 ) @@ -258,6 +283,16 @@ void x16rv2_8way_hash( void* output, const void* input ) dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); break; case SHAVITE: +#if defined(__VAES__) + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhash, vhash, 64 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhash, vhash, 64 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); +#else sph_shavite512_init( &ctx.shavite ); sph_shavite512( &ctx.shavite, in0, size ); sph_shavite512_close( &ctx.shavite, hash0 ); @@ -282,7 +317,8 @@ void x16rv2_8way_hash( void* output, const void* input ) sph_shavite512_init( &ctx.shavite ); sph_shavite512( &ctx.shavite, in7, size ); sph_shavite512_close( &ctx.shavite, hash7 ); - break; +#endif + break; case SIMD: intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); simd_4way_init( &ctx.simd, 512 ); @@ -294,6 +330,16 @@ void x16rv2_8way_hash( void* output, const void* input ) dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); break; case ECHO: +#if defined(__VAES__) + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhash, vhash, 512 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhash, vhash, 512 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); +#else init_echo( &ctx.echo, 512 ); update_final_echo ( &ctx.echo, (BitSequence *)hash0, (const BitSequence*)in0, size<<3 ); @@ -318,7 +364,8 @@ void x16rv2_8way_hash( void* output, const void* input ) init_echo( &ctx.echo, 512 ); update_final_echo ( &ctx.echo, (BitSequence *)hash7, (const BitSequence*)in7, size<<3 ); - break; +#endif + break; case HAMSI: intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); @@ -553,11 +600,11 @@ void x16rv2_4way_hash( void* output, 
const void* input ) case BLAKE: blake512_4way_init( &ctx.blake ); if ( i == 0 ) - blake512_4way( &ctx.blake, input, size ); + blake512_4way_update( &ctx.blake, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - blake512_4way( &ctx.blake, vhash, size ); + blake512_4way_update( &ctx.blake, vhash, size ); } blake512_4way_close( &ctx.blake, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -565,11 +612,11 @@ void x16rv2_4way_hash( void* output, const void* input ) case BMW: bmw512_4way_init( &ctx.bmw ); if ( i == 0 ) - bmw512_4way( &ctx.bmw, input, size ); + bmw512_4way_update( &ctx.bmw, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - bmw512_4way( &ctx.bmw, vhash, size ); + bmw512_4way_update( &ctx.bmw, vhash, size ); } bmw512_4way_close( &ctx.bmw, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -591,11 +638,11 @@ void x16rv2_4way_hash( void* output, const void* input ) case SKEIN: skein512_4way_init( &ctx.skein ); if ( i == 0 ) - skein512_4way( &ctx.skein, input, size ); + skein512_4way_update( &ctx.skein, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - skein512_4way( &ctx.skein, vhash, size ); + skein512_4way_update( &ctx.skein, vhash, size ); } skein512_4way_close( &ctx.skein, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -603,11 +650,11 @@ void x16rv2_4way_hash( void* output, const void* input ) case JH: jh512_4way_init( &ctx.jh ); if ( i == 0 ) - jh512_4way( &ctx.jh, input, size ); + jh512_4way_update( &ctx.jh, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - jh512_4way( &ctx.jh, vhash, size ); + jh512_4way_update( &ctx.jh, vhash, size ); } jh512_4way_close( &ctx.jh, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -631,7 +678,7 @@ void x16rv2_4way_hash( void* output, const void* input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; @@ -721,7 +768,7 @@ void x16rv2_4way_hash( void* output, const void* input ) case HAMSI: intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, size ); + hamsi512_4way_update( &ctx.hamsi, vhash, size ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; @@ -742,7 +789,7 @@ void x16rv2_4way_hash( void* output, const void* input ) case SHABAL: intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, size ); + shabal512_4way_update( &ctx.shabal, vhash, size ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); break; @@ -779,7 +826,7 @@ void x16rv2_4way_hash( void* output, const void* input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, 64 ); + sha512_4way_update( &ctx.sha512, vhash, 64 ); sha512_4way_close( &ctx.sha512, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; diff --git a/algo/x16/x21s-4way.c b/algo/x16/x21s-4way.c index 7d98a00..69773d7 100644 --- a/algo/x16/x21s-4way.c +++ b/algo/x16/x21s-4way.c @@ -30,6 +30,11 @@ #include "algo/tiger/sph_tiger.h" #include "algo/gost/sph_gost.h" #include 
"algo/lyra2/lyra2.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif #if defined(__SHA__) #include #endif @@ -45,15 +50,12 @@ union _x21s_8way_context_overlay { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; - hashState_echo echo; hamsi512_8way_context hamsi; sph_fugue512_context fugue; shabal512_8way_context shabal; @@ -63,6 +65,15 @@ union _x21s_8way_context_overlay sph_tiger_context tiger; sph_gost512_context gost; sha256_8way_context sha256; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; + hashState_echo echo; +#endif } __attribute__ ((aligned (64))); typedef union _x21s_8way_context_overlay x21s_8way_context_overlay; @@ -128,31 +139,42 @@ void x21s_8way_hash( void* output, const void* input ) hash7, vhash ); break; case GROESTL: - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash0, - (const char*)in0, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash1, - (const char*)in1, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash2, - (const char*)in2, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash3, - (const char*)in3, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash4, - (const char*)in4, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash5, - (const char*)in5, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash6, - (const char*)in6, size<<3 ); - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash7, - (const char*)in7, size<<3 ); - break; +#if defined(__VAES__) + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); +#else + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, + (const char*)in0, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, + (const char*)in1, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, + (const char*)in2, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, + (const char*)in3, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4, + (const char*)in4, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5, + (const char*)in5, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6, + (const 
char*)in6, size<<3 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7, + (const char*)in7, size<<3 ); +#endif + break; case SKEIN: skein512_8way_init( &ctx.skein ); if ( i == 0 ) @@ -216,6 +238,16 @@ void x21s_8way_hash( void* output, const void* input ) dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); break; case SHAVITE: +#if defined(__VAES__) + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhash, vhash, 64 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhash, vhash, 64 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); +#else sph_shavite512_init( &ctx.shavite ); sph_shavite512( &ctx.shavite, in0, size ); sph_shavite512_close( &ctx.shavite, hash0 ); @@ -240,7 +272,8 @@ void x21s_8way_hash( void* output, const void* input ) sph_shavite512_init( &ctx.shavite ); sph_shavite512( &ctx.shavite, in7, size ); sph_shavite512_close( &ctx.shavite, hash7 ); - break; +#endif + break; case SIMD: intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); simd_4way_init( &ctx.simd, 512 ); @@ -252,31 +285,43 @@ void x21s_8way_hash( void* output, const void* input ) dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); break; case ECHO: - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash0, - (const BitSequence*)in0, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash1, - (const BitSequence*)in1, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash2, - (const BitSequence*)in2, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash3, - (const BitSequence*)in3, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash4, - (const BitSequence*)in4, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash5, - (const BitSequence*)in5, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash6, - (const BitSequence*)in6, size<<3 ); - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence *)hash7, - (const BitSequence*)in7, size<<3 ); - break; + +#if defined(__VAES__) + intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhash, vhash, 512 ); + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); + intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhash, vhash, 512 ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); +#else + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash0, + (const BitSequence*)in0, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash1, + (const BitSequence*)in1, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash2, + (const BitSequence*)in2, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash3, + (const BitSequence*)in3, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash4, + (const BitSequence*)in4, size<<3 ); + init_echo( &ctx.echo, 512 ); + 
update_final_echo ( &ctx.echo, (BitSequence *)hash5, + (const BitSequence*)in5, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash6, + (const BitSequence*)in6, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash7, + (const BitSequence*)in7, size<<3 ); +#endif + break; case HAMSI: intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, size<<3 ); @@ -578,11 +623,11 @@ void x21s_4way_hash( void* output, const void* input ) case BLAKE: blake512_4way_init( &ctx.blake ); if ( i == 0 ) - blake512_4way( &ctx.blake, input, size ); + blake512_4way_update( &ctx.blake, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - blake512_4way( &ctx.blake, vhash, size ); + blake512_4way_update( &ctx.blake, vhash, size ); } blake512_4way_close( &ctx.blake, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -590,11 +635,11 @@ void x21s_4way_hash( void* output, const void* input ) case BMW: bmw512_4way_init( &ctx.bmw ); if ( i == 0 ) - bmw512_4way( &ctx.bmw, input, size ); + bmw512_4way_update( &ctx.bmw, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - bmw512_4way( &ctx.bmw, vhash, size ); + bmw512_4way_update( &ctx.bmw, vhash, size ); } bmw512_4way_close( &ctx.bmw, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -616,11 +661,11 @@ void x21s_4way_hash( void* output, const void* input ) case SKEIN: skein512_4way_init( &ctx.skein ); if ( i == 0 ) - skein512_4way( &ctx.skein, input, size ); + skein512_4way_update( &ctx.skein, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - skein512_4way( &ctx.skein, vhash, size ); + skein512_4way_update( &ctx.skein, vhash, size ); } skein512_4way_close( &ctx.skein, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -628,11 +673,11 @@ void x21s_4way_hash( void* output, const void* input ) case JH: jh512_4way_init( &ctx.jh ); if ( i == 0 ) - jh512_4way( &ctx.jh, input, size ); + jh512_4way_update( &ctx.jh, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - jh512_4way( &ctx.jh, vhash, size ); + jh512_4way_update( &ctx.jh, vhash, size ); } jh512_4way_close( &ctx.jh, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -640,11 +685,11 @@ void x21s_4way_hash( void* output, const void* input ) case KECCAK: keccak512_4way_init( &ctx.keccak ); if ( i == 0 ) - keccak512_4way( &ctx.keccak, input, size ); + keccak512_4way_update( &ctx.keccak, input, size ); else { intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); - keccak512_4way( &ctx.keccak, vhash, size ); + keccak512_4way_update( &ctx.keccak, vhash, size ); } keccak512_4way_close( &ctx.keccak, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -714,7 +759,7 @@ void x21s_4way_hash( void* output, const void* input ) case HAMSI: intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, size ); + hamsi512_4way_update( &ctx.hamsi, vhash, size ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; @@ -735,7 +780,7 @@ void x21s_4way_hash( void* output, const void* input ) case SHABAL: intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, size ); + shabal512_4way_update( &ctx.shabal, vhash, size ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 
); break; @@ -756,7 +801,7 @@ void x21s_4way_hash( void* output, const void* input ) case SHA_512: intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 ); sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, size ); + sha512_4way_update( &ctx.sha512, vhash, size ); sha512_4way_close( &ctx.sha512, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; @@ -767,7 +812,7 @@ void x21s_4way_hash( void* output, const void* input ) intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); haval256_5_4way_init( &ctx.haval ); - haval256_5_4way( &ctx.haval, vhash, 64 ); + haval256_5_4way_update( &ctx.haval, vhash, 64 ); haval256_5_4way_close( &ctx.haval, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); @@ -831,7 +876,7 @@ void x21s_4way_hash( void* output, const void* input ) intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); sha256_4way_init( &ctx.sha256 ); - sha256_4way( &ctx.sha256, vhash, 64 ); + sha256_4way_update( &ctx.sha256, vhash, 64 ); sha256_4way_close( &ctx.sha256, vhash ); dintrlv_4x32( output, output+32, output+64,output+96, vhash, 256 ); diff --git a/algo/x17/sonoa-4way.c b/algo/x17/sonoa-4way.c index 3a0b248..f28da43 100644 --- a/algo/x17/sonoa-4way.c +++ b/algo/x17/sonoa-4way.c @@ -21,6 +21,11 @@ #include "algo/whirlpool/sph_whirlpool.h" #include "algo/haval/haval-hash-4way.h" #include "algo/sha/sha-hash-4way.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif #if defined(SONOA_8WAY) @@ -28,21 +33,27 @@ union _sonoa_8way_context_overlay { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; - hashState_echo echo; hamsi512_8way_context hamsi; sph_fugue512_context fugue; shabal512_8way_context shabal; sph_whirlpool_context whirlpool; sha512_8way_context sha512; haval256_5_8way_context haval; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; + hashState_echo echo; +#endif } __attribute__ ((aligned (64))); typedef union _sonoa_8way_context_overlay sonoa_8way_context_overlay; @@ -72,6 +83,19 @@ void sonoa_8way_hash( void *state, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -95,6 +119,8 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + skein512_8way_init( &ctx.skein ); skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); @@ -119,6 +145,15 @@ void sonoa_8way_hash( void *state, const void *input ) cube_4way_init( &ctx.cube, 512, 16, 32 ); cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + shavite512_4way_init( 
&ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); @@ -150,11 +185,24 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); +#endif + simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); +#if defined(__VAES__) + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); @@ -182,16 +230,31 @@ void sonoa_8way_hash( void *state, const void *input ) init_echo( &ctx.echo, 512 ); update_final_echo( &ctx.echo, (BitSequence *)hash7, (const BitSequence *) hash7, 512 ); - -// 2 intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + +// 2 + bmw512_8way_init( &ctx.bmw ); bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -215,6 +278,8 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + skein512_8way_init( &ctx.skein ); skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); @@ -239,6 +304,15 @@ void sonoa_8way_hash( void *state, const void *input ) cube_4way_init( &ctx.cube, 512, 16, 32 ); cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); @@ -270,11 +344,24 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); +#endif + simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); +#if defined(__VAES__) + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, 
hash7, vhashB ); @@ -306,6 +393,8 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + hamsi512_8way_init( &ctx.hamsi ); hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); hamsi512_8way_close( &ctx.hamsi, vhash ); @@ -316,6 +405,19 @@ void sonoa_8way_hash( void *state, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -339,6 +441,8 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + skein512_8way_init( &ctx.skein ); skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); @@ -363,6 +467,15 @@ void sonoa_8way_hash( void *state, const void *input ) cube_4way_init( &ctx.cube, 512, 16, 32 ); cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); @@ -394,11 +507,24 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); +#endif + simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); +#if defined(__VAES__) + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); @@ -430,6 +556,8 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + hamsi512_8way_init( &ctx.hamsi ); hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); hamsi512_8way_close( &ctx.hamsi, vhash ); @@ -471,6 +599,19 @@ void sonoa_8way_hash( void *state, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -494,6 +635,8 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, 
hash4, hash5, hash6, hash7 ); +#endif + skein512_8way_init( &ctx.skein ); skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); @@ -518,6 +661,15 @@ void sonoa_8way_hash( void *state, const void *input ) cube_4way_init( &ctx.cube, 512, 16, 32 ); cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); @@ -549,11 +701,24 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); +#endif + simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); +#if defined(__VAES__) + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); @@ -585,6 +750,8 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + hamsi512_8way_init( &ctx.hamsi ); hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); hamsi512_8way_close( &ctx.hamsi, vhash ); @@ -630,6 +797,17 @@ void sonoa_8way_hash( void *state, const void *input ) hamsi512_8way_update( &ctx.hamsi, vhashA, 64 ); hamsi512_8way_close( &ctx.hamsi, vhash ); +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -658,6 +836,18 @@ void sonoa_8way_hash( void *state, const void *input ) update_final_echo( &ctx.echo, (BitSequence *)hash7, (const BitSequence *) hash7, 512 ); +#endif + +#if defined(__VAES__) + + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else sph_shavite512_init( &ctx.shavite ); sph_shavite512( &ctx.shavite, hash0, 64 ); @@ -684,11 +874,13 @@ void sonoa_8way_hash( void *state, const void *input ) sph_shavite512( &ctx.shavite, hash7, 64 ); sph_shavite512_close( &ctx.shavite, hash7 ); -// 5 - intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + +// 5 + bmw512_8way_init( &ctx.bmw ); bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); @@ -699,6 +891,19 @@ void sonoa_8way_hash( void *state, const void *input ) shabal512_8way_update( &ctx.shabal, vhashA, 64 ); shabal512_8way_close( &ctx.shabal, vhash ); +#if defined(__VAES__) + + rintrlv_8x32_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + 
groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -722,6 +927,8 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + skein512_8way_init( &ctx.skein ); skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); @@ -746,6 +953,15 @@ void sonoa_8way_hash( void *state, const void *input ) cube_4way_init( &ctx.cube, 512, 16, 32 ); cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); @@ -777,14 +993,27 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); +#endif + simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); +#if defined(__VAES__) + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); - + init_echo( &ctx.echo, 512 ); update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, 512 ); @@ -813,6 +1042,8 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + hamsi512_8way_init( &ctx.hamsi ); hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); hamsi512_8way_close( &ctx.hamsi, vhash ); @@ -889,6 +1120,19 @@ void sonoa_8way_hash( void *state, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -912,6 +1156,8 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + skein512_8way_init( &ctx.skein ); skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); @@ -936,6 +1182,15 @@ void sonoa_8way_hash( void *state, const void *input ) cube_4way_init( &ctx.cube, 512, 16, 32 ); cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 
64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); @@ -967,11 +1222,24 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); +#endif + simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); +#if defined(__VAES__) + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); @@ -1003,6 +1271,8 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + hamsi512_8way_init( &ctx.hamsi ); hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); hamsi512_8way_close( &ctx.hamsi, vhash ); @@ -1114,6 +1384,19 @@ void sonoa_8way_hash( void *state, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -1137,6 +1420,8 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + skein512_8way_init( &ctx.skein ); skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); @@ -1161,6 +1446,15 @@ void sonoa_8way_hash( void *state, const void *input ) cube_4way_init( &ctx.cube, 512, 16, 32 ); cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); @@ -1192,11 +1486,24 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); +#endif + simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); +#if defined(__VAES__) + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); @@ -1228,6 
+1535,8 @@ void sonoa_8way_hash( void *state, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + hamsi512_8way_init( &ctx.hamsi ); hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); hamsi512_8way_close( &ctx.hamsi, vhash ); @@ -1319,7 +1628,7 @@ int scanhash_sonoa_8way( struct work *work, uint32_t max_nonce, uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; - const uint32_t last_nonce = max_nonce - 8; + const uint32_t last_nonce = max_nonce - 8; __m512i *noncev = (__m512i*)vdata + 9; // aligned uint32_t n = first_nonce; const int thr_id = mythr->id; @@ -1350,8 +1659,6 @@ int scanhash_sonoa_8way( struct work *work, uint32_t max_nonce, return 0; } - - #elif defined(SONOA_4WAY) union _sonoa_4way_context_overlay @@ -1391,11 +1698,11 @@ void sonoa_4way_hash( void *state, const void *input ) // 1 blake512_4way_init( &ctx.blake ); - blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -1412,15 +1719,15 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); @@ -1466,7 +1773,7 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -1483,15 +1790,15 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); @@ -1535,13 +1842,13 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); // 3 bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -1558,15 +1865,15 @@ void sonoa_4way_hash( void *state, const void *input ) 
intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); @@ -1610,7 +1917,7 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -1632,7 +1939,7 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -1649,15 +1956,15 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); @@ -1701,7 +2008,7 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -1722,13 +2029,13 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, 64 ); + shabal512_4way_update( &ctx.shabal, vhash, 64 ); shabal512_4way_close( &ctx.shabal, vhash ); rintrlv_4x32_4x64( vhashB, vhash, 512 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhashB, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhashB, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -1758,13 +2065,13 @@ void sonoa_4way_hash( void *state, const void *input ) rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); rintrlv_4x64_4x32( vhashB, vhash, 512 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhashB, 64 ); + shabal512_4way_update( &ctx.shabal, vhashB, 64 ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); @@ -1781,15 +2088,15 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, 
hash3 ); skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); @@ -1833,7 +2140,7 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -1854,7 +2161,7 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, 64 ); + shabal512_4way_update( &ctx.shabal, vhash, 64 ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); @@ -1877,7 +2184,7 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -1894,15 +2201,15 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); @@ -1946,7 +2253,7 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -1967,7 +2274,7 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, 64 ); + shabal512_4way_update( &ctx.shabal, vhash, 64 ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); @@ -1988,7 +2295,7 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, 64 ); + sha512_4way_update( &ctx.sha512, vhash, 64 ); sha512_4way_close( &ctx.sha512, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -2011,7 +2318,7 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( 
&ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -2028,15 +2335,15 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); @@ -2080,7 +2387,7 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -2101,7 +2408,7 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, 64 ); + shabal512_4way_update( &ctx.shabal, vhash, 64 ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); @@ -2122,13 +2429,13 @@ void sonoa_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, 64 ); + sha512_4way_update( &ctx.sha512, vhash, 64 ); sha512_4way_close( &ctx.sha512, vhash ); rintrlv_4x64_4x32( vhashB, vhash, 512 ); haval256_5_4way_init( &ctx.haval ); - haval256_5_4way( &ctx.haval, vhashB, 64 ); + haval256_5_4way_update( &ctx.haval, vhashB, 64 ); haval256_5_4way_close( &ctx.haval, state ); } diff --git a/algo/x17/sonoa-gate.c b/algo/x17/sonoa-gate.c index 3687733..7dce68f 100644 --- a/algo/x17/sonoa-gate.c +++ b/algo/x17/sonoa-gate.c @@ -13,7 +13,7 @@ bool register_sonoa_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_sonoa; gate->hash = (void*)&sonoa_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c index 18eed41..83d4712 100644 --- a/algo/x17/x17-4way.c +++ b/algo/x17/x17-4way.c @@ -13,6 +13,11 @@ #include "algo/cubehash/cube-hash-2way.h" #include "algo/shavite/sph_shavite.h" #include "algo/shavite/shavite-hash-2way.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif #include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" @@ -28,15 +33,21 @@ union _x17_8way_context_overlay { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; cube_4way_context cube; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; sph_shavite512_context shavite; - simd_4way_context simd; hashState_echo echo; +#endif + 
simd_4way_context simd; hamsi512_8way_context hamsi; sph_fugue512_context fugue; shabal512_8way_context shabal; @@ -49,8 +60,8 @@ typedef union _x17_8way_context_overlay x17_8way_context_overlay; void x17_8way_hash( void *state, const void *input ) { uint64_t vhash[8*8] __attribute__ ((aligned (128))); - uint64_t vhash0[8*8] __attribute__ ((aligned (64))); - uint64_t vhash1[8*8] __attribute__ ((aligned (64))); + uint64_t vhashA[8*8] __attribute__ ((aligned (64))); + uint64_t vhashB[8*8] __attribute__ ((aligned (64))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); @@ -61,7 +72,7 @@ void x17_8way_hash( void *state, const void *input ) uint64_t hash7[8] __attribute__ ((aligned (64))); x17_8way_context_overlay ctx; - // 1 Blake parallel 4 way 64 bit + // 1 Blake blake512_8way_init( &ctx.blake ); blake512_8way_update( &ctx.blake, input, 80 ); blake512_8way_close( &ctx.blake, vhash ); @@ -71,11 +82,24 @@ void x17_8way_hash( void *state, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); - // Serialize + // 3 Groestl + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); - // 3 Groestl init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); init_groestl( &ctx.groestl, 64 ); @@ -93,10 +117,11 @@ void x17_8way_hash( void *state, const void *input ) init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 ); - // Parallellize intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + // 4 Skein parallel 4 way 64 bit skein512_8way_init( &ctx.skein ); skein512_8way_update( &ctx.skein, vhash, 64 ); @@ -112,24 +137,34 @@ void x17_8way_hash( void *state, const void *input ) keccak512_8way_update( &ctx.keccak, vhash, 64 ); keccak512_8way_close( &ctx.keccak, vhash ); - rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 ); + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); // 7 Luffa luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 ); + luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); luffa_4way_init( &ctx.luffa, 512 ); - luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); // 8 Cubehash cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); cube_4way_init( &ctx.cube, 512, 16, 32 ); - cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 ); - - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); // 9 Shavite + +#if defined(__VAES__) + + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + + dintrlv_4x128_512( hash0, hash1, 
hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); + sph_shavite512_init( &ctx.shavite ); sph_shavite512( &ctx.shavite, hash0, 64 ); sph_shavite512_close( &ctx.shavite, hash0 ); @@ -155,18 +190,33 @@ void x17_8way_hash( void *state, const void *input ) sph_shavite512( &ctx.shavite, hash7, 64 ); sph_shavite512_close( &ctx.shavite, hash7 ); + intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); + intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); + +#endif + // 10 Simd - intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 ); simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash ); - intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 ); + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); simd_4way_init( &ctx.simd, 512 ); - simd_4way_update_close( &ctx.simd, vhash, vhash, 512 ); - dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + // 11 Echo + +#if defined(__VAES__) + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); + dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); - // 11 Echo serial init_echo( &ctx.echo, 512 ); update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, 512 ); @@ -192,9 +242,11 @@ void x17_8way_hash( void *state, const void *input ) update_final_echo( &ctx.echo, (BitSequence *)hash7, (const BitSequence *) hash7, 512 ); - // 12 Hamsi parallel 4 way 64 bit intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + + // 12 Hamsi hamsi512_8way_init( &ctx.hamsi ); hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); @@ -229,7 +281,7 @@ void x17_8way_hash( void *state, const void *input ) sph_fugue512( &ctx.fugue, hash7, 64 ); sph_fugue512_close( &ctx.fugue, hash7 ); - // 14 Shabal, parallel 4 way 32 bit + // 14 Shabal, parallel 8 way 32 bit intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); @@ -275,10 +327,10 @@ void x17_8way_hash( void *state, const void *input ) sha512_8way_close( &ctx.sha512, vhash ); // 17 Haval parallel 32 bit - rintrlv_8x64_8x32( vhash0, vhash, 512 ); + rintrlv_8x64_8x32( vhashA, vhash, 512 ); haval256_5_8way_init( &ctx.haval ); - haval256_5_8way_update( &ctx.haval, vhash0, 64 ); + haval256_5_8way_update( &ctx.haval, vhashA, 64 ); haval256_5_8way_close( &ctx.haval, state ); } @@ -292,7 +344,7 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce, uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; - const uint32_t last_nonce = max_nonce - 8; + const uint32_t last_nonce = max_nonce - 8; __m512i *noncev = (__m512i*)vdata + 9; // aligned uint32_t n = first_nonce; const int thr_id = mythr->id; @@ -349,23 +401,23 @@ typedef union _x17_4way_context_overlay x17_4way_context_overlay; void x17_4way_hash( void *state, const void *input ) { + uint64_t vhash[8*4] __attribute__ ((aligned (128))); + uint64_t vhashA[8*4] __attribute__ ((aligned (64))); + uint64_t vhashB[8*4] __attribute__ ((aligned (64))); uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned 
(64))); uint64_t hash3[8] __attribute__ ((aligned (64))); - uint64_t vhash[8*4] __attribute__ ((aligned (64))); - uint64_t vhashA[8*4] __attribute__ ((aligned (64))); - uint64_t vhashB[8*4] __attribute__ ((aligned (64))); x17_4way_context_overlay ctx; // 1 Blake parallel 4 way 64 bit blake512_4way_init( &ctx.blake ); - blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); // 2 Bmw bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); // Serialize @@ -386,17 +438,17 @@ void x17_4way_hash( void *state, const void *input ) // 4 Skein parallel 4 way 64 bit skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); // 5 JH jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); // 6 Keccak keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); // 7 Luffa parallel 2 way 128 bit @@ -428,7 +480,6 @@ void x17_4way_hash( void *state, const void *input ) dintrlv_2x128_512( hash0, hash1, vhashA ); dintrlv_2x128_512( hash2, hash3, vhashB ); - // 11 Echo serial init_echo( &ctx.echo, 512 ); update_final_echo( &ctx.echo, (BitSequence *)hash0, @@ -447,7 +498,7 @@ void x17_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); @@ -470,7 +521,7 @@ void x17_4way_hash( void *state, const void *input ) intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, 64 ); + shabal512_4way_update( &ctx.shabal, vhash, 64 ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); @@ -493,27 +544,28 @@ void x17_4way_hash( void *state, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, 64 ); + sha512_4way_update( &ctx.sha512, vhash, 64 ); sha512_4way_close( &ctx.sha512, vhash ); // 17 Haval parallel 32 bit rintrlv_4x64_4x32( vhashB, vhash, 512 ); haval256_5_4way_init( &ctx.haval ); - haval256_5_4way( &ctx.haval, vhashB, 64 ); + haval256_5_4way_update( &ctx.haval, vhashB, 64 ); haval256_5_4way_close( &ctx.haval, state ); } int scanhash_x17_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t hash[4*16] __attribute__ ((aligned (64))); + uint32_t hash[16*4] __attribute__ ((aligned (128))); uint32_t vdata[24*4] __attribute__ ((aligned (64))); - uint32_t lane_hash[8] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); uint32_t *hash7 = &(hash[7<<2]); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce -4; __m256i *noncev = (__m256i*)vdata + 9; // aligned uint32_t n = first_nonce; const int thr_id = mythr->id; @@ -537,9 +589,9 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce, } } n += 4; - } 
while ( likely( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart ) ); + } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) ); - *hashes_done = n - first_nonce + 1; + *hashes_done = n - first_nonce; return 0; } diff --git a/algo/x17/x17-gate.c b/algo/x17/x17-gate.c index 73ce607..0bad7a2 100644 --- a/algo/x17/x17-gate.c +++ b/algo/x17/x17-gate.c @@ -12,7 +12,7 @@ bool register_x17_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x17; gate->hash = (void*)&x17_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; return true; }; diff --git a/algo/x17/xevan-4way.c b/algo/x17/xevan-4way.c index 28bc1c2..ced4a31 100644 --- a/algo/x17/xevan-4way.c +++ b/algo/x17/xevan-4way.c @@ -22,6 +22,11 @@ #include "algo/whirlpool/sph_whirlpool.h" #include "algo/sha/sha-hash-4way.h" #include "algo/haval/haval-hash-4way.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif #if defined(XEVAN_8WAY) @@ -29,13 +34,11 @@ union _xevan_8way_context_overlay { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; hashState_echo echo; hamsi512_8way_context hamsi; @@ -44,6 +47,15 @@ union _xevan_8way_context_overlay sph_whirlpool_context whirlpool; sha512_8way_context sha512; haval256_5_8way_context haval; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; +// echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; +// hashState_echo echo; +#endif } __attribute__ ((aligned (64))); typedef union _xevan_8way_context_overlay xevan_8way_context_overlay; @@ -72,6 +84,19 @@ void xevan_8way_hash( void *output, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, dataLen ); bmw512_8way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, dataLen<<3 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, dataLen<<3 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, dataLen<<3 ); + +#else + dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, dataLen<<3 ); @@ -103,6 +128,8 @@ void xevan_8way_hash( void *output, const void *input ) intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, dataLen<<3 ); +#endif + skein512_8way_init( &ctx.skein ); skein512_8way_update( &ctx.skein, vhash, dataLen ); skein512_8way_close( &ctx.skein, vhash ); @@ -127,6 +154,15 @@ void xevan_8way_hash( void *output, const void *input ) cube_4way_init( &ctx.cube, 512, 16, 32 ); cube_4way_update_close( &ctx.cube, vhashB, vhashB, dataLen ); +#if defined(__VAES__) + + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, dataLen ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, dataLen ); + +#else + dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 ); @@ -158,11 +194,26 @@ void 
xevan_8way_hash( void *output, const void *input ) intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, dataLen<<3 ); intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, dataLen<<3 ); +#endif + simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 ); simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 ); +/* +#if defined(__VAES__) + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, dataLen<<3 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, dataLen<<3 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, dataLen<<3 ); + +#else +*/ + dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 ); @@ -194,6 +245,8 @@ void xevan_8way_hash( void *output, const void *input ) intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, dataLen<<3 ); +//#endif + hamsi512_8way_init( &ctx.hamsi ); hamsi512_8way_update( &ctx.hamsi, vhash, dataLen ); hamsi512_8way_close( &ctx.hamsi, vhash ); @@ -286,6 +339,19 @@ void xevan_8way_hash( void *output, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, dataLen ); bmw512_8way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, dataLen<<3 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, dataLen<<3 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, dataLen<<3 ); + +#else + dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash, dataLen<<3 ); @@ -317,6 +383,8 @@ void xevan_8way_hash( void *output, const void *input ) intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, dataLen<<3 ); +#endif + skein512_8way_init( &ctx.skein ); skein512_8way_update( &ctx.skein, vhash, dataLen ); skein512_8way_close( &ctx.skein, vhash ); @@ -341,6 +409,15 @@ void xevan_8way_hash( void *output, const void *input ) cube_4way_init( &ctx.cube, 512, 16, 32 ); cube_4way_update_close( &ctx.cube, vhashB, vhashB, dataLen ); +#if defined(__VAES__) + + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, dataLen ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, dataLen ); + +#else + dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 ); @@ -372,11 +449,27 @@ void xevan_8way_hash( void *output, const void *input ) intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, dataLen<<3 ); intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, dataLen<<3 ); +#endif + simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 ); simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 ); + +/* +#if defined(__VAES__) + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, dataLen<<3 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, dataLen<<3 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, dataLen<<3 ); + +#else +*/ + dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 ); @@ -408,6 +501,8 @@ void 
xevan_8way_hash( void *output, const void *input ) intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, dataLen<<3 ); +//#endif + hamsi512_8way_init( &ctx.hamsi ); hamsi512_8way_update( &ctx.hamsi, vhash, dataLen ); hamsi512_8way_close( &ctx.hamsi, vhash ); @@ -569,12 +664,12 @@ void xevan_4way_hash( void *output, const void *input ) // parallel 4 way blake512_4way_init( &ctx.blake ); - blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input, 80 ); blake512_4way_close(&ctx.blake, vhash); memset( &vhash[8<<2], 0, 64<<2 ); bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, dataLen ); + bmw512_4way_update( &ctx.bmw, vhash, dataLen ); bmw512_4way_close( &ctx.bmw, vhash ); // Serial @@ -597,15 +692,15 @@ void xevan_4way_hash( void *output, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, dataLen ); + skein512_4way_update( &ctx.skein, vhash, dataLen ); skein512_4way_close( &ctx.skein, vhash ); jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, dataLen ); + jh512_4way_update( &ctx.jh, vhash, dataLen ); jh512_4way_close( &ctx.jh, vhash ); keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, dataLen ); + keccak512_4way_update( &ctx.keccak, vhash, dataLen ); keccak512_4way_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 ); @@ -649,7 +744,7 @@ void xevan_4way_hash( void *output, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, dataLen ); + hamsi512_4way_update( &ctx.hamsi, vhash, dataLen ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); @@ -671,7 +766,7 @@ void xevan_4way_hash( void *output, const void *input ) intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, dataLen ); + shabal512_4way_update( &ctx.shabal, vhash, dataLen ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); @@ -693,13 +788,13 @@ void xevan_4way_hash( void *output, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, dataLen ); + sha512_4way_update( &ctx.sha512, vhash, dataLen ); sha512_4way_close( &ctx.sha512, vhash ); rintrlv_4x64_4x32( vhashA, vhash, dataLen<<3 ); haval256_5_4way_init( &ctx.haval ); - haval256_5_4way( &ctx.haval, vhashA, dataLen ); + haval256_5_4way_update( &ctx.haval, vhashA, dataLen ); haval256_5_4way_close( &ctx.haval, vhashA ); rintrlv_4x32_4x64( vhash, vhashA, dataLen<<3 ); @@ -707,11 +802,11 @@ void xevan_4way_hash( void *output, const void *input ) memset( &vhash[ 4<<2 ], 0, (dataLen-32) << 2 ); blake512_4way_init( &ctx.blake ); - blake512_4way( &ctx.blake, vhash, dataLen ); + blake512_4way_update( &ctx.blake, vhash, dataLen ); blake512_4way_close(&ctx.blake, vhash); bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, dataLen ); + bmw512_4way_update( &ctx.bmw, vhash, dataLen ); bmw512_4way_close( &ctx.bmw, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); @@ -732,15 +827,15 @@ void xevan_4way_hash( void *output, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 
dataLen ); + skein512_4way_update( &ctx.skein, vhash, dataLen ); skein512_4way_close( &ctx.skein, vhash ); jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, dataLen ); + jh512_4way_update( &ctx.jh, vhash, dataLen ); jh512_4way_close( &ctx.jh, vhash ); keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, dataLen ); + keccak512_4way_update( &ctx.keccak, vhash, dataLen ); keccak512_4way_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 ); @@ -784,7 +879,7 @@ void xevan_4way_hash( void *output, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, dataLen ); + hamsi512_4way_update( &ctx.hamsi, vhash, dataLen ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); @@ -805,7 +900,7 @@ void xevan_4way_hash( void *output, const void *input ) intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, dataLen ); + shabal512_4way_update( &ctx.shabal, vhash, dataLen ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); @@ -826,13 +921,13 @@ void xevan_4way_hash( void *output, const void *input ) intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, dataLen ); + sha512_4way_update( &ctx.sha512, vhash, dataLen ); sha512_4way_close( &ctx.sha512, vhash ); rintrlv_4x64_4x32( vhashA, vhash, dataLen<<3 ); haval256_5_4way_init( &ctx.haval ); - haval256_5_4way( &ctx.haval, vhashA, dataLen ); + haval256_5_4way_update( &ctx.haval, vhashA, dataLen ); haval256_5_4way_close( &ctx.haval, output ); } diff --git a/algo/x17/xevan-gate.c b/algo/x17/xevan-gate.c index 8cb86a4..d3e65f7 100644 --- a/algo/x17/xevan-gate.c +++ b/algo/x17/xevan-gate.c @@ -13,7 +13,7 @@ bool register_xevan_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_xevan; gate->hash = (void*)&xevan_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT; opt_target_factor = 256.0; return true; }; diff --git a/algo/x22/x22i-4way.c b/algo/x22/x22i-4way.c index 0d28285..5d912b0 100644 --- a/algo/x22/x22i-4way.c +++ b/algo/x22/x22i-4way.c @@ -22,6 +22,11 @@ #include "algo/lyra2/lyra2.h" #include "algo/gost/sph_gost.h" #include "algo/swifftx/swifftx.h" +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif #if defined(X22I_8WAY) @@ -30,14 +35,11 @@ union _x22i_8way_ctx_overlay { blake512_8way_context blake; bmw512_8way_context bmw; - hashState_groestl groestl; - hashState_echo echo; skein512_8way_context skein; jh512_8way_context jh; keccak512_8way_context keccak; luffa_4way_context luffa; cube_4way_context cube; - sph_shavite512_context shavite; simd_4way_context simd; hamsi512_8way_context hamsi; sph_fugue512_context fugue; @@ -48,6 +50,15 @@ union _x22i_8way_ctx_overlay sph_tiger_context tiger; sph_gost512_context gost; sha256_8way_context sha256; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; + hashState_echo echo; +#endif }; typedef union _x22i_8way_ctx_overlay x22i_8way_ctx_overlay; @@ -84,6 
+95,19 @@ void x22i_8way_hash( void *output, const void *input ) bmw512_8way_update( &ctx.bmw, vhash, 64 ); bmw512_8way_close( &ctx.bmw, vhash ); +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, vhash ); @@ -115,6 +139,8 @@ void x22i_8way_hash( void *output, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + skein512_8way_init( &ctx.skein ); skein512_8way_update( &ctx.skein, vhash, 64 ); skein512_8way_close( &ctx.skein, vhash ); @@ -139,6 +165,15 @@ void x22i_8way_hash( void *output, const void *input ) cube_4way_init( &ctx.cube, 512, 16, 32 ); cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); +#if defined(__VAES__) + + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); @@ -170,11 +205,24 @@ void x22i_8way_hash( void *output, const void *input ) intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 ); intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 ); +#endif + simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); +#if defined(__VAES__) + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + + rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 ); + +#else + dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA ); dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB ); @@ -206,6 +254,8 @@ void x22i_8way_hash( void *output, const void *input ) intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7 ); +#endif + hamsi512_8way_init( &ctx.hamsi ); hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); hamsi512_8way_close( &ctx.hamsi, vhash ); @@ -439,10 +489,8 @@ int scanhash_x22i_8way( struct work* work, uint32_t max_nonce, return 0; } - #elif defined(X22I_4WAY) - union _x22i_4way_ctx_overlay { blake512_4way_context blake; @@ -477,8 +525,6 @@ void x22i_4way_hash( void *output, const void *input ) uint64_t vhash[8*4] __attribute__ ((aligned (64))); uint64_t vhashA[8*4] __attribute__ ((aligned (64))); uint64_t vhashB[8*4] __attribute__ ((aligned (64))); - -// unsigned char hash[64 * 4] __attribute__((aligned(64))) = {0}; unsigned char hashA0[64] __attribute__((aligned(64))) = {0}; unsigned char hashA1[64] __attribute__((aligned(32))) = {0}; unsigned char hashA2[64] __attribute__((aligned(32))) = {0}; @@ -486,13 +532,12 @@ void x22i_4way_hash( void *output, const void *input ) x22i_ctx_overlay ctx; blake512_4way_init( &ctx.blake ); - blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); 
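// A note on the pervasive renames in this patch: the bare context-named
// update call gains an explicit _update suffix. A minimal sketch of the
// resulting init/update/close convention, with the sizes used here:
//
//    bmw512_4way_context c;
//    bmw512_4way_init( &c );                 // reset state
//    bmw512_4way_update( &c, vhash, 64 );    // absorb 64 bytes per lane
//    bmw512_4way_close( &c, vhash );         // finalize interleaved digest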
bmw512_4way_close( &ctx.bmw, vhash ); - dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); init_groestl( &ctx.groestl, 64 ); @@ -511,15 +556,15 @@ void x22i_4way_hash( void *output, const void *input ) intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 ); @@ -560,13 +605,11 @@ void x22i_4way_hash( void *output, const void *input ) update_final_echo ( &ctx.echo, (BitSequence*)hash3, (const BitSequence*)hash3, 512 ); - intrlv_4x64_512( vhash, hash0, hash1, hash2, hash3 ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); - dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash ); sph_fugue512_init( &ctx.fugue ); @@ -585,9 +628,8 @@ void x22i_4way_hash( void *output, const void *input ) intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, 64 ); + shabal512_4way_update( &ctx.shabal, vhash, 64 ); shabal512_4way_close( &ctx.shabal, vhash ); - dintrlv_4x32_512( &hash0[8], &hash1[8], &hash2[8], &hash3[8], vhash ); sph_whirlpool_init( &ctx.whirlpool ); @@ -606,12 +648,10 @@ void x22i_4way_hash( void *output, const void *input ) intrlv_4x64_512( vhash, &hash0[16], &hash1[16], &hash2[16], &hash3[16] ); sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, 64 ); + sha512_4way_update( &ctx.sha512, vhash, 64 ); sha512_4way_close( &ctx.sha512, vhash ); - dintrlv_4x64_512( &hash0[24], &hash1[24], &hash2[24], &hash3[24], vhash ); -// InitializeSWIFFTX(); ComputeSingleSWIFFTX((unsigned char*)hash0, (unsigned char*)hashA0); ComputeSingleSWIFFTX((unsigned char*)hash1, (unsigned char*)hashA1); ComputeSingleSWIFFTX((unsigned char*)hash2, (unsigned char*)hashA2); @@ -622,9 +662,8 @@ void x22i_4way_hash( void *output, const void *input ) memset( vhash, 0, 64*4 ); haval256_5_4way_init( &ctx.haval ); - haval256_5_4way( &ctx.haval, vhashA, 64 ); + haval256_5_4way_update( &ctx.haval, vhashA, 64 ); haval256_5_4way_close( &ctx.haval, vhash ); - dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash ); memset( hashA0, 0, 64 ); @@ -675,10 +714,8 @@ void x22i_4way_hash( void *output, const void *input ) intrlv_4x32_512( vhash, hash0, hash1, hash2, hash3 ); sha256_4way_init( &ctx.sha256 ); - sha256_4way( &ctx.sha256, vhash, 64 ); + sha256_4way_update( &ctx.sha256, vhash, 64 ); sha256_4way_close( &ctx.sha256, output ); - -// memcpy(output, hash, 32); } diff --git a/algo/x22/x22i-gate.c b/algo/x22/x22i-gate.c index 893a0e3..78f23b4 100644 --- a/algo/x22/x22i-gate.c +++ b/algo/x22/x22i-gate.c @@ -1,19 +1,26 @@ #include "x22i-gate.h" +// Ryzen has poor AVX2 performance so use SHA over AVX2. +// Intel has AVX512 so use AVX512 over SHA. +// When Ryzen AVX2 improves use AVX2 over SHA. 
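// As a sketch of how the comment above plays out at compile time: the
// X22I_8WAY/X22I_4WAY macros come from x22i-gate.h, and the predicate is
// assumed here to mirror the X25X gating changed later in this patch:
//
//    #if defined(__AVX512F__) && defined(__AVX512VL__) && \
//        defined(__AVX512DQ__) && defined(__AVX512BW__)
//       #define X22I_8WAY 1   // Intel AVX512 path: SHA_OPT not advertised
//    #elif defined(__AVX2__) && defined(__AES__)
//       #define X22I_4WAY 1   // Ryzen AVX2 path: SHA_OPT retained
//    #endif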
+ bool register_x22i_algo( algo_gate_t* gate ) { #if defined (X22I_8WAY) gate->scanhash = (void*)&scanhash_x22i_8way; gate->hash = (void*)&x22i_8way_hash; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT + | AVX512_OPT | VAES_OPT; #elif defined (X22I_4WAY) gate->scanhash = (void*)&scanhash_x22i_4way; gate->hash = (void*)&x22i_4way_hash; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT + | AVX512_OPT | VAES_OPT; #else gate->scanhash = (void*)&scanhash_x22i; gate->hash = (void*)&x22i_hash; - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT + | AVX512_OPT | VAES_OPT; #endif return true; }; @@ -23,17 +30,20 @@ bool register_x25x_algo( algo_gate_t* gate ) #if defined (X25X_8WAY) gate->scanhash = (void*)&scanhash_x25x_8way; gate->hash = (void*)&x25x_8way_hash; -// gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT + | AVX512_OPT | VAES_OPT; #elif defined (X25X_4WAY) gate->scanhash = (void*)&scanhash_x25x_4way; gate->hash = (void*)&x25x_4way_hash; -// gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT + | AVX512_OPT | VAES_OPT; #else gate->scanhash = (void*)&scanhash_x25x; gate->hash = (void*)&x25x_hash; -// gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT + | AVX512_OPT | VAES_OPT; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT; +// gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT; return true; }; diff --git a/algo/x22/x22i-gate.h b/algo/x22/x22i-gate.h index 1dbb305..a03079f 100644 --- a/algo/x22/x22i-gate.h +++ b/algo/x22/x22i-gate.h @@ -34,13 +34,9 @@ int scanhash_x22i( struct work *work, uint32_t max_nonce, #endif - -// Big problems with x25x 8 way. It blows up just by increasing the -// buffer sizes and nothing else. It may have to do with accessing 2 dim arrays. 
- -//#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) -// #define X25X_8WAY 1 -#if defined(__AVX2__) && defined(__AES__) +#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) + #define X25X_8WAY 1 +#elif defined(__AVX2__) && defined(__AES__) #define X25X_4WAY 1 #endif diff --git a/algo/x22/x25x-4way.c b/algo/x22/x25x-4way.c index df8f312..fbbb1fd 100644 --- a/algo/x22/x25x-4way.c +++ b/algo/x22/x25x-4way.c @@ -1,7 +1,4 @@ #include "x22i-gate.h" - -#if defined(X25X_4WAY) - #include "algo/blake/blake-hash-4way.h" #include "algo/bmw/bmw-hash-4way.h" #include "algo/skein/skein-hash-4way.h" @@ -16,8 +13,11 @@ #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" +#include "algo/cubehash/cube-hash-2way.h" #include "algo/shavite/sph_shavite.h" #include "algo/simd/nist.h" +#include "algo/simd/simd-hash-2way.h" #include "algo/fugue/sph_fugue.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/tiger/sph_tiger.h" @@ -26,33 +26,11 @@ #include "algo/swifftx/swifftx.h" #include "algo/panama/sph_panama.h" #include "algo/lanehash/lane.h" - -union _x25x_4way_ctx_overlay -{ - blake512_4way_context blake; - bmw512_4way_context bmw; - hashState_groestl groestl; - hashState_echo echo; - skein512_4way_context skein; - jh512_4way_context jh; - keccak512_4way_context keccak; - hashState_luffa luffa; - cubehashParam cube; - sph_shavite512_context shavite; - hashState_sd simd; - hamsi512_4way_context hamsi; - sph_fugue512_context fugue; - shabal512_4way_context shabal; - sph_whirlpool_context whirlpool; - sha512_4way_context sha512; - haval256_5_4way_context haval; - sph_tiger_context tiger; - sph_gost512_context gost; - sha256_4way_context sha256; - sph_panama_context panama; - blake2s_4way_state blake2s; -}; -typedef union _x25x_4way_ctx_overlay x25x_4way_ctx_overlay; +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif void x25x_shuffle( void *hash ) { @@ -81,28 +59,587 @@ void x25x_shuffle( void *hash ) #undef X25X_SHUFFLE_ROUNDS } -void x25x_4way_hash( void *output, const void *input ) +#if defined(X25X_8WAY) + +union _x25x_8way_ctx_overlay { + blake512_8way_context blake; + bmw512_8way_context bmw; + skein512_8way_context skein; + jh512_8way_context jh; + keccak512_8way_context keccak; + luffa_4way_context luffa; + cube_4way_context cube; + simd_4way_context simd; + hamsi512_8way_context hamsi; + sph_fugue512_context fugue; + shabal512_8way_context shabal; + sph_whirlpool_context whirlpool; + sha512_8way_context sha512; + haval256_5_8way_context haval; + sph_tiger_context tiger; + sph_gost512_context gost; + sha256_8way_context sha256; + sph_panama_context panama; + blake2s_8way_state blake2s; +#if defined(__VAES__) + groestl512_4way_context groestl; + shavite512_4way_context shavite; + echo_4way_context echo; +#else + hashState_groestl groestl; + sph_shavite512_context shavite; + hashState_echo echo; +#endif +}; +typedef union _x25x_8way_ctx_overlay x25x_8way_ctx_overlay; + +void x25x_8way_hash( void *output, const void *input ) +{ + uint64_t vhash[8*8] __attribute__ ((aligned (128))); + unsigned char hash0[25][64] __attribute__((aligned(64))) = {0}; + unsigned char hash1[25][64] __attribute__((aligned(64))) = {0}; + unsigned char hash2[25][64] __attribute__((aligned(64))) = {0}; + 
unsigned char hash3[25][64] __attribute__((aligned(64))) = {0}; + unsigned char hash4[25][64] __attribute__((aligned(64))) = {0}; + unsigned char hash5[25][64] __attribute__((aligned(64))) = {0}; + unsigned char hash6[25][64] __attribute__((aligned(64))) = {0}; + unsigned char hash7[25][64] __attribute__((aligned(64))) = {0}; + unsigned char vhashX[24][64*8] __attribute__ ((aligned (64))); + uint64_t vhashA[8*8] __attribute__ ((aligned (64))); + uint64_t vhashB[8*8] __attribute__ ((aligned (64))); + x25x_8way_ctx_overlay ctx __attribute__ ((aligned (64))); + + blake512_8way_init( &ctx.blake ); + blake512_8way_update( &ctx.blake, input, 80 ); + blake512_8way_close( &ctx.blake, vhash ); + dintrlv_8x64_512( hash0[0], hash1[0], hash2[0], hash3[0], + hash4[0], hash5[0], hash6[0], hash7[0], vhash ); + + bmw512_8way_init( &ctx.bmw ); + bmw512_8way_update( &ctx.bmw, vhash, 64 ); + bmw512_8way_close( &ctx.bmw, vhash ); + dintrlv_8x64_512( hash0[1], hash1[1], hash2[1], hash3[1], + hash4[1], hash5[1], hash6[1], hash7[1], vhash ); + +#if defined(__VAES__) + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 ); + groestl512_4way_init( &ctx.groestl, 64 ); + groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 ); + dintrlv_4x128_512( hash0[2], hash1[2], hash2[2], hash3[2], vhashA ); + dintrlv_4x128_512( hash4[2], hash5[2], hash6[2], hash7[2], vhashB ); + + intrlv_8x64_512( vhash, hash0[2], hash1[2], hash2[2], hash3[2], + hash4[2], hash5[2], hash6[2], hash7[2] ); + +#else + + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0[2], + (const char*)hash0[1], 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash1[2], + (const char*)hash1[1], 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash2[2], + (const char*)hash2[1], 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash3[2], + (const char*)hash3[1], 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash4[2], + (const char*)hash4[1], 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash5[2], + (const char*)hash5[1], 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash6[2], + (const char*)hash6[1], 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash7[2], + (const char*)hash7[1], 512 ); + + intrlv_8x64_512( vhash, hash0[2], hash1[2], hash2[2], hash3[2], + hash4[2], hash5[2], hash6[2], hash7[2] ); + +#endif + + skein512_8way_init( &ctx.skein ); + skein512_8way_update( &ctx.skein, vhash, 64 ); + skein512_8way_close( &ctx.skein, vhash ); + dintrlv_8x64_512( hash0[3], hash1[3], hash2[3], hash3[3], + hash4[3], hash5[3], hash6[3], hash7[3], vhash ); + + jh512_8way_init( &ctx.jh ); + jh512_8way_update( &ctx.jh, vhash, 64 ); + jh512_8way_close( &ctx.jh, vhash ); + dintrlv_8x64_512( hash0[4], hash1[4], hash2[4], hash3[4], + hash4[4], hash5[4], hash6[4], hash7[4], vhash ); + + keccak512_8way_init( &ctx.keccak ); + keccak512_8way_update( &ctx.keccak, vhash, 64 ); + keccak512_8way_close( &ctx.keccak, vhash ); + dintrlv_8x64_512( hash0[5], hash1[5], hash2[5], hash3[5], + hash4[5], hash5[5], hash6[5], hash7[5], vhash ); + + rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 ); + + luffa_4way_init( &ctx.luffa, 512 ); + 
luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 ); + luffa_4way_init( &ctx.luffa, 512 ); + luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); + dintrlv_4x128_512( hash0[6], hash1[6], hash2[6], hash3[6], vhashA ); + dintrlv_4x128_512( hash4[6], hash5[6], hash6[6], hash7[6], vhashB ); + + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 ); + cube_4way_init( &ctx.cube, 512, 16, 32 ); + cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 ); + dintrlv_4x128_512( hash0[7], hash1[7], hash2[7], hash3[7], vhashA ); + dintrlv_4x128_512( hash4[7], hash5[7], hash6[7], hash7[7], vhashB ); + +#if defined(__VAES__) + + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 ); + shavite512_4way_init( &ctx.shavite ); + shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 ); + dintrlv_4x128_512( hash0[8], hash1[8], hash2[8], hash3[8], vhashA ); + dintrlv_4x128_512( hash4[8], hash5[8], hash6[8], hash7[8], vhashB ); + +#else + + sph_shavite512_init(&ctx.shavite); + sph_shavite512(&ctx.shavite, (const void*) hash0[7], 64); + sph_shavite512_close(&ctx.shavite, hash0[8]); + sph_shavite512_init(&ctx.shavite); + sph_shavite512(&ctx.shavite, (const void*) hash1[7], 64); + sph_shavite512_close(&ctx.shavite, hash1[8]); + sph_shavite512_init(&ctx.shavite); + sph_shavite512(&ctx.shavite, (const void*) hash2[7], 64); + sph_shavite512_close(&ctx.shavite, hash2[8]); + sph_shavite512_init(&ctx.shavite); + sph_shavite512(&ctx.shavite, (const void*) hash3[7], 64); + sph_shavite512_close(&ctx.shavite, hash3[8]); + sph_shavite512_init(&ctx.shavite); + sph_shavite512(&ctx.shavite, (const void*) hash4[7], 64); + sph_shavite512_close(&ctx.shavite, hash4[8]); + sph_shavite512_init(&ctx.shavite); + sph_shavite512(&ctx.shavite, (const void*) hash5[7], 64); + sph_shavite512_close(&ctx.shavite, hash5[8]); + sph_shavite512_init(&ctx.shavite); + sph_shavite512(&ctx.shavite, (const void*) hash6[7], 64); + sph_shavite512_close(&ctx.shavite, hash6[8]); + sph_shavite512_init(&ctx.shavite); + sph_shavite512(&ctx.shavite, (const void*) hash7[7], 64); + sph_shavite512_close(&ctx.shavite, hash7[8]); + intrlv_4x128_512( vhashA, hash0[8], hash1[8], hash2[8], hash3[8] ); + intrlv_4x128_512( vhashB, hash4[8], hash5[8], hash6[8], hash7[8] ); + +#endif + + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 ); + simd_4way_init( &ctx.simd, 512 ); + simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + dintrlv_4x128_512( hash0[9], hash1[9], hash2[9], hash3[9], vhashA ); + dintrlv_4x128_512( hash4[9], hash5[9], hash6[9], hash7[9], vhashB ); + +#if defined(__VAES__) + + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 ); + echo_4way_init( &ctx.echo, 512 ); + echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 ); + dintrlv_4x128_512( hash0[10], hash1[10], hash2[10], hash3[10], vhashA ); + dintrlv_4x128_512( hash4[10], hash5[10], hash6[10], hash7[10], vhashB ); + + intrlv_8x64_512( vhash, hash0[10], hash1[10], hash2[10], hash3[10], + hash4[10], hash5[10], hash6[10], hash7[10] ); + +#else + + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence*)hash0[10], + (const BitSequence*)hash0[9], 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence*)hash1[10], + (const BitSequence*)hash1[9], 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence*)hash2[10], + (const 
BitSequence*)hash2[9], 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence*)hash3[10], + (const BitSequence*)hash3[9], 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence*)hash4[10], + (const BitSequence*)hash4[9], 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence*)hash5[10], + (const BitSequence*)hash5[9], 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence*)hash6[10], + (const BitSequence*)hash6[9], 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence*)hash7[10], + (const BitSequence*)hash7[9], 512 ); + intrlv_8x64_512( vhash, hash0[10], hash1[10], hash2[10], hash3[10], + hash4[10], hash5[10], hash6[10], hash7[10] ); + +#endif + + hamsi512_8way_init( &ctx.hamsi ); + hamsi512_8way_update( &ctx.hamsi, vhash, 64 ); + hamsi512_8way_close( &ctx.hamsi, vhash ); + dintrlv_8x64_512( hash0[11], hash1[11], hash2[11], hash3[11], + hash4[11], hash5[11], hash6[11], hash7[11], vhash ); + + sph_fugue512_init(&ctx.fugue); + sph_fugue512(&ctx.fugue, (const void*) hash0[11], 64); + sph_fugue512_close(&ctx.fugue, hash0[12]); + sph_fugue512_init(&ctx.fugue); + sph_fugue512(&ctx.fugue, (const void*) hash1[11], 64); + sph_fugue512_close(&ctx.fugue, hash1[12]); + sph_fugue512_init(&ctx.fugue); + sph_fugue512(&ctx.fugue, (const void*) hash2[11], 64); + sph_fugue512_close(&ctx.fugue, hash2[12]); + sph_fugue512_init(&ctx.fugue); + sph_fugue512(&ctx.fugue, (const void*) hash3[11], 64); + sph_fugue512_close(&ctx.fugue, hash3[12]); + sph_fugue512_init(&ctx.fugue); + sph_fugue512(&ctx.fugue, (const void*) hash4[11], 64); + sph_fugue512_close(&ctx.fugue, hash4[12]); + sph_fugue512_init(&ctx.fugue); + sph_fugue512(&ctx.fugue, (const void*) hash5[11], 64); + sph_fugue512_close(&ctx.fugue, hash5[12]); + sph_fugue512_init(&ctx.fugue); + sph_fugue512(&ctx.fugue, (const void*) hash6[11], 64); + sph_fugue512_close(&ctx.fugue, hash6[12]); + sph_fugue512_init(&ctx.fugue); + sph_fugue512(&ctx.fugue, (const void*) hash7[11], 64); + sph_fugue512_close(&ctx.fugue, hash7[12]); + intrlv_8x32_512( vhash, hash0[12], hash1[12], hash2[12], hash3[12], + hash4[12], hash5[12], hash6[12], hash7[12] ); + + shabal512_8way_init( &ctx.shabal ); + shabal512_8way_update( &ctx.shabal, vhash, 64 ); + shabal512_8way_close( &ctx.shabal, vhash ); + dintrlv_8x32_512( hash0[13], hash1[13], hash2[13], hash3[13], + hash4[13], hash5[13], hash6[13], hash7[13], vhash ); + + sph_whirlpool_init(&ctx.whirlpool); + sph_whirlpool (&ctx.whirlpool, (const void*) hash0[13], 64); + sph_whirlpool_close(&ctx.whirlpool, hash0[14]); + sph_whirlpool_init(&ctx.whirlpool); + sph_whirlpool (&ctx.whirlpool, (const void*) hash1[13], 64); + sph_whirlpool_close(&ctx.whirlpool, hash1[14]); + sph_whirlpool_init(&ctx.whirlpool); + sph_whirlpool (&ctx.whirlpool, (const void*) hash2[13], 64); + sph_whirlpool_close(&ctx.whirlpool, hash2[14]); + sph_whirlpool_init(&ctx.whirlpool); + sph_whirlpool (&ctx.whirlpool, (const void*) hash3[13], 64); + sph_whirlpool_close(&ctx.whirlpool, hash3[14]); + sph_whirlpool_init(&ctx.whirlpool); + sph_whirlpool (&ctx.whirlpool, (const void*) hash4[13], 64); + sph_whirlpool_close(&ctx.whirlpool, hash4[14]); + sph_whirlpool_init(&ctx.whirlpool); + sph_whirlpool (&ctx.whirlpool, (const void*) hash5[13], 64); + sph_whirlpool_close(&ctx.whirlpool, hash5[14]); + sph_whirlpool_init(&ctx.whirlpool); + sph_whirlpool (&ctx.whirlpool, (const void*) hash6[13], 64); + sph_whirlpool_close(&ctx.whirlpool, 
hash6[14]); + sph_whirlpool_init(&ctx.whirlpool); + sph_whirlpool (&ctx.whirlpool, (const void*) hash7[13], 64); + sph_whirlpool_close(&ctx.whirlpool, hash7[14]); + intrlv_8x64_512( vhash, hash0[14], hash1[14], hash2[14], hash3[14], + hash4[14], hash5[14], hash6[14], hash7[14] ); + + sha512_8way_init( &ctx.sha512 ); + sha512_8way_update( &ctx.sha512, vhash, 64 ); + sha512_8way_close( &ctx.sha512, vhash ); + dintrlv_8x64_512( hash0[15], hash1[15], hash2[15], hash3[15], + hash4[15], hash5[15], hash6[15], hash7[15], vhash ); + + ComputeSingleSWIFFTX((unsigned char*)hash0[12], (unsigned char*)hash0[16]); + ComputeSingleSWIFFTX((unsigned char*)hash1[12], (unsigned char*)hash1[16]); + ComputeSingleSWIFFTX((unsigned char*)hash2[12], (unsigned char*)hash2[16]); + ComputeSingleSWIFFTX((unsigned char*)hash3[12], (unsigned char*)hash3[16]); + ComputeSingleSWIFFTX((unsigned char*)hash4[12], (unsigned char*)hash4[16]); + ComputeSingleSWIFFTX((unsigned char*)hash5[12], (unsigned char*)hash5[16]); + ComputeSingleSWIFFTX((unsigned char*)hash6[12], (unsigned char*)hash6[16]); + ComputeSingleSWIFFTX((unsigned char*)hash7[12], (unsigned char*)hash7[16]); + intrlv_8x32_512( vhashA, hash0[16], hash1[16], hash2[16], hash3[16], + hash4[16], hash5[16], hash6[16], hash7[16] ); + memset( vhash, 0, 64*8 ); + + haval256_5_8way_init( &ctx.haval ); + haval256_5_8way_update( &ctx.haval, vhashA, 64 ); + haval256_5_8way_close( &ctx.haval, vhash ); + dintrlv_8x32_512( hash0[17], hash1[17], hash2[17], hash3[17], + hash4[17], hash5[17], hash6[17], hash7[17], vhash ); + + sph_tiger_init(&ctx.tiger); + sph_tiger (&ctx.tiger, (const void*) hash0[17], 64); + sph_tiger_close(&ctx.tiger, (void*) hash0[18]); + sph_tiger_init(&ctx.tiger); + sph_tiger (&ctx.tiger, (const void*) hash1[17], 64); + sph_tiger_close(&ctx.tiger, (void*) hash1[18]); + sph_tiger_init(&ctx.tiger); + sph_tiger (&ctx.tiger, (const void*) hash2[17], 64); + sph_tiger_close(&ctx.tiger, (void*) hash2[18]); + sph_tiger_init(&ctx.tiger); + sph_tiger (&ctx.tiger, (const void*) hash3[17], 64); + sph_tiger_close(&ctx.tiger, (void*) hash3[18]); + sph_tiger_init(&ctx.tiger); + sph_tiger (&ctx.tiger, (const void*) hash4[17], 64); + sph_tiger_close(&ctx.tiger, (void*) hash4[18]); + sph_tiger_init(&ctx.tiger); + sph_tiger (&ctx.tiger, (const void*) hash5[17], 64); + sph_tiger_close(&ctx.tiger, (void*) hash5[18]); + sph_tiger_init(&ctx.tiger); + sph_tiger (&ctx.tiger, (const void*) hash6[17], 64); + sph_tiger_close(&ctx.tiger, (void*) hash6[18]); + sph_tiger_init(&ctx.tiger); + sph_tiger (&ctx.tiger, (const void*) hash7[17], 64); + sph_tiger_close(&ctx.tiger, (void*) hash7[18]); + + intrlv_2x256( vhash, hash0[18], hash1[18], 256 ); + LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash0[19], hash1[19], vhash, 256 ); + intrlv_2x256( vhash, hash2[18], hash3[18], 256 ); + LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash2[19], hash3[19], vhash, 256 ); + intrlv_2x256( vhash, hash4[18], hash5[18], 256 ); + LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash4[19], hash5[19], vhash, 256 ); + intrlv_2x256( vhash, hash6[18], hash7[18], 256 ); + LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 4, 4 ); + dintrlv_2x256( hash6[19], hash7[19], vhash, 256 ); + + sph_gost512_init(&ctx.gost); + sph_gost512 (&ctx.gost, (const void*) hash0[19], 64); + sph_gost512_close(&ctx.gost, (void*) hash0[20]); + sph_gost512_init(&ctx.gost); + sph_gost512 (&ctx.gost, (const void*) hash1[19], 64); + sph_gost512_close(&ctx.gost, (void*) hash1[20]); + 
sph_gost512_init(&ctx.gost); + sph_gost512 (&ctx.gost, (const void*) hash2[19], 64); + sph_gost512_close(&ctx.gost, (void*) hash2[20]); + sph_gost512_init(&ctx.gost); + sph_gost512 (&ctx.gost, (const void*) hash3[19], 64); + sph_gost512_close(&ctx.gost, (void*) hash3[20]); + sph_gost512_init(&ctx.gost); + sph_gost512 (&ctx.gost, (const void*) hash4[19], 64); + sph_gost512_close(&ctx.gost, (void*) hash4[20]); + sph_gost512_init(&ctx.gost); + sph_gost512 (&ctx.gost, (const void*) hash5[19], 64); + sph_gost512_close(&ctx.gost, (void*) hash5[20]); + sph_gost512_init(&ctx.gost); + sph_gost512 (&ctx.gost, (const void*) hash6[19], 64); + sph_gost512_close(&ctx.gost, (void*) hash6[20]); + sph_gost512_init(&ctx.gost); + sph_gost512 (&ctx.gost, (const void*) hash7[19], 64); + sph_gost512_close(&ctx.gost, (void*) hash7[20]); + intrlv_8x32_512( vhashA, hash0[20], hash1[20], hash2[20], hash3[20], + hash4[20], hash5[20], hash6[20], hash7[20] ); + + sha256_8way_init( &ctx.sha256 ); + sha256_8way_update( &ctx.sha256, vhashA, 64 ); + sha256_8way_close( &ctx.sha256, vhash ); + dintrlv_8x32_512( hash0[21], hash1[21], hash2[21], hash3[21], + hash4[21], hash5[21], hash6[21], hash7[21], vhash ); + + sph_panama_init(&ctx.panama); + sph_panama (&ctx.panama, (const void*) hash0[21], 64 ); + sph_panama_close(&ctx.panama, (void*) hash0[22]); + sph_panama_init(&ctx.panama); + sph_panama (&ctx.panama, (const void*) hash1[21], 64 ); + sph_panama_close(&ctx.panama, (void*) hash1[22]); + sph_panama_init(&ctx.panama); + sph_panama (&ctx.panama, (const void*) hash2[21], 64 ); + sph_panama_close(&ctx.panama, (void*) hash2[22]); + sph_panama_init(&ctx.panama); + sph_panama (&ctx.panama, (const void*) hash3[21], 64 ); + sph_panama_close(&ctx.panama, (void*) hash3[22]); + sph_panama_init(&ctx.panama); + sph_panama (&ctx.panama, (const void*) hash4[21], 64 ); + sph_panama_close(&ctx.panama, (void*) hash4[22]); + sph_panama_init(&ctx.panama); + sph_panama (&ctx.panama, (const void*) hash5[21], 64 ); + sph_panama_close(&ctx.panama, (void*) hash5[22]); + sph_panama_init(&ctx.panama); + sph_panama (&ctx.panama, (const void*) hash6[21], 64 ); + sph_panama_close(&ctx.panama, (void*) hash6[22]); + sph_panama_init(&ctx.panama); + sph_panama (&ctx.panama, (const void*) hash7[21], 64 ); + sph_panama_close(&ctx.panama, (void*) hash7[22]); + + laneHash(512, (const BitSequence*)hash0[22], 512, (BitSequence*)hash0[23]); + laneHash(512, (const BitSequence*)hash1[22], 512, (BitSequence*)hash1[23]); + laneHash(512, (const BitSequence*)hash2[22], 512, (BitSequence*)hash2[23]); + laneHash(512, (const BitSequence*)hash3[22], 512, (BitSequence*)hash3[23]); + laneHash(512, (const BitSequence*)hash4[22], 512, (BitSequence*)hash4[23]); + laneHash(512, (const BitSequence*)hash5[22], 512, (BitSequence*)hash5[23]); + laneHash(512, (const BitSequence*)hash6[22], 512, (BitSequence*)hash6[23]); + laneHash(512, (const BitSequence*)hash7[22], 512, (BitSequence*)hash7[23]); + + x25x_shuffle( hash0 ); + x25x_shuffle( hash1 ); + x25x_shuffle( hash2 ); + x25x_shuffle( hash3 ); + x25x_shuffle( hash4 ); + x25x_shuffle( hash5 ); + x25x_shuffle( hash6 ); + x25x_shuffle( hash7 ); + + intrlv_8x32_512( vhashX[ 0], hash0[ 0], hash1[ 0], hash2[ 0], hash3[ 0], + hash4[ 0], hash5[ 0], hash6[ 0], hash7[ 0] ); + intrlv_8x32_512( vhashX[ 1], hash0[ 1], hash1[ 1], hash2[ 1], hash3[ 1], + hash4[ 1], hash5[ 1], hash6[ 1], hash7[ 1] ); + intrlv_8x32_512( vhashX[ 2], hash0[ 2], hash1[ 2], hash2[ 2], hash3[ 2], + hash4[ 2], hash5[ 2], hash6[ 2], hash7[ 2] ); + intrlv_8x32_512( 
vhashX[ 3], hash0[ 3], hash1[ 3], hash2[ 3], hash3[ 3], + hash4[ 3], hash5[ 3], hash6[ 3], hash7[ 3] ); + intrlv_8x32_512( vhashX[ 4], hash0[ 4], hash1[ 4], hash2[ 4], hash3[ 4], + hash4[ 4], hash5[ 4], hash6[ 4], hash7[ 4] ); + intrlv_8x32_512( vhashX[ 5], hash0[ 5], hash1[ 5], hash2[ 5], hash3[ 5], + hash4[ 5], hash5[ 5], hash6[ 5], hash7[ 5] ); + intrlv_8x32_512( vhashX[ 6], hash0[ 6], hash1[ 6], hash2[ 6], hash3[ 6], + hash4[ 6], hash5[ 6], hash6[ 6], hash7[ 6] ); + intrlv_8x32_512( vhashX[ 7], hash0[ 7], hash1[ 7], hash2[ 7], hash3[ 7], + hash4[ 7], hash5[ 7], hash6[ 7], hash7[ 7] ); + intrlv_8x32_512( vhashX[ 8], hash0[ 8], hash1[ 8], hash2[ 8], hash3[ 8], + hash4[ 8], hash5[ 8], hash6[ 8], hash7[ 8] ); + intrlv_8x32_512( vhashX[ 9], hash0[ 9], hash1[ 9], hash2[ 9], hash3[ 9], + hash4[ 9], hash5[ 9], hash6[ 9], hash7[ 9] ); + intrlv_8x32_512( vhashX[10], hash0[10], hash1[10], hash2[10], hash3[10], + hash4[10], hash5[10], hash6[10], hash7[10] ); + intrlv_8x32_512( vhashX[11], hash0[11], hash1[11], hash2[11], hash3[11], + hash4[11], hash5[11], hash6[11], hash7[11] ); + intrlv_8x32_512( vhashX[12], hash0[12], hash1[12], hash2[12], hash3[12], + hash4[12], hash5[12], hash6[12], hash7[12] ); + intrlv_8x32_512( vhashX[13], hash0[13], hash1[13], hash2[13], hash3[13], + hash4[13], hash5[13], hash6[13], hash7[13] ); + intrlv_8x32_512( vhashX[14], hash0[14], hash1[14], hash2[14], hash3[14], + hash4[14], hash5[14], hash6[14], hash7[14] ); + intrlv_8x32_512( vhashX[15], hash0[15], hash1[15], hash2[15], hash3[15], + hash4[15], hash5[15], hash6[15], hash7[15] ); + intrlv_8x32_512( vhashX[16], hash0[16], hash1[16], hash2[16], hash3[16], + hash4[16], hash5[16], hash6[16], hash7[16] ); + intrlv_8x32_512( vhashX[17], hash0[17], hash1[17], hash2[17], hash3[17], + hash4[17], hash5[17], hash6[17], hash7[17] ); + intrlv_8x32_512( vhashX[18], hash0[18], hash1[18], hash2[18], hash3[18], + hash4[18], hash5[18], hash6[18], hash7[18] ); + intrlv_8x32_512( vhashX[19], hash0[19], hash1[19], hash2[19], hash3[19], + hash4[19], hash5[19], hash6[19], hash7[19] ); + intrlv_8x32_512( vhashX[20], hash0[20], hash1[20], hash2[20], hash3[20], + hash4[20], hash5[20], hash6[20], hash7[20] ); + intrlv_8x32_512( vhashX[21], hash0[21], hash1[21], hash2[21], hash3[21], + hash4[21], hash5[21], hash6[21], hash7[21] ); + intrlv_8x32_512( vhashX[22], hash0[22], hash1[22], hash2[22], hash3[22], + hash4[22], hash5[22], hash6[22], hash7[22] ); + intrlv_8x32_512( vhashX[23], hash0[23], hash1[23], hash2[23], hash3[23], + hash4[23], hash5[23], hash6[23], hash7[23] ); + + blake2s_8way_init( &ctx.blake2s, 32 ); + blake2s_8way_full_blocks( &ctx.blake2s, output, vhashX, 64*24 ); +} + +int scanhash_x25x_8way( struct work* work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t hash[8*16] __attribute__ ((aligned (128))); + uint32_t vdata[24*8] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); + uint32_t *hash7 = &(hash[7<<3]); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + __m512i *noncev = (__m512i*)vdata + 9; // aligned + uint32_t n = first_nonce; + const uint32_t last_nonce = max_nonce - 4; + const int thr_id = mythr->id; + const uint32_t Htarg = ptarget[7]; + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x08ff; + + InitializeSWIFFTX(); + + mm512_bswap32_intrlv80_8x64( vdata, pdata ); + do + { + *noncev = mm512_intrlv_blend_32( mm512_bswap_32( + _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0, + n+3, 
0, n+2, 0, n+1, 0, n, 0 ) ), *noncev ); + x25x_8way_hash( hash, vdata ); + + for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg ) + { + extr_lane_8x32( lane_hash, hash, lane, 256 ); + if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) + { + pdata[19] = n + lane; + submit_lane_solution( work, lane_hash, mythr, lane ); + } + } + n += 8; + } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) ); + + *hashes_done = n - first_nonce; + return 0; +} + +#elif defined(X25X_4WAY) + +union _x25x_4way_ctx_overlay +{ + blake512_4way_context blake; + bmw512_4way_context bmw; + hashState_groestl groestl; + hashState_echo echo; + skein512_4way_context skein; + jh512_4way_context jh; + keccak512_4way_context keccak; + hashState_luffa luffa; + cubehashParam cube; + sph_shavite512_context shavite; + hashState_sd simd; + hamsi512_4way_context hamsi; + sph_fugue512_context fugue; + shabal512_4way_context shabal; + sph_whirlpool_context whirlpool; + sha512_4way_context sha512; + haval256_5_4way_context haval; + sph_tiger_context tiger; + sph_gost512_context gost; + sha256_4way_context sha256; + sph_panama_context panama; + blake2s_4way_state blake2s; +}; +typedef union _x25x_4way_ctx_overlay x25x_4way_ctx_overlay; + +void x25x_4way_hash( void *output, const void *input ) +{ + uint64_t vhash[8*4] __attribute__ ((aligned (128))); unsigned char hash0[25][64] __attribute__((aligned(64))) = {0}; unsigned char hash1[25][64] __attribute__((aligned(64))) = {0}; unsigned char hash2[25][64] __attribute__((aligned(64))) = {0}; unsigned char hash3[25][64] __attribute__((aligned(64))) = {0}; - uint64_t vhash[8*4] __attribute__ ((aligned (64))); -// Doubling the size of vhashX breaks everything. It may have something -// to do with accessing arrays: vhashX vs vhashX[0] vs &vhash[0]. -// Changing notation did seem to allow the larger buffer but still resulted -// in problems further along. 
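// (For reference on the notation mentioned above: given
//     unsigned char vhashX[24][64*4];
// all three expressions name the same address but have different types,
// so pointer arithmetic scales differently:
//     vhashX      decays to unsigned char (*)[256]; +1 advances 256 bytes
//     vhashX[0]   decays to unsigned char *;        +1 advances 1 byte
//     &vhashX[0]  is unsigned char (*)[256], equal in value to vhashX
// Any of them converts silently through a void* parameter, which can hide
// a wrong stride from the compiler.)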
-// unsigned char vhashX[24][64*8] __attribute__ ((aligned (64))); unsigned char vhashX[24][64*4] __attribute__ ((aligned (64))); x25x_4way_ctx_overlay ctx __attribute__ ((aligned (64))); blake512_4way_init( &ctx.blake ); - blake512_4way( &ctx.blake, input, 80 ); + blake512_4way_update( &ctx.blake, input, 80 ); blake512_4way_close( &ctx.blake, vhash ); dintrlv_4x64_512( hash0[0], hash1[0], hash2[0], hash3[0], vhash ); bmw512_4way_init( &ctx.bmw ); - bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_update( &ctx.bmw, vhash, 64 ); bmw512_4way_close( &ctx.bmw, vhash ); dintrlv_4x64_512( hash0[1], hash1[1], hash2[1], hash3[1], vhash ); @@ -118,24 +655,24 @@ void x25x_4way_hash( void *output, const void *input ) init_groestl( &ctx.groestl, 64 ); update_and_final_groestl( &ctx.groestl, (char*)hash3[2], (const char*)hash3[1], 512 ); - + intrlv_4x64_512( vhash, hash0[2], hash1[2], hash2[2], hash3[2] ); skein512_4way_init( &ctx.skein ); - skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_update( &ctx.skein, vhash, 64 ); skein512_4way_close( &ctx.skein, vhash ); dintrlv_4x64_512( hash0[3], hash1[3], hash2[3], hash3[3], vhash ); jh512_4way_init( &ctx.jh ); - jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_update( &ctx.jh, vhash, 64 ); jh512_4way_close( &ctx.jh, vhash ); dintrlv_4x64_512( hash0[4], hash1[4], hash2[4], hash3[4], vhash ); keccak512_4way_init( &ctx.keccak ); - keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_update( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); dintrlv_4x64_512( hash0[5], hash1[5], hash2[5], hash3[5], vhash ); - + init_luffa( &ctx.luffa, 512 ); update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0[6], (const BitSequence*)hash0[5], 64 ); @@ -162,9 +699,9 @@ void x25x_4way_hash( void *output, const void *input ) cubehashUpdateDigest( &ctx.cube, (byte*) hash3[7], (const byte*)hash3[6], 64 ); - sph_shavite512_init(&ctx.shavite); - sph_shavite512(&ctx.shavite, (const void*) hash0[7], 64); - sph_shavite512_close(&ctx.shavite, hash0[8]); + sph_shavite512_init(&ctx.shavite); + sph_shavite512(&ctx.shavite, (const void*) hash0[7], 64); + sph_shavite512_close(&ctx.shavite, hash0[8]); sph_shavite512_init(&ctx.shavite); sph_shavite512(&ctx.shavite, (const void*) hash1[7], 64); sph_shavite512_close(&ctx.shavite, hash1[8]); @@ -204,13 +741,13 @@ void x25x_4way_hash( void *output, const void *input ) intrlv_4x64_512( vhash, hash0[10], hash1[10], hash2[10], hash3[10] ); hamsi512_4way_init( &ctx.hamsi ); - hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_update( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); dintrlv_4x64_512( hash0[11], hash1[11], hash2[11], hash3[11], vhash ); - sph_fugue512_init(&ctx.fugue); - sph_fugue512(&ctx.fugue, (const void*) hash0[11], 64); - sph_fugue512_close(&ctx.fugue, hash0[12]); + sph_fugue512_init(&ctx.fugue); + sph_fugue512(&ctx.fugue, (const void*) hash0[11], 64); + sph_fugue512_close(&ctx.fugue, hash0[12]); sph_fugue512_init(&ctx.fugue); sph_fugue512(&ctx.fugue, (const void*) hash1[11], 64); sph_fugue512_close(&ctx.fugue, hash1[12]); @@ -224,13 +761,13 @@ void x25x_4way_hash( void *output, const void *input ) intrlv_4x32_512( vhash, hash0[12], hash1[12], hash2[12], hash3[12] ); shabal512_4way_init( &ctx.shabal ); - shabal512_4way( &ctx.shabal, vhash, 64 ); + shabal512_4way_update( &ctx.shabal, vhash, 64 ); shabal512_4way_close( &ctx.shabal, vhash ); dintrlv_4x32_512( hash0[13], hash1[13], hash2[13], hash3[13], vhash ); - sph_whirlpool_init(&ctx.whirlpool); - sph_whirlpool 
(&ctx.whirlpool, (const void*) hash0[13], 64); - sph_whirlpool_close(&ctx.whirlpool, hash0[14]); + sph_whirlpool_init(&ctx.whirlpool); + sph_whirlpool (&ctx.whirlpool, (const void*) hash0[13], 64); + sph_whirlpool_close(&ctx.whirlpool, hash0[14]); sph_whirlpool_init(&ctx.whirlpool); sph_whirlpool (&ctx.whirlpool, (const void*) hash1[13], 64); sph_whirlpool_close(&ctx.whirlpool, hash1[14]); @@ -244,11 +781,10 @@ void x25x_4way_hash( void *output, const void *input ) intrlv_4x64_512( vhash, hash0[14], hash1[14], hash2[14], hash3[14] ); sha512_4way_init( &ctx.sha512 ); - sha512_4way( &ctx.sha512, vhash, 64 ); + sha512_4way_update( &ctx.sha512, vhash, 64 ); sha512_4way_close( &ctx.sha512, vhash ); dintrlv_4x64_512( hash0[15], hash1[15], hash2[15], hash3[15], vhash ); - ComputeSingleSWIFFTX((unsigned char*)hash0[12], (unsigned char*)hash0[16]); ComputeSingleSWIFFTX((unsigned char*)hash1[12], (unsigned char*)hash1[16]); ComputeSingleSWIFFTX((unsigned char*)hash2[12], (unsigned char*)hash2[16]); @@ -257,15 +793,15 @@ void x25x_4way_hash( void *output, const void *input ) intrlv_4x32_512( vhashX[0], hash0[16], hash1[16], hash2[16], hash3[16] ); memset( vhash, 0, 64*4 ); - + haval256_5_4way_init( &ctx.haval ); - haval256_5_4way( &ctx.haval, vhashX[0], 64 ); + haval256_5_4way_update( &ctx.haval, vhashX[0], 64 ); haval256_5_4way_close( &ctx.haval, vhash ); dintrlv_4x32_512( hash0[17], hash1[17], hash2[17], hash3[17], vhash ); - sph_tiger_init(&ctx.tiger); - sph_tiger (&ctx.tiger, (const void*) hash0[17], 64); - sph_tiger_close(&ctx.tiger, (void*) hash0[18]); + sph_tiger_init(&ctx.tiger); + sph_tiger (&ctx.tiger, (const void*) hash0[17], 64); + sph_tiger_close(&ctx.tiger, (void*) hash0[18]); sph_tiger_init(&ctx.tiger); sph_tiger (&ctx.tiger, (const void*) hash1[17], 64); sph_tiger_close(&ctx.tiger, (void*) hash1[18]); @@ -276,7 +812,7 @@ void x25x_4way_hash( void *output, const void *input ) sph_tiger (&ctx.tiger, (const void*) hash3[17], 64); sph_tiger_close(&ctx.tiger, (void*) hash3[18]); - LYRA2RE( (void*)hash0[19], 32, (const void*)hash0[18], 32, + LYRA2RE( (void*)hash0[19], 32, (const void*)hash0[18], 32, (const void*)hash0[18], 32, 1, 4, 4 ); LYRA2RE( (void*)hash1[19], 32, (const void*)hash1[18], 32, (const void*)hash1[18], 32, 1, 4, 4 ); @@ -285,9 +821,9 @@ void x25x_4way_hash( void *output, const void *input ) LYRA2RE( (void*)hash3[19], 32, (const void*)hash3[18], 32, (const void*)hash3[18], 32, 1, 4, 4 ); - sph_gost512_init(&ctx.gost); - sph_gost512 (&ctx.gost, (const void*) hash0[19], 64); - sph_gost512_close(&ctx.gost, (void*) hash0[20]); + sph_gost512_init(&ctx.gost); + sph_gost512 (&ctx.gost, (const void*) hash0[19], 64); + sph_gost512_close(&ctx.gost, (void*) hash0[20]); sph_gost512_init(&ctx.gost); sph_gost512 (&ctx.gost, (const void*) hash1[19], 64); sph_gost512_close(&ctx.gost, (void*) hash1[20]); @@ -302,7 +838,7 @@ void x25x_4way_hash( void *output, const void *input ) memset( vhash, 0, 64*4 ); sha256_4way_init( &ctx.sha256 ); - sha256_4way( &ctx.sha256, vhashX[0], 64 ); + sha256_4way_update( &ctx.sha256, vhashX[0], 64 ); sha256_4way_close( &ctx.sha256, vhash ); dintrlv_4x32_512( hash0[21], hash1[21], hash2[21], hash3[21], vhash ); @@ -356,20 +892,12 @@ void x25x_4way_hash( void *output, const void *input ) blake2s_4way_init( &ctx.blake2s, 32 ); blake2s_4way_full_blocks( &ctx.blake2s, output, vhashX, 64*24 ); -/* - dintrlv_4x32( hash0[24], hash1[24], hash2[24], hash3[24], vhash, 256 ); - - memcpy(output, hash0[24], 32); - memcpy(output+32, hash1[24], 32); - memcpy(output+64, 
hash2[24], 32);
-   memcpy(output+96, hash3[24], 32);
-*/
 }

 int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t hash[4*16] __attribute__ ((aligned (64)));
+   uint32_t hash[16*4] __attribute__ ((aligned (128)));
    uint32_t vdata[24*4] __attribute__ ((aligned (64)));
    uint32_t lane_hash[8] __attribute__ ((aligned (32)));
    uint32_t *hash7 = &(hash[7<<2]);
@@ -401,17 +929,8 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
       {
          pdata[19] = n + lane;
          submit_lane_solution( work, lane_hash, mythr, lane );
-      }
+      }
    }
-/*
-   for ( int i = 0; i < 4; i++ )
-   if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
-   if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
-   {
-      pdata[19] = n+i;
-      submit_lane_solution( work, hash+(i<<3), mythr, i );
-   }
-*/
    n += 4;
 } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
diff --git a/build-allarch.sh b/build-allarch.sh
index 6e8fd89..ea69c63 100755
--- a/build-allarch.sh
+++ b/build-allarch.sh
@@ -4,7 +4,7 @@
 # during development. However the information contained may provide compilation
 # tips to users.

-rm cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen
+rm cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen 2> /dev/null

 make distclean || echo clean
 rm -f config.status
diff --git a/build-avx2.sh b/build-avx2.sh
new file mode 100755
index 0000000..7a12473
--- /dev/null
+++ b/build-avx2.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+#if [ "$OS" = "Windows_NT" ]; then
+#   ./mingw64.sh
+#   exit 0
+#fi
+
+# Linux build
+
+make distclean || echo clean
+
+rm -f config.status
+./autogen.sh || echo done
+
+# Ubuntu 10.04 (gcc 4.4)
+# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
+
+# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
+#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
+
+#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr
+CFLAGS="-O3 -march=haswell -maes -Wall" ./configure --with-curl
+#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
+
+make -j 4
+
+strip -s cpuminer
diff --git a/clean-all.sh b/clean-all.sh
new file mode 100755
index 0000000..48a233e
--- /dev/null
+++ b/clean-all.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+#
+# make distclean and rm all the targeted executables,
+# suppressing errors for any that don't exist.
+
+rm cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen 2> /dev/null
+
+rm cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-aes-avx.exe cpuminer-aes-sse42.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-sse2.exe cpuminer-zen.exe 2> /dev/null
+
+make distclean > /dev/null
diff --git a/configure b/configure
index 3a5454b..76f55d4 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.6.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.11.0.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=

 # Identity of this package.
PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.10.6' -PACKAGE_STRING='cpuminer-opt 3.10.6' +PACKAGE_VERSION='3.11.0' +PACKAGE_STRING='cpuminer-opt 3.11.0' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.10.6 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.11.0 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.10.6:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.11.0:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.10.6 +cpuminer-opt configure 3.11.0 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.10.6, which was +It was created by cpuminer-opt $as_me 3.11.0, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.10.6' + VERSION='3.11.0' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.10.6, which was +This file was extended by cpuminer-opt $as_me 3.11.0, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.10.6 +cpuminer-opt config.status 3.11.0 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 613de42..c633926 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.10.6]) +AC_INIT([cpuminer-opt], [3.11.0]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index d1fb2d6..764b928 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -1028,7 +1028,7 @@ static int share_result( int result, struct work *null_work, { // empty queue, it must have overflowed and stats were lost for a share. pthread_mutex_unlock( &stats_lock ); - applog(LOG_WARNING,"Pending shares overflow, stats for share are lost."); + applog(LOG_WARNING,"Share stats not available."); } // calculate latency and share time. 
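The intrlv.h hunk below adds an 8x32 -> 4x128 re-interleave helper. As a
usage sketch only — the helper, context, and function names are taken from
this patch, but this exact call sequence is illustrative and not part of
it — it bridges an 8-lane 32-bit-interleaved stage into the two 4-lane
128-bit-interleaved buffers the VAES 4-way stages consume:

   uint32_t vhash[16*8]  __attribute__ ((aligned (64)));  // 8 lanes, 32-bit interleaved
   uint64_t vhashA[8*4]  __attribute__ ((aligned (64)));  // lanes 0-3, 128-bit interleaved
   uint64_t vhashB[8*4]  __attribute__ ((aligned (64)));  // lanes 4-7, 128-bit interleaved

   shabal512_8way_close( &ctx.shabal, vhash );        // 8x32 output, 512 bits per lane
   rintrlv_8x32_4x128( vhashA, vhashB, vhash, 512 );
   shavite512_4way_init( &ctx.shavite );
   shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
   shavite512_4way_init( &ctx.shavite );
   shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 );

Converting the interleaved data directly avoids a full deinterleave into
eight scalar buffers followed by a reinterleave, which is exactly the
pattern the VAES branches earlier in this patch eliminate.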
diff --git a/simd-utils/intrlv.h b/simd-utils/intrlv.h
index 961c57d..64b8d7b 100644
--- a/simd-utils/intrlv.h
+++ b/simd-utils/intrlv.h
@@ -2283,7 +2283,46 @@ static inline void rintrlv_8x32_8x64( void *dst,
    d[63] = _mm_unpackhi_epi32( s[61], s[63] );
 }

+// 8x32 -> 4x128
+// 16 bytes per lane
+#define RLEAVE_8X32_4X128( i ) \
+do { \
+  uint32_t *d0 = (uint32_t*)dst0 + (i); \
+  uint32_t *d1 = (uint32_t*)dst1 + (i); \
+  const uint32_t *s = (const uint32_t*)src + ((i)<<1); \
+   d0[ 0] = s[ 0];   d1[ 0] = s[ 4]; \
+   d0[ 1] = s[ 8];   d1[ 1] = s[12]; \
+   d0[ 2] = s[16];   d1[ 2] = s[20]; \
+   d0[ 3] = s[24];   d1[ 3] = s[28]; \
+\
+   d0[ 4] = s[ 1];   d1[ 4] = s[ 5]; \
+   d0[ 5] = s[ 9];   d1[ 5] = s[13]; \
+   d0[ 6] = s[17];   d1[ 6] = s[21]; \
+   d0[ 7] = s[25];   d1[ 7] = s[29]; \
+\
+   d0[ 8] = s[ 2];   d1[ 8] = s[ 6]; \
+   d0[ 9] = s[10];   d1[ 9] = s[14]; \
+   d0[10] = s[18];   d1[10] = s[22]; \
+   d0[11] = s[26];   d1[11] = s[30]; \
+\
+   d0[12] = s[ 3];   d1[12] = s[ 7]; \
+   d0[13] = s[11];   d1[13] = s[15]; \
+   d0[14] = s[19];   d1[14] = s[23]; \
+   d0[15] = s[27];   d1[15] = s[31]; \
+} while(0)
+
+static inline void rintrlv_8x32_4x128( void *dst0, void *dst1,
+                                       const void *src, const int bit_len )
+{
+   RLEAVE_8X32_4X128(  0 );   RLEAVE_8X32_4X128( 16 );
+   if ( bit_len <= 256 ) return;
+   RLEAVE_8X32_4X128( 32 );   RLEAVE_8X32_4X128( 48 );
+   if ( bit_len <= 512 ) return;
+   RLEAVE_8X32_4X128( 64 );   RLEAVE_8X32_4X128( 80 );
+   RLEAVE_8X32_4X128( 96 );   RLEAVE_8X32_4X128( 112 );
+}
+#undef RLEAVE_8X32_4X128

 /*
 #define RLEAVE_4x32_4x64(i) do \
diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h
index 3bdde9b..ce9218c 100644
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -42,17 +42,18 @@ static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2,
    return mm256_concat_128( hi, lo );
 }

-// Broadcast 128 bits in pairs of 64 bit integer constants {i1. i0} to all
-// 128 bit lanes.
-#define m256_const2_64( i1, i0 ) \
-   _mm256_permute4x64_epi64( _mm256_castsi128_si256( \
-                             m128_const_64( i1, i0 ) ), 0x44 )
-
 // Equivalent of set1, broadcast integer constant to all elements.
-#define m256_const1_64( i ) _mm256_broadcastq_epi64( mm128_mov64_128( i ) )
-#define m256_const1_32( i ) _mm256_broadcastd_epi32( mm128_mov32_128( i ) )
-#define m256_const1_16( i ) _mm256_broadcastw_epi16( mm128_mov32_128( i ) )
-#define m256_const1_8 ( i ) _mm256_broadcastb_epi8 ( mm128_mov32_128( i ) )
+#define m256_const1_128( v ) _mm256_broadcastsi128_si256( v )
+#define m256_const1_64( i )  _mm256_broadcastq_epi64( mm128_mov64_128( i ) )
+#define m256_const1_32( i )  _mm256_broadcastd_epi32( mm128_mov32_128( i ) )
+#define m256_const1_16( i )  _mm256_broadcastw_epi16( mm128_mov32_128( i ) )
+#define m256_const1_8( i )   _mm256_broadcastb_epi8 ( mm128_mov32_128( i ) )
+
+#define m256_const2_64( i1, i0 ) \
+   m256_const1_128( m128_const_64( i1, i0 ) )
+
+#define m256_const2_32( i1, i0 ) \
+   m256_const1_64( ( (uint64_t)(i1) << 32 ) | ( (uint64_t)(i0) & 0xffffffff ) )

 //
diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h
index 5a87979..03118fa 100644
--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -38,6 +38,36 @@
 // shuffle_epi8 shuffles across entire 512 bits. Shuffle usually
 // doesn't cross 128 bit lane boundaries but is consistent with AVX2
 // where shuffle_epi8 spans the entire vector.
+//
+// There are 2 areas where overhead is a concern: constants and
+// permutations.
+//
+// Constants need to be composed at run time by assembling individual
+// elements, which is very expensive. The cost is proportional to the
+// number of elements, therefore use the largest element size possible,
+// even by merging smaller values.
+//
+// Constants with repeating patterns can be optimized, with smaller, more
+// frequently repeated patterns being more efficient.
+//
+// Some specific constants can be very efficient. Zero is very efficient,
+// 1 and -1 slightly less so.
+//
+// If an expensive constant is to be reused in the same function it should
+// be declared as a local variable, defined once and reused.
+//
+// Permutations can be very expensive if they use a vector control index,
+// even if the permutation itself is quite efficient.
+// The index is essentially a constant with all the baggage that brings.
+// The same rules apply: if an index is to be reused it should be defined
+// as a local. This applies specifically to bswap operations.
+//
+// Additionally, permutations using smaller vectors can be more efficient
+// if the permutation doesn't cross lane boundaries, typically 128 bits,
+// and the smaller vector can use an imm control.
+//
+// If the permutation doesn't cross lane boundaries a shuffle instruction
+// can be used with an imm control instead of a permute.

 //////////////////////////////////////////////////////////////
 //
@@ -106,12 +136,14 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
 #define m512_const1_16( i ) _mm512_broadcastw_epi16( mm128_mov32_128( i ) )
 #define m512_const1_8( i )  _mm512_broadcastb_epi8 ( mm128_mov32_128( i ) )

+#define m512_const2_128( v1, v0 ) \
+   m512_const1_256( _mm512_inserti64x2( _mm512_castsi128_si512( v0 ), v1, 1 ) )
+
 #define m512_const2_64( i1, i0 ) \
    m512_const1_128( m128_const_64( i1, i0 ) )

 #define m512_const2_32( i1, i0 ) \
-   m512_const1_64( ( ( ( (uint64_t)(i1) << 32 ) ) \
-                   | ( (uint64_t)(i0) & 0xffffffff ) ) )
+   m512_const1_64( ( (uint64_t)(i1) << 32 ) | ( (uint64_t)(i0) & 0xffffffff ) )

 // { m128_1, m128_1, m128_0, m128_0 }
 #define m512_const_2x128( v1, v0 ) \
diff --git a/winbuild-cross.sh b/winbuild-cross.sh
index f7f8968..c2d7720 100755
--- a/winbuild-cross.sh
+++ b/winbuild-cross.sh
@@ -27,6 +27,9 @@ ln -s $LOCAL_LIB/gmp/gmp.h ./gmp.h
 #sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac

 # make release directory and copy selected DLLs.
+
+rm -rf release > /dev/null
+
 mkdir release
 cp README.txt release/
 cp README.md release/
@@ -35,10 +38,6 @@ cp $MINGW_LIB/zlib1.dll release/
 cp $MINGW_LIB/libwinpthread-1.dll release/
 cp $GCC_MINGW_LIB/libstdc++-6.dll release/
 cp $GCC_MINGW_LIB/libgcc_s_seh-1.dll release/
-#cp /usr/x86_64-w64-mingw32/lib/zlib1.dll release/
-#cp /usr/x86_64-w64-mingw32/lib/libwinpthread-1.dll release/
-#cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libstdc++-6.dll release/
-#cp /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32/libgcc_s_seh-1.dll release/
 cp $LOCAL_LIB/openssl/libcrypto-1_1-x64.dll release/
 cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/
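Finally, a minimal sketch of the "compose expensive constants once" advice
from the simd-512.h comments above. m512_const_64 and the byte-swap index
pattern are from this tree; the function itself is hypothetical:

   static inline void bswap32_512( __m512i *dst, const __m512i *src, int n )
   {
      // Build the vpshufb index once per call; composing it inside the loop
      // would repeat the element-assembly cost on every iteration.
      const __m512i bswap_idx = m512_const_64(
                        0x3c3d3e3f38393a3b, 0x3435363730313233,
                        0x2c2d2e2f28292a2b, 0x2425262720212223,
                        0x1c1d1e1f18191a1b, 0x1415161710111213,
                        0x0c0d0e0f08090a0b, 0x0405060700010203 );
      for ( int i = 0; i < n; i++ )
         dst[i] = _mm512_shuffle_epi8( src[i], bswap_idx );  // 32-bit bswap
   }

Hoisting matters because the composed index is itself a run-time constant:
the shuffle is one instruction, but rebuilding its control vector per
iteration would dominate it.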