Compare commits


3 Commits

Author     SHA1        Message  Date
Jay D Dee  3572cb53c4  v3.11.0  2020-01-02 23:54:08 -05:00
Jay D Dee  241bc26767  v3.10.6  2019-12-25 01:26:26 -05:00
Jay D Dee  c65b0ff7a6  v3.10.5  2019-12-21 13:19:29 -05:00
150 changed files with 18331 additions and 2729 deletions

View File

@@ -1,12 +1,14 @@
Requirements:
1. Requirements:
---------------
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
supported.
64 bit Linux operating system. Apple is not supported.
Building on linux prerequisites:
2. Building on linux prerequisites:
-----------------------------------
It is assumed users know how to install packages on their system and
are able to compile standard source packages. This is basic Linux and
@@ -20,41 +22,74 @@ http://askubuntu.com/questions/457526/how-to-install-cpuminer-in-ubuntu
Install any additional dependencies needed by cpuminer-opt. The list below
are some of the ones that may not be in the default install and need to
be installed manually. There may be others, read the error messages they
will give a clue as to the missing package.
be installed manually. There may be others, read the compiler error messages,
they will give a clue as to the missing package.
The following command should install everything you need on Debian based
distributions such as Ubuntu. Fedora and other distributions may have similar
but different package names.
sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev
$ sudo apt-get install build-essential automake libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev git
SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
openssl 1.1.0e or higher. Add one of the following, depending on the
compiler version, to CFLAGS:
"-march=native" or "-march=znver1" or "-msha".
openssl 1.1.0e or higher. Add one of the following to CFLAGS for SHA
support depending on your CPU and compiler version:
"-march=native" is always the best choice
"-march=znver1" for Ryzen 1000 & 2000 series, znver2 for 3000.
"-msha" Add SHA to other tuning options
Additional instructions for static compilation can be found here:
https://lxadm.com/Static_compilation_of_cpuminer
Static builds should only be considered in a homogeneous HW and SW environment.
Local builds will always have the best performance and compatibility.
Extract cpuminer source.
3. Download cpuminer-opt
------------------------
tar xvzf cpuminer-opt-x.y.z.tar.gz
cd cpuminer-opt-x.y.z
Download the source code for the latest release from the official repository.
Run ./build.sh to build on Linux or execute the following commands.
https://github.com/JayDDee/cpuminer-opt/releases
./autogen.sh
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
make
Extract the source code.
Start mining.
$ tar xvzf cpuminer-opt-x.y.z.tar.gz
Alternatively it can be cloned from git.
$ git clone https://github.com/JayDDee/cpuminer-opt.git
4. Build cpuminer-opt
---------------------
It is recommended to build with default options, this will usually
produce the best results.
$ ./build.sh
or execute the following commands:
$ ./autogen.sh
$ CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
$ make -j n
n is the number of threads.
5. Start mining.
----------------
$ ./cpuminer -a algo -o url -u username -p password
./cpuminer -a algo -o url -u username -p password
Windows
-------
See also INSTALL_WINDOWS
The following procedure is obsolete and uses an old compiler.
Precompiled Windows binaries are built on a Linux host using Mingw
with a more recent compiler than the following Windows hosted procedure.

View File

@@ -84,10 +84,14 @@ cpuminer_SOURCES = \
algo/cubehash/cubehash_sse2.c\
algo/cubehash/cube-hash-2way.c \
algo/echo/sph_echo.c \
algo/echo/echo-hash-4way.c \
algo/echo/aes_ni/hash.c\
algo/gost/sph_gost.c \
algo/groestl/groestl-gate.c \
algo/groestl/groestl512-hash-4way.c \
algo/groestl/sph_groestl.c \
algo/groestl/groestl.c \
algo/groestl/groestl-4way.c \
algo/groestl/myrgr-gate.c \
algo/groestl/myrgr-4way.c \
algo/groestl/myr-groestl.c \
@@ -124,6 +128,8 @@ cpuminer_SOURCES = \
algo/luffa/luffa-hash-2way.c \
algo/lyra2/lyra2.c \
algo/lyra2/sponge.c \
algo/lyra2/sponge-2way.c \
algo/lyra2/lyra2-hash-2way.c \
algo/lyra2/lyra2-gate.c \
algo/lyra2/lyra2rev2.c \
algo/lyra2/lyra2rev2-4way.c \
@@ -185,6 +191,7 @@ cpuminer_SOURCES = \
algo/shavite/sph_shavite.c \
algo/shavite/sph-shavite-aesni.c \
algo/shavite/shavite-hash-2way.c \
algo/shavite/shavite-hash-4way.c \
algo/shavite/shavite.c \
algo/simd/sph_simd.c \
algo/simd/nist.c \

View File

@@ -1,13 +1,17 @@
cpuminer-opt is a console program run from the command line using the
keyboard, not the mouse.
See also README.md for list of supported algorithms,
Security warning
----------------
Miner programs are often flagged as malware by antivirus programs. This is
a false positive, they are flagged simply because they are cryptocurrency
miners. The source code is open for anyone to inspect. If you don't trust
the software, don't use it.
usually a false positive, they are flagged simply because they are
cryptocurrency miners. However, some malware has been spread using the
cover that miners are known to be subject to false positives. Always be on
alert. The source code of cpuminer-opt is open for anyone to inspect.
If you don't trust the software, don't download it.
The cryptographic hashing code has been taken from trusted sources but has been
modified for speed at the expense of accepted security practices. This
@@ -17,7 +21,7 @@ required.
Compile Instructions
--------------------
See INSTALL_LINUX or INSTALL_WINDOWS fror compile instruuctions
See INSTALL_LINUX or INSTALL_WINDOWS for compile instructions
Requirements
------------
@@ -31,7 +35,44 @@ not supported. FreeBSD YMMV.
Change Log
----------
v3.10.2
v3.11.0
Fixed x25x AVX512 lane 4 invalid shares.
AVX512 for hex, phi2.
VAES optimization for Intel Icelake CPUs for most algos recently optimized
with AVX512, source code only.
v3.10.7
AVX512 for x25x, lbry, x13bcd (bcd).
v3.10.6
Added support for SSL stratum: stratum+tcps://
Added job id reporting again, but leaner, suppressed with --quiet.
AVX512 for x21s, x22i, lyra2z, allium.
Fixed share overflow warnings mining lbry with Ryzen (SHA).
v3.10.5
AVX512 for x17, sonoa, xevan, hmq1725, lyra2rev3, lyra2rev2.
Faster hmq1725 AVX2.
v3.10.4
AVX512 for x16r, x16rv2, x16rt, x16s, x16rt-veil (veil).
v3.10.3
AVX512 for x12, x13, x14, x15.
Fixed x12 AVX2 invalid shares.
v3.10.2
AVX512 added for bmw512, c11, phi1612 (phi), qubit, skunk, x11, x11gost (sib).
Fixed c11 AVX2 invalid shares.

View File

@@ -317,6 +317,7 @@ const char* const algo_alias_map[][2] =
{ "argon2d-crds", "argon2d250" },
{ "argon2d-dyn", "argon2d500" },
{ "argon2d-uis", "argon2d4096" },
{ "bcd", "x13bcd" },
{ "bitcore", "timetravel10" },
{ "bitzeny", "yescryptr8" },
{ "blake256r8", "blakecoin" },

View File

@@ -184,10 +184,10 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
#include <immintrin.h>
#define rotr32 mm256_swap32_64
#define rotr24 mm256_ror3x8_64
#define rotr16 mm256_ror1x16_64
#define rotr63( x ) mm256_rol_64( x, 1 )
#define rotr32( x ) mm256_ror_64( x, 32 )
#define rotr24( x ) mm256_ror_64( x, 24 )
#define rotr16( x ) mm256_ror_64( x, 16 )
#define rotr63( x ) mm256_rol_64( x, 1 )
//#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
//#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
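
For reference, each of the rotation macros fixed above applies the same 64-bit rotation to every lane of the vector; rotr63 is expressed as a left rotate by 1, which is equivalent. A plain scalar sketch of the operation:

#include <stdint.h>
#include <stdio.h>

/* Right-rotate a 64-bit word by n bits (0 < n < 64).
   rotr64( x, 63 ) is the same as rotating left by 1. */
static uint64_t rotr64( uint64_t x, unsigned n )
{
    return ( x >> n ) | ( x << ( 64 - n ) );
}

int main(void)
{
    uint64_t x = 0x0123456789abcdefULL;
    printf( "rotr32: %016llx\n", (unsigned long long)rotr64( x, 32 ) );
    printf( "rotr24: %016llx\n", (unsigned long long)rotr64( x, 24 ) );
    printf( "rotr63: %016llx\n", (unsigned long long)rotr64( x, 63 ) );
    return 0;
}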

View File

@@ -70,19 +70,22 @@ typedef struct {
// Default 14 rounds
typedef blake_4way_small_context blake256_4way_context;
void blake256_4way_init(void *ctx);
void blake256_4way(void *ctx, const void *data, size_t len);
void blake256_4way_update(void *ctx, const void *data, size_t len);
#define blake256_4way blake256_4way_update
void blake256_4way_close(void *ctx, void *dst);
// 14 rounds, blake, decred
typedef blake_4way_small_context blake256r14_4way_context;
void blake256r14_4way_init(void *cc);
void blake256r14_4way(void *cc, const void *data, size_t len);
void blake256r14_4way_update(void *cc, const void *data, size_t len);
#define blake256r14_4way blake256r14_4way_update
void blake256r14_4way_close(void *cc, void *dst);
// 8 rounds, blakecoin, vanilla
typedef blake_4way_small_context blake256r8_4way_context;
void blake256r8_4way_init(void *cc);
void blake256r8_4way(void *cc, const void *data, size_t len);
void blake256r8_4way_update(void *cc, const void *data, size_t len);
#define blake256r8_4way blake256r8_4way_update
void blake256r8_4way_close(void *cc, void *dst);
#ifdef __AVX2__
@@ -100,19 +103,21 @@ typedef struct {
// Default 14 rounds
typedef blake_8way_small_context blake256_8way_context;
void blake256_8way_init(void *cc);
void blake256_8way(void *cc, const void *data, size_t len);
void blake256_8way_update(void *cc, const void *data, size_t len);
//#define blake256_8way blake256_8way_update
void blake256_8way_close(void *cc, void *dst);
// 14 rounds, blake, decred
typedef blake_8way_small_context blake256r14_8way_context;
void blake256r14_8way_init(void *cc);
void blake256r14_8way(void *cc, const void *data, size_t len);
void blake256r14_8way_update(void *cc, const void *data, size_t len);
void blake256r14_8way_close(void *cc, void *dst);
// 8 rounds, blakecoin, vanilla
typedef blake_8way_small_context blake256r8_8way_context;
void blake256r8_8way_init(void *cc);
void blake256r8_8way(void *cc, const void *data, size_t len);
void blake256r8_8way_update(void *cc, const void *data, size_t len);
#define blake256r8_8way blake256r8_8way_update
void blake256r8_8way_close(void *cc, void *dst);
// Blake-512 4 way
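
The pattern above renames the update functions to *_update while a #define keeps the old names compiling, so existing call sites do not have to change in the same commit. A tiny standalone sketch of that mechanism (the stub below is illustrative, not the real hash function):

#include <stdio.h>
#include <stddef.h>

/* Stand-in for the renamed function, just to show the alias mechanics. */
static void blake256_4way_update( void *ctx, const void *data, size_t len )
{
    (void)ctx; (void)data;
    printf( "update called with %zu bytes\n", len );
}
#define blake256_4way blake256_4way_update

int main(void)
{
    char buf[80] = {0};
    blake256_4way( NULL, buf, sizeof buf );  /* expands to blake256_4way_update */
    return 0;
}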

View File

@@ -634,7 +634,7 @@ do { \
m256_const1_64( 0x082EFA98082EFA98 ) ); \
VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
m256_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
shuf_bswap32 = m256_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203, \
shuf_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
M0 = _mm256_shuffle_epi8( * buf , shuf_bswap32 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
@@ -842,7 +842,8 @@ blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
}
static void
blake32_4way( blake_4way_small_context *ctx, const void *data, size_t len )
blake32_4way( blake_4way_small_context *ctx, const void *data,
size_t len )
{
__m128i *buf = (__m128i*)ctx->buf;
size_t bptr = ctx->ptr<<2;
@@ -1184,7 +1185,7 @@ blake256_16way_update(void *cc, const void *data, size_t len)
}
void
blake256_16way_close_update(void *cc, void *dst)
blake256_16way_close(void *cc, void *dst)
{
blake32_16way_close(cc, 0, 0, dst, 8);
}
@@ -1237,7 +1238,7 @@ blake256_4way_init(void *ctx)
}
void
blake256_4way(void *ctx, const void *data, size_t len)
blake256_4way_update(void *ctx, const void *data, size_t len)
{
blake32_4way(ctx, data, len);
}
@@ -1259,7 +1260,7 @@ blake256_8way_init(void *cc)
}
void
blake256_8way(void *cc, const void *data, size_t len)
blake256_8way_update(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}
@@ -1279,7 +1280,7 @@ void blake256r14_4way_init(void *cc)
}
void
blake256r14_4way(void *cc, const void *data, size_t len)
blake256r14_4way_update(void *cc, const void *data, size_t len)
{
blake32_4way(cc, data, len);
}
@@ -1298,7 +1299,7 @@ void blake256r14_8way_init(void *cc)
}
void
blake256r14_8way(void *cc, const void *data, size_t len)
blake256r14_8way_update(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}
@@ -1318,7 +1319,7 @@ void blake256r8_4way_init(void *cc)
}
void
blake256r8_4way(void *cc, const void *data, size_t len)
blake256r8_4way_update(void *cc, const void *data, size_t len)
{
blake32_4way(cc, data, len);
}
@@ -1337,7 +1338,7 @@ void blake256r8_8way_init(void *cc)
}
void
blake256r8_8way(void *cc, const void *data, size_t len)
blake256r8_8way_update(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}
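
The shuf_bswap32 constant corrected above is a byte-shuffle mask whose effect is to reverse the byte order of every 32-bit word in the vector (big-endian message words into native order). The scalar equivalent, as a sketch:

#include <stdint.h>
#include <stdio.h>

/* Reverse the byte order of one 32-bit word, the per-lane effect of the
   shuf_bswap32 shuffle mask. */
static uint32_t bswap32( uint32_t x )
{
    return ( x >> 24 ) | ( ( x >> 8 ) & 0x0000ff00u )
         | ( ( x << 8 ) & 0x00ff0000u ) | ( x << 24 );
}

int main(void)
{
    printf( "%08x\n", bswap32( 0x01020304u ) );  /* prints 04030201 */
    return 0;
}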

View File

@@ -463,6 +463,38 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen )
return 0;
}
// Update and final when inlen is a multiple of 64 bytes
int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
const void *input, uint64_t inlen )
{
__m256i *in = (__m256i*)input;
__m256i *buf = (__m256i*)S->buf;
while( inlen > BLAKE2S_BLOCKBYTES )
{
memcpy_256( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
S->buflen = BLAKE2S_BLOCKBYTES;
inlen -= BLAKE2S_BLOCKBYTES;
S->t[0] += BLAKE2S_BLOCKBYTES;
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
blake2s_8way_compress( S, buf );
S->buflen = 0;
in += ( BLAKE2S_BLOCKBYTES >> 2 );
}
// last block
memcpy_256( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
S->buflen = BLAKE2S_BLOCKBYTES;
S->t[0] += S->buflen;
S->t[1] += ( S->t[0] < S->buflen );
if ( S->last_node ) S->f[1] = ~0U;
S->f[0] = ~0U;
blake2s_8way_compress( S, buf );
for ( int i = 0; i < 8; ++i )
casti_m256i( out, i ) = S->h[ i ];
return 0;
}
#endif // __AVX2__
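
The t[0]/t[1] update in blake2s_8way_full_blocks keeps the byte counter in two 32-bit words and carries into the high word whenever the low word wraps. The same carry idiom in scalar form (sketch only):

#include <stdint.h>
#include <stdio.h>

/* Add 'inc' to a 64-bit counter stored as two 32-bit words, carrying into
   the high word when the low word wraps -- the S->t[0] / S->t[1] pattern. */
static void counter_add( uint32_t t[2], uint32_t inc )
{
    t[0] += inc;
    t[1] += ( t[0] < inc );   /* wrapped iff the new low word is below inc */
}

int main(void)
{
    uint32_t t[2] = { 0xffffffc0u, 0 };
    counter_add( t, 64 );     /* low word wraps to 0, carry goes to t[1] */
    printf( "t1=%u t0=%u\n", t[1], t[0] );
    return 0;
}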

View File

@@ -14,7 +14,6 @@
#ifndef __BLAKE2S_HASH_4WAY_H__
#define __BLAKE2S_HASH_4WAY_H__ 1
//#if defined(__SSE4_2__)
#if defined(__SSE2__)
#include "simd-utils.h"
@@ -95,8 +94,8 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen );
int blake2s_8way_update( blake2s_8way_state *S, const void *in,
uint64_t inlen );
int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
//int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
// const void *input, uint64_t inlen );
int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
const void *input, uint64_t inlen );
#endif
@@ -132,6 +131,6 @@ int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen );
}
#endif
#endif // __SSE4_2__
#endif // __SSE2__
#endif

View File

@@ -874,6 +874,57 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
#define DH1L( m, sl, sr, a, b, c ) \
_mm256_add_epi32( \
_mm256_xor_si256( M[m], \
_mm256_xor_si256( _mm256_slli_epi32( xh, sl ), \
_mm256_srli_epi32( qt[a], sr ) ) ), \
_mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
#define DH1R( m, sl, sr, a, b, c ) \
_mm256_add_epi32( \
_mm256_xor_si256( M[m], \
_mm256_xor_si256( _mm256_srli_epi32( xh, sl ), \
_mm256_slli_epi32( qt[a], sr ) ) ), \
_mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
#define DH2L( m, rl, sl, h, a, b, c ) \
_mm256_add_epi32( _mm256_add_epi32( \
mm256_rol_32( dH[h], rl ), \
_mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
_mm256_xor_si256( _mm256_slli_epi32( xl, sl ), \
_mm256_xor_si256( qt[b], qt[c] ) ) );
#define DH2R( m, rl, sr, h, a, b, c ) \
_mm256_add_epi32( _mm256_add_epi32( \
mm256_rol_32( dH[h], rl ), \
_mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
_mm256_xor_si256( _mm256_srli_epi32( xl, sr ), \
_mm256_xor_si256( qt[b], qt[c] ) ) );
dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 );
dH[ 2] = DH1R( 2, 5, 5, 18, 26, 2 );
dH[ 3] = DH1R( 3, 1, 5, 19, 27, 3 );
dH[ 4] = DH1R( 4, 3, 0, 20, 28, 4 );
dH[ 5] = DH1L( 5, 6, 6, 21, 29, 5 );
dH[ 6] = DH1R( 6, 4, 6, 22, 30, 6 );
dH[ 7] = DH1R( 7, 11, 2, 23, 31, 7 );
dH[ 8] = DH2L( 8, 9, 8, 4, 24, 23, 8 );
dH[ 9] = DH2R( 9, 10, 6, 5, 25, 16, 9 );
dH[10] = DH2L( 10, 11, 6, 6, 26, 17, 10 );
dH[11] = DH2L( 11, 12, 4, 7, 27, 18, 11 );
dH[12] = DH2R( 12, 13, 3, 0, 28, 19, 12 );
dH[13] = DH2R( 13, 14, 4, 1, 29, 20, 13 );
dH[14] = DH2R( 14, 15, 7, 2, 30, 21, 14 );
dH[15] = DH2R( 15, 16, 2, 3, 31, 22, 15 );
#undef DH1L
#undef DH1R
#undef DH2L
#undef DH2R
/*
dH[ 0] = _mm256_add_epi32(
_mm256_xor_si256( M[0],
_mm256_xor_si256( _mm256_slli_epi32( xh, 5 ),
@@ -954,6 +1005,7 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
_mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
_mm256_xor_si256( _mm256_srli_epi32( xl, 2 ),
_mm256_xor_si256( qt[22], qt[15] ) ) );
*/
}
static const __m256i final_s8[16] =

View File

@@ -41,7 +41,6 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash7[ lane<<1 ] < Htarg ) )
// if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
@@ -66,7 +65,7 @@ void bmw512hash_4way(void *state, const void *input)
{
bmw512_4way_context ctx;
bmw512_4way_init( &ctx );
bmw512_4way( &ctx, input, 80 );
bmw512_4way_update( &ctx, input, 80 );
bmw512_4way_close( &ctx, state );
}

View File

@@ -64,10 +64,10 @@ static void transform_4way( cube_4way_context *sp )
x1 = _mm512_xor_si512( x1, x5 );
x2 = _mm512_xor_si512( x2, x6 );
x3 = _mm512_xor_si512( x3, x7 );
x4 = mm512_swap64_128( x4 );
x5 = mm512_swap64_128( x5 );
x6 = mm512_swap64_128( x6 );
x7 = mm512_swap64_128( x7 );
x4 = mm512_swap128_64( x4 );
x5 = mm512_swap128_64( x5 );
x6 = mm512_swap128_64( x6 );
x7 = mm512_swap128_64( x7 );
x4 = _mm512_add_epi32( x0, x4 );
x5 = _mm512_add_epi32( x1, x5 );
x6 = _mm512_add_epi32( x2, x6 );
@@ -82,10 +82,10 @@ static void transform_4way( cube_4way_context *sp )
x1 = _mm512_xor_si512( x1, x5 );
x2 = _mm512_xor_si512( x2, x6 );
x3 = _mm512_xor_si512( x3, x7 );
x4 = mm512_swap32_64( x4 );
x5 = mm512_swap32_64( x5 );
x6 = mm512_swap32_64( x6 );
x7 = mm512_swap32_64( x7 );
x4 = mm512_swap64_32( x4 );
x5 = mm512_swap64_32( x5 );
x6 = mm512_swap64_32( x6 );
x7 = mm512_swap64_32( x7 );
}
_mm512_store_si512( (__m512i*)sp->h, x0 );
@@ -239,10 +239,10 @@ static void transform_2way( cube_2way_context *sp )
x1 = _mm256_xor_si256( x1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = mm256_swap64_128( x4 );
x5 = mm256_swap64_128( x5 );
x6 = mm256_swap64_128( x6 );
x7 = mm256_swap64_128( x7 );
x4 = mm256_swap128_64( x4 );
x5 = mm256_swap128_64( x5 );
x6 = mm256_swap128_64( x6 );
x7 = mm256_swap128_64( x7 );
x4 = _mm256_add_epi32( x0, x4 );
x5 = _mm256_add_epi32( x1, x5 );
x6 = _mm256_add_epi32( x2, x6 );
@@ -257,10 +257,10 @@ static void transform_2way( cube_2way_context *sp )
x1 = _mm256_xor_si256( x1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = mm256_swap32_64( x4 );
x5 = mm256_swap32_64( x5 );
x6 = mm256_swap32_64( x6 );
x7 = mm256_swap32_64( x7 );
x4 = mm256_swap64_32( x4 );
x5 = mm256_swap64_32( x5 );
x6 = mm256_swap64_32( x6 );
x7 = mm256_swap64_32( x7 );
}
_mm256_store_si256( (__m256i*)sp->h, x0 );
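
The helpers renamed above follow a size-first convention: as I read it (an assumption based on the names and their use here), swap128_64 exchanges the two 64-bit halves of each 128-bit lane and swap64_32 exchanges the two 32-bit halves of each 64-bit element. Scalar sketches of those two permutations:

#include <stdint.h>
#include <stdio.h>

/* Exchange the two 64-bit halves of a 128-bit lane
   (scalar analogue of the renamed swap128_64 helper). */
static void swap128_64( uint64_t lane[2] )
{
    uint64_t t = lane[0];
    lane[0] = lane[1];
    lane[1] = t;
}

/* Exchange the two 32-bit halves of a 64-bit word (analogue of swap64_32). */
static uint64_t swap64_32( uint64_t x )
{
    return ( x << 32 ) | ( x >> 32 );
}

int main(void)
{
    uint64_t lane[2] = { 0x1111111111111111ULL, 0x2222222222222222ULL };
    swap128_64( lane );
    printf( "%016llx %016llx\n", (unsigned long long)lane[0],
                                 (unsigned long long)lane[1] );
    printf( "%016llx\n", (unsigned long long)swap64_32( 0x00000000ffffffffULL ) );
    return 0;
}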

View File

@@ -21,7 +21,27 @@ static void transform( cubehashParam *sp )
int r;
const int rounds = sp->rounds;
#ifdef __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
register __m512i x0, x1;
x0 = _mm512_load_si512( (__m512i*)sp->x );
x1 = _mm512_load_si512( (__m512i*)sp->x + 1 );
for ( r = 0; r < rounds; ++r )
{
x1 = _mm512_add_epi32( x0, x1 );
x0 = _mm512_xor_si512( mm512_rol_32( mm512_swap_256( x0 ), 7 ), x1 );
x1 = _mm512_add_epi32( x0, mm512_swap128_64( x1 ) );
x0 = _mm512_xor_si512( mm512_rol_32(
mm512_swap256_128( x0 ), 11 ), x1 );
x1 = mm512_swap64_32( x1 );
}
_mm512_store_si512( (__m512i*)sp->x, x0 );
_mm512_store_si512( (__m512i*)sp->x + 1, x1 );
#elif defined(__AVX2__)
register __m256i x0, x1, x2, x3, y0, y1;
@@ -39,8 +59,8 @@ static void transform( cubehashParam *sp )
x1 = mm256_rol_32( y0, 7 );
x0 = _mm256_xor_si256( x0, x2 );
x1 = _mm256_xor_si256( x1, x3 );
x2 = mm256_swap64_128( x2 );
x3 = mm256_swap64_128( x3 );
x2 = mm256_swap128_64( x2 );
x3 = mm256_swap128_64( x3 );
x2 = _mm256_add_epi32( x0, x2 );
x3 = _mm256_add_epi32( x1, x3 );
y0 = mm256_swap_128( x0 );
@@ -49,8 +69,8 @@ static void transform( cubehashParam *sp )
x1 = mm256_rol_32( y1, 11 );
x0 = _mm256_xor_si256( x0, x2 );
x1 = _mm256_xor_si256( x1, x3 );
x2 = mm256_swap32_64( x2 );
x3 = mm256_swap32_64( x3 );
x2 = mm256_swap64_32( x2 );
x3 = mm256_swap64_32( x3 );
}
_mm256_store_si256( (__m256i*)sp->x, x0 );

View File

@@ -186,7 +186,7 @@ void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBloc
{
for(i = 0; i < 4; i++)
{
_state[i][j] = _mm_loadu_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i);
_state[i][j] = _mm_load_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i);
}
}
@@ -390,13 +390,13 @@ HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
}
// Store the hash value
_mm_storeu_si128((__m128i*)hashval + 0, state->state[0][0]);
_mm_storeu_si128((__m128i*)hashval + 1, state->state[1][0]);
_mm_store_si128((__m128i*)hashval + 0, state->state[0][0]);
_mm_store_si128((__m128i*)hashval + 1, state->state[1][0]);
if(state->uHashSize == 512)
{
_mm_storeu_si128((__m128i*)hashval + 2, state->state[2][0]);
_mm_storeu_si128((__m128i*)hashval + 3, state->state[3][0]);
_mm_store_si128((__m128i*)hashval + 2, state->state[2][0]);
_mm_store_si128((__m128i*)hashval + 3, state->state[3][0]);
}
return SUCCESS;
@@ -513,13 +513,13 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
}
// Store the hash value
_mm_storeu_si128( (__m128i*)hashval + 0, state->state[0][0] );
_mm_storeu_si128( (__m128i*)hashval + 1, state->state[1][0] );
_mm_store_si128( (__m128i*)hashval + 0, state->state[0][0] );
_mm_store_si128( (__m128i*)hashval + 1, state->state[1][0] );
if( state->uHashSize == 512 )
{
_mm_storeu_si128( (__m128i*)hashval + 2, state->state[2][0] );
_mm_storeu_si128( (__m128i*)hashval + 3, state->state[3][0] );
_mm_store_si128( (__m128i*)hashval + 2, state->state[2][0] );
_mm_store_si128( (__m128i*)hashval + 3, state->state[3][0] );
}
return SUCCESS;
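
The edit above swaps unaligned load/store intrinsics for the aligned forms; _mm_load_si128 and _mm_store_si128 require 16-byte aligned pointers, so this is only valid when the message and output buffers are guaranteed to be aligned, which the change presumes. A minimal illustration of the two forms:

#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>

int main(void)
{
    /* 16-byte aligned buffer: both the aligned and unaligned loads are legal.
       With an unaligned pointer, _mm_load_si128 would be undefined behaviour. */
    uint8_t buf[16] __attribute__ ((aligned (16))) = { 1 };

    __m128i a = _mm_load_si128( (const __m128i*)buf );   /* needs alignment */
    __m128i u = _mm_loadu_si128( (const __m128i*)buf );  /* alignment optional */

    _mm_store_si128( (__m128i*)buf, _mm_add_epi8( a, u ) );
    printf( "%u\n", buf[0] );   /* 2 */
    return 0;
}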

View File

@@ -0,0 +1,620 @@
/*
* file : echo_vperm.c
* version : 1.0.208
* date : 14.12.2010
*
* - vperm and aes_ni implementations of hash function ECHO
* - implements NIST hash api
* - assumes that message lenght is multiple of 8-bits
* - _ECHO_VPERM_ must be defined if compiling with ../main.c
* - define NO_AES_NI for aes_ni version
*
* Cagdas Calik
* ccalik@metu.edu.tr
* Institute of Applied Mathematics, Middle East Technical University, Turkey.
*
*/
#if defined(__AES__)
#include <memory.h>
#include "miner.h"
#include "hash_api.h"
//#include "vperm.h"
#include <immintrin.h>
/*
#ifndef NO_AES_NI
#include <wmmintrin.h>
#else
#include <tmmintrin.h>
#endif
*/
MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
MYALIGN const unsigned int _k_opt[] = {0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121, 0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1};
MYALIGN const unsigned int _k_inv[] = {0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309, 0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C};
MYALIGN const unsigned int _k_sb1[] = {0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E, 0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1};
MYALIGN const unsigned int _k_sb2[] = {0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955, 0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8};
MYALIGN const unsigned int _k_sb3[] = {0xC0211A00, 0x53E17249, 0xA8B2DA89, 0xFB68933B, 0xF0030A00, 0x5FF35C55, 0xA6ACFAA5, 0xF956AF09};
MYALIGN const unsigned int _k_sb4[] = {0x3FD64100, 0xE1E937A0, 0x49087E9F, 0xA876DE97, 0xC393EA00, 0x3D50AED7, 0x876D2914, 0xBA44FE79};
MYALIGN const unsigned int _k_sb5[] = {0xF4867F00, 0x5072D62F, 0x5D228BDB, 0x0DA9A4F9, 0x3971C900, 0x0B487AC2, 0x8A43F0FB, 0x81B332B8};
MYALIGN const unsigned int _k_sb7[] = {0xFFF75B00, 0xB20845E9, 0xE1BAA416, 0x531E4DAC, 0x3390E000, 0x62A3F282, 0x21C1D3B1, 0x43125170};
MYALIGN const unsigned int _k_sbo[] = {0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A, 0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1};
MYALIGN const unsigned int _k_h63[] = {0x63636363, 0x63636363, 0x63636363, 0x63636363};
MYALIGN const unsigned int _k_hc6[] = {0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6};
MYALIGN const unsigned int _k_h5b[] = {0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b};
MYALIGN const unsigned int _k_h4e[] = {0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e};
MYALIGN const unsigned int _k_h0e[] = {0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e};
MYALIGN const unsigned int _k_h15[] = {0x15151515, 0x15151515, 0x15151515, 0x15151515};
MYALIGN const unsigned int _k_aesmix1[] = {0x0f0a0500, 0x030e0904, 0x07020d08, 0x0b06010c};
MYALIGN const unsigned int _k_aesmix2[] = {0x000f0a05, 0x04030e09, 0x0807020d, 0x0c0b0601};
MYALIGN const unsigned int _k_aesmix3[] = {0x05000f0a, 0x0904030e, 0x0d080702, 0x010c0b06};
MYALIGN const unsigned int _k_aesmix4[] = {0x0a05000f, 0x0e090403, 0x020d0807, 0x06010c0b};
MYALIGN const unsigned int const1[] = {0x00000001, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int mul2mask[] = {0x00001b00, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int lsbmask[] = {0x01010101, 0x01010101, 0x01010101, 0x01010101};
MYALIGN const unsigned int invshiftrows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
MYALIGN const unsigned int zero[] = {0x00000000, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234};
#define ECHO_SUBBYTES(state, i, j) \
state[i][j] = _mm_aesenc_si128(state[i][j], k1);\
state[i][j] = _mm_aesenc_si128(state[i][j], M128(zero));\
k1 = _mm_add_epi32(k1, M128(const1))
#define ECHO_MIXBYTES(state1, state2, j, t1, t2, s2) \
s2 = _mm_add_epi8(state1[0][j], state1[0][j]);\
t1 = _mm_srli_epi16(state1[0][j], 7);\
t1 = _mm_and_si128(t1, M128(lsbmask));\
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
s2 = _mm_xor_si128(s2, t2);\
state2[0][j] = s2;\
state2[1][j] = state1[0][j];\
state2[2][j] = state1[0][j];\
state2[3][j] = _mm_xor_si128(s2, state1[0][j]);\
s2 = _mm_add_epi8(state1[1][(j + 1) & 3], state1[1][(j + 1) & 3]);\
t1 = _mm_srli_epi16(state1[1][(j + 1) & 3], 7);\
t1 = _mm_and_si128(t1, M128(lsbmask));\
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
s2 = _mm_xor_si128(s2, t2);\
state2[0][j] = _mm_xor_si128(state2[0][j], _mm_xor_si128(s2, state1[1][(j + 1) & 3]));\
state2[1][j] = _mm_xor_si128(state2[1][j], s2);\
state2[2][j] = _mm_xor_si128(state2[2][j], state1[1][(j + 1) & 3]);\
state2[3][j] = _mm_xor_si128(state2[3][j], state1[1][(j + 1) & 3]);\
s2 = _mm_add_epi8(state1[2][(j + 2) & 3], state1[2][(j + 2) & 3]);\
t1 = _mm_srli_epi16(state1[2][(j + 2) & 3], 7);\
t1 = _mm_and_si128(t1, M128(lsbmask));\
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
s2 = _mm_xor_si128(s2, t2);\
state2[0][j] = _mm_xor_si128(state2[0][j], state1[2][(j + 2) & 3]);\
state2[1][j] = _mm_xor_si128(state2[1][j], _mm_xor_si128(s2, state1[2][(j + 2) & 3]));\
state2[2][j] = _mm_xor_si128(state2[2][j], s2);\
state2[3][j] = _mm_xor_si128(state2[3][j], state1[2][(j + 2) & 3]);\
s2 = _mm_add_epi8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\
t1 = _mm_srli_epi16(state1[3][(j + 3) & 3], 7);\
t1 = _mm_and_si128(t1, M128(lsbmask));\
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
s2 = _mm_xor_si128(s2, t2);\
state2[0][j] = _mm_xor_si128(state2[0][j], state1[3][(j + 3) & 3]);\
state2[1][j] = _mm_xor_si128(state2[1][j], state1[3][(j + 3) & 3]);\
state2[2][j] = _mm_xor_si128(state2[2][j], _mm_xor_si128(s2, state1[3][(j + 3) & 3]));\
state2[3][j] = _mm_xor_si128(state2[3][j], s2)
#define ECHO_ROUND_UNROLL2 \
ECHO_SUBBYTES(_state, 0, 0);\
ECHO_SUBBYTES(_state, 1, 0);\
ECHO_SUBBYTES(_state, 2, 0);\
ECHO_SUBBYTES(_state, 3, 0);\
ECHO_SUBBYTES(_state, 0, 1);\
ECHO_SUBBYTES(_state, 1, 1);\
ECHO_SUBBYTES(_state, 2, 1);\
ECHO_SUBBYTES(_state, 3, 1);\
ECHO_SUBBYTES(_state, 0, 2);\
ECHO_SUBBYTES(_state, 1, 2);\
ECHO_SUBBYTES(_state, 2, 2);\
ECHO_SUBBYTES(_state, 3, 2);\
ECHO_SUBBYTES(_state, 0, 3);\
ECHO_SUBBYTES(_state, 1, 3);\
ECHO_SUBBYTES(_state, 2, 3);\
ECHO_SUBBYTES(_state, 3, 3);\
ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
ECHO_SUBBYTES(_state2, 0, 0);\
ECHO_SUBBYTES(_state2, 1, 0);\
ECHO_SUBBYTES(_state2, 2, 0);\
ECHO_SUBBYTES(_state2, 3, 0);\
ECHO_SUBBYTES(_state2, 0, 1);\
ECHO_SUBBYTES(_state2, 1, 1);\
ECHO_SUBBYTES(_state2, 2, 1);\
ECHO_SUBBYTES(_state2, 3, 1);\
ECHO_SUBBYTES(_state2, 0, 2);\
ECHO_SUBBYTES(_state2, 1, 2);\
ECHO_SUBBYTES(_state2, 2, 2);\
ECHO_SUBBYTES(_state2, 3, 2);\
ECHO_SUBBYTES(_state2, 0, 3);\
ECHO_SUBBYTES(_state2, 1, 3);\
ECHO_SUBBYTES(_state2, 2, 3);\
ECHO_SUBBYTES(_state2, 3, 3);\
ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
#define SAVESTATE(dst, src)\
dst[0][0] = src[0][0];\
dst[0][1] = src[0][1];\
dst[0][2] = src[0][2];\
dst[0][3] = src[0][3];\
dst[1][0] = src[1][0];\
dst[1][1] = src[1][1];\
dst[1][2] = src[1][2];\
dst[1][3] = src[1][3];\
dst[2][0] = src[2][0];\
dst[2][1] = src[2][1];\
dst[2][2] = src[2][2];\
dst[2][3] = src[2][3];\
dst[3][0] = src[3][0];\
dst[3][1] = src[3][1];\
dst[3][2] = src[3][2];\
dst[3][3] = src[3][3]
void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
{
unsigned int r, b, i, j;
__m128i t1, t2, s2, k1;
__m128i _state[4][4], _state2[4][4], _statebackup[4][4];
for(i = 0; i < 4; i++)
for(j = 0; j < ctx->uHashSize / 256; j++)
_state[i][j] = ctx->state[i][j];
for(b = 0; b < uBlockCount; b++)
{
ctx->k = _mm_add_epi64(ctx->k, ctx->const1536);
// load message
for(j = ctx->uHashSize / 256; j < 4; j++)
{
for(i = 0; i < 4; i++)
{
_state[i][j] = _mm_loadu_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i);
}
}
uint64_t *b = (uint64_t*)_state;
//printf("Ss3: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
// save state
SAVESTATE(_statebackup, _state);
k1 = ctx->k;
for(r = 0; r < ctx->uRounds / 2; r++)
{
ECHO_ROUND_UNROLL2;
}
//printf("Ss4: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
if(ctx->uHashSize == 256)
{
for(i = 0; i < 4; i++)
{
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]);
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]);
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]);
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][3]);
}
}
else
{
for(i = 0; i < 4; i++)
{
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
_state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]);
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]);
_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]);
}
}
pmsg += ctx->uBlockLength;
}
SAVESTATE(ctx->state, _state);
}
HashReturn init_echo(hashState_echo *ctx, int nHashSize)
{
int i, j;
ctx->k = _mm_setzero_si128();
ctx->processed_bits = 0;
ctx->uBufferBytes = 0;
switch(nHashSize)
{
case 256:
ctx->uHashSize = 256;
ctx->uBlockLength = 192;
ctx->uRounds = 8;
ctx->hashsize = _mm_set_epi32(0, 0, 0, 0x00000100);
ctx->const1536 = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000600);
break;
case 512:
ctx->uHashSize = 512;
ctx->uBlockLength = 128;
ctx->uRounds = 10;
ctx->hashsize = _mm_set_epi32(0, 0, 0, 0x00000200);
ctx->const1536 = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000400);
break;
default:
return BAD_HASHBITLEN;
}
for(i = 0; i < 4; i++)
for(j = 0; j < nHashSize / 256; j++)
ctx->state[i][j] = ctx->hashsize;
for(i = 0; i < 4; i++)
for(j = nHashSize / 256; j < 4; j++)
ctx->state[i][j] = _mm_set_epi32(0, 0, 0, 0);
return SUCCESS;
}
HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen)
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
uByteLength = (unsigned int)(databitlen / 8);
if((state->uBufferBytes + uByteLength) >= state->uBlockLength)
{
if(state->uBufferBytes != 0)
{
// Fill the buffer
memcpy(state->buffer + state->uBufferBytes, (void*)data, state->uBlockLength - state->uBufferBytes);
// Process buffer
Compress(state, state->buffer, 1);
state->processed_bits += state->uBlockLength * 8;
data += state->uBlockLength - state->uBufferBytes;
uByteLength -= state->uBlockLength - state->uBufferBytes;
}
// buffer now does not contain any unprocessed bytes
uBlockCount = uByteLength / state->uBlockLength;
uRemainingBytes = uByteLength % state->uBlockLength;
if(uBlockCount > 0)
{
Compress(state, data, uBlockCount);
state->processed_bits += uBlockCount * state->uBlockLength * 8;
data += uBlockCount * state->uBlockLength;
}
if(uRemainingBytes > 0)
{
memcpy(state->buffer, (void*)data, uRemainingBytes);
}
state->uBufferBytes = uRemainingBytes;
}
else
{
memcpy(state->buffer + state->uBufferBytes, (void*)data, uByteLength);
state->uBufferBytes += uByteLength;
}
return SUCCESS;
}
HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
{
__m128i remainingbits;
// Add remaining bytes in the buffer
state->processed_bits += state->uBufferBytes * 8;
remainingbits = _mm_set_epi32(0, 0, 0, state->uBufferBytes * 8);
// Pad with 0x80
state->buffer[state->uBufferBytes++] = 0x80;
// Enough buffer space for padding in this block?
if((state->uBlockLength - state->uBufferBytes) >= 18)
{
// Pad with zeros
memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18));
// Hash size
*((unsigned short*)(state->buffer + state->uBlockLength - 18)) = state->uHashSize;
// Processed bits
*((DataLength*)(state->buffer + state->uBlockLength - 16)) = state->processed_bits;
*((DataLength*)(state->buffer + state->uBlockLength - 8)) = 0;
// Last block contains message bits?
if(state->uBufferBytes == 1)
{
state->k = _mm_xor_si128(state->k, state->k);
state->k = _mm_sub_epi64(state->k, state->const1536);
}
else
{
state->k = _mm_add_epi64(state->k, remainingbits);
state->k = _mm_sub_epi64(state->k, state->const1536);
}
// Compress
Compress(state, state->buffer, 1);
}
else
{
// Fill with zero and compress
memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - state->uBufferBytes);
state->k = _mm_add_epi64(state->k, remainingbits);
state->k = _mm_sub_epi64(state->k, state->const1536);
Compress(state, state->buffer, 1);
// Last block
memset(state->buffer, 0, state->uBlockLength - 18);
// Hash size
*((unsigned short*)(state->buffer + state->uBlockLength - 18)) = state->uHashSize;
// Processed bits
*((DataLength*)(state->buffer + state->uBlockLength - 16)) = state->processed_bits;
*((DataLength*)(state->buffer + state->uBlockLength - 8)) = 0;
// Compress the last block
state->k = _mm_xor_si128(state->k, state->k);
state->k = _mm_sub_epi64(state->k, state->const1536);
Compress(state, state->buffer, 1);
}
// Store the hash value
_mm_storeu_si128((__m128i*)hashval + 0, state->state[0][0]);
_mm_storeu_si128((__m128i*)hashval + 1, state->state[1][0]);
if(state->uHashSize == 512)
{
_mm_storeu_si128((__m128i*)hashval + 2, state->state[2][0]);
_mm_storeu_si128((__m128i*)hashval + 3, state->state[3][0]);
}
return SUCCESS;
}
HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;
uByteLength = (unsigned int)(databitlen / 8);
/*
if( (state->uBufferBytes + uByteLength) >= state->uBlockLength )
{
printf("full block\n");
if( state->uBufferBytes != 0 )
{
// Fill the buffer
memcpy( state->buffer + state->uBufferBytes,
(void*)data, state->uBlockLength - state->uBufferBytes );
// Process buffer
Compress( state, state->buffer, 1 );
state->processed_bits += state->uBlockLength * 8;
data += state->uBlockLength - state->uBufferBytes;
uByteLength -= state->uBlockLength - state->uBufferBytes;
}
// buffer now does not contain any unprocessed bytes
uBlockCount = uByteLength / state->uBlockLength;
uRemainingBytes = uByteLength % state->uBlockLength;
if( uBlockCount > 0 )
{
Compress( state, data, uBlockCount );
state->processed_bits += uBlockCount * state->uBlockLength * 8;
data += uBlockCount * state->uBlockLength;
}
if( uRemainingBytes > 0 )
memcpy(state->buffer, (void*)data, uRemainingBytes);
state->uBufferBytes = uRemainingBytes;
}
else
{
*/
memcpy( state->buffer + state->uBufferBytes, (void*)data, uByteLength );
state->uBufferBytes += uByteLength;
// }
__m128i remainingbits;
// Add remaining bytes in the buffer
state->processed_bits += state->uBufferBytes * 8;
remainingbits = _mm_set_epi32( 0, 0, 0, state->uBufferBytes * 8 );
// Pad with 0x80
state->buffer[state->uBufferBytes++] = 0x80;
// Enough buffer space for padding in this block?
// if( (state->uBlockLength - state->uBufferBytes) >= 18 )
// {
// Pad with zeros
memset( state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18) );
// Hash size
*( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) = state->uHashSize;
// Processed bits
*( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
state->processed_bits;
*( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
// Last block contains message bits?
if( state->uBufferBytes == 1 )
{
state->k = _mm_xor_si128( state->k, state->k );
state->k = _mm_sub_epi64( state->k, state->const1536 );
}
else
{
state->k = _mm_add_epi64( state->k, remainingbits );
state->k = _mm_sub_epi64( state->k, state->const1536 );
}
uint64_t *b = (uint64_t*)&state->k;
/*
printf("Sk: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
b = (uint64_t*)state->buffer;
printf("Sb: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
printf("Sb: %016lx %016lx %016lx %016lx\n",b[4],b[5],b[6],b[7]);
printf("Sb: %016lx %016lx %016lx %016lx\n",b[8],b[9],b[10],b[11]);
printf("Sb: %016lx %016lx %016lx %016lx\n",b[12],b[13],b[14],b[15]);
b = (uint64_t*)state->state;
printf("Ss1: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
printf("Ss1: %016lx %016lx %016lx %016lx\n",b[4],b[5],b[6],b[7]);
printf("Ss1: %016lx %016lx %016lx %016lx\n",b[8],b[9],b[10],b[11]);
printf("Ss1: %016lx %016lx %016lx %016lx\n",b[12],b[13],b[14],b[15]);
*/
// Compress
Compress( state, state->buffer, 1 );
//printf("Ss2: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
/*
}
else
{
// Fill with zero and compress
memset( state->buffer + state->uBufferBytes, 0,
state->uBlockLength - state->uBufferBytes );
state->k = _mm_add_epi64( state->k, remainingbits );
state->k = _mm_sub_epi64( state->k, state->const1536 );
Compress( state, state->buffer, 1 );
// Last block
memset( state->buffer, 0, state->uBlockLength - 18 );
// Hash size
*( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) =
state->uHashSize;
// Processed bits
*( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
state->processed_bits;
*( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
// Compress the last block
state->k = _mm_xor_si128( state->k, state->k );
state->k = _mm_sub_epi64( state->k, state->const1536 );
Compress( state, state->buffer, 1) ;
}
*/
// Store the hash value
_mm_storeu_si128( (__m128i*)hashval + 0, state->state[0][0] );
_mm_storeu_si128( (__m128i*)hashval + 1, state->state[1][0] );
if( state->uHashSize == 512 )
{
_mm_storeu_si128( (__m128i*)hashval + 2, state->state[2][0] );
_mm_storeu_si128( (__m128i*)hashval + 3, state->state[3][0] );
}
return SUCCESS;
}
HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
{
HashReturn hRet;
hashState_echo hs;
/////
/*
__m128i a, b, c, d, t[4], u[4], v[4];
a = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100);
b = _mm_set_epi32(0x1f1e1d1c, 0x1b1a1918, 0x17161514, 0x13121110);
c = _mm_set_epi32(0x2f2e2d2c, 0x2b2a2928, 0x27262524, 0x23222120);
d = _mm_set_epi32(0x3f3e3d3c, 0x3b3a3938, 0x37363534, 0x33323130);
t[0] = _mm_unpacklo_epi8(a, b);
t[1] = _mm_unpackhi_epi8(a, b);
t[2] = _mm_unpacklo_epi8(c, d);
t[3] = _mm_unpackhi_epi8(c, d);
u[0] = _mm_unpacklo_epi16(t[0], t[2]);
u[1] = _mm_unpackhi_epi16(t[0], t[2]);
u[2] = _mm_unpacklo_epi16(t[1], t[3]);
u[3] = _mm_unpackhi_epi16(t[1], t[3]);
t[0] = _mm_unpacklo_epi16(u[0], u[1]);
t[1] = _mm_unpackhi_epi16(u[0], u[1]);
t[2] = _mm_unpacklo_epi16(u[2], u[3]);
t[3] = _mm_unpackhi_epi16(u[2], u[3]);
u[0] = _mm_unpacklo_epi8(t[0], t[1]);
u[1] = _mm_unpackhi_epi8(t[0], t[1]);
u[2] = _mm_unpacklo_epi8(t[2], t[3]);
u[3] = _mm_unpackhi_epi8(t[2], t[3]);
a = _mm_unpacklo_epi8(u[0], u[1]);
b = _mm_unpackhi_epi8(u[0], u[1]);
c = _mm_unpacklo_epi8(u[2], u[3]);
d = _mm_unpackhi_epi8(u[2], u[3]);
*/
/////
hRet = init_echo(&hs, hashbitlen);
if(hRet != SUCCESS)
return hRet;
hRet = update_echo(&hs, data, databitlen);
if(hRet != SUCCESS)
return hRet;
hRet = final_echo(&hs, hashval);
if(hRet != SUCCESS)
return hRet;
return SUCCESS;
}
#endif
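
Inside ECHO_MIXBYTES above, the add/srli/and/shuffle sequence doubles every state byte in GF(2^8): the add duplicates each byte (a left shift by one) and the masked shuffle conditionally XORs in the AES reduction constant 0x1b where the top bit was set. That is the standard AES xtime operation, shown here as a scalar sketch:

#include <stdint.h>
#include <stdio.h>

/* Multiply a byte by 2 in GF(2^8) with the AES reduction polynomial 0x11b,
   the scalar equivalent of the add/srli/and/shuffle sequence in ECHO_MIXBYTES. */
static uint8_t xtime( uint8_t x )
{
    uint8_t doubled = (uint8_t)( x << 1 );
    return ( x & 0x80 ) ? (uint8_t)( doubled ^ 0x1b ) : doubled;
}

int main(void)
{
    printf( "xtime(0x57) = 0x%02x\n", xtime( 0x57 ) );   /* 0xae */
    printf( "xtime(0xae) = 0x%02x\n", xtime( 0xae ) );   /* 0x47 */
    return 0;
}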

algo/echo/echo-hash-4way.c (new file, 317 lines)
View File

@@ -0,0 +1,317 @@
//#if 0
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#include "simd-utils.h"
#include "echo-hash-4way.h"
/*
static const unsigned int mul2ipt[] __attribute__ ((aligned (64))) =
{
0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57,
0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234
};
*/
// do these need to be reversed?
#define mul2mask \
_mm512_set4_epi32( 0, 0, 0, 0x00001b00 )
// _mm512_set4_epi32( 0x00001b00, 0, 0, 0 )
#define lsbmask m512_const1_32( 0x01010101 )
#define ECHO_SUBBYTES( state, i, j ) \
state[i][j] = _mm512_aesenc_epi128( state[i][j], k1 ); \
state[i][j] = _mm512_aesenc_epi128( state[i][j], m512_zero ); \
k1 = _mm512_add_epi32( k1, m512_one_128 );
#define ECHO_MIXBYTES( state1, state2, j, t1, t2, s2 ) do \
{ \
const int j1 = ( (j)+1 ) & 3; \
const int j2 = ( (j)+2 ) & 3; \
const int j3 = ( (j)+3 ) & 3; \
s2 = _mm512_add_epi8( state1[ 0 ] [j ], state1[ 0 ][ j ] ); \
t1 = _mm512_srli_epi16( state1[ 0 ][ j ], 7 ); \
t1 = _mm512_and_si512( t1, lsbmask );\
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
s2 = _mm512_xor_si512( s2, t2 ); \
state2[ 0 ] [j ] = s2; \
state2[ 1 ] [j ] = state1[ 0 ][ j ]; \
state2[ 2 ] [j ] = state1[ 0 ][ j ]; \
state2[ 3 ] [j ] = _mm512_xor_si512( s2, state1[ 0 ][ j ] );\
s2 = _mm512_add_epi8( state1[ 1 ][ j1 ], state1[ 1 ][ j1 ] ); \
t1 = _mm512_srli_epi16( state1[ 1 ][ j1 ], 7 ); \
t1 = _mm512_and_si512( t1, lsbmask ); \
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
s2 = _mm512_xor_si512( s2, t2 );\
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], \
_mm512_xor_si512( s2, state1[ 1 ][ j1 ] ) ); \
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], s2 ); \
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \
s2 = _mm512_add_epi8( state1[ 2 ][ j2 ], state1[ 2 ][ j2 ] ); \
t1 = _mm512_srli_epi16( state1[ 2 ][ j2 ], 7 ); \
t1 = _mm512_and_si512( t1, lsbmask ); \
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
s2 = _mm512_xor_si512( s2, t2 ); \
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], \
_mm512_xor_si512( s2, state1[ 2 ][ j2 ] ) ); \
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], s2 ); \
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \
s2 = _mm512_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \
t1 = _mm512_srli_epi16( state1[ 3 ][ j3 ], 7 ); \
t1 = _mm512_and_si512( t1, lsbmask ); \
t2 = _mm512_shuffle_epi8( mul2mask, t1 ); \
s2 = _mm512_xor_si512( s2, t2 ); \
state2[ 0 ][ j ] = _mm512_xor_si512( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \
state2[ 1 ][ j ] = _mm512_xor_si512( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \
state2[ 2 ][ j ] = _mm512_xor_si512( state2[ 2 ][ j ], \
_mm512_xor_si512( s2, state1[ 3 ][ j3] ) ); \
state2[ 3 ][ j ] = _mm512_xor_si512( state2[ 3 ][ j ], s2 ); \
} while(0)
#define ECHO_ROUND_UNROLL2 \
ECHO_SUBBYTES(_state, 0, 0);\
ECHO_SUBBYTES(_state, 1, 0);\
ECHO_SUBBYTES(_state, 2, 0);\
ECHO_SUBBYTES(_state, 3, 0);\
ECHO_SUBBYTES(_state, 0, 1);\
ECHO_SUBBYTES(_state, 1, 1);\
ECHO_SUBBYTES(_state, 2, 1);\
ECHO_SUBBYTES(_state, 3, 1);\
ECHO_SUBBYTES(_state, 0, 2);\
ECHO_SUBBYTES(_state, 1, 2);\
ECHO_SUBBYTES(_state, 2, 2);\
ECHO_SUBBYTES(_state, 3, 2);\
ECHO_SUBBYTES(_state, 0, 3);\
ECHO_SUBBYTES(_state, 1, 3);\
ECHO_SUBBYTES(_state, 2, 3);\
ECHO_SUBBYTES(_state, 3, 3);\
ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
ECHO_SUBBYTES(_state2, 0, 0);\
ECHO_SUBBYTES(_state2, 1, 0);\
ECHO_SUBBYTES(_state2, 2, 0);\
ECHO_SUBBYTES(_state2, 3, 0);\
ECHO_SUBBYTES(_state2, 0, 1);\
ECHO_SUBBYTES(_state2, 1, 1);\
ECHO_SUBBYTES(_state2, 2, 1);\
ECHO_SUBBYTES(_state2, 3, 1);\
ECHO_SUBBYTES(_state2, 0, 2);\
ECHO_SUBBYTES(_state2, 1, 2);\
ECHO_SUBBYTES(_state2, 2, 2);\
ECHO_SUBBYTES(_state2, 3, 2);\
ECHO_SUBBYTES(_state2, 0, 3);\
ECHO_SUBBYTES(_state2, 1, 3);\
ECHO_SUBBYTES(_state2, 2, 3);\
ECHO_SUBBYTES(_state2, 3, 3);\
ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
#define SAVESTATE(dst, src)\
dst[0][0] = src[0][0];\
dst[0][1] = src[0][1];\
dst[0][2] = src[0][2];\
dst[0][3] = src[0][3];\
dst[1][0] = src[1][0];\
dst[1][1] = src[1][1];\
dst[1][2] = src[1][2];\
dst[1][3] = src[1][3];\
dst[2][0] = src[2][0];\
dst[2][1] = src[2][1];\
dst[2][2] = src[2][2];\
dst[2][3] = src[2][3];\
dst[3][0] = src[3][0];\
dst[3][1] = src[3][1];\
dst[3][2] = src[3][2];\
dst[3][3] = src[3][3]
// blockcount always 1
void echo_4way_compress( echo_4way_context *ctx, const __m512i *pmsg,
unsigned int uBlockCount )
{
unsigned int r, b, i, j;
__m512i t1, t2, s2, k1;
__m512i _state[4][4], _state2[4][4], _statebackup[4][4];
_state[ 0 ][ 0 ] = ctx->state[ 0 ][ 0 ];
_state[ 0 ][ 1 ] = ctx->state[ 0 ][ 1 ];
_state[ 0 ][ 2 ] = ctx->state[ 0 ][ 2 ];
_state[ 0 ][ 3 ] = ctx->state[ 0 ][ 3 ];
_state[ 1 ][ 0 ] = ctx->state[ 1 ][ 0 ];
_state[ 1 ][ 1 ] = ctx->state[ 1 ][ 1 ];
_state[ 1 ][ 2 ] = ctx->state[ 1 ][ 2 ];
_state[ 1 ][ 3 ] = ctx->state[ 1 ][ 3 ];
_state[ 2 ][ 0 ] = ctx->state[ 2 ][ 0 ];
_state[ 2 ][ 1 ] = ctx->state[ 2 ][ 1 ];
_state[ 2 ][ 2 ] = ctx->state[ 2 ][ 2 ];
_state[ 2 ][ 3 ] = ctx->state[ 2 ][ 3 ];
_state[ 3 ][ 0 ] = ctx->state[ 3 ][ 0 ];
_state[ 3 ][ 1 ] = ctx->state[ 3 ][ 1 ];
_state[ 3 ][ 2 ] = ctx->state[ 3 ][ 2 ];
_state[ 3 ][ 3 ] = ctx->state[ 3 ][ 3 ];
for ( b = 0; b < uBlockCount; b++ )
{
ctx->k = _mm512_add_epi64( ctx->k, ctx->const1536 );
for( j = ctx->uHashSize / 256; j < 4; j++ )
{
for ( i = 0; i < 4; i++ )
{
_state[ i ][ j ] = _mm512_load_si512(
pmsg + 4 * (j - (ctx->uHashSize / 256)) + i );
}
}
// save state
SAVESTATE( _statebackup, _state );
k1 = ctx->k;
for ( r = 0; r < ctx->uRounds / 2; r++ )
{
ECHO_ROUND_UNROLL2;
}
if ( ctx->uHashSize == 256 )
{
for ( i = 0; i < 4; i++ )
{
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_state[ i ][ 1 ] );
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_state[ i ][ 2 ] );
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_state[ i ][ 3 ] );
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_statebackup[ i ][ 0 ] );
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_statebackup[ i ][ 1 ] );
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_statebackup[ i ][ 2 ] ) ;
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_statebackup[ i ][ 3 ] );
}
}
else
{
for ( i = 0; i < 4; i++ )
{
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_state[ i ][ 2 ] );
_state[ i ][ 1 ] = _mm512_xor_si512( _state[ i ][ 1 ],
_state[ i ][ 3 ] );
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ][ 0 ],
_statebackup[ i ][ 0 ] );
_state[ i ][ 0 ] = _mm512_xor_si512( _state[ i ] [0 ],
_statebackup[ i ][ 2 ] );
_state[ i ][ 1 ] = _mm512_xor_si512( _state[ i ][ 1 ],
_statebackup[ i ][ 1 ] );
_state[ i ][ 1 ] = _mm512_xor_si512( _state[ i ][ 1 ],
_statebackup[ i ][ 3 ] );
}
}
pmsg += ctx->uBlockLength;
}
SAVESTATE(ctx->state, _state);
}
int echo_4way_init( echo_4way_context *ctx, int nHashSize )
{
int i, j;
ctx->k = m512_zero;
ctx->processed_bits = 0;
ctx->uBufferBytes = 0;
switch( nHashSize )
{
case 256:
ctx->uHashSize = 256;
ctx->uBlockLength = 192;
ctx->uRounds = 8;
ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x100 );
ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x600 );
break;
case 512:
ctx->uHashSize = 512;
ctx->uBlockLength = 128;
ctx->uRounds = 10;
ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x200 );
ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x400);
break;
default:
return 1;
}
for( i = 0; i < 4; i++ )
for( j = 0; j < nHashSize / 256; j++ )
ctx->state[ i ][ j ] = ctx->hashsize;
for( i = 0; i < 4; i++ )
for( j = nHashSize / 256; j < 4; j++ )
ctx->state[ i ][ j ] = m512_zero;
return 0;
}
int echo_4way_update_close( echo_4way_context *state, void *hashval,
const void *data, int databitlen )
{
// bytelen is either 32 (maybe), 64 or 80 or 128!
// all are less than full block.
int vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
const int vblen = state->uBlockLength / 16; // 16 bytes per lane
__m512i remainingbits;
if ( databitlen == 1024 )
{
echo_4way_compress( state, data, 1 );
state->processed_bits = 1024;
remainingbits = m512_zero;
vlen = 0;
}
else
{
vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
memcpy_512( state->buffer, data, vlen );
state->processed_bits += (unsigned int)( databitlen );
remainingbits = _mm512_set4_epi32( 0, 0, 0, databitlen );
}
state->buffer[ vlen ] = _mm512_set4_epi32( 0, 0, 0, 0x80 );
memset_zero_512( state->buffer + vlen + 1, vblen - vlen - 2 );
state->buffer[ vblen-2 ] =
_mm512_set4_epi32( (uint32_t)state->uHashSize << 16, 0, 0, 0 );
state->buffer[ vblen-1 ] =
_mm512_set4_epi64( 0, state->processed_bits,
0, state->processed_bits );
state->k = _mm512_add_epi64( state->k, remainingbits );
state->k = _mm512_sub_epi64( state->k, state->const1536 );
echo_4way_compress( state, state->buffer, 1 );
_mm512_store_si512( (__m512i*)hashval + 0, state->state[ 0 ][ 0] );
_mm512_store_si512( (__m512i*)hashval + 1, state->state[ 1 ][ 0] );
if ( state->uHashSize == 512 )
{
_mm512_store_si512( (__m512i*)hashval + 2, state->state[ 2 ][ 0 ] );
_mm512_store_si512( (__m512i*)hashval + 3, state->state[ 3 ][ 0 ] );
}
return 0;
}
#endif

View File

@@ -0,0 +1,36 @@
#if !defined(ECHO_HASH_4WAY_H__)
#define ECHO_HASH_4WAY_H__ 1
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#include "simd-utils.h"
typedef struct
{
__m512i state[4][4];
__m512i buffer[ 4 * 192 / 16 ]; // 4x128 interleaved 192 bytes
__m512i k;
__m512i hashsize;
__m512i const1536;
unsigned int uRounds;
unsigned int uHashSize;
unsigned int uBlockLength;
unsigned int uBufferBytes;
unsigned int processed_bits;
} echo_4way_context __attribute__ ((aligned (64)));
int echo_4way_init( echo_4way_context *state, int hashbitlen );
int echo_4way_update( echo_4way_context *state, const void *data,
unsigned int databitlen);
int echo_close( echo_4way_context *state, void *hashval );
int echo_4way_update_close( echo_4way_context *state, void *hashval,
const void *data, int databitlen );
#endif
#endif
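
The context buffer above is described as 4x128 interleaved. As I understand that convention (an assumption; the real interleave helpers live in simd-utils), the i-th 128-bit word of each of the four lanes is stored consecutively, so one 512-bit vector holds word i of all four messages. A plain-C sketch of that layout:

#include <stdint.h>
#include <stdio.h>

/* Interleave four independent messages, 128 bits (two uint64_t) at a time,
   so that vector word i holds the i-th 128-bit chunk of lanes 0..3.
   This mirrors the assumed "4x128" layout of the 4-way context buffer. */
static void intrlv_4x128_sketch( uint64_t *dst, const uint64_t *src[4],
                                 size_t words128 )
{
    for ( size_t i = 0; i < words128; i++ )
        for ( int lane = 0; lane < 4; lane++ )
        {
            dst[ ( i*4 + lane )*2 + 0 ] = src[lane][ i*2 + 0 ];
            dst[ ( i*4 + lane )*2 + 1 ] = src[lane][ i*2 + 1 ];
        }
}

int main(void)
{
    uint64_t lane0[2] = { 0xA0, 0xA1 }, lane1[2] = { 0xB0, 0xB1 };
    uint64_t lane2[2] = { 0xC0, 0xC1 }, lane3[2] = { 0xD0, 0xD1 };
    const uint64_t *src[4] = { lane0, lane1, lane2, lane3 };
    uint64_t dst[8];
    intrlv_4x128_sketch( dst, src, 1 );
    for ( int i = 0; i < 8; i++ )
        printf( "%llx ", (unsigned long long)dst[i] );
    printf( "\n" );
    return 0;
}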

View File

@@ -73,7 +73,7 @@ __m128i ALL_FF;
b5 = a7;\
a6 = _mm_xor_si128(a6, a7);\
a7 = _mm_xor_si128(a7, b6);\
\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
b0 = _mm_xor_si128(b0, a4);\
b6 = _mm_xor_si128(b6, a4);\
@@ -195,7 +195,7 @@ __m128i ALL_FF;
for(round_counter = 0; round_counter < 14; round_counter+=2) {\
/* AddRoundConstant P1024 */\
xmm8 = _mm_xor_si128(xmm8, (ROUND_CONST_P[round_counter]));\
/* ShiftBytes P1024 + pre-AESENCLAST */\
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[0]));\
xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[1]));\
xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[2]));\
@@ -209,7 +209,6 @@ __m128i ALL_FF;
\
/* AddRoundConstant P1024 */\
xmm0 = _mm_xor_si128(xmm0, (ROUND_CONST_P[round_counter+1]));\
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[0]));\
xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[1]));\
xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[2]));\
@@ -218,7 +217,6 @@ __m128i ALL_FF;
xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[5]));\
xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[6]));\
xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[7]));\
/* SubBytes + MixBytes */\
SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}\
}

View File

@@ -9,6 +9,7 @@
//#ifndef NO_AES_NI
// Not to be confused with AVX512VAES
#define VAES
// #define VAVX
// #define VVPERM

View File

@@ -230,6 +230,7 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
// digest final padding block and do output transform
TF1024( ctx->chaining, ctx->buffer );
OF1024( ctx->chaining );
// store hash result in output

View File

@@ -0,0 +1,64 @@
#include "groestl-gate.h"
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#if defined(GROESTL_4WAY_VAES)
#include "groestl512-hash-4way.h"
void groestl_4way_hash( void *output, const void *input )
{
uint32_t hash[16*4] __attribute__ ((aligned (128)));
groestl512_4way_context ctx;
groestl512_4way_init( &ctx, 64 );
groestl512_4way_update_close( &ctx, hash, input, 640 );
groestl512_4way_init( &ctx, 64 );
groestl512_4way_update_close( &ctx, hash, hash, 512 );
dintrlv_4x128( output, output+32, output+64, output+96, hash, 256 );
}
int scanhash_groestl_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*4] __attribute__ ((aligned (128)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t *noncep = vdata + 64+3; // 4*16 + 3
int thr_id = mythr->id;
const uint32_t Htarg = ptarget[7];
mm512_bswap32_intrlv80_4x128( vdata, pdata );
do
{
be32enc( noncep, n );
be32enc( noncep+ 4, n+1 );
be32enc( noncep+ 8, n+2 );
be32enc( noncep+12, n+3 );
groestl_4way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 4; lane++ )
if ( ( hash+(lane<<3) )[7] < Htarg )
if ( fulltest( hash+(lane<<3), ptarget) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, hash+(lane<<3), mythr, lane );
}
n += 4;
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
return 0;
}
#endif
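
The loop above writes four consecutive nonces into the interleaved header with be32enc, cpuminer's big-endian 32-bit store. For readers unfamiliar with it, the operation is essentially the following (sketch):

#include <stdint.h>
#include <stdio.h>

/* Store a 32-bit value big-endian, the operation be32enc performs when the
   scanhash loop writes nonces n, n+1, n+2, n+3 into the four lanes. */
static void be32enc_sketch( void *pp, uint32_t x )
{
    uint8_t *p = (uint8_t*)pp;
    p[0] = (uint8_t)( x >> 24 );
    p[1] = (uint8_t)( x >> 16 );
    p[2] = (uint8_t)( x >>  8 );
    p[3] = (uint8_t)  x;
}

int main(void)
{
    uint8_t out[4];
    be32enc_sketch( out, 0x01020304u );
    printf( "%02x %02x %02x %02x\n", out[0], out[1], out[2], out[3] );
    return 0;
}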

View File

@@ -0,0 +1,23 @@
#include "groestl-gate.h"
bool register_dmd_gr_algo( algo_gate_t *gate )
{
#if defined (GROESTL_4WAY_VAES)
gate->scanhash = (void*)&scanhash_groestl_4way;
gate->hash = (void*)&groestl_4way_hash;
#else
init_groestl_ctx();
gate->scanhash = (void*)&scanhash_groestl;
gate->hash = (void*)&groestlhash;
#endif
gate->optimizations = AES_OPT | VAES_OPT;
return true;
};
bool register_groestl_algo( algo_gate_t* gate )
{
register_dmd_gr_algo( gate );
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
return true;
};
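
register_dmd_gr_algo simply points the gate's function pointers at the VAES 4-way or scalar implementation chosen at build time. A stripped-down sketch of that dispatch idea (hypothetical struct and names, not the real algo_gate_t):

#include <stdio.h>
#include <stdbool.h>

/* Hypothetical, minimal stand-in for the function-pointer "gate" pattern. */
typedef struct
{
    void (*hash)( void *out, const void *in );
} mini_gate_t;

static void hash_4way_stub( void *out, const void *in )
{ (void)out; (void)in; puts( "4-way path" ); }

static void hash_scalar_stub( void *out, const void *in )
{ (void)out; (void)in; puts( "scalar path" ); }

static bool register_stub( mini_gate_t *gate, bool have_vaes )
{
    gate->hash = have_vaes ? hash_4way_stub : hash_scalar_stub;
    return true;
}

int main(void)
{
    mini_gate_t gate;
    register_stub( &gate, true );
    gate.hash( NULL, NULL );
    return 0;
}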

View File

@@ -0,0 +1,31 @@
#ifndef GROESTL_GATE_H__
#define GROESTL_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define GROESTL_4WAY_VAES 1
#endif
bool register_dmd_gr_algo( algo_gate_t* gate );
bool register_groestl_algo( algo_gate_t* gate );
#if defined(GROESTL_4WAY_VAES)
void groestl_4way_hash( void *state, const void *input );
int scanhash_groestl_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void groestlhash( void *state, const void *input );
int scanhash_groestl( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_groestl_ctx();
#endif
#endif

View File

@@ -1,5 +1,4 @@
#include "algo-gate-api.h"
#include "groestl-gate.h"
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
@@ -78,15 +77,12 @@ int scanhash_groestl( struct work *work, uint32_t max_nonce,
groestlhash(hash, endiandata);
if (hash[7] <= Htarg )
if ( fulltest(hash, ptarget))
{
if ( fulltest(hash, ptarget) && !opt_benchmark )
{
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
}
submit_solution( work, hash, mythr );
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
@@ -94,20 +90,3 @@ int scanhash_groestl( struct work *work, uint32_t max_nonce,
return 0;
}
bool register_dmd_gr_algo( algo_gate_t* gate )
{
init_groestl_ctx();
gate->optimizations = SSE2_OPT | AES_OPT;
gate->scanhash = (void*)&scanhash_groestl;
gate->hash = (void*)&groestlhash;
opt_target_factor = 256.0;
return true;
};
bool register_groestl_algo( algo_gate_t* gate )
{
register_dmd_gr_algo( gate );
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
return true;
};

View File

@@ -0,0 +1,280 @@
/* hash.c Aug 2011
*
* Groestl implementation for different versions.
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
*
* This code is placed in the public domain
*/
#include <memory.h>
#include "hash-groestl256.h"
#include "miner.h"
#include "simd-utils.h"
#ifndef NO_AES_NI
#include "groestl-version.h"
#ifdef TASM
#ifdef VAES
#include "groestl256-asm-aes.h"
#else
#ifdef VAVX
#include "groestl256-asm-avx.h"
#else
#ifdef VVPERM
#include "groestl256-asm-vperm.h"
#else
#error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
#endif
#endif
#endif
#else
#ifdef TINTR
#ifdef VAES
#include "groestl256-intr-aes.h"
#else
#ifdef VAVX
#include "groestl256-intr-avx.h"
#else
#ifdef VVPERM
#include "groestl256-intr-vperm.h"
#else
#error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
#endif
#endif
#endif
#else
#error NO TYPE SPECIFIED (-DT[ASM/INTR])
#endif
#endif
/* initialise context */
HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
{
int i;
ctx->hashlen = hashlen;
SET_CONSTANTS();
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
for ( i = 0; i < SIZE256; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
ctx->buffer[i] = _mm_setzero_si128();
}
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
INIT256( ctx->chaining );
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
return SUCCESS_GR;
}
HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
{
int i;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
for ( i = 0; i < SIZE256; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
ctx->buffer[i] = _mm_setzero_si128();
}
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
INIT256(ctx->chaining);
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
return SUCCESS_GR;
}
// Use this only for midstate and never for cryptonight
HashReturn_gr update_groestl256( hashState_groestl256* ctx, const void* input,
DataLength_gr databitlen )
{
__m128i* in = (__m128i*)input;
const int len = (int)databitlen / 128; // bits to __m128i
const int blocks = len / SIZE256; // __M128i to blocks
int rem = ctx->rem_ptr;
int i;
ctx->blk_count = blocks;
ctx->databitlen = databitlen;
// digest any full blocks
for ( i = 0; i < blocks; i++ )
TF512( ctx->chaining, &in[ i * SIZE256 ] );
// adjust buf_ptr to last block
ctx->buf_ptr = blocks * SIZE256;
// Copy any remainder to buffer
for ( i = 0; i < len % SIZE256; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
// adjust rem_ptr for new data
ctx->rem_ptr += i;
return SUCCESS_GR;
}
// don't use this at all
HashReturn_gr final_groestl256( hashState_groestl256* ctx, void* output )
{
const int len = (int)ctx->databitlen / 128; // bits to __m128i
const int blocks = ctx->blk_count + 1; // adjust for final block
const int rem_ptr = ctx->rem_ptr; // end of data start of padding
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE256 - hashlen_m128i; // where in buffer
int i;
// first pad byte = 0x80, last pad byte = block count
// everything in between is zero
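   // e.g. for a single 64-byte message: len = 4, rem_ptr = 0, blocks = 2, so the
   // padding block is 0x80 followed by 62 zero bytes and a final byte of 0x02.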
if ( rem_ptr == len - 1 )
{
// all padding at once
ctx->buffer[rem_ptr] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
}
else
{
// add first padding
ctx->buffer[rem_ptr] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
// add zero padding
for ( i = rem_ptr + 1; i < SIZE256 - 1; i++ )
ctx->buffer[i] = _mm_setzero_si128();
// add length padding
// cheat since we know the block count is trivial, good if block < 256
ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0 );
}
// digest final padding block and do output transform
TF512( ctx->chaining, ctx->buffer );
OF512( ctx->chaining );
// store hash result in output
for ( i = 0; i < hashlen_m128i; i++ )
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i];
return SUCCESS_GR;
}
HashReturn_gr update_and_final_groestl256( hashState_groestl256* ctx,
void* output, const void* input, DataLength_gr databitlen )
{
const int len = (int)databitlen / 128;
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE256 - hashlen_m128i;
int rem = ctx->rem_ptr;
int blocks = len / SIZE256;
__m128i* in = (__m128i*)input;
int i;
// --- update ---
// digest any full blocks, process directly from input
for ( i = 0; i < blocks; i++ )
TF512( ctx->chaining, &in[ i * SIZE256 ] );
ctx->buf_ptr = blocks * SIZE256;
// cryptonight has 200 byte input, an odd number of __m128i
// remainder is only 8 bytes, ie u64.
if ( databitlen % 128 !=0 )
{
// must be cryptonight, copy 64 bits of data
*(uint64_t*)(ctx->buffer) = *(uint64_t*)(&in[ ctx->buf_ptr ] );
i = -1; // signal for odd length
}
else
{
// Copy any remaining data to buffer for final transform
for ( i = 0; i < len % SIZE256; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
}
//--- final ---
// adjust for final block
blocks++;
if ( i == len - 1 )
{
// all padding at once
ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
0, 0,0,0, 0,0,0,0x80 );
}
else
{
if ( i == -1 )
{
// cryptonight odd length
((uint64_t*)ctx->buffer)[ 1 ] = 0x80ull;
// finish the block with zero and length padding as normal
i = 0;
}
else
{
// add first padding
ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
}
// add zero padding
for ( i += 1; i < SIZE256 - 1; i++ )
ctx->buffer[i] = _mm_setzero_si128();
// add length padding
   // the last two bytes hold the block count, big-endian
ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
0, 0,0,0, 0,0,0,0 );
}
// digest final padding block and do output transform
TF512( ctx->chaining, ctx->buffer );
OF512( ctx->chaining );
// store hash result in output
for ( i = 0; i < hashlen_m128i; i++ )
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
return SUCCESS_GR;
}
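/* Usage sketch (illustrative only, not part of this commit): hashing a single
 * 64-byte block with the combined update/final call above. Lengths are in bits
 * for the data and bytes for the digest; the helper name is hypothetical. */
static void groestl256_64byte_example( void *digest32, const void *block64 )
{
   hashState_groestl256 ctx;
   init_groestl256( &ctx, 32 );                                  // 256-bit output
   update_and_final_groestl256( &ctx, digest32, block64, 512 );  // 64 bytes = 512 bits
}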
/* hash bit sequence */
HashReturn_gr hash_groestl256(int hashbitlen,
const BitSequence_gr* data,
DataLength_gr databitlen,
BitSequence_gr* hashval) {
HashReturn_gr ret;
hashState_groestl256 context;
/* initialise */
if ((ret = init_groestl256(&context, hashbitlen/8)) != SUCCESS_GR)
return ret;
/* process message */
if ((ret = update_groestl256(&context, data, databitlen)) != SUCCESS_GR)
return ret;
/* finalise */
ret = final_groestl256(&context, hashval);
return ret;
}
/* eBash API */
//#ifdef crypto_hash_BYTES
//int crypto_hash(unsigned char *out, const unsigned char *in, unsigned long long inlen)
//{
// if (hash_groestl(crypto_hash_BYTES * 8, in, inlen * 8,out) == SUCCESS_GR) return 0;
// return -1;
//}
//#endif
#endif

View File

@@ -0,0 +1,121 @@
/* hash.h Aug 2011
*
* Groestl implementation for different versions.
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
*
* This code is placed in the public domain
*/
#ifndef __hash_h
#define __hash_h
#include <immintrin.h>
#include <stdio.h>
#if defined(_WIN64) || defined(__WINDOWS__)
#include <windows.h>
#endif
#include <stdlib.h>
/* eBash API begin */
/*
#include "crypto_hash.h"
#ifdef crypto_hash_BYTES
#include <crypto_uint8.h>
#include <crypto_uint32.h>
#include <crypto_uint64.h>
typedef crypto_uint8 u8;
typedef crypto_uint32 u32;
typedef crypto_uint64 u64;
#endif
*/
/* eBash API end */
//#define LENGTH (512)
#include "brg_endian.h"
#define NEED_UINT_64T
#include "algo/sha/brg_types.h"
#ifdef IACA_TRACE
#include IACA_MARKS
#endif
#define LENGTH (256)
/* some sizes (number of bytes) */
#define ROWS (8)
#define LENGTHFIELDLEN (ROWS)
#define COLS512 (8)
//#define COLS1024 (16)
#define SIZE_512 ((ROWS)*(COLS512))
//#define SIZE1024 ((ROWS)*(COLS1024))
#define ROUNDS512 (10)
//#define ROUNDS1024 (14)
//#if LENGTH<=256
#define COLS (COLS512)
//#define SIZE (SIZE512)
#define ROUNDS (ROUNDS512)
//#else
//#define COLS (COLS1024)
//#define SIZE (SIZE1024)
//#define ROUNDS (ROUNDS1024)
//#endif
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
#define U64BIG(a) (a)
#endif /* IS_BIG_ENDIAN */
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n)))
#define U64BIG(a) \
((ROTL64(a, 8) & li_64(000000FF000000FF)) | \
(ROTL64(a,24) & li_64(0000FF000000FF00)) | \
(ROTL64(a,40) & li_64(00FF000000FF0000)) | \
(ROTL64(a,56) & li_64(FF000000FF000000)))
#endif /* IS_LITTLE_ENDIAN */
typedef unsigned char BitSequence_gr;
typedef unsigned long long DataLength_gr;
typedef enum
{
SUCCESS_GR = 0,
FAIL_GR = 1,
BAD_HASHBITLEN_GR = 2
} HashReturn_gr;
#define SIZE256 (SIZE_512/16)
typedef struct {
__attribute__ ((aligned (32))) __m128i chaining[SIZE256];
__attribute__ ((aligned (32))) __m128i buffer[SIZE256];
// __attribute__ ((aligned (32))) u64 chaining[SIZE/8]; /* actual state */
// __attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE]; /* data buffer */
// u64 block_counter; /* message block counter */
int hashlen; // bytes
int blk_count;
int buf_ptr; /* data buffer pointer */
int rem_ptr;
int databitlen;
} hashState_groestl256;
HashReturn_gr init_groestl256( hashState_groestl256*, int );
HashReturn_gr reinit_groestl256( hashState_groestl256* );
HashReturn_gr update_groestl256( hashState_groestl256*, const void*,
DataLength_gr );
HashReturn_gr final_groestl256( hashState_groestl256*, void* );
HashReturn_gr hash_groestl256( int, const BitSequence_gr*, DataLength_gr,
BitSequence_gr* );
HashReturn_gr update_and_final_groestl256( hashState_groestl256*, void*,
const void*, DataLength_gr );
#endif /* __hash_h */

View File

@@ -0,0 +1,492 @@
/* groestl-intr-aes.h Aug 2011
*
* Groestl implementation with intrinsics using ssse3, sse4.1, and aes
* instructions.
* Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
*
* This code is placed in the public domain
*/
#include <smmintrin.h>
#include <wmmintrin.h>
#include "hash-groestl256.h"
/* global constants */
__m128i ROUND_CONST_Lx;
__m128i ROUND_CONST_L0[ROUNDS512];
__m128i ROUND_CONST_L7[ROUNDS512];
//__m128i ROUND_CONST_P[ROUNDS1024];
//__m128i ROUND_CONST_Q[ROUNDS1024];
__m128i TRANSP_MASK;
__m128i SUBSH_MASK[8];
__m128i ALL_1B;
__m128i ALL_FF;
#define tos(a) #a
#define tostr(a) tos(a)
/* xmm[i] will be multiplied by 2
* xmm[j] will be lost
* xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
j = _mm_xor_si128(j, j);\
j = _mm_cmpgt_epi8(j, i);\
i = _mm_add_epi8(i, i);\
j = _mm_and_si128(j, k);\
i = _mm_xor_si128(i, j);\
}
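/* For reference (illustrative sketch, not part of this commit): the same GF(2^8)
 * doubling applied to one byte; the macro above does this for every byte lane,
 * using the AES reduction polynomial 0x1b. */
static inline unsigned char gf256_mul2( unsigned char x )
{
   return (unsigned char)( ( x << 1 ) ^ ( ( x & 0x80 ) ? 0x1b : 0x00 ) );
}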
/**/
/* Yet another implementation of MixBytes.
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
Input: a0, ..., a7
Output: b0, ..., b7 = MixBytes(a0,...,a7).
but we use the relations:
t_i = a_i + a_{i+1}
x_i = t_i + t_{i+3}
y_i = t_i + t_{i+2} + a_{i+6}
z_i = 2*x_i
w_i = z_i + y_{i+4}
v_i = 2*w_i
b_i = v_{i+3} + y_{i+4}
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
and then adding v_i computed in the meantime in registers xmm0..xmm7.
We almost fit into 16 registers, need only 3 spills to memory.
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
b6 = a0;\
b7 = a1;\
a0 = _mm_xor_si128(a0, a1);\
b0 = a2;\
a1 = _mm_xor_si128(a1, a2);\
b1 = a3;\
a2 = _mm_xor_si128(a2, a3);\
b2 = a4;\
a3 = _mm_xor_si128(a3, a4);\
b3 = a5;\
a4 = _mm_xor_si128(a4, a5);\
b4 = a6;\
a5 = _mm_xor_si128(a5, a6);\
b5 = a7;\
a6 = _mm_xor_si128(a6, a7);\
a7 = _mm_xor_si128(a7, b6);\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
b0 = _mm_xor_si128(b0, a4);\
b6 = _mm_xor_si128(b6, a4);\
b1 = _mm_xor_si128(b1, a5);\
b7 = _mm_xor_si128(b7, a5);\
b2 = _mm_xor_si128(b2, a6);\
b0 = _mm_xor_si128(b0, a6);\
/* spill values y_4, y_5 to memory */\
TEMP0 = b0;\
b3 = _mm_xor_si128(b3, a7);\
b1 = _mm_xor_si128(b1, a7);\
TEMP1 = b1;\
b4 = _mm_xor_si128(b4, a0);\
b2 = _mm_xor_si128(b2, a0);\
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0;\
b5 = _mm_xor_si128(b5, a1);\
b3 = _mm_xor_si128(b3, a1);\
b1 = a1;\
b6 = _mm_xor_si128(b6, a2);\
b4 = _mm_xor_si128(b4, a2);\
TEMP2 = a2;\
b7 = _mm_xor_si128(b7, a3);\
b5 = _mm_xor_si128(b5, a3);\
\
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm_xor_si128(a0, a3);\
a1 = _mm_xor_si128(a1, a4);\
a2 = _mm_xor_si128(a2, a5);\
a3 = _mm_xor_si128(a3, a6);\
a4 = _mm_xor_si128(a4, a7);\
a5 = _mm_xor_si128(a5, b0);\
a6 = _mm_xor_si128(a6, b1);\
a7 = _mm_xor_si128(a7, TEMP2);\
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = ALL_1B;\
MUL2(a0, b0, b1);\
a0 = _mm_xor_si128(a0, TEMP0);\
MUL2(a1, b0, b1);\
a1 = _mm_xor_si128(a1, TEMP1);\
MUL2(a2, b0, b1);\
a2 = _mm_xor_si128(a2, b2);\
MUL2(a3, b0, b1);\
a3 = _mm_xor_si128(a3, b3);\
MUL2(a4, b0, b1);\
a4 = _mm_xor_si128(a4, b4);\
MUL2(a5, b0, b1);\
a5 = _mm_xor_si128(a5, b5);\
MUL2(a6, b0, b1);\
a6 = _mm_xor_si128(a6, b6);\
MUL2(a7, b0, b1);\
a7 = _mm_xor_si128(a7, b7);\
\
/* compute v_i : double w_i */\
/* add to y_4 y_5 .. v3, v4, ... */\
MUL2(a0, b0, b1);\
b5 = _mm_xor_si128(b5, a0);\
MUL2(a1, b0, b1);\
b6 = _mm_xor_si128(b6, a1);\
MUL2(a2, b0, b1);\
b7 = _mm_xor_si128(b7, a2);\
MUL2(a5, b0, b1);\
b2 = _mm_xor_si128(b2, a5);\
MUL2(a6, b0, b1);\
b3 = _mm_xor_si128(b3, a6);\
MUL2(a7, b0, b1);\
b4 = _mm_xor_si128(b4, a7);\
MUL2(a3, b0, b1);\
MUL2(a4, b0, b1);\
b0 = TEMP0;\
b1 = TEMP1;\
b0 = _mm_xor_si128(b0, a3);\
b1 = _mm_xor_si128(b1, a4);\
}/*MixBytes*/
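/* Illustrative sketch (not part of this commit): the relations in the comment
 * above written out for one 8-byte column, indices mod 8, '+' meaning XOR and
 * doubling as in gf256_mul2() above. The macro computes the same thing with
 * whole state rows packed into vector registers. */
static void mixbytes_column_ref( const unsigned char a[8], unsigned char b[8] )
{
   unsigned char t[8], x[8], y[8], w[8], v[8];
   for ( int i = 0; i < 8; i++ ) t[i] = a[i] ^ a[ (i+1) & 7 ];
   for ( int i = 0; i < 8; i++ ) x[i] = t[i] ^ t[ (i+3) & 7 ];
   for ( int i = 0; i < 8; i++ ) y[i] = t[i] ^ t[ (i+2) & 7 ] ^ a[ (i+6) & 7 ];
   for ( int i = 0; i < 8; i++ ) w[i] = gf256_mul2( x[i] ) ^ y[ (i+4) & 7 ];  // z_i = 2*x_i
   for ( int i = 0; i < 8; i++ ) v[i] = gf256_mul2( w[i] );
   for ( int i = 0; i < 8; i++ ) b[i] = v[ (i+3) & 7 ] ^ y[ (i+4) & 7 ];
}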
#define SET_CONSTANTS(){\
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
for(i = 0; i < ROUNDS512; i++)\
{\
ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
}\
ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
}while(0); \
/* one round
* i = round number
* a0-a7 = input rows
* b0-b7 = output rows
*/
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant */\
b1 = ROUND_CONST_Lx;\
a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
a1 = _mm_xor_si128(a1, b1);\
a2 = _mm_xor_si128(a2, b1);\
a3 = _mm_xor_si128(a3, b1);\
a4 = _mm_xor_si128(a4, b1);\
a5 = _mm_xor_si128(a5, b1);\
a6 = _mm_xor_si128(a6, b1);\
a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
\
/* ShiftBytes + SubBytes (interleaved) */\
b0 = _mm_xor_si128(b0, b0);\
a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
a0 = _mm_aesenclast_si128(a0, b0);\
a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
a1 = _mm_aesenclast_si128(a1, b0);\
a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
a2 = _mm_aesenclast_si128(a2, b0);\
a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
a3 = _mm_aesenclast_si128(a3, b0);\
a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
a4 = _mm_aesenclast_si128(a4, b0);\
a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
a5 = _mm_aesenclast_si128(a5, b0);\
a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
a6 = _mm_aesenclast_si128(a6, b0);\
a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
a7 = _mm_aesenclast_si128(a7, b0);\
\
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
\
}
/* 10 rounds, P and Q in parallel */
#define ROUNDS_P_Q(){\
ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}
/* Matrix Transpose Step 1
* input is a 512-bit state with two columns in one xmm
* output is a 512-bit state with two rows in one xmm
* inputs: i0-i3
* outputs: i0, o1-o3
* clobbers: t0
*/
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
t0 = TRANSP_MASK;\
\
i0 = _mm_shuffle_epi8(i0, t0);\
i1 = _mm_shuffle_epi8(i1, t0);\
i2 = _mm_shuffle_epi8(i2, t0);\
i3 = _mm_shuffle_epi8(i3, t0);\
\
o1 = i0;\
t0 = i2;\
\
i0 = _mm_unpacklo_epi16(i0, i1);\
o1 = _mm_unpackhi_epi16(o1, i1);\
i2 = _mm_unpacklo_epi16(i2, i3);\
t0 = _mm_unpackhi_epi16(t0, i3);\
\
i0 = _mm_shuffle_epi32(i0, 216);\
o1 = _mm_shuffle_epi32(o1, 216);\
i2 = _mm_shuffle_epi32(i2, 216);\
t0 = _mm_shuffle_epi32(t0, 216);\
\
o2 = i0;\
o3 = o1;\
\
i0 = _mm_unpacklo_epi32(i0, i2);\
o1 = _mm_unpacklo_epi32(o1, t0);\
o2 = _mm_unpackhi_epi32(o2, i2);\
o3 = _mm_unpackhi_epi32(o3, t0);\
}/**/
/* Matrix Transpose Step 2
* input are two 512-bit states with two rows in one xmm
* output are two 512-bit states with one row of each state in one xmm
* inputs: i0-i3 = P, i4-i7 = Q
* outputs: (i0, o1-o7) = (P|Q)
* possible reassignments: (output reg = input reg)
* * i1 -> o3-7
* * i2 -> o5-7
* * i3 -> o7
* * i4 -> o3-7
* * i5 -> o6-7
*/
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
o1 = i0;\
o2 = i1;\
i0 = _mm_unpacklo_epi64(i0, i4);\
o1 = _mm_unpackhi_epi64(o1, i4);\
o3 = i1;\
o4 = i2;\
o2 = _mm_unpacklo_epi64(o2, i5);\
o3 = _mm_unpackhi_epi64(o3, i5);\
o5 = i2;\
o6 = i3;\
o4 = _mm_unpacklo_epi64(o4, i6);\
o5 = _mm_unpackhi_epi64(o5, i6);\
o7 = i3;\
o6 = _mm_unpacklo_epi64(o6, i7);\
o7 = _mm_unpackhi_epi64(o7, i7);\
}/**/
/* Matrix Transpose Inverse Step 2
* input are two 512-bit states with one row of each state in one xmm
* output are two 512-bit states with two rows in one xmm
* inputs: i0-i7 = (P|Q)
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
*/
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
o0 = i0;\
i0 = _mm_unpacklo_epi64(i0, i1);\
o0 = _mm_unpackhi_epi64(o0, i1);\
o1 = i2;\
i2 = _mm_unpacklo_epi64(i2, i3);\
o1 = _mm_unpackhi_epi64(o1, i3);\
o2 = i4;\
i4 = _mm_unpacklo_epi64(i4, i5);\
o2 = _mm_unpackhi_epi64(o2, i5);\
o3 = i6;\
i6 = _mm_unpacklo_epi64(i6, i7);\
o3 = _mm_unpackhi_epi64(o3, i7);\
}/**/
/* Matrix Transpose Output Step 2
* input is one 512-bit state with two rows in one xmm
* output is one 512-bit state with one row in the low 64-bits of one xmm
* inputs: i0,i2,i4,i6 = S
* outputs: (i0-7) = (0|S)
*/
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
t0 = _mm_xor_si128(t0, t0);\
i1 = i0;\
i3 = i2;\
i5 = i4;\
i7 = i6;\
i0 = _mm_unpacklo_epi64(i0, t0);\
i1 = _mm_unpackhi_epi64(i1, t0);\
i2 = _mm_unpacklo_epi64(i2, t0);\
i3 = _mm_unpackhi_epi64(i3, t0);\
i4 = _mm_unpacklo_epi64(i4, t0);\
i5 = _mm_unpackhi_epi64(i5, t0);\
i6 = _mm_unpacklo_epi64(i6, t0);\
i7 = _mm_unpackhi_epi64(i7, t0);\
}/**/
/* Matrix Transpose Output Inverse Step 2
* input is one 512-bit state with one row in the low 64-bits of one xmm
* output is one 512-bit state with two rows in one xmm
* inputs: i0-i7 = (0|S)
* outputs: (i0, i2, i4, i6) = S
*/
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
i0 = _mm_unpacklo_epi64(i0, i1);\
i2 = _mm_unpacklo_epi64(i2, i3);\
i4 = _mm_unpacklo_epi64(i4, i5);\
i6 = _mm_unpacklo_epi64(i6, i7);\
}/**/
void INIT256( __m128i* chaining )
{
static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7;
static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15;
/* load IV into registers xmm12 - xmm15 */
xmm12 = chaining[0];
xmm13 = chaining[1];
xmm14 = chaining[2];
xmm15 = chaining[3];
/* transform chaining value from column ordering into row ordering */
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
/* store transposed IV */
chaining[0] = xmm12;
chaining[1] = xmm2;
chaining[2] = xmm6;
chaining[3] = xmm7;
}
void TF512( __m128i* chaining, __m128i* message )
{
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP0;
static __m128i TEMP1;
static __m128i TEMP2;
#ifdef IACA_TRACE
IACA_START;
#endif
/* load message into registers xmm12 - xmm15 */
xmm12 = message[0];
xmm13 = message[1];
xmm14 = message[2];
xmm15 = message[3];
/* transform message M from column ordering into row ordering */
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
/* load previous chaining value */
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
xmm8 = chaining[0];
xmm0 = chaining[1];
xmm4 = chaining[2];
xmm5 = chaining[3];
/* xor message to CV get input of P */
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
xmm8 = _mm_xor_si128(xmm8, xmm12);
xmm0 = _mm_xor_si128(xmm0, xmm2);
xmm4 = _mm_xor_si128(xmm4, xmm6);
xmm5 = _mm_xor_si128(xmm5, xmm7);
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
/* result: the 8 rows of P and Q in xmm8 - xmm12 */
Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
/* compute the two permutations P and Q in parallel */
ROUNDS_P_Q();
/* unpack again to get two rows of P or two rows of Q in one xmm register */
Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);
/* xor output of P and Q */
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
xmm0 = _mm_xor_si128(xmm0, xmm8);
xmm1 = _mm_xor_si128(xmm1, xmm10);
xmm2 = _mm_xor_si128(xmm2, xmm12);
xmm3 = _mm_xor_si128(xmm3, xmm14);
/* xor CV (feed-forward) */
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
xmm0 = _mm_xor_si128(xmm0, (chaining[0]));
xmm1 = _mm_xor_si128(xmm1, (chaining[1]));
xmm2 = _mm_xor_si128(xmm2, (chaining[2]));
xmm3 = _mm_xor_si128(xmm3, (chaining[3]));
/* store CV */
chaining[0] = xmm0;
chaining[1] = xmm1;
chaining[2] = xmm2;
chaining[3] = xmm3;
#ifdef IACA_TRACE
IACA_END;
#endif
return;
}
void OF512( __m128i* chaining )
{
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP0;
static __m128i TEMP1;
static __m128i TEMP2;
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
xmm8 = chaining[0];
xmm10 = chaining[1];
xmm12 = chaining[2];
xmm14 = chaining[3];
/* there are now 2 rows of the CV in one xmm register */
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
/* result: the 8 input rows of P in xmm8 - xmm15 */
Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);
/* compute the permutation P */
/* result: the output of P(CV) in xmm8 - xmm15 */
ROUNDS_P_Q();
/* unpack again to get two rows of P in one xmm register */
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
/* xor CV to P output (feed-forward) */
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
/* transform state back from row ordering into column ordering */
/* result: final hash value in xmm9, xmm11 */
Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);
/* we only need to return the truncated half of the state */
chaining[2] = xmm9;
chaining[3] = xmm11;
}

View File

@@ -0,0 +1,114 @@
/* hash.c Aug 2011
* groestl512-hash-4way https://github.com/JayDDee/cpuminer-opt 2019-12.
*
* Groestl implementation for different versions.
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
*
* This code is placed in the public domain
*/
// Optimized for hash and data lengths that are integral multiples of __m128i
#include <memory.h>
#include "groestl512-intr-4way.h"
#include "miner.h"
#include "simd-utils.h"
#if defined(__VAES__)
#define ROTL64(a,n) \
( ( ( (a)<<(n) ) | ( (a) >> (64-(n)) ) ) & 0xffffffffffffffff )
#define U64BIG(a) \
( ( ROTL64(a, 8) & 0x000000FF000000FF ) | \
( ROTL64(a,24) & 0x0000FF000000FF00 ) | \
( ROTL64(a,40) & 0x00FF000000FF0000 ) | \
( ROTL64(a,56) & 0xFF000000FF000000 ) )
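// Note: U64BIG() above amounts to a plain 64-bit byte swap built from rotates
// and masks, e.g. U64BIG(0x0123456789abcdefULL) == 0xefcdab8967452301ULL.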
int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
{
int i;
ctx->hashlen = hashlen;
SET_CONSTANTS();
if (ctx->chaining == NULL || ctx->buffer == NULL)
return 1;
for ( i = 0; i < SIZE512; i++ )
{
ctx->chaining[i] = m512_zero;
ctx->buffer[i] = m512_zero;
}
uint64_t len = U64BIG((uint64_t)LENGTH);
ctx->chaining[ COLS/2 -1 ] = _mm512_set4_epi64( len, 0, len, 0 );
INIT_4way(ctx->chaining);
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
return 0;
}
int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output,
const void* input, uint64_t databitlen )
{
const int len = (int)databitlen / 128;
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE512 - hashlen_m128i;
int rem = ctx->rem_ptr;
int blocks = len / SIZE512;
__m512i* in = (__m512i*)input;
int i;
// --- update ---
// digest any full blocks, process directly from input
for ( i = 0; i < blocks; i++ )
TF1024_4way( ctx->chaining, &in[ i * SIZE512 ] );
ctx->buf_ptr = blocks * SIZE512;
// copy any remaining data to buffer, it may already contain data
// from a previous update for a midstate precalc
for ( i = 0; i < len % SIZE512; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
//--- final ---
blocks++; // adjust for final block
if ( i == SIZE512 - 1 )
{
// only 1 vector left in buffer, all padding at once
ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
}
else
{
// add first padding
ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
// add zero padding
for ( i += 1; i < SIZE512 - 1; i++ )
ctx->buffer[i] = m512_zero;
// add length padding, second last byte is zero unless blocks > 255
ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) );
}
// digest final padding block and do output transform
TF1024_4way( ctx->chaining, ctx->buffer );
OF1024_4way( ctx->chaining );
// store hash result in output
for ( i = 0; i < hashlen_m128i; i++ )
casti_m512i( output, i ) = ctx->chaining[ hash_offset + i ];
return 0;
}
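/* Usage sketch (illustrative, not part of this commit): hashing four independent
 * 64-byte messages in parallel with the context above. intrlv_4x128() is assumed
 * to mirror the dintrlv_4x128() helper used elsewhere in this commit; lengths
 * passed to the hash calls are in bits. */
static void groestl512_4way_example( void *out0, void *out1, void *out2, void *out3,
                                     const void *in0,  const void *in1,
                                     const void *in2,  const void *in3 )
{
   groestl512_4way_context ctx;
   __m512i vin [4] __attribute__ ((aligned (128)));   // 4 lanes x 64 bytes, interleaved
   __m512i vout[4] __attribute__ ((aligned (128)));
   intrlv_4x128( vin, in0, in1, in2, in3, 512 );
   groestl512_4way_init( &ctx, 64 );                  // 64-byte (512-bit) digests
   groestl512_4way_update_close( &ctx, vout, vin, 512 );
   dintrlv_4x128( out0, out1, out2, out3, vout, 512 );
}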
#endif // VAES

View File

@@ -0,0 +1,94 @@
/* hash.h Aug 2011
*
* Groestl implementation for different versions.
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
*
* This code is placed in the public domain
*/
#if !defined(GROESTL512_HASH_4WAY_H__)
#define GROESTL512_HASH_4WAY_H__ 1
#include "simd-utils.h"
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#if defined(_WIN64) || defined(__WINDOWS__)
#include <windows.h>
#endif
#include <stdlib.h>
#define LENGTH (512)
//#include "brg_endian.h"
//#define NEED_UINT_64T
//#include "algo/sha/brg_types.h"
/* some sizes (number of bytes) */
#define ROWS (8)
#define LENGTHFIELDLEN (ROWS)
//#define COLS512 (8)
#define COLS1024 (16)
//#define SIZE512 ((ROWS)*(COLS512))
#define SIZE_1024 ((ROWS)*(COLS1024))
//#define ROUNDS512 (10)
#define ROUNDS1024 (14)
//#if LENGTH<=256
//#define COLS (COLS512)
//#define SIZE (SIZE512)
//#define ROUNDS (ROUNDS512)
//#else
#define COLS (COLS1024)
//#define SIZE (SIZE1024)
#define ROUNDS (ROUNDS1024)
//#endif
/*
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
#define U64BIG(a) (a)
#endif // IS_BIG_ENDIAN
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n)))
#define U64BIG(a) \
((ROTL64(a, 8) & li_64(000000FF000000FF)) | \
(ROTL64(a,24) & li_64(0000FF000000FF00)) | \
(ROTL64(a,40) & li_64(00FF000000FF0000)) | \
(ROTL64(a,56) & li_64(FF000000FF000000)))
#endif // IS_LITTLE_ENDIAN
typedef unsigned char BitSequence_gr;
typedef unsigned long long DataLength_gr;
typedef enum { SUCCESS_GR = 0, FAIL_GR = 1, BAD_HASHBITLEN_GR = 2} HashReturn_gr;
*/
#define SIZE512 (SIZE_1024/16)
typedef struct {
__attribute__ ((aligned (128))) __m512i chaining[SIZE512];
__attribute__ ((aligned (64))) __m512i buffer[SIZE512];
int hashlen; // byte
int blk_count; // SIZE_m128i
int buf_ptr; // __m128i offset
int rem_ptr;
int databitlen; // bits
} groestl512_4way_context;
int groestl512_4way_init( groestl512_4way_context*, uint64_t );
//int reinit_groestl( hashState_groestl* );
int groestl512_4way_update( groestl512_4way_context*, const void*,
uint64_t );
int groestl512_4way_close( groestl512_4way_context*, void* );
int groestl512_4way_update_close( groestl512_4way_context*, void*,
const void*, uint64_t );
#endif // GROESTL512_HASH_4WAY_H__

View File

@@ -0,0 +1,654 @@
/* groestl-intr-aes.h Aug 2011
*
* Groestl implementation with intrinsics using ssse3, sse4.1, and aes
* instructions.
* Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
*
* This code is placed in the public domain
*/
#if !defined(GROESTL512_INTR_4WAY_H__)
#define GROESTL512_INTR_4WAY_H__ 1
#include "groestl512-hash-4way.h"
#if defined(__VAES__)
/* global constants */
__m512i ROUND_CONST_Lx;
//__m128i ROUND_CONST_L0[ROUNDS512];
//__m128i ROUND_CONST_L7[ROUNDS512];
__m512i ROUND_CONST_P[ROUNDS1024];
__m512i ROUND_CONST_Q[ROUNDS1024];
__m512i TRANSP_MASK;
__m512i SUBSH_MASK[8];
__m512i ALL_1B;
__m512i ALL_FF;
#define tos(a) #a
#define tostr(a) tos(a)
/* xmm[i] will be multiplied by 2
* xmm[j] will be lost
* xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
j = _mm512_xor_si512(j, j);\
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\
i = _mm512_add_epi8(i, i);\
j = _mm512_and_si512(j, k);\
i = _mm512_xor_si512(i, j);\
}
/**/
/* Yet another implementation of MixBytes.
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
Input: a0, ..., a7
Output: b0, ..., b7 = MixBytes(a0,...,a7).
but we use the relations:
t_i = a_i + a_{i+1}
x_i = t_i + t_{i+3}
y_i = t_i + t_{i+2} + a_{i+6}
z_i = 2*x_i
w_i = z_i + y_{i+4}
v_i = 2*w_i
b_i = v_{i+3} + y_{i+4}
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
and then adding v_i computed in the meantime in registers xmm0..xmm7.
We almost fit into 16 registers, need only 3 spills to memory.
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
K. Matusiewicz, 2011/05/29 */
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* t_i = a_i + a_{i+1} */\
b6 = a0;\
b7 = a1;\
a0 = _mm512_xor_si512(a0, a1);\
b0 = a2;\
a1 = _mm512_xor_si512(a1, a2);\
b1 = a3;\
a2 = _mm512_xor_si512(a2, a3);\
b2 = a4;\
a3 = _mm512_xor_si512(a3, a4);\
b3 = a5;\
a4 = _mm512_xor_si512(a4, a5);\
b4 = a6;\
a5 = _mm512_xor_si512(a5, a6);\
b5 = a7;\
a6 = _mm512_xor_si512(a6, a7);\
a7 = _mm512_xor_si512(a7, b6);\
\
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
b0 = _mm512_xor_si512(b0, a4);\
b6 = _mm512_xor_si512(b6, a4);\
b1 = _mm512_xor_si512(b1, a5);\
b7 = _mm512_xor_si512(b7, a5);\
b2 = _mm512_xor_si512(b2, a6);\
b0 = _mm512_xor_si512(b0, a6);\
/* spill values y_4, y_5 to memory */\
TEMP0 = b0;\
b3 = _mm512_xor_si512(b3, a7);\
b1 = _mm512_xor_si512(b1, a7);\
TEMP1 = b1;\
b4 = _mm512_xor_si512(b4, a0);\
b2 = _mm512_xor_si512(b2, a0);\
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
b0 = a0;\
b5 = _mm512_xor_si512(b5, a1);\
b3 = _mm512_xor_si512(b3, a1);\
b1 = a1;\
b6 = _mm512_xor_si512(b6, a2);\
b4 = _mm512_xor_si512(b4, a2);\
TEMP2 = a2;\
b7 = _mm512_xor_si512(b7, a3);\
b5 = _mm512_xor_si512(b5, a3);\
\
/* compute x_i = t_i + t_{i+3} */\
a0 = _mm512_xor_si512(a0, a3);\
a1 = _mm512_xor_si512(a1, a4);\
a2 = _mm512_xor_si512(a2, a5);\
a3 = _mm512_xor_si512(a3, a6);\
a4 = _mm512_xor_si512(a4, a7);\
a5 = _mm512_xor_si512(a5, b0);\
a6 = _mm512_xor_si512(a6, b1);\
a7 = _mm512_xor_si512(a7, TEMP2);\
\
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
/* compute w_i : add y_{i+4} */\
b1 = ALL_1B;\
MUL2(a0, b0, b1);\
a0 = _mm512_xor_si512(a0, TEMP0);\
MUL2(a1, b0, b1);\
a1 = _mm512_xor_si512(a1, TEMP1);\
MUL2(a2, b0, b1);\
a2 = _mm512_xor_si512(a2, b2);\
MUL2(a3, b0, b1);\
a3 = _mm512_xor_si512(a3, b3);\
MUL2(a4, b0, b1);\
a4 = _mm512_xor_si512(a4, b4);\
MUL2(a5, b0, b1);\
a5 = _mm512_xor_si512(a5, b5);\
MUL2(a6, b0, b1);\
a6 = _mm512_xor_si512(a6, b6);\
MUL2(a7, b0, b1);\
a7 = _mm512_xor_si512(a7, b7);\
\
/* compute v_i : double w_i */\
/* add to y_4 y_5 .. v3, v4, ... */\
MUL2(a0, b0, b1);\
b5 = _mm512_xor_si512(b5, a0);\
MUL2(a1, b0, b1);\
b6 = _mm512_xor_si512(b6, a1);\
MUL2(a2, b0, b1);\
b7 = _mm512_xor_si512(b7, a2);\
MUL2(a5, b0, b1);\
b2 = _mm512_xor_si512(b2, a5);\
MUL2(a6, b0, b1);\
b3 = _mm512_xor_si512(b3, a6);\
MUL2(a7, b0, b1);\
b4 = _mm512_xor_si512(b4, a7);\
MUL2(a3, b0, b1);\
MUL2(a4, b0, b1);\
b0 = TEMP0;\
b1 = TEMP1;\
b0 = _mm512_xor_si512(b0, a3);\
b1 = _mm512_xor_si512(b1, a4);\
}/*MixBytes*/
// calculate the round constants separately and load at startup
#define SET_CONSTANTS(){\
ALL_FF = _mm512_set1_epi32( 0xffffffff );\
ALL_1B = _mm512_set1_epi32( 0x1b1b1b1b );\
TRANSP_MASK = _mm512_set_epi32( \
0x3f373b33, 0x3e363a32, 0x3d353931, 0x3c343830, \
0x2f272b23, 0x2e262a22, 0x2d252921, 0x2c242820, \
0x1f171b13, 0x1e161a12, 0x1d151911, 0x1c141810, \
0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800 ); \
SUBSH_MASK[0] = _mm512_set_epi32( \
0x3336393c, 0x3f323538, 0x3b3e3134, 0x373a3d30, \
0x2326292c, 0x2f222528, 0x2b2e2124, 0x272a2d20, \
0x1316191c, 0x1f121518, 0x1b1e1114, 0x171a1d10, \
0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00 ); \
SUBSH_MASK[1] = _mm512_set_epi32( \
0x34373a3d, 0x30333639, 0x3c3f3235, 0x383b3e31, \
0x24272a2d, 0x20232629, 0x2c2f2225, 0x282b2e21, \
0x14171a1d, 0x10131619, 0x1c1f1215, 0x181b1e11, \
0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01 ); \
SUBSH_MASK[2] = _mm512_set_epi32( \
0x35383b3e, 0x3134373a, 0x3d303336, 0x393c3f32, \
0x25282b2e, 0x2124272a, 0x2d202326, 0x292c2f22, \
0x15181b1e, 0x1114171a, 0x1d101316, 0x191c1f12, \
0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02 ); \
SUBSH_MASK[3] = _mm512_set_epi32( \
0x36393c3f, 0x3235383b, 0x3e313437, 0x3a3d3033, \
0x26292c2f, 0x2225282b, 0x2e212427, 0x2a2d2023, \
0x16191c1f, 0x1215181b, 0x1e111417, 0x1a1d1013, \
0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003 ); \
SUBSH_MASK[4] = _mm512_set_epi32( \
0x373a3d30, 0x3336393c, 0x3f323538, 0x3b3e3134, \
0x272a2d20, 0x2326292c, 0x2f222528, 0x2b2e2124, \
0x171a1d10, 0x1316191c, 0x1f121518, 0x1b1e1114, \
0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104 ); \
SUBSH_MASK[5] = _mm512_set_epi32( \
0x383b3e31, 0x34373a3d, 0x30333639, 0x3c3f3235, \
0x282b2e21, 0x24272a2d, 0x20232629, 0x2c2f2225, \
0x181b1e11, 0x14171a1d, 0x10131619, 0x1c1f1215, \
0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205 ); \
SUBSH_MASK[6] = _mm512_set_epi32( \
0x393c3f32, 0x35383b3e, 0x3134373a, 0x3d303336, \
0x292c2f22, 0x25282b2e, 0x2124272a, 0x2d202326, \
0x191c1f12, 0x15181b1e, 0x1114171a, 0x1d101316, \
0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306 ); \
SUBSH_MASK[7] = _mm512_set_epi32( \
0x3e313437, 0x3a3d3033, 0x36393c3f, 0x3235383b, \
0x2e212427, 0x2a2d2023, 0x26292c2f, 0x2225282b, \
0x1e111417, 0x1a1d1013, 0x16191c1f, 0x1215181b, \
0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b ); \
for( i = 0; i < ROUNDS1024; i++ ) \
{ \
ROUND_CONST_P[i] = _mm512_set4_epi32( 0xf0e0d0c0 ^ (i * 0x01010101), \
0xb0a09080 ^ (i * 0x01010101), \
0x70605040 ^ (i * 0x01010101), \
0x30201000 ^ (i * 0x01010101) ); \
ROUND_CONST_Q[i] = _mm512_set4_epi32( 0x0f1f2f3f ^ (i * 0x01010101), \
0x4f5f6f7f ^ (i * 0x01010101), \
0x8f9fafbf ^ (i * 0x01010101), \
0xcfdfefff ^ (i * 0x01010101));\
} \
}while(0);\
/* one round
* a0-a7 = input rows
* b0-b7 = output rows
*/
#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* SubBytes */\
b0 = _mm512_xor_si512( b0, b0 );\
a0 = _mm512_aesenclast_epi128( a0, b0 );\
a1 = _mm512_aesenclast_epi128( a1, b0 );\
a2 = _mm512_aesenclast_epi128( a2, b0 );\
a3 = _mm512_aesenclast_epi128( a3, b0 );\
a4 = _mm512_aesenclast_epi128( a4, b0 );\
a5 = _mm512_aesenclast_epi128( a5, b0 );\
a6 = _mm512_aesenclast_epi128( a6, b0 );\
a7 = _mm512_aesenclast_epi128( a7, b0 );\
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
}
#define ROUNDS_P(){\
uint8_t round_counter = 0;\
for ( round_counter = 0; round_counter < 14; round_counter += 2 ) \
{ \
/* AddRoundConstant P1024 */\
xmm8 = _mm512_xor_si512( xmm8, ( ROUND_CONST_P[ round_counter ] ) );\
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm8 = _mm512_shuffle_epi8( xmm8, ( SUBSH_MASK[0] ) );\
xmm9 = _mm512_shuffle_epi8( xmm9, ( SUBSH_MASK[1] ) );\
xmm10 = _mm512_shuffle_epi8( xmm10, ( SUBSH_MASK[2] ) );\
xmm11 = _mm512_shuffle_epi8( xmm11, ( SUBSH_MASK[3] ) );\
xmm12 = _mm512_shuffle_epi8( xmm12, ( SUBSH_MASK[4] ) );\
xmm13 = _mm512_shuffle_epi8( xmm13, ( SUBSH_MASK[5] ) );\
xmm14 = _mm512_shuffle_epi8( xmm14, ( SUBSH_MASK[6] ) );\
xmm15 = _mm512_shuffle_epi8( xmm15, ( SUBSH_MASK[7] ) );\
/* SubBytes + MixBytes */\
SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
\
/* AddRoundConstant P1024 */\
xmm0 = _mm512_xor_si512( xmm0, ( ROUND_CONST_P[ round_counter+1 ] ) );\
/* ShiftBytes P1024 + pre-AESENCLAST */\
xmm0 = _mm512_shuffle_epi8( xmm0, ( SUBSH_MASK[0] ) );\
xmm1 = _mm512_shuffle_epi8( xmm1, ( SUBSH_MASK[1] ) );\
xmm2 = _mm512_shuffle_epi8( xmm2, ( SUBSH_MASK[2] ) );\
xmm3 = _mm512_shuffle_epi8( xmm3, ( SUBSH_MASK[3] ) );\
xmm4 = _mm512_shuffle_epi8( xmm4, ( SUBSH_MASK[4] ) );\
xmm5 = _mm512_shuffle_epi8( xmm5, ( SUBSH_MASK[5] ) );\
xmm6 = _mm512_shuffle_epi8( xmm6, ( SUBSH_MASK[6] ) );\
xmm7 = _mm512_shuffle_epi8( xmm7, ( SUBSH_MASK[7] ) );\
/* SubBytes + MixBytes */\
SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}\
}
#define ROUNDS_Q(){\
uint8_t round_counter = 0;\
for ( round_counter = 0; round_counter < 14; round_counter += 2) \
{ \
/* AddRoundConstant Q1024 */\
xmm1 = ALL_FF;\
xmm8 = _mm512_xor_si512( xmm8, xmm1 );\
xmm9 = _mm512_xor_si512( xmm9, xmm1 );\
xmm10 = _mm512_xor_si512( xmm10, xmm1 );\
xmm11 = _mm512_xor_si512( xmm11, xmm1 );\
xmm12 = _mm512_xor_si512( xmm12, xmm1 );\
xmm13 = _mm512_xor_si512( xmm13, xmm1 );\
xmm14 = _mm512_xor_si512( xmm14, xmm1 );\
xmm15 = _mm512_xor_si512( xmm15, ( ROUND_CONST_Q[ round_counter ] ) );\
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm8 = _mm512_shuffle_epi8( xmm8, ( SUBSH_MASK[1] ) );\
xmm9 = _mm512_shuffle_epi8( xmm9, ( SUBSH_MASK[3] ) );\
xmm10 = _mm512_shuffle_epi8( xmm10, ( SUBSH_MASK[5] ) );\
xmm11 = _mm512_shuffle_epi8( xmm11, ( SUBSH_MASK[7] ) );\
xmm12 = _mm512_shuffle_epi8( xmm12, ( SUBSH_MASK[0] ) );\
xmm13 = _mm512_shuffle_epi8( xmm13, ( SUBSH_MASK[2] ) );\
xmm14 = _mm512_shuffle_epi8( xmm14, ( SUBSH_MASK[4] ) );\
xmm15 = _mm512_shuffle_epi8( xmm15, ( SUBSH_MASK[6] ) );\
/* SubBytes + MixBytes */\
SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
\
/* AddRoundConstant Q1024 */\
xmm9 = ALL_FF;\
xmm0 = _mm512_xor_si512( xmm0, xmm9 );\
xmm1 = _mm512_xor_si512( xmm1, xmm9 );\
xmm2 = _mm512_xor_si512( xmm2, xmm9 );\
xmm3 = _mm512_xor_si512( xmm3, xmm9 );\
xmm4 = _mm512_xor_si512( xmm4, xmm9 );\
xmm5 = _mm512_xor_si512( xmm5, xmm9 );\
xmm6 = _mm512_xor_si512( xmm6, xmm9 );\
xmm7 = _mm512_xor_si512( xmm7, ( ROUND_CONST_Q[ round_counter+1 ] ) );\
/* ShiftBytes Q1024 + pre-AESENCLAST */\
xmm0 = _mm512_shuffle_epi8( xmm0, ( SUBSH_MASK[1] ) );\
xmm1 = _mm512_shuffle_epi8( xmm1, ( SUBSH_MASK[3] ) );\
xmm2 = _mm512_shuffle_epi8( xmm2, ( SUBSH_MASK[5] ) );\
xmm3 = _mm512_shuffle_epi8( xmm3, ( SUBSH_MASK[7] ) );\
xmm4 = _mm512_shuffle_epi8( xmm4, ( SUBSH_MASK[0] ) );\
xmm5 = _mm512_shuffle_epi8( xmm5, ( SUBSH_MASK[2] ) );\
xmm6 = _mm512_shuffle_epi8( xmm6, ( SUBSH_MASK[4] ) );\
xmm7 = _mm512_shuffle_epi8( xmm7, ( SUBSH_MASK[6] ) );\
/* SubBytes + MixBytes */\
SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}\
}
/* Matrix Transpose
* input is a 1024-bit state with two columns in one xmm
* output is a 1024-bit state with two rows in one xmm
* inputs: i0-i7
* outputs: i0-i7
* clobbers: t0-t7
*/
#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\
t0 = TRANSP_MASK;\
\
i6 = _mm512_shuffle_epi8(i6, t0);\
i0 = _mm512_shuffle_epi8(i0, t0);\
i1 = _mm512_shuffle_epi8(i1, t0);\
i2 = _mm512_shuffle_epi8(i2, t0);\
i3 = _mm512_shuffle_epi8(i3, t0);\
t1 = i2;\
i4 = _mm512_shuffle_epi8(i4, t0);\
i5 = _mm512_shuffle_epi8(i5, t0);\
t2 = i4;\
t3 = i6;\
i7 = _mm512_shuffle_epi8(i7, t0);\
\
/* continue with unpack using 4 temp registers */\
t0 = i0;\
t2 = _mm512_unpackhi_epi16(t2, i5);\
i4 = _mm512_unpacklo_epi16(i4, i5);\
t3 = _mm512_unpackhi_epi16(t3, i7);\
i6 = _mm512_unpacklo_epi16(i6, i7);\
t0 = _mm512_unpackhi_epi16(t0, i1);\
t1 = _mm512_unpackhi_epi16(t1, i3);\
i2 = _mm512_unpacklo_epi16(i2, i3);\
i0 = _mm512_unpacklo_epi16(i0, i1);\
\
/* shuffle with immediate */\
t0 = _mm512_shuffle_epi32(t0, 216);\
t1 = _mm512_shuffle_epi32(t1, 216);\
t2 = _mm512_shuffle_epi32(t2, 216);\
t3 = _mm512_shuffle_epi32(t3, 216);\
i0 = _mm512_shuffle_epi32(i0, 216);\
i2 = _mm512_shuffle_epi32(i2, 216);\
i4 = _mm512_shuffle_epi32(i4, 216);\
i6 = _mm512_shuffle_epi32(i6, 216);\
\
/* continue with unpack */\
t4 = i0;\
i0 = _mm512_unpacklo_epi32(i0, i2);\
t4 = _mm512_unpackhi_epi32(t4, i2);\
t5 = t0;\
t0 = _mm512_unpacklo_epi32(t0, t1);\
t5 = _mm512_unpackhi_epi32(t5, t1);\
t6 = i4;\
i4 = _mm512_unpacklo_epi32(i4, i6);\
t7 = t2;\
t6 = _mm512_unpackhi_epi32(t6, i6);\
i2 = t0;\
t2 = _mm512_unpacklo_epi32(t2, t3);\
i3 = t0;\
t7 = _mm512_unpackhi_epi32(t7, t3);\
\
/* there are now 2 rows in each xmm */\
/* unpack to get 1 row of CV in each xmm */\
i1 = i0;\
i1 = _mm512_unpackhi_epi64(i1, i4);\
i0 = _mm512_unpacklo_epi64(i0, i4);\
i4 = t4;\
i3 = _mm512_unpackhi_epi64(i3, t2);\
i5 = t4;\
i2 = _mm512_unpacklo_epi64(i2, t2);\
i6 = t5;\
i5 = _mm512_unpackhi_epi64(i5, t6);\
i7 = t5;\
i4 = _mm512_unpacklo_epi64(i4, t6);\
i7 = _mm512_unpackhi_epi64(i7, t7);\
i6 = _mm512_unpacklo_epi64(i6, t7);\
/* transpose done */\
}/**/
/* Matrix Transpose Inverse
* input is a 1024-bit state with two rows in one xmm
* output is a 1024-bit state with two columns in one xmm
* inputs: i0-i7
* outputs: (i0, o0, i1, i3, o1, o2, i5, i7)
* clobbers: t0-t4
*/
#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\
/* transpose matrix to get output format */\
o1 = i0;\
i0 = _mm512_unpacklo_epi64(i0, i1);\
o1 = _mm512_unpackhi_epi64(o1, i1);\
t0 = i2;\
i2 = _mm512_unpacklo_epi64(i2, i3);\
t0 = _mm512_unpackhi_epi64(t0, i3);\
t1 = i4;\
i4 = _mm512_unpacklo_epi64(i4, i5);\
t1 = _mm512_unpackhi_epi64(t1, i5);\
t2 = i6;\
o0 = TRANSP_MASK;\
i6 = _mm512_unpacklo_epi64(i6, i7);\
t2 = _mm512_unpackhi_epi64(t2, i7);\
/* load transpose mask into a register, because it will be used 8 times */\
i0 = _mm512_shuffle_epi8(i0, o0);\
i2 = _mm512_shuffle_epi8(i2, o0);\
i4 = _mm512_shuffle_epi8(i4, o0);\
i6 = _mm512_shuffle_epi8(i6, o0);\
o1 = _mm512_shuffle_epi8(o1, o0);\
t0 = _mm512_shuffle_epi8(t0, o0);\
t1 = _mm512_shuffle_epi8(t1, o0);\
t2 = _mm512_shuffle_epi8(t2, o0);\
/* continue with unpack using 4 temp registers */\
t3 = i4;\
o2 = o1;\
o0 = i0;\
t4 = t1;\
\
t3 = _mm512_unpackhi_epi16(t3, i6);\
i4 = _mm512_unpacklo_epi16(i4, i6);\
o0 = _mm512_unpackhi_epi16(o0, i2);\
i0 = _mm512_unpacklo_epi16(i0, i2);\
o2 = _mm512_unpackhi_epi16(o2, t0);\
o1 = _mm512_unpacklo_epi16(o1, t0);\
t4 = _mm512_unpackhi_epi16(t4, t2);\
t1 = _mm512_unpacklo_epi16(t1, t2);\
/* shuffle with immediate */\
i4 = _mm512_shuffle_epi32(i4, 216);\
t3 = _mm512_shuffle_epi32(t3, 216);\
o1 = _mm512_shuffle_epi32(o1, 216);\
o2 = _mm512_shuffle_epi32(o2, 216);\
i0 = _mm512_shuffle_epi32(i0, 216);\
o0 = _mm512_shuffle_epi32(o0, 216);\
t1 = _mm512_shuffle_epi32(t1, 216);\
t4 = _mm512_shuffle_epi32(t4, 216);\
/* continue with unpack */\
i1 = i0;\
i3 = o0;\
i5 = o1;\
i7 = o2;\
i0 = _mm512_unpacklo_epi32(i0, i4);\
i1 = _mm512_unpackhi_epi32(i1, i4);\
o0 = _mm512_unpacklo_epi32(o0, t3);\
i3 = _mm512_unpackhi_epi32(i3, t3);\
o1 = _mm512_unpacklo_epi32(o1, t1);\
i5 = _mm512_unpackhi_epi32(i5, t1);\
o2 = _mm512_unpacklo_epi32(o2, t4);\
i7 = _mm512_unpackhi_epi32(i7, t4);\
/* transpose done */\
}/**/
void INIT_4way( __m512i* chaining )
{
static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
/* load IV into registers xmm8 - xmm15 */
xmm8 = chaining[0];
xmm9 = chaining[1];
xmm10 = chaining[2];
xmm11 = chaining[3];
xmm12 = chaining[4];
xmm13 = chaining[5];
xmm14 = chaining[6];
xmm15 = chaining[7];
/* transform chaining value from column ordering into row ordering */
Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
/* store transposed IV */
chaining[0] = xmm8;
chaining[1] = xmm9;
chaining[2] = xmm10;
chaining[3] = xmm11;
chaining[4] = xmm12;
chaining[5] = xmm13;
chaining[6] = xmm14;
chaining[7] = xmm15;
}
void TF1024_4way( __m512i* chaining, const __m512i* message )
{
static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m512i QTEMP[8];
static __m512i TEMP0;
static __m512i TEMP1;
static __m512i TEMP2;
/* load message into registers xmm8 - xmm15 (Q = message) */
xmm8 = message[0];
xmm9 = message[1];
xmm10 = message[2];
xmm11 = message[3];
xmm12 = message[4];
xmm13 = message[5];
xmm14 = message[6];
xmm15 = message[7];
/* transform message M from column ordering into row ordering */
Matrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
/* store message M (Q input) for later */
QTEMP[0] = xmm8;
QTEMP[1] = xmm9;
QTEMP[2] = xmm10;
QTEMP[3] = xmm11;
QTEMP[4] = xmm12;
QTEMP[5] = xmm13;
QTEMP[6] = xmm14;
QTEMP[7] = xmm15;
/* xor CV to message to get P input */
/* result: CV+M in xmm8...xmm15 */
xmm8 = _mm512_xor_si512( xmm8, (chaining[0]) );
xmm9 = _mm512_xor_si512( xmm9, (chaining[1]) );
xmm10 = _mm512_xor_si512( xmm10, (chaining[2]) );
xmm11 = _mm512_xor_si512( xmm11, (chaining[3]) );
xmm12 = _mm512_xor_si512( xmm12, (chaining[4]) );
xmm13 = _mm512_xor_si512( xmm13, (chaining[5]) );
xmm14 = _mm512_xor_si512( xmm14, (chaining[6]) );
xmm15 = _mm512_xor_si512( xmm15, (chaining[7]) );
/* compute permutation P */
/* result: P(CV+M) in xmm8...xmm15 */
ROUNDS_P();
/* xor CV to P output (feed-forward) */
/* result: P(CV+M)+CV in xmm8...xmm15 */
xmm8 = _mm512_xor_si512( xmm8, (chaining[0]) );
xmm9 = _mm512_xor_si512( xmm9, (chaining[1]) );
xmm10 = _mm512_xor_si512( xmm10, (chaining[2]) );
xmm11 = _mm512_xor_si512( xmm11, (chaining[3]) );
xmm12 = _mm512_xor_si512( xmm12, (chaining[4]) );
xmm13 = _mm512_xor_si512( xmm13, (chaining[5]) );
xmm14 = _mm512_xor_si512( xmm14, (chaining[6]) );
xmm15 = _mm512_xor_si512( xmm15, (chaining[7]) );
/* store P(CV+M)+CV */
chaining[0] = xmm8;
chaining[1] = xmm9;
chaining[2] = xmm10;
chaining[3] = xmm11;
chaining[4] = xmm12;
chaining[5] = xmm13;
chaining[6] = xmm14;
chaining[7] = xmm15;
/* load message M (Q input) into xmm8-15 */
xmm8 = QTEMP[0];
xmm9 = QTEMP[1];
xmm10 = QTEMP[2];
xmm11 = QTEMP[3];
xmm12 = QTEMP[4];
xmm13 = QTEMP[5];
xmm14 = QTEMP[6];
xmm15 = QTEMP[7];
/* compute permutation Q */
/* result: Q(M) in xmm8...xmm15 */
ROUNDS_Q();
/* xor Q output */
/* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */
xmm8 = _mm512_xor_si512( xmm8, (chaining[0]) );
xmm9 = _mm512_xor_si512( xmm9, (chaining[1]) );
xmm10 = _mm512_xor_si512( xmm10, (chaining[2]) );
xmm11 = _mm512_xor_si512( xmm11, (chaining[3]) );
xmm12 = _mm512_xor_si512( xmm12, (chaining[4]) );
xmm13 = _mm512_xor_si512( xmm13, (chaining[5]) );
xmm14 = _mm512_xor_si512( xmm14, (chaining[6]) );
xmm15 = _mm512_xor_si512( xmm15, (chaining[7]) );
/* store CV */
chaining[0] = xmm8;
chaining[1] = xmm9;
chaining[2] = xmm10;
chaining[3] = xmm11;
chaining[4] = xmm12;
chaining[5] = xmm13;
chaining[6] = xmm14;
chaining[7] = xmm15;
return;
}
void OF1024_4way( __m512i* chaining )
{
static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m512i TEMP0;
static __m512i TEMP1;
static __m512i TEMP2;
/* load CV into registers xmm8 - xmm15 */
xmm8 = chaining[0];
xmm9 = chaining[1];
xmm10 = chaining[2];
xmm11 = chaining[3];
xmm12 = chaining[4];
xmm13 = chaining[5];
xmm14 = chaining[6];
xmm15 = chaining[7];
/* compute permutation P */
/* result: P(CV) in xmm8...xmm15 */
ROUNDS_P();
/* xor CV to P output (feed-forward) */
/* result: P(CV)+CV in xmm8...xmm15 */
xmm8 = _mm512_xor_si512( xmm8, (chaining[0]) );
xmm9 = _mm512_xor_si512( xmm9, (chaining[1]) );
xmm10 = _mm512_xor_si512( xmm10, (chaining[2]) );
xmm11 = _mm512_xor_si512( xmm11, (chaining[3]) );
xmm12 = _mm512_xor_si512( xmm12, (chaining[4]) );
xmm13 = _mm512_xor_si512( xmm13, (chaining[5]) );
xmm14 = _mm512_xor_si512( xmm14, (chaining[6]) );
xmm15 = _mm512_xor_si512( xmm15, (chaining[7]) );
/* transpose CV back from row ordering to column ordering */
/* result: final hash value in xmm0, xmm6, xmm13, xmm15 */
Matrix_Transpose_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm4, xmm0, xmm6, xmm1, xmm2, xmm3, xmm5, xmm7);
/* we only need to return the truncated half of the state */
chaining[4] = xmm0;
chaining[5] = xmm6;
chaining[6] = xmm13;
chaining[7] = xmm15;
return;
}
#endif // VAES
#endif // GROESTL512_INTR_4WAY_H__

View File

@@ -1,14 +1,159 @@
#include "myrgr-gate.h"
#if defined(MYRGR_4WAY)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "aes_ni/hash-groestl.h"
#include "algo/sha/sha-hash-4way.h"
#if defined(__VAES__)
#include "groestl512-hash-4way.h"
#endif
#if defined(MYRGR_8WAY)
typedef struct {
#if defined(__VAES__)
groestl512_4way_context groestl;
#else
hashState_groestl groestl;
#endif
sha256_8way_context sha;
} myrgr_8way_ctx_holder;
myrgr_8way_ctx_holder myrgr_8way_ctx;
void init_myrgr_8way_ctx()
{
#if defined(__VAES__)
groestl512_4way_init( &myrgr_8way_ctx.groestl, 64 );
#else
init_groestl( &myrgr_8way_ctx.groestl, 64 );
#endif
sha256_8way_init( &myrgr_8way_ctx.sha );
}
void myriad_8way_hash( void *output, const void *input )
{
uint32_t vhash[16*8] __attribute__ ((aligned (128)));
uint32_t vhashA[20*8] __attribute__ ((aligned (64)));
uint32_t vhashB[20*8] __attribute__ ((aligned (64)));
myrgr_8way_ctx_holder ctx;
memcpy( &ctx, &myrgr_8way_ctx, sizeof(myrgr_8way_ctx) );
#if defined(__VAES__)
rintrlv_8x64_4x128( vhashA, vhashB, input, 640 );
groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 640 );
groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 640 );
uint32_t hash0[20] __attribute__ ((aligned (64)));
uint32_t hash1[20] __attribute__ ((aligned (64)));
uint32_t hash2[20] __attribute__ ((aligned (64)));
uint32_t hash3[20] __attribute__ ((aligned (64)));
uint32_t hash4[20] __attribute__ ((aligned (64)));
uint32_t hash5[20] __attribute__ ((aligned (64)));
uint32_t hash6[20] __attribute__ ((aligned (64)));
uint32_t hash7[20] __attribute__ ((aligned (64)));
// rintrlv_4x128_8x32( vhash, vhashA, vhashB, 512 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
hash6, hash7 );
#else
uint32_t hash0[20] __attribute__ ((aligned (64)));
uint32_t hash1[20] __attribute__ ((aligned (64)));
uint32_t hash2[20] __attribute__ ((aligned (64)));
uint32_t hash3[20] __attribute__ ((aligned (64)));
uint32_t hash4[20] __attribute__ ((aligned (64)));
uint32_t hash5[20] __attribute__ ((aligned (64)));
uint32_t hash6[20] __attribute__ ((aligned (64)));
uint32_t hash7[20] __attribute__ ((aligned (64)));
dintrlv_8x64( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, input, 640 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 640 );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 640 );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 640 );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 640 );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 640 );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 640 );
memcpy( &ctx.groestl, &myrgr_8way_ctx.groestl, sizeof(hashState_groestl) );
intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 512 );
#endif
sha256_8way_update( &ctx.sha, vhash, 64 );
sha256_8way_close( &ctx.sha, output );
}
int scanhash_myriad_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
uint32_t *noncep = vdata + 64+3; // 4*16 + 3
int thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
mm512_bswap32_intrlv80_4x128( vdata, pdata );
do
{
be32enc( noncep, n );
be32enc( noncep+ 8, n+1 );
be32enc( noncep+16, n+2 );
be32enc( noncep+24, n+3 );
be32enc( noncep+32, n+4 );
be32enc( noncep+40, n+5 );
be32enc( noncep+48, n+6 );
be32enc( noncep+56, n+7 );
myriad_8way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 8; lane++ )
if ( hash7[ lane ] <= Htarg )
{
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(MYRGR_4WAY)
typedef struct {
hashState_groestl groestl;
@@ -45,7 +190,7 @@ void myriad_4way_hash( void *output, const void *input )
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
sha256_4way( &ctx.sha, vhash, 64 );
sha256_4way_update( &ctx.sha, vhash, 64 );
sha256_4way_close( &ctx.sha, output );
}

View File

@@ -2,16 +2,22 @@
bool register_myriad_algo( algo_gate_t* gate )
{
#if defined (MYRGR_4WAY)
#if defined (MYRGR_8WAY)
init_myrgr_8way_ctx();
gate->scanhash = (void*)&scanhash_myriad_8way;
gate->hash = (void*)&myriad_8way_hash;
gate->optimizations = AES_OPT | AVX2_OPT | VAES_OPT;
#elif defined (MYRGR_4WAY)
init_myrgr_4way_ctx();
gate->scanhash = (void*)&scanhash_myriad_4way;
gate->hash = (void*)&myriad_4way_hash;
gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | VAES_OPT;
#else
init_myrgr_ctx();
gate->scanhash = (void*)&scanhash_myriad;
gate->hash = (void*)&myriad_hash;
gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA_OPT | VAES_OPT;
#endif
gate->optimizations = AES_OPT | AVX2_OPT;
return true;
};

View File

@@ -1,30 +1,35 @@
#ifndef MYRGR_GATE_H__
#define MYRGR_GATE_H__
#define MYRGR_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__) && !defined(__SHA__)
#define MYRGR_4WAY
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define MYRGR_8WAY 1
#elif defined(__AVX2__) && defined(__AES__) && !defined(__SHA__)
#define MYRGR_4WAY 1
#endif
#if defined(MYRGR_4WAY)
#if defined(MYRGR_8WAY)
void myriad_8way_hash( void *state, const void *input );
int scanhash_myriad_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_myrgr_8way_ctx();
#elif defined(MYRGR_4WAY)
void myriad_4way_hash( void *state, const void *input );
int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_myrgr_4way_ctx();
#endif
#else
void myriad_hash( void *state, const void *input );
int scanhash_myriad( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_myrgr_ctx();
#endif
#endif

View File

@@ -528,6 +528,346 @@ static const sph_u32 T512[64][16] = {
SPH_C32(0xe7e00a94) }
};
#define s0 m0
#define s1 c0
#define s2 m1
#define s3 c1
#define s4 c2
#define s5 m2
#define s6 c3
#define s7 m3
#define s8 m4
#define s9 c4
#define sA m5
#define sB c5
#define sC c6
#define sD m6
#define sE c7
#define sF m7
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// Hamsi 8 way
#define INPUT_BIG8 \
do { \
__m512i db = *buf; \
const uint64_t *tp = (uint64_t*)&T512[0][0]; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \
for ( int u = 0; u < 64; u++ ) \
{ \
__m512i dm = _mm512_and_si512( db, m512_one_64 ) ; \
dm = mm512_negate_32( _mm512_or_si512( dm, \
_mm512_slli_epi64( dm, 32 ) ) ); \
m0 = _mm512_xor_si512( m0, _mm512_and_si512( dm, \
m512_const1_64( tp[0] ) ) ); \
m1 = _mm512_xor_si512( m1, _mm512_and_si512( dm, \
m512_const1_64( tp[1] ) ) ); \
m2 = _mm512_xor_si512( m2, _mm512_and_si512( dm, \
m512_const1_64( tp[2] ) ) ); \
m3 = _mm512_xor_si512( m3, _mm512_and_si512( dm, \
m512_const1_64( tp[3] ) ) ); \
m4 = _mm512_xor_si512( m4, _mm512_and_si512( dm, \
m512_const1_64( tp[4] ) ) ); \
m5 = _mm512_xor_si512( m5, _mm512_and_si512( dm, \
m512_const1_64( tp[5] ) ) ); \
m6 = _mm512_xor_si512( m6, _mm512_and_si512( dm, \
m512_const1_64( tp[6] ) ) ); \
m7 = _mm512_xor_si512( m7, _mm512_and_si512( dm, \
m512_const1_64( tp[7] ) ) ); \
tp += 8; \
db = _mm512_srli_epi64( db, 1 ); \
} \
} while (0)
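For orientation, a minimal scalar sketch of what INPUT_BIG8 computes in each 64-bit lane: each of the 64 message bits selects whether the matching T512 row (viewed as 8 uint64_t) is XORed into the m0..m7 accumulators. Derived from the macro itself and assuming the T512 table declared above; it is not part of the source tree.
/* Hypothetical scalar reference for one lane of INPUT_BIG8. */
static void hamsi_input_ref( uint64_t m[8], uint64_t db )
{
   const uint64_t *tp = (const uint64_t*)&T512[0][0];
   for ( int i = 0; i < 8; i++ ) m[i] = 0;
   for ( int u = 0; u < 64; u++ )
   {
      const uint64_t dm = -( db & 1 );   // all-ones mask when the bit is set
      for ( int i = 0; i < 8; i++ )
         m[i] ^= dm & tp[i];
      tp += 8;
      db >>= 1;
   }
}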
#define SBOX8( a, b, c, d ) \
do { \
__m512i t; \
t = a; \
a = _mm512_and_si512( a, c ); \
a = _mm512_xor_si512( a, d ); \
c = _mm512_xor_si512( c, b ); \
c = _mm512_xor_si512( c, a ); \
d = _mm512_or_si512( d, t ); \
d = _mm512_xor_si512( d, b ); \
t = _mm512_xor_si512( t, c ); \
b = d; \
d = _mm512_or_si512( d, t ); \
d = _mm512_xor_si512( d, a ); \
a = _mm512_and_si512( a, b ); \
t = _mm512_xor_si512( t, a ); \
b = _mm512_xor_si512( b, d ); \
b = _mm512_xor_si512( b, t ); \
a = c; \
c = b; \
b = d; \
d = mm512_not( t ); \
} while (0)
#define L8( a, b, c, d ) \
do { \
a = mm512_rol_32( a, 13 ); \
c = mm512_rol_32( c, 3 ); \
b = _mm512_xor_si512( b, _mm512_xor_si512( a, c ) ); \
d = _mm512_xor_si512( d, _mm512_xor_si512( c, \
_mm512_slli_epi32( a, 3 ) ) ); \
b = mm512_rol_32( b, 1 ); \
d = mm512_rol_32( d, 7 ); \
a = _mm512_xor_si512( a, _mm512_xor_si512( b, d ) ); \
c = _mm512_xor_si512( c, _mm512_xor_si512( d, \
_mm512_slli_epi32( b, 7 ) ) ); \
a = mm512_rol_32( a, 5 ); \
c = mm512_rol_32( c, 22 ); \
} while (0)
#define DECL_STATE_BIG8 \
__m512i c0, c1, c2, c3, c4, c5, c6, c7; \
#define READ_STATE_BIG8(sc) \
do { \
c0 = sc->h[0x0]; \
c1 = sc->h[0x1]; \
c2 = sc->h[0x2]; \
c3 = sc->h[0x3]; \
c4 = sc->h[0x4]; \
c5 = sc->h[0x5]; \
c6 = sc->h[0x6]; \
c7 = sc->h[0x7]; \
} while (0)
#define WRITE_STATE_BIG8(sc) \
do { \
sc->h[0x0] = c0; \
sc->h[0x1] = c1; \
sc->h[0x2] = c2; \
sc->h[0x3] = c3; \
sc->h[0x4] = c4; \
sc->h[0x5] = c5; \
sc->h[0x6] = c6; \
sc->h[0x7] = c7; \
} while (0)
#define ROUND_BIG8(rc, alpha) \
do { \
__m512i t0, t1, t2, t3; \
s0 = _mm512_xor_si512( s0, m512_const1_64( \
( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \
s1 = _mm512_xor_si512( s1, m512_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \
s2 = _mm512_xor_si512( s2, m512_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \
s3 = _mm512_xor_si512( s3, m512_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \
s4 = _mm512_xor_si512( s4, m512_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \
s5 = _mm512_xor_si512( s5, m512_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \
s6 = _mm512_xor_si512( s6, m512_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \
s7 = _mm512_xor_si512( s7, m512_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \
s8 = _mm512_xor_si512( s8, m512_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \
s9 = _mm512_xor_si512( s9, m512_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \
sA = _mm512_xor_si512( sA, m512_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \
sB = _mm512_xor_si512( sB, m512_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \
sC = _mm512_xor_si512( sC, m512_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \
sD = _mm512_xor_si512( sD, m512_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \
sE = _mm512_xor_si512( sE, m512_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \
sF = _mm512_xor_si512( sF, m512_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \
\
SBOX8( s0, s4, s8, sC ); \
SBOX8( s1, s5, s9, sD ); \
SBOX8( s2, s6, sA, sE ); \
SBOX8( s3, s7, sB, sF ); \
\
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), \
_mm512_bslli_epi128( s5, 4 ) ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sD, 4 ), \
_mm512_bslli_epi128( sE, 4 ) ); \
L8( s0, t1, s9, t3 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t1, 4 ) ); \
s5 = _mm512_mask_blend_epi32( 0x5555, s5, _mm512_bsrli_epi128( t1, 4 ) ); \
sD = _mm512_mask_blend_epi32( 0xaaaa, sD, _mm512_bslli_epi128( t3, 4 ) ); \
sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t3, 4 ) ); \
\
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \
_mm512_bslli_epi128( s6, 4 ) ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sE, 4 ), \
_mm512_bslli_epi128( sF, 4 ) ); \
L8( s1, t1, sA, t3 ); \
s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \
s6 = _mm512_mask_blend_epi32( 0x5555, s6, _mm512_bsrli_epi128( t1, 4 ) ); \
sE = _mm512_mask_blend_epi32( 0xaaaa, sE, _mm512_bslli_epi128( t3, 4 ) ); \
sF = _mm512_mask_blend_epi32( 0x5555, sF, _mm512_bsrli_epi128( t3, 4 ) ); \
\
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s6, 4 ), \
_mm512_bslli_epi128( s7, 4 ) ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sF, 4 ), \
_mm512_bslli_epi128( sC, 4 ) ); \
L8( s2, t1, sB, t3 ); \
s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( t1, 4 ) ); \
s7 = _mm512_mask_blend_epi32( 0x5555, s7, _mm512_bsrli_epi128( t1, 4 ) ); \
sF = _mm512_mask_blend_epi32( 0xaaaa, sF, _mm512_bslli_epi128( t3, 4 ) ); \
sC = _mm512_mask_blend_epi32( 0x5555, sC, _mm512_bsrli_epi128( t3, 4 ) ); \
\
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s7, 4 ), \
_mm512_bslli_epi128( s4, 4 ) ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sC, 4 ), \
_mm512_bslli_epi128( sD, 4 ) ); \
L8( s3, t1, s8, t3 ); \
s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, _mm512_bslli_epi128( t1, 4 ) ); \
s4 = _mm512_mask_blend_epi32( 0x5555, s4, _mm512_bsrli_epi128( t1, 4 ) ); \
sC = _mm512_mask_blend_epi32( 0xaaaa, sC, _mm512_bslli_epi128( t3, 4 ) ); \
sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t3, 4 ) ); \
\
t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, _mm512_bslli_epi128( s8, 4 ) ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s2, 4 ), sA ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s3, 4 ), \
_mm512_bslli_epi128( sB, 4 ) ); \
L8( t0, t1, t2, t3 ); \
s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \
s8 = _mm512_mask_blend_epi32( 0x5555, s8, _mm512_bsrli_epi128( t0, 4 ) ); \
s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \
s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \
s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, _mm512_bslli_epi128( t2, 4 ) ); \
sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \
s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, _mm512_bslli_epi128( t3, 4 ) ); \
sB = _mm512_mask_blend_epi32( 0x5555, sB, _mm512_bsrli_epi128( t3, 4 ) ); \
\
t0 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), sC ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \
_mm512_bslli_epi128( sD, 4 ) ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( sE, 4 ) ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, s7, sF ); \
L8( t0, t1, t2, t3 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t0, 4 ) ); \
sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t0 ); \
s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \
sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t1, 4 ) ); \
s6 = _mm512_mask_blend_epi32( 0x5555, s6, t2 ); \
sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t2, 4 ) ); \
s7 = _mm512_mask_blend_epi32( 0x5555, s7, t3 ); \
sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \
} while (0)
#define P_BIG8 \
do { \
ROUND_BIG8(0, alpha_n); \
ROUND_BIG8(1, alpha_n); \
ROUND_BIG8(2, alpha_n); \
ROUND_BIG8(3, alpha_n); \
ROUND_BIG8(4, alpha_n); \
ROUND_BIG8(5, alpha_n); \
} while (0)
#define PF_BIG8 \
do { \
ROUND_BIG8( 0, alpha_f); \
ROUND_BIG8( 1, alpha_f); \
ROUND_BIG8( 2, alpha_f); \
ROUND_BIG8( 3, alpha_f); \
ROUND_BIG8( 4, alpha_f); \
ROUND_BIG8( 5, alpha_f); \
ROUND_BIG8( 6, alpha_f); \
ROUND_BIG8( 7, alpha_f); \
ROUND_BIG8( 8, alpha_f); \
ROUND_BIG8( 9, alpha_f); \
ROUND_BIG8(10, alpha_f); \
ROUND_BIG8(11, alpha_f); \
} while (0)
#define T_BIG8 \
do { /* order is important */ \
c7 = sc->h[ 0x7 ] = _mm512_xor_si512( sc->h[ 0x7 ], sB ); \
c6 = sc->h[ 0x6 ] = _mm512_xor_si512( sc->h[ 0x6 ], sA ); \
c5 = sc->h[ 0x5 ] = _mm512_xor_si512( sc->h[ 0x5 ], s9 ); \
c4 = sc->h[ 0x4 ] = _mm512_xor_si512( sc->h[ 0x4 ], s8 ); \
c3 = sc->h[ 0x3 ] = _mm512_xor_si512( sc->h[ 0x3 ], s3 ); \
c2 = sc->h[ 0x2 ] = _mm512_xor_si512( sc->h[ 0x2 ], s2 ); \
c1 = sc->h[ 0x1 ] = _mm512_xor_si512( sc->h[ 0x1 ], s1 ); \
c0 = sc->h[ 0x0 ] = _mm512_xor_si512( sc->h[ 0x0 ], s0 ); \
} while (0)
void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num )
{
DECL_STATE_BIG8
uint32_t tmp = num << 6;
sc->count_low = SPH_T32( sc->count_low + tmp );
sc->count_high += (sph_u32)( (num >> 13) >> 13 );
if ( sc->count_low < tmp )
sc->count_high++;
READ_STATE_BIG8( sc );
while ( num-- > 0 )
{
__m512i m0, m1, m2, m3, m4, m5, m6, m7;
INPUT_BIG8;
P_BIG8;
T_BIG8;
buf++;
}
WRITE_STATE_BIG8( sc );
}
void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )
{
__m512i m0, m1, m2, m3, m4, m5, m6, m7;
DECL_STATE_BIG8
READ_STATE_BIG8( sc );
INPUT_BIG8;
PF_BIG8;
T_BIG8;
WRITE_STATE_BIG8( sc );
}
void hamsi512_8way_init( hamsi_8way_big_context *sc )
{
sc->partial_len = 0;
sc->count_high = sc->count_low = 0;
sc->h[0] = m512_const1_64( 0x6c70617273746565 );
sc->h[1] = m512_const1_64( 0x656e62656b204172 );
sc->h[2] = m512_const1_64( 0x302c206272672031 );
sc->h[3] = m512_const1_64( 0x3434362c75732032 );
sc->h[4] = m512_const1_64( 0x3030312020422d33 );
sc->h[5] = m512_const1_64( 0x656e2d484c657576 );
sc->h[6] = m512_const1_64( 0x6c65652c65766572 );
sc->h[7] = m512_const1_64( 0x6769756d2042656c );
}
void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,
size_t len )
{
__m512i *vdata = (__m512i*)data;
hamsi_8way_big( sc, vdata, len>>3 );
vdata += ( (len& ~(size_t)7) >> 3 );
len &= (size_t)7;
memcpy_512( sc->buf, vdata, len>>3 );
sc->partial_len = len;
}
void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
{
__m512i pad[1];
int ch, cl;
sph_enc32be( &ch, sc->count_high );
sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) );
pad[0] = _mm512_set_epi32( cl, ch, cl, ch, cl, ch, cl, ch,
cl, ch, cl, ch, cl, ch, cl, ch );
// pad[0] = m512_const2_32( cl, ch );
sc->buf[0] = m512_const1_64( 0x80 );
hamsi_8way_big( sc, sc->buf, 1 );
hamsi_8way_big_final( sc, pad );
mm512_block_bswap_32( (__m512i*)dst, sc->h );
}
#endif // AVX512
// Hamsi 4 way
#define INPUT_BIG \
do { \
@@ -627,6 +967,7 @@ do { \
sc->h[0x7] = c7; \
} while (0)
/*
#define s0 m0
#define s1 c0
#define s2 m1
@@ -643,42 +984,28 @@ do { \
#define sD m6
#define sE c7
#define sF m7
*/
#define ROUND_BIG(rc, alpha) \
do { \
__m256i t0, t1, t2, t3; \
s0 = _mm256_xor_si256( s0, m256_const1_64( \
( ( (uint64_t)( (rc) ^ alpha[1] ) << 32 ) ) | (uint64_t)alpha[0] ) ); \
s1 = _mm256_xor_si256( s1, m256_const1_64( \
( (uint64_t)alpha[ 3] << 32 ) | (uint64_t)alpha[ 2] ) ); \
s2 = _mm256_xor_si256( s2, m256_const1_64( \
( (uint64_t)alpha[ 5] << 32 ) | (uint64_t)alpha[ 4] ) ); \
s3 = _mm256_xor_si256( s3, m256_const1_64( \
( (uint64_t)alpha[ 7] << 32 ) | (uint64_t)alpha[ 6] ) ); \
s4 = _mm256_xor_si256( s4, m256_const1_64( \
( (uint64_t)alpha[ 9] << 32 ) | (uint64_t)alpha[ 8] ) ); \
s5 = _mm256_xor_si256( s5, m256_const1_64( \
( (uint64_t)alpha[11] << 32 ) | (uint64_t)alpha[10] ) ); \
s6 = _mm256_xor_si256( s6, m256_const1_64( \
( (uint64_t)alpha[13] << 32 ) | (uint64_t)alpha[12] ) ); \
s7 = _mm256_xor_si256( s7, m256_const1_64( \
( (uint64_t)alpha[15] << 32 ) | (uint64_t)alpha[14] ) ); \
s8 = _mm256_xor_si256( s8, m256_const1_64( \
( (uint64_t)alpha[17] << 32 ) | (uint64_t)alpha[16] ) ); \
s9 = _mm256_xor_si256( s9, m256_const1_64( \
( (uint64_t)alpha[19] << 32 ) | (uint64_t)alpha[18] ) ); \
sA = _mm256_xor_si256( sA, m256_const1_64( \
( (uint64_t)alpha[21] << 32 ) | (uint64_t)alpha[20] ) ); \
sB = _mm256_xor_si256( sB, m256_const1_64( \
( (uint64_t)alpha[23] << 32 ) | (uint64_t)alpha[22] ) ); \
sC = _mm256_xor_si256( sC, m256_const1_64( \
( (uint64_t)alpha[25] << 32 ) | (uint64_t)alpha[24] ) ); \
sD = _mm256_xor_si256( sD, m256_const1_64( \
( (uint64_t)alpha[27] << 32 ) | (uint64_t)alpha[26] ) ); \
sE = _mm256_xor_si256( sE, m256_const1_64( \
( (uint64_t)alpha[29] << 32 ) | (uint64_t)alpha[28] ) ); \
sF = _mm256_xor_si256( sF, m256_const1_64( \
( (uint64_t)alpha[31] << 32 ) | (uint64_t)alpha[30] ) ); \
( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \
s1 = _mm256_xor_si256( s1, m256_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \
s2 = _mm256_xor_si256( s2, m256_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \
s3 = _mm256_xor_si256( s3, m256_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \
s4 = _mm256_xor_si256( s4, m256_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \
s5 = _mm256_xor_si256( s5, m256_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \
s6 = _mm256_xor_si256( s6, m256_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \
s7 = _mm256_xor_si256( s7, m256_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \
s8 = _mm256_xor_si256( s8, m256_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \
s9 = _mm256_xor_si256( s9, m256_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \
sA = _mm256_xor_si256( sA, m256_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \
sB = _mm256_xor_si256( sB, m256_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \
sC = _mm256_xor_si256( sC, m256_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \
sD = _mm256_xor_si256( sD, m256_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \
sE = _mm256_xor_si256( sE, m256_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \
sF = _mm256_xor_si256( sF, m256_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \
\
SBOX( s0, s4, s8, sC ); \
SBOX( s1, s5, s9, sD ); \
@@ -844,7 +1171,8 @@ void hamsi512_4way_init( hamsi_4way_big_context *sc )
sc->h[7] = m256_const1_64( 0x6769756d2042656c );
}
void hamsi512_4way( hamsi_4way_big_context *sc, const void *data, size_t len )
void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data,
size_t len )
{
__m256i *vdata = (__m256i*)data;

View File

@@ -60,9 +60,32 @@ typedef struct {
typedef hamsi_4way_big_context hamsi512_4way_context;
void hamsi512_4way_init( hamsi512_4way_context *sc );
void hamsi512_4way( hamsi512_4way_context *sc, const void *data, size_t len );
void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data,
size_t len );
//#define hamsi512_4way hamsi512_4way_update
void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
typedef struct {
__m512i h[8];
__m512i buf[1];
size_t partial_len;
sph_u32 count_high, count_low;
} hamsi_8way_big_context;
typedef hamsi_8way_big_context hamsi512_8way_context;
void hamsi512_8way_init( hamsi512_8way_context *sc );
void hamsi512_8way_update( hamsi512_8way_context *sc, const void *data,
size_t len );
void hamsi512_8way_close( hamsi512_8way_context *sc, void *dst );
#endif
#ifdef __cplusplus
}
#endif

View File

@@ -38,7 +38,7 @@
#define SPH_XCAT_(a, b) a ## b
static void
SPH_XCAT(SPH_XCAT(haval, PASSES), _4way)
SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update)
( haval_4way_context *sc, const void *data, size_t len )
{
__m128i *vdata = (__m128i*)data;

View File

@@ -0,0 +1,115 @@
/* $Id: haval_helper.c 218 2010-06-08 17:06:34Z tp $ */
/*
* Helper code, included (three times !) by HAVAL implementation.
*
* TODO: try to merge this with md_helper.c.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#undef SPH_XCAT
#define SPH_XCAT(a, b) SPH_XCAT_(a, b)
#undef SPH_XCAT_
#define SPH_XCAT_(a, b) a ## b
static void
SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_update)
( haval_8way_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
unsigned current;
current = (unsigned)sc->count_low & 127U;
while ( len > 0 )
{
unsigned clen;
uint32_t clow, clow2;
clen = 128U - current;
if ( clen > len )
clen = len;
memcpy_256( sc->buf + (current>>2), vdata, clen>>2 );
vdata += clen>>2;
current += clen;
len -= clen;
if ( current == 128U )
{
DSTATE_8W;
IN_PREPARE_8W(sc->buf);
RSTATE_8W;
SPH_XCAT(CORE_8W, PASSES)(INW_8W);
WSTATE_8W;
current = 0;
}
clow = sc->count_low;
clow2 = clow + clen;
sc->count_low = clow2;
if ( clow2 < clow )
sc->count_high ++;
}
}
static void
SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_close)( haval_8way_context *sc,
void *dst)
{
unsigned current;
DSTATE_8W;
current = (unsigned)sc->count_low & 127UL;
sc->buf[ current>>2 ] = m256_one_32;
current += 4;
RSTATE_8W;
if ( current > 116UL )
{
memset_zero_256( sc->buf + ( current>>2 ), (128UL-current) >> 2 );
do
{
IN_PREPARE_8W(sc->buf);
SPH_XCAT(CORE_8W, PASSES)(INW_8W);
} while (0);
current = 0;
}
uint32_t t1, t2;
memset_zero_256( sc->buf + ( current>>2 ), (116UL-current) >> 2 );
t1 = 0x01 | (PASSES << 3);
t2 = sc->olen << 3;
sc->buf[ 116>>2 ] = _mm256_set1_epi32( ( t1 << 16 ) | ( t2 << 24 ) );
sc->buf[ 120>>2 ] = _mm256_set1_epi32( sc->count_low << 3 );
sc->buf[ 124>>2 ] = _mm256_set1_epi32( (sc->count_high << 3)
| (sc->count_low >> 29) );
do
{
IN_PREPARE_8W(sc->buf);
SPH_XCAT(CORE_8W, PASSES)(INW_8W);
} while (0);
WSTATE_8W;
haval_8way_out( sc, dst );
}

View File

@@ -40,7 +40,7 @@
#include <string.h>
#include "haval-hash-4way.h"
// won't compile with sse4.2
// won't compile with sse4.2; not a problem, it's only used with AVX2 4 way.
//#if defined (__SSE4_2__)
#if defined(__AVX__)
@@ -479,9 +479,9 @@ haval ## xxx ## _ ## y ## _4way_init(void *cc) \
} \
\
void \
haval ## xxx ## _ ## y ## _4way (void *cc, const void *data, size_t len) \
haval ## xxx ## _ ## y ## _4way_update (void *cc, const void *data, size_t len) \
{ \
haval ## y ## _4way(cc, data, len); \
haval ## y ## _4way_update(cc, data, len); \
} \
\
void \
@@ -518,6 +518,301 @@ do { \
#define INMSG(i) msg[i]
#if defined(__AVX2__)
// Haval-256 8 way 32 bit avx2
#define F1_8W(x6, x5, x4, x3, x2, x1, x0) \
_mm256_xor_si256( x0, \
_mm256_xor_si256( _mm256_and_si256(_mm256_xor_si256( x0, x4 ), x1 ), \
_mm256_xor_si256( _mm256_and_si256( x2, x5 ), \
_mm256_and_si256( x3, x6 ) ) ) ) \
#define F2_8W(x6, x5, x4, x3, x2, x1, x0) \
_mm256_xor_si256( \
_mm256_and_si256( x2, \
_mm256_xor_si256( _mm256_andnot_si256( x3, x1 ), \
_mm256_xor_si256( _mm256_and_si256( x4, x5 ), \
_mm256_xor_si256( x6, x0 ) ) ) ), \
_mm256_xor_si256( \
_mm256_and_si256( x4, _mm256_xor_si256( x1, x5 ) ), \
_mm256_xor_si256( _mm256_and_si256( x3, x5 ), x0 ) ) ) \
#define F3_8W(x6, x5, x4, x3, x2, x1, x0) \
_mm256_xor_si256( \
_mm256_and_si256( x3, \
_mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
_mm256_xor_si256( x6, x0 ) ) ), \
_mm256_xor_si256( _mm256_xor_si256(_mm256_and_si256( x1, x4 ), \
_mm256_and_si256( x2, x5 ) ), x0 ) )
#define F4_8W(x6, x5, x4, x3, x2, x1, x0) \
_mm256_xor_si256( \
_mm256_xor_si256( \
_mm256_and_si256( x3, \
_mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
_mm256_or_si256( x4, x6 ) ), x5 ) ), \
_mm256_and_si256( x4, \
_mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( mm256_not(x2), x5 ), \
_mm256_xor_si256( x1, x6 ) ), x0 ) ) ), \
_mm256_xor_si256( _mm256_and_si256( x2, x6 ), x0 ) )
#define F5_8W(x6, x5, x4, x3, x2, x1, x0) \
_mm256_xor_si256( \
_mm256_and_si256( x0, \
mm256_not( _mm256_xor_si256( \
_mm256_and_si256( _mm256_and_si256( x1, x2 ), x3 ), x5 ) ) ), \
_mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x4 ), \
_mm256_and_si256( x2, x5 ) ), \
_mm256_and_si256( x3, x6 ) ) )
#define FP3_1_8W(x6, x5, x4, x3, x2, x1, x0) \
F1_8W(x1, x0, x3, x5, x6, x2, x4)
#define FP3_2_8W(x6, x5, x4, x3, x2, x1, x0) \
F2_8W(x4, x2, x1, x0, x5, x3, x6)
#define FP3_3_8W(x6, x5, x4, x3, x2, x1, x0) \
F3_8W(x6, x1, x2, x3, x4, x5, x0)
#define FP4_1_8W(x6, x5, x4, x3, x2, x1, x0) \
F1_8W(x2, x6, x1, x4, x5, x3, x0)
#define FP4_2_8W(x6, x5, x4, x3, x2, x1, x0) \
F2_8W(x3, x5, x2, x0, x1, x6, x4)
#define FP4_3_8W(x6, x5, x4, x3, x2, x1, x0) \
F3_8W(x1, x4, x3, x6, x0, x2, x5)
#define FP4_4_8W(x6, x5, x4, x3, x2, x1, x0) \
F4_8W(x6, x4, x0, x5, x2, x1, x3)
#define FP5_1_8W(x6, x5, x4, x3, x2, x1, x0) \
F1_8W(x3, x4, x1, x0, x5, x2, x6)
#define FP5_2_8W(x6, x5, x4, x3, x2, x1, x0) \
F2_8W(x6, x2, x1, x0, x3, x4, x5)
#define FP5_3_8W(x6, x5, x4, x3, x2, x1, x0) \
F3_8W(x2, x6, x0, x4, x3, x1, x5)
#define FP5_4_8W(x6, x5, x4, x3, x2, x1, x0) \
F4_8W(x1, x5, x3, x2, x0, x4, x6)
#define FP5_5_8W(x6, x5, x4, x3, x2, x1, x0) \
F5_8W(x2, x5, x0, x6, x4, x3, x1)
#define STEP_8W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) \
do { \
__m256i t = FP ## n ## _ ## p ## _8W(x6, x5, x4, x3, x2, x1, x0); \
x7 = _mm256_add_epi32( _mm256_add_epi32( mm256_ror_32( t, 7 ), \
mm256_ror_32( x7, 11 ) ), \
_mm256_add_epi32( w, _mm256_set1_epi32( c ) ) ); \
} while (0)
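As a reading aid, a minimal scalar sketch (one 32-bit lane) of what F1_8W and STEP_8W above compute; it is derived from the macros themselves and is not repository code.
static inline uint32_t ror32( uint32_t x, int n )
{
   return ( x >> n ) | ( x << ( 32 - n ) );
}
/* F1, the pass-1 Boolean function, per 32-bit lane. */
static inline uint32_t haval_f1_ref( uint32_t x6, uint32_t x5, uint32_t x4,
                          uint32_t x3, uint32_t x2, uint32_t x1, uint32_t x0 )
{
   return x0 ^ ( ( ( x0 ^ x4 ) & x1 ) ^ ( x2 & x5 ) ^ ( x3 & x6 ) );
}
/* STEP_8W then updates the rotating register as
      x7 = ror32( t, 7 ) + ror32( x7, 11 ) + w + c;
   where t is the selected FPn_p function applied to x6..x0. */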
#define PASS1_8W(n, in) do { \
unsigned pass_count; \
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
STEP_8W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
in(pass_count + 0), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
in(pass_count + 1), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
in(pass_count + 2), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
in(pass_count + 3), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
in(pass_count + 4), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
in(pass_count + 5), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
in(pass_count + 6), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
in(pass_count + 7), SPH_C32(0x00000000)); \
} \
} while (0)
#define PASSG_8W(p, n, in) do { \
unsigned pass_count; \
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
STEP_8W(n, p, s7, s6, s5, s4, s3, s2, s1, s0, \
in(MP ## p[pass_count + 0]), \
RK ## p[pass_count + 0]); \
STEP_8W(n, p, s6, s5, s4, s3, s2, s1, s0, s7, \
in(MP ## p[pass_count + 1]), \
RK ## p[pass_count + 1]); \
STEP_8W(n, p, s5, s4, s3, s2, s1, s0, s7, s6, \
in(MP ## p[pass_count + 2]), \
RK ## p[pass_count + 2]); \
STEP_8W(n, p, s4, s3, s2, s1, s0, s7, s6, s5, \
in(MP ## p[pass_count + 3]), \
RK ## p[pass_count + 3]); \
STEP_8W(n, p, s3, s2, s1, s0, s7, s6, s5, s4, \
in(MP ## p[pass_count + 4]), \
RK ## p[pass_count + 4]); \
STEP_8W(n, p, s2, s1, s0, s7, s6, s5, s4, s3, \
in(MP ## p[pass_count + 5]), \
RK ## p[pass_count + 5]); \
STEP_8W(n, p, s1, s0, s7, s6, s5, s4, s3, s2, \
in(MP ## p[pass_count + 6]), \
RK ## p[pass_count + 6]); \
STEP_8W(n, p, s0, s7, s6, s5, s4, s3, s2, s1, \
in(MP ## p[pass_count + 7]), \
RK ## p[pass_count + 7]); \
} \
} while (0)
#define PASS2_8W(n, in) PASSG_8W(2, n, in)
#define PASS3_8W(n, in) PASSG_8W(3, n, in)
#define PASS4_8W(n, in) PASSG_8W(4, n, in)
#define PASS5_8W(n, in) PASSG_8W(5, n, in)
#define SAVE_STATE_8W \
__m256i u0, u1, u2, u3, u4, u5, u6, u7; \
do { \
u0 = s0; \
u1 = s1; \
u2 = s2; \
u3 = s3; \
u4 = s4; \
u5 = s5; \
u6 = s6; \
u7 = s7; \
} while (0)
#define UPDATE_STATE_8W \
do { \
s0 = _mm256_add_epi32( s0, u0 ); \
s1 = _mm256_add_epi32( s1, u1 ); \
s2 = _mm256_add_epi32( s2, u2 ); \
s3 = _mm256_add_epi32( s3, u3 ); \
s4 = _mm256_add_epi32( s4, u4 ); \
s5 = _mm256_add_epi32( s5, u5 ); \
s6 = _mm256_add_epi32( s6, u6 ); \
s7 = _mm256_add_epi32( s7, u7 ); \
} while (0)
#define CORE_8W5(in) do { \
SAVE_STATE_8W; \
PASS1_8W(5, in); \
PASS2_8W(5, in); \
PASS3_8W(5, in); \
PASS4_8W(5, in); \
PASS5_8W(5, in); \
UPDATE_STATE_8W; \
} while (0)
#define DSTATE_8W __m256i s0, s1, s2, s3, s4, s5, s6, s7
#define RSTATE_8W \
do { \
s0 = sc->s0; \
s1 = sc->s1; \
s2 = sc->s2; \
s3 = sc->s3; \
s4 = sc->s4; \
s5 = sc->s5; \
s6 = sc->s6; \
s7 = sc->s7; \
} while (0)
#define WSTATE_8W \
do { \
sc->s0 = s0; \
sc->s1 = s1; \
sc->s2 = s2; \
sc->s3 = s3; \
sc->s4 = s4; \
sc->s5 = s5; \
sc->s6 = s6; \
sc->s7 = s7; \
} while (0)
static void
haval_8way_init( haval_8way_context *sc, unsigned olen, unsigned passes )
{
sc->s0 = m256_const1_32( 0x243F6A88UL );
sc->s1 = m256_const1_32( 0x85A308D3UL );
sc->s2 = m256_const1_32( 0x13198A2EUL );
sc->s3 = m256_const1_32( 0x03707344UL );
sc->s4 = m256_const1_32( 0xA4093822UL );
sc->s5 = m256_const1_32( 0x299F31D0UL );
sc->s6 = m256_const1_32( 0x082EFA98UL );
sc->s7 = m256_const1_32( 0xEC4E6C89UL );
sc->olen = olen;
sc->passes = passes;
sc->count_high = 0;
sc->count_low = 0;
}
#define IN_PREPARE_8W(indata) const __m256i *const load_ptr_8w = (indata)
#define INW_8W(i) load_ptr_8w[ i ]
static void
haval_8way_out( haval_8way_context *sc, void *dst )
{
__m256i *buf = (__m256i*)dst;
DSTATE_8W;
RSTATE_8W;
buf[0] = s0;
buf[1] = s1;
buf[2] = s2;
buf[3] = s3;
buf[4] = s4;
buf[5] = s5;
buf[6] = s6;
buf[7] = s7;
}
#undef PASSES
#define PASSES 5
#include "haval-8way-helper.c"
#define API_8W(xxx, y) \
void \
haval ## xxx ## _ ## y ## _8way_init(void *cc) \
{ \
haval_8way_init(cc, xxx >> 5, y); \
} \
\
void \
haval ## xxx ## _ ## y ## _8way_update (void *cc, const void *data, size_t len) \
{ \
haval ## y ## _8way_update(cc, data, len); \
} \
\
void \
haval ## xxx ## _ ## y ## _8way_close(void *cc, void *dst) \
{ \
haval ## y ## _8way_close(cc, dst); \
} \
API_8W(256, 5)
#define RVAL_8W \
do { \
s0 = val[0]; \
s1 = val[1]; \
s2 = val[2]; \
s3 = val[3]; \
s4 = val[4]; \
s5 = val[5]; \
s6 = val[6]; \
s7 = val[7]; \
} while (0)
#define WVAL_8W \
do { \
val[0] = s0; \
val[1] = s1; \
val[2] = s2; \
val[3] = s3; \
val[4] = s4; \
val[5] = s5; \
val[6] = s6; \
val[7] = s7; \
} while (0)
#define INMSG_8W(i) msg[i]
#endif // AVX2
#ifdef __cplusplus
}
#endif

View File

@@ -59,7 +59,7 @@
*/
#ifndef HAVAL_HASH_4WAY_H__
#define HAVAL_HASH_4WAY_H__
#define HAVAL_HASH_4WAY_H__ 1
#if defined(__AVX__)
@@ -84,10 +84,30 @@ typedef haval_4way_context haval256_5_4way_context;
void haval256_5_4way_init( void *cc );
void haval256_5_4way( void *cc, const void *data, size_t len );
void haval256_5_4way_update( void *cc, const void *data, size_t len );
//#define haval256_5_4way haval256_5_4way_update
void haval256_5_4way_close( void *cc, void *dst );
#if defined(__AVX2__)
typedef struct {
__m256i buf[32];
__m256i s0, s1, s2, s3, s4, s5, s6, s7;
unsigned olen, passes;
uint32_t count_high, count_low;
} haval_8way_context __attribute__ ((aligned (64)));
typedef haval_8way_context haval256_5_8way_context;
void haval256_5_8way_init( void *cc );
void haval256_5_8way_update( void *cc, const void *data, size_t len );
void haval256_5_8way_close( void *cc, void *dst );
#endif // AVX2
#ifdef __cplusplus
}
#endif

View File

@@ -103,14 +103,12 @@ typedef jh_4way_context jh512_4way_context;
void jh256_4way_init( jh_4way_context *sc);
void jh256_4way_update(void *cc, const void *data, size_t len);
#define jh256_4way jh256_4way_update
void jh256_4way_close(void *cc, void *dst);
void jh512_4way_init( jh_4way_context *sc );
void jh512_4way_update(void *cc, const void *data, size_t len);
#define jh512_4way jh512_4way_update
void jh512_4way_close(void *cc, void *dst);

View File

@@ -33,7 +33,7 @@ void jha_hash_4way( void *out, const void *input )
keccak512_4way_context ctx_keccak;
keccak512_4way_init( &ctx_keccak );
keccak512_4way( &ctx_keccak, input, 80 );
keccak512_4way_update( &ctx_keccak, input, 80 );
keccak512_4way_close( &ctx_keccak, vhash );
// Heavy & Light Pair Loop
@@ -58,7 +58,7 @@ void jha_hash_4way( void *out, const void *input )
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx_skein );
skein512_4way( &ctx_skein, vhash, 64 );
skein512_4way_update( &ctx_skein, vhash, 64 );
skein512_4way_close( &ctx_skein, vhashB );
for ( int i = 0; i < 8; i++ )
@@ -69,7 +69,7 @@ void jha_hash_4way( void *out, const void *input )
blake512_4way_close( &ctx_blake, vhashA );
jh512_4way_init( &ctx_jh );
jh512_4way( &ctx_jh, vhash, 64 );
jh512_4way_update( &ctx_jh, vhash, 64 );
jh512_4way_close( &ctx_jh, vhashB );
for ( int i = 0; i < 8; i++ )

View File

@@ -99,14 +99,12 @@ typedef keccak64_ctx_m256i keccak512_4way_context;
void keccak256_4way_init(void *cc);
void keccak256_4way_update(void *cc, const void *data, size_t len);
void keccak256_4way_close(void *cc, void *dst);
#define keccak256_4way keccak256_4way_update
void keccak512_4way_init(void *cc);
void keccak512_4way_update(void *cc, const void *data, size_t len);
void keccak512_4way_close(void *cc, void *dst);
void keccak512_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#define keccak512_4way keccak512_4way_update
#endif

View File

@@ -1,15 +1,178 @@
#include "lyra2-gate.h"
#include <memory.h>
#include <mm_malloc.h>
#if defined (ALLIUM_4WAY)
#include "algo/blake/blake-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/groestl/aes_ni/hash-groestl256.h"
#if defined (ALLIUM_8WAY)
typedef struct {
blake256_8way_context blake;
keccak256_8way_context keccak;
cube_4way_context cube;
skein256_8way_context skein;
hashState_groestl256 groestl;
} allium_8way_ctx_holder;
static __thread allium_8way_ctx_holder allium_8way_ctx;
bool init_allium_8way_ctx()
{
keccak256_8way_init( &allium_8way_ctx.keccak );
cube_4way_init( &allium_8way_ctx.cube, 256, 16, 32 );
skein256_8way_init( &allium_8way_ctx.skein );
init_groestl256( &allium_8way_ctx.groestl, 32 );
return true;
}
void allium_8way_hash( void *state, const void *input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (128)));
uint32_t vhashA[8*8] __attribute__ ((aligned (64)));
uint32_t vhashB[8*8] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) );
blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
blake256_8way_close( &ctx.blake, vhash );
rintrlv_8x32_8x64( vhashA, vhash, 256 );
keccak256_8way_update( &ctx.keccak, vhashA, 32 );
keccak256_8way_close( &ctx.keccak, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, 256 );
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hash2, hash3, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hash4, hash5, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, 256 );
intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, 256 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hash2, hash3, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hash4, hash5, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, 256 );
skein256_8way_update( &ctx.skein, vhash, 32 );
skein256_8way_close( &ctx.skein, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, 256 );
update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+128, hash4, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+160, hash5, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+192, hash6, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+224, hash7, 256 );
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
sizeof(hashState_groestl256) );
}
int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 8;
const uint32_t Htarg = ptarget[7];
__m256i *noncev = (__m256i*)vdata + 19; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake256_8way_init( &allium_8way_ctx.blake );
blake256_8way_update( &allium_8way_ctx.blake, vdata, 64 );
do {
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
n+3, n+2, n+1, n ) );
allium_8way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 8; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
{
if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, hash+(lane<<3), mythr, lane );
}
}
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce;
return 0;
}
#elif defined (ALLIUM_4WAY)
typedef struct {
blake256_4way_context blake;
keccak256_4way_context keccak;
@@ -41,11 +204,11 @@ void allium_4way_hash( void *state, const void *input )
allium_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &allium_4way_ctx, sizeof(allium_4way_ctx) );
blake256_4way( &ctx.blake, input + (64<<2), 16 );
blake256_4way_update( &ctx.blake, input + (64<<2), 16 );
blake256_4way_close( &ctx.blake, vhash32 );
rintrlv_4x32_4x64( vhash64, vhash32, 256 );
keccak256_4way( &ctx.keccak, vhash64, 32 );
keccak256_4way_update( &ctx.keccak, vhash64, 32 );
keccak256_4way_close( &ctx.keccak, vhash64 );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
@@ -70,7 +233,7 @@ void allium_4way_hash( void *state, const void *input )
intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
skein256_4way( &ctx.skein, vhash64, 32 );
skein256_4way_update( &ctx.skein, vhash64, 32 );
skein256_4way_close( &ctx.skein, vhash64 );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );

View File

@@ -44,8 +44,13 @@ bool lyra2rev3_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int size = ROW_LEN_BYTES * 4; // nRows;
int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
#if defined(LYRA2REV3_16WAY)
// l2v3_wholeMatrix = _mm_malloc( 2*size, 128 );
l2v3_wholeMatrix = _mm_malloc( 2*size, 64 );
init_lyra2rev3_16way_ctx();;
#else
l2v3_wholeMatrix = _mm_malloc( size, 64 );
#if defined (LYRA2REV3_8WAY)
init_lyra2rev3_8way_ctx();;
@@ -53,13 +58,17 @@ bool lyra2rev3_thread_init()
init_lyra2rev3_4way_ctx();;
#else
init_lyra2rev3_ctx();
#endif
#endif
return l2v3_wholeMatrix;
}
bool register_lyra2rev3_algo( algo_gate_t* gate )
{
#if defined (LYRA2REV3_8WAY)
#if defined(LYRA2REV3_16WAY)
gate->scanhash = (void*)&scanhash_lyra2rev3_16way;
gate->hash = (void*)&lyra2rev3_16way_hash;
#elif defined (LYRA2REV3_8WAY)
gate->scanhash = (void*)&scanhash_lyra2rev3_8way;
gate->hash = (void*)&lyra2rev3_8way_hash;
#elif defined (LYRA2REV3_4WAY)
@@ -69,7 +78,7 @@ bool register_lyra2rev3_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_lyra2rev3;
gate->hash = (void*)&lyra2rev3_hash;
#endif
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
gate->miner_thread_init = (void*)&lyra2rev3_thread_init;
opt_target_factor = 256.0;
return true;
@@ -85,10 +94,14 @@ bool lyra2rev2_thread_init()
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
#if defined (LYRA2REV2_8WAY)
l2v2_wholeMatrix = _mm_malloc( 2 * size, 64 ); // 2 way
init_lyra2rev2_8way_ctx();;
#elif defined (LYRA2REV2_4WAY)
l2v2_wholeMatrix = _mm_malloc( size, 64 );
#if defined (LYRA2REV2_4WAY)
init_lyra2rev2_4way_ctx();;
#else
l2v2_wholeMatrix = _mm_malloc( size, 64 );
init_lyra2rev2_ctx();
#endif
return l2v2_wholeMatrix;
@@ -96,14 +109,17 @@ bool lyra2rev2_thread_init()
bool register_lyra2rev2_algo( algo_gate_t* gate )
{
#if defined (LYRA2REV2_4WAY)
#if defined (LYRA2REV2_8WAY)
gate->scanhash = (void*)&scanhash_lyra2rev2_8way;
gate->hash = (void*)&lyra2rev2_8way_hash;
#elif defined (LYRA2REV2_4WAY)
gate->scanhash = (void*)&scanhash_lyra2rev2_4way;
gate->hash = (void*)&lyra2rev2_4way_hash;
#else
gate->scanhash = (void*)&scanhash_lyra2rev2;
gate->hash = (void*)&lyra2rev2_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
opt_target_factor = 256.0;
return true;
@@ -113,7 +129,11 @@ bool register_lyra2rev2_algo( algo_gate_t* gate )
bool register_lyra2z_algo( algo_gate_t* gate )
{
#if defined(LYRA2Z_8WAY)
#if defined(LYRA2Z_16WAY)
gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_16way;
gate->hash = (void*)&lyra2z_16way_hash;
#elif defined(LYRA2Z_8WAY)
gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_8way;
gate->hash = (void*)&lyra2z_8way_hash;
@@ -126,7 +146,7 @@ bool register_lyra2z_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_lyra2z;
gate->hash = (void*)&lyra2z_hash;
#endif
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE42_OPT | AVX2_OPT | AVX512_OPT;
opt_target_factor = 256.0;
return true;
};
@@ -154,7 +174,11 @@ bool register_lyra2h_algo( algo_gate_t* gate )
bool register_allium_algo( algo_gate_t* gate )
{
#if defined (ALLIUM_4WAY)
#if defined (ALLIUM_8WAY)
gate->miner_thread_init = (void*)&init_allium_8way_ctx;
gate->scanhash = (void*)&scanhash_allium_8way;
gate->hash = (void*)&allium_8way_hash;
#elif defined (ALLIUM_4WAY)
gate->miner_thread_init = (void*)&init_allium_4way_ctx;
gate->scanhash = (void*)&scanhash_allium_4way;
gate->hash = (void*)&allium_4way_hash;
@@ -163,7 +187,7 @@ bool register_allium_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_allium;
gate->hash = (void*)&allium_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
opt_target_factor = 256.0;
return true;
};
@@ -205,7 +229,7 @@ void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
bool register_phi2_algo( algo_gate_t* gate )
{
// init_phi2_ctx();
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
gate->get_work_data_size = (void*)&phi2_get_work_data_size;
gate->decode_extra_data = (void*)&phi2_decode_extra_data;
gate->build_extraheader = (void*)&phi2_build_extraheader;

View File

@@ -5,18 +5,27 @@
#include <stdint.h>
#include "lyra2.h"
#if defined(__AVX2__)
#define LYRA2REV3_8WAY
#endif
#if defined(__SSE2__)
#define LYRA2REV3_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define LYRA2REV3_16WAY 1
#elif defined(__AVX2__)
#define LYRA2REV3_8WAY 1
#elif defined(__SSE2__)
#define LYRA2REV3_4WAY 1
#endif
extern __thread uint64_t* l2v3_wholeMatrix;
bool register_lyra2rev3_algo( algo_gate_t* gate );
#if defined(LYRA2REV3_8WAY)
#if defined(LYRA2REV3_16WAY)
void lyra2rev3_16way_hash( void *state, const void *input );
int scanhash_lyra2rev3_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_lyra2rev3_16way_ctx();
#elif defined(LYRA2REV3_8WAY)
void lyra2rev3_8way_hash( void *state, const void *input );
int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce,
@@ -41,15 +50,24 @@ bool init_lyra2rev3_ctx();
//////////////////////////////////
#if defined(__AVX2__)
#define LYRA2REV2_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define LYRA2REV2_8WAY 1
#elif defined(__AVX2__)
#define LYRA2REV2_4WAY 1
#endif
extern __thread uint64_t* l2v2_wholeMatrix;
bool register_lyra2rev2_algo( algo_gate_t* gate );
#if defined(LYRA2REV2_4WAY)
#if defined(LYRA2REV2_8WAY)
void lyra2rev2_8way_hash( void *state, const void *input );
int scanhash_lyra2rev2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_lyra2rev2_8way_ctx();
#elif defined(LYRA2REV2_4WAY)
void lyra2rev2_4way_hash( void *state, const void *input );
int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
@@ -67,17 +85,25 @@ bool init_lyra2rev2_ctx();
/////////////////////////
#if defined(__SSE2__)
#define LYRA2Z_4WAY
#endif
#if defined(__AVX2__)
#define LYRA2Z_8WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define LYRA2Z_16WAY 1
#elif defined(__AVX2__)
#define LYRA2Z_8WAY 1
#elif defined(__SSE2__)
#define LYRA2Z_4WAY 1
#endif
#define LYRA2Z_MATRIX_SIZE BLOCK_LEN_INT64 * 8 * 8 * 8
#if defined(LYRA2Z_8WAY)
#if defined(LYRA2Z_16WAY)
void lyra2z_16way_hash( void *state, const void *input );
int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_16way_thread_init();
#elif defined(LYRA2Z_8WAY)
void lyra2z_8way_hash( void *state, const void *input );
int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
@@ -126,13 +152,22 @@ bool lyra2h_thread_init();
//////////////////////////////////
#if defined(__AVX2__) && defined(__AES__)
#define ALLIUM_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define ALLIUM_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define ALLIUM_4WAY 1
#endif
bool register_allium_algo( algo_gate_t* gate );
#if defined(ALLIUM_4WAY)
#if defined(ALLIUM_8WAY)
void allium_8way_hash( void *state, const void *input );
int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_allium_8way_ctx();
#elif defined(ALLIUM_4WAY)
void allium_4way_hash( void *state, const void *input );
int scanhash_allium_4way( struct work *work, uint32_t max_nonce,

View File

@@ -26,6 +26,22 @@
#include "lyra2.h"
#include "sponge.h"
// LYRA2RE 8 cols 8 rows used by lyra2re, allium, phi2, x22i, x25x;
// dynamic matrix allocation.
//
// LYRA2REV2 4 cols 4 rows used by lyra2rev2 and x21s, static matrix
// allocation.
//
// LYRA2REV3 4 cols 4 rows with an extra twist in calculating
// rowa in the wandering phase. Used by lyra2rev3. Static matrix
// allocation.
//
// LYRA2Z various cols & rows and supports 80 byte input. Used by lyra2z,
// lyra2z330, lyra2h.
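To make the size differences concrete, a small illustrative helper; it assumes BLOCK_LEN_INT64 from lyra2.h and the row/column counts listed above, and is not repository code.
static inline size_t lyra2_matrix_bytes( int64_t nRows, int64_t nCols )
{
   const int64_t row_len_int64 = BLOCK_LEN_INT64 * nCols;   // u64 per row
   const int64_t row_len_bytes = row_len_int64 * 8;
   return (size_t)( row_len_bytes * nRows );
}
// lyra2_matrix_bytes( 8, 8 ) -> LYRA2RE / allium / phi2 (allocated per call)
// lyra2_matrix_bytes( 4, 4 ) -> LYRA2REV2 / LYRA2REV3 (static, per thread)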
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
/**
* Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
* whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits,
@@ -46,176 +62,137 @@
* @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
*/
int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
const uint64_t timeCost, const uint64_t nRows,
const uint64_t nCols )
// For lyra2rev3.
// convert a simple offset to an index into 2x4 u64 interleaved data.
// good for state and 4 row matrix.
// index = ( int( off / 4 ) * 8 ) + ( off mod 4 )
#define offset_to_index( o ) \
( ( ( (uint64_t)( (o) & 0xf) / 4 ) * 8 ) + ( (o) % 4 ) )
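A worked mapping of the macro, assuming lane 0 of a 2x256-bit interleaved buffer (blocks of four u64 alternate between the two lanes); lane 1 of the same element would sit 4 u64 further on. Illustrative only.
//   off : 0 1 2 3 |  4  5  6  7 |  8  9 10 11 | 12 13 14 15
//   idx : 0 1 2 3 |  8  9 10 11 | 16 17 18 19 | 24 25 26 27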
int LYRA2REV2_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
const void *pwd, const uint64_t pwdlen, const uint64_t timeCost,
const uint64_t nRows, const uint64_t nCols )
{
//====================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
int64_t row = 2; //index of row to be processed
int64_t prev = 1; //index of prev (last row ever computed/modified)
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
int64_t tau; //Time Loop iterator
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
// int64_t i; //auxiliary iteration counter
int64_t v64; // 64bit var for memcpy
uint64_t _ALIGN(256) state[32];
int64_t row = 2;
int64_t prev = 1;
int64_t rowa0 = 0;
int64_t rowa1 = 0;
int64_t tau;
int64_t step = 1;
int64_t window = 2;
int64_t gap = 1;
//====================================================================/
//=== Initializing the Memory Matrix and pointers to it =============//
//Tries to allocate enough space for the whole memory matrix
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
// for Lyra2REv2, nCols = 4, v1 was using 8
const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
: BLOCK_LEN_BLAKE2_SAFE_BYTES;
uint64_t *ptrWord = wholeMatrix;
// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
//=== Getting the password + salt + basil padded with 10*1 ==========//
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
//but this ensures that the password copied locally will be overwritten as soon as possible
//First, we clean enough blocks for the password, salt, basil and padding
int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
/ BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
byte *ptrByte = (byte*) wholeMatrix;
uint64_t *ptr = wholeMatrix;
uint64_t *pw = (uint64_t*)pwd;
//Prepends the password
memcpy(ptrByte, pwd, pwdlen);
ptrByte += pwdlen;
memcpy( ptr, pw, 2*pwdlen ); // password
ptr += pwdlen>>2;
memcpy( ptr, pw, 2*pwdlen ); // password lane 1
ptr += pwdlen>>2;
//Concatenates the salt
memcpy(ptrByte, salt, saltlen);
ptrByte += saltlen;
// now build the rest interleaving on the fly.
memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
- (saltlen + pwdlen) );
ptr[0] = ptr[ 4] = kLen;
ptr[1] = ptr[ 5] = pwdlen;
ptr[2] = ptr[ 6] = pwdlen; // saltlen
ptr[3] = ptr[ 7] = timeCost;
ptr[8] = ptr[12] = nRows;
ptr[9] = ptr[13] = nCols;
ptr[10] = ptr[14] = 0x80;
ptr[11] = ptr[15] = 0x0100000000000000;
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
memcpy(ptrByte, &kLen, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = pwdlen;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = saltlen;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = timeCost;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = nRows;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = nCols;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
//Now comes the padding
*ptrByte = 0x80; //first byte of padding: right after the password
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
// from here on it's all simd acces to state and matrix
// define vector pointers and adjust sizes and pointer offsets
//================= Initializing the Sponge State ====================//
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
// initState( state );
//========================= Setup Phase =============================//
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
ptrWord = wholeMatrix;
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
/*
for (i = 0; i < nBlocksInput; i++)
{
absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
}
*/
absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );
//Initializes M[0] and M[1]
reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols );
reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
nCols);
reducedDuplexRow1_2way( state, &wholeMatrix[0],
&wholeMatrix[ 2 * ROW_LEN_INT64 ], nCols );
do
{
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64],
&wholeMatrix[ 2* rowa0 * ROW_LEN_INT64],
&wholeMatrix[ 2* row*ROW_LEN_INT64],
nCols );
//updates the value of row* (deterministically picked during Setup))
rowa = (rowa + step) & (window - 1);
//update prev: it now points to the last row ever computed
rowa0 = (rowa0 + step) & (window - 1);
prev = row;
//updates row: goes to the next row to be computed
row++;
prev = row;
row++;
//Checks if all rows in the window where visited.
if (rowa == 0)
{
step = window + gap; //changes the step: approximately doubles its value
window *= 2; //doubles the size of the re-visitation window
gap = -gap; //inverts the modifier to the step
}
} while (row < nRows);
if ( rowa0 == 0 )
{
step = window + gap;
window *= 2;
gap = -gap;
}
} while ( row < nRows );
//===================== Wandering Phase =============================//
row = 0; //Resets the visitation to the first row of the memory matrix
for (tau = 1; tau <= timeCost; tau++)
row = 0;
for ( tau = 1; tau <= timeCost; tau++ )
{
//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
do
{
//Selects a pseudorandom index row*
//-----------------------------------------------
rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
step = ( (tau & 1) == 0 ) ? -1 : ( nRows >> 1 ) - 1;
do
{
rowa0 = state[ 0 ] & (unsigned int)(nRows-1);
rowa1 = state[ 4 ] & (unsigned int)(nRows-1);
//rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//-------------------------------------------
reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
&wholeMatrix[ 2* row *ROW_LEN_INT64 ],
nCols );
prev = row;
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
//update prev: it now points to the last row ever computed
prev = row;
row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
//updates row: goes to the next row to be computed
//----------------------------------------------------
row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
//row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//----------------------------------------------------
} while (row != 0);
} while (row != 0);
}
//===================== Wrap-up Phase ===============================//
//Absorbs the last block of the memory matrix
absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64 ],
&wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64 ] );
//Squeezes the key
squeeze(state, K, (unsigned int) kLen);
squeeze_2way( state, K, (unsigned int) kLen );
return 0;
}
// This version is currently only used by REv3 and has some hard coding
// specific to v3 such as input data size of 32 bytes.
//
// Similarly with REv2. The difference with REv3 isn't clear and maybe
// they can be merged.
//
// RE is used by RE, allium. The main difference between RE and REv2
// is the matrix size.
//
// Z also needs to support 80 byte input as well as 32 byte, and odd
// matrix sizes like 330 rows. It is used by lyra2z330, lyra2z, lyra2h.
/////////////////////////////////////////////////
// 2 way 256
@@ -223,22 +200,25 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
// Data is interleaved 2x256.
int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
const void *pwd, const uint64_t pwdlen, const void *salt,
const uint64_t saltlen, const uint64_t timeCost, const uint64_t nRows,
const uint64_t nCols )
const void *pwd, uint64_t pwdlen, uint64_t timeCost,
uint64_t nRows, uint64_t nCols )
// hard coded for 32 byte input as well as matrix size.
// Other required versions include 80 byte input and different block
// sizes.
{
//====================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
int64_t row = 2; //index of row to be processed
int64_t prev = 1; //index of prev (last row ever computed/modified)
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
int64_t tau; //Time Loop iterator
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
// int64_t i; //auxiliary iteration counter
int64_t v64; // 64bit var for memcpy
uint64_t instance0 = 0; // Separate instance for each lane
uint64_t _ALIGN(256) state[32];
int64_t row = 2;
int64_t prev = 1;
int64_t rowa0 = 0;
int64_t rowa1 = 0;
int64_t tau;
int64_t step = 1;
int64_t window = 2;
int64_t gap = 1;
uint64_t instance0 = 0;
uint64_t instance1 = 0;
//====================================================================/
@@ -248,7 +228,9 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
uint64_t *ptrWord = wholeMatrix;
// 2 way 256 rewrite. Salt always == password, and data is interleaved,
// need to build in parallel:
// need to build in parallel as pw is already interleaved.
// { password, (64 or 80 bytes)
// salt, (64 or 80 bytes) = same as password
// Klen, (u64) = 32 bytes
@@ -262,73 +244,54 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
// 1 (byte)
// }
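// A sketch of the resulting interleaved basil/padding words, mirroring the
// ptr[] assignments further down; the two lanes alternate in 256 bit
// (4 word) groups and carry identical values since salt == password:
//   words  0.. 3 : lane 0  kLen, pwdlen, saltlen(=pwdlen), timeCost
//   words  4.. 7 : lane 1  kLen, pwdlen, saltlen(=pwdlen), timeCost
//   words  8..11 : lane 0  nRows, nCols, 0x80 (pad start), 0x01 in top byte (pad end)
//   words 12..15 : lane 1  nRows, nCols, 0x80 (pad start), 0x01 in top byte (pad end)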
// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
// input is usually 32 bytes, sometimes 64; both are aligned to a 256 bit vector.
// 80 byte input is not aligned, complicating matters for lyra2z.
int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
/ BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
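// Worked example, assuming BLOCK_LEN_BLAKE2_SAFE_BYTES == 64 as in the
// reference Lyra2 code: for the usual 32 byte input,
//   nBlocksInput = ( (32 + 32 + 6*8) / 64 ) + 1 = 2
// while an 80 byte input would give ( (80 + 80 + 48) / 64 ) + 1 = 4 blocks.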
uint64_t *ptr = wholeMatrix;
uint64_t *pw = (uint64_t*)pwd;
byte *ptrByte = (byte*) wholeMatrix;
memcpy( ptr, pw, 2*pwdlen ); // password
ptr += pwdlen>>2;
memcpy( ptr, pw, 2*pwdlen ); // password lane 1
ptr += pwdlen>>2;
// now build the rest interleaving on the fly.
//Prepends the password
memcpy(ptrByte, pwd, pwdlen);
ptrByte += pwdlen;
//Concatenates the salt
memcpy(ptrByte, salt, saltlen);
ptrByte += saltlen;
memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
- (saltlen + pwdlen) );
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
memcpy(ptrByte, &kLen, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = pwdlen;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = saltlen;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = timeCost;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = nRows;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = nCols;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
//Now comes the padding
*ptrByte = 0x80; //first byte of padding: right after the password
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
// from here on it's all SIMD access to state and matrix
// define vector pointers and adjust sizes and pointer offsets
ptr[0] = ptr[ 4] = kLen;
ptr[1] = ptr[ 5] = pwdlen;
ptr[2] = ptr[ 6] = pwdlen; // saltlen
ptr[3] = ptr[ 7] = timeCost;
ptr[8] = ptr[12] = nRows;
ptr[9] = ptr[13] = nCols;
ptr[10] = ptr[14] = 0x80;
ptr[11] = ptr[15] = 0x0100000000000000;
ptrWord = wholeMatrix;
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
reducedSqueezeRow0( state, &wholeMatrix[0], nCols );
absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );
reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
nCols);
reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols );
reducedDuplexRow1_2way( state, &wholeMatrix[0],
&wholeMatrix[2*ROW_LEN_INT64], nCols );
do
{
reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev*ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa0*ROW_LEN_INT64 ],
&wholeMatrix[ 2* row*ROW_LEN_INT64 ],
nCols );
rowa = (rowa + step) & (window - 1);
rowa0 = (rowa0 + step) & (window - 1);
prev = row;
row++;
if (rowa == 0)
if (rowa0 == 0)
{
step = window + gap; //changes the step: approximately doubles its value
window *= 2; //doubles the size of the re-visitation window
@@ -340,37 +303,22 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
row = 0;
for (tau = 1; tau <= timeCost; tau++)
{
step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1;
step = ( (tau & 1) == 0 ) ? -1 : ( nRows >> 1 ) - 1;
do
{
// This part is not parallel, rowa will be different for each lane.
// state (u64[16]) is interleaved 2x256, need to extract separately.
instance0 = state[ offset_to_index( instance0 ) ];
instance1 = (&state[4])[ offset_to_index( instance1 ) ];
// index = 2 * instance / 4 * 4 + instance % 4
uint64_t index0 = ( ( (instance0 & 0xf) >> 3 ) << 2 )
+ ( instance0 & 0x3 )
uint64_t index1 = ( ( (instance1 & 0xf) >> 3 ) << 2 )
+ ( instance1 & 0x3 )
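// Hedged sketch (not from the original source) of the mapping in the
// comment above: logical word i of one lane's 16 word state maps into the
// interleaved 2x256 layout, where the two lanes alternate in 4 word groups.
// The real offset_to_index macro used below may be defined differently.
//
//   index = 2 * ( (i & 0xf) / 4 ) * 4 + ( i % 4 );   // e.g. i=5 -> 9, i=13 -> 25
//
// Lane 1 applies the same mapping but indexes from &state[4].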
rowa0 = state[ offset_to_index( instance0 ) ]
& (unsigned int)(nRows-1);
rowa1 = (state+4)[ offset_to_index( instance1 ) ]
& (unsigned int)(nRows-1);
instance0 = state[ index0 ] & 0xf;
instance1 = (state+4)[ index1 ] & 0xf;
rowa0 = state[ instance0 ];
rowa1 = (state+4)[ instance1 ];
reducedDuplexRow_2way( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa0*ROW_LEN_INT64],
&wholeMatrix[rowa1*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
/*
instance = state[instance & 0xF];
rowa = state[instance & 0xF] & (unsigned int)(nRows-1);
reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
*/
// End of divergence.
reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
&wholeMatrix[ 2* row*ROW_LEN_INT64 ],
nCols );
prev = row;
row = (row + step) & (unsigned int)(nRows-1);
@@ -378,176 +326,136 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
} while ( row != 0 );
}
absorbBlock( state, &wholeMatrix[rowa*ROW_LEN_INT64] );
squeeze( state, K, (unsigned int) kLen );
absorbBlock_2way( state, &wholeMatrix[2*rowa0*ROW_LEN_INT64],
&wholeMatrix[2*rowa1*ROW_LEN_INT64] );
squeeze_2way( state, K, (unsigned int) kLen );
return 0;
}
//////////////////////////////////////////////////
int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
const uint64_t timeCost, const uint64_t nRows,
const uint64_t nCols )
int LYRA2Z_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
const void *pwd, const uint64_t pwdlen, const uint64_t timeCost,
const uint64_t nRows, const uint64_t nCols )
{
//========================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
int64_t row = 2; //index of row to be processed
int64_t prev = 1; //index of prev (last row ever computed/modified)
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
int64_t tau; //Time Loop iterator
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
// int64_t i; //auxiliary iteration counter
uint64_t _ALIGN(256) state[32];
int64_t row = 2;
int64_t prev = 1;
int64_t rowa0 = 0;
int64_t rowa1 = 0;
int64_t tau;
int64_t step = 1;
int64_t window = 2;
int64_t gap = 1;
//=======================================================================/
//======= Initializing the Memory Matrix and pointers to it =============//
//Tries to allocate enough space for the whole memory matrix
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
//==== Getting the password + salt + basil padded with 10*1 ============//
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
//but this ensures that the password copied locally will be overwritten as soon as possible
//First, we clean enough blocks for the password, salt, basil and padding
uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 *
uint64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 *
sizeof (uint64_t) ) / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
byte *ptrByte = (byte*) wholeMatrix;
memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES );
//Prepends the password
memcpy(ptrByte, pwd, pwdlen);
ptrByte += pwdlen;
uint64_t *ptr = wholeMatrix;
uint64_t *pw = (uint64_t*)pwd;
//Concatenates the salt
memcpy(ptrByte, salt, saltlen);
ptrByte += saltlen;
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
memcpy(ptrByte, &kLen, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
memcpy(ptrByte, &pwdlen, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
memcpy(ptrByte, &saltlen, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
memcpy(ptrByte, &timeCost, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
memcpy(ptrByte, &nRows, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
memcpy(ptrByte, &nCols, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
memcpy( ptr, pw, 2*pwdlen ); // password
ptr += pwdlen>>2;
memcpy( ptr, pw, 2*pwdlen ); // password lane 1
ptr += pwdlen>>2;
//Now comes the padding
*ptrByte = 0x80; //first byte of padding: right after the password
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
// now build the rest interleaving on the fly.
ptr[0] = ptr[ 4] = kLen;
ptr[1] = ptr[ 5] = pwdlen;
ptr[2] = ptr[ 6] = pwdlen; // saltlen
ptr[3] = ptr[ 7] = timeCost;
ptr[8] = ptr[12] = nRows;
ptr[9] = ptr[13] = nCols;
ptr[10] = ptr[14] = 0x80;
ptr[11] = ptr[15] = 0x0100000000000000;
//=================== Initializing the Sponge State ====================//
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
// uint64_t *state = _mm_malloc(16 * sizeof(uint64_t), 32);
// if (state == NULL) {
// return -1;
// }
// initState( state );
uint64_t *ptrWord = wholeMatrix;
//============================== Setup Phase =============================//
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
uint64_t *ptrWord = wholeMatrix;
absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput,
BLOCK_LEN_BLAKE2_SAFE_INT64 );
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput,
BLOCK_LEN_BLAKE2_SAFE_INT64 );
/*
for ( i = 0; i < nBlocksInput; i++ )
{
absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil)
}
*/
//Initializes M[0] and M[1]
reducedSqueezeRow0(state, &wholeMatrix[0], nCols); //The locally copied password is most likely overwritten here
reducedDuplexRow1(state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], nCols);
//Initializes M[0] and M[1]
reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols );
do {
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
reducedDuplexRowSetup(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);
reducedDuplexRow1_2way( state, &wholeMatrix[0],
&wholeMatrix[ 2 * ROW_LEN_INT64 ], nCols );
//updates the value of row* (deterministically picked during Setup)
rowa = (rowa + step) & (window - 1);
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
row++;
do
{
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
//Checks if all rows in the window were visited.
if (rowa == 0) {
step = window + gap; //changes the step: approximately doubles its value
window *= 2; //doubles the size of the re-visitation window
gap = -gap; //inverts the modifier to the step
}
reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64],
&wholeMatrix[ 2* rowa0 * ROW_LEN_INT64],
&wholeMatrix[ 2* row*ROW_LEN_INT64],
nCols );
} while (row < nRows);
rowa0 = (rowa0 + step) & (window - 1);
prev = row;
row++;
//======================== Wandering Phase =============================//
row = 0; //Resets the visitation to the first row of the memory matrix
for ( tau = 1; tau <= timeCost; tau++ )
{
//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
if ( rowa0 == 0 )
{
step = window + gap;
window *= 2;
gap = -gap;
}
} while ( row < nRows );
row = 0;
for ( tau = 1; tau <= timeCost; tau++ )
{
step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
do {
//Selects a pseudorandom index row*
//----------------------------------------------------------------------
//rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//-----------------------------------------------------------------
do
{
rowa0 = state[ 0 ] % nRows;
rowa1 = state[ 4 ] % nRows;
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
reducedDuplexRow(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);
reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
&wholeMatrix[ 2* row *ROW_LEN_INT64 ],
nCols );
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
//---------------------------------------------------------------
//row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//--------------------------------------------------------------------
row = (row + step) % nRows;
} while (row != 0);
}
}
//========================= Wrap-up Phase ===============================//
//Absorbs the last block of the memory matrix
absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64 ],
&wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64 ] );
//Squeezes the key
squeeze( state, K, kLen );
//Squeezes the key
squeeze_2way( state, K, (unsigned int) kLen );
return 0;
return 0;
}
////////////////////////////////////////////////////
// Lyra2RE doesn't like the new wholeMatrix implementation
int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
const void *salt, const uint64_t saltlen, const uint64_t timeCost,
const uint64_t nRows, const uint64_t nCols )
int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd,
const uint64_t pwdlen, const uint64_t timeCost,
const uint64_t nRows, const uint64_t nCols )
{
//====================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
uint64_t _ALIGN(256) state[32];
int64_t row = 2; //index of row to be processed
int64_t prev = 1; //index of prev (last row ever computed/modified)
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
int64_t rowa0 = 0;
int64_t rowa1 = 0;
int64_t tau; //Time Loop iterator
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
int64_t i; //auxiliary iteration counter
int64_t v64; // 64bit var for memcpy
//====================================================================/
//=== Initializing the Memory Matrix and pointers to it =============//
@@ -560,100 +468,56 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
: BLOCK_LEN_BLAKE2_SAFE_BYTES;
i = (int64_t)ROW_LEN_BYTES * nRows;
uint64_t *wholeMatrix = _mm_malloc( i, 64 );
uint64_t *wholeMatrix = _mm_malloc( 2*i, 64 );
if (wholeMatrix == NULL)
return -1;
#if defined(__AVX2__)
memset_zero_256( (__m256i*)wholeMatrix, i>>5 );
#elif defined(__SSE2__)
memset_zero_128( (__m128i*)wholeMatrix, i>>4 );
#else
memset( wholeMatrix, 0, i );
#endif
memset_zero_512( (__m512i*)wholeMatrix, i>>5 );
uint64_t *ptrWord = wholeMatrix;
//=== Getting the password + salt + basil padded with 10*1 ==========//
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
//but this ensures that the password copied locally will be overwritten as soon as possible
uint64_t *pw = (uint64_t*)pwd;
//First, we clean enough blocks for the password, salt, basil and padding
int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
/ BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
byte *ptrByte = (byte*) wholeMatrix;
uint64_t *ptr = wholeMatrix;
//Prepends the password
memcpy(ptrByte, pwd, pwdlen);
ptrByte += pwdlen;
memcpy( ptr, pw, 2*pwdlen ); // password
ptr += pwdlen>>2;
memcpy( ptr, pw, 2*pwdlen ); // password lane 1
ptr += pwdlen>>2;
//Concatenates the salt
memcpy(ptrByte, salt, saltlen);
ptrByte += saltlen;
// now build the rest interleaving on the fly.
// memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
// - (saltlen + pwdlen) );
ptr[0] = ptr[ 4] = kLen;
ptr[1] = ptr[ 5] = pwdlen;
ptr[2] = ptr[ 6] = pwdlen; // saltlen
ptr[3] = ptr[ 7] = timeCost;
ptr[8] = ptr[12] = nRows;
ptr[9] = ptr[13] = nCols;
ptr[10] = ptr[14] = 0x80;
ptr[11] = ptr[15] = 0x0100000000000000;
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
memcpy(ptrByte, &kLen, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = pwdlen;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = saltlen;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = timeCost;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = nRows;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = nCols;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );
//Now comes the padding
*ptrByte = 0x80; //first byte of padding: right after the password
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
//================= Initializing the Sponge State ====================//
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
// initState( state );
//========================= Setup Phase =============================//
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
ptrWord = wholeMatrix;
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
/*
for (i = 0; i < nBlocksInput; i++)
{
absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
}
*/
//Initializes M[0] and M[1]
reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
nCols);
reducedDuplexRow1_2way( state, &wholeMatrix[0],
&wholeMatrix[ 2 * ROW_LEN_INT64], nCols );
do
{
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev*ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa0*ROW_LEN_INT64 ],
&wholeMatrix[ 2* row*ROW_LEN_INT64 ],
nCols );
//updates the value of row* (deterministically picked during Setup)
rowa = (rowa + step) & (window - 1);
rowa0 = (rowa0 + step) & (window - 1);
//update prev: it now points to the last row ever computed
prev = row;
@@ -661,7 +525,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
row++;
//Checks if all rows in the window were visited.
if (rowa == 0)
if (rowa0 == 0)
{
step = window + gap; //changes the step: approximately doubles its value
window *= 2; //doubles the size of the re-visitation window
@@ -674,21 +538,18 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
row = 0; //Resets the visitation to the first row of the memory matrix
for (tau = 1; tau <= timeCost; tau++)
{
//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
do
{
//Selects a pseudorandom index row*
//-----------------------------------------------
rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1;
do
{
rowa0 = state[ 0 ] & (unsigned int)(nRows-1);
rowa1 = state[ 4 ] & (unsigned int)(nRows-1);
//rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//-------------------------------------------
reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
&wholeMatrix[ 2* row *ROW_LEN_INT64 ],
nCols );
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
//update prev: it now points to the last row ever computed
prev = row;
@@ -703,9 +564,10 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
//===================== Wrap-up Phase ===============================//
//Absorbs the last block of the memory matrix
absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64],
&wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64] );
//Squeezes the key
squeeze(state, K, (unsigned int) kLen);
squeeze_2way( state, K, (unsigned int) kLen );
//================== Freeing the memory =============================//
_mm_free(wholeMatrix);
@@ -713,3 +575,4 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
return 0;
}
#endif

View File

@@ -327,7 +327,6 @@ int LYRA2REV3( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
nCols);
do
{
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)

View File

@@ -60,4 +60,20 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,
int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
uint64_t timeCost, uint64_t nRows, uint64_t nCols );
int LYRA2REV2_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
int LYRA2Z_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
#endif
#endif /* LYRA2_H_ */

View File

@@ -20,7 +20,7 @@ static __thread blake256_4way_context l2h_4way_blake_mid;
void lyra2h_4way_midstate( const void* input )
{
blake256_4way_init( &l2h_4way_blake_mid );
blake256_4way( &l2h_4way_blake_mid, input, 64 );
blake256_4way_update( &l2h_4way_blake_mid, input, 64 );
}
void lyra2h_4way_hash( void *state, const void *input )

View File

@@ -1,13 +1,150 @@
#include "lyra2-gate.h"
#include <memory.h>
#if defined (LYRA2REV2_4WAY)
#include "algo/blake/blake-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#if defined (LYRA2REV2_8WAY)
typedef struct {
blake256_8way_context blake;
keccak256_8way_context keccak;
cube_4way_context cube;
skein256_8way_context skein;
bmw256_8way_context bmw;
} lyra2v2_8way_ctx_holder __attribute__ ((aligned (64)));
static lyra2v2_8way_ctx_holder l2v2_8way_ctx;
bool init_lyra2rev2_8way_ctx()
{
keccak256_8way_init( &l2v2_8way_ctx.keccak );
cube_4way_init( &l2v2_8way_ctx.cube, 256, 16, 32 );
skein256_8way_init( &l2v2_8way_ctx.skein );
bmw256_8way_init( &l2v2_8way_ctx.bmw );
return true;
}
void lyra2rev2_8way_hash( void *state, const void *input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (128)));
uint32_t vhashA[8*8] __attribute__ ((aligned (64)));
uint32_t vhashB[8*8] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
lyra2v2_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v2_8way_ctx, sizeof(l2v2_8way_ctx) );
blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
blake256_8way_close( &ctx.blake, vhash );
rintrlv_8x32_8x64( vhashA, vhash, 256 );
keccak256_8way_update( &ctx.keccak, vhashA, 32 );
keccak256_8way_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 256 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hash2, hash3, 256 );
LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hash4, hash5, 256 );
LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, 256 );
skein256_8way_update( &ctx.skein, vhash, 32 );
skein256_8way_close( &ctx.skein, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 256 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, 256 );
bmw256_8way_update( &ctx.bmw, vhash, 32 );
bmw256_8way_close( &ctx.bmw, state );
}
int scanhash_lyra2rev2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
__m256i *noncev = (__m256i*)vdata + 19; // aligned
int thr_id = mythr->id;
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake256_8way_init( &l2v2_8way_ctx.blake );
blake256_8way_update( &l2v2_8way_ctx.blake, vdata, 64 );
do
{
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
n+3, n+2, n+1, n ) );
lyra2rev2_8way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg )
{
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce;
return 0;
}
#elif defined (LYRA2REV2_4WAY)
typedef struct {
blake256_4way_context blake;
@@ -39,12 +176,12 @@ void lyra2rev2_4way_hash( void *state, const void *input )
lyra2v2_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) );
blake256_4way( &ctx.blake, input + (64<<2), 16 );
blake256_4way_update( &ctx.blake, input + (64<<2), 16 );
blake256_4way_close( &ctx.blake, vhash );
rintrlv_4x32_4x64( vhash64, vhash, 256 );
keccak256_4way( &ctx.keccak, vhash64, 32 );
keccak256_4way_update( &ctx.keccak, vhash64, 32 );
keccak256_4way_close( &ctx.keccak, vhash64 );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
@@ -64,7 +201,7 @@ void lyra2rev2_4way_hash( void *state, const void *input )
intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
skein256_4way( &ctx.skein, vhash64, 32 );
skein256_4way_update( &ctx.skein, vhash64, 32 );
skein256_4way_close( &ctx.skein, vhash64 );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
@@ -80,7 +217,7 @@ void lyra2rev2_4way_hash( void *state, const void *input )
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_update( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, state );
}
@@ -105,7 +242,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
mm128_bswap32_intrlv80_4x32( vdata, pdata );
blake256_4way_init( &l2v2_4way_ctx.blake );
blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 );
blake256_4way_update( &l2v2_4way_ctx.blake, vdata, 64 );
do
{

View File

@@ -4,8 +4,180 @@
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#if defined (LYRA2REV3_8WAY)
#if defined (LYRA2REV3_16WAY)
typedef struct {
blake256_16way_context blake;
cube_4way_context cube;
bmw256_16way_context bmw;
} lyra2v3_16way_ctx_holder;
static __thread lyra2v3_16way_ctx_holder l2v3_16way_ctx;
bool init_lyra2rev3_16way_ctx()
{
blake256_16way_init( &l2v3_16way_ctx.blake );
cube_4way_init( &l2v3_16way_ctx.cube, 256, 16, 32 );
bmw256_16way_init( &l2v3_16way_ctx.bmw );
return true;
}
void lyra2rev3_16way_hash( void *state, const void *input )
{
uint32_t vhash[16*8] __attribute__ ((aligned (128)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t hash8[8] __attribute__ ((aligned (64)));
uint32_t hash9[8] __attribute__ ((aligned (64)));
uint32_t hash10[8] __attribute__ ((aligned (64)));
uint32_t hash11[8] __attribute__ ((aligned (64)));
uint32_t hash12[8] __attribute__ ((aligned (64)));
uint32_t hash13[8] __attribute__ ((aligned (64)));
uint32_t hash14[8] __attribute__ ((aligned (64)));
uint32_t hash15[8] __attribute__ ((aligned (64)));
lyra2v3_16way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_16way_ctx, sizeof(l2v3_16way_ctx) );
blake256_16way_update( &ctx.blake, input + (64*16), 16 );
blake256_16way_close( &ctx.blake, vhash );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
vhash, 256 );
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hash2, hash3, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hash4, hash5, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
intrlv_2x256( vhash, hash8, hash9, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash8, hash9, vhash, 256 );
intrlv_2x256( vhash, hash10, hash11, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash10, hash11, vhash, 256 );
intrlv_2x256( vhash, hash12, hash13, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash12, hash13, vhash, 256 );
intrlv_2x256( vhash, hash14, hash15, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash14, hash15, vhash, 256 );
intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 256 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 256 );
intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 256 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhash, 256 );
intrlv_4x128( vhash, hash8, hash9, hash10, hash11, 256 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
dintrlv_4x128( hash8, hash9, hash10, hash11, vhash, 256 );
intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
dintrlv_4x128( hash12, hash13, hash14, hash15, vhash, 256 );
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hash2, hash3, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hash4, hash5, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
intrlv_2x256( vhash, hash8, hash9, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash8, hash9, vhash, 256 );
intrlv_2x256( vhash, hash10, hash11, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash10, hash11, vhash, 256 );
intrlv_2x256( vhash, hash12, hash13, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash12, hash13, vhash, 256 );
intrlv_2x256( vhash, hash14, hash15, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash14, hash15, vhash, 256 );
intrlv_16x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );
bmw256_16way_update( &ctx.bmw, vhash, 32 );
bmw256_16way_close( &ctx.bmw, state );
}
int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
uint32_t *hash7 = &hash[7<<4];
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 16;
const uint32_t Htarg = ptarget[7];
__m512i *noncev = (__m512i*)vdata + 19; // aligned
const int thr_id = mythr->id;
if ( opt_benchmark ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
mm512_bswap32_intrlv80_16x32( vdata, pdata );
blake256_16way_init( &l2v3_16way_ctx.blake );
blake256_16way_update( &l2v3_16way_ctx.blake, vdata, 64 );
do
{
*noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12,
n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4,
n+ 3, n+ 2, n+ 1, n ) );
lyra2rev3_16way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hash7[lane] <= Htarg ) )
{
extr_lane_16x32( lane_hash, hash, lane, 256 );
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 16;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce;
return 0;
}
#elif defined (LYRA2REV3_8WAY)
typedef struct {
blake256_8way_context blake;
@@ -37,7 +209,7 @@ void lyra2rev3_8way_hash( void *state, const void *input )
lyra2v3_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_8way_ctx, sizeof(l2v3_8way_ctx) );
blake256_8way( &ctx.blake, input + (64*8), 16 );
blake256_8way_update( &ctx.blake, input + (64*8), 16 );
blake256_8way_close( &ctx.blake, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3,
@@ -80,7 +252,7 @@ void lyra2rev3_8way_hash( void *state, const void *input )
intrlv_8x32( vhash, hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, 256 );
bmw256_8way( &ctx.bmw, vhash, 32 );
bmw256_8way_update( &ctx.bmw, vhash, 32 );
bmw256_8way_close( &ctx.bmw, state );
}
@@ -105,7 +277,7 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake256_8way_init( &l2v3_8way_ctx.blake );
blake256_8way( &l2v3_8way_ctx.blake, vdata, 64 );
blake256_8way_update( &l2v3_8way_ctx.blake, vdata, 64 );
do
{
@@ -162,8 +334,7 @@ void lyra2rev3_4way_hash( void *state, const void *input )
lyra2v3_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_4way_ctx, sizeof(l2v3_4way_ctx) );
// blake256_4way( &ctx.blake, input, 80 );
blake256_4way( &ctx.blake, input + (64*4), 16 );
blake256_4way_update( &ctx.blake, input + (64*4), 16 );
blake256_4way_close( &ctx.blake, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
@@ -186,7 +357,7 @@ void lyra2rev3_4way_hash( void *state, const void *input )
LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_update( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, state );
}
@@ -211,7 +382,7 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
mm128_bswap32_intrlv80_4x32( vdata, pdata );
blake256_4way_init( &l2v3_4way_ctx.blake );
blake256_4way( &l2v3_4way_ctx.blake, vdata, 64 );
blake256_4way_update( &l2v3_4way_ctx.blake, vdata, 64 );
do
{

View File

@@ -1,13 +1,240 @@
#include "lyra2-gate.h"
#ifdef LYRA2Z_4WAY
#include <memory.h>
#include <mm_malloc.h>
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake-hash-4way.h"
#if defined(LYRA2Z_16WAY)
__thread uint64_t* lyra2z_16way_matrix;
bool lyra2z_16way_thread_init()
{
return ( lyra2z_16way_matrix = _mm_malloc( 2*LYRA2Z_MATRIX_SIZE, 64 ) );
}
static __thread blake256_16way_context l2z_16way_blake_mid;
void lyra2z_16way_midstate( const void* input )
{
blake256_16way_init( &l2z_16way_blake_mid );
blake256_16way_update( &l2z_16way_blake_mid, input, 64 );
}
void lyra2z_16way_hash( void *state, const void *input )
{
uint32_t vhash[8*16] __attribute__ ((aligned (128)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t hash8[8] __attribute__ ((aligned (64)));
uint32_t hash9[8] __attribute__ ((aligned (64)));
uint32_t hash10[8] __attribute__ ((aligned (64)));
uint32_t hash11[8] __attribute__ ((aligned (64)));
uint32_t hash12[8] __attribute__ ((aligned (64)));
uint32_t hash13[8] __attribute__ ((aligned (64)));
uint32_t hash14[8] __attribute__ ((aligned (64)));
uint32_t hash15[8] __attribute__ ((aligned (64)));
blake256_16way_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2z_16way_blake_mid, sizeof l2z_16way_blake_mid );
blake256_16way_update( &ctx_blake, input + (64*16), 16 );
blake256_16way_close( &ctx_blake, vhash );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
vhash, 256 );
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hash2, hash3, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hash4, hash5, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
intrlv_2x256( vhash, hash8, hash9, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash8, hash9, vhash, 256 );
intrlv_2x256( vhash, hash10, hash11, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash10, hash11, vhash, 256 );
intrlv_2x256( vhash, hash12, hash13, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash12, hash13, vhash, 256 );
intrlv_2x256( vhash, hash14, hash15, 256 );
LYRA2Z_2WAY( lyra2z_16way_matrix, vhash, 32, vhash, 32, 8, 8, 8 );
dintrlv_2x256( hash14, hash15, vhash, 256 );
memcpy( state, hash0, 32 );
memcpy( state+ 32, hash1, 32 );
memcpy( state+ 64, hash2, 32 );
memcpy( state+ 96, hash3, 32 );
memcpy( state+128, hash4, 32 );
memcpy( state+160, hash5, 32 );
memcpy( state+192, hash6, 32 );
memcpy( state+224, hash7, 32 );
memcpy( state+256, hash8, 32 );
memcpy( state+288, hash9, 32 );
memcpy( state+320, hash10, 32 );
memcpy( state+352, hash11, 32 );
memcpy( state+384, hash12, 32 );
memcpy( state+416, hash13, 32 );
memcpy( state+448, hash14, 32 );
memcpy( state+480, hash15, 32 );
}
int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
__m512i *noncev = (__m512i*)vdata + 19; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
mm512_bswap32_intrlv80_16x32( vdata, pdata );
lyra2z_16way_midstate( vdata );
do {
*noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12,
n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4,
n+ 3, n+ 2, n+ 1, n ) );
lyra2z_16way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 16; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
&& !opt_benchmark )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 16;
} while ( (n < max_nonce-16) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}
#elif defined(LYRA2Z_8WAY)
__thread uint64_t* lyra2z_8way_matrix;
bool lyra2z_8way_thread_init()
{
return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
}
static __thread blake256_8way_context l2z_8way_blake_mid;
void lyra2z_8way_midstate( const void* input )
{
blake256_8way_init( &l2z_8way_blake_mid );
blake256_8way_update( &l2z_8way_blake_mid, input, 64 );
}
void lyra2z_8way_hash( void *state, const void *input )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake256_8way_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid );
blake256_8way_update( &ctx_blake, input + (64*8), 16 );
blake256_8way_close( &ctx_blake, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
LYRA2Z( lyra2z_8way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash4, 32, hash4, 32, hash4, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash5, 32, hash5, 32, hash5, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash6, 32, hash6, 32, hash6, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash7, 32, hash7, 32, hash7, 32, 8, 8, 8 );
memcpy( state, hash0, 32 );
memcpy( state+ 32, hash1, 32 );
memcpy( state+ 64, hash2, 32 );
memcpy( state+ 96, hash3, 32 );
memcpy( state+128, hash4, 32 );
memcpy( state+160, hash5, 32 );
memcpy( state+192, hash6, 32 );
memcpy( state+224, hash7, 32 );
}
int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 19; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
lyra2z_8way_midstate( vdata );
do {
*noncev = mm256_bswap_32(
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
lyra2z_8way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 8; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
&& !opt_benchmark )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 8;
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}
#elif defined(LYRA2Z_4WAY)
__thread uint64_t* lyra2z_4way_matrix;
bool lyra2z_4way_thread_init()
@@ -20,7 +247,7 @@ static __thread blake256_4way_context l2z_4way_blake_mid;
void lyra2z_4way_midstate( const void* input )
{
blake256_4way_init( &l2z_4way_blake_mid );
blake256_4way( &l2z_4way_blake_mid, input, 64 );
blake256_4way_update( &l2z_4way_blake_mid, input, 64 );
}
void lyra2z_4way_hash( void *state, const void *input )
@@ -33,7 +260,7 @@ void lyra2z_4way_hash( void *state, const void *input )
blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
blake256_4way( &ctx_blake, input + (64*4), 16 );
blake256_4way_update( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
@@ -85,100 +312,3 @@ int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
#endif
#if defined(LYRA2Z_8WAY)
__thread uint64_t* lyra2z_8way_matrix;
bool lyra2z_8way_thread_init()
{
return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
}
static __thread blake256_8way_context l2z_8way_blake_mid;
void lyra2z_8way_midstate( const void* input )
{
blake256_8way_init( &l2z_8way_blake_mid );
blake256_8way( &l2z_8way_blake_mid, input, 64 );
}
void lyra2z_8way_hash( void *state, const void *input )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake256_8way_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid );
blake256_8way( &ctx_blake, input + (64*8), 16 );
blake256_8way_close( &ctx_blake, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
LYRA2Z( lyra2z_8way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash4, 32, hash4, 32, hash4, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash5, 32, hash5, 32, hash5, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash6, 32, hash6, 32, hash6, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash7, 32, hash7, 32, hash7, 32, 8, 8, 8 );
memcpy( state, hash0, 32 );
memcpy( state+ 32, hash1, 32 );
memcpy( state+ 64, hash2, 32 );
memcpy( state+ 96, hash3, 32 );
memcpy( state+128, hash4, 32 );
memcpy( state+160, hash5, 32 );
memcpy( state+192, hash6, 32 );
memcpy( state+224, hash7, 32 );
}
int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 19; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
lyra2z_8way_midstate( vdata );
do {
*noncev = mm256_bswap_32(
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
lyra2z_8way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 8; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
&& !opt_benchmark )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 8;
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif

View File

@@ -19,7 +19,7 @@
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "algo-gate.h"
//#include "algo-gate.h"
#include <string.h>
#include <stdio.h>
#include <time.h>
@@ -40,19 +40,26 @@ inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
//Squeezes full blocks
for ( i = 0; i < fullBlocks; i++ )
{
memcpy_512( out, state, BLOCK_LEN_M256I*2 );
LYRA_ROUND_2WAY_AVX2( state[0], state[1], state[2], state[3] );
out += BLOCK_LEN_M256I*2;
memcpy_512( out, state, BLOCK_LEN_M256I );
LYRA_ROUND_2WAY_AVX512( state[0], state[1], state[2], state[3] );
out += BLOCK_LEN_M256I;
}
//Squeezes remaining bytes
memcpy_512( out, state, ( (len_m256i % BLOCK_LEN_M256I) * 2 ) );
memcpy_512( out, state, len_m256i % BLOCK_LEN_M256I );
}
inline void absorbBlock_2way( uint64_t *State, const uint64_t *In )
inline void absorbBlock_2way( uint64_t *State, const uint64_t *In0,
const uint64_t *In1 )
{
register __m512i state0, state1, state2, state3;
__m512i *in = (__m512i*)In;
__m512i in[3];
casti_m256i( in, 0 ) = casti_m256i( In0, 0 );
casti_m256i( in, 1 ) = casti_m256i( In1, 1 );
casti_m256i( in, 2 ) = casti_m256i( In0, 2 );
casti_m256i( in, 3 ) = casti_m256i( In1, 3 );
casti_m256i( in, 4 ) = casti_m256i( In0, 4 );
casti_m256i( in, 5 ) = casti_m256i( In1, 5 );
state0 = _mm512_load_si512( (__m512i*)State );
state1 = _mm512_load_si512( (__m512i*)State + 1 );
state2 = _mm512_load_si512( (__m512i*)State + 2 );
@@ -90,7 +97,7 @@ inline void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In,
state1 = _mm512_xor_si512( state1, in[1] );
LYRA_12_ROUNDS_2WAY_AVX512( state0, state1, state2, state3 );
In += block_len * 2;
In += block_len*2;
}
_mm512_store_si512( (__m512i*)State, state0 );
@@ -109,7 +116,7 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
register __m512i state0, state1, state2, state3;
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 );
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
state0 = _mm512_load_si512( (__m512i*)State );
state1 = _mm512_load_si512( (__m512i*)State + 1 );
@@ -126,13 +133,13 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
{
_mm_prefetch( out - 9, _MM_HINT_T0 );
_mm_prefetch( out - 11, _MM_HINT_T0 );
out[0] = state0;
out[1] = state1;
out[2] = state2;
//Goes to next block (column) that will receive the squeezed data
out -= BLOCK_LEN_M256I * 2;
out -= BLOCK_LEN_M256I;
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
}
@@ -143,15 +150,14 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
_mm512_store_si512( (__m512i*)State + 3, state3 );
}
// This function has to deal with gathering two 256 bit rowIn vectors from
// non-contiguous memory, which adds extra work and a performance penalty.
inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
uint64_t *rowOut, uint64_t nCols )
{
int i;
register __m512i state0, state1, state2, state3;
__m512i *in = (__m256i*)rowIn;
__m512i *in = (__m512i*)rowIn;
__m512i *out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
state0 = _mm512_load_si512( (__m512i*)State );
state1 = _mm512_load_si512( (__m512i*)State + 1 );
@@ -171,28 +177,25 @@ inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
out[2] = _mm512_xor_si512( state2, in[2] );
//Input: next column (i.e., next block in sequence)
in0 += BLOCK_LEN_M256I;
in1 += BLOCK_LEN_M256I;
in += BLOCK_LEN_M256I;
//Output: goes to previous column
out -= BLOCK_LEN_M256I * 2;
out -= BLOCK_LEN_M256I;
}
_mm512_store_si256( (__m512i*)State, state0 );
_mm512_store_si256( (__m512i*)State + 1, state1 );
_mm512_store_si256( (__m512i*)State + 2, state2 );
_mm512_store_si256( (__m512i*)State + 3, state3 );
}
_mm512_store_si512( (__m512i*)State, state0 );
_mm512_store_si512( (__m512i*)State + 1, state1 );
_mm512_store_si512( (__m512i*)State + 2, state2 );
_mm512_store_si512( (__m512i*)State + 3, state3 );
}
inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols )
{
int i;
register __m512i state0, state1, state2, state3;
__m512i* in = (__m512i*)rowIn;
__m512i* inout = (__m512i*)rowInOut;
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 );
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
__m512i t0, t1, t2;
state0 = _mm512_load_si512( (__m512i*)State );
@@ -209,7 +212,7 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
state2 = _mm512_xor_si512( state2,
_mm512_add_epi64( in[2], inout[2] ) );
LYRA_ROUND_2WAY AVX512( state0, state1, state2, state3 );
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
out[0] = _mm512_xor_si512( state0, in[0] );
out[1] = _mm512_xor_si512( state1, in[1] );
@@ -221,17 +224,18 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
t2 = _mm512_permutex_epi64( state2, 0x93 );
inout[0] = _mm512_xor_si512( inout[0],
_mm512_mask_blend_epi32( t0, t2, 0x03 ) );
_mm512_mask_blend_epi32( 0x0303, t0, t2 ) );
inout[1] = _mm512_xor_si512( inout[1],
_mm512_mask_blend_epi32( t1, t0, 0x03 ) );
_mm512_mask_blend_epi32( 0x0303, t1, t0 ) );
inout[2] = _mm512_xor_si512( inout[2],
_mm512_mask_blend_epi32( t2, t1, 0x03 ) );
_mm512_mask_blend_epi32( 0x0303, t2, t1 ) );
//Inputs: next column (i.e., next block in sequence)
in += BLOCK_LEN_M256I * 2;
inout += BLOCK_LEN_M256I * 2;
in += BLOCK_LEN_M256I;
inout += BLOCK_LEN_M256I;
//Output: goes to previous column
out -= BLOCK_LEN_M256I * 2;
out -= BLOCK_LEN_M256I;
}
_mm512_store_si512( (__m512i*)State, state0 );
@@ -240,49 +244,61 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
_mm512_store_si512( (__m512i*)State + 3, state3 );
}
inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn1,
uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut,
uint64_t nCols )
// big ugly workaround for pointer aliasing: use a union of pointers.
// Access the matrix as m512i for in and out, m256i for inout
inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
uint64_t *rowInOut0, uint64_t *rowInOut1,
uint64_t *rowOut, uint64_t nCols)
{
int i;
register __m512i state0, state1, state2, state3;
__m256i *in0 = (__m256i*)rowIn0;
__m256i *in0 = (__m256i*)rowIn0;
__m2512* in = (__m512i*)rowIn;
__m2512* inout = (__m512i*)rowInOut;
__m512i* out = (__m512i*)rowOut;
__m512i t0, t1, t2;
__m512i *in = (__m512i*)rowIn;
__m256i *inout0 = (__m256i*)rowInOut0;
__m256i *inout1 = (__m256i*)rowInOut1;
__m512i *out = (__m512i*)rowOut;
__m512i io[3];
povly inout;
inout.v512 = &io[0];
__m512i t0, t1, t2;
_mm_prefetch( in0, _MM_HINT_T0 );
_mm_prefetch( in1, _MM_HINT_T0 );
_mm_prefetch( in0 + 2, _MM_HINT_T0 );
_mm_prefetch( in1 + 2, _MM_HINT_T0 );
_mm_prefetch( in0 + 4, _MM_HINT_T0 );
_mm_prefetch( in1 + 4, _MM_HINT_T0 );
_mm_prefetch( in0 + 6, _MM_HINT_T0 );
_mm_prefetch( in1 + 6, _MM_HINT_T0 );
state0 = _mm512_load_si512( (__m512i*)State );
state1 = _mm512_load_si512( (__m512i*)State + 1 );
state2 = _mm512_load_si512( (__m512i*)State + 2 );
state3 = _mm512_load_si512( (__m512i*)State + 3 );
_mm_prefetch( in, _MM_HINT_T0 );
_mm_prefetch( inout0, _MM_HINT_T0 );
_mm_prefetch( inout1, _MM_HINT_T0 );
_mm_prefetch( in + 2, _MM_HINT_T0 );
_mm_prefetch( inout0 + 2, _MM_HINT_T0 );
_mm_prefetch( inout1 + 2, _MM_HINT_T0 );
_mm_prefetch( in + 4, _MM_HINT_T0 );
_mm_prefetch( inout0 + 4, _MM_HINT_T0 );
_mm_prefetch( inout1 + 4, _MM_HINT_T0 );
_mm_prefetch( in + 6, _MM_HINT_T0 );
_mm_prefetch( inout0 + 6, _MM_HINT_T0 );
_mm_prefetch( inout1 + 6, _MM_HINT_T0 );
for ( i = 0; i < nCols; i++ )
{
//Absorbing "M[prev] [+] M[row*]"
inout.v256[0] = inout0[0];
inout.v256[1] = inout1[1];
inout.v256[2] = inout0[2];
inout.v256[3] = inout1[3];
inout.v256[4] = inout0[4];
inout.v256[5] = inout1[5];
// state0 = _mm512_xor_si512( state0, mm512_concat_256( in1[0], in0[0] );
// state1 = _mm512_xor_si512( state1, mm512_concat_256( in1[1], in0[1] );
// state2 = _mm512_xor_si512( state2, mm512_concat_256( in1[2], in0[2] );
t0 = mm512_concat_256( in1[0], in0[0] );
t1 = mm512_concat_256( in1[1], in0[1] );
t2 = mm512_concat_256( in1[2], in0[2] );
state0 = _mm512_xor_si512( state0,
_mm512_add_epi64( t0, inout[0] ) );
_mm512_add_epi64( in[0], inout.v512[0] ) );
state1 = _mm512_xor_si512( state1,
_mm512_add_epi64( t1, inout[1] ) );
_mm512_add_epi64( in[1], inout.v512[1] ) );
state2 = _mm512_xor_si512( state2,
_mm512_add_epi64( t2, inout[2] ) );
_mm512_add_epi64( in[2], inout.v512[2] ) );
//Applies the reduced-round transformation f to the sponge's state
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
@@ -292,22 +308,44 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn1,
out[1] = _mm512_xor_si512( out[1], state1 );
out[2] = _mm512_xor_si512( out[2], state2 );
// if inout is the same row as out it was just overwritten, reload.
if ( rowOut == rowInOut0 )
{
inout.v256[0] = inout0[0];
inout.v256[2] = inout0[2];
inout.v256[4] = inout0[4];
}
if ( rowOut == rowInOut1 )
{
inout.v256[1] = inout1[1];
inout.v256[3] = inout1[3];
inout.v256[5] = inout1[5];
}
//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
t0 = _mm512_permutex_epi64( state0, 0x93 );
t1 = _mm512_permutex_epi64( state1, 0x93 );
t2 = _mm512_permutex_epi64( state2, 0x93 );
inout[0] = _mm512_xor_si512( inout[0],
_mm512_mask_blend_epi32( t0, t2, 0x03 ) );
inout[1] = _mm512_xor_si512( inout[1],
_mm512_mask_blend_epi32( t1, t0, 0x03 ) );
inout[2] = _mm512_xor_si512( inout[2],
_mm512_mask_blend_epi32( t2, t1, 0x03 ) );
inout.v512[0] = _mm512_xor_si512( inout.v512[0],
_mm512_mask_blend_epi32( 0x0303, t0, t2 ) );
inout.v512[1] = _mm512_xor_si512( inout.v512[1],
_mm512_mask_blend_epi32( 0x0303, t1, t0 ) );
inout.v512[2] = _mm512_xor_si512( inout.v512[2],
_mm512_mask_blend_epi32( 0x0303, t2, t1 ) );
inout0[0] = inout.v256[0];
inout1[1] = inout.v256[1];
inout0[2] = inout.v256[2];
inout1[3] = inout.v256[3];
inout0[4] = inout.v256[4];
inout1[5] = inout.v256[5];
//Goes to next block
in += BLOCK_LEN_M256I * 2;
out += BLOCK_LEN_M256I * 2;
inout += BLOCK_LEN_M256I * 2;
in += BLOCK_LEN_M256I;
inout0 += BLOCK_LEN_M256I * 2;
inout1 += BLOCK_LEN_M256I * 2;
out += BLOCK_LEN_M256I;
}
_mm512_store_si512( (__m512i*)State, state0 );

View File

@@ -375,7 +375,10 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
{
_mm_prefetch( out - 9, _MM_HINT_T0 );
_mm_prefetch( out - 11, _MM_HINT_T0 );
//printf("S RSR0 col= %d, out= %x\n",i,out);
out[0] = state0;
out[1] = state1;
out[2] = state2;
@@ -706,11 +709,34 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
out[1] = _mm256_xor_si256( state1, in[1] );
out[2] = _mm256_xor_si256( state2, in[2] );
/*
printf("s duplexsetup col= %d\n",i);
uint64_t * o = (uint64_t*)out;
printf("S out %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]);
printf("S out %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]);
printf("S out %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]);
printf("S out %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]);
printf("S out %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]);
printf("S out %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]);
*/
//M[row*][col] = M[row*][col] XOR rotW(rand)
t0 = _mm256_permute4x64_epi64( state0, 0x93 );
t1 = _mm256_permute4x64_epi64( state1, 0x93 );
t2 = _mm256_permute4x64_epi64( state2, 0x93 );
/*
uint64_t *t = (uint64_t*)&t0;
printf("S t0 %016lx %016lx %016lx %016lx\n",t[0],t[1],t[2],t[3]);
o = (uint64_t*)inout;
printf("S inout0 %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]);
printf("S inout0 %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]);
printf("S inout0 %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]);
printf("S inout0 %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]);
printf("S inout0 %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]);
printf("S inout0 %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]);
*/
inout[0] = _mm256_xor_si256( inout[0],
_mm256_blend_epi32( t0, t2, 0x03 ) );
inout[1] = _mm256_xor_si256( inout[1],
@@ -718,7 +744,17 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
inout[2] = _mm256_xor_si256( inout[2],
_mm256_blend_epi32( t2, t1, 0x03 ) );
//Inputs: next column (i.e., next block in sequence)
/*
o = (uint64_t*)inout;
printf("S inout1 %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]);
printf("S inout1 %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]);
printf("S inout1 %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]);
printf("S inout1 %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]);
printf("S inout1 %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]);
printf("S inout1 %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]);
*/
//Inputs: next column (i.e., next block in sequence)
in += BLOCK_LEN_M256I;
inout += BLOCK_LEN_M256I;
//Output: goes to previous column
@@ -949,6 +985,22 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,
_mm_prefetch( inout + 9, _MM_HINT_T0 );
_mm_prefetch( inout + 11, _MM_HINT_T0 );
/*
uint64_t *io = (uint64_t*)inout;
uint64_t *ii = (uint64_t*)in;
printf("RDRS1 col= %d\n", i);
printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[0],io[1],io[2],io[3]);
printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[4],io[5],io[6],io[7]);
printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[8],io[9],io[10],io[11]);
printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[12],io[13],io[14],io[15]);
printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[0],ii[1],ii[2],ii[3]);
printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[4],ii[5],ii[6],ii[7]);
printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[8],ii[9],ii[10],ii[11]);
printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[12],ii[13],ii[14],ii[15]);
*/
//Absorbing "M[prev] [+] M[row*]"
state0 = _mm256_xor_si256( state0,
_mm256_add_epi64( in[0], inout[0] ) );

View File

@@ -65,14 +65,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
b = mm512_ror_64( _mm512_xor_si512( b, c ), 63 );
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
G_4X64( s0, s1, s2, s3 ); \
s1 = mm512_ror_1x64( s1); \
s2 = mm512_swap128_256( s2 ); \
s3 = mm512_rol1x64_256( s3 ); \
G_4X64( s0, s1, s2, s3 ); \
s1 = mm512_rol1x64_256( s1 ); \
s2 = mm512_swap128_256( s2 ); \
s3 = mm512_ror1x64_256( s3 );
G2W_4X64( s0, s1, s2, s3 ); \
s1 = mm512_ror256_64( s1); \
s2 = mm512_swap256_128( s2 ); \
s3 = mm512_rol256_64( s3 ); \
G2W_4X64( s0, s1, s2, s3 ); \
s1 = mm512_rol256_64( s1 ); \
s2 = mm512_swap256_128( s2 ); \
s3 = mm512_ror256_64( s3 );
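// Reading of the renamed macro (not part of the diff): one reduced Lyra2
// round applies G column-wise, rotates the 256-bit lanes to diagonalize the
// state, applies G again, then undoes the rotation.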
#define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
@@ -148,14 +148,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_ror1x64_256( s2, s3 ); \
mm128_swap128_256( s4, s5 ); \
mm128_rol1x64_256( s6, s7 ); \
mm128_ror256_64( s2, s3 ); \
mm128_swap256_128( s4, s5 ); \
mm128_rol256_64( s6, s7 ); \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_rol1x64_256( s2, s3 ); \
mm128_swap128_256( s4, s5 ); \
mm128_ror1x64_256( s6, s7 );
mm128_rol256_64( s2, s3 ); \
mm128_swap256_128( s4, s5 ); \
mm128_ror256_64( s6, s7 );
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
@@ -203,24 +203,36 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
union _povly
{
__m512i *v512;
__m256i *v256;
uint64_t *u64;
};
typedef union _povly povly;
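A minimal sketch (not part of this commit, names illustrative) of how the overlay union is used to view one matrix column at both widths, mirroring the gather in reducedDuplexRow_2way:
__m512i io[3];                  // one 2-way column, 3 blocks
povly inout;
inout.v512 = io;                // 512-bit view of the whole column
inout.v256[0] = inout0[0];      // 256-bit view to gather per-lane halves
inout.v256[1] = inout1[1];      // inout0/inout1: per-lane __m256i row pointers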
//---- Housekeeping
void initState_2way( uint64_t state[/*16*/] );
void initState_2way( uint64_t State[/*16*/] );
//---- Squeezes
void squeeze_2way( uint64_t *state, unsigned char *out, unsigned int len );
void squeeze_2way( uint64_t *State, unsigned char *out, unsigned int len );
void reducedSqueezeRow0_2way( uint64_t* state, uint64_t* row, uint64_t nCols );
//---- Absorbs
void absorbBlock_2way( uint64_t *state, const uint64_t *in );
void absorbBlockBlake2Safe_2way( uint64_t *state, const uint64_t *in,
void absorbBlock_2way( uint64_t *State, const uint64_t *In0,
const uint64_t *In1 );
void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In,
const uint64_t nBlocks, const uint64_t block_len );
//---- Duplexes
void reducedDuplexRow1_2way( uint64_t *state, uint64_t *rowIn,
void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
uint64_t *rowOut, uint64_t nCols);
void reducedDuplexRowSetup_2way( uint64_t *state, uint64_t *rowIn,
void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );
void reducedDuplexRow_2way(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
uint64_t *rowInOut0, uint64_t *rowInOut1,
uint64_t *rowOut, uint64_t nCols);
#endif

View File

@@ -133,7 +133,7 @@ void nist5hash_4way( void *out, const void *input )
keccak512_4way_context ctx_keccak;
blake512_4way_init( &ctx_blake );
blake512_4way( &ctx_blake, input, 80 );
blake512_4way_update( &ctx_blake, input, 80 );
blake512_4way_close( &ctx_blake, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -154,15 +154,15 @@ void nist5hash_4way( void *out, const void *input )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
jh512_4way_init( &ctx_jh );
jh512_4way( &ctx_jh, vhash, 64 );
jh512_4way_update( &ctx_jh, vhash, 64 );
jh512_4way_close( &ctx_jh, vhash );
keccak512_4way_init( &ctx_keccak );
keccak512_4way( &ctx_keccak, vhash, 64 );
keccak512_4way_update( &ctx_keccak, vhash, 64 );
keccak512_4way_close( &ctx_keccak, vhash );
skein512_4way_init( &ctx_skein );
skein512_4way( &ctx_skein, vhash, 64 );
skein512_4way_update( &ctx_skein, vhash, 64 );
skein512_4way_close( &ctx_skein, out );
}

View File

@@ -54,10 +54,10 @@ void anime_4way_hash( void *state, const void *input )
anime_4way_ctx_holder ctx;
memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) );
bmw512_4way( &ctx.bmw, input, 80 );
bmw512_4way_update( &ctx.bmw, input, 80 );
bmw512_4way_close( &ctx.bmw, vhash );
blake512_4way( &ctx.blake, vhash, 64 );
blake512_4way_update( &ctx.blake, vhash, 64 );
blake512_4way_close( &ctx.blake, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
@@ -92,7 +92,7 @@ void anime_4way_hash( void *state, const void *input )
if ( mm256_anybits0( vh_mask ) )
{
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhashB );
}
@@ -111,7 +111,7 @@ void anime_4way_hash( void *state, const void *input )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
@@ -119,23 +119,23 @@ void anime_4way_hash( void *state, const void *input )
if ( mm256_anybits1( vh_mask ) )
{
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, 64 );
blake512_4way_update( &ctx.blake, vhash, 64 );
blake512_4way_close( &ctx.blake, vhashA );
}
if ( mm256_anybits0( vh_mask ) )
{
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhashB );
}
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
@@ -143,13 +143,13 @@ void anime_4way_hash( void *state, const void *input )
if ( mm256_anybits1( vh_mask ) )
{
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhashA );
}
if ( mm256_anybits0( vh_mask ) )
{
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhashB );
}

File diff suppressed because it is too large

View File

@@ -2,7 +2,10 @@
bool register_hmq1725_algo( algo_gate_t* gate )
{
#if defined(HMQ1725_4WAY)
#if defined(HMQ1725_8WAY)
gate->scanhash = (void*)&scanhash_hmq1725_8way;
gate->hash = (void*)&hmq1725_8way_hash;
#elif defined(HMQ1725_4WAY)
gate->scanhash = (void*)&scanhash_hmq1725_4way;
gate->hash = (void*)&hmq1725_4way_hash;
#else
@@ -10,7 +13,7 @@ bool register_hmq1725_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_hmq1725;
gate->hash = (void*)&hmq1725hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
opt_target_factor = 65536.0;
return true;
};

View File

@@ -4,13 +4,21 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
// #define HMQ1725_4WAY 1
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define HMQ1725_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define HMQ1725_4WAY 1
#endif
bool register_hmq1725_algo( algo_gate_t* gate );
#if defined(HMQ1725_4WAY)
#if defined(HMQ1725_8WAY)
void hmq1725_8way_hash( void *state, const void *input );
int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(HMQ1725_4WAY)
void hmq1725_4way_hash( void *state, const void *input );
int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,

View File

@@ -333,6 +333,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
if (((hash64[7]&0xFFFFFFFF)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
@@ -346,6 +347,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
if (((hash64[7]&0xFFFFFFF0)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
@@ -359,6 +361,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
if (((hash64[7]&0xFFFFFF00)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
@@ -372,6 +375,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
if (((hash64[7]&0xFFFFF000)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
@@ -386,6 +390,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
if (((hash64[7]&0xFFFF0000)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
@@ -399,6 +404,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
hmq1725hash(hash64, endiandata);
if (fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);

View File

@@ -9,16 +9,23 @@
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"
#endif
#if defined (QUARK_8WAY)
typedef struct {
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
jh512_8way_context jh;
skein512_8way_context skein;
keccak512_8way_context keccak;
#if defined(__VAES__)
groestl512_4way_context groestl;
#else
hashState_groestl groestl;
#endif
} quark_8way_ctx_holder;
quark_8way_ctx_holder quark_8way_ctx __attribute__ ((aligned (128)));
@@ -27,10 +34,14 @@ void init_quark_8way_ctx()
{
blake512_8way_init( &quark_8way_ctx.blake );
bmw512_8way_init( &quark_8way_ctx.bmw );
init_groestl( &quark_8way_ctx.groestl, 64 );
skein512_8way_init( &quark_8way_ctx.skein );
jh512_8way_init( &quark_8way_ctx.jh );
keccak512_8way_init( &quark_8way_ctx.keccak );
#if defined(__VAES__)
groestl512_4way_init( &quark_8way_ctx.groestl, 64 );
#else
init_groestl( &quark_8way_ctx.groestl, 64 );
#endif
}
void quark_8way_hash( void *state, const void *input )
@@ -38,6 +49,7 @@ void quark_8way_hash( void *state, const void *input )
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
uint64_t vhashC[8*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -49,6 +61,7 @@ void quark_8way_hash( void *state, const void *input )
__m512i* vh = (__m512i*)vhash;
__m512i* vhA = (__m512i*)vhashA;
__m512i* vhB = (__m512i*)vhashB;
__m512i* vhC = (__m512i*)vhashC;
__mmask8 vh_mask;
quark_8way_ctx_holder ctx;
const uint32_t mask = 8;
@@ -63,23 +76,28 @@ void quark_8way_hash( void *state, const void *input )
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
// AVX512 cmpeq returns a bit mask instead of a vector mask.
// This should simplify things but the logic doesn't seem to be working.
// The problem appears to be related to the test that skips a hash when it
// isn't needed. Skipping that test for all 8-way hashes seems to have
// fixed it. The hash selection blending works when the hash is produced,
// but the hash wasn't being produced when it should have been.
// Both decisions are based on the same data, the __mmask8. It works as a
// blend mask but not in a logical comparison; maybe the type is the
// problem and a cast to int or a movm is needed to make it work.
// It's now moot because the hash can only be skipped 1 in 256 iterations
// when hashing 8 ways in parallel, so the performance impact of the
// workaround should be negligible. It's a problem for another day.
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
zero );
#if defined(__VAES__)
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
if ( ( vh_mask & 0x0f ) != 0x0f )
{
groestl512_4way_init( &ctx.groestl, 64 );
groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 );
}
if ( ( vh_mask & 0xf0 ) != 0xf0 )
{
groestl512_4way_init( &ctx.groestl, 64 );
groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 );
}
rintrlv_4x128_8x64( vhashC, vhashA, vhashB, 512 );
#else
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, 512 );
@@ -131,16 +149,31 @@ void quark_8way_hash( void *state, const void *input )
(char*)hash7, 512 );
}
intrlv_8x64( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
intrlv_8x64( vhashC, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, 512 );
#endif
if ( vh_mask & 0xff )
{
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhashB );
}
mm512_blend_hash_8x64( vh, vhA, vhB, vh_mask );
mm512_blend_hash_8x64( vh, vhC, vhB, vh_mask );
#if defined(__VAES__)
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
groestl512_4way_init( &ctx.groestl, 64 );
groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 );
groestl512_4way_init( &ctx.groestl, 64 );
groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 );
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, 512 );
@@ -165,6 +198,8 @@ void quark_8way_hash( void *state, const void *input )
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
512 );
#endif
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
@@ -303,10 +338,10 @@ void quark_4way_hash( void *state, const void *input )
memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) );
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_update( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
@@ -341,7 +376,7 @@ void quark_4way_hash( void *state, const void *input )
if ( mm256_anybits1( vh_mask ) )
{
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhashB );
}
@@ -360,7 +395,7 @@ void quark_4way_hash( void *state, const void *input )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
@@ -368,24 +403,24 @@ void quark_4way_hash( void *state, const void *input )
if ( mm256_anybits0( vh_mask ) )
{
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, 64 );
blake512_4way_update( &ctx.blake, vhash, 64 );
blake512_4way_close( &ctx.blake, vhashA );
}
if ( mm256_anybits1( vh_mask ) )
{
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhashB );
}
mm256_blend_hash_4x64( vh, vhA, vhB, vh_mask );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), zero );
@@ -393,14 +428,14 @@ void quark_4way_hash( void *state, const void *input )
if ( mm256_anybits0( vh_mask ) )
{
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhashA );
}
if ( mm256_anybits1( vh_mask ) )
{
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhashB );
}

View File

@@ -15,7 +15,7 @@ bool register_quark_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_quark;
gate->hash = (void*)&quark_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
return true;
};

View File

@@ -9,6 +9,10 @@
#include "algo/simd/simd-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/echo/aes_ni/hash_api.h"
#if defined(__VAES__)
#include "algo/shavite/shavite-hash-4way.h"
#include "algo/echo/echo-hash-4way.h"
#endif
#if defined(QUBIT_4WAY)
@@ -16,10 +20,14 @@ typedef struct
{
luffa_4way_context luffa;
cube_4way_context cube;
sph_shavite512_context shavite;
simd_4way_context simd;
simd_2way_context simd2;
#if defined(__VAES__)
shavite512_4way_context shavite;
echo_4way_context echo;
#else
sph_shavite512_context shavite;
hashState_echo echo;
#endif
} qubit_4way_ctx_holder;
qubit_4way_ctx_holder qubit_4way_ctx;
@@ -27,10 +35,14 @@ qubit_4way_ctx_holder qubit_4way_ctx;
void init_qubit_4way_ctx()
{
cube_4way_init( &qubit_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init(&qubit_4way_ctx.shavite);
simd_4way_init( &qubit_4way_ctx.simd, 512 );
simd_2way_init( &qubit_4way_ctx.simd2, 512 );
init_echo(&qubit_4way_ctx.echo, 512);
#if defined(__VAES__)
shavite512_4way_init( &qubit_4way_ctx.shavite );
echo_4way_init( &qubit_4way_ctx.echo, 512 );
#else
sph_shavite512_init( &qubit_4way_ctx.shavite );
init_echo( &qubit_4way_ctx.echo, 512 );
#endif
};
void qubit_4way_hash( void *output, const void *input )
@@ -48,6 +60,13 @@ void qubit_4way_hash( void *output, const void *input )
luffa_4way_close( &ctx.luffa, vhash );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
#if defined(__VAES__)
shavite512_4way_update_close( &ctx.shavite, vhash, vhash, 64 );
#else
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
sph_shavite512( &ctx.shavite, hash0, 64 );
@@ -66,33 +85,45 @@ void qubit_4way_hash( void *output, const void *input )
sph_shavite512_close( &ctx.shavite, hash3 );
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
#endif
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
#if defined(__VAES__)
echo_4way_update_close( &ctx.echo, vhash, vhash, 512 );
dintrlv_4x128( output, output+32, output+64, output+96, vhash, 256 );
#else
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
update_final_echo( &ctx.echo, (BitSequence*)hash0,
(const BitSequence*)hash0, 512 );
memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
update_final_echo( &ctx.echo, (BitSequence*)hash1,
(const BitSequence*)hash1, 512 );
memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
update_final_echo( &ctx.echo, (BitSequence*)hash2,
(const BitSequence*)hash2, 512 );
memcpy( &ctx.echo, &qubit_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
update_final_echo( &ctx.echo, (BitSequence*)hash3,
(const BitSequence*)hash3, 512 );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
#endif
}
int scanhash_qubit_4way( struct work *work,uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (128)));
uint32_t hash[8*4] __attribute__ ((aligned (128)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];

View File

@@ -16,7 +16,7 @@ bool register_qubit_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_qubit;
gate->hash = (void*)&qubit_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
return true;
};

View File

@@ -7,16 +7,147 @@
#include "ripemd-hash-4way.h"
#define LBRY_INPUT_SIZE 112
#define LBRY_MIDSTATE 64
#define LBRY_MIDSTATE 96
#define LBRY_TAIL (LBRY_INPUT_SIZE) - (LBRY_MIDSTATE)
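A quick sanity check of the new constants, using only the defines above:
// LBRY_TAIL = LBRY_INPUT_SIZE - LBRY_MIDSTATE = 112 - 96 = 16
// i.e. only the last 16 bytes are hashed per nonce after the cached midstate.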
#if defined(LBRY_8WAY)
#if defined(LBRY_16WAY)
static __thread sha256_16way_context sha256_16w_mid;
void lbry_16way_hash( void* output, const void* input )
{
uint32_t _ALIGN(128) vhashA[16<<4];
uint32_t _ALIGN(64) vhashB[16<<4];
uint32_t _ALIGN(64) vhashC[16<<4];
uint32_t _ALIGN(64) h0[32];
uint32_t _ALIGN(64) h1[32];
uint32_t _ALIGN(64) h2[32];
uint32_t _ALIGN(64) h3[32];
uint32_t _ALIGN(64) h4[32];
uint32_t _ALIGN(64) h5[32];
uint32_t _ALIGN(64) h6[32];
uint32_t _ALIGN(64) h7[32];
uint32_t _ALIGN(64) h8[32];
uint32_t _ALIGN(64) h9[32];
uint32_t _ALIGN(64) h10[32];
uint32_t _ALIGN(64) h11[32];
uint32_t _ALIGN(64) h12[32];
uint32_t _ALIGN(64) h13[32];
uint32_t _ALIGN(64) h14[32];
uint32_t _ALIGN(64) h15[32];
sha256_16way_context ctx_sha256 __attribute__ ((aligned (64)));
sha512_8way_context ctx_sha512;
ripemd160_16way_context ctx_ripemd;
memcpy( &ctx_sha256, &sha256_16w_mid, sizeof(ctx_sha256) );
sha256_16way_update( &ctx_sha256, input + (LBRY_MIDSTATE<<4), LBRY_TAIL );
sha256_16way_close( &ctx_sha256, vhashA );
sha256_16way_init( &ctx_sha256 );
sha256_16way_update( &ctx_sha256, vhashA, 32 );
sha256_16way_close( &ctx_sha256, vhashA );
// reinterleave to do sha512 8-way 64 bit twice.
dintrlv_16x32( h0, h1, h2, h3, h4, h5, h6, h7,
h8, h9, h10, h11, h12, h13, h14, h15, vhashA, 256 );
intrlv_8x64( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 256 );
intrlv_8x64( vhashB, h8, h9, h10, h11, h12, h13, h14, h15, 256 );
sha512_8way_init( &ctx_sha512 );
sha512_8way_update( &ctx_sha512, vhashA, 32 );
sha512_8way_close( &ctx_sha512, vhashA );
sha512_8way_init( &ctx_sha512 );
sha512_8way_update( &ctx_sha512, vhashB, 32 );
sha512_8way_close( &ctx_sha512, vhashB );
// back to 16-way 32 bit
dintrlv_8x64( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 512 );
dintrlv_8x64( h8, h9, h10, h11, h12, h13, h14, h15, vhashB, 512 );
intrlv_16x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7,
h8, h9, h10, h11, h12, h13, h14, h15, 512 );
ripemd160_16way_init( &ctx_ripemd );
ripemd160_16way_update( &ctx_ripemd, vhashA, 32 );
ripemd160_16way_close( &ctx_ripemd, vhashB );
ripemd160_16way_init( &ctx_ripemd );
ripemd160_16way_update( &ctx_ripemd, vhashA+(8<<4), 32 );
ripemd160_16way_close( &ctx_ripemd, vhashC );
sha256_16way_init( &ctx_sha256 );
sha256_16way_update( &ctx_sha256, vhashB, 20 );
sha256_16way_update( &ctx_sha256, vhashC, 20 );
sha256_16way_close( &ctx_sha256, vhashA );
sha256_16way_init( &ctx_sha256 );
sha256_16way_update( &ctx_sha256, vhashA, 32 );
sha256_16way_close( &ctx_sha256, output );
}
int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[32*16] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t edata[32] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<4]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[27];
const uint32_t first_nonce = pdata[27];
const uint32_t last_nonce = max_nonce - 16;
const uint32_t Htarg = ptarget[7];
__m512i *noncev = (__m512i*)vdata + 27; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
// we need bigendian data...
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) );
casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
intrlv_16x32( vdata, edata, edata, edata, edata, edata, edata, edata,
edata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 );
sha256_16way_init( &sha256_16w_mid );
sha256_16way_update( &sha256_16w_mid, vdata, LBRY_MIDSTATE );
do
{
*noncev = mm512_bswap_32( _mm512_set_epi32(
n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ) );
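// The 16 lane nonces n .. n+15 are byte-swapped to big-endian and written
// into header word 27 of every lane before hashing.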
lbry_16way_hash( hash, vdata );
for ( int i = 0; i < 16; i++ )
if ( unlikely( hash7[ i ] <= Htarg ) )
{
// deinterleave hash for lane
extr_lane_16x32( lane_hash, hash, i, 256 );
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[27] = n + i;
submit_lane_solution( work, lane_hash, mythr, i );
}
}
n += 16;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(LBRY_8WAY)
static __thread sha256_8way_context sha256_8w_mid;
void lbry_8way_hash( void* output, const void* input )
{
uint32_t _ALIGN(64) vhashA[16<<3];
uint32_t _ALIGN(128) vhashA[16<<3];
uint32_t _ALIGN(64) vhashB[16<<3];
uint32_t _ALIGN(64) vhashC[16<<3];
uint32_t _ALIGN(32) h0[32];
@@ -32,11 +163,11 @@ void lbry_8way_hash( void* output, const void* input )
ripemd160_8way_context ctx_ripemd;
memcpy( &ctx_sha256, &sha256_8w_mid, sizeof(ctx_sha256) );
sha256_8way( &ctx_sha256, input + (LBRY_MIDSTATE<<3), LBRY_TAIL );
sha256_8way_update( &ctx_sha256, input + (LBRY_MIDSTATE<<3), LBRY_TAIL );
sha256_8way_close( &ctx_sha256, vhashA );
sha256_8way_init( &ctx_sha256 );
sha256_8way( &ctx_sha256, vhashA, 32 );
sha256_8way_update( &ctx_sha256, vhashA, 32 );
sha256_8way_close( &ctx_sha256, vhashA );
// reinterleave to do sha512 4-way 64 bit twice.
@@ -45,11 +176,11 @@ void lbry_8way_hash( void* output, const void* input )
intrlv_4x64( vhashB, h4, h5, h6, h7, 256 );
sha512_4way_init( &ctx_sha512 );
sha512_4way( &ctx_sha512, vhashA, 32 );
sha512_4way_update( &ctx_sha512, vhashA, 32 );
sha512_4way_close( &ctx_sha512, vhashA );
sha512_4way_init( &ctx_sha512 );
sha512_4way( &ctx_sha512, vhashB, 32 );
sha512_4way_update( &ctx_sha512, vhashB, 32 );
sha512_4way_close( &ctx_sha512, vhashB );
// back to 8-way 32 bit
@@ -58,20 +189,20 @@ void lbry_8way_hash( void* output, const void* input )
intrlv_8x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 512 );
ripemd160_8way_init( &ctx_ripemd );
ripemd160_8way( &ctx_ripemd, vhashA, 32 );
ripemd160_8way_update( &ctx_ripemd, vhashA, 32 );
ripemd160_8way_close( &ctx_ripemd, vhashB );
ripemd160_8way_init( &ctx_ripemd );
ripemd160_8way( &ctx_ripemd, vhashA+(8<<3), 32 );
ripemd160_8way_update( &ctx_ripemd, vhashA+(8<<3), 32 );
ripemd160_8way_close( &ctx_ripemd, vhashC );
sha256_8way_init( &ctx_sha256 );
sha256_8way( &ctx_sha256, vhashB, 20 );
sha256_8way( &ctx_sha256, vhashC, 20 );
sha256_8way_update( &ctx_sha256, vhashB, 20 );
sha256_8way_update( &ctx_sha256, vhashC, 20 );
sha256_8way_close( &ctx_sha256, vhashA );
sha256_8way_init( &ctx_sha256 );
sha256_8way( &ctx_sha256, vhashA, 32 );
sha256_8way_update( &ctx_sha256, vhashA, 32 );
sha256_8way_close( &ctx_sha256, output );
}
@@ -81,21 +212,16 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t vdata[32*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t edata[32] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[27];
const uint32_t first_nonce = pdata[27];
const uint32_t Htarg = ptarget[7];
uint32_t edata[32] __attribute__ ((aligned (64)));
__m256i *noncev = (__m256i*)vdata + 27; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// we need bigendian data...
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
@@ -106,33 +232,30 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) );
casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) );
intrlv_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 1024 );
edata, edata, edata, edata, 1024 );
sha256_8way_init( &sha256_8w_mid );
sha256_8way( &sha256_8w_mid, vdata, LBRY_MIDSTATE );
sha256_8way_update( &sha256_8w_mid, vdata, LBRY_MIDSTATE );
for ( int m = 0; m < sizeof(masks); m++ ) if ( Htarg <= htmax[m] )
do
{
uint32_t mask = masks[m];
do
{
*noncev = mm256_bswap_32( _mm256_set_epi32(
n+7,n+6,n+5,n+4,n+3,n+2,n+1,n ) );
lbry_8way_hash( hash, vdata );
*noncev = mm256_bswap_32( _mm256_set_epi32(
n+7,n+6,n+5,n+4,n+3,n+2,n+1,n ) );
lbry_8way_hash( hash, vdata );
for ( int i = 0; i < 8; i++ ) if ( !( hash7[ i ] & mask ) )
for ( int i = 0; i < 8; i++ )
if ( unlikely( hash7[ i ] <= Htarg ) )
{
// deinterleave hash for lane
extr_lane_8x32( lane_hash, hash, i, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
// deinterleave hash for lane
extr_lane_8x32( lane_hash, hash, i, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[27] = n + i;
submit_lane_solution( work, lane_hash, mythr, i );
}
pdata[27] = n + i;
submit_lane_solution( work, lane_hash, mythr, i );
}
n += 8;
} while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
break;
}
}
n += 8;
} while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return 0;
}

View File

@@ -98,16 +98,23 @@ int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; }
bool register_lbry_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | SHA_OPT;
#if defined (LBRY_8WAY)
// gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
#if defined (LBRY_16WAY)
gate->scanhash = (void*)&scanhash_lbry_16way;
gate->hash = (void*)&lbry_16way_hash;
gate->optimizations = AVX2_OPT | AVX512_OPT;
#elif defined (LBRY_8WAY)
gate->scanhash = (void*)&scanhash_lbry_8way;
gate->hash = (void*)&lbry_8way_hash;
gate->optimizations = AVX2_OPT | AVX512_OPT;
#elif defined (LBRY_4WAY)
gate->scanhash = (void*)&scanhash_lbry_4way;
gate->hash = (void*)&lbry_4way_hash;
gate->optimizations = AVX2_OPT | AVX512_OPT;
#else
gate->scanhash = (void*)&scanhash_lbry;
gate->hash = (void*)&lbry_hash;
gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT;
#endif
gate->calc_network_diff = (void*)&lbry_calc_network_diff;
gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;

View File

@@ -4,11 +4,19 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define LBRY_16WAY 1
#elif defined(__AVX2__)
#define LBRY_8WAY 1
#endif
/*
#if !defined(__SHA__)
#if defined(__AVX2__)
#define LBRY_8WAY
#endif
#endif
*/
#define LBRY_NTIME_INDEX 25
#define LBRY_NBITS_INDEX 26
@@ -18,18 +26,23 @@
bool register_lbry_algo( algo_gate_t* gate );
#if defined(LBRY_8WAY)
#if defined(LBRY_16WAY)
void lbry_16way_hash( void *state, const void *input );
int scanhash_lbry_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(LBRY_8WAY)
void lbry_8way_hash( void *state, const void *input );
int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
/*
#elif defined(LBRY_4WAY)
void lbry_4way_hash( void *state, const void *input );
int scanhash_lbry_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
*/
#else
void lbry_hash( void *state, const void *input );

View File

@@ -80,9 +80,6 @@ int scanhash_lbry( struct work *work, uint32_t max_nonce,
// we need bigendian data...
swab32_array( endiandata, pdata, 32 );
#ifdef DEBUG_ALGO
printf("[%d] Htarg=%X\n", thr_id, Htarg);
#endif
for (int m=0; m < sizeof(masks); m++) {
if (Htarg <= htmax[m]) {
uint32_t mask = masks[m];
@@ -90,23 +87,11 @@ int scanhash_lbry( struct work *work, uint32_t max_nonce,
pdata[27] = ++n;
be32enc(&endiandata[27], n);
lbry_hash(hash64, &endiandata);
#ifndef DEBUG_ALGO
if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
return true;
pdata[27] = n;
submit_solution( work, hash64, mythr );
}
#else
if (!(n % 0x1000) && !thr_id) printf(".");
if (!(hash64[7] & mask)) {
printf("[%d]",thr_id);
if (fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
return true;
}
}
#endif
} while (n < max_nonce && !work_restart[thr_id].restart);
// see blake.c if else to understand the loop on htmax => mask
} while ( (n < max_nonce -8) && !work_restart[thr_id].restart);
break;
}
}

View File

@@ -259,7 +259,8 @@ void ripemd160_4way_init( ripemd160_4way_context *sc )
sc->count_high = sc->count_low = 0;
}
void ripemd160_4way( ripemd160_4way_context *sc, const void *data, size_t len )
void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data,
size_t len )
{
__m128i *vdata = (__m128i*)data;
size_t ptr;
@@ -559,7 +560,8 @@ void ripemd160_8way_init( ripemd160_8way_context *sc )
sc->count_high = sc->count_low = 0;
}
void ripemd160_8way( ripemd160_8way_context *sc, const void *data, size_t len )
void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
size_t len )
{
__m256i *vdata = (__m256i*)data;
size_t ptr;
@@ -623,3 +625,303 @@ void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst )
#endif // __AVX2__
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// RIPEMD-160 16 way
#define F16W_1(x, y, z) \
_mm512_xor_si512( _mm512_xor_si512( x, y ), z )
#define F16W_2(x, y, z) \
_mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( y, z ), x ), z )
#define F16W_3(x, y, z) \
_mm512_xor_si512( _mm512_or_si512( x, mm512_not( y ) ), z )
#define F16W_4(x, y, z) \
_mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( x, y ), z ), y )
#define F16W_5(x, y, z) \
_mm512_xor_si512( x, _mm512_or_si512( y, mm512_not( z ) ) )
#define RR_16W(a, b, c, d, e, f, s, r, k) \
do{ \
a = _mm512_add_epi32( mm512_rol_32( _mm512_add_epi32( _mm512_add_epi32( \
_mm512_add_epi32( a, f( b ,c, d ) ), r ), \
m512_const1_64( k ) ), s ), e ); \
c = mm512_rol_32( c, 10 );\
} while (0)
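// Each RR_16W step computes, across all 16 lanes:
//   a = rol32( a + f(b,c,d) + msg + K, s ) + e;   c = rol32( c, 10 );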
#define ROUND1_16W(a, b, c, d, e, f, s, r, k) \
RR_16W(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k)
#define ROUND2_16W(a, b, c, d, e, f, s, r, k) \
RR_16W(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
static void ripemd160_16way_round( ripemd160_16way_context *sc )
{
const __m512i *in = (__m512i*)sc->buf;
__m512i *h = (__m512i*)sc->val;
register __m512i A1, B1, C1, D1, E1;
register __m512i A2, B2, C2, D2, E2;
__m512i tmp;
A1 = A2 = h[0];
B1 = B2 = h[1];
C1 = C2 = h[2];
D1 = D2 = h[3];
E1 = E2 = h[4];
ROUND1_16W( A, B, C, D, E, F16W_1, 11, in[ 0], 1 );
ROUND1_16W( E, A, B, C, D, F16W_1, 14, in[ 1], 1 );
ROUND1_16W( D, E, A, B, C, F16W_1, 15, in[ 2], 1 );
ROUND1_16W( C, D, E, A, B, F16W_1, 12, in[ 3], 1 );
ROUND1_16W( B, C, D, E, A, F16W_1, 5, in[ 4], 1 );
ROUND1_16W( A, B, C, D, E, F16W_1, 8, in[ 5], 1 );
ROUND1_16W( E, A, B, C, D, F16W_1, 7, in[ 6], 1 );
ROUND1_16W( D, E, A, B, C, F16W_1, 9, in[ 7], 1 );
ROUND1_16W( C, D, E, A, B, F16W_1, 11, in[ 8], 1 );
ROUND1_16W( B, C, D, E, A, F16W_1, 13, in[ 9], 1 );
ROUND1_16W( A, B, C, D, E, F16W_1, 14, in[10], 1 );
ROUND1_16W( E, A, B, C, D, F16W_1, 15, in[11], 1 );
ROUND1_16W( D, E, A, B, C, F16W_1, 6, in[12], 1 );
ROUND1_16W( C, D, E, A, B, F16W_1, 7, in[13], 1 );
ROUND1_16W( B, C, D, E, A, F16W_1, 9, in[14], 1 );
ROUND1_16W( A, B, C, D, E, F16W_1, 8, in[15], 1 );
ROUND1_16W( E, A, B, C, D, F16W_2, 7, in[ 7], 2 );
ROUND1_16W( D, E, A, B, C, F16W_2, 6, in[ 4], 2 );
ROUND1_16W( C, D, E, A, B, F16W_2, 8, in[13], 2 );
ROUND1_16W( B, C, D, E, A, F16W_2, 13, in[ 1], 2 );
ROUND1_16W( A, B, C, D, E, F16W_2, 11, in[10], 2 );
ROUND1_16W( E, A, B, C, D, F16W_2, 9, in[ 6], 2 );
ROUND1_16W( D, E, A, B, C, F16W_2, 7, in[15], 2 );
ROUND1_16W( C, D, E, A, B, F16W_2, 15, in[ 3], 2 );
ROUND1_16W( B, C, D, E, A, F16W_2, 7, in[12], 2 );
ROUND1_16W( A, B, C, D, E, F16W_2, 12, in[ 0], 2 );
ROUND1_16W( E, A, B, C, D, F16W_2, 15, in[ 9], 2 );
ROUND1_16W( D, E, A, B, C, F16W_2, 9, in[ 5], 2 );
ROUND1_16W( C, D, E, A, B, F16W_2, 11, in[ 2], 2 );
ROUND1_16W( B, C, D, E, A, F16W_2, 7, in[14], 2 );
ROUND1_16W( A, B, C, D, E, F16W_2, 13, in[11], 2 );
ROUND1_16W( E, A, B, C, D, F16W_2, 12, in[ 8], 2 );
ROUND1_16W( D, E, A, B, C, F16W_3, 11, in[ 3], 3 );
ROUND1_16W( C, D, E, A, B, F16W_3, 13, in[10], 3 );
ROUND1_16W( B, C, D, E, A, F16W_3, 6, in[14], 3 );
ROUND1_16W( A, B, C, D, E, F16W_3, 7, in[ 4], 3 );
ROUND1_16W( E, A, B, C, D, F16W_3, 14, in[ 9], 3 );
ROUND1_16W( D, E, A, B, C, F16W_3, 9, in[15], 3 );
ROUND1_16W( C, D, E, A, B, F16W_3, 13, in[ 8], 3 );
ROUND1_16W( B, C, D, E, A, F16W_3, 15, in[ 1], 3 );
ROUND1_16W( A, B, C, D, E, F16W_3, 14, in[ 2], 3 );
ROUND1_16W( E, A, B, C, D, F16W_3, 8, in[ 7], 3 );
ROUND1_16W( D, E, A, B, C, F16W_3, 13, in[ 0], 3 );
ROUND1_16W( C, D, E, A, B, F16W_3, 6, in[ 6], 3 );
ROUND1_16W( B, C, D, E, A, F16W_3, 5, in[13], 3 );
ROUND1_16W( A, B, C, D, E, F16W_3, 12, in[11], 3 );
ROUND1_16W( E, A, B, C, D, F16W_3, 7, in[ 5], 3 );
ROUND1_16W( D, E, A, B, C, F16W_3, 5, in[12], 3 );
ROUND1_16W( C, D, E, A, B, F16W_4, 11, in[ 1], 4 );
ROUND1_16W( B, C, D, E, A, F16W_4, 12, in[ 9], 4 );
ROUND1_16W( A, B, C, D, E, F16W_4, 14, in[11], 4 );
ROUND1_16W( E, A, B, C, D, F16W_4, 15, in[10], 4 );
ROUND1_16W( D, E, A, B, C, F16W_4, 14, in[ 0], 4 );
ROUND1_16W( C, D, E, A, B, F16W_4, 15, in[ 8], 4 );
ROUND1_16W( B, C, D, E, A, F16W_4, 9, in[12], 4 );
ROUND1_16W( A, B, C, D, E, F16W_4, 8, in[ 4], 4 );
ROUND1_16W( E, A, B, C, D, F16W_4, 9, in[13], 4 );
ROUND1_16W( D, E, A, B, C, F16W_4, 14, in[ 3], 4 );
ROUND1_16W( C, D, E, A, B, F16W_4, 5, in[ 7], 4 );
ROUND1_16W( B, C, D, E, A, F16W_4, 6, in[15], 4 );
ROUND1_16W( A, B, C, D, E, F16W_4, 8, in[14], 4 );
ROUND1_16W( E, A, B, C, D, F16W_4, 6, in[ 5], 4 );
ROUND1_16W( D, E, A, B, C, F16W_4, 5, in[ 6], 4 );
ROUND1_16W( C, D, E, A, B, F16W_4, 12, in[ 2], 4 );
ROUND1_16W( B, C, D, E, A, F16W_5, 9, in[ 4], 5 );
ROUND1_16W( A, B, C, D, E, F16W_5, 15, in[ 0], 5 );
ROUND1_16W( E, A, B, C, D, F16W_5, 5, in[ 5], 5 );
ROUND1_16W( D, E, A, B, C, F16W_5, 11, in[ 9], 5 );
ROUND1_16W( C, D, E, A, B, F16W_5, 6, in[ 7], 5 );
ROUND1_16W( B, C, D, E, A, F16W_5, 8, in[12], 5 );
ROUND1_16W( A, B, C, D, E, F16W_5, 13, in[ 2], 5 );
ROUND1_16W( E, A, B, C, D, F16W_5, 12, in[10], 5 );
ROUND1_16W( D, E, A, B, C, F16W_5, 5, in[14], 5 );
ROUND1_16W( C, D, E, A, B, F16W_5, 12, in[ 1], 5 );
ROUND1_16W( B, C, D, E, A, F16W_5, 13, in[ 3], 5 );
ROUND1_16W( A, B, C, D, E, F16W_5, 14, in[ 8], 5 );
ROUND1_16W( E, A, B, C, D, F16W_5, 11, in[11], 5 );
ROUND1_16W( D, E, A, B, C, F16W_5, 8, in[ 6], 5 );
ROUND1_16W( C, D, E, A, B, F16W_5, 5, in[15], 5 );
ROUND1_16W( B, C, D, E, A, F16W_5, 6, in[13], 5 );
ROUND2_16W( A, B, C, D, E, F16W_5, 8, in[ 5], 1 );
ROUND2_16W( E, A, B, C, D, F16W_5, 9, in[14], 1 );
ROUND2_16W( D, E, A, B, C, F16W_5, 9, in[ 7], 1 );
ROUND2_16W( C, D, E, A, B, F16W_5, 11, in[ 0], 1 );
ROUND2_16W( B, C, D, E, A, F16W_5, 13, in[ 9], 1 );
ROUND2_16W( A, B, C, D, E, F16W_5, 15, in[ 2], 1 );
ROUND2_16W( E, A, B, C, D, F16W_5, 15, in[11], 1 );
ROUND2_16W( D, E, A, B, C, F16W_5, 5, in[ 4], 1 );
ROUND2_16W( C, D, E, A, B, F16W_5, 7, in[13], 1 );
ROUND2_16W( B, C, D, E, A, F16W_5, 7, in[ 6], 1 );
ROUND2_16W( A, B, C, D, E, F16W_5, 8, in[15], 1 );
ROUND2_16W( E, A, B, C, D, F16W_5, 11, in[ 8], 1 );
ROUND2_16W( D, E, A, B, C, F16W_5, 14, in[ 1], 1 );
ROUND2_16W( C, D, E, A, B, F16W_5, 14, in[10], 1 );
ROUND2_16W( B, C, D, E, A, F16W_5, 12, in[ 3], 1 );
ROUND2_16W( A, B, C, D, E, F16W_5, 6, in[12], 1 );
ROUND2_16W( E, A, B, C, D, F16W_4, 9, in[ 6], 2 );
ROUND2_16W( D, E, A, B, C, F16W_4, 13, in[11], 2 );
ROUND2_16W( C, D, E, A, B, F16W_4, 15, in[ 3], 2 );
ROUND2_16W( B, C, D, E, A, F16W_4, 7, in[ 7], 2 );
ROUND2_16W( A, B, C, D, E, F16W_4, 12, in[ 0], 2 );
ROUND2_16W( E, A, B, C, D, F16W_4, 8, in[13], 2 );
ROUND2_16W( D, E, A, B, C, F16W_4, 9, in[ 5], 2 );
ROUND2_16W( C, D, E, A, B, F16W_4, 11, in[10], 2 );
ROUND2_16W( B, C, D, E, A, F16W_4, 7, in[14], 2 );
ROUND2_16W( A, B, C, D, E, F16W_4, 7, in[15], 2 );
ROUND2_16W( E, A, B, C, D, F16W_4, 12, in[ 8], 2 );
ROUND2_16W( D, E, A, B, C, F16W_4, 7, in[12], 2 );
ROUND2_16W( C, D, E, A, B, F16W_4, 6, in[ 4], 2 );
ROUND2_16W( B, C, D, E, A, F16W_4, 15, in[ 9], 2 );
ROUND2_16W( A, B, C, D, E, F16W_4, 13, in[ 1], 2 );
ROUND2_16W( E, A, B, C, D, F16W_4, 11, in[ 2], 2 );
ROUND2_16W( D, E, A, B, C, F16W_3, 9, in[15], 3 );
ROUND2_16W( C, D, E, A, B, F16W_3, 7, in[ 5], 3 );
ROUND2_16W( B, C, D, E, A, F16W_3, 15, in[ 1], 3 );
ROUND2_16W( A, B, C, D, E, F16W_3, 11, in[ 3], 3 );
ROUND2_16W( E, A, B, C, D, F16W_3, 8, in[ 7], 3 );
ROUND2_16W( D, E, A, B, C, F16W_3, 6, in[14], 3 );
ROUND2_16W( C, D, E, A, B, F16W_3, 6, in[ 6], 3 );
ROUND2_16W( B, C, D, E, A, F16W_3, 14, in[ 9], 3 );
ROUND2_16W( A, B, C, D, E, F16W_3, 12, in[11], 3 );
ROUND2_16W( E, A, B, C, D, F16W_3, 13, in[ 8], 3 );
ROUND2_16W( D, E, A, B, C, F16W_3, 5, in[12], 3 );
ROUND2_16W( C, D, E, A, B, F16W_3, 14, in[ 2], 3 );
ROUND2_16W( B, C, D, E, A, F16W_3, 13, in[10], 3 );
ROUND2_16W( A, B, C, D, E, F16W_3, 13, in[ 0], 3 );
ROUND2_16W( E, A, B, C, D, F16W_3, 7, in[ 4], 3 );
ROUND2_16W( D, E, A, B, C, F16W_3, 5, in[13], 3 );
ROUND2_16W( C, D, E, A, B, F16W_2, 15, in[ 8], 4 );
ROUND2_16W( B, C, D, E, A, F16W_2, 5, in[ 6], 4 );
ROUND2_16W( A, B, C, D, E, F16W_2, 8, in[ 4], 4 );
ROUND2_16W( E, A, B, C, D, F16W_2, 11, in[ 1], 4 );
ROUND2_16W( D, E, A, B, C, F16W_2, 14, in[ 3], 4 );
ROUND2_16W( C, D, E, A, B, F16W_2, 14, in[11], 4 );
ROUND2_16W( B, C, D, E, A, F16W_2, 6, in[15], 4 );
ROUND2_16W( A, B, C, D, E, F16W_2, 14, in[ 0], 4 );
ROUND2_16W( E, A, B, C, D, F16W_2, 6, in[ 5], 4 );
ROUND2_16W( D, E, A, B, C, F16W_2, 9, in[12], 4 );
ROUND2_16W( C, D, E, A, B, F16W_2, 12, in[ 2], 4 );
ROUND2_16W( B, C, D, E, A, F16W_2, 9, in[13], 4 );
ROUND2_16W( A, B, C, D, E, F16W_2, 12, in[ 9], 4 );
ROUND2_16W( E, A, B, C, D, F16W_2, 5, in[ 7], 4 );
ROUND2_16W( D, E, A, B, C, F16W_2, 15, in[10], 4 );
ROUND2_16W( C, D, E, A, B, F16W_2, 8, in[14], 4 );
ROUND2_16W( B, C, D, E, A, F16W_1, 8, in[12], 5 );
ROUND2_16W( A, B, C, D, E, F16W_1, 5, in[15], 5 );
ROUND2_16W( E, A, B, C, D, F16W_1, 12, in[10], 5 );
ROUND2_16W( D, E, A, B, C, F16W_1, 9, in[ 4], 5 );
ROUND2_16W( C, D, E, A, B, F16W_1, 12, in[ 1], 5 );
ROUND2_16W( B, C, D, E, A, F16W_1, 5, in[ 5], 5 );
ROUND2_16W( A, B, C, D, E, F16W_1, 14, in[ 8], 5 );
ROUND2_16W( E, A, B, C, D, F16W_1, 6, in[ 7], 5 );
ROUND2_16W( D, E, A, B, C, F16W_1, 8, in[ 6], 5 );
ROUND2_16W( C, D, E, A, B, F16W_1, 13, in[ 2], 5 );
ROUND2_16W( B, C, D, E, A, F16W_1, 6, in[13], 5 );
ROUND2_16W( A, B, C, D, E, F16W_1, 5, in[14], 5 );
ROUND2_16W( E, A, B, C, D, F16W_1, 15, in[ 0], 5 );
ROUND2_16W( D, E, A, B, C, F16W_1, 13, in[ 3], 5 );
ROUND2_16W( C, D, E, A, B, F16W_1, 11, in[ 9], 5 );
ROUND2_16W( B, C, D, E, A, F16W_1, 11, in[11], 5 );
tmp = _mm512_add_epi32( _mm512_add_epi32( h[1], C1 ), D2 );
h[1] = _mm512_add_epi32( _mm512_add_epi32( h[2], D1 ), E2 );
h[2] = _mm512_add_epi32( _mm512_add_epi32( h[3], E1 ), A2 );
h[3] = _mm512_add_epi32( _mm512_add_epi32( h[4], A1 ), B2 );
h[4] = _mm512_add_epi32( _mm512_add_epi32( h[0], B1 ), C2 );
h[0] = tmp;
}
void ripemd160_16way_init( ripemd160_16way_context *sc )
{
sc->val[0] = m512_const1_64( 0x6745230167452301 );
sc->val[1] = m512_const1_64( 0xEFCDAB89EFCDAB89 );
sc->val[2] = m512_const1_64( 0x98BADCFE98BADCFE );
sc->val[3] = m512_const1_64( 0x1032547610325476 );
sc->val[4] = m512_const1_64( 0xC3D2E1F0C3D2E1F0 );
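// Each 64-bit constant above repeats the 32-bit RIPEMD-160 IV word in both
// halves, so m512_const1_64 fills all sixteen 32-bit lanes with the IV.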
sc->count_high = sc->count_low = 0;
}
void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data,
size_t len )
{
__m512i *vdata = (__m512i*)data;
size_t ptr;
const int block_size = 64;
ptr = (unsigned)sc->count_low & (block_size - 1U);
while ( len > 0 )
{
size_t clen;
uint32_t clow, clow2;
clen = block_size - ptr;
if ( clen > len )
clen = len;
memcpy_512( sc->buf + (ptr>>2), vdata, clen>>2 );
vdata = vdata + (clen>>2);
ptr += clen;
len -= clen;
if ( ptr == block_size )
{
ripemd160_16way_round( sc );
ptr = 0;
}
clow = sc->count_low;
clow2 = clow + clen;
sc->count_low = clow2;
if ( clow2 < clow )
sc->count_high++;
}
}
void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst )
{
unsigned ptr, u;
uint32_t low, high;
const int block_size = 64;
const int pad = block_size - 8;
ptr = (unsigned)sc->count_low & ( block_size - 1U);
sc->buf[ ptr>>2 ] = m512_const1_32( 0x80 );
ptr += 4;
if ( ptr > pad )
{
memset_zero_512( sc->buf + (ptr>>2), (block_size - ptr) >> 2 );
ripemd160_16way_round( sc );
memset_zero_512( sc->buf, pad>>2 );
}
else
memset_zero_512( sc->buf + (ptr>>2), (pad - ptr) >> 2 );
low = sc->count_low;
high = (sc->count_high << 3) | (low >> 29);
low = low << 3;
sc->buf[ pad>>2 ] = _mm512_set1_epi32( low );
sc->buf[ (pad>>2) + 1 ] = _mm512_set1_epi32( high );
ripemd160_16way_round( sc );
for (u = 0; u < 5; u ++)
casti_m512i( dst, u ) = sc->val[u];
}
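A minimal usage sketch of the new 16-way interface (buffer names are illustrative, input assumed 16-way interleaved, lengths in bytes per lane):
__m512i msg[8];                          // 32 bytes per lane x 16 lanes
__m512i digest[5];                       // 20-byte digest per lane
ripemd160_16way_context c;
ripemd160_16way_init( &c );
ripemd160_16way_update( &c, msg, 32 );
ripemd160_16way_close( &c, digest );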
#endif // AVX512

View File

@@ -16,7 +16,8 @@ typedef struct
} __attribute__ ((aligned (64))) ripemd160_4way_context;
void ripemd160_4way_init( ripemd160_4way_context *sc );
void ripemd160_4way( ripemd160_4way_context *sc, const void *data, size_t len );
void ripemd160_4way_update( ripemd160_4way_context *sc, const void *data,
size_t len );
void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst );
#if defined (__AVX2__)
@@ -26,13 +27,28 @@ typedef struct
__m256i buf[64>>2];
__m256i val[5];
uint32_t count_high, count_low;
} __attribute__ ((aligned (64))) ripemd160_8way_context;
} __attribute__ ((aligned (128))) ripemd160_8way_context;
void ripemd160_8way_init( ripemd160_8way_context *sc );
void ripemd160_8way( ripemd160_8way_context *sc, const void *data, size_t len );
void ripemd160_8way_update( ripemd160_8way_context *sc, const void *data,
size_t len );
void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
typedef struct
{
__m512i buf[64>>2];
__m512i val[5];
uint32_t count_high, count_low;
} __attribute__ ((aligned (128))) ripemd160_16way_context;
void ripemd160_16way_init( ripemd160_16way_context *sc );
void ripemd160_16way_update( ripemd160_16way_context *sc, const void *data,
size_t len );
void ripemd160_16way_close( ripemd160_16way_context *sc, void *dst );
#endif // AVX512
#endif // __AVX2__
#endif // __SSE4_2__
#endif // RIPEMD_HASH_4WAY_H__

View File

@@ -41,13 +41,9 @@
#define SHA2_HASH_4WAY_H__ 1
#include <stddef.h>
#include "sph_types.h"
#include "simd-utils.h"
#if defined(__SSE2__)
//#if defined(__SSE4_2__)
//#define SPH_SIZE_sha256 256
// SHA-256 4 way
@@ -56,12 +52,15 @@ typedef struct {
__m128i val[8];
uint32_t count_high, count_low;
bool initialized;
} sha256_4way_context;
} sha256_4way_context __attribute__ ((aligned (64)));
void sha256_4way_init( sha256_4way_context *sc );
void sha256_4way( sha256_4way_context *sc, const void *data, size_t len );
void sha256_4way_update( sha256_4way_context *sc, const void *data,
size_t len );
void sha256_4way_close( sha256_4way_context *sc, void *dst );
#endif // SSE2
#if defined (__AVX2__)
// SHA-256 8 way
@@ -71,13 +70,32 @@ typedef struct {
__m256i val[8];
uint32_t count_high, count_low;
bool initialized;
} sha256_8way_context;
} sha256_8way_context __attribute__ ((aligned (128)));
void sha256_8way_init( sha256_8way_context *sc );
void sha256_8way( sha256_8way_context *sc, const void *data, size_t len );
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len );
void sha256_8way_close( sha256_8way_context *sc, void *dst );
//#define SPH_SIZE_sha512 512
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// SHA-256 16 way
typedef struct {
__m512i buf[64>>2];
__m512i val[8];
uint32_t count_high, count_low;
bool initialized;
} sha256_16way_context __attribute__ ((aligned (128)));
void sha256_16way_init( sha256_16way_context *sc );
void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len );
void sha256_16way_close( sha256_16way_context *sc, void *dst );
#endif // AVX512
#if defined (__AVX2__)
// SHA-512 4 way
@@ -86,30 +104,31 @@ typedef struct {
__m256i val[8];
uint64_t count;
bool initialized;
} sha512_4way_context;
} sha512_4way_context __attribute__ ((aligned (128)));
void sha512_4way_init( sha512_4way_context *sc);
void sha512_4way( sha512_4way_context *sc, const void *data, size_t len );
void sha512_4way_update( sha512_4way_context *sc, const void *data,
size_t len );
void sha512_4way_close( sha512_4way_context *sc, void *dst );
// SHA-256 11 way hybrid
// Combines AVX2, MMX and scalar data to do 8 + 2 + 1 parallel.
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// SHA-512 8 way
typedef struct {
__m256i bufx[64>>2];
__m256i valx[8];
__m64 bufy[64>>2];
__m64 valy[8];
uint32_t bufz[64>>2];
uint32_t valz[8];
uint32_t count_high, count_low;
} sha256_11way_context;
__m512i buf[128>>3];
__m512i val[8];
uint64_t count;
bool initialized;
} sha512_8way_context __attribute__ ((aligned (128)));
void sha256_11way_init( sha256_11way_context *ctx );
void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
const void *datay, const void *dataz, size_t len );
void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dstyx,
void *dstz );
void sha512_8way_init( sha512_8way_context *sc);
void sha512_8way_update( sha512_8way_context *sc, const void *data,
size_t len );
void sha512_8way_close( sha512_8way_context *sc, void *dst );
#endif // AVX512
#endif // __AVX2__
#endif // __SSE2__
#endif // SHA2_HASH_4WAY_H__

View File

@@ -39,47 +39,31 @@
// SHA-256 32 bit
/*
static const sph_u32 H256[8] = {
SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85),
SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A),
SPH_C32(0x510E527F), SPH_C32(0x9B05688C),
SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19)
static const uint32_t H256[8] =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
*/
static const sph_u32 K256[64] = {
SPH_C32(0x428A2F98), SPH_C32(0x71374491),
SPH_C32(0xB5C0FBCF), SPH_C32(0xE9B5DBA5),
SPH_C32(0x3956C25B), SPH_C32(0x59F111F1),
SPH_C32(0x923F82A4), SPH_C32(0xAB1C5ED5),
SPH_C32(0xD807AA98), SPH_C32(0x12835B01),
SPH_C32(0x243185BE), SPH_C32(0x550C7DC3),
SPH_C32(0x72BE5D74), SPH_C32(0x80DEB1FE),
SPH_C32(0x9BDC06A7), SPH_C32(0xC19BF174),
SPH_C32(0xE49B69C1), SPH_C32(0xEFBE4786),
SPH_C32(0x0FC19DC6), SPH_C32(0x240CA1CC),
SPH_C32(0x2DE92C6F), SPH_C32(0x4A7484AA),
SPH_C32(0x5CB0A9DC), SPH_C32(0x76F988DA),
SPH_C32(0x983E5152), SPH_C32(0xA831C66D),
SPH_C32(0xB00327C8), SPH_C32(0xBF597FC7),
SPH_C32(0xC6E00BF3), SPH_C32(0xD5A79147),
SPH_C32(0x06CA6351), SPH_C32(0x14292967),
SPH_C32(0x27B70A85), SPH_C32(0x2E1B2138),
SPH_C32(0x4D2C6DFC), SPH_C32(0x53380D13),
SPH_C32(0x650A7354), SPH_C32(0x766A0ABB),
SPH_C32(0x81C2C92E), SPH_C32(0x92722C85),
SPH_C32(0xA2BFE8A1), SPH_C32(0xA81A664B),
SPH_C32(0xC24B8B70), SPH_C32(0xC76C51A3),
SPH_C32(0xD192E819), SPH_C32(0xD6990624),
SPH_C32(0xF40E3585), SPH_C32(0x106AA070),
SPH_C32(0x19A4C116), SPH_C32(0x1E376C08),
SPH_C32(0x2748774C), SPH_C32(0x34B0BCB5),
SPH_C32(0x391C0CB3), SPH_C32(0x4ED8AA4A),
SPH_C32(0x5B9CCA4F), SPH_C32(0x682E6FF3),
SPH_C32(0x748F82EE), SPH_C32(0x78A5636F),
SPH_C32(0x84C87814), SPH_C32(0x8CC70208),
SPH_C32(0x90BEFFFA), SPH_C32(0xA4506CEB),
SPH_C32(0xBEF9A3F7), SPH_C32(0xC67178F2)
static const uint32_t K256[64] =
{
0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
};
// SHA-256 4 way
@@ -248,7 +232,7 @@ void sha256_4way_init( sha256_4way_context *sc )
*/
}
void sha256_4way( sha256_4way_context *sc, const void *data, size_t len )
void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len )
{
__m128i *vdata = (__m128i*)data;
size_t ptr;
@@ -273,7 +257,7 @@ void sha256_4way( sha256_4way_context *sc, const void *data, size_t len )
ptr = 0;
}
clow = sc->count_low;
clow2 = SPH_T32( clow + clen );
clow2 = clow + clen;
sc->count_low = clow2;
if ( clow2 < clow )
sc->count_high++;
@@ -306,10 +290,8 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
sc->buf[ pad >> 2 ] =
mm128_bswap_32( m128_const1_32( high ) );
// mm128_bswap_32( _mm_set1_epi32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] =
mm128_bswap_32( m128_const1_32( low ) );
// mm128_bswap_32( _mm_set1_epi32( low ) );
sha256_4way_round( sc, sc->buf, sc->val );
mm128_block_bswap_32( dst, sc->val );
@@ -483,7 +465,7 @@ void sha256_8way_init( sha256_8way_context *sc )
*/
}
void sha256_8way( sha256_8way_context *sc, const void *data, size_t len )
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
size_t ptr;
@@ -508,7 +490,7 @@ void sha256_8way( sha256_8way_context *sc, const void *data, size_t len )
ptr = 0;
}
clow = sc->count_low;
clow2 = SPH_T32( clow + clen );
clow2 = clow + clen;
sc->count_low = clow2;
if ( clow2 < clow )
sc->count_high++;
@@ -549,5 +531,233 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
mm256_block_bswap_32( dst, sc->val );
}
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// SHA-256 16 way
#define CHx16(X, Y, Z) \
_mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z )
#define MAJx16(X, Y, Z) \
_mm512_or_si512( _mm512_and_si512( X, Y ), \
_mm512_and_si512( _mm512_or_si512( X, Y ), Z ) )
#define BSG2_0x16(x) \
_mm512_xor_si512( _mm512_xor_si512( \
mm512_ror_32(x, 2), mm512_ror_32(x, 13) ), mm512_ror_32( x, 22) )
#define BSG2_1x16(x) \
_mm512_xor_si512( _mm512_xor_si512( \
mm512_ror_32(x, 6), mm512_ror_32(x, 11) ), mm512_ror_32( x, 25) )
#define SSG2_0x16(x) \
_mm512_xor_si512( _mm512_xor_si512( \
mm512_ror_32(x, 7), mm512_ror_32(x, 18) ), _mm512_srli_epi32(x, 3) )
#define SSG2_1x16(x) \
_mm512_xor_si512( _mm512_xor_si512( \
mm512_ror_32(x, 17), mm512_ror_32(x, 19) ), _mm512_srli_epi32(x, 10) )
#define SHA2x16_MEXP( a, b, c, d ) \
mm512_add4_32( SSG2_1x16( W[a] ), W[b], SSG2_0x16( W[c] ), W[d] );
#define SHA2s_16WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
__m512i T1, T2; \
__m512i K = _mm512_set1_epi32( K256[( (j)+(i) )] ); \
T1 = _mm512_add_epi32( H, mm512_add4_32( BSG2_1x16(E), CHx16(E, F, G), \
K, W[i] ) ); \
T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
D = _mm512_add_epi32( D, T1 ); \
H = _mm512_add_epi32( T1, T2 ); \
} while (0)
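// For reference, each SHA2s_16WAY_STEP computes the standard FIPS 180-4
// SHA-256 round in all 16 lanes at once:
//
//   T1 = h + BSG2_1(e) + Ch(e,f,g) + K256[j+i] + W[i]
//   T2 = BSG2_0(a) + Maj(a,b,c)
//   d += T1;  h = T1 + T2
//
// and SHA2x16_MEXP is the message expansion
//   W[t] = SSG2_1(W[t-2]) + W[t-7] + SSG2_0(W[t-15]) + W[t-16]
// with the indices reduced mod 16 because only a 16 entry window of W is
// kept. The rotated argument order of the sixteen STEP calls below replaces
// the usual a..h register rotation.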
static void
sha256_16way_round( sha256_16way_context *ctx, __m512i *in, __m512i r[8] )
{
register __m512i A, B, C, D, E, F, G, H;
__m512i W[16];
mm512_block_bswap_32( W , in );
mm512_block_bswap_32( W+8, in+8 );
if ( ctx->initialized )
{
A = r[0];
B = r[1];
C = r[2];
D = r[3];
E = r[4];
F = r[5];
G = r[6];
H = r[7];
}
else
{
A = m512_const1_64( 0x6A09E6676A09E667 );
B = m512_const1_64( 0xBB67AE85BB67AE85 );
C = m512_const1_64( 0x3C6EF3723C6EF372 );
D = m512_const1_64( 0xA54FF53AA54FF53A );
E = m512_const1_64( 0x510E527F510E527F );
F = m512_const1_64( 0x9B05688C9B05688C );
G = m512_const1_64( 0x1F83D9AB1F83D9AB );
H = m512_const1_64( 0x5BE0CD195BE0CD19 );
}
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
for ( int j = 16; j < 64; j += 16 )
{
W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 );
W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 );
W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 );
W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 );
W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 );
W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 );
W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 );
W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 );
W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 );
W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 );
W[10] = SHA2x16_MEXP( 8, 3, 11, 10 );
W[11] = SHA2x16_MEXP( 9, 4, 12, 11 );
W[12] = SHA2x16_MEXP( 10, 5, 13, 12 );
W[13] = SHA2x16_MEXP( 11, 6, 14, 13 );
W[14] = SHA2x16_MEXP( 12, 7, 15, 14 );
W[15] = SHA2x16_MEXP( 13, 8, 0, 15 );
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
}
if ( ctx->initialized )
{
r[0] = _mm512_add_epi32( r[0], A );
r[1] = _mm512_add_epi32( r[1], B );
r[2] = _mm512_add_epi32( r[2], C );
r[3] = _mm512_add_epi32( r[3], D );
r[4] = _mm512_add_epi32( r[4], E );
r[5] = _mm512_add_epi32( r[5], F );
r[6] = _mm512_add_epi32( r[6], G );
r[7] = _mm512_add_epi32( r[7], H );
}
else
{
ctx->initialized = true;
r[0] = _mm512_add_epi32( A, m512_const1_64( 0x6A09E6676A09E667 ) );
r[1] = _mm512_add_epi32( B, m512_const1_64( 0xBB67AE85BB67AE85 ) );
r[2] = _mm512_add_epi32( C, m512_const1_64( 0x3C6EF3723C6EF372 ) );
r[3] = _mm512_add_epi32( D, m512_const1_64( 0xA54FF53AA54FF53A ) );
r[4] = _mm512_add_epi32( E, m512_const1_64( 0x510E527F510E527F ) );
r[5] = _mm512_add_epi32( F, m512_const1_64( 0x9B05688C9B05688C ) );
r[6] = _mm512_add_epi32( G, m512_const1_64( 0x1F83D9AB1F83D9AB ) );
r[7] = _mm512_add_epi32( H, m512_const1_64( 0x5BE0CD195BE0CD19 ) );
}
}
void sha256_16way_init( sha256_16way_context *sc )
{
sc->initialized = false;
sc->count_high = sc->count_low = 0;
}
void sha256_16way_update( sha256_16way_context *sc, const void *data,
size_t len )
{
__m512i *vdata = (__m512i*)data;
size_t ptr;
const int buf_size = 64;
ptr = (unsigned)sc->count_low & (buf_size - 1U);
while ( len > 0 )
{
size_t clen;
uint32_t clow, clow2;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_512( sc->buf + (ptr>>2), vdata, clen>>2 );
vdata = vdata + (clen>>2);
ptr += clen;
len -= clen;
if ( ptr == buf_size )
{
sha256_16way_round( sc, sc->buf, sc->val );
ptr = 0;
}
clow = sc->count_low;
clow2 = clow + clen;
sc->count_low = clow2;
if ( clow2 < clow )
sc->count_high++;
}
}
void sha256_16way_close( sha256_16way_context *sc, void *dst )
{
unsigned ptr;
uint32_t low, high;
const int buf_size = 64;
const int pad = buf_size - 8;
ptr = (unsigned)sc->count_low & (buf_size - 1U);
sc->buf[ ptr>>2 ] = m512_const1_64( 0x0000008000000080 );
ptr += 4;
if ( ptr > pad )
{
memset_zero_512( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
sha256_16way_round( sc, sc->buf, sc->val );
memset_zero_512( sc->buf, pad >> 2 );
}
else
memset_zero_512( sc->buf + (ptr>>2), (pad - ptr) >> 2 );
low = sc->count_low;
high = (sc->count_high << 3) | (low >> 29);
low = low << 3;
sc->buf[ pad >> 2 ] =
mm512_bswap_32( m512_const1_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] =
mm512_bswap_32( m512_const1_32( low ) );
sha256_16way_round( sc, sc->buf, sc->val );
mm512_block_bswap_32( dst, sc->val );
}
#endif // AVX512
#endif // __AVX2__
#endif // __SSE2__

View File

@@ -15,19 +15,19 @@ void sha256q_8way_hash( void* output, const void* input )
sha256_8way_context ctx;
memcpy( &ctx, &sha256_ctx8, sizeof ctx );
sha256_8way( &ctx, input + (64<<3), 16 );
sha256_8way_update( &ctx, input + (64<<3), 16 );
sha256_8way_close( &ctx, vhash );
sha256_8way_init( &ctx );
sha256_8way( &ctx, vhash, 32 );
sha256_8way_update( &ctx, vhash, 32 );
sha256_8way_close( &ctx, vhash );
sha256_8way_init( &ctx );
sha256_8way( &ctx, vhash, 32 );
sha256_8way_update( &ctx, vhash, 32 );
sha256_8way_close( &ctx, vhash );
sha256_8way_init( &ctx );
sha256_8way( &ctx, vhash, 32 );
sha256_8way_update( &ctx, vhash, 32 );
sha256_8way_close( &ctx, output );
}
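// sha256q applies SHA-256 four times: the first pass resumes from the midstate
// of the first 64 header bytes (precomputed in scanhash_sha256q_8way below),
// hashes the remaining 16 bytes, then the 32 byte result is rehashed three
// more times.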
@@ -61,7 +61,7 @@ int scanhash_sha256q_8way( struct work *work, uint32_t max_nonce,
// Need big endian data
mm256_bswap32_intrlv80_8x32( vdata, pdata );
sha256_8way_init( &sha256_ctx8 );
sha256_8way( &sha256_ctx8, vdata, 64 );
sha256_8way_update( &sha256_ctx8, vdata, 64 );
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
@@ -108,19 +108,19 @@ void sha256q_4way_hash( void* output, const void* input )
sha256_4way_context ctx;
memcpy( &ctx, &sha256_ctx4, sizeof ctx );
sha256_4way( &ctx, input + (64<<2), 16 );
sha256_4way_update( &ctx, input + (64<<2), 16 );
sha256_4way_close( &ctx, vhash );
sha256_4way_init( &ctx );
sha256_4way( &ctx, vhash, 32 );
sha256_4way_update( &ctx, vhash, 32 );
sha256_4way_close( &ctx, vhash );
sha256_4way_init( &ctx );
sha256_4way( &ctx, vhash, 32 );
sha256_4way_update( &ctx, vhash, 32 );
sha256_4way_close( &ctx, vhash );
sha256_4way_init( &ctx );
sha256_4way( &ctx, vhash, 32 );
sha256_4way_update( &ctx, vhash, 32 );
sha256_4way_close( &ctx, output );
}
@@ -154,7 +154,7 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
mm128_bswap32_intrlv80_4x32( vdata, pdata );
sha256_4way_init( &sha256_ctx4 );
sha256_4way( &sha256_ctx4, vdata, 64 );
sha256_4way_update( &sha256_ctx4, vdata, 64 );
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{

View File

@@ -15,15 +15,15 @@ void sha256t_8way_hash( void* output, const void* input )
sha256_8way_context ctx;
memcpy( &ctx, &sha256_ctx8, sizeof ctx );
sha256_8way( &ctx, input + (64<<3), 16 );
sha256_8way_update( &ctx, input + (64<<3), 16 );
sha256_8way_close( &ctx, vhash );
sha256_8way_init( &ctx );
sha256_8way( &ctx, vhash, 32 );
sha256_8way_update( &ctx, vhash, 32 );
sha256_8way_close( &ctx, vhash );
sha256_8way_init( &ctx );
sha256_8way( &ctx, vhash, 32 );
sha256_8way_update( &ctx, vhash, 32 );
sha256_8way_close( &ctx, output );
}
@@ -59,7 +59,7 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
// Need big endian data
mm256_bswap32_intrlv80_8x32( vdata, pdata );
sha256_8way_init( &sha256_ctx8 );
sha256_8way( &sha256_ctx8, vdata, 64 );
sha256_8way_update( &sha256_ctx8, vdata, 64 );
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
@@ -101,15 +101,15 @@ void sha256t_4way_hash( void* output, const void* input )
sha256_4way_context ctx;
memcpy( &ctx, &sha256_ctx4, sizeof ctx );
sha256_4way( &ctx, input + (64<<2), 16 );
sha256_4way_update( &ctx, input + (64<<2), 16 );
sha256_4way_close( &ctx, vhash );
sha256_4way_init( &ctx );
sha256_4way( &ctx, vhash, 32 );
sha256_4way_update( &ctx, vhash, 32 );
sha256_4way_close( &ctx, vhash );
sha256_4way_init( &ctx );
sha256_4way( &ctx, vhash, 32 );
sha256_4way_update( &ctx, vhash, 32 );
sha256_4way_close( &ctx, output );
}
@@ -143,7 +143,7 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
mm128_bswap32_intrlv80_4x32( vdata, pdata );
sha256_4way_init( &sha256_ctx4 );
sha256_4way( &sha256_ctx4, vdata, 64 );
sha256_4way_update( &sha256_ctx4, vdata, 64 );
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{

View File

@@ -36,60 +36,290 @@
#include <string.h>
#include "sha-hash-4way.h"
// SHA-512 4 way 64 bit
/*
static const sph_u64 H512[8] = {
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
static const uint64_t H512[8] =
{
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
*/
static const sph_u64 K512[80] = {
SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD),
SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC),
SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019),
SPH_C64(0x923F82A4AF194F9B), SPH_C64(0xAB1C5ED5DA6D8118),
SPH_C64(0xD807AA98A3030242), SPH_C64(0x12835B0145706FBE),
SPH_C64(0x243185BE4EE4B28C), SPH_C64(0x550C7DC3D5FFB4E2),
SPH_C64(0x72BE5D74F27B896F), SPH_C64(0x80DEB1FE3B1696B1),
SPH_C64(0x9BDC06A725C71235), SPH_C64(0xC19BF174CF692694),
SPH_C64(0xE49B69C19EF14AD2), SPH_C64(0xEFBE4786384F25E3),
SPH_C64(0x0FC19DC68B8CD5B5), SPH_C64(0x240CA1CC77AC9C65),
SPH_C64(0x2DE92C6F592B0275), SPH_C64(0x4A7484AA6EA6E483),
SPH_C64(0x5CB0A9DCBD41FBD4), SPH_C64(0x76F988DA831153B5),
SPH_C64(0x983E5152EE66DFAB), SPH_C64(0xA831C66D2DB43210),
SPH_C64(0xB00327C898FB213F), SPH_C64(0xBF597FC7BEEF0EE4),
SPH_C64(0xC6E00BF33DA88FC2), SPH_C64(0xD5A79147930AA725),
SPH_C64(0x06CA6351E003826F), SPH_C64(0x142929670A0E6E70),
SPH_C64(0x27B70A8546D22FFC), SPH_C64(0x2E1B21385C26C926),
SPH_C64(0x4D2C6DFC5AC42AED), SPH_C64(0x53380D139D95B3DF),
SPH_C64(0x650A73548BAF63DE), SPH_C64(0x766A0ABB3C77B2A8),
SPH_C64(0x81C2C92E47EDAEE6), SPH_C64(0x92722C851482353B),
SPH_C64(0xA2BFE8A14CF10364), SPH_C64(0xA81A664BBC423001),
SPH_C64(0xC24B8B70D0F89791), SPH_C64(0xC76C51A30654BE30),
SPH_C64(0xD192E819D6EF5218), SPH_C64(0xD69906245565A910),
SPH_C64(0xF40E35855771202A), SPH_C64(0x106AA07032BBD1B8),
SPH_C64(0x19A4C116B8D2D0C8), SPH_C64(0x1E376C085141AB53),
SPH_C64(0x2748774CDF8EEB99), SPH_C64(0x34B0BCB5E19B48A8),
SPH_C64(0x391C0CB3C5C95A63), SPH_C64(0x4ED8AA4AE3418ACB),
SPH_C64(0x5B9CCA4F7763E373), SPH_C64(0x682E6FF3D6B2B8A3),
SPH_C64(0x748F82EE5DEFB2FC), SPH_C64(0x78A5636F43172F60),
SPH_C64(0x84C87814A1F0AB72), SPH_C64(0x8CC702081A6439EC),
SPH_C64(0x90BEFFFA23631E28), SPH_C64(0xA4506CEBDE82BDE9),
SPH_C64(0xBEF9A3F7B2C67915), SPH_C64(0xC67178F2E372532B),
SPH_C64(0xCA273ECEEA26619C), SPH_C64(0xD186B8C721C0C207),
SPH_C64(0xEADA7DD6CDE0EB1E), SPH_C64(0xF57D4F7FEE6ED178),
SPH_C64(0x06F067AA72176FBA), SPH_C64(0x0A637DC5A2C898A6),
SPH_C64(0x113F9804BEF90DAE), SPH_C64(0x1B710B35131C471B),
SPH_C64(0x28DB77F523047D84), SPH_C64(0x32CAAB7B40C72493),
SPH_C64(0x3C9EBE0A15C9BEBC), SPH_C64(0x431D67C49C100D4C),
SPH_C64(0x4CC5D4BECB3E42B6), SPH_C64(0x597F299CFC657E2A),
SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
static const uint64_t K512[80] =
{
0x428A2F98D728AE22, 0x7137449123EF65CD,
0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
0x3956C25BF348B538, 0x59F111F1B605D019,
0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
0xD807AA98A3030242, 0x12835B0145706FBE,
0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1,
0x9BDC06A725C71235, 0xC19BF174CF692694,
0xE49B69C19EF14AD2, 0xEFBE4786384F25E3,
0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
0x2DE92C6F592B0275, 0x4A7484AA6EA6E483,
0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
0x983E5152EE66DFAB, 0xA831C66D2DB43210,
0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
0xC6E00BF33DA88FC2, 0xD5A79147930AA725,
0x06CA6351E003826F, 0x142929670A0E6E70,
0x27B70A8546D22FFC, 0x2E1B21385C26C926,
0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
0x650A73548BAF63DE, 0x766A0ABB3C77B2A8,
0x81C2C92E47EDAEE6, 0x92722C851482353B,
0xA2BFE8A14CF10364, 0xA81A664BBC423001,
0xC24B8B70D0F89791, 0xC76C51A30654BE30,
0xD192E819D6EF5218, 0xD69906245565A910,
0xF40E35855771202A, 0x106AA07032BBD1B8,
0x19A4C116B8D2D0C8, 0x1E376C085141AB53,
0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB,
0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
0x748F82EE5DEFB2FC, 0x78A5636F43172F60,
0x84C87814A1F0AB72, 0x8CC702081A6439EC,
0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9,
0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
0xCA273ECEEA26619C, 0xD186B8C721C0C207,
0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
0x06F067AA72176FBA, 0x0A637DC5A2C898A6,
0x113F9804BEF90DAE, 0x1B710B35131C471B,
0x28DB77F523047D84, 0x32CAAB7B40C72493,
0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A,
0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// SHA-512 8 way 64 bit
#define CH8W(X, Y, Z) \
_mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z )
#define MAJ8W(X, Y, Z) \
_mm512_or_si512( _mm512_and_si512( X, Y ), \
_mm512_and_si512( _mm512_or_si512( X, Y ), Z ) )
#define BSG8W_5_0(x) \
_mm512_xor_si512( _mm512_xor_si512( \
mm512_ror_64(x, 28), mm512_ror_64(x, 34) ), mm512_ror_64(x, 39) )
#define BSG8W_5_1(x) \
_mm512_xor_si512( _mm512_xor_si512( \
mm512_ror_64(x, 14), mm512_ror_64(x, 18) ), mm512_ror_64(x, 41) )
#define SSG8W_5_0(x) \
_mm512_xor_si512( _mm512_xor_si512( \
mm512_ror_64(x, 1), mm512_ror_64(x, 8) ), _mm512_srli_epi64(x, 7) )
#define SSG8W_5_1(x) \
_mm512_xor_si512( _mm512_xor_si512( \
mm512_ror_64(x, 19), mm512_ror_64(x, 61) ), _mm512_srli_epi64(x, 6) )
static inline __m512i ssg8w_512_add( __m512i w0, __m512i w1 )
{
__m512i w0a, w1a, w0b, w1b;
w0a = mm512_ror_64( w0, 1 );
w1a = mm512_ror_64( w1,19 );
w0b = mm512_ror_64( w0, 8 );
w1b = mm512_ror_64( w1,61 );
w0a = _mm512_xor_si512( w0a, w0b );
w1a = _mm512_xor_si512( w1a, w1b );
w0b = _mm512_srli_epi64( w0, 7 );
w1b = _mm512_srli_epi64( w1, 6 );
w0a = _mm512_xor_si512( w0a, w0b );
w1a = _mm512_xor_si512( w1a, w1b );
return _mm512_add_epi64( w0a, w1a );
}
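// Scalar equivalent of ssg8w_512_add (a reference sketch; the sigma names are
// not identifiers from this codebase): it returns sigma0(w0) + sigma1(w1) of
// the SHA-512 message schedule, where
//   sigma0(x) = ROTR(x,1)  ^ ROTR(x,8)  ^ (x >> 7)
//   sigma1(x) = ROTR(x,19) ^ ROTR(x,61) ^ (x >> 6)
// so the schedule below becomes
//   W[i] = sigma0(W[i-15]) + sigma1(W[i-2]) + W[i-7] + W[i-16].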
#define SSG8W_512x2_0( w0, w1, i ) do \
{ \
__m512i X0a, X1a, X0b, X1b; \
X0a = mm512_ror_64( W[i-15], 1 ); \
X1a = mm512_ror_64( W[i-14], 1 ); \
X0b = mm512_ror_64( W[i-15], 8 ); \
X1b = mm512_ror_64( W[i-14], 8 ); \
X0a = _mm512_xor_si512( X0a, X0b ); \
X1a = _mm512_xor_si512( X1a, X1b ); \
X0b = _mm512_srli_epi64( W[i-15], 7 ); \
X1b = _mm512_srli_epi64( W[i-14], 7 ); \
w0 = _mm512_xor_si512( X0a, X0b ); \
w1 = _mm512_xor_si512( X1a, X1b ); \
} while(0)
#define SSG8W_512x2_1( w0, w1, i ) do \
{ \
__m512i X0a, X1a, X0b, X1b; \
X0a = mm512_ror_64( W[i-2],19 ); \
X1a = mm512_ror_64( W[i-1],19 ); \
X0b = mm512_ror_64( W[i-2],61 ); \
X1b = mm512_ror_64( W[i-1],61 ); \
X0a = _mm512_xor_si512( X0a, X0b ); \
X1a = _mm512_xor_si512( X1a, X1b ); \
X0b = _mm512_srli_epi64( W[i-2], 6 ); \
X1b = _mm512_srli_epi64( W[i-1], 6 ); \
w0 = _mm512_xor_si512( X0a, X0b ); \
w1 = _mm512_xor_si512( X1a, X1b ); \
} while(0)
#define SHA3_8WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
__m512i T1, T2; \
__m512i K = _mm512_set1_epi64( K512[ i ] ); \
T1 = _mm512_add_epi64( H, mm512_add4_64( BSG8W_5_1(E), CH8W(E, F, G), \
K, W[i] ) ); \
T2 = _mm512_add_epi64( BSG8W_5_0(A), MAJ8W(A, B, C) ); \
D = _mm512_add_epi64( D, T1 ); \
H = _mm512_add_epi64( T1, T2 ); \
} while (0)
static void
sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] )
{
int i;
register __m512i A, B, C, D, E, F, G, H;
__m512i W[80];
mm512_block_bswap_64( W , in );
mm512_block_bswap_64( W+8, in+8 );
for ( i = 16; i < 80; i++ )
W[i] = _mm512_add_epi64( ssg8w_512_add( W[i-15], W[i-2] ),
_mm512_add_epi64( W[ i- 7 ], W[ i-16 ] ) );
if ( ctx->initialized )
{
A = r[0];
B = r[1];
C = r[2];
D = r[3];
E = r[4];
F = r[5];
G = r[6];
H = r[7];
}
else
{
A = m512_const1_64( 0x6A09E667F3BCC908 );
B = m512_const1_64( 0xBB67AE8584CAA73B );
C = m512_const1_64( 0x3C6EF372FE94F82B );
D = m512_const1_64( 0xA54FF53A5F1D36F1 );
E = m512_const1_64( 0x510E527FADE682D1 );
F = m512_const1_64( 0x9B05688C2B3E6C1F );
G = m512_const1_64( 0x1F83D9ABFB41BD6B );
H = m512_const1_64( 0x5BE0CD19137E2179 );
}
for ( i = 0; i < 80; i += 8 )
{
SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, i + 0 );
SHA3_8WAY_STEP( H, A, B, C, D, E, F, G, i + 1 );
SHA3_8WAY_STEP( G, H, A, B, C, D, E, F, i + 2 );
SHA3_8WAY_STEP( F, G, H, A, B, C, D, E, i + 3 );
SHA3_8WAY_STEP( E, F, G, H, A, B, C, D, i + 4 );
SHA3_8WAY_STEP( D, E, F, G, H, A, B, C, i + 5 );
SHA3_8WAY_STEP( C, D, E, F, G, H, A, B, i + 6 );
SHA3_8WAY_STEP( B, C, D, E, F, G, H, A, i + 7 );
}
if ( ctx->initialized )
{
r[0] = _mm512_add_epi64( r[0], A );
r[1] = _mm512_add_epi64( r[1], B );
r[2] = _mm512_add_epi64( r[2], C );
r[3] = _mm512_add_epi64( r[3], D );
r[4] = _mm512_add_epi64( r[4], E );
r[5] = _mm512_add_epi64( r[5], F );
r[6] = _mm512_add_epi64( r[6], G );
r[7] = _mm512_add_epi64( r[7], H );
}
else
{
ctx->initialized = true;
r[0] = _mm512_add_epi64( A, m512_const1_64( 0x6A09E667F3BCC908 ) );
r[1] = _mm512_add_epi64( B, m512_const1_64( 0xBB67AE8584CAA73B ) );
r[2] = _mm512_add_epi64( C, m512_const1_64( 0x3C6EF372FE94F82B ) );
r[3] = _mm512_add_epi64( D, m512_const1_64( 0xA54FF53A5F1D36F1 ) );
r[4] = _mm512_add_epi64( E, m512_const1_64( 0x510E527FADE682D1 ) );
r[5] = _mm512_add_epi64( F, m512_const1_64( 0x9B05688C2B3E6C1F ) );
r[6] = _mm512_add_epi64( G, m512_const1_64( 0x1F83D9ABFB41BD6B ) );
r[7] = _mm512_add_epi64( H, m512_const1_64( 0x5BE0CD19137E2179 ) );
}
}
void sha512_8way_init( sha512_8way_context *sc )
{
sc->initialized = false;
sc->count = 0;
}
void sha512_8way_update( sha512_8way_context *sc, const void *data, size_t len )
{
__m512i *vdata = (__m512i*)data;
size_t ptr;
const int buf_size = 128;
ptr = (unsigned)sc->count & (buf_size - 1U);
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_512( sc->buf + (ptr>>3), vdata, clen>>3 );
vdata = vdata + (clen>>3);
ptr += clen;
len -= clen;
if ( ptr == buf_size )
{
sha512_8way_round( sc, sc->buf, sc->val );
ptr = 0;
}
sc->count += clen;
}
}
void sha512_8way_close( sha512_8way_context *sc, void *dst )
{
unsigned ptr;
const int buf_size = 128;
const int pad = buf_size - 16;
const __m512i shuff_bswap64 = m512_const_64(
0x38393a3b3c3d3e3f, 0x3031323334353637,
0x28292a2b2c2d2e2f, 0x2021222324252627,
0x18191a1b1c1d1e1f, 0x1011121314151617,
0x08090a0b0c0d0e0f, 0x0001020304050607 );
ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = m512_const1_64( 0x80 );
ptr += 8;
if ( ptr > pad )
{
memset_zero_512( sc->buf + (ptr>>3), (buf_size - ptr) >> 3 );
sha512_8way_round( sc, sc->buf, sc->val );
memset_zero_512( sc->buf, pad >> 3 );
}
else
memset_zero_512( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
sc->buf[ pad >> 3 ] = _mm512_shuffle_epi8(
_mm512_set1_epi64( sc->count >> 61 ), shuff_bswap64 );
sc->buf[ ( pad+8 ) >> 3 ] = _mm512_shuffle_epi8(
_mm512_set1_epi64( sc->count << 3 ), shuff_bswap64 );
sha512_8way_round( sc, sc->buf, sc->val );
mm512_block_bswap_64( dst, sc->val );
}
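// Note on the length encoding above: sc->count holds the total number of input
// bytes, so the 128-bit big-endian bit length appended by the close routine is
// (count >> 61) for the high 64 bits and (count << 3) for the low 64 bits,
// byte swapped by shuff_bswap64 to match SHA-512's big-endian convention.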
#endif // AVX512
// SHA-512 4 way 64 bit
#define CH(X, Y, Z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
@@ -254,7 +484,7 @@ void sha512_4way_init( sha512_4way_context *sc )
sc->count = 0;
}
void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
void sha512_4way_update( sha512_4way_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
size_t ptr;

View File

@@ -33,7 +33,7 @@
#include <stddef.h>
#include <string.h>
#ifdef __AVX2__
#ifdef __SSE4_1__
#include "shabal-hash-4way.h"
#ifdef __cplusplus
@@ -58,6 +58,599 @@ extern "C"{
#define O2 9
#define O3 6
#if defined(__AVX2__)
#define DECL_STATE8 \
__m256i A00, A01, A02, A03, A04, A05, A06, A07, \
A08, A09, A0A, A0B; \
__m256i B0, B1, B2, B3, B4, B5, B6, B7, \
B8, B9, BA, BB, BC, BD, BE, BF; \
__m256i C0, C1, C2, C3, C4, C5, C6, C7, \
C8, C9, CA, CB, CC, CD, CE, CF; \
__m256i M0, M1, M2, M3, M4, M5, M6, M7, \
M8, M9, MA, MB, MC, MD, ME, MF; \
sph_u32 Wlow, Whigh;
#define READ_STATE8(state) do \
{ \
if ( (state)->state_loaded ) \
{ \
A00 = (state)->A[0]; \
A01 = (state)->A[1]; \
A02 = (state)->A[2]; \
A03 = (state)->A[3]; \
A04 = (state)->A[4]; \
A05 = (state)->A[5]; \
A06 = (state)->A[6]; \
A07 = (state)->A[7]; \
A08 = (state)->A[8]; \
A09 = (state)->A[9]; \
A0A = (state)->A[10]; \
A0B = (state)->A[11]; \
B0 = (state)->B[0]; \
B1 = (state)->B[1]; \
B2 = (state)->B[2]; \
B3 = (state)->B[3]; \
B4 = (state)->B[4]; \
B5 = (state)->B[5]; \
B6 = (state)->B[6]; \
B7 = (state)->B[7]; \
B8 = (state)->B[8]; \
B9 = (state)->B[9]; \
BA = (state)->B[10]; \
BB = (state)->B[11]; \
BC = (state)->B[12]; \
BD = (state)->B[13]; \
BE = (state)->B[14]; \
BF = (state)->B[15]; \
C0 = (state)->C[0]; \
C1 = (state)->C[1]; \
C2 = (state)->C[2]; \
C3 = (state)->C[3]; \
C4 = (state)->C[4]; \
C5 = (state)->C[5]; \
C6 = (state)->C[6]; \
C7 = (state)->C[7]; \
C8 = (state)->C[8]; \
C9 = (state)->C[9]; \
CA = (state)->C[10]; \
CB = (state)->C[11]; \
CC = (state)->C[12]; \
CD = (state)->C[13]; \
CE = (state)->C[14]; \
CF = (state)->C[15]; \
} \
else \
{ \
(state)->state_loaded = true; \
A00 = m256_const1_64( 0x20728DFD20728DFD ); \
A01 = m256_const1_64( 0x46C0BD5346C0BD53 ); \
A02 = m256_const1_64( 0xE782B699E782B699 ); \
A03 = m256_const1_64( 0x5530463255304632 ); \
A04 = m256_const1_64( 0x71B4EF9071B4EF90 ); \
A05 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \
A06 = m256_const1_64( 0xDBB930F1DBB930F1 ); \
A07 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \
A08 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \
A09 = m256_const1_64( 0x8BD144108BD14410 ); \
A0A = m256_const1_64( 0x76D2ADAC76D2ADAC ); \
A0B = m256_const1_64( 0x28ACAB7F28ACAB7F ); \
B0 = m256_const1_64( 0xC1099CB7C1099CB7 ); \
B1 = m256_const1_64( 0x07B385F307B385F3 ); \
B2 = m256_const1_64( 0xE7442C26E7442C26 ); \
B3 = m256_const1_64( 0xCC8AD640CC8AD640 ); \
B4 = m256_const1_64( 0xEB6F56C7EB6F56C7 ); \
B5 = m256_const1_64( 0x1EA81AA91EA81AA9 ); \
B6 = m256_const1_64( 0x73B9D31473B9D314 ); \
B7 = m256_const1_64( 0x1DE85D081DE85D08 ); \
B8 = m256_const1_64( 0x48910A5A48910A5A ); \
B9 = m256_const1_64( 0x893B22DB893B22DB ); \
BA = m256_const1_64( 0xC5A0DF44C5A0DF44 ); \
BB = m256_const1_64( 0xBBC4324EBBC4324E ); \
BC = m256_const1_64( 0x72D2F24072D2F240 ); \
BD = m256_const1_64( 0x75941D9975941D99 ); \
BE = m256_const1_64( 0x6D8BDE826D8BDE82 ); \
BF = m256_const1_64( 0xA1A7502BA1A7502B ); \
C0 = m256_const1_64( 0xD9BF68D1D9BF68D1 ); \
C1 = m256_const1_64( 0x58BAD75058BAD750 ); \
C2 = m256_const1_64( 0x56028CB256028CB2 ); \
C3 = m256_const1_64( 0x8134F3598134F359 ); \
C4 = m256_const1_64( 0xB5D469D8B5D469D8 ); \
C5 = m256_const1_64( 0x941A8CC2941A8CC2 ); \
C6 = m256_const1_64( 0x418B2A6E418B2A6E ); \
C7 = m256_const1_64( 0x0405278004052780 ); \
C8 = m256_const1_64( 0x7F07D7877F07D787 ); \
C9 = m256_const1_64( 0x5194358F5194358F ); \
CA = m256_const1_64( 0x3C60D6653C60D665 ); \
CB = m256_const1_64( 0xBE97D79ABE97D79A ); \
CC = m256_const1_64( 0x950C3434950C3434 ); \
CD = m256_const1_64( 0xAED9A06DAED9A06D ); \
CE = m256_const1_64( 0x2537DC8D2537DC8D ); \
CF = m256_const1_64( 0x7CDB59697CDB5969 ); \
} \
Wlow = (state)->Wlow; \
Whigh = (state)->Whigh; \
} while (0)
#define WRITE_STATE8(state) do { \
(state)->A[0] = A00; \
(state)->A[1] = A01; \
(state)->A[2] = A02; \
(state)->A[3] = A03; \
(state)->A[4] = A04; \
(state)->A[5] = A05; \
(state)->A[6] = A06; \
(state)->A[7] = A07; \
(state)->A[8] = A08; \
(state)->A[9] = A09; \
(state)->A[10] = A0A; \
(state)->A[11] = A0B; \
(state)->B[0] = B0; \
(state)->B[1] = B1; \
(state)->B[2] = B2; \
(state)->B[3] = B3; \
(state)->B[4] = B4; \
(state)->B[5] = B5; \
(state)->B[6] = B6; \
(state)->B[7] = B7; \
(state)->B[8] = B8; \
(state)->B[9] = B9; \
(state)->B[10] = BA; \
(state)->B[11] = BB; \
(state)->B[12] = BC; \
(state)->B[13] = BD; \
(state)->B[14] = BE; \
(state)->B[15] = BF; \
(state)->C[0] = C0; \
(state)->C[1] = C1; \
(state)->C[2] = C2; \
(state)->C[3] = C3; \
(state)->C[4] = C4; \
(state)->C[5] = C5; \
(state)->C[6] = C6; \
(state)->C[7] = C7; \
(state)->C[8] = C8; \
(state)->C[9] = C9; \
(state)->C[10] = CA; \
(state)->C[11] = CB; \
(state)->C[12] = CC; \
(state)->C[13] = CD; \
(state)->C[14] = CE; \
(state)->C[15] = CF; \
(state)->Wlow = Wlow; \
(state)->Whigh = Whigh; \
} while (0)
#define DECODE_BLOCK8 \
do { \
M0 = buf[ 0]; \
M1 = buf[ 1]; \
M2 = buf[ 2]; \
M3 = buf[ 3]; \
M4 = buf[ 4]; \
M5 = buf[ 5]; \
M6 = buf[ 6]; \
M7 = buf[ 7]; \
M8 = buf[ 8]; \
M9 = buf[ 9]; \
MA = buf[10]; \
MB = buf[11]; \
MC = buf[12]; \
MD = buf[13]; \
ME = buf[14]; \
MF = buf[15]; \
} while (0)
#define INPUT_BLOCK_ADD8 \
do { \
B0 = _mm256_add_epi32( B0, M0 );\
B1 = _mm256_add_epi32( B1, M1 );\
B2 = _mm256_add_epi32( B2, M2 );\
B3 = _mm256_add_epi32( B3, M3 );\
B4 = _mm256_add_epi32( B4, M4 );\
B5 = _mm256_add_epi32( B5, M5 );\
B6 = _mm256_add_epi32( B6, M6 );\
B7 = _mm256_add_epi32( B7, M7 );\
B8 = _mm256_add_epi32( B8, M8 );\
B9 = _mm256_add_epi32( B9, M9 );\
BA = _mm256_add_epi32( BA, MA );\
BB = _mm256_add_epi32( BB, MB );\
BC = _mm256_add_epi32( BC, MC );\
BD = _mm256_add_epi32( BD, MD );\
BE = _mm256_add_epi32( BE, ME );\
BF = _mm256_add_epi32( BF, MF );\
} while (0)
#define INPUT_BLOCK_SUB8 \
do { \
C0 = _mm256_sub_epi32( C0, M0 ); \
C1 = _mm256_sub_epi32( C1, M1 ); \
C2 = _mm256_sub_epi32( C2, M2 ); \
C3 = _mm256_sub_epi32( C3, M3 ); \
C4 = _mm256_sub_epi32( C4, M4 ); \
C5 = _mm256_sub_epi32( C5, M5 ); \
C6 = _mm256_sub_epi32( C6, M6 ); \
C7 = _mm256_sub_epi32( C7, M7 ); \
C8 = _mm256_sub_epi32( C8, M8 ); \
C9 = _mm256_sub_epi32( C9, M9 ); \
CA = _mm256_sub_epi32( CA, MA ); \
CB = _mm256_sub_epi32( CB, MB ); \
CC = _mm256_sub_epi32( CC, MC ); \
CD = _mm256_sub_epi32( CD, MD ); \
CE = _mm256_sub_epi32( CE, ME ); \
CF = _mm256_sub_epi32( CF, MF ); \
} while (0)
#define XOR_W8 \
do { \
A00 = _mm256_xor_si256( A00, _mm256_set1_epi32( Wlow ) ); \
A01 = _mm256_xor_si256( A01, _mm256_set1_epi32( Whigh ) ); \
} while (0)
#define SWAP_BC8 \
do { \
mm256_swap512_256( B0, C0 ); \
mm256_swap512_256( B1, C1 ); \
mm256_swap512_256( B2, C2 ); \
mm256_swap512_256( B3, C3 ); \
mm256_swap512_256( B4, C4 ); \
mm256_swap512_256( B5, C5 ); \
mm256_swap512_256( B6, C6 ); \
mm256_swap512_256( B7, C7 ); \
mm256_swap512_256( B8, C8 ); \
mm256_swap512_256( B9, C9 ); \
mm256_swap512_256( BA, CA ); \
mm256_swap512_256( BB, CB ); \
mm256_swap512_256( BC, CC ); \
mm256_swap512_256( BD, CD ); \
mm256_swap512_256( BE, CE ); \
mm256_swap512_256( BF, CF ); \
} while (0)
#define PERM_ELT8(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
do { \
xa0 = _mm256_xor_si256( xm, _mm256_xor_si256( xb1, _mm256_xor_si256( \
_mm256_andnot_si256( xb3, xb2 ), \
_mm256_mullo_epi32( _mm256_xor_si256( xa0, _mm256_xor_si256( xc, \
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), _mm256_set1_epi32(5UL) ) \
) ), _mm256_set1_epi32(3UL) ) ) ) ); \
xb0 = mm256_not( _mm256_xor_si256( xa0, mm256_rol_32( xb0, 1 ) ) ); \
} while (0)
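// Scalar form of PERM_ELT8, one lane of the Shabal permutation element
// (a reference sketch; rotl32 is a hypothetical helper, not from this file):
//
//   xa0 = ((xa0 ^ (rotl32(xa1,15) * 5) ^ xc) * 3) ^ xb1 ^ (xb2 & ~xb3) ^ xm;
//   xb0 = ~(rotl32(xb0,1) ^ xa0);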
#define PERM_STEP_0_8 do { \
PERM_ELT8(A00, A0B, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A01, A00, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A02, A01, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A03, A02, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A04, A03, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A05, A04, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A06, A05, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A07, A06, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A08, A07, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A09, A08, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A0A, A09, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A0B, A0A, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A00, A0B, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A01, A00, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A02, A01, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A03, A02, BF, BC, B8, B5, C9, MF); \
} while (0)
#define PERM_STEP_1_8 do { \
PERM_ELT8(A04, A03, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A05, A04, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A06, A05, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A07, A06, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A08, A07, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A09, A08, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A0A, A09, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A0B, A0A, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A00, A0B, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A01, A00, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A02, A01, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A03, A02, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A04, A03, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A05, A04, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A06, A05, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A07, A06, BF, BC, B8, B5, C9, MF); \
} while (0)
#define PERM_STEP_2_8 do { \
PERM_ELT8(A08, A07, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A09, A08, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A0A, A09, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A0B, A0A, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A00, A0B, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A01, A00, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A02, A01, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A03, A02, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A04, A03, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A05, A04, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A06, A05, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A07, A06, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A08, A07, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A09, A08, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A0A, A09, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A0B, A0A, BF, BC, B8, B5, C9, MF); \
} while (0)
#define APPLY_P8 \
do { \
B0 = mm256_ror_32( B0, 15 ); \
B1 = mm256_ror_32( B1, 15 ); \
B2 = mm256_ror_32( B2, 15 ); \
B3 = mm256_ror_32( B3, 15 ); \
B4 = mm256_ror_32( B4, 15 ); \
B5 = mm256_ror_32( B5, 15 ); \
B6 = mm256_ror_32( B6, 15 ); \
B7 = mm256_ror_32( B7, 15 ); \
B8 = mm256_ror_32( B8, 15 ); \
B9 = mm256_ror_32( B9, 15 ); \
BA = mm256_ror_32( BA, 15 ); \
BB = mm256_ror_32( BB, 15 ); \
BC = mm256_ror_32( BC, 15 ); \
BD = mm256_ror_32( BD, 15 ); \
BE = mm256_ror_32( BE, 15 ); \
BF = mm256_ror_32( BF, 15 ); \
PERM_STEP_0_8; \
PERM_STEP_1_8; \
PERM_STEP_2_8; \
A0B = _mm256_add_epi32( A0B, C6 ); \
A0A = _mm256_add_epi32( A0A, C5 ); \
A09 = _mm256_add_epi32( A09, C4 ); \
A08 = _mm256_add_epi32( A08, C3 ); \
A07 = _mm256_add_epi32( A07, C2 ); \
A06 = _mm256_add_epi32( A06, C1 ); \
A05 = _mm256_add_epi32( A05, C0 ); \
A04 = _mm256_add_epi32( A04, CF ); \
A03 = _mm256_add_epi32( A03, CE ); \
A02 = _mm256_add_epi32( A02, CD ); \
A01 = _mm256_add_epi32( A01, CC ); \
A00 = _mm256_add_epi32( A00, CB ); \
A0B = _mm256_add_epi32( A0B, CA ); \
A0A = _mm256_add_epi32( A0A, C9 ); \
A09 = _mm256_add_epi32( A09, C8 ); \
A08 = _mm256_add_epi32( A08, C7 ); \
A07 = _mm256_add_epi32( A07, C6 ); \
A06 = _mm256_add_epi32( A06, C5 ); \
A05 = _mm256_add_epi32( A05, C4 ); \
A04 = _mm256_add_epi32( A04, C3 ); \
A03 = _mm256_add_epi32( A03, C2 ); \
A02 = _mm256_add_epi32( A02, C1 ); \
A01 = _mm256_add_epi32( A01, C0 ); \
A00 = _mm256_add_epi32( A00, CF ); \
A0B = _mm256_add_epi32( A0B, CE ); \
A0A = _mm256_add_epi32( A0A, CD ); \
A09 = _mm256_add_epi32( A09, CC ); \
A08 = _mm256_add_epi32( A08, CB ); \
A07 = _mm256_add_epi32( A07, CA ); \
A06 = _mm256_add_epi32( A06, C9 ); \
A05 = _mm256_add_epi32( A05, C8 ); \
A04 = _mm256_add_epi32( A04, C7 ); \
A03 = _mm256_add_epi32( A03, C6 ); \
A02 = _mm256_add_epi32( A02, C5 ); \
A01 = _mm256_add_epi32( A01, C4 ); \
A00 = _mm256_add_epi32( A00, C3 ); \
} while (0)
#define INCR_W8 do { \
if ((Wlow = T32(Wlow + 1)) == 0) \
Whigh = T32(Whigh + 1); \
} while (0)
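// Wlow / Whigh form the 64-bit Shabal block counter W that is XORed into
// A00 / A01 before each permutation; it starts at 1 for the first data block
// (see shabal_8way_init below) and carries from Wlow into Whigh on wrap.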
static void
shabal_8way_init( void *cc, unsigned size )
{
shabal_8way_context *sc = (shabal_8way_context*)cc;
if ( size == 512 )
{ // copy immediate constants directly to working registers later.
sc->state_loaded = false;
}
else
{ // No users
sc->state_loaded = true;
sc->A[ 0] = m256_const1_64( 0x52F8455252F84552 );
sc->A[ 1] = m256_const1_64( 0xE54B7999E54B7999 );
sc->A[ 2] = m256_const1_64( 0x2D8EE3EC2D8EE3EC );
sc->A[ 3] = m256_const1_64( 0xB9645191B9645191 );
sc->A[ 4] = m256_const1_64( 0xE0078B86E0078B86 );
sc->A[ 5] = m256_const1_64( 0xBB7C44C9BB7C44C9 );
sc->A[ 6] = m256_const1_64( 0xD2B5C1CAD2B5C1CA );
sc->A[ 7] = m256_const1_64( 0xB0D2EB8CB0D2EB8C );
sc->A[ 8] = m256_const1_64( 0x14CE5A4514CE5A45 );
sc->A[ 9] = m256_const1_64( 0x22AF50DC22AF50DC );
sc->A[10] = m256_const1_64( 0xEFFDBC6BEFFDBC6B );
sc->A[11] = m256_const1_64( 0xEB21B74AEB21B74A );
sc->B[ 0] = m256_const1_64( 0xB555C6EEB555C6EE );
sc->B[ 1] = m256_const1_64( 0x3E7105963E710596 );
sc->B[ 2] = m256_const1_64( 0xA72A652FA72A652F );
sc->B[ 3] = m256_const1_64( 0x9301515F9301515F );
sc->B[ 4] = m256_const1_64( 0xDA28C1FADA28C1FA );
sc->B[ 5] = m256_const1_64( 0x696FD868696FD868 );
sc->B[ 6] = m256_const1_64( 0x9CB6BF729CB6BF72 );
sc->B[ 7] = m256_const1_64( 0x0AFE40020AFE4002 );
sc->B[ 8] = m256_const1_64( 0xA6E03615A6E03615 );
sc->B[ 9] = m256_const1_64( 0x5138C1D45138C1D4 );
sc->B[10] = m256_const1_64( 0xBE216306BE216306 );
sc->B[11] = m256_const1_64( 0xB38B8890B38B8890 );
sc->B[12] = m256_const1_64( 0x3EA8B96B3EA8B96B );
sc->B[13] = m256_const1_64( 0x3299ACE43299ACE4 );
sc->B[14] = m256_const1_64( 0x30924DD430924DD4 );
sc->B[15] = m256_const1_64( 0x55CB34A555CB34A5 );
sc->C[ 0] = m256_const1_64( 0xB405F031B405F031 );
sc->C[ 1] = m256_const1_64( 0xC4233EBAC4233EBA );
sc->C[ 2] = m256_const1_64( 0xB3733979B3733979 );
sc->C[ 3] = m256_const1_64( 0xC0DD9D55C0DD9D55 );
sc->C[ 4] = m256_const1_64( 0xC51C28AEC51C28AE );
sc->C[ 5] = m256_const1_64( 0xA327B8E1A327B8E1 );
sc->C[ 6] = m256_const1_64( 0x56C5616756C56167 );
sc->C[ 7] = m256_const1_64( 0xED614433ED614433 );
sc->C[ 8] = m256_const1_64( 0x88B59D6088B59D60 );
sc->C[ 9] = m256_const1_64( 0x60E2CEBA60E2CEBA );
sc->C[10] = m256_const1_64( 0x758B4B8B758B4B8B );
sc->C[11] = m256_const1_64( 0x83E82A7F83E82A7F );
sc->C[12] = m256_const1_64( 0xBC968828BC968828 );
sc->C[13] = m256_const1_64( 0xE6E00BF7E6E00BF7 );
sc->C[14] = m256_const1_64( 0xBA839E55BA839E55 );
sc->C[15] = m256_const1_64( 0x9B491C609B491C60 );
}
sc->Wlow = 1;
sc->Whigh = 0;
sc->ptr = 0;
}
static void
shabal_8way_core( void *cc, const unsigned char *data, size_t len )
{
shabal_8way_context *sc = (shabal_8way_context*)cc;
__m256i *buf;
__m256i *vdata = (__m256i*)data;
const int buf_size = 64;
size_t ptr;
DECL_STATE8
buf = sc->buf;
ptr = sc->ptr;
if ( len < (buf_size - ptr ) )
{
memcpy_256( buf + (ptr>>2), vdata, len>>2 );
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE8( sc );
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_256( buf + (ptr>>2), vdata, clen>>2 );
ptr += clen;
vdata += clen>>2;
len -= clen;
if ( ptr == buf_size )
{
DECODE_BLOCK8;
INPUT_BLOCK_ADD8;
XOR_W8;
APPLY_P8;
INPUT_BLOCK_SUB8;
SWAP_BC8;
INCR_W8;
ptr = 0;
}
}
WRITE_STATE8(sc);
sc->ptr = ptr;
}
static void
shabal_8way_close( void *cc, unsigned ub, unsigned n, void *dst,
unsigned size_words )
{
shabal_8way_context *sc = (shabal_8way_context*)cc;
__m256i *buf;
const int buf_size = 64;
size_t ptr;
int i;
unsigned z, zz;
DECL_STATE8
buf = sc->buf;
ptr = sc->ptr;
z = 0x80 >> n;
zz = ((ub & -z) | z) & 0xFF;
buf[ptr>>2] = _mm256_set1_epi32( zz );
memset_zero_256( buf + (ptr>>2) + 1, ( (buf_size - ptr) >> 2 ) - 1 );
READ_STATE8(sc);
DECODE_BLOCK8;
INPUT_BLOCK_ADD8;
XOR_W8;
APPLY_P8;
for ( i = 0; i < 3; i ++ )
{
SWAP_BC8;
XOR_W8;
APPLY_P8;
}
__m256i *d = (__m256i*)dst;
if ( size_words == 16 ) // 512
{
d[ 0] = B0; d[ 1] = B1; d[ 2] = B2; d[ 3] = B3;
d[ 4] = B4; d[ 5] = B5; d[ 6] = B6; d[ 7] = B7;
d[ 8] = B8; d[ 9] = B9; d[10] = BA; d[11] = BB;
d[12] = BC; d[13] = BD; d[14] = BE; d[15] = BF;
}
else // 256
{
d[ 0] = B8; d[ 1] = B9; d[ 2] = BA; d[ 3] = BB;
d[ 4] = BC; d[ 5] = BD; d[ 6] = BE; d[ 7] = BF;
}
}
void
shabal256_8way_init( void *cc )
{
shabal_8way_init(cc, 256);
}
void
shabal256_8way_update( void *cc, const void *data, size_t len )
{
shabal_8way_core( cc, data, len );
}
void
shabal256_8way_close( void *cc, void *dst )
{
shabal_8way_close(cc, 0, 0, dst, 8);
}
void
shabal256_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst )
{
shabal_8way_close(cc, ub, n, dst, 8);
}
void
shabal512_8way_init(void *cc)
{
shabal_8way_init(cc, 512);
}
void
shabal512_8way_update(void *cc, const void *data, size_t len)
{
shabal_8way_core(cc, data, len);
}
void
shabal512_8way_close(void *cc, void *dst)
{
shabal_8way_close(cc, 0, 0, dst, 16);
}
void
shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
shabal_8way_close(cc, ub, n, dst, 16);
}
#endif // AVX2
/*
* We copy the state into local variables, so that the compiler knows
* that it can optimize them at will.
@@ -290,6 +883,8 @@ do { \
A00 = _mm_xor_si128( A00, _mm_set1_epi32( Wlow ) ); \
A01 = _mm_xor_si128( A01, _mm_set1_epi32( Whigh ) ); \
} while (0)
/*
#define SWAP(v1, v2) do { \
sph_u32 tmp = (v1); \
@@ -297,26 +892,39 @@ do { \
(v2) = tmp; \
} while (0)
*/
#define SWAP_BC \
do { \
mm128_swap128_256( B0, C0 ); \
mm128_swap128_256( B1, C1 ); \
mm128_swap128_256( B2, C2 ); \
mm128_swap128_256( B3, C3 ); \
mm128_swap128_256( B4, C4 ); \
mm128_swap128_256( B5, C5 ); \
mm128_swap128_256( B6, C6 ); \
mm128_swap128_256( B7, C7 ); \
mm128_swap128_256( B8, C8 ); \
mm128_swap128_256( B9, C9 ); \
mm128_swap128_256( BA, CA ); \
mm128_swap128_256( BB, CB ); \
mm128_swap128_256( BC, CC ); \
mm128_swap128_256( BD, CD ); \
mm128_swap128_256( BE, CE ); \
mm128_swap128_256( BF, CF ); \
mm128_swap256_128( B0, C0 ); \
mm128_swap256_128( B1, C1 ); \
mm128_swap256_128( B2, C2 ); \
mm128_swap256_128( B3, C3 ); \
mm128_swap256_128( B4, C4 ); \
mm128_swap256_128( B5, C5 ); \
mm128_swap256_128( B6, C6 ); \
mm128_swap256_128( B7, C7 ); \
mm128_swap256_128( B8, C8 ); \
mm128_swap256_128( B9, C9 ); \
mm128_swap256_128( BA, CA ); \
mm128_swap256_128( BB, CB ); \
mm128_swap256_128( BC, CC ); \
mm128_swap256_128( BD, CD ); \
mm128_swap256_128( BE, CE ); \
mm128_swap256_128( BF, CF ); \
} while (0)
/*
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
do { \
__m128i t1 = _mm_mullo_epi32( mm_rol_32( xa1, 15 ),\
_mm_set1_epi32(5UL) ) \
__m128i t2 = _mm_xor_si128( xa0, xc ); \
xb0 = mm_not( _mm_xor_si256( xa0, mm_rol_32( xb0, 1 ) ) ); \
xa0 = mm_xor4( xm, xb1, _mm_andnot_si128( xb3, xb2 ), \
_mm_xor_si128( t2, \
_mm_mullo_epi32( t1, _mm_set1_epi32(5UL) ) ) ) \
*/
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
do { \
xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \
@@ -706,7 +1314,7 @@ shabal256_4way_init( void *cc )
}
void
shabal256_4way( void *cc, const void *data, size_t len )
shabal256_4way_update( void *cc, const void *data, size_t len )
{
shabal_4way_core( cc, data, len );
}
@@ -731,7 +1339,7 @@ shabal512_4way_init(void *cc)
}
void
shabal512_4way(void *cc, const void *data, size_t len)
shabal512_4way_update(void *cc, const void *data, size_t len)
{
shabal_4way_core(cc, data, len);
}

View File

@@ -36,7 +36,7 @@
#ifndef SHABAL_HASH_4WAY_H__
#define SHABAL_HASH_4WAY_H__ 1
#ifdef __AVX2__
#ifdef __SSE4_1__
#include <stddef.h>
#include "algo/sha/sph_types.h"
@@ -50,6 +50,34 @@ extern "C"{
#define SPH_SIZE_shabal512 512
#if defined(__AVX2__)
typedef struct {
__m256i buf[16];
__m256i A[12], B[16], C[16];
sph_u32 Whigh, Wlow;
size_t ptr;
bool state_loaded;
} shabal_8way_context __attribute__ ((aligned (64)));
typedef shabal_8way_context shabal256_8way_context;
typedef shabal_8way_context shabal512_8way_context;
void shabal256_8way_init( void *cc );
void shabal256_8way_update( void *cc, const void *data, size_t len );
void shabal256_8way_close( void *cc, void *dst );
void shabal256_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal512_8way_init( void *cc );
void shabal512_8way_update( void *cc, const void *data, size_t len );
void shabal512_8way_close( void *cc, void *dst );
void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
#endif
typedef struct {
__m128i buf[16] __attribute__ ((aligned (64)));
__m128i A[12], B[16], C[16];
@@ -62,13 +90,14 @@ typedef shabal_4way_context shabal256_4way_context;
typedef shabal_4way_context shabal512_4way_context;
void shabal256_4way_init( void *cc );
void shabal256_4way( void *cc, const void *data, size_t len );
void shabal256_4way_update( void *cc, const void *data, size_t len );
void shabal256_4way_close( void *cc, void *dst );
void shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal512_4way_init( void *cc );
void shabal512_4way( void *cc, const void *data, size_t len );
void shabal512_4way_update( void *cc, const void *data, size_t len );
//#define shabal512_4way shabal512_4way_update
void shabal512_4way_close( void *cc, void *dst );
void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );

View File

@@ -3,6 +3,12 @@
#include <stdio.h>
// This implementation is deprecated, superseded by VAES in Ice Lake,
// which provides HW based 4 way AES.
// It was created for AVX2 to eliminate interleaving between the
// preceding and following functions.
// This code can be removed once current users have reverted to one way.
#if defined(__AVX2__)
@@ -16,8 +22,8 @@ static const uint32_t IV512[] =
#define mm256_ror2x256hi_1x32( a, b ) \
_mm256_blend_epi32( mm256_ror1x32_128( a ), \
mm256_ror1x32_128( b ), 0x88 )
_mm256_blend_epi32( mm256_ror128_32( a ), \
mm256_ror128_32( b ), 0x88 )
static void
c512_2way( shavite512_2way_context *ctx, const void *msg )
@@ -61,7 +67,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
{
// round 1, 5, 9
k00 = _mm256_xor_si256( k13, mm256_ror1x32_128(
k00 = _mm256_xor_si256( k13, mm256_ror128_32(
mm256_aesenc_2x128( k00, zero ) ) );
if ( r == 0 )
@@ -71,7 +77,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
k01 = _mm256_xor_si256( k00,
mm256_ror1x32_128( mm256_aesenc_2x128( k01, zero ) ) );
mm256_ror128_32( mm256_aesenc_2x128( k01, zero ) ) );
if ( r == 1 )
k01 = _mm256_xor_si256( k01, _mm256_set_epi32(
@@ -80,25 +86,25 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = _mm256_xor_si256( k01,
mm256_ror1x32_128( mm256_aesenc_2x128( k02, zero ) ) );
mm256_ror128_32( mm256_aesenc_2x128( k02, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
k03 = _mm256_xor_si256( k02,
mm256_ror1x32_128( mm256_aesenc_2x128( k03, zero ) ) );
mm256_ror128_32( mm256_aesenc_2x128( k03, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
p3 = _mm256_xor_si256( p3, x );
k10 = _mm256_xor_si256( k03,
mm256_ror1x32_128( mm256_aesenc_2x128( k10, zero ) ) );
mm256_ror128_32( mm256_aesenc_2x128( k10, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
k11 = _mm256_xor_si256( k10,
mm256_ror1x32_128( mm256_aesenc_2x128( k11, zero ) ) );
mm256_ror128_32( mm256_aesenc_2x128( k11, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = _mm256_xor_si256( k11,
mm256_ror1x32_128( mm256_aesenc_2x128( k12, zero ) ) );
mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = _mm256_xor_si256( k12,
mm256_ror1x32_128( mm256_aesenc_2x128( k13, zero ) ) );
mm256_ror128_32( mm256_aesenc_2x128( k13, zero ) ) );
if ( r == 2 )
k13 = _mm256_xor_si256( k13, _mm256_set_epi32(
@@ -134,31 +140,31 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
// round 3, 7, 11
k00 = _mm256_xor_si256( mm256_ror1x32_128(
k00 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k00, zero ) ), k13 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k00 ), zero );
k01 = _mm256_xor_si256( mm256_ror1x32_128(
k01 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k01, zero ) ), k00 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = _mm256_xor_si256( mm256_ror1x32_128(
k02 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k02, zero ) ), k01 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
k03 = _mm256_xor_si256( mm256_ror1x32_128(
k03 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k03, zero ) ), k02 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
p1 = _mm256_xor_si256( p1, x );
k10 = _mm256_xor_si256( mm256_ror1x32_128(
k10 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k10, zero ) ), k03 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k10 ), zero );
k11 = _mm256_xor_si256( mm256_ror1x32_128(
k11 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k11, zero ) ), k10 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = _mm256_xor_si256( mm256_ror1x32_128(
k12 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k12, zero ) ), k11 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = _mm256_xor_si256( mm256_ror1x32_128(
k13 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k13, zero ) ), k12 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
@@ -192,35 +198,35 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
// round 13
k00 = _mm256_xor_si256( mm256_ror1x32_128(
k00 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k00, zero ) ), k13 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
k01 = _mm256_xor_si256( mm256_ror1x32_128(
k01 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k01, zero ) ), k00 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = _mm256_xor_si256( mm256_ror1x32_128(
k02 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k02, zero ) ), k01 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
k03 = _mm256_xor_si256( mm256_ror1x32_128(
k03 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k03, zero ) ), k02 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
p3 = _mm256_xor_si256( p3, x );
k10 = _mm256_xor_si256( mm256_ror1x32_128(
k10 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k10, zero ) ), k03 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
k11 = _mm256_xor_si256( mm256_ror1x32_128(
k11 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k11, zero ) ), k10 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = mm256_ror1x32_128( mm256_aesenc_2x128( k12, zero ) );
k12 = mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) );
k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32(
~ctx->count2, ctx->count3, ctx->count0, ctx->count1,
~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = _mm256_xor_si256( mm256_ror1x32_128(
k13 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k13, zero ) ), k12 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );

View File

@@ -0,0 +1,399 @@
#include "shavite-hash-4way.h"
#include <stdint.h>
static const uint32_t IV512[] =
{
0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC,
0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC,
0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47,
0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
};
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define mm512_ror2x512hi_1x32( a, b ) \
_mm512_mask_blend_epi32( 0x8888, mm512_ror128_32( a ), \
mm512_ror128_32( b ) )
static void
c512_4way( shavite512_4way_context *ctx, const void *msg )
{
register __m512i X;
register __m512i P0, P1, P2, P3;
register __m512i K0, K1, K2, K3, K4, K5, K6, K7;
__m512i *M = (__m512i*)msg;
__m512i *H = (__m512i*)ctx->h;
int r;
P0 = H[0];
P1 = H[1];
P2 = H[2];
P3 = H[3];
K0 = M[0];
K1 = M[1];
K2 = M[2];
K3 = M[3];
K4 = M[4];
K5 = M[5];
K6 = M[6];
K7 = M[7];
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P0 = _mm512_xor_si512( P0, X );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P2 = _mm512_xor_si512( P2, X );
// round
for ( r = 0; r < 3; r ++ )
{
// round 1, 5, 9
K0 = _mm512_xor_si512( K7, mm512_ror128_32(
_mm512_aesenc_epi128( K0, m512_zero ) ) );
if ( r == 0 )
K0 = _mm512_xor_si512( K0, _mm512_set4_epi32(
~ctx->count3, ctx->count2, ctx->count1, ctx->count0 ) );
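// Note added for clarity, not part of the original change: count0..count3
// hold the block bit counter; the r == 0/1/2 checks in this loop and the
// set4 in round 13 below fold it into the key schedule at fixed points,
// each time with one counter word bitwise complemented.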
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
K1 = _mm512_xor_si512( K0,
mm512_ror128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) );
if ( r == 1 )
K1 = _mm512_xor_si512( K1, _mm512_set4_epi32(
~ctx->count0, ctx->count1, ctx->count2, ctx->count3 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
K2 = _mm512_xor_si512( K1,
mm512_ror128_32( _mm512_aesenc_epi128( K2, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
K3 = _mm512_xor_si512( K2,
mm512_ror128_32( _mm512_aesenc_epi128( K3, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P3 = _mm512_xor_si512( P3, X );
K4 = _mm512_xor_si512( K3,
mm512_ror128_32( _mm512_aesenc_epi128( K4, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero );
K5 = _mm512_xor_si512( K4,
mm512_ror128_32( _mm512_aesenc_epi128( K5, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
K6 = _mm512_xor_si512( K5,
mm512_ror128_32( _mm512_aesenc_epi128( K6, m512_zero ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
K7 = _mm512_xor_si512( K6,
mm512_ror128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) );
if ( r == 2 )
K7 = _mm512_xor_si512( K7, _mm512_set4_epi32(
~ctx->count1, ctx->count0, ctx->count3, ctx->count2 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P1 = _mm512_xor_si512( P1, X );
// round 2, 6, 10
K0 = _mm512_xor_si512( K0, mm512_ror2x512hi_1x32( K6, K7 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K0 ), m512_zero );
K1 = _mm512_xor_si512( K1, mm512_ror2x512hi_1x32( K7, K0 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
K2 = _mm512_xor_si512( K2, mm512_ror2x512hi_1x32( K0, K1 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
K3 = _mm512_xor_si512( K3, mm512_ror2x512hi_1x32( K1, K2 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P2 = _mm512_xor_si512( P2, X );
K4 = _mm512_xor_si512( K4, mm512_ror2x512hi_1x32( K2, K3 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K4 ), m512_zero );
K5 = _mm512_xor_si512( K5, mm512_ror2x512hi_1x32( K3, K4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
K6 = _mm512_xor_si512( K6, mm512_ror2x512hi_1x32( K4, K5 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
K7 = _mm512_xor_si512( K7, mm512_ror2x512hi_1x32( K5, K6 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P0 = _mm512_xor_si512( P0, X );
// round 3, 7, 11
K0 = _mm512_xor_si512( mm512_ror128_32(
_mm512_aesenc_epi128( K0, m512_zero ) ), K7 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K0 ), m512_zero );
K1 = _mm512_xor_si512( mm512_ror128_32(
_mm512_aesenc_epi128( K1, m512_zero ) ), K0 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
K2 = _mm512_xor_si512( mm512_ror128_32(
_mm512_aesenc_epi128( K2, m512_zero ) ), K1 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
K3 = _mm512_xor_si512( mm512_ror128_32(
_mm512_aesenc_epi128( K3, m512_zero ) ), K2 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P1 = _mm512_xor_si512( P1, X );
K4 = _mm512_xor_si512( mm512_ror128_32(
_mm512_aesenc_epi128( K4, m512_zero ) ), K3 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K4 ), m512_zero );
K5 = _mm512_xor_si512( mm512_ror128_32(
_mm512_aesenc_epi128( K5, m512_zero ) ), K4 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
K6 = _mm512_xor_si512( mm512_ror128_32(
_mm512_aesenc_epi128( K6, m512_zero ) ), K5 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
K7 = _mm512_xor_si512( mm512_ror128_32(
_mm512_aesenc_epi128( K7, m512_zero ) ), K6 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P3 = _mm512_xor_si512( P3, X );
// round 4, 8, 12
K0 = _mm512_xor_si512( K0, mm512_ror2x512hi_1x32( K6, K7 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero );
K1 = _mm512_xor_si512( K1, mm512_ror2x512hi_1x32( K7, K0 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
K2 = _mm512_xor_si512( K2, mm512_ror2x512hi_1x32( K0, K1 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
K3 = _mm512_xor_si512( K3, mm512_ror2x512hi_1x32( K1, K2 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P0 = _mm512_xor_si512( P0, X );
K4 = _mm512_xor_si512( K4, mm512_ror2x512hi_1x32( K2, K3 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero );
K5 = _mm512_xor_si512( K5, mm512_ror2x512hi_1x32( K3, K4 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
K6 = _mm512_xor_si512( K6, mm512_ror2x512hi_1x32( K4, K5 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
K7 = _mm512_xor_si512( K7, mm512_ror2x512hi_1x32( K5, K6 ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P2 = _mm512_xor_si512( P2, X );
}
// round 13
K0 = _mm512_xor_si512( mm512_ror128_32(
_mm512_aesenc_epi128( K0, m512_zero ) ), K7 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero );
K1 = _mm512_xor_si512( mm512_ror128_32(
_mm512_aesenc_epi128( K1, m512_zero ) ), K0 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero );
K2 = _mm512_xor_si512( mm512_ror128_32(
_mm512_aesenc_epi128( K2, m512_zero ) ), K1 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero );
K3 = _mm512_xor_si512( mm512_ror128_32(
_mm512_aesenc_epi128( K3, m512_zero ) ), K2 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero );
P3 = _mm512_xor_si512( P3, X );
K4 = _mm512_xor_si512( mm512_ror128_32(
_mm512_aesenc_epi128( K4, m512_zero ) ), K3 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero );
K5 = _mm512_xor_si512( mm512_ror128_32(
_mm512_aesenc_epi128( K5, m512_zero ) ), K4 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
K6 = mm512_ror128_32( _mm512_aesenc_epi128( K6, m512_zero ) );
K6 = _mm512_xor_si512( K6, _mm512_xor_si512( K5, _mm512_set4_epi32(
~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
K7= _mm512_xor_si512( mm512_ror128_32(
_mm512_aesenc_epi128( K7, m512_zero ) ), K6 );
X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero );
P1 = _mm512_xor_si512( P1, X );
H[0] = _mm512_xor_si512( H[0], P2 );
H[1] = _mm512_xor_si512( H[1], P3 );
H[2] = _mm512_xor_si512( H[2], P0 );
H[3] = _mm512_xor_si512( H[3], P1 );
}
void shavite512_4way_init( shavite512_4way_context *ctx )
{
__m512i *h = (__m512i*)ctx->h;
__m128i *iv = (__m128i*)IV512;
h[0] = m512_const1_128( iv[0] );
h[1] = m512_const1_128( iv[1] );
h[2] = m512_const1_128( iv[2] );
h[3] = m512_const1_128( iv[3] );
ctx->ptr = 0;
ctx->count0 = 0;
ctx->count1 = 0;
ctx->count2 = 0;
ctx->count3 = 0;
}
// Not tested; use update_close instead.
void shavite512_4way_update( shavite512_4way_context *ctx, const void *data,
size_t len )
{
unsigned char *buf = ctx->buf;
size_t ptr = ctx->ptr;
while ( len > 0 )
{
size_t clen;
clen = (sizeof ctx->buf) - ptr;
if ( clen > len << 2 )
clen = len << 2;
memcpy( buf + ptr, data, clen );
data = (const unsigned char *)data + clen;
ptr += clen;
len -= clen >> 2;
if ( ptr == sizeof ctx->buf )
{
if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 )
{
ctx->count1 = ctx->count1 + 1;
if ( ctx->count1 == 0 )
{
ctx->count2 = ctx->count2 + 1;
if ( ctx->count2 == 0 )
ctx->count3 = ctx->count3 + 1;
}
}
c512_4way( ctx, buf );
ptr = 0;
}
}
ctx->ptr = ptr;
}
// not tested
void shavite512_4way_close( shavite512_4way_context *ctx, void *dst )
{
unsigned char *buf;
union
{
uint32_t u32[4];
uint16_t u16[8];
} count;
buf = ctx->buf;
uint32_t vp = ctx->ptr>>6;
// Terminating byte then zero pad
casti_m512i( buf, vp++ ) = m512_const2_64( 0, 0x0000000000000080 );
// Zero pad full vectors up to count
for ( ; vp < 6; vp++ )
casti_m512i( buf, vp ) = m512_zero;
// Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200
// Count is misaligned to 16 bits and straddles a vector.
// Use u32 overlay to stage then u16 to load buf.
count.u32[0] = ctx->count0 += (ctx->ptr << 1); // bits in partial block: (ptr/4 bytes per lane) * 8 = ptr*2
count.u32[1] = ctx->count1;
count.u32[2] = ctx->count2;
count.u32[3] = ctx->count3;
casti_m512i( buf, 6 ) = m512_const1_128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
casti_m512i( buf, 7 ) = m512_const1_128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
c512_4way( ctx, buf);
casti_m512i( dst, 0 ) = casti_m512i( ctx->h, 0 );
casti_m512i( dst, 1 ) = casti_m512i( ctx->h, 1 );
casti_m512i( dst, 2 ) = casti_m512i( ctx->h, 2 );
casti_m512i( dst, 3 ) = casti_m512i( ctx->h, 3 );
}
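// Hypothetical per-lane sketch, not part of the original change, of the
// padding tail assembled above: a 0x80 terminator byte, zero fill, the
// 128-bit bit counter at lane bytes 110..125, and the 16-bit digest size
// 0x0200 (512 bits) at lane bytes 126..127. Offsets are read from the vector
// stores above; data_bytes is assumed to be a multiple of 16 and below 110.
static inline void shavite512_lane_tail_model( uint8_t lane[128],
                                  size_t data_bytes, const uint32_t count[4] )
{
   size_t i;
   lane[ data_bytes ] = 0x80;                 // terminating byte
   for ( i = data_bytes + 1; i < 110; i++ )   // zero pad
      lane[i] = 0;
   for ( i = 0; i < 16; i++ )                 // 128-bit bit counter, LE
      lane[ 110 + i ] = ( (const uint8_t*)count )[i];
   lane[126] = 0x00;  lane[127] = 0x02;       // output size 0x0200 = 512 bits
}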
void shavite512_4way_update_close( shavite512_4way_context *ctx, void *dst,
const void *data, size_t len )
{
unsigned char *buf = ctx->buf;
size_t ptr = ctx->ptr;
// process full blocks and load buf with remainder.
while ( len > 0 )
{
size_t clen;
clen = (sizeof ctx->buf) - ptr;
if ( clen > len << 2 )
clen = len << 2;
memcpy( buf + ptr, data, clen );
data = (const unsigned char *)data + clen;
ptr += clen;
len -= (clen >> 2);
if ( ptr == sizeof ctx->buf )
{
if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 )
{
ctx->count1 = ctx->count1 + 1;
if ( ctx->count1 == 0 )
{
ctx->count2 = ctx->count2 + 1;
if ( ctx->count2 == 0 )
ctx->count3 = ctx->count3 + 1;
}
}
c512_4way( ctx, buf );
ptr = 0;
}
}
uint32_t vp = ptr>>6;
// Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200
// Count is misaligned to 16 bits and straddles 2 vectors.
// Use u32 overlay to stage then u16 to load buf.
union
{
uint32_t u32[4];
uint16_t u16[8];
} count;
count.u32[0] = ctx->count0 += (ptr << 1); // bits in partial block: (ptr/4 bytes per lane) * 8 = ptr*2
count.u32[1] = ctx->count1;
count.u32[2] = ctx->count2;
count.u32[3] = ctx->count3;
if ( vp == 0 ) // empty buf, xevan.
{
casti_m512i( buf, 0 ) = m512_const2_64( 0, 0x0000000000000080 );
memset_zero_512( (__m512i*)buf + 1, 5 );
ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;
}
else // half full buf, everyone else.
{
casti_m512i( buf, vp++ ) = m512_const2_64( 0, 0x0000000000000080 );
memset_zero_512( (__m512i*)buf + vp, 6 - vp );
}
casti_m512i( buf, 6 ) = m512_const1_128(
_mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
casti_m512i( buf, 7 ) = m512_const1_128( _mm_set_epi16(
0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
c512_4way( ctx, buf);
casti_m512i( dst, 0 ) = casti_m512i( ctx->h, 0 );
casti_m512i( dst, 1 ) = casti_m512i( ctx->h, 1 );
casti_m512i( dst, 2 ) = casti_m512i( ctx->h, 2 );
casti_m512i( dst, 3 ) = casti_m512i( ctx->h, 3 );
}
#endif // VAES

View File

@@ -0,0 +1,25 @@
#ifndef SHAVITE_HASH_4WAY_H__
#define SHAVITE_HASH_4WAY_H__ 1
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#include "simd-utils.h"
typedef struct {
unsigned char buf[128<<2];
uint32_t h[16<<2];
size_t ptr;
uint32_t count0, count1, count2, count3;
} shavite512_4way_context __attribute__ ((aligned (64)));
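// Note added for clarity, not part of the original change: buf and h hold
// four interleaved lanes, i.e. 4 x 128-byte message blocks and 4 x 512-bit
// chaining values; count0..count3 are the four words of the 128-bit bit
// counter shared by all lanes.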
void shavite512_4way_init( shavite512_4way_context *ctx );
void shavite512_4way_update( shavite512_4way_context *ctx, const void *data,
size_t len );
void shavite512_4way_close( shavite512_4way_context *ctx, void *dst );
void shavite512_4way_update_close( shavite512_4way_context *ctx, void *dst,
const void *data, size_t len );
#endif // VAES
#endif // SHAVITE_HASH_4WAY_H__

View File

@@ -100,9 +100,20 @@ c512( sph_shavite_big_context *sc, const void *msg )
p3 = h[3];
// round
// working proof of concept
/*
__m512i K = m512_const1_128( m[0] );
__m512i X = _mm512_xor_si512( m512_const1_128( p1 ), K );
X = _mm512_aesenc_epi128( X, m512_zero );
k00 = _mm512_castsi512_si128( K );
x = _mm512_castsi512_si128( X );
*/
k00 = m[0];
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, zero );
k01 = m[1];
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, zero );

View File

@@ -18,76 +18,18 @@ void skeinhash_8way( void *state, const void *input )
uint64_t vhash64[8*8] __attribute__ ((aligned (128)));
skein512_8way_context ctx_skein;
//#if defined(__SHA__)
// uint32_t hash0[16] __attribute__ ((aligned (64)));
// uint32_t hash1[16] __attribute__ ((aligned (64)));
// uint32_t hash2[16] __attribute__ ((aligned (64)));
// uint32_t hash3[16] __attribute__ ((aligned (64)));
// uint32_t hash4[16] __attribute__ ((aligned (64)));
// uint32_t hash5[16] __attribute__ ((aligned (64)));
// uint32_t hash6[16] __attribute__ ((aligned (64)));
// uint32_t hash7[16] __attribute__ ((aligned (64)));
// SHA256_CTX ctx_sha256;
//#else
uint32_t vhash32[16*8] __attribute__ ((aligned (128)));
sha256_8way_context ctx_sha256;
//#endif
skein512_8way_init( &ctx_skein );
skein512_8way_update( &ctx_skein, input, 80 );
skein512_8way_close( &ctx_skein, vhash64 );
/*
#if defined(__SHA__)
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash64, 512 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash0, 64 );
SHA256_Final( (unsigned char*)hash0, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash1, 64 );
SHA256_Final( (unsigned char*)hash1, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash2, 64 );
SHA256_Final( (unsigned char*)hash2, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 );
SHA256_Final( (unsigned char*)hash3, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash4, 64 );
SHA256_Final( (unsigned char*)hash4, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash5, 64 );
SHA256_Final( (unsigned char*)hash5, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash6, 64 );
SHA256_Final( (unsigned char*)hash6, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash7, 64 );
SHA256_Final( (unsigned char*)hash7, &ctx_sha256 );
intrlv_8x32( state, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, 256 );
#else
*/
rintrlv_8x64_8x32( vhash32, vhash64, 512 );
// dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
// vhash64, 512 );
// intrlv_8x32( vhash32, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
// hash7, 512 );
sha256_8way_init( &ctx_sha256 );
sha256_8way( &ctx_sha256, vhash32, 64 );
sha256_8way_update( &ctx_sha256, vhash32, 64 );
sha256_8way_close( &ctx_sha256, state );
//#endif
}
int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
@@ -176,7 +118,7 @@ void skeinhash_4way( void *state, const void *input )
rintrlv_4x64_4x32( vhash32, vhash64, 512 );
sha256_4way_init( &ctx_sha256 );
sha256_4way( &ctx_sha256, vhash32, 64 );
sha256_4way_update( &ctx_sha256, vhash32, 64 );
sha256_4way_close( &ctx_sha256, state );
#endif
}

View File

@@ -93,12 +93,12 @@ typedef sph_skein_4way_big_context skein256_4way_context;
void skein512_4way_init( skein512_4way_context *sc );
void skein512_4way_update( void *cc, const void *data, size_t len );
void skein512_4way_close( void *cc, void *dst );
#define skein512_4way skein512_4way_update
//#define skein512_4way skein512_4way_update
void skein256_4way_init( skein256_4way_context *sc );
void skein256_4way_update( void *cc, const void *data, size_t len );
void skein256_4way_close( void *cc, void *dst );
#define skein256_4way skein256_4way_update
//#define skein256_4way skein256_4way_update
#ifdef __cplusplus
}

View File

@@ -68,11 +68,11 @@ void skein2hash_4way( void *output, const void *input )
uint64_t hash[16*4] __attribute__ ((aligned (64)));
skein512_4way_init( &ctx );
skein512_4way( &ctx, input, 80 );
skein512_4way_update( &ctx, input, 80 );
skein512_4way_close( &ctx, hash );
skein512_4way_init( &ctx );
skein512_4way( &ctx, hash, 64 );
skein512_4way_update( &ctx, hash, 64 );
skein512_4way_close( &ctx, output );
}

View File

@@ -50,41 +50,138 @@
#include <string.h>
#include "sm3-hash-4way.h"
#ifdef __SSE4_2__
#ifdef __AVX2__
void sm3_4way_init( sm3_4way_ctx_t *ctx )
#define P0_8W(x) \
_mm256_xor_si256( x, _mm256_xor_si256( mm256_rol_32( x, 9 ), \
mm256_rol_32( x, 17 ) ) )
#define P1_8W(x) \
_mm256_xor_si256( x, _mm256_xor_si256( mm256_rol_32( x, 15 ), \
mm256_rol_32( x, 23 ) ) )
#define FF0_8W(x,y,z) \
_mm256_xor_si256( x, _mm256_xor_si256( y, z ) )
#define FF1_8W(x,y,z) \
_mm256_or_si256( _mm256_or_si256( _mm256_and_si256( x, y ), \
_mm256_and_si256( x, z ) ), \
_mm256_and_si256( y, z ) )
#define GG0_8W(x,y,z) FF0_8W(x,y,z)
#define GG1_8W(x,y,z) \
_mm256_or_si256( _mm256_and_si256( x, y ), \
_mm256_andnot_si256( x, z ) )
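// Scalar reference for the two boolean functions above (illustrative, not
// part of the original change): for SM3 rounds 16..63, FF1 is the majority
// function and GG1 is the choose/select function; the macros apply them
// lane-wise with AVX2.
static inline uint32_t sm3_ff1_model( uint32_t x, uint32_t y, uint32_t z )
{ return ( x & y ) | ( x & z ) | ( y & z ); }
static inline uint32_t sm3_gg1_model( uint32_t x, uint32_t y, uint32_t z )
{ return ( x & y ) | ( ~x & z ); }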
void sm3_8way_compress( __m256i *digest, __m256i *block )
{
ctx->digest[0] = _mm_set1_epi32( 0x7380166F );
ctx->digest[1] = _mm_set1_epi32( 0x4914B2B9 );
ctx->digest[2] = _mm_set1_epi32( 0x172442D7 );
ctx->digest[3] = _mm_set1_epi32( 0xDA8A0600 );
ctx->digest[4] = _mm_set1_epi32( 0xA96F30BC );
ctx->digest[5] = _mm_set1_epi32( 0x163138AA );
ctx->digest[6] = _mm_set1_epi32( 0xE38DEE4D );
ctx->digest[7] = _mm_set1_epi32( 0xB0FB0E4E );
ctx->nblocks = 0;
ctx->num = 0;
__m256i W[68], W1[64];
__m256i A = digest[ 0 ];
__m256i B = digest[ 1 ];
__m256i C = digest[ 2 ];
__m256i D = digest[ 3 ];
__m256i E = digest[ 4 ];
__m256i F = digest[ 5 ];
__m256i G = digest[ 6 ];
__m256i H = digest[ 7 ];
__m256i SS1, SS2, TT1, TT2, T;
int j;
for ( j = 0; j < 16; j++ )
W[j] = mm256_bswap_32( block[j] );
for ( j = 16; j < 68; j++ )
W[j] = _mm256_xor_si256( P1_8W( _mm256_xor_si256(
_mm256_xor_si256( W[ j-16 ], W[ j-9 ] ),
mm256_rol_32( W[ j-3 ], 15 ) ) ),
_mm256_xor_si256( mm256_rol_32( W[ j-13 ], 7 ), W[ j-6 ] ) );
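// Scalar form of the expansion above (illustrative comment only): for
// j = 16..67,
// W[j] = P1( W[j-16] ^ W[j-9] ^ rol32( W[j-3], 15 ) )
//        ^ rol32( W[j-13], 7 ) ^ W[j-6].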
for( j = 0; j < 64; j++ )
W1[j] = _mm256_xor_si256( W[j], W[j+4] );
T = _mm256_set1_epi32( 0x79CC4519UL );
for( j =0; j < 16; j++ )
{
SS1 = mm256_rol_32( _mm256_add_epi32( E, _mm256_add_epi32(
mm256_rol_32( A, 12 ), mm256_rol_var_32( T, j ) ) ), 7 );
SS2 = _mm256_xor_si256( SS1, mm256_rol_32( A, 12 ) );
TT1 = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32(
FF0_8W( A, B, C ), D ), SS2 ), W1[j] );
TT2 = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32(
GG0_8W( E, F, G ), H ), SS1 ), W[j] );
D = C;
C = mm256_rol_32( B, 9 );
B = A;
A = TT1;
H = G;
G = mm256_rol_32( F, 19 );
F = E;
E = P0_8W( TT2 );
}
T = _mm256_set1_epi32( 0x7A879D8AUL );
for( j =16; j < 64; j++ )
{
SS1 = mm256_rol_32( _mm256_add_epi32( _mm256_add_epi32(
mm256_rol_32(A,12), E ), mm256_rol_var_32( T, j&31 ) ), 7 );
SS2 = _mm256_xor_si256( SS1, mm256_rol_32( A, 12 ) );
TT1 = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32(
FF1_8W( A, B, C ), D ), SS2 ), W1[j] );
TT2 = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32(
GG1_8W( E, F, G ), H ), SS1 ), W[j] );
D = C;
C = mm256_rol_32( B, 9 );
B = A;
A = TT1;
H = G;
G = mm256_rol_32( F, 19 );
F = E;
E = P0_8W( TT2 );
}
digest[0] = _mm256_xor_si256( digest[0], A );
digest[1] = _mm256_xor_si256( digest[1], B );
digest[2] = _mm256_xor_si256( digest[2], C );
digest[3] = _mm256_xor_si256( digest[3], D );
digest[4] = _mm256_xor_si256( digest[4], E );
digest[5] = _mm256_xor_si256( digest[5], F );
digest[6] = _mm256_xor_si256( digest[6], G );
digest[7] = _mm256_xor_si256( digest[7], H );
}
void sm3_4way( void *cc, const void *data, size_t len )
void sm3_8way_init( sm3_8way_ctx_t *ctx )
{
sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
__m128i *block = (__m128i*)ctx->block;
__m128i *vdata = (__m128i*)data;
ctx->digest[0] = _mm256_set1_epi32( 0x7380166F );
ctx->digest[1] = _mm256_set1_epi32( 0x4914B2B9 );
ctx->digest[2] = _mm256_set1_epi32( 0x172442D7 );
ctx->digest[3] = _mm256_set1_epi32( 0xDA8A0600 );
ctx->digest[4] = _mm256_set1_epi32( 0xA96F30BC );
ctx->digest[5] = _mm256_set1_epi32( 0x163138AA );
ctx->digest[6] = _mm256_set1_epi32( 0xE38DEE4D );
ctx->digest[7] = _mm256_set1_epi32( 0xB0FB0E4E );
ctx->nblocks = 0;
ctx->num = 0;
}
void sm3_8way_update( void *cc, const void *data, size_t len )
{
sm3_8way_ctx_t *ctx = (sm3_8way_ctx_t*)cc;
__m256i *block = (__m256i*)ctx->block;
__m256i *vdata = (__m256i*)data;
if ( ctx->num )
{
unsigned int left = SM3_BLOCK_SIZE - ctx->num;
if ( len < left )
{
memcpy_128( block + (ctx->num >> 2), vdata , len>>2 );
memcpy_256( block + (ctx->num >> 2), vdata , len>>2 );
ctx->num += len;
return;
}
else
{
memcpy_128( block + (ctx->num >> 2), vdata , left>>2 );
sm3_4way_compress( ctx->digest, block );
memcpy_256( block + (ctx->num >> 2), vdata , left>>2 );
sm3_8way_compress( ctx->digest, block );
ctx->nblocks++;
vdata += left>>2;
len -= left;
@@ -92,49 +189,53 @@ void sm3_4way( void *cc, const void *data, size_t len )
}
while ( len >= SM3_BLOCK_SIZE )
{
sm3_4way_compress( ctx->digest, vdata );
sm3_8way_compress( ctx->digest, vdata );
ctx->nblocks++;
vdata += SM3_BLOCK_SIZE>>2;
len -= SM3_BLOCK_SIZE;
}
ctx->num = len;
if ( len )
memcpy_128( block, vdata, len>>2 );
memcpy_256( block, vdata, len>>2 );
}
void sm3_4way_close( void *cc, void *dst )
void sm3_8way_close( void *cc, void *dst )
{
sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
__m128i *hash = (__m128i*)dst;
__m128i *count = (__m128i*)(ctx->block + ( (SM3_BLOCK_SIZE - 8) >> 2 ) );
__m128i *block = (__m128i*)ctx->block;
sm3_8way_ctx_t *ctx = (sm3_8way_ctx_t*)cc;
__m256i *hash = (__m256i*)dst;
__m256i *count = (__m256i*)(ctx->block + ( (SM3_BLOCK_SIZE - 8) >> 2 ) );
__m256i *block = (__m256i*)ctx->block;
int i;
block[ctx->num] = _mm_set1_epi32( 0x80 );
block[ctx->num] = _mm256_set1_epi32( 0x80 );
if ( ctx->num + 8 <= SM3_BLOCK_SIZE )
{
memset_zero_128( block + (ctx->num >> 2) + 1,
( SM3_BLOCK_SIZE - ctx->num - 8 ) >> 2 );
memset_zero_256( block + (ctx->num >> 2) + 1,
( SM3_BLOCK_SIZE - ctx->num - 8 ) >> 2 );
}
else
{
memset_zero_128( block + (ctx->num >> 2) + 1,
memset_zero_256( block + (ctx->num >> 2) + 1,
( SM3_BLOCK_SIZE - (ctx->num >> 2) - 1 ) );
sm3_4way_compress( ctx->digest, block );
memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
sm3_8way_compress( ctx->digest, block );
memset_zero_256( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
}
count[0] = mm128_bswap_32(
_mm_set1_epi32( ctx->nblocks >> 23 ) );
count[1] = mm128_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
count[0] = mm256_bswap_32(
_mm256_set1_epi32( ctx->nblocks >> 23 ) );
count[1] = mm256_bswap_32( _mm256_set1_epi32( ( ctx->nblocks << 9 ) +
( ctx->num << 3 ) ) );
sm3_4way_compress( ctx->digest, block );
sm3_8way_compress( ctx->digest, block );
for ( i = 0; i < 8 ; i++ )
hash[i] = mm128_bswap_32( ctx->digest[i] );
hash[i] = mm256_bswap_32( ctx->digest[i] );
}
#endif
#if defined(__SSE2__)
#define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x, 9 ), \
mm128_rol_32( x, 17 ) ) )
#define P1(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x, 15 ), \
@@ -227,5 +328,88 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
digest[7] = _mm_xor_si128( digest[7], H );
}
void sm3_4way_init( sm3_4way_ctx_t *ctx )
{
ctx->digest[0] = _mm_set1_epi32( 0x7380166F );
ctx->digest[1] = _mm_set1_epi32( 0x4914B2B9 );
ctx->digest[2] = _mm_set1_epi32( 0x172442D7 );
ctx->digest[3] = _mm_set1_epi32( 0xDA8A0600 );
ctx->digest[4] = _mm_set1_epi32( 0xA96F30BC );
ctx->digest[5] = _mm_set1_epi32( 0x163138AA );
ctx->digest[6] = _mm_set1_epi32( 0xE38DEE4D );
ctx->digest[7] = _mm_set1_epi32( 0xB0FB0E4E );
ctx->nblocks = 0;
ctx->num = 0;
}
void sm3_4way_update( void *cc, const void *data, size_t len )
{
sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
__m128i *block = (__m128i*)ctx->block;
__m128i *vdata = (__m128i*)data;
if ( ctx->num )
{
unsigned int left = SM3_BLOCK_SIZE - ctx->num;
if ( len < left )
{
memcpy_128( block + (ctx->num >> 2), vdata , len>>2 );
ctx->num += len;
return;
}
else
{
memcpy_128( block + (ctx->num >> 2), vdata , left>>2 );
sm3_4way_compress( ctx->digest, block );
ctx->nblocks++;
vdata += left>>2;
len -= left;
}
}
while ( len >= SM3_BLOCK_SIZE )
{
sm3_4way_compress( ctx->digest, vdata );
ctx->nblocks++;
vdata += SM3_BLOCK_SIZE>>2;
len -= SM3_BLOCK_SIZE;
}
ctx->num = len;
if ( len )
memcpy_128( block, vdata, len>>2 );
}
void sm3_4way_close( void *cc, void *dst )
{
sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
__m128i *hash = (__m128i*)dst;
__m128i *count = (__m128i*)(ctx->block + ( (SM3_BLOCK_SIZE - 8) >> 2 ) );
__m128i *block = (__m128i*)ctx->block;
int i;
block[ctx->num] = _mm_set1_epi32( 0x80 );
if ( ctx->num + 8 <= SM3_BLOCK_SIZE )
{
memset_zero_128( block + (ctx->num >> 2) + 1,
( SM3_BLOCK_SIZE - ctx->num - 8 ) >> 2 );
}
else
{
memset_zero_128( block + (ctx->num >> 2) + 1,
( SM3_BLOCK_SIZE - (ctx->num >> 2) - 1 ) );
sm3_4way_compress( ctx->digest, block );
memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
}
count[0] = mm128_bswap_32(
_mm_set1_epi32( ctx->nblocks >> 23 ) );
count[1] = mm128_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
( ctx->num << 3 ) ) );
sm3_4way_compress( ctx->digest, block );
for ( i = 0; i < 8 ; i++ )
hash[i] = mm128_bswap_32( ctx->digest[i] );
}
#endif

View File

@@ -48,14 +48,13 @@
*/
#ifndef SPH_SM3_HASH_4WAY_H
#define SPH_SM3_HASH_4WAY_H
#define SPH_SM3_HASH_4WAY_H 1
#define SM3_DIGEST_LENGTH 32
#define SM3_BLOCK_SIZE 64
#define SM3_CBLOCK (SM3_BLOCK_SIZE)
#define SM3_HMAC_SIZE (SM3_DIGEST_LENGTH)
#include <sys/types.h>
#include <stdint.h>
#include <string.h>
@@ -65,7 +64,6 @@
extern "C" {
#endif
typedef struct {
__m128i block[16] __attribute__ ((aligned (64)));
__m128i digest[8];
@@ -74,15 +72,24 @@ typedef struct {
} sm3_4way_ctx_t;
void sm3_4way_init( sm3_4way_ctx_t *ctx );
//void sm3_4way_update( sm3_4way_ctx_t *ctx, const unsigned char* data,
// size_t data_len );
//void sm3_4way_final( sm3_4way_ctx_t *ctx,
// unsigned char digest[SM3_DIGEST_LENGTH] );
void sm3_4way_compress( __m128i *digest, __m128i *block );
void sm3_4way(void *cc, const void *data, size_t len);
void sm3_4way_update(void *cc, const void *data, size_t len);
void sm3_4way_close(void *cc, void *dst);
#if defined(__AVX2__)
typedef struct {
__m256i block[16] __attribute__ ((aligned (64)));
__m256i digest[8];
uint32_t nblocks;
uint32_t num;
} sm3_8way_ctx_t;
void sm3_8way_init( sm3_8way_ctx_t *ctx );
void sm3_8way_update(void *cc, const void *data, size_t len);
void sm3_8way_close(void *cc, void *dst);
#endif
#ifdef __cplusplus
}
#endif

View File

@@ -14,21 +14,32 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"
#include "algo/shavite/shavite-hash-4way.h"
#include "algo/echo/echo-hash-4way.h"
#endif
#if defined (C11_8WAY)
typedef struct {
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
sph_shavite512_context shavite;
simd_4way_context simd;
#if defined(__VAES__)
groestl512_4way_context groestl;
shavite512_4way_context shavite;
echo_4way_context echo;
#else
hashState_groestl groestl;
sph_shavite512_context shavite;
hashState_echo echo;
#endif
} c11_8way_ctx_holder;
c11_8way_ctx_holder c11_8way_ctx;
@@ -37,20 +48,28 @@ void init_c11_8way_ctx()
{
blake512_8way_init( &c11_8way_ctx.blake );
bmw512_8way_init( &c11_8way_ctx.bmw );
init_groestl( &c11_8way_ctx.groestl, 64 );
skein512_8way_init( &c11_8way_ctx.skein );
jh512_8way_init( &c11_8way_ctx.jh );
keccak512_8way_init( &c11_8way_ctx.keccak );
luffa_4way_init( &c11_8way_ctx.luffa, 512 );
cube_4way_init( &c11_8way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &c11_8way_ctx.shavite );
simd_4way_init( &c11_8way_ctx.simd, 512 );
#if defined(__VAES__)
groestl512_4way_init( &c11_8way_ctx.groestl, 64 );
shavite512_4way_init( &c11_8way_ctx.shavite );
echo_4way_init( &c11_8way_ctx.echo, 512 );
#else
init_groestl( &c11_8way_ctx.groestl, 64 );
sph_shavite512_init( &c11_8way_ctx.shavite );
init_echo( &c11_8way_ctx.echo, 512 );
#endif
}
void c11_8way_hash( void *state, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
uint64_t vhashB[4*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -70,11 +89,21 @@ void c11_8way_hash( void *state, const void *input )
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
// Serial
#if defined(__VAES__)
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 );
groestl512_4way_init( &ctx.groestl, 64 );
groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 );
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
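// Reading of the VAES path above (comment added for clarity, not part of the
// original change): the eight 64-bit-interleaved lanes are regrouped into two
// 4x128 sets so Groestl can run 4-way twice with VAES, then
// rintrlv_4x128_8x64 restores the 8x64 interleave for the following stages.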
#else
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
@@ -91,10 +120,11 @@ void c11_8way_hash( void *state, const void *input )
memcpy( &ctx.groestl, &c11_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
// 4way
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
#endif
// 4 JH
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
@@ -107,23 +137,27 @@ void c11_8way_hash( void *state, const void *input )
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
// Serial
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
// 7 Luffa + 8 cube
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_4way_init( &ctx.luffa, 512 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 );
#if defined(__VAES__)
shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_4way_init( &ctx.shavite );
shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
#else
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &c11_8way_ctx.shavite,
@@ -155,16 +189,29 @@ void c11_8way_hash( void *state, const void *input )
sph_shavite512( &ctx.shavite, hash7, 64 );
sph_shavite512_close( &ctx.shavite, hash7 );
// 10 Simd
intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 512 );
intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 512 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhash, 512 );
intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 );
intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 );
// 11 Echo
#endif
simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 );
#if defined(__VAES__)
echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 );
echo_4way_init( &ctx.echo, 512 );
echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
#else
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &c11_8way_ctx.echo, sizeof(hashState_echo) );
@@ -189,6 +236,8 @@ void c11_8way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash7,
(const BitSequence *) hash7, 512 );
#endif
memcpy( state, hash0, 32 );
memcpy( state+ 32, hash1, 32 );
memcpy( state+ 64, hash2, 32 );
@@ -283,11 +332,11 @@ void c11_4way_hash( void *state, const void *input )
memcpy( &ctx, &c11_4way_ctx, sizeof(c11_4way_ctx) );
// 1 Blake 4way
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_update( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
@@ -306,15 +355,15 @@ void c11_4way_hash( void *state, const void *input )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 5 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// 6 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// Serial

View File

@@ -15,7 +15,7 @@ bool register_c11_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_c11;
gate->hash = (void*)&c11_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
return true;
};

View File

@@ -84,13 +84,13 @@ void timetravel_4way_hash(void *output, const void *input)
switch ( permutation[i] )
{
case 0:
blake512_4way( &ctx.blake, vhashA, dataLen );
blake512_4way_update( &ctx.blake, vhashA, dataLen );
blake512_4way_close( &ctx.blake, vhashB );
if ( i == 7 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
break;
case 1:
bmw512_4way( &ctx.bmw, vhashA, dataLen );
bmw512_4way_update( &ctx.bmw, vhashA, dataLen );
bmw512_4way_close( &ctx.bmw, vhashB );
if ( i == 7 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
@@ -112,19 +112,19 @@ void timetravel_4way_hash(void *output, const void *input)
intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 );
break;
case 3:
skein512_4way( &ctx.skein, vhashA, dataLen );
skein512_4way_update( &ctx.skein, vhashA, dataLen );
skein512_4way_close( &ctx.skein, vhashB );
if ( i == 7 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
break;
case 4:
jh512_4way( &ctx.jh, vhashA, dataLen );
jh512_4way_update( &ctx.jh, vhashA, dataLen );
jh512_4way_close( &ctx.jh, vhashB );
if ( i == 7 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
break;
case 5:
keccak512_4way( &ctx.keccak, vhashA, dataLen );
keccak512_4way_update( &ctx.keccak, vhashA, dataLen );
keccak512_4way_close( &ctx.keccak, vhashB );
if ( i == 7 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );

View File

@@ -90,13 +90,13 @@ void timetravel10_4way_hash(void *output, const void *input)
switch ( permutation[i] )
{
case 0:
blake512_4way( &ctx.blake, vhashA, dataLen );
blake512_4way_update( &ctx.blake, vhashA, dataLen );
blake512_4way_close( &ctx.blake, vhashB );
if ( i == 9 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
break;
case 1:
bmw512_4way( &ctx.bmw, vhashA, dataLen );
bmw512_4way_update( &ctx.bmw, vhashA, dataLen );
bmw512_4way_close( &ctx.bmw, vhashB );
if ( i == 9 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
@@ -118,19 +118,19 @@ void timetravel10_4way_hash(void *output, const void *input)
intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 );
break;
case 3:
skein512_4way( &ctx.skein, vhashA, dataLen );
skein512_4way_update( &ctx.skein, vhashA, dataLen );
skein512_4way_close( &ctx.skein, vhashB );
if ( i == 9 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
break;
case 4:
jh512_4way( &ctx.jh, vhashA, dataLen );
jh512_4way_update( &ctx.jh, vhashA, dataLen );
jh512_4way_close( &ctx.jh, vhashB );
if ( i == 9 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );
break;
case 5:
keccak512_4way( &ctx.keccak, vhashA, dataLen );
keccak512_4way_update( &ctx.keccak, vhashA, dataLen );
keccak512_4way_close( &ctx.keccak, vhashB );
if ( i == 9 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 );

View File

@@ -6,6 +6,9 @@
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/echo/aes_ni/hash_api.h"
#if defined(__VAES__)
#include "algo/echo/echo-hash-4way.h"
#endif
#if defined(TRIBUS_8WAY)
@@ -14,6 +17,8 @@ static __thread jh512_8way_context ctx_mid;
void tribus_hash_8way( void *state, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
uint64_t vhashB[4*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -24,7 +29,11 @@ void tribus_hash_8way( void *state, const void *input )
uint64_t hash7[8] __attribute__ ((aligned (64)));
jh512_8way_context ctx_jh;
keccak512_8way_context ctx_keccak;
#if defined(__VAES__)
echo_4way_context ctx_echo;
#else
hashState_echo ctx_echo;
#endif
memcpy( &ctx_jh, &ctx_mid, sizeof(ctx_mid) );
jh512_8way_update( &ctx_jh, input + (64<<3), 16 );
@@ -34,10 +43,23 @@ void tribus_hash_8way( void *state, const void *input )
keccak512_8way_update( &ctx_keccak, vhash, 64 );
keccak512_8way_close( &ctx_keccak, vhash );
#if defined(__VAES__)
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
echo_4way_init( &ctx_echo, 512 );
echo_4way_update_close( &ctx_echo, vhashA, vhashA, 512 );
echo_4way_init( &ctx_echo, 512 );
echo_4way_update_close( &ctx_echo, vhashB, vhashB, 512 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
#else
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, 512 );
// hash echo serially
init_echo( &ctx_echo, 512 );
update_final_echo( &ctx_echo, (BitSequence *) hash0,
(const BitSequence *) hash0, 512 );
@@ -63,6 +85,8 @@ void tribus_hash_8way( void *state, const void *input )
update_final_echo( &ctx_echo, (BitSequence *) hash7,
(const BitSequence *) hash7, 512 );
#endif
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );

View File

@@ -2,7 +2,7 @@
bool register_tribus_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
#if defined (TRIBUS_8WAY)
gate->scanhash = (void*)&scanhash_tribus_8way;
gate->hash = (void*)&tribus_hash_8way;

View File

@@ -14,21 +14,32 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"
#include "algo/shavite/shavite-hash-4way.h"
#include "algo/echo/echo-hash-4way.h"
#endif
#if defined (X11_8WAY)
typedef struct {
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
sph_shavite512_context shavite;
simd_4way_context simd;
#if defined(__VAES__)
groestl512_4way_context groestl;
shavite512_4way_context shavite;
echo_4way_context echo;
#else
hashState_groestl groestl;
sph_shavite512_context shavite;
hashState_echo echo;
#endif
} x11_8way_ctx_holder;
x11_8way_ctx_holder x11_8way_ctx;
@@ -37,20 +48,28 @@ void init_x11_8way_ctx()
{
blake512_8way_init( &x11_8way_ctx.blake );
bmw512_8way_init( &x11_8way_ctx.bmw );
init_groestl( &x11_8way_ctx.groestl, 64 );
skein512_8way_init( &x11_8way_ctx.skein );
jh512_8way_init( &x11_8way_ctx.jh );
keccak512_8way_init( &x11_8way_ctx.keccak );
luffa_4way_init( &x11_8way_ctx.luffa, 512 );
cube_4way_init( &x11_8way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11_8way_ctx.shavite );
simd_4way_init( &x11_8way_ctx.simd, 512 );
#if defined(__VAES__)
groestl512_4way_init( &x11_8way_ctx.groestl, 64 );
shavite512_4way_init( &x11_8way_ctx.shavite );
echo_4way_init( &x11_8way_ctx.echo, 512 );
#else
init_groestl( &x11_8way_ctx.groestl, 64 );
sph_shavite512_init( &x11_8way_ctx.shavite );
init_echo( &x11_8way_ctx.echo, 512 );
#endif
}
void x11_8way_hash( void *state, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
uint64_t vhashB[4*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -59,7 +78,6 @@ void x11_8way_hash( void *state, const void *input )
uint64_t hash5[8] __attribute__ ((aligned (64)));
uint64_t hash6[8] __attribute__ ((aligned (64)));
uint64_t hash7[8] __attribute__ ((aligned (64)));
x11_8way_ctx_holder ctx;
memcpy( &ctx, &x11_8way_ctx, sizeof(x11_8way_ctx) );
blake512_8way_update( &ctx.blake, input, 80 );
@@ -68,7 +86,18 @@ void x11_8way_hash( void *state, const void *input )
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
// Serial
#if defined(__VAES__)
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 );
groestl512_4way_init( &ctx.groestl, 64 );
groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 );
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -95,10 +124,11 @@ void x11_8way_hash( void *state, const void *input )
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
// 4way
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
#endif
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
@@ -108,20 +138,26 @@ void x11_8way_hash( void *state, const void *input )
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
// Luffa + Cube
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 );
#if defined(__VAES__)
shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_4way_init( &ctx.shavite );
shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
#else
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
@@ -154,13 +190,28 @@ void x11_8way_hash( void *state, const void *input )
sph_shavite512( &ctx.shavite, hash7, 64 );
sph_shavite512_close( &ctx.shavite, hash7 );
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 );
intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 );
#endif
simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 );
#if defined(__VAES__)
echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 );
echo_4way_init( &ctx.echo, 512 );
echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
#else
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
@@ -186,6 +237,8 @@ void x11_8way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash7,
(const BitSequence *) hash7, 512 );
#endif
memcpy( state, hash0, 32 );
memcpy( state+ 32, hash1, 32 );
memcpy( state+ 64, hash2, 32 );
@@ -282,11 +335,11 @@ void x11_4way_hash( void *state, const void *input )
memcpy( &ctx, &x11_4way_ctx, sizeof(x11_4way_ctx) );
// 1 Blake 4way
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_update( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
@@ -305,15 +358,15 @@ void x11_4way_hash( void *state, const void *input )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

View File

@@ -15,7 +15,7 @@ bool register_x11_algo( algo_gate_t *gate )
gate->scanhash = (void*)&scanhash_x11;
gate->hash = (void*)&x11_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT ;
return true;
};

View File

@@ -85,12 +85,12 @@ void x11evo_4way_hash( void *state, const void *input )
switch ( idx )
{
case 0:
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_update( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 );
break;
case 1:
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
if ( i >= len-1 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 );
@@ -112,19 +112,19 @@ void x11evo_4way_hash( void *state, const void *input )
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 64<<3 );
break;
case 3:
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
if ( i >= len-1 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 );
break;
case 4:
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
if ( i >= len-1 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 );
break;
case 5:
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
if ( i >= len-1 )
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 );

View File

@@ -15,22 +15,33 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"
#include "algo/shavite/shavite-hash-4way.h"
#include "algo/echo/echo-hash-4way.h"
#endif
#if defined (X11GOST_8WAY)
typedef struct {
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
sph_gost512_context gost;
luffa_4way_context luffa;
cube_4way_context cube;
sph_shavite512_context shavite;
simd_4way_context simd;
#if defined(__VAES__)
groestl512_4way_context groestl;
shavite512_4way_context shavite;
echo_4way_context echo;
#else
hashState_groestl groestl;
sph_shavite512_context shavite;
hashState_echo echo;
#endif
} x11gost_8way_ctx_holder;
x11gost_8way_ctx_holder x11gost_8way_ctx;
@@ -39,21 +50,29 @@ void init_x11gost_8way_ctx()
{
blake512_8way_init( &x11gost_8way_ctx.blake );
bmw512_8way_init( &x11gost_8way_ctx.bmw );
init_groestl( &x11gost_8way_ctx.groestl, 64 );
skein512_8way_init( &x11gost_8way_ctx.skein );
jh512_8way_init( &x11gost_8way_ctx.jh );
keccak512_8way_init( &x11gost_8way_ctx.keccak );
sph_gost512_init( &x11gost_8way_ctx.gost );
luffa_4way_init( &x11gost_8way_ctx.luffa, 512 );
cube_4way_init( &x11gost_8way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11gost_8way_ctx.shavite );
simd_4way_init( &x11gost_8way_ctx.simd, 512 );
#if defined(__VAES__)
groestl512_4way_init( &x11gost_8way_ctx.groestl, 64 );
shavite512_4way_init( &x11gost_8way_ctx.shavite );
echo_4way_init( &x11gost_8way_ctx.echo, 512 );
#else
init_groestl( &x11gost_8way_ctx.groestl, 64 );
sph_shavite512_init( &x11gost_8way_ctx.shavite );
init_echo( &x11gost_8way_ctx.echo, 512 );
#endif
}
void x11gost_8way_hash( void *state, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
uint64_t vhashB[4*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -72,7 +91,18 @@ void x11gost_8way_hash( void *state, const void *input )
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
// Serial
#if defined(__VAES__)
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 );
groestl512_4way_init( &ctx.groestl, 64 );
groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 );
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
@@ -99,10 +129,11 @@ void x11gost_8way_hash( void *state, const void *input )
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
// 4way
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
#endif
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
@@ -140,20 +171,28 @@ void x11gost_8way_hash( void *state, const void *input )
sph_gost512( &ctx.gost, hash7, 64 );
sph_gost512_close( &ctx.gost, hash7 );
intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 );
intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 );
// Luffa + Cube
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_4way_init( &ctx.luffa, 512 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
sph_shavite512( &ctx.shavite, hash0, 64 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 );
#if defined(__VAES__)
shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_4way_init( &ctx.shavite );
shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
#else
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x11gost_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
@@ -184,14 +223,29 @@ void x11gost_8way_hash( void *state, const void *input )
sph_shavite512( &ctx.shavite, hash7, 64 );
sph_shavite512_close( &ctx.shavite, hash7 );
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 );
intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 );
#endif
simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 );
#if defined(__VAES__)
echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 );
echo_4way_init( &ctx.echo, 512 );
echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
#else
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x11gost_8way_ctx.echo, sizeof(hashState_echo) );
@@ -216,6 +270,8 @@ void x11gost_8way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash7,
(const BitSequence *) hash7, 512 );
#endif
memcpy( state, hash0, 32 );
memcpy( state+ 32, hash1, 32 );
memcpy( state+ 64, hash2, 32 );
@@ -310,10 +366,10 @@ void x11gost_4way_hash( void *state, const void *input )
x11gost_4way_ctx_holder ctx;
memcpy( &ctx, &x11gost_4way_ctx, sizeof(x11gost_4way_ctx) );
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_update( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
@@ -333,13 +389,13 @@ void x11gost_4way_hash( void *state, const void *input )
// 4way
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial

View File

@@ -1,7 +1,4 @@
#include "x12-gate.h"
#if defined(X12_4WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -14,11 +11,273 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
//#include "algo/fugue/sph_fugue.h"
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"
#include "algo/shavite/shavite-hash-4way.h"
#include "algo/echo/echo-hash-4way.h"
#endif
#if defined(X12_8WAY)
typedef struct {
blake512_8way_context blake;
bmw512_8way_context bmw;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
simd_4way_context simd;
hamsi512_8way_context hamsi;
#if defined(__VAES__)
groestl512_4way_context groestl;
shavite512_4way_context shavite;
echo_4way_context echo;
#else
hashState_groestl groestl;
sph_shavite512_context shavite;
hashState_echo echo;
#endif
} x12_8way_ctx_holder;
x12_8way_ctx_holder x12_8way_ctx __attribute__ ((aligned (64)));
void init_x12_8way_ctx()
{
blake512_8way_init( &x12_8way_ctx.blake );
bmw512_8way_init( &x12_8way_ctx.bmw );
skein512_8way_init( &x12_8way_ctx.skein );
jh512_8way_init( &x12_8way_ctx.jh );
keccak512_8way_init( &x12_8way_ctx.keccak );
luffa_4way_init( &x12_8way_ctx.luffa, 512 );
cube_4way_init( &x12_8way_ctx.cube, 512, 16, 32 );
simd_4way_init( &x12_8way_ctx.simd, 512 );
hamsi512_8way_init( &x12_8way_ctx.hamsi );
#if defined(__VAES__)
groestl512_4way_init( &x12_8way_ctx.groestl, 64 );
shavite512_4way_init( &x12_8way_ctx.shavite );
echo_4way_init( &x12_8way_ctx.echo, 512 );
#else
init_groestl( &x12_8way_ctx.groestl, 64 );
sph_shavite512_init( &x12_8way_ctx.shavite );
init_echo( &x12_8way_ctx.echo, 512 );
#endif
};
void x12_8way_hash( void *state, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
uint64_t vhashB[4*8] __attribute__ ((aligned (64)));
x12_8way_ctx_holder ctx;
memcpy( &ctx, &x12_8way_ctx, sizeof(x12_8way_ctx) );
blake512_8way_update( &ctx.blake, input, 80 );
blake512_8way_close( &ctx.blake, vhash );
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );
luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 64 );
#if defined(__VAES__)
shavite512_4way_init( &ctx.shavite );
shavite512_4way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_4way_init( &ctx.shavite );
shavite512_4way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
#else
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t hash4[8] __attribute__ ((aligned (64)));
uint64_t hash5[8] __attribute__ ((aligned (64)));
uint64_t hash6[8] __attribute__ ((aligned (64)));
uint64_t hash7[8] __attribute__ ((aligned (64)));
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash4, 64 );
sph_shavite512_close( &ctx.shavite, hash4 );
memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash5, 64 );
sph_shavite512_close( &ctx.shavite, hash5 );
memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash6, 64 );
sph_shavite512_close( &ctx.shavite, hash6 );
memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash7, 64 );
sph_shavite512_close( &ctx.shavite, hash7 );
intrlv_4x128_512( vhashA, hash0, hash1, hash2, hash3 );
intrlv_4x128_512( vhashB, hash4, hash5, hash6, hash7 );
#endif
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhashB, vhashB, 512 );
#if defined(__VAES__)
echo_4way_init( &ctx.echo, 512 );
echo_4way_update_close( &ctx.echo, vhashA, vhashA, 512 );
echo_4way_init( &ctx.echo, 512 );
echo_4way_update_close( &ctx.echo, vhashB, vhashB, 512 );
groestl512_4way_init( &ctx.groestl, 64 );
groestl512_4way_update_close( &ctx.groestl, vhashA, vhashA, 512 );
groestl512_4way_init( &ctx.groestl, 64 );
groestl512_4way_update_close( &ctx.groestl, vhashB, vhashB, 512 );
rintrlv_4x128_8x64( vhash, vhashA, vhashB, 512 );
#else
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhashA );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhashB );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash4,
(const BitSequence *) hash4, 512 );
memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash5,
(const BitSequence *) hash5, 512 );
memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash6,
(const BitSequence *) hash6, 512 );
memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash7,
(const BitSequence *) hash7, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
#endif
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, state );
}
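/* Illustrative sketch, not part of the upstream source: the
   rintrlv_8x64_4x128 / rintrlv_4x128_8x64 calls above re-interleave the same
   eight 512-bit lanes between the 64-bit-word layout used by the 8-way
   Blake/BMW/Skein/JH/Keccak/Hamsi stages and the 128-bit-word layout used by
   the 4-way Luffa/Cube/Shavite/SIMD/Echo/Groestl stages. A plain-C
   illustration of the forward conversion, assuming little-endian 64-bit
   halves within each 128-bit word: */
static void rintrlv_8x64_4x128_sketch( uint64_t *dstA, uint64_t *dstB,
                                        const uint64_t *src, int bit_len )
{
   const int words = bit_len / 64;               // 64-bit words per lane
   for ( int lane = 0; lane < 8; lane++ )
   {
      uint64_t *dst = ( lane < 4 ) ? dstA : dstB;   // lanes 0-3 -> A, 4-7 -> B
      const int l = lane & 3;
      for ( int i = 0; i < words; i++ )
         // 8x64 source: word i of this lane sits at index i*8 + lane.
         // 4x128 dest:  128-bit word i/2 of lane l starts at (i/2)*8 + l*2,
         //              with i&1 selecting its low or high 64-bit half.
         dst[ (i/2)*8 + l*2 + (i&1) ] = src[ i*8 + lane ];
   }
}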
int scanhash_x12_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[16*8] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[49]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
__m512i *noncev = (__m512i*)vdata + 9; // aligned
int thr_id = mythr->id;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do {
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
x12_8way_hash( hash, vdata );
for ( int lane = 0; lane < 8; lane++ )
if ( hash7[ lane<<1 ] < Htarg )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( ( n < max_nonce-8 ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
return 0;
}
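/* Scalar equivalent, offered as an illustration only and based on an assumed
   data layout rather than the actual AVX-512 macros: the *noncev update in
   the scan loop above amounts to writing the byte-swapped nonces n..n+7 into
   32-bit word 19 of each lane of the 8x64-interleaved block data, leaving
   the other header words untouched. */
static void set_lane_nonces_sketch( uint32_t *vdata, uint32_t n )
{
   for ( uint32_t lane = 0; lane < 8; lane++ )
      // 64-bit word 9 of this lane holds header words 18 and 19; the nonce
      // (word 19) occupies its high 32-bit half.
      vdata[ (9*8 + lane)*2 + 1 ] = __builtin_bswap32( n + lane );
}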
#elif defined(X12_4WAY)
typedef struct {
blake512_4way_context blake;
@@ -63,45 +322,13 @@ void x12_4way_hash( void *state, const void *input )
x12_4way_ctx_holder ctx;
memcpy( &ctx, &x12_4way_ctx, sizeof(x12_4way_ctx) );
// 1 Blake
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_update( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_update( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way 64 bit
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
intrlv_2x128( vhash, hash0, hash1, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
dintrlv_2x128( hash0, hash1, vhash, 512 );
@@ -110,7 +337,6 @@ void x12_4way_hash( void *state, const void *input )
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
dintrlv_2x128( hash2, hash3, vhash, 512 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
@@ -119,7 +345,6 @@ void x12_4way_hash( void *state, const void *input )
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x12_4way_ctx.shavite,
@@ -135,7 +360,6 @@ void x12_4way_hash( void *state, const void *input )
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
intrlv_2x128( vhash, hash0, hash1, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_2x128( hash0, hash1, vhash, 512 );
@@ -144,22 +368,26 @@ void x12_4way_hash( void *state, const void *input )
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_2x128( hash2, hash3, vhash, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// 12 Hamsi parallel 4way 32 bit
// Parallel 4way 64 bit
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
skein512_4way_update( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_update( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_update( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
hamsi512_4way_update( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
dintrlv_4x64( state, state+32, state+64, state+96, vhash, 256 );

View File

@@ -2,7 +2,11 @@
bool register_x12_algo( algo_gate_t* gate )
{
#if defined (X12_4WAY)
#if defined (X12_8WAY)
init_x12_8way_ctx();
gate->scanhash = (void*)&scanhash_x12_8way;
gate->hash = (void*)&x12_8way_hash;
#elif defined (X12_4WAY)
init_x12_4way_ctx();
gate->scanhash = (void*)&scanhash_x12_4way;
gate->hash = (void*)&x12_4way_hash;
@@ -11,7 +15,7 @@ bool register_x12_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_x12;
gate->hash = (void*)&x12hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
return true;
};

View File

@@ -4,29 +4,36 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define X12_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X12_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X12_4WAY 1
#endif
bool register_x12_algo( algo_gate_t* gate );
#if defined(X12_4WAY)
#if defined(X12_8WAY)
void x12_8way_hash( void *state, const void *input );
int scanhash_x12_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x12_8way_ctx();
#elif defined(X12_4WAY)
void x12_4way_hash( void *state, const void *input );
int scanhash_x12_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x12_4way_ctx();
#endif
#else
void x12hash( void *state, const void *input );
int scanhash_x12( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x12_ctx();
#endif
#endif
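A small self-contained check, not part of the diff, that mirrors the gate's new compile-time dispatch: building it with the same CFLAGS used for cpuminer-opt shows which x12 code path the compiler will select on that machine.

#include <stdio.h>
int main(void)
{
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
   puts( "X12_8WAY: 8-way AVX-512 path" );
#elif defined(__AVX2__) && defined(__AES__)
   puts( "X12_4WAY: 4-way AVX2 + AES-NI path" );
#else
   puts( "reference path (no 4way/8way)" );
#endif
   return 0;
}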

Some files were not shown because too many files have changed in this diff.