This commit is contained in:
Jay D Dee
2019-12-21 13:19:29 -05:00
parent a17ff6f189
commit c65b0ff7a6
72 changed files with 9090 additions and 1336 deletions

View File

@@ -1,12 +1,14 @@
Requirements:
1. Requirements:
---------------
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
supported.
A 64-bit Linux operating system is required. Apple is not supported.
Building on linux prerequisites:
2. Building on linux prerequisites:
-----------------------------------
It is assumed users know how to install packages on their system and
are able to compile standard source packages. This is basic Linux and
@@ -20,41 +22,74 @@ http://askubuntu.com/questions/457526/how-to-install-cpuminer-in-ubuntu
Install any additional dependencies needed by cpuminer-opt. The list below
are some of the ones that may not be in the default install and need to
be installed manually. There may be others, read the error messages they
will give a clue as to the missing package.
be installed manually. There may be others; read the compiler error messages,
they will give a clue as to the missing package.
The following command should install everything you need on Debian based
distributions such as Ubuntu. Fedora and other distributions may have similar
but different package names.
sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev
$ sudo apt-get install build-essential automake libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev git
SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
openssl 1.1.0e or higher. Add one of the following, depending on the
compiler version, to CFLAGS:
"-march=native" or "-march=znver1" or "-msha".
openssl 1.1.0e or higher. Add one of the following to CFLAGS for SHA
support depending on your CPU and compiler version:
"-march=native" is always the best choice
"-march=znver1" for Ryzen 1000 & 2000 series, znver2 for 3000.
"-msha" Add SHA to other tuning options
Additional instructions for static compilation can be found here:
https://lxadm.com/Static_compilation_of_cpuminer
Static builds should only be considered in a homogeneous HW and SW environment.
Local builds will always have the best performance and compatibility.
Extract cpuminer source.
3. Download cpuminer-opt
------------------------
tar xvzf cpuminer-opt-x.y.z.tar.gz
cd cpuminer-opt-x.y.z
Download the source code for the latest release from the official repository.
Run ./build.sh to build on Linux or execute the following commands.
https://github.com/JayDDee/cpuminer-opt/releases
./autogen.sh
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
make
Extract the source code.
Start mining.
$ tar xvzf cpuminer-opt-x.y.z.tar.gz
Alternatively it can be cloned from git.
$ git clone https://github.com/JayDDee/cpuminer-opt.git
4. Build cpuminer-opt
---------------------
It is recommended to build with default options; this will usually
produce the best results.
$ ./build.sh
or execute the following commands:
$ ./autogen.sh
$ CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
$ make -j n
n is the number of compile threads, typically the number of CPU cores.
5. Start mining.
----------------
$ ./cpuminer -a algo -o url -u username -p password
./cpuminer -a algo -o url -u username -p password
Windows
-------
See also INSTALL_WINDOWS
The following procedure is obsolete and uses an old compiler.
Precompiled Windows binaries are built on a Linux host using Mingw
with a more recent compiler than the following Windows hosted procedure.

View File

@@ -124,6 +124,8 @@ cpuminer_SOURCES = \
algo/luffa/luffa-hash-2way.c \
algo/lyra2/lyra2.c \
algo/lyra2/sponge.c \
algo/lyra2/sponge-2way.c \
algo/lyra2/lyra2-hash-2way.c \
algo/lyra2/lyra2-gate.c \
algo/lyra2/lyra2rev2.c \
algo/lyra2/lyra2rev2-4way.c \

View File

@@ -1,6 +1,8 @@
cpuminer-opt is a console program run from the command line using the
keyboard, not the mouse.
See also README.md for a list of supported algorithms.
Security warning
----------------
@@ -31,7 +33,21 @@ not supported. FreeBSD YMMV.
Change Log
----------
v3.10.2
v3.10.5
AVX512 for x17, sonoa, xevan, hmq1725, lyra2rev3, lyra2rev2.
Faster hmq1725 AVX2.
v3.10.4
AVX512 for x16r, x16rv2, x16rt, x16s, x16rt-veil (veil).
v3.10.3
AVX512 for x12, x13, x14, x15.
Fixed x12 AVX2 invalid shares.
v3.10.2
AVX512 added for bmw512, c11, phi1612 (phi), qubit, skunk, x11, x11gost (sib).
Fixed c11 AVX2 invalid shares.

View File

@@ -184,9 +184,9 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
#include <immintrin.h>
#define rotr32 mm256_swap32_64
#define rotr24 mm256_ror3x8_64
#define rotr16 mm256_ror1x16_64
#define rotr32( x ) mm256_ror_64( x, 32 )
#define rotr24( x ) mm256_ror_64( x, 24 )
#define rotr16( x ) mm256_ror_64( x, 16 )
#define rotr63( x ) mm256_rol_64( x, 1 )
//#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
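/* Illustrative sketch, not the project's actual macro: a generic 64-bit lane
   rotate such as mm256_ror_64 can be composed from two shifts and an OR with
   plain AVX2.  The real macro may use cheaper byte shuffles for the
   byte-aligned counts (32, 24, 16). */
static inline __m256i ror64_avx2_sketch( __m256i x, const int c )
{
   return _mm256_or_si256( _mm256_srli_epi64( x, c ),
                           _mm256_slli_epi64( x, 64 - c ) );
}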

View File

@@ -70,19 +70,22 @@ typedef struct {
// Default 14 rounds
typedef blake_4way_small_context blake256_4way_context;
void blake256_4way_init(void *ctx);
void blake256_4way(void *ctx, const void *data, size_t len);
void blake256_4way_update(void *ctx, const void *data, size_t len);
#define blake256_4way blake256_4way_update
void blake256_4way_close(void *ctx, void *dst);
// 14 rounds, blake, decred
typedef blake_4way_small_context blake256r14_4way_context;
void blake256r14_4way_init(void *cc);
void blake256r14_4way(void *cc, const void *data, size_t len);
void blake256r14_4way_update(void *cc, const void *data, size_t len);
#define blake256r14_4way blake256r14_4way_update
void blake256r14_4way_close(void *cc, void *dst);
// 8 rounds, blakecoin, vanilla
typedef blake_4way_small_context blake256r8_4way_context;
void blake256r8_4way_init(void *cc);
void blake256r8_4way(void *cc, const void *data, size_t len);
void blake256r8_4way_update(void *cc, const void *data, size_t len);
#define blake256r8_4way blake256r8_4way_update
void blake256r8_4way_close(void *cc, void *dst);
#ifdef __AVX2__
@@ -100,19 +103,21 @@ typedef struct {
// Default 14 rounds
typedef blake_8way_small_context blake256_8way_context;
void blake256_8way_init(void *cc);
void blake256_8way(void *cc, const void *data, size_t len);
void blake256_8way_update(void *cc, const void *data, size_t len);
#define blake256_8way blake256_8way_update
void blake256_8way_close(void *cc, void *dst);
// 14 rounds, blake, decred
typedef blake_8way_small_context blake256r14_8way_context;
void blake256r14_8way_init(void *cc);
void blake256r14_8way(void *cc, const void *data, size_t len);
void blake256r14_8way_update(void *cc, const void *data, size_t len);
void blake256r14_8way_close(void *cc, void *dst);
// 8 rounds, blakecoin, vanilla
typedef blake_8way_small_context blake256r8_8way_context;
void blake256r8_8way_init(void *cc);
void blake256r8_8way(void *cc, const void *data, size_t len);
void blake256r8_8way_update(void *cc, const void *data, size_t len);
#define blake256r8_8way blake256r8_8way_update
void blake256r8_8way_close(void *cc, void *dst);
// Blake-512 4 way

View File

@@ -634,7 +634,7 @@ do { \
m256_const1_64( 0x082EFA98082EFA98 ) ); \
VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
m256_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
shuf_bswap32 = m256_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203, \
shuf_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
M0 = _mm256_shuffle_epi8( * buf , shuf_bswap32 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
@@ -1184,7 +1184,7 @@ blake256_16way_update(void *cc, const void *data, size_t len)
}
void
blake256_16way_close_update(void *cc, void *dst)
blake256_16way_close(void *cc, void *dst)
{
blake32_16way_close(cc, 0, 0, dst, 8);
}
@@ -1259,7 +1259,7 @@ blake256_8way_init(void *cc)
}
void
blake256_8way(void *cc, const void *data, size_t len)
blake256_8way_update(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}
@@ -1279,7 +1279,7 @@ void blake256r14_4way_init(void *cc)
}
void
blake256r14_4way(void *cc, const void *data, size_t len)
blake256r14_4way_update(void *cc, const void *data, size_t len)
{
blake32_4way(cc, data, len);
}
@@ -1298,7 +1298,7 @@ void blake256r14_8way_init(void *cc)
}
void
blake256r14_8way(void *cc, const void *data, size_t len)
blake256r14_8way_update(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}
@@ -1318,7 +1318,7 @@ void blake256r8_4way_init(void *cc)
}
void
blake256r8_4way(void *cc, const void *data, size_t len)
blake256r8_4way_update(void *cc, const void *data, size_t len)
{
blake32_4way(cc, data, len);
}
@@ -1337,7 +1337,7 @@ void blake256r8_8way_init(void *cc)
}
void
blake256r8_8way(void *cc, const void *data, size_t len)
blake256r8_8way_update(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}

View File

@@ -874,6 +874,57 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
mm256_xor4( qt[24], qt[25], qt[26], qt[27] ),
mm256_xor4( qt[28], qt[29], qt[30], qt[31] ) ) );
#define DH1L( m, sl, sr, a, b, c ) \
_mm256_add_epi32( \
_mm256_xor_si256( M[m], \
_mm256_xor_si256( _mm256_slli_epi32( xh, sl ), \
_mm256_srli_epi32( qt[a], sr ) ) ), \
_mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
#define DH1R( m, sl, sr, a, b, c ) \
_mm256_add_epi32( \
_mm256_xor_si256( M[m], \
_mm256_xor_si256( _mm256_srli_epi32( xh, sl ), \
_mm256_slli_epi32( qt[a], sr ) ) ), \
_mm256_xor_si256( _mm256_xor_si256( xl, qt[b] ), qt[c] ) )
#define DH2L( m, rl, sl, h, a, b, c ) \
_mm256_add_epi32( _mm256_add_epi32( \
mm256_rol_32( dH[h], rl ), \
_mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
_mm256_xor_si256( _mm256_slli_epi32( xl, sl ), \
_mm256_xor_si256( qt[b], qt[c] ) ) );
#define DH2R( m, rl, sr, h, a, b, c ) \
_mm256_add_epi32( _mm256_add_epi32( \
mm256_rol_32( dH[h], rl ), \
_mm256_xor_si256( _mm256_xor_si256( xh, qt[a] ), M[m] )), \
_mm256_xor_si256( _mm256_srli_epi32( xl, sr ), \
_mm256_xor_si256( qt[b], qt[c] ) ) );
dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 );
dH[ 2] = DH1R( 2, 5, 5, 18, 26, 2 );
dH[ 3] = DH1R( 3, 1, 5, 19, 27, 3 );
dH[ 4] = DH1R( 4, 3, 0, 20, 28, 4 );
dH[ 5] = DH1L( 5, 6, 6, 21, 29, 5 );
dH[ 6] = DH1R( 6, 4, 6, 22, 30, 6 );
dH[ 7] = DH1R( 7, 11, 2, 23, 31, 7 );
dH[ 8] = DH2L( 8, 9, 8, 4, 24, 23, 8 );
dH[ 9] = DH2R( 9, 10, 6, 5, 25, 16, 9 );
dH[10] = DH2L( 10, 11, 6, 6, 26, 17, 10 );
dH[11] = DH2L( 11, 12, 4, 7, 27, 18, 11 );
dH[12] = DH2R( 12, 13, 3, 0, 28, 19, 12 );
dH[13] = DH2R( 13, 14, 4, 1, 29, 20, 13 );
dH[14] = DH2R( 14, 15, 7, 2, 30, 21, 14 );
dH[15] = DH2R( 15, 16, 2, 3, 31, 22, 15 );
#undef DH1L
#undef DH1R
#undef DH2L
#undef DH2R
/*
dH[ 0] = _mm256_add_epi32(
_mm256_xor_si256( M[0],
_mm256_xor_si256( _mm256_slli_epi32( xh, 5 ),
@@ -954,6 +1005,7 @@ void compress_small_8way( const __m256i *M, const __m256i H[16],
_mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
_mm256_xor_si256( _mm256_srli_epi32( xl, 2 ),
_mm256_xor_si256( qt[22], qt[15] ) ) );
*/
}
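/* Worked example of the DH1L macro above, per 32-bit lane:
   dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 ) expands to
   dH[ 0] = ( M[0] ^ ( (xh << 5) ^ (qt[16] >> 5) ) ) + ( xl ^ qt[24] ^ qt[0] ),
   matching the first expression in the commented-out original code above. */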
static const __m256i final_s8[16] =

View File

@@ -64,10 +64,10 @@ static void transform_4way( cube_4way_context *sp )
x1 = _mm512_xor_si512( x1, x5 );
x2 = _mm512_xor_si512( x2, x6 );
x3 = _mm512_xor_si512( x3, x7 );
x4 = mm512_swap64_128( x4 );
x5 = mm512_swap64_128( x5 );
x6 = mm512_swap64_128( x6 );
x7 = mm512_swap64_128( x7 );
x4 = mm512_swap128_64( x4 );
x5 = mm512_swap128_64( x5 );
x6 = mm512_swap128_64( x6 );
x7 = mm512_swap128_64( x7 );
x4 = _mm512_add_epi32( x0, x4 );
x5 = _mm512_add_epi32( x1, x5 );
x6 = _mm512_add_epi32( x2, x6 );
@@ -82,10 +82,10 @@ static void transform_4way( cube_4way_context *sp )
x1 = _mm512_xor_si512( x1, x5 );
x2 = _mm512_xor_si512( x2, x6 );
x3 = _mm512_xor_si512( x3, x7 );
x4 = mm512_swap32_64( x4 );
x5 = mm512_swap32_64( x5 );
x6 = mm512_swap32_64( x6 );
x7 = mm512_swap32_64( x7 );
x4 = mm512_swap64_32( x4 );
x5 = mm512_swap64_32( x5 );
x6 = mm512_swap64_32( x6 );
x7 = mm512_swap64_32( x7 );
}
_mm512_store_si512( (__m512i*)sp->h, x0 );
@@ -239,10 +239,10 @@ static void transform_2way( cube_2way_context *sp )
x1 = _mm256_xor_si256( x1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = mm256_swap64_128( x4 );
x5 = mm256_swap64_128( x5 );
x6 = mm256_swap64_128( x6 );
x7 = mm256_swap64_128( x7 );
x4 = mm256_swap128_64( x4 );
x5 = mm256_swap128_64( x5 );
x6 = mm256_swap128_64( x6 );
x7 = mm256_swap128_64( x7 );
x4 = _mm256_add_epi32( x0, x4 );
x5 = _mm256_add_epi32( x1, x5 );
x6 = _mm256_add_epi32( x2, x6 );
@@ -257,10 +257,10 @@ static void transform_2way( cube_2way_context *sp )
x1 = _mm256_xor_si256( x1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = mm256_swap32_64( x4 );
x5 = mm256_swap32_64( x5 );
x6 = mm256_swap32_64( x6 );
x7 = mm256_swap32_64( x7 );
x4 = mm256_swap64_32( x4 );
x5 = mm256_swap64_32( x5 );
x6 = mm256_swap64_32( x6 );
x7 = mm256_swap64_32( x7 );
}
_mm256_store_si256( (__m256i*)sp->h, x0 );
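/* Naming note with assumed AVX2 equivalents (the project's own swap macros
   are presumably defined in its simd utility headers): mm256_swap128_64 swaps
   the two 64-bit halves inside each 128-bit lane, and mm256_swap64_32 swaps
   the two 32-bit halves inside each 64-bit element, e.g.
      swap128_64(x) ~ _mm256_shuffle_epi32( x, 0x4e )
      swap64_32(x)  ~ _mm256_shuffle_epi32( x, 0xb1 )                        */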

View File

@@ -39,8 +39,8 @@ static void transform( cubehashParam *sp )
x1 = mm256_rol_32( y0, 7 );
x0 = _mm256_xor_si256( x0, x2 );
x1 = _mm256_xor_si256( x1, x3 );
x2 = mm256_swap64_128( x2 );
x3 = mm256_swap64_128( x3 );
x2 = mm256_swap128_64( x2 );
x3 = mm256_swap128_64( x3 );
x2 = _mm256_add_epi32( x0, x2 );
x3 = _mm256_add_epi32( x1, x3 );
y0 = mm256_swap_128( x0 );
@@ -49,8 +49,8 @@ static void transform( cubehashParam *sp )
x1 = mm256_rol_32( y1, 11 );
x0 = _mm256_xor_si256( x0, x2 );
x1 = _mm256_xor_si256( x1, x3 );
x2 = mm256_swap32_64( x2 );
x3 = mm256_swap32_64( x3 );
x2 = mm256_swap64_32( x2 );
x3 = mm256_swap64_32( x3 );
}
_mm256_store_si256( (__m256i*)sp->x, x0 );

View File

@@ -528,6 +528,346 @@ static const sph_u32 T512[64][16] = {
SPH_C32(0xe7e00a94) }
};
#define s0 m0
#define s1 c0
#define s2 m1
#define s3 c1
#define s4 c2
#define s5 m2
#define s6 c3
#define s7 m3
#define s8 m4
#define s9 c4
#define sA m5
#define sB c5
#define sC c6
#define sD m6
#define sE c7
#define sF m7
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// Hamsi 8 way
#define INPUT_BIG8 \
do { \
__m512i db = *buf; \
const uint64_t *tp = (uint64_t*)&T512[0][0]; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \
for ( int u = 0; u < 64; u++ ) \
{ \
__m512i dm = _mm512_and_si512( db, m512_one_64 ) ; \
dm = mm512_negate_32( _mm512_or_si512( dm, \
_mm512_slli_epi64( dm, 32 ) ) ); \
m0 = _mm512_xor_si512( m0, _mm512_and_si512( dm, \
m512_const1_64( tp[0] ) ) ); \
m1 = _mm512_xor_si512( m1, _mm512_and_si512( dm, \
m512_const1_64( tp[1] ) ) ); \
m2 = _mm512_xor_si512( m2, _mm512_and_si512( dm, \
m512_const1_64( tp[2] ) ) ); \
m3 = _mm512_xor_si512( m3, _mm512_and_si512( dm, \
m512_const1_64( tp[3] ) ) ); \
m4 = _mm512_xor_si512( m4, _mm512_and_si512( dm, \
m512_const1_64( tp[4] ) ) ); \
m5 = _mm512_xor_si512( m5, _mm512_and_si512( dm, \
m512_const1_64( tp[5] ) ) ); \
m6 = _mm512_xor_si512( m6, _mm512_and_si512( dm, \
m512_const1_64( tp[6] ) ) ); \
m7 = _mm512_xor_si512( m7, _mm512_and_si512( dm, \
m512_const1_64( tp[7] ) ) ); \
tp += 8; \
db = _mm512_srli_epi64( db, 1 ); \
} \
} while (0)
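/* Scalar sketch (illustrative names only) of what INPUT_BIG8 computes per
   64-bit message lane: every set bit u selects row u of T512 (8 64-bit words)
   and XORs it into m0..m7.  m[] is assumed zeroed by the caller, as in the
   macro. */
static inline void hamsi_input_sketch( uint64_t m[8], uint64_t db,
                                       const uint64_t *tp )
{
   for ( int u = 0; u < 64; u++ )
   {
      const uint64_t mask = (uint64_t)0 - ( db & 1 );  // all ones if bit set
      for ( int k = 0; k < 8; k++ )
         m[k] ^= mask & tp[k];
      tp += 8;
      db >>= 1;
   }
}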
#define SBOX8( a, b, c, d ) \
do { \
__m512i t; \
t = a; \
a = _mm512_and_si512( a, c ); \
a = _mm512_xor_si512( a, d ); \
c = _mm512_xor_si512( c, b ); \
c = _mm512_xor_si512( c, a ); \
d = _mm512_or_si512( d, t ); \
d = _mm512_xor_si512( d, b ); \
t = _mm512_xor_si512( t, c ); \
b = d; \
d = _mm512_or_si512( d, t ); \
d = _mm512_xor_si512( d, a ); \
a = _mm512_and_si512( a, b ); \
t = _mm512_xor_si512( t, a ); \
b = _mm512_xor_si512( b, d ); \
b = _mm512_xor_si512( b, t ); \
a = c; \
c = b; \
b = d; \
d = mm512_not( t ); \
} while (0)
#define L8( a, b, c, d ) \
do { \
a = mm512_rol_32( a, 13 ); \
c = mm512_rol_32( c, 3 ); \
b = _mm512_xor_si512( b, _mm512_xor_si512( a, c ) ); \
d = _mm512_xor_si512( d, _mm512_xor_si512( c, \
_mm512_slli_epi32( a, 3 ) ) ); \
b = mm512_rol_32( b, 1 ); \
d = mm512_rol_32( d, 7 ); \
a = _mm512_xor_si512( a, _mm512_xor_si512( b, d ) ); \
c = _mm512_xor_si512( c, _mm512_xor_si512( d, \
_mm512_slli_epi32( b, 7 ) ) ); \
a = mm512_rol_32( a, 5 ); \
c = mm512_rol_32( c, 22 ); \
} while (0)
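/* Scalar view of the L8 diffusion step above, per 32-bit lane:
      a = rol32( a, 13 );  c = rol32( c, 3 );
      b ^= a ^ c;          d ^= c ^ ( a << 3 );
      b = rol32( b, 1 );   d = rol32( d, 7 );
      a ^= b ^ d;          c ^= d ^ ( b << 7 );
      a = rol32( a, 5 );   c = rol32( c, 22 );                               */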
#define DECL_STATE_BIG8 \
__m512i c0, c1, c2, c3, c4, c5, c6, c7; \
#define READ_STATE_BIG8(sc) \
do { \
c0 = sc->h[0x0]; \
c1 = sc->h[0x1]; \
c2 = sc->h[0x2]; \
c3 = sc->h[0x3]; \
c4 = sc->h[0x4]; \
c5 = sc->h[0x5]; \
c6 = sc->h[0x6]; \
c7 = sc->h[0x7]; \
} while (0)
#define WRITE_STATE_BIG8(sc) \
do { \
sc->h[0x0] = c0; \
sc->h[0x1] = c1; \
sc->h[0x2] = c2; \
sc->h[0x3] = c3; \
sc->h[0x4] = c4; \
sc->h[0x5] = c5; \
sc->h[0x6] = c6; \
sc->h[0x7] = c7; \
} while (0)
#define ROUND_BIG8(rc, alpha) \
do { \
__m512i t0, t1, t2, t3; \
s0 = _mm512_xor_si512( s0, m512_const1_64( \
( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \
s1 = _mm512_xor_si512( s1, m512_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \
s2 = _mm512_xor_si512( s2, m512_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \
s3 = _mm512_xor_si512( s3, m512_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \
s4 = _mm512_xor_si512( s4, m512_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \
s5 = _mm512_xor_si512( s5, m512_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \
s6 = _mm512_xor_si512( s6, m512_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \
s7 = _mm512_xor_si512( s7, m512_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \
s8 = _mm512_xor_si512( s8, m512_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \
s9 = _mm512_xor_si512( s9, m512_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \
sA = _mm512_xor_si512( sA, m512_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \
sB = _mm512_xor_si512( sB, m512_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \
sC = _mm512_xor_si512( sC, m512_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \
sD = _mm512_xor_si512( sD, m512_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \
sE = _mm512_xor_si512( sE, m512_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \
sF = _mm512_xor_si512( sF, m512_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \
\
SBOX8( s0, s4, s8, sC ); \
SBOX8( s1, s5, s9, sD ); \
SBOX8( s2, s6, sA, sE ); \
SBOX8( s3, s7, sB, sF ); \
\
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), \
_mm512_bslli_epi128( s5, 4 ) ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sD, 4 ), \
_mm512_bslli_epi128( sE, 4 ) ); \
L8( s0, t1, s9, t3 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t1, 4 ) ); \
s5 = _mm512_mask_blend_epi32( 0x5555, s5, _mm512_bsrli_epi128( t1, 4 ) ); \
sD = _mm512_mask_blend_epi32( 0xaaaa, sD, _mm512_bslli_epi128( t3, 4 ) ); \
sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t3, 4 ) ); \
\
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \
_mm512_bslli_epi128( s6, 4 ) ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sE, 4 ), \
_mm512_bslli_epi128( sF, 4 ) ); \
L8( s1, t1, sA, t3 ); \
s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \
s6 = _mm512_mask_blend_epi32( 0x5555, s6, _mm512_bsrli_epi128( t1, 4 ) ); \
sE = _mm512_mask_blend_epi32( 0xaaaa, sE, _mm512_bslli_epi128( t3, 4 ) ); \
sF = _mm512_mask_blend_epi32( 0x5555, sF, _mm512_bsrli_epi128( t3, 4 ) ); \
\
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s6, 4 ), \
_mm512_bslli_epi128( s7, 4 ) ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sF, 4 ), \
_mm512_bslli_epi128( sC, 4 ) ); \
L8( s2, t1, sB, t3 ); \
s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( t1, 4 ) ); \
s7 = _mm512_mask_blend_epi32( 0x5555, s7, _mm512_bsrli_epi128( t1, 4 ) ); \
sF = _mm512_mask_blend_epi32( 0xaaaa, sF, _mm512_bslli_epi128( t3, 4 ) ); \
sC = _mm512_mask_blend_epi32( 0x5555, sC, _mm512_bsrli_epi128( t3, 4 ) ); \
\
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s7, 4 ), \
_mm512_bslli_epi128( s4, 4 ) ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sC, 4 ), \
_mm512_bslli_epi128( sD, 4 ) ); \
L8( s3, t1, s8, t3 ); \
s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, _mm512_bslli_epi128( t1, 4 ) ); \
s4 = _mm512_mask_blend_epi32( 0x5555, s4, _mm512_bsrli_epi128( t1, 4 ) ); \
sC = _mm512_mask_blend_epi32( 0xaaaa, sC, _mm512_bslli_epi128( t3, 4 ) ); \
sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t3, 4 ) ); \
\
t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, _mm512_bslli_epi128( s8, 4 ) ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s2, 4 ), sA ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s3, 4 ), \
_mm512_bslli_epi128( sB, 4 ) ); \
L8( t0, t1, t2, t3 ); \
s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \
s8 = _mm512_mask_blend_epi32( 0x5555, s8, _mm512_bsrli_epi128( t0, 4 ) ); \
s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \
s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \
s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, _mm512_bslli_epi128( t2, 4 ) ); \
sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \
s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, _mm512_bslli_epi128( t3, 4 ) ); \
sB = _mm512_mask_blend_epi32( 0x5555, sB, _mm512_bsrli_epi128( t3, 4 ) ); \
\
t0 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), sC ); \
t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \
_mm512_bslli_epi128( sD, 4 ) ); \
t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( sE, 4 ) ); \
t3 = _mm512_mask_blend_epi32( 0xaaaa, s7, sF ); \
L8( t0, t1, t2, t3 ); \
s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t0, 4 ) ); \
sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t0 ); \
s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \
sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t1, 4 ) ); \
s6 = _mm512_mask_blend_epi32( 0x5555, s6, t2 ); \
sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t2, 4 ) ); \
s7 = _mm512_mask_blend_epi32( 0x5555, s7, t3 ); \
sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \
} while (0)
#define P_BIG8 \
do { \
ROUND_BIG8(0, alpha_n); \
ROUND_BIG8(1, alpha_n); \
ROUND_BIG8(2, alpha_n); \
ROUND_BIG8(3, alpha_n); \
ROUND_BIG8(4, alpha_n); \
ROUND_BIG8(5, alpha_n); \
} while (0)
#define PF_BIG8 \
do { \
ROUND_BIG8( 0, alpha_f); \
ROUND_BIG8( 1, alpha_f); \
ROUND_BIG8( 2, alpha_f); \
ROUND_BIG8( 3, alpha_f); \
ROUND_BIG8( 4, alpha_f); \
ROUND_BIG8( 5, alpha_f); \
ROUND_BIG8( 6, alpha_f); \
ROUND_BIG8( 7, alpha_f); \
ROUND_BIG8( 8, alpha_f); \
ROUND_BIG8( 9, alpha_f); \
ROUND_BIG8(10, alpha_f); \
ROUND_BIG8(11, alpha_f); \
} while (0)
#define T_BIG8 \
do { /* order is important */ \
c7 = sc->h[ 0x7 ] = _mm512_xor_si512( sc->h[ 0x7 ], sB ); \
c6 = sc->h[ 0x6 ] = _mm512_xor_si512( sc->h[ 0x6 ], sA ); \
c5 = sc->h[ 0x5 ] = _mm512_xor_si512( sc->h[ 0x5 ], s9 ); \
c4 = sc->h[ 0x4 ] = _mm512_xor_si512( sc->h[ 0x4 ], s8 ); \
c3 = sc->h[ 0x3 ] = _mm512_xor_si512( sc->h[ 0x3 ], s3 ); \
c2 = sc->h[ 0x2 ] = _mm512_xor_si512( sc->h[ 0x2 ], s2 ); \
c1 = sc->h[ 0x1 ] = _mm512_xor_si512( sc->h[ 0x1 ], s1 ); \
c0 = sc->h[ 0x0 ] = _mm512_xor_si512( sc->h[ 0x0 ], s0 ); \
} while (0)
void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num )
{
DECL_STATE_BIG8
uint32_t tmp = num << 6;
sc->count_low = SPH_T32( sc->count_low + tmp );
sc->count_high += (sph_u32)( (num >> 13) >> 13 );
if ( sc->count_low < tmp )
sc->count_high++;
READ_STATE_BIG8( sc );
while ( num-- > 0 )
{
__m512i m0, m1, m2, m3, m4, m5, m6, m7;
INPUT_BIG8;
P_BIG8;
T_BIG8;
buf++;
}
WRITE_STATE_BIG8( sc );
}
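/* The two 32-bit counters updated above track the total number of bits
   absorbed; each 8-byte block adds 64 bits.  Equivalent 64-bit view (sketch):
      uint64_t bits = ( (uint64_t)sc->count_high << 32 ) | sc->count_low;
      bits += (uint64_t)num << 6;                                            */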
void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )
{
__m512i m0, m1, m2, m3, m4, m5, m6, m7;
DECL_STATE_BIG8
READ_STATE_BIG8( sc );
INPUT_BIG8;
PF_BIG8;
T_BIG8;
WRITE_STATE_BIG8( sc );
}
void hamsi512_8way_init( hamsi_8way_big_context *sc )
{
sc->partial_len = 0;
sc->count_high = sc->count_low = 0;
sc->h[0] = m512_const1_64( 0x6c70617273746565 );
sc->h[1] = m512_const1_64( 0x656e62656b204172 );
sc->h[2] = m512_const1_64( 0x302c206272672031 );
sc->h[3] = m512_const1_64( 0x3434362c75732032 );
sc->h[4] = m512_const1_64( 0x3030312020422d33 );
sc->h[5] = m512_const1_64( 0x656e2d484c657576 );
sc->h[6] = m512_const1_64( 0x6c65652c65766572 );
sc->h[7] = m512_const1_64( 0x6769756d2042656c );
}
void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,
size_t len )
{
__m512i *vdata = (__m512i*)data;
hamsi_8way_big( sc, vdata, len>>3 );
vdata += ( (len& ~(size_t)7) >> 3 );
len &= (size_t)7;
memcpy_512( sc->buf, vdata, len>>3 );
sc->partial_len = len;
}
void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
{
__m512i pad[1];
int ch, cl;
sph_enc32be( &ch, sc->count_high );
sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) );
pad[0] = _mm512_set_epi32( cl, ch, cl, ch, cl, ch, cl, ch,
cl, ch, cl, ch, cl, ch, cl, ch );
// pad[0] = m512_const2_32( cl, ch );
sc->buf[0] = m512_const1_64( 0x80 );
hamsi_8way_big( sc, sc->buf, 1 );
hamsi_8way_big_final( sc, pad );
mm512_block_bswap_32( (__m512i*)dst, sc->h );
}
#endif // AVX512
// Hamsi 4 way
#define INPUT_BIG \
do { \
@@ -627,6 +967,7 @@ do { \
sc->h[0x7] = c7; \
} while (0)
/*
#define s0 m0
#define s1 c0
#define s2 m1
@@ -643,42 +984,28 @@ do { \
#define sD m6
#define sE c7
#define sF m7
*/
#define ROUND_BIG(rc, alpha) \
do { \
__m256i t0, t1, t2, t3; \
s0 = _mm256_xor_si256( s0, m256_const1_64( \
( ( (uint64_t)( (rc) ^ alpha[1] ) << 32 ) ) | (uint64_t)alpha[0] ) ); \
s1 = _mm256_xor_si256( s1, m256_const1_64( \
( (uint64_t)alpha[ 3] << 32 ) | (uint64_t)alpha[ 2] ) ); \
s2 = _mm256_xor_si256( s2, m256_const1_64( \
( (uint64_t)alpha[ 5] << 32 ) | (uint64_t)alpha[ 4] ) ); \
s3 = _mm256_xor_si256( s3, m256_const1_64( \
( (uint64_t)alpha[ 7] << 32 ) | (uint64_t)alpha[ 6] ) ); \
s4 = _mm256_xor_si256( s4, m256_const1_64( \
( (uint64_t)alpha[ 9] << 32 ) | (uint64_t)alpha[ 8] ) ); \
s5 = _mm256_xor_si256( s5, m256_const1_64( \
( (uint64_t)alpha[11] << 32 ) | (uint64_t)alpha[10] ) ); \
s6 = _mm256_xor_si256( s6, m256_const1_64( \
( (uint64_t)alpha[13] << 32 ) | (uint64_t)alpha[12] ) ); \
s7 = _mm256_xor_si256( s7, m256_const1_64( \
( (uint64_t)alpha[15] << 32 ) | (uint64_t)alpha[14] ) ); \
s8 = _mm256_xor_si256( s8, m256_const1_64( \
( (uint64_t)alpha[17] << 32 ) | (uint64_t)alpha[16] ) ); \
s9 = _mm256_xor_si256( s9, m256_const1_64( \
( (uint64_t)alpha[19] << 32 ) | (uint64_t)alpha[18] ) ); \
sA = _mm256_xor_si256( sA, m256_const1_64( \
( (uint64_t)alpha[21] << 32 ) | (uint64_t)alpha[20] ) ); \
sB = _mm256_xor_si256( sB, m256_const1_64( \
( (uint64_t)alpha[23] << 32 ) | (uint64_t)alpha[22] ) ); \
sC = _mm256_xor_si256( sC, m256_const1_64( \
( (uint64_t)alpha[25] << 32 ) | (uint64_t)alpha[24] ) ); \
sD = _mm256_xor_si256( sD, m256_const1_64( \
( (uint64_t)alpha[27] << 32 ) | (uint64_t)alpha[26] ) ); \
sE = _mm256_xor_si256( sE, m256_const1_64( \
( (uint64_t)alpha[29] << 32 ) | (uint64_t)alpha[28] ) ); \
sF = _mm256_xor_si256( sF, m256_const1_64( \
( (uint64_t)alpha[31] << 32 ) | (uint64_t)alpha[30] ) ); \
( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \
s1 = _mm256_xor_si256( s1, m256_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \
s2 = _mm256_xor_si256( s2, m256_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \
s3 = _mm256_xor_si256( s3, m256_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \
s4 = _mm256_xor_si256( s4, m256_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \
s5 = _mm256_xor_si256( s5, m256_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \
s6 = _mm256_xor_si256( s6, m256_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \
s7 = _mm256_xor_si256( s7, m256_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \
s8 = _mm256_xor_si256( s8, m256_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \
s9 = _mm256_xor_si256( s9, m256_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \
sA = _mm256_xor_si256( sA, m256_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \
sB = _mm256_xor_si256( sB, m256_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \
sC = _mm256_xor_si256( sC, m256_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \
sD = _mm256_xor_si256( sD, m256_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \
sE = _mm256_xor_si256( sE, m256_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \
sF = _mm256_xor_si256( sF, m256_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \
\
SBOX( s0, s4, s8, sC ); \
SBOX( s1, s5, s9, sD ); \

View File

@@ -60,9 +60,32 @@ typedef struct {
typedef hamsi_4way_big_context hamsi512_4way_context;
void hamsi512_4way_init( hamsi512_4way_context *sc );
void hamsi512_4way( hamsi512_4way_context *sc, const void *data, size_t len );
void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data,
size_t len );
#define hamsi512_4way hamsi512_4way_update
void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
typedef struct {
__m512i h[8];
__m512i buf[1];
size_t partial_len;
sph_u32 count_high, count_low;
} hamsi_8way_big_context;
typedef hamsi_8way_big_context hamsi512_8way_context;
void hamsi512_8way_init( hamsi512_8way_context *sc );
void hamsi512_8way_update( hamsi512_8way_context *sc, const void *data,
size_t len );
void hamsi512_8way_close( hamsi512_8way_context *sc, void *dst );
#endif
#ifdef __cplusplus
}
#endif

View File

@@ -0,0 +1,115 @@
/* $Id: haval_helper.c 218 2010-06-08 17:06:34Z tp $ */
/*
* Helper code, included (three times !) by HAVAL implementation.
*
* TODO: try to merge this with md_helper.c.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#undef SPH_XCAT
#define SPH_XCAT(a, b) SPH_XCAT_(a, b)
#undef SPH_XCAT_
#define SPH_XCAT_(a, b) a ## b
static void
SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_update)
( haval_8way_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
unsigned current;
current = (unsigned)sc->count_low & 127U;
while ( len > 0 )
{
unsigned clen;
uint32_t clow, clow2;
clen = 128U - current;
if ( clen > len )
clen = len;
memcpy_256( sc->buf + (current>>2), vdata, clen>>2 );
vdata += clen>>2;
current += clen;
len -= clen;
if ( current == 128U )
{
DSTATE_8W;
IN_PREPARE_8W(sc->buf);
RSTATE_8W;
SPH_XCAT(CORE_8W, PASSES)(INW_8W);
WSTATE_8W;
current = 0;
}
clow = sc->count_low;
clow2 = clow + clen;
sc->count_low = clow2;
if ( clow2 < clow )
sc->count_high ++;
}
}
static void
SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_close)( haval_8way_context *sc,
void *dst)
{
unsigned current;
DSTATE_8W;
current = (unsigned)sc->count_low & 127UL;
sc->buf[ current>>2 ] = m256_one_32;
current += 4;
RSTATE_8W;
if ( current > 116UL )
{
memset_zero_256( sc->buf + ( current>>2 ), (128UL-current) >> 2 );
do
{
IN_PREPARE_8W(sc->buf);
SPH_XCAT(CORE_8W, PASSES)(INW_8W);
} while (0);
current = 0;
}
uint32_t t1, t2;
memset_zero_256( sc->buf + ( current>>2 ), (116UL-current) >> 2 );
t1 = 0x01 | (PASSES << 3);
t2 = sc->olen << 3;
sc->buf[ 116>>2 ] = _mm256_set1_epi32( ( t1 << 16 ) | ( t2 << 24 ) );
sc->buf[ 120>>2 ] = _mm256_set1_epi32( sc->count_low << 3 );
sc->buf[ 124>>2 ] = _mm256_set1_epi32( (sc->count_high << 3)
| (sc->count_low >> 29) );
do
{
IN_PREPARE_8W(sc->buf);
SPH_XCAT(CORE_8W, PASSES)(INW_8W);
} while (0);
WSTATE_8W;
haval_8way_out( sc, dst );
}
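/* Worked example of the tail encoding above for HAVAL-256/5 (PASSES = 5,
   olen = 8 32-bit words): t1 = 0x01 | (5 << 3) = 0x29 and t2 = 8 << 3 = 0x40,
   so the word written at byte offset 116 is (0x29 << 16) | (0x40 << 24)
   = 0x40290000, followed by the 64-bit message bit count in the last two
   words of the block. */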

View File

@@ -40,7 +40,7 @@
#include <string.h>
#include "haval-hash-4way.h"
// won't compile with sse4.2
// won't compile with sse4.2; not a problem, it's only used with AVX2 4 way.
//#if defined (__SSE4_2__)
#if defined(__AVX__)
@@ -518,6 +518,301 @@ do { \
#define INMSG(i) msg[i]
#if defined(__AVX2__)
// Haval-256 8 way 32 bit avx2
#define F1_8W(x6, x5, x4, x3, x2, x1, x0) \
_mm256_xor_si256( x0, \
_mm256_xor_si256( _mm256_and_si256(_mm256_xor_si256( x0, x4 ), x1 ), \
_mm256_xor_si256( _mm256_and_si256( x2, x5 ), \
_mm256_and_si256( x3, x6 ) ) ) ) \
#define F2_8W(x6, x5, x4, x3, x2, x1, x0) \
_mm256_xor_si256( \
_mm256_and_si256( x2, \
_mm256_xor_si256( _mm256_andnot_si256( x3, x1 ), \
_mm256_xor_si256( _mm256_and_si256( x4, x5 ), \
_mm256_xor_si256( x6, x0 ) ) ) ), \
_mm256_xor_si256( \
_mm256_and_si256( x4, _mm256_xor_si256( x1, x5 ) ), \
_mm256_xor_si256( _mm256_and_si256( x3, x5 ), x0 ) ) ) \
#define F3_8W(x6, x5, x4, x3, x2, x1, x0) \
_mm256_xor_si256( \
_mm256_and_si256( x3, \
_mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
_mm256_xor_si256( x6, x0 ) ) ), \
_mm256_xor_si256( _mm256_xor_si256(_mm256_and_si256( x1, x4 ), \
_mm256_and_si256( x2, x5 ) ), x0 ) )
#define F4_8W(x6, x5, x4, x3, x2, x1, x0) \
_mm256_xor_si256( \
_mm256_xor_si256( \
_mm256_and_si256( x3, \
_mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x2 ), \
_mm256_or_si256( x4, x6 ) ), x5 ) ), \
_mm256_and_si256( x4, \
_mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( mm256_not(x2), x5 ), \
_mm256_xor_si256( x1, x6 ) ), x0 ) ) ), \
_mm256_xor_si256( _mm256_and_si256( x2, x6 ), x0 ) )
#define F5_8W(x6, x5, x4, x3, x2, x1, x0) \
_mm256_xor_si256( \
_mm256_and_si256( x0, \
mm256_not( _mm256_xor_si256( \
_mm256_and_si256( _mm256_and_si256( x1, x2 ), x3 ), x5 ) ) ), \
_mm256_xor_si256( _mm256_xor_si256( _mm256_and_si256( x1, x4 ), \
_mm256_and_si256( x2, x5 ) ), \
_mm256_and_si256( x3, x6 ) ) )
#define FP3_1_8W(x6, x5, x4, x3, x2, x1, x0) \
F1_8W(x1, x0, x3, x5, x6, x2, x4)
#define FP3_2_8W(x6, x5, x4, x3, x2, x1, x0) \
F2_8W(x4, x2, x1, x0, x5, x3, x6)
#define FP3_3_8W(x6, x5, x4, x3, x2, x1, x0) \
F3_8W(x6, x1, x2, x3, x4, x5, x0)
#define FP4_1_8W(x6, x5, x4, x3, x2, x1, x0) \
F1_8W(x2, x6, x1, x4, x5, x3, x0)
#define FP4_2_8W(x6, x5, x4, x3, x2, x1, x0) \
F2_8W(x3, x5, x2, x0, x1, x6, x4)
#define FP4_3_8W(x6, x5, x4, x3, x2, x1, x0) \
F3_8W(x1, x4, x3, x6, x0, x2, x5)
#define FP4_4_8W(x6, x5, x4, x3, x2, x1, x0) \
F4_8W(x6, x4, x0, x5, x2, x1, x3)
#define FP5_1_8W(x6, x5, x4, x3, x2, x1, x0) \
F1_8W(x3, x4, x1, x0, x5, x2, x6)
#define FP5_2_8W(x6, x5, x4, x3, x2, x1, x0) \
F2_8W(x6, x2, x1, x0, x3, x4, x5)
#define FP5_3_8W(x6, x5, x4, x3, x2, x1, x0) \
F3_8W(x2, x6, x0, x4, x3, x1, x5)
#define FP5_4_8W(x6, x5, x4, x3, x2, x1, x0) \
F4_8W(x1, x5, x3, x2, x0, x4, x6)
#define FP5_5_8W(x6, x5, x4, x3, x2, x1, x0) \
F5_8W(x2, x5, x0, x6, x4, x3, x1)
#define STEP_8W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) \
do { \
__m256i t = FP ## n ## _ ## p ## _8W(x6, x5, x4, x3, x2, x1, x0); \
x7 = _mm256_add_epi32( _mm256_add_epi32( mm256_ror_32( t, 7 ), \
mm256_ror_32( x7, 11 ) ), \
_mm256_add_epi32( w, _mm256_set1_epi32( c ) ) ); \
} while (0)
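/* Scalar view of one HAVAL step as performed by STEP_8W, per 32-bit lane:
      t  = Fn_p( x6, x5, x4, x3, x2, x1, x0 );   // pass dependent function
      x7 = ror32( t, 7 ) + ror32( x7, 11 ) + w + c;
   where w is the message word and c the round constant. */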
#define PASS1_8W(n, in) do { \
unsigned pass_count; \
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
STEP_8W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
in(pass_count + 0), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
in(pass_count + 1), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
in(pass_count + 2), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
in(pass_count + 3), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
in(pass_count + 4), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
in(pass_count + 5), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
in(pass_count + 6), SPH_C32(0x00000000)); \
STEP_8W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
in(pass_count + 7), SPH_C32(0x00000000)); \
} \
} while (0)
#define PASSG_8W(p, n, in) do { \
unsigned pass_count; \
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
STEP_8W(n, p, s7, s6, s5, s4, s3, s2, s1, s0, \
in(MP ## p[pass_count + 0]), \
RK ## p[pass_count + 0]); \
STEP_8W(n, p, s6, s5, s4, s3, s2, s1, s0, s7, \
in(MP ## p[pass_count + 1]), \
RK ## p[pass_count + 1]); \
STEP_8W(n, p, s5, s4, s3, s2, s1, s0, s7, s6, \
in(MP ## p[pass_count + 2]), \
RK ## p[pass_count + 2]); \
STEP_8W(n, p, s4, s3, s2, s1, s0, s7, s6, s5, \
in(MP ## p[pass_count + 3]), \
RK ## p[pass_count + 3]); \
STEP_8W(n, p, s3, s2, s1, s0, s7, s6, s5, s4, \
in(MP ## p[pass_count + 4]), \
RK ## p[pass_count + 4]); \
STEP_8W(n, p, s2, s1, s0, s7, s6, s5, s4, s3, \
in(MP ## p[pass_count + 5]), \
RK ## p[pass_count + 5]); \
STEP_8W(n, p, s1, s0, s7, s6, s5, s4, s3, s2, \
in(MP ## p[pass_count + 6]), \
RK ## p[pass_count + 6]); \
STEP_8W(n, p, s0, s7, s6, s5, s4, s3, s2, s1, \
in(MP ## p[pass_count + 7]), \
RK ## p[pass_count + 7]); \
} \
} while (0)
#define PASS2_8W(n, in) PASSG_8W(2, n, in)
#define PASS3_8W(n, in) PASSG_8W(3, n, in)
#define PASS4_8W(n, in) PASSG_8W(4, n, in)
#define PASS5_8W(n, in) PASSG_8W(5, n, in)
#define SAVE_STATE_8W \
__m256i u0, u1, u2, u3, u4, u5, u6, u7; \
do { \
u0 = s0; \
u1 = s1; \
u2 = s2; \
u3 = s3; \
u4 = s4; \
u5 = s5; \
u6 = s6; \
u7 = s7; \
} while (0)
#define UPDATE_STATE_8W \
do { \
s0 = _mm256_add_epi32( s0, u0 ); \
s1 = _mm256_add_epi32( s1, u1 ); \
s2 = _mm256_add_epi32( s2, u2 ); \
s3 = _mm256_add_epi32( s3, u3 ); \
s4 = _mm256_add_epi32( s4, u4 ); \
s5 = _mm256_add_epi32( s5, u5 ); \
s6 = _mm256_add_epi32( s6, u6 ); \
s7 = _mm256_add_epi32( s7, u7 ); \
} while (0)
#define CORE_8W5(in) do { \
SAVE_STATE_8W; \
PASS1_8W(5, in); \
PASS2_8W(5, in); \
PASS3_8W(5, in); \
PASS4_8W(5, in); \
PASS5_8W(5, in); \
UPDATE_STATE_8W; \
} while (0)
#define DSTATE_8W __m256i s0, s1, s2, s3, s4, s5, s6, s7
#define RSTATE_8W \
do { \
s0 = sc->s0; \
s1 = sc->s1; \
s2 = sc->s2; \
s3 = sc->s3; \
s4 = sc->s4; \
s5 = sc->s5; \
s6 = sc->s6; \
s7 = sc->s7; \
} while (0)
#define WSTATE_8W \
do { \
sc->s0 = s0; \
sc->s1 = s1; \
sc->s2 = s2; \
sc->s3 = s3; \
sc->s4 = s4; \
sc->s5 = s5; \
sc->s6 = s6; \
sc->s7 = s7; \
} while (0)
static void
haval_8way_init( haval_8way_context *sc, unsigned olen, unsigned passes )
{
sc->s0 = m256_const1_32( 0x243F6A88UL );
sc->s1 = m256_const1_32( 0x85A308D3UL );
sc->s2 = m256_const1_32( 0x13198A2EUL );
sc->s3 = m256_const1_32( 0x03707344UL );
sc->s4 = m256_const1_32( 0xA4093822UL );
sc->s5 = m256_const1_32( 0x299F31D0UL );
sc->s6 = m256_const1_32( 0x082EFA98UL );
sc->s7 = m256_const1_32( 0xEC4E6C89UL );
sc->olen = olen;
sc->passes = passes;
sc->count_high = 0;
sc->count_low = 0;
}
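/* The eight constants above are HAVAL's standard initial values (the first
   words of the fractional part of pi), broadcast across all eight lanes. */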
#define IN_PREPARE_8W(indata) const __m256i *const load_ptr_8w = (indata)
#define INW_8W(i) load_ptr_8w[ i ]
static void
haval_8way_out( haval_8way_context *sc, void *dst )
{
__m256i *buf = (__m256i*)dst;
DSTATE_8W;
RSTATE_8W;
buf[0] = s0;
buf[1] = s1;
buf[2] = s2;
buf[3] = s3;
buf[4] = s4;
buf[5] = s5;
buf[6] = s6;
buf[7] = s7;
}
#undef PASSES
#define PASSES 5
#include "haval-8way-helper.c"
#define API_8W(xxx, y) \
void \
haval ## xxx ## _ ## y ## _8way_init(void *cc) \
{ \
haval_8way_init(cc, xxx >> 5, y); \
} \
\
void \
haval ## xxx ## _ ## y ## _8way_update (void *cc, const void *data, size_t len) \
{ \
haval ## y ## _8way_update(cc, data, len); \
} \
\
void \
haval ## xxx ## _ ## y ## _8way_close(void *cc, void *dst) \
{ \
haval ## y ## _8way_close(cc, dst); \
} \
API_8W(256, 5)
#define RVAL_8W \
do { \
s0 = val[0]; \
s1 = val[1]; \
s2 = val[2]; \
s3 = val[3]; \
s4 = val[4]; \
s5 = val[5]; \
s6 = val[6]; \
s7 = val[7]; \
} while (0)
#define WVAL_8W \
do { \
val[0] = s0; \
val[1] = s1; \
val[2] = s2; \
val[3] = s3; \
val[4] = s4; \
val[5] = s5; \
val[6] = s6; \
val[7] = s7; \
} while (0)
#define INMSG_8W(i) msg[i]
#endif // AVX2
#ifdef __cplusplus
}
#endif

View File

@@ -59,7 +59,7 @@
*/
#ifndef HAVAL_HASH_4WAY_H__
#define HAVAL_HASH_4WAY_H__
#define HAVAL_HASH_4WAY_H__ 1
#if defined(__AVX__)
@@ -84,10 +84,30 @@ typedef haval_4way_context haval256_5_4way_context;
void haval256_5_4way_init( void *cc );
void haval256_5_4way( void *cc, const void *data, size_t len );
void haval256_5_4way_update( void *cc, const void *data, size_t len );
#define haval256_5_4way haval256_5_4way_update
void haval256_5_4way_close( void *cc, void *dst );
#if defined(__AVX2__)
typedef struct {
__m256i buf[32];
__m256i s0, s1, s2, s3, s4, s5, s6, s7;
unsigned olen, passes;
uint32_t count_high, count_low;
} haval_8way_context __attribute__ ((aligned (64)));
typedef haval_8way_context haval256_5_8way_context;
void haval256_5_8way_init( void *cc );
void haval256_5_8way_update( void *cc, const void *data, size_t len );
void haval256_5_8way_close( void *cc, void *dst );
#endif // AVX2
#ifdef __cplusplus
}
#endif

View File

@@ -44,8 +44,13 @@ bool lyra2rev3_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int size = ROW_LEN_BYTES * 4; // nRows;
int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
#if defined(LYRA2REV3_16WAY)
// l2v3_wholeMatrix = _mm_malloc( 2*size, 128 );
l2v3_wholeMatrix = _mm_malloc( 2*size, 64 );
init_lyra2rev3_16way_ctx();;
#else
l2v3_wholeMatrix = _mm_malloc( size, 64 );
#if defined (LYRA2REV3_8WAY)
init_lyra2rev3_8way_ctx();;
@@ -53,13 +58,17 @@ bool lyra2rev3_thread_init()
init_lyra2rev3_4way_ctx();;
#else
init_lyra2rev3_ctx();
#endif
#endif
return l2v3_wholeMatrix;
}
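/* Worked size example, assuming BLOCK_LEN_INT64 is 12 as in the reference
   Lyra2 code: ROW_LEN_INT64 = 12 * 4 = 48, ROW_LEN_BYTES = 384, so a 4 row
   matrix is 1536 bytes per lane.  The 16-way build allocates 2 * size because
   the low level code works on 2-way interleaved rows (see the
   2 * ROW_LEN_INT64 indexing in lyra2.c below). */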
bool register_lyra2rev3_algo( algo_gate_t* gate )
{
#if defined (LYRA2REV3_8WAY)
#if defined(LYRA2REV3_16WAY)
gate->scanhash = (void*)&scanhash_lyra2rev3_16way;
gate->hash = (void*)&lyra2rev3_16way_hash;
#elif defined (LYRA2REV3_8WAY)
gate->scanhash = (void*)&scanhash_lyra2rev3_8way;
gate->hash = (void*)&lyra2rev3_8way_hash;
#elif defined (LYRA2REV3_4WAY)
@@ -69,7 +78,7 @@ bool register_lyra2rev3_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_lyra2rev3;
gate->hash = (void*)&lyra2rev3_hash;
#endif
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
gate->miner_thread_init = (void*)&lyra2rev3_thread_init;
opt_target_factor = 256.0;
return true;
@@ -85,10 +94,14 @@ bool lyra2rev2_thread_init()
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
#if defined (LYRA2REV2_8WAY)
l2v2_wholeMatrix = _mm_malloc( 2 * size, 64 ); // 2 way
init_lyra2rev2_8way_ctx();;
#elif defined (LYRA2REV2_4WAY)
l2v2_wholeMatrix = _mm_malloc( size, 64 );
#if defined (LYRA2REV2_4WAY)
init_lyra2rev2_4way_ctx();;
#else
l2v2_wholeMatrix = _mm_malloc( size, 64 );
init_lyra2rev2_ctx();
#endif
return l2v2_wholeMatrix;
@@ -96,14 +109,17 @@ bool lyra2rev2_thread_init()
bool register_lyra2rev2_algo( algo_gate_t* gate )
{
#if defined (LYRA2REV2_4WAY)
#if defined (LYRA2REV2_8WAY)
gate->scanhash = (void*)&scanhash_lyra2rev2_8way;
gate->hash = (void*)&lyra2rev2_8way_hash;
#elif defined (LYRA2REV2_4WAY)
gate->scanhash = (void*)&scanhash_lyra2rev2_4way;
gate->hash = (void*)&lyra2rev2_4way_hash;
#else
gate->scanhash = (void*)&scanhash_lyra2rev2;
gate->hash = (void*)&lyra2rev2_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
opt_target_factor = 256.0;
return true;

View File

@@ -5,18 +5,27 @@
#include <stdint.h>
#include "lyra2.h"
#if defined(__AVX2__)
#define LYRA2REV3_8WAY
#endif
#if defined(__SSE2__)
#define LYRA2REV3_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define LYRA2REV3_16WAY 1
#elif defined(__AVX2__)
#define LYRA2REV3_8WAY 1
#elif defined(__SSE2__)
#define LYRA2REV3_4WAY 1
#endif
extern __thread uint64_t* l2v3_wholeMatrix;
bool register_lyra2rev3_algo( algo_gate_t* gate );
#if defined(LYRA2REV3_8WAY)
#if defined(LYRA2REV3_16WAY)
void lyra2rev3_16way_hash( void *state, const void *input );
int scanhash_lyra2rev3_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_lyra2rev3_16way_ctx();
#elif defined(LYRA2REV3_8WAY)
void lyra2rev3_8way_hash( void *state, const void *input );
int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce,
@@ -41,15 +50,24 @@ bool init_lyra2rev3_ctx();
//////////////////////////////////
#if defined(__AVX2__)
#define LYRA2REV2_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define LYRA2REV2_8WAY 1
#elif defined(__AVX2__)
#define LYRA2REV2_4WAY 1
#endif
extern __thread uint64_t* l2v2_wholeMatrix;
bool register_lyra2rev2_algo( algo_gate_t* gate );
#if defined(LYRA2REV2_4WAY)
#if defined(LYRA2REV2_8WAY)
void lyra2rev2_8way_hash( void *state, const void *input );
int scanhash_lyra2rev2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_lyra2rev2_8way_ctx();
#elif defined(LYRA2REV2_4WAY)
void lyra2rev2_4way_hash( void *state, const void *input );
int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,

View File

@@ -26,6 +26,19 @@
#include "lyra2.h"
#include "sponge.h"
// LYRA2RE 8 cols 8 rows used by lyra2re, allium, phi2, x22i, x25x.
//
// LYRA2REV2 4 cols 4 rows used by lyra2rev2.
//
// LYRA2REV3 4 cols 4 rows with an extra twist in calculating
// rowa in the wandering phase. Used by lyra2rev3.
//
// LYRA2Z various cols & rows and supports 80 byte input. Used by lyra2z,
// lyra2z330, lyra2h.
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
/**
* Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
* whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits,
@@ -46,176 +59,137 @@
* @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
*/
int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
const uint64_t timeCost, const uint64_t nRows,
const uint64_t nCols )
// For lyra2rev3.
// Convert a simple offset to an index into 2-way interleaved data.
// Good for the state and the 4 row matrix.
// index = ( ( off / 4 ) * 8 ) + ( off mod 4 )
#define offset_to_index( o ) \
( ( ( (uint64_t)( (o) & 0xf) / 4 ) * 8 ) + ( (o) % 4 ) )
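/* Worked examples of offset_to_index for 2-way interleaved data, where each
   4-word (256-bit) block of lane 0 is followed by the matching block of
   lane 1:  offset 3 -> (0*8)+3 = 3,  offset 5 -> (1*8)+1 = 9,
   offset 13 -> (3*8)+1 = 25. */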
int LYRA2REV2_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
const void *pwd, const uint64_t pwdlen, const uint64_t timeCost,
const uint64_t nRows, const uint64_t nCols )
{
//====================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
int64_t row = 2; //index of row to be processed
int64_t prev = 1; //index of prev (last row ever computed/modified)
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
int64_t tau; //Time Loop iterator
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
// int64_t i; //auxiliary iteration counter
int64_t v64; // 64bit var for memcpy
uint64_t _ALIGN(256) state[32];
int64_t row = 2;
int64_t prev = 1;
int64_t rowa0 = 0;
int64_t rowa1 = 0;
int64_t tau;
int64_t step = 1;
int64_t window = 2;
int64_t gap = 1;
//====================================================================/
//=== Initializing the Memory Matrix and pointers to it =============//
//Tries to allocate enough space for the whole memory matrix
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
// for Lyra2REv2, nCols = 4, v1 was using 8
const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
: BLOCK_LEN_BLAKE2_SAFE_BYTES;
uint64_t *ptrWord = wholeMatrix;
// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
//=== Getting the password + salt + basil padded with 10*1 ==========//
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
//but this ensures that the password copied locally will be overwritten as soon as possible
//First, we clean enough blocks for the password, salt, basil and padding
int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
/ BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
byte *ptrByte = (byte*) wholeMatrix;
uint64_t *ptr = wholeMatrix;
uint64_t *pw = (uint64_t*)pwd;
//Prepends the password
memcpy(ptrByte, pwd, pwdlen);
ptrByte += pwdlen;
memcpy( ptr, pw, 2*pwdlen ); // password
ptr += pwdlen>>2;
memcpy( ptr, pw, 2*pwdlen ); // password lane 1
ptr += pwdlen>>2;
//Concatenates the salt
memcpy(ptrByte, salt, saltlen);
ptrByte += saltlen;
// now build the rest interleaving on the fly.
memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
- (saltlen + pwdlen) );
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
memcpy(ptrByte, &kLen, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = pwdlen;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = saltlen;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = timeCost;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = nRows;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = nCols;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
//Now comes the padding
*ptrByte = 0x80; //first byte of padding: right after the password
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
// from here on it's all simd access to state and matrix
// define vector pointers and adjust sizes and pointer offsets
//================= Initializing the Sponge State ====================//
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
// initState( state );
//========================= Setup Phase =============================//
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
ptr[0] = ptr[ 4] = kLen;
ptr[1] = ptr[ 5] = pwdlen;
ptr[2] = ptr[ 6] = pwdlen; // saltlen
ptr[3] = ptr[ 7] = timeCost;
ptr[8] = ptr[12] = nRows;
ptr[9] = ptr[13] = nCols;
ptr[10] = ptr[14] = 0x80;
ptr[11] = ptr[15] = 0x0100000000000000;
ptrWord = wholeMatrix;
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
/*
for (i = 0; i < nBlocksInput; i++)
{
absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
}
*/
absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );
//Initializes M[0] and M[1]
reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols );
reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
nCols);
reducedDuplexRow1_2way( state, &wholeMatrix[0],
&wholeMatrix[ 2 * ROW_LEN_INT64 ], nCols );
do
{
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64],
&wholeMatrix[ 2* rowa0 * ROW_LEN_INT64],
&wholeMatrix[ 2* row*ROW_LEN_INT64],
nCols );
//updates the value of row* (deterministically picked during Setup))
rowa = (rowa + step) & (window - 1);
//update prev: it now points to the last row ever computed
rowa0 = (rowa0 + step) & (window - 1);
prev = row;
//updates row: goes to the next row to be computed
row++;
//Checks if all rows in the window where visited.
if (rowa == 0)
if ( rowa0 == 0 )
{
step = window + gap; //changes the step: approximately doubles its value
window *= 2; //doubles the size of the re-visitation window
gap = -gap; //inverts the modifier to the step
step = window + gap;
window *= 2;
gap = -gap;
}
} while (row < nRows);
} while ( row < nRows );
//===================== Wandering Phase =============================//
row = 0; //Resets the visitation to the first row of the memory matrix
for (tau = 1; tau <= timeCost; tau++)
row = 0;
for ( tau = 1; tau <= timeCost; tau++ )
{
//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
step = ( (tau & 1) == 0 ) ? -1 : ( nRows >> 1 ) - 1;
do
{
//Selects a pseudorandom index row*
//-----------------------------------------------
rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
rowa0 = state[ 0 ] & (unsigned int)(nRows-1);
rowa1 = state[ 4 ] & (unsigned int)(nRows-1);
//rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//-------------------------------------------
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
//update prev: it now points to the last row ever computed
reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
&wholeMatrix[ 2* row *ROW_LEN_INT64 ],
nCols );
prev = row;
//updates row: goes to the next row to be computed
//----------------------------------------------------
row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
//row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//----------------------------------------------------
} while (row != 0);
}
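/* In the 2-way wandering phase each lane derives its own pseudorandom row:
   state[0] feeds lane 0 and state[4] feeds lane 1, since the sponge state is
   interleaved as blocks of 4 uint64_t per lane. */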
//===================== Wrap-up Phase ===============================//
//Absorbs the last block of the memory matrix
absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64 ],
&wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64 ] );
//Squeezes the key
squeeze(state, K, (unsigned int) kLen);
squeeze_2way( state, K, (unsigned int) kLen );
return 0;
}
// This version is currently only used by REv3 and has some hard coding
// specific to v3 such as input data size of 32 bytes.
//
// Similarly with REv2. The difference with REv3 isn't clear and maybe
// they can be merged.
//
// RE is used by RE, allium. The main difference between RE and REv2
// is the matrix size.
//
// Z also needs to support 80 byte input as well as 32 byte, and odd
// matrix sizes like 330 rows. It is used by lyra2z330, lyra2z, lyra2h.
/////////////////////////////////////////////////
// 2 way 256
@@ -223,22 +197,29 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
// Data is interleaved 2x256.
int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
const void *pwd, const uint64_t pwdlen, const void *salt,
const uint64_t saltlen, const uint64_t timeCost, const uint64_t nRows,
const uint64_t nCols )
const void *pwd, uint64_t pwdlen, uint64_t timeCost,
uint64_t nRows, uint64_t nCols )
// hard coded for 32 byte input as well as the matrix size.
// Other required versions include 80 byte input and different block
// sizes.
//int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
// const void *pwd, const uint64_t pwdlen, const void *salt,
// const uint64_t saltlen, const uint64_t timeCost, const uint64_t nRows,
// const uint64_t nCols )
{
//====================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
int64_t row = 2; //index of row to be processed
int64_t prev = 1; //index of prev (last row ever computed/modified)
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
int64_t tau; //Time Loop iterator
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
// int64_t i; //auxiliary iteration counter
int64_t v64; // 64bit var for memcpy
uint64_t instance0 = 0; // Separate instance for each lane
uint64_t _ALIGN(256) state[32];
int64_t row = 2;
int64_t prev = 1;
int64_t rowa0 = 0;
int64_t rowa1 = 0;
int64_t tau;
int64_t step = 1;
int64_t window = 2;
int64_t gap = 1;
uint64_t instance0 = 0;
uint64_t instance1 = 0;
//====================================================================/
@@ -248,7 +229,9 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
uint64_t *ptrWord = wholeMatrix;
// 2 way 256 rewrite. Salt always == password, and data is interleaved,
// need to build in parallel:
// need to build in parallel as pw is already interleaved.
// { password, (64 or 80 bytes)
// salt, (64 or 80 bytes) = same as password
// Klen, (u64) = 32 bytes
@@ -262,73 +245,54 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
// 1 (byte)
// }
// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
// input is usually 32, sometimes 64 bytes; both are aligned to a 256 bit vector.
// 80 byte input is not aligned, complicating matters for lyra2z.
int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
/ BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
byte *ptrByte = (byte*) wholeMatrix;
uint64_t *ptr = wholeMatrix;
uint64_t *pw = (uint64_t*)pwd;
//Prepends the password
memcpy(ptrByte, pwd, pwdlen);
ptrByte += pwdlen;
memcpy( ptr, pw, 2*pwdlen ); // password
ptr += pwdlen>>2;
memcpy( ptr, pw, 2*pwdlen ); // password lane 1
ptr += pwdlen>>2;
//Concatenates the salt
memcpy(ptrByte, salt, saltlen);
ptrByte += saltlen;
// now build the rest interleaving on the fly.
memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
- (saltlen + pwdlen) );
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
memcpy(ptrByte, &kLen, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = pwdlen;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = saltlen;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = timeCost;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = nRows;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = nCols;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
//Now comes the padding
*ptrByte = 0x80; //first byte of padding: right after the password
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
// from here on it's all SIMD access to state and matrix
// define vector pointers and adjust sizes and pointer offsets
ptr[0] = ptr[ 4] = kLen;
ptr[1] = ptr[ 5] = pwdlen;
ptr[2] = ptr[ 6] = pwdlen; // saltlen
ptr[3] = ptr[ 7] = timeCost;
ptr[8] = ptr[12] = nRows;
ptr[9] = ptr[13] = nCols;
ptr[10] = ptr[14] = 0x80;
ptr[11] = ptr[15] = 0x0100000000000000;
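// The basil just built is interleaved in 256 bit (4 x u64) blocks, both lanes
// identical: lane 0 gets ptr[0..3] = { kLen, pwdlen, saltlen, timeCost } and
// ptr[8..11] = { nRows, nCols, 0x80, 0x01<<56 (last pad byte) }, lane 1 gets
// the adjacent ptr[4..7] and ptr[12..15] copies, so each 512 bit load picks
// up one block from each lane.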
ptrWord = wholeMatrix;
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
reducedSqueezeRow0( state, &wholeMatrix[0], nCols );
absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );
reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
nCols);
reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols );
reducedDuplexRow1_2way( state, &wholeMatrix[0],
&wholeMatrix[2*ROW_LEN_INT64], nCols );
do
{
reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev*ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa0*ROW_LEN_INT64 ],
&wholeMatrix[ 2* row*ROW_LEN_INT64 ],
nCols );
rowa = (rowa + step) & (window - 1);
rowa0 = (rowa0 + step) & (window - 1);
prev = row;
row++;
if (rowa == 0)
if (rowa0 == 0)
{
step = window + gap; //changes the step: approximately doubles its value
window *= 2; //doubles the size of the re-visitation window
@@ -340,37 +304,22 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
row = 0;
for (tau = 1; tau <= timeCost; tau++)
{
step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1;
step = ( (tau & 1) == 0 ) ? -1 : ( nRows >> 1 ) - 1;
do
{
// This part is not parallel, rowa will be different for each lane.
// state (u64[16]) is interleaved 2x256, need to extract separately.
instance0 = state[ offset_to_index( instance0 ) ];
instance1 = (&state[4])[ offset_to_index( instance1 ) ];
// index = 2 * instance / 4 * 4 + instance % 4
uint64_t index0 = ( ( (instance0 & 0xf) >> 3 ) << 2 )
+ ( instance0 & 0x3 )
uint64_t index1 = ( ( (instance1 & 0xf) >> 3 ) << 2 )
+ ( instance1 & 0x3 )
rowa0 = state[ offset_to_index( instance0 ) ]
& (unsigned int)(nRows-1);
rowa1 = (state+4)[ offset_to_index( instance1 ) ]
& (unsigned int)(nRows-1);
instance0 = state[ index0 ] & 0xf;
instance1 = (state+4)[ index1 ] & 0xf;
rowa0 = state[ instance0 ];
rowa1 = (state+4)[ instance1 ];
reducedDuplexRow_2way( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa0*ROW_LEN_INT64],
&wholeMatrix[rowa1*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
/*
instance = state[instance & 0xF];
rowa = state[instance & 0xF] & (unsigned int)(nRows-1);
reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
*/
// End of divergence.
reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
&wholeMatrix[ 2* row*ROW_LEN_INT64 ],
nCols );
prev = row;
row = (row + step) & (unsigned int)(nRows-1);
@@ -378,13 +327,17 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
} while ( row != 0 );
}
absorbBlock( state, &wholeMatrix[rowa*ROW_LEN_INT64] );
squeeze( state, K, (unsigned int) kLen );
absorbBlock_2way( state, &wholeMatrix[2*rowa0*ROW_LEN_INT64],
&wholeMatrix[2*rowa1*ROW_LEN_INT64] );
squeeze_2way( state, K, (unsigned int) kLen );
return 0;
}
#endif // AVX512
#if 0
//////////////////////////////////////////////////
int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
@@ -532,22 +485,26 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
return 0;
}
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// Lyra2RE doesn't like the new wholeMatrix implementation
int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
const void *salt, const uint64_t saltlen, const uint64_t timeCost,
int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd,
const uint64_t pwdlen, const uint64_t timeCost,
const uint64_t nRows, const uint64_t nCols )
{
//====================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
int64_t row = 2; //index of row to be processed
int64_t prev = 1; //index of prev (last row ever computed/modified)
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
int64_t rowa0 = 0;
int64_t rowa1 = 0;
int64_t tau; //Time Loop iterator
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
int64_t i; //auxiliary iteration counter
int64_t v64; // 64bit var for memcpy
//====================================================================/
//=== Initializing the Memory Matrix and pointers to it =============//
@@ -573,15 +530,36 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
#endif
uint64_t *ptrWord = wholeMatrix;
uint64_t *pw = (uint64_t*)pwd;
//=== Getting the password + salt + basil padded with 10*1 ==========//
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
//but this ensures that the password copied locally will be overwritten as soon as possible
//First, we clean enough blocks for the password, salt, basil and padding
int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
/ BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
uint64_t *ptr = wholeMatrix;
memcpy( ptr, pw, 2*pwdlen ); // password
ptr += pwdlen>>2;
memcpy( ptr, pw, 2*pwdlen ); // password lane 1
ptr += pwdlen>>2;
// now build the rest interleaving on the fly.
ptr[0] = ptr[ 4] = kLen;
ptr[1] = ptr[ 5] = pwdlen;
ptr[2] = ptr[ 6] = pwdlen; // saltlen
ptr[3] = ptr[ 7] = timeCost;
ptr[8] = ptr[12] = nRows;
ptr[9] = ptr[13] = nCols;
ptr[10] = ptr[14] = 0x80;
ptr[11] = ptr[15] = 0x0100000000000000;
/*
byte *ptrByte = (byte*) wholeMatrix;
//Prepends the password
@@ -630,7 +608,9 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
ptrWord = wholeMatrix;
absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
*/
absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );
/*
for (i = 0; i < nBlocksInput; i++)
{
@@ -639,21 +619,22 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
}
*/
//Initializes M[0] and M[1]
reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
nCols);
reducedDuplexRow1_2way( state, &wholeMatrix[0],
&wholeMatrix[ 2 * ROW_LEN_INT64], nCols );
do
{
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev*ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa0*ROW_LEN_INT64 ],
&wholeMatrix[ 2* row*ROW_LEN_INT64 ],
nCols );
//updates the value of row* (deterministically picked during Setup))
rowa = (rowa + step) & (window - 1);
rowa0 = (rowa0 + step) & (window - 1);
//update prev: it now points to the last row ever computed
prev = row;
@@ -661,7 +642,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
row++;
//Checks if all rows in the window were visited.
if (rowa == 0)
if (rowa0 == 0)
{
step = window + gap; //changes the step: approximately doubles its value
window *= 2; //doubles the size of the re-visitation window
@@ -674,21 +655,18 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
row = 0; //Resets the visitation to the first row of the memory matrix
for (tau = 1; tau <= timeCost; tau++)
{
//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1;
do
{
//Selects a pseudorandom index row*
//-----------------------------------------------
rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
rowa0 = state[ 0 ] & (unsigned int)(nRows-1);
rowa1 = state[ 4 ] & (unsigned int)(nRows-1);
//rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//-------------------------------------------
reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
&wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
&wholeMatrix[ 2* row *ROW_LEN_INT64 ],
nCols );
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
&wholeMatrix[rowa*ROW_LEN_INT64],
&wholeMatrix[row*ROW_LEN_INT64], nCols );
//update prev: it now points to the last row ever computed
prev = row;
@@ -703,9 +681,10 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
//===================== Wrap-up Phase ===============================//
//Absorbs the last block of the memory matrix
absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64],
&wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64] );
//Squeezes the key
squeeze(state, K, (unsigned int) kLen);
squeeze_2way( state, K, (unsigned int) kLen );
//================== Freeing the memory =============================//
_mm_free(wholeMatrix);
@@ -713,3 +692,4 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
return 0;
}
#endif

View File

@@ -327,7 +327,6 @@ int LYRA2REV3( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
nCols);
do
{
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)

View File

@@ -60,4 +60,15 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,
int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
int LYRA2REV2_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
#endif
#endif /* LYRA2_H_ */

View File

@@ -1,13 +1,150 @@
#include "lyra2-gate.h"
#include <memory.h>
#if defined (LYRA2REV2_4WAY)
#include "algo/blake/blake-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#if defined (LYRA2REV2_8WAY)
typedef struct {
blake256_8way_context blake;
keccak256_8way_context keccak;
cube_4way_context cube;
skein256_8way_context skein;
bmw256_8way_context bmw;
} lyra2v2_8way_ctx_holder __attribute__ ((aligned (64)));
static lyra2v2_8way_ctx_holder l2v2_8way_ctx;
bool init_lyra2rev2_8way_ctx()
{
keccak256_8way_init( &l2v2_8way_ctx.keccak );
cube_4way_init( &l2v2_8way_ctx.cube, 256, 16, 32 );
skein256_8way_init( &l2v2_8way_ctx.skein );
bmw256_8way_init( &l2v2_8way_ctx.bmw );
return true;
}
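// Hash chain for lyra2rev2 8 way: blake256 -> keccak256 -> cubehash ->
// Lyra2 (four 2 way calls) -> skein256 -> cubehash -> bmw256, re-interleaving
// the data between the different lane widths at each step.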
void lyra2rev2_8way_hash( void *state, const void *input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (128)));
uint32_t vhashA[8*8] __attribute__ ((aligned (64)));
uint32_t vhashB[8*8] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
lyra2v2_8way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v2_8way_ctx, sizeof(l2v2_8way_ctx) );
blake256_8way( &ctx.blake, input + (64<<3), 16 );
blake256_8way_close( &ctx.blake, vhash );
rintrlv_8x32_8x64( vhashA, vhash, 256 );
keccak256_8way_update( &ctx.keccak, vhashA, 32 );
keccak256_8way_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 256 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hash2, hash3, 256 );
LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hash4, hash5, 256 );
LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2REV2_2WAY( l2v2_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, 256 );
skein256_8way_update( &ctx.skein, vhash, 32 );
skein256_8way_close( &ctx.skein, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, 256 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, 256 );
bmw256_8way_update( &ctx.bmw, vhash, 32 );
bmw256_8way_close( &ctx.bmw, state );
}
int scanhash_lyra2rev2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
__m256i *noncev = (__m256i*)vdata + 19; // aligned
int thr_id = mythr->id;
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake256_8way_init( &l2v2_8way_ctx.blake );
blake256_8way_update( &l2v2_8way_ctx.blake, vdata, 64 );
do
{
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
n+3, n+2, n+1, n ) );
lyra2rev2_8way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg )
{
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce;
return 0;
}
#elif defined (LYRA2REV2_4WAY)
typedef struct {
blake256_4way_context blake;

View File

@@ -4,8 +4,180 @@
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#if defined (LYRA2REV3_8WAY)
#if defined (LYRA2REV3_16WAY)
typedef struct {
blake256_16way_context blake;
cube_4way_context cube;
bmw256_16way_context bmw;
} lyra2v3_16way_ctx_holder;
static __thread lyra2v3_16way_ctx_holder l2v3_16way_ctx;
bool init_lyra2rev3_16way_ctx()
{
blake256_16way_init( &l2v3_16way_ctx.blake );
cube_4way_init( &l2v3_16way_ctx.cube, 256, 16, 32 );
bmw256_16way_init( &l2v3_16way_ctx.bmw );
return true;
}
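// Hash chain for lyra2rev3 16 way: blake256 -> Lyra2 (eight 2 way calls) ->
// cubehash (four 4 way calls) -> Lyra2 again -> bmw256, re-interleaving the
// data between lane widths at each step.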
void lyra2rev3_16way_hash( void *state, const void *input )
{
uint32_t vhash[16*8] __attribute__ ((aligned (128)));
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t hash8[8] __attribute__ ((aligned (64)));
uint32_t hash9[8] __attribute__ ((aligned (64)));
uint32_t hash10[8] __attribute__ ((aligned (64)));
uint32_t hash11[8] __attribute__ ((aligned (64)));
uint32_t hash12[8] __attribute__ ((aligned (64)));
uint32_t hash13[8] __attribute__ ((aligned (64)));
uint32_t hash14[8] __attribute__ ((aligned (64)));
uint32_t hash15[8] __attribute__ ((aligned (64)));
lyra2v3_16way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v3_16way_ctx, sizeof(l2v3_16way_ctx) );
blake256_16way_update( &ctx.blake, input + (64*16), 16 );
blake256_16way_close( &ctx.blake, vhash );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
vhash, 256 );
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hash2, hash3, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hash4, hash5, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
intrlv_2x256( vhash, hash8, hash9, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash8, hash9, vhash, 256 );
intrlv_2x256( vhash, hash10, hash11, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash10, hash11, vhash, 256 );
intrlv_2x256( vhash, hash12, hash13, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash12, hash13, vhash, 256 );
intrlv_2x256( vhash, hash14, hash15, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash14, hash15, vhash, 256 );
intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 256 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 256 );
intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 256 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhash, 256 );
intrlv_4x128( vhash, hash8, hash9, hash10, hash11, 256 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
dintrlv_4x128( hash8, hash9, hash10, hash11, vhash, 256 );
intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 );
cube_4way_init( &ctx.cube, 256, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
dintrlv_4x128( hash12, hash13, hash14, hash15, vhash, 256 );
intrlv_2x256( vhash, hash0, hash1, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash0, hash1, vhash, 256 );
intrlv_2x256( vhash, hash2, hash3, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash2, hash3, vhash, 256 );
intrlv_2x256( vhash, hash4, hash5, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash4, hash5, vhash, 256 );
intrlv_2x256( vhash, hash6, hash7, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash6, hash7, vhash, 256 );
intrlv_2x256( vhash, hash8, hash9, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash8, hash9, vhash, 256 );
intrlv_2x256( vhash, hash10, hash11, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash10, hash11, vhash, 256 );
intrlv_2x256( vhash, hash12, hash13, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash12, hash13, vhash, 256 );
intrlv_2x256( vhash, hash14, hash15, 256 );
LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, 1, 4, 4 );
dintrlv_2x256( hash14, hash15, vhash, 256 );
intrlv_16x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
hash15, 256 );
bmw256_16way_update( &ctx.bmw, vhash, 32 );
bmw256_16way_close( &ctx.bmw, state );
}
int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
uint32_t *hash7 = &hash[7<<4];
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 16;
const uint32_t Htarg = ptarget[7];
__m512i *noncev = (__m512i*)vdata + 19; // aligned
const int thr_id = mythr->id;
if ( opt_benchmark ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
mm512_bswap32_intrlv80_16x32( vdata, pdata );
blake256_16way_init( &l2v3_16way_ctx.blake );
blake256_16way_update( &l2v3_16way_ctx.blake, vdata, 64 );
do
{
*noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12,
n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4,
n+ 3, n+ 2, n+ 1, n ) );
lyra2rev3_16way_hash( hash, vdata );
pdata[19] = n;
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hash7[lane] <= Htarg ) )
{
extr_lane_16x32( lane_hash, hash, lane, 256 );
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 16;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce;
return 0;
}
#elif defined (LYRA2REV3_8WAY)
typedef struct {
blake256_8way_context blake;

View File

@@ -19,7 +19,7 @@
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "algo-gate.h"
//#include "algo-gate.h"
#include <string.h>
#include <stdio.h>
#include <time.h>
@@ -40,18 +40,25 @@ inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
//Squeezes full blocks
for ( i = 0; i < fullBlocks; i++ )
{
memcpy_512( out, state, BLOCK_LEN_M256I*2 );
LYRA_ROUND_2WAY_AVX2( state[0], state[1], state[2], state[3] );
out += BLOCK_LEN_M256I*2;
memcpy_512( out, state, BLOCK_LEN_M256I );
LYRA_ROUND_2WAY_AVX512( state[0], state[1], state[2], state[3] );
out += BLOCK_LEN_M256I;
}
//Squeezes remaining bytes
memcpy_512( out, state, ( (len_m256i % BLOCK_LEN_M256I) * 2 ) );
memcpy_512( out, state, len_m256i % BLOCK_LEN_M256I );
}
inline void absorbBlock_2way( uint64_t *State, const uint64_t *In )
inline void absorbBlock_2way( uint64_t *State, const uint64_t *In0,
const uint64_t *In1 )
{
register __m512i state0, state1, state2, state3;
__m512i *in = (__m512i*)In;
__m512i in[3];
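// Gather the even 256 bit blocks from lane 0's row and the odd blocks from
// lane 1's row into one contiguous 2x256 interleaved block before absorbing.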
casti_m256i( in, 0 ) = casti_m256i( In0, 0 );
casti_m256i( in, 1 ) = casti_m256i( In1, 1 );
casti_m256i( in, 2 ) = casti_m256i( In0, 2 );
casti_m256i( in, 3 ) = casti_m256i( In1, 3 );
casti_m256i( in, 4 ) = casti_m256i( In0, 4 );
casti_m256i( in, 5 ) = casti_m256i( In1, 5 );
state0 = _mm512_load_si512( (__m512i*)State );
state1 = _mm512_load_si512( (__m512i*)State + 1 );
@@ -90,7 +97,7 @@ inline void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In,
state1 = _mm512_xor_si512( state1, in[1] );
LYRA_12_ROUNDS_2WAY_AVX512( state0, state1, state2, state3 );
In += block_len * 2;
In += block_len*2;
}
_mm512_store_si512( (__m512i*)State, state0 );
@@ -109,7 +116,7 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
register __m512i state0, state1, state2, state3;
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 );
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
state0 = _mm512_load_si512( (__m512i*)State );
state1 = _mm512_load_si512( (__m512i*)State + 1 );
@@ -132,7 +139,7 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
out[2] = state2;
//Goes to next block (column) that will receive the squeezed data
out -= BLOCK_LEN_M256I * 2;
out -= BLOCK_LEN_M256I;
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
}
@@ -143,15 +150,14 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
_mm512_store_si512( (__m512i*)State + 3, state3 );
}
// This function has to deal with gathering two 256 bit rowIn vectors from
// non-contiguous memory: extra work and a performance penalty.
inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
uint64_t *rowOut, uint64_t nCols )
{
int i;
register __m512i state0, state1, state2, state3;
__m512i *in = (__m256i*)rowIn;
__m512i *in = (__m512i*)rowIn;
__m512i *out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
state0 = _mm512_load_si512( (__m512i*)State );
state1 = _mm512_load_si512( (__m512i*)State + 1 );
@@ -171,28 +177,25 @@ inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
out[2] = _mm512_xor_si512( state2, in[2] );
//Input: next column (i.e., next block in sequence)
in0 += BLOCK_LEN_M256I;
in1 += BLOCK_LEN_M256I;
in += BLOCK_LEN_M256I;
//Output: goes to previous column
out -= BLOCK_LEN_M256I * 2;
out -= BLOCK_LEN_M256I;
}
_mm512_store_si256( (__m512i*)State, state0 );
_mm512_store_si256( (__m512i*)State + 1, state1 );
_mm512_store_si256( (__m512i*)State + 2, state2 );
_mm512_store_si256( (__m512i*)State + 3, state3 );
}
_mm512_store_si512( (__m512i*)State, state0 );
_mm512_store_si512( (__m512i*)State + 1, state1 );
_mm512_store_si512( (__m512i*)State + 2, state2 );
_mm512_store_si512( (__m512i*)State + 3, state3 );
}
inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols )
{
int i;
register __m512i state0, state1, state2, state3;
__m512i* in = (__m512i*)rowIn;
__m512i* inout = (__m512i*)rowInOut;
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 );
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
__m512i t0, t1, t2;
state0 = _mm512_load_si512( (__m512i*)State );
@@ -209,7 +212,7 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
state2 = _mm512_xor_si512( state2,
_mm512_add_epi64( in[2], inout[2] ) );
LYRA_ROUND_2WAY AVX512( state0, state1, state2, state3 );
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
out[0] = _mm512_xor_si512( state0, in[0] );
out[1] = _mm512_xor_si512( state1, in[1] );
@@ -221,17 +224,18 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
t2 = _mm512_permutex_epi64( state2, 0x93 );
inout[0] = _mm512_xor_si512( inout[0],
_mm512_mask_blend_epi32( t0, t2, 0x03 ) );
_mm512_mask_blend_epi32( 0x0303, t0, t2 ) );
inout[1] = _mm512_xor_si512( inout[1],
_mm512_mask_blend_epi32( t1, t0, 0x03 ) );
_mm512_mask_blend_epi32( 0x0303, t1, t0 ) );
inout[2] = _mm512_xor_si512( inout[2],
_mm512_mask_blend_epi32( t2, t1, 0x03 ) );
_mm512_mask_blend_epi32( 0x0303, t2, t1 ) );
//Inputs: next column (i.e., next block in sequence)
in += BLOCK_LEN_M256I * 2;
inout += BLOCK_LEN_M256I * 2;
in += BLOCK_LEN_M256I;
inout += BLOCK_LEN_M256I;
//Output: goes to previous column
out -= BLOCK_LEN_M256I * 2;
out -= BLOCK_LEN_M256I;
}
_mm512_store_si512( (__m512i*)State, state0 );
@@ -240,49 +244,61 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
_mm512_store_si512( (__m512i*)State + 3, state3 );
}
inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn1,
uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut,
uint64_t nCols )
// big ugly workaround for pointer aliasing, use a union of pointers.
// Access the matrix using __m512i for in and out, __m256i for inout.
inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
uint64_t *rowInOut0, uint64_t *rowInOut1,
uint64_t *rowOut, uint64_t nCols)
{
int i;
register __m512i state0, state1, state2, state3;
__m256i *in0 = (__m256i*)rowIn0;
__m256i *in0 = (__m256i*)rowIn0;
__m2512* in = (__m512i*)rowIn;
__m2512* inout = (__m512i*)rowInOut;
__m512i* out = (__m512i*)rowOut;
__m512i *in = (__m512i*)rowIn;
__m256i *inout0 = (__m256i*)rowInOut0;
__m256i *inout1 = (__m256i*)rowInOut1;
__m512i *out = (__m512i*)rowOut;
__m512i io[3];
povly inout;
inout.v512 = &io[0];
__m512i t0, t1, t2;
_mm_prefetch( in0, _MM_HINT_T0 );
_mm_prefetch( in1, _MM_HINT_T0 );
_mm_prefetch( in0 + 2, _MM_HINT_T0 );
_mm_prefetch( in1 + 2, _MM_HINT_T0 );
_mm_prefetch( in0 + 4, _MM_HINT_T0 );
_mm_prefetch( in1 + 4, _MM_HINT_T0 );
_mm_prefetch( in0 + 6, _MM_HINT_T0 );
_mm_prefetch( in1 + 6, _MM_HINT_T0 );
state0 = _mm512_load_si512( (__m512i*)State );
state1 = _mm512_load_si512( (__m512i*)State + 1 );
state2 = _mm512_load_si512( (__m512i*)State + 2 );
state3 = _mm512_load_si512( (__m512i*)State + 3 );
//Absorbing "M[prev] [+] M[row*]"
_mm_prefetch( in, _MM_HINT_T0 );
_mm_prefetch( inout0, _MM_HINT_T0 );
_mm_prefetch( inout1, _MM_HINT_T0 );
_mm_prefetch( in + 2, _MM_HINT_T0 );
_mm_prefetch( inout0 + 2, _MM_HINT_T0 );
_mm_prefetch( inout1 + 2, _MM_HINT_T0 );
_mm_prefetch( in + 4, _MM_HINT_T0 );
_mm_prefetch( inout0 + 4, _MM_HINT_T0 );
_mm_prefetch( inout1 + 4, _MM_HINT_T0 );
_mm_prefetch( in + 6, _MM_HINT_T0 );
_mm_prefetch( inout0 + 6, _MM_HINT_T0 );
_mm_prefetch( inout1 + 6, _MM_HINT_T0 );
// state0 = _mm512_xor_si512( state0, mm512_concat_256( in1[0], in0[0] );
// state1 = _mm512_xor_si512( state1, mm512_concat_256( in1[1], in0[1] );
// state2 = _mm512_xor_si512( state2, mm512_concat_256( in1[2], in0[2] );
t0 = mm512_concat_256( in1[0], in0[0] );
t1 = mm512_concat_256( in1[1], in0[1] );
t2 = mm512_concat_256( in1[2], in0[2] );
for ( i = 0; i < nCols; i++ )
{
//Absorbing "M[prev] [+] M[row*]"
inout.v256[0] = inout0[0];
inout.v256[1] = inout1[1];
inout.v256[2] = inout0[2];
inout.v256[3] = inout1[3];
inout.v256[4] = inout0[4];
inout.v256[5] = inout1[5];
state0 = _mm512_xor_si512( state0,
_mm512_add_epi64( t0, inout[0] ) );
_mm512_add_epi64( in[0], inout.v512[0] ) );
state1 = _mm512_xor_si512( state1,
_mm512_add_epi64( t1, inout[1] ) );
_mm512_add_epi64( in[1], inout.v512[1] ) );
state2 = _mm512_xor_si512( state2,
_mm512_add_epi64( t2, inout[2] ) );
_mm512_add_epi64( in[2], inout.v512[2] ) );
//Applies the reduced-round transformation f to the sponge's state
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
@@ -292,22 +308,44 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn1,
out[1] = _mm512_xor_si512( out[1], state1 );
out[2] = _mm512_xor_si512( out[2], state2 );
// if inout is the same row as out it was just overwritten, reload.
if ( rowOut == rowInOut0 )
{
inout.v256[0] = inout0[0];
inout.v256[2] = inout0[2];
inout.v256[4] = inout0[4];
}
if ( rowOut == rowInOut1 )
{
inout.v256[1] = inout1[1];
inout.v256[3] = inout1[3];
inout.v256[5] = inout1[5];
}
//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
t0 = _mm512_permutex_epi64( state0, 0x93 );
t1 = _mm512_permutex_epi64( state1, 0x93 );
t2 = _mm512_permutex_epi64( state2, 0x93 );
inout[0] = _mm512_xor_si512( inout[0],
_mm512_mask_blend_epi32( t0, t2, 0x03 ) );
inout[1] = _mm512_xor_si512( inout[1],
_mm512_mask_blend_epi32( t1, t0, 0x03 ) );
inout[2] = _mm512_xor_si512( inout[2],
_mm512_mask_blend_epi32( t2, t1, 0x03 ) );
inout.v512[0] = _mm512_xor_si512( inout.v512[0],
_mm512_mask_blend_epi32( 0x0303, t0, t2 ) );
inout.v512[1] = _mm512_xor_si512( inout.v512[1],
_mm512_mask_blend_epi32( 0x0303, t1, t0 ) );
inout.v512[2] = _mm512_xor_si512( inout.v512[2],
_mm512_mask_blend_epi32( 0x0303, t2, t1 ) );
inout0[0] = inout.v256[0];
inout1[1] = inout.v256[1];
inout0[2] = inout.v256[2];
inout1[3] = inout.v256[3];
inout0[4] = inout.v256[4];
inout1[5] = inout.v256[5];
//Goes to next block
in += BLOCK_LEN_M256I * 2;
out += BLOCK_LEN_M256I * 2;
inout += BLOCK_LEN_M256I * 2;
in += BLOCK_LEN_M256I;
inout0 += BLOCK_LEN_M256I * 2;
inout1 += BLOCK_LEN_M256I * 2;
out += BLOCK_LEN_M256I;
}
_mm512_store_si512( (__m512i*)State, state0 );

View File

@@ -376,6 +376,9 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
_mm_prefetch( out - 9, _MM_HINT_T0 );
_mm_prefetch( out - 11, _MM_HINT_T0 );
//printf("S RSR0 col= %d, out= %x\n",i,out);
out[0] = state0;
out[1] = state1;
out[2] = state2;
@@ -706,11 +709,34 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
out[1] = _mm256_xor_si256( state1, in[1] );
out[2] = _mm256_xor_si256( state2, in[2] );
/*
printf("s duplexsetup col= %d\n",i);
uint64_t * o = (uint64_t*)out;
printf("S out %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]);
printf("S out %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]);
printf("S out %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]);
printf("S out %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]);
printf("S out %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]);
printf("S out %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]);
*/
//M[row*][col] = M[row*][col] XOR rotW(rand)
t0 = _mm256_permute4x64_epi64( state0, 0x93 );
t1 = _mm256_permute4x64_epi64( state1, 0x93 );
t2 = _mm256_permute4x64_epi64( state2, 0x93 );
/*
uint64_t *t = (uint64_t*)&t0;
printf("S t0 %016lx %016lx %016lx %016lx\n",t[0],t[1],t[2],t[3]);
o = (uint64_t*)inout;
printf("S inout0 %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]);
printf("S inout0 %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]);
printf("S inout0 %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]);
printf("S inout0 %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]);
printf("S inout0 %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]);
printf("S inout0 %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]);
*/
inout[0] = _mm256_xor_si256( inout[0],
_mm256_blend_epi32( t0, t2, 0x03 ) );
inout[1] = _mm256_xor_si256( inout[1],
@@ -718,7 +744,17 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
inout[2] = _mm256_xor_si256( inout[2],
_mm256_blend_epi32( t2, t1, 0x03 ) );
//Inputs: next column (i.e., next block in sequence)
/*
o = (uint64_t*)inout;
printf("S inout1 %016lx %016lx %016lx %016lx\n",o[0],o[1],o[2],o[3]);
printf("S inout1 %016lx %016lx %016lx %016lx\n",o[4],o[5],o[6],o[7]);
printf("S inout1 %016lx %016lx %016lx %016lx\n",o[8],o[9],o[10],o[11]);
printf("S inout1 %016lx %016lx %016lx %016lx\n",o[12],o[13],o[14],o[15]);
printf("S inout1 %016lx %016lx %016lx %016lx\n",o[16],o[17],o[18],o[19]);
printf("S inout1 %016lx %016lx %016lx %016lx\n",o[20],o[21],o[22],o[23]);
*/
//Inputs: next column (i.e., next block in sequence)
in += BLOCK_LEN_M256I;
inout += BLOCK_LEN_M256I;
//Output: goes to previous column
@@ -949,6 +985,22 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,
_mm_prefetch( inout + 9, _MM_HINT_T0 );
_mm_prefetch( inout + 11, _MM_HINT_T0 );
/*
uint64_t *io = (uint64_t*)inout;
uint64_t *ii = (uint64_t*)in;
printf("RDRS1 col= %d\n", i);
printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[0],io[1],io[2],io[3]);
printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[4],io[5],io[6],io[7]);
printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[8],io[9],io[10],io[11]);
printf("RDRS1 IO %016lx %016lx %016lx %016lx\n",io[12],io[13],io[14],io[15]);
printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[0],ii[1],ii[2],ii[3]);
printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[4],ii[5],ii[6],ii[7]);
printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[8],ii[9],ii[10],ii[11]);
printf("RDRS1 IN %016lx %016lx %016lx %016lx\n",ii[12],ii[13],ii[14],ii[15]);
*/
//Absorbing "M[prev] [+] M[row*]"
state0 = _mm256_xor_si256( state0,
_mm256_add_epi64( in[0], inout[0] ) );

View File

@@ -65,14 +65,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
b = mm512_ror_64( _mm512_xor_si512( b, c ), 63 );
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
G_4X64( s0, s1, s2, s3 ); \
s1 = mm512_ror_1x64( s1); \
s2 = mm512_swap128_256( s2 ); \
s3 = mm512_rol1x64_256( s3 ); \
G_4X64( s0, s1, s2, s3 ); \
s1 = mm512_rol1x64_256( s1 ); \
s2 = mm512_swap128_256( s2 ); \
s3 = mm512_ror1x64_256( s3 );
G2W_4X64( s0, s1, s2, s3 ); \
s1 = mm512_ror256_64( s1); \
s2 = mm512_swap256_128( s2 ); \
s3 = mm512_rol256_64( s3 ); \
G2W_4X64( s0, s1, s2, s3 ); \
s1 = mm512_rol256_64( s1 ); \
s2 = mm512_swap256_128( s2 ); \
s3 = mm512_ror256_64( s3 );
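// LYRA_ROUND_2WAY_AVX512: one reduced Blake2b round applied to both
// interleaved lanes: a column step, diagonalisation by rotating rows within
// each 256 bit lane, a second G application, then the inverse rotation to
// undo the diagonalisation.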
#define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
@@ -148,14 +148,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_ror1x64_256( s2, s3 ); \
mm128_swap128_256( s4, s5 ); \
mm128_rol1x64_256( s6, s7 ); \
mm128_ror256_64( s2, s3 ); \
mm128_swap256_128( s4, s5 ); \
mm128_rol256_64( s6, s7 ); \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_rol1x64_256( s2, s3 ); \
mm128_swap128_256( s4, s5 ); \
mm128_ror1x64_256( s6, s7 );
mm128_rol256_64( s2, s3 ); \
mm128_swap256_128( s4, s5 ); \
mm128_ror256_64( s6, s7 );
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
@@ -203,24 +203,36 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
union _povly
{
__m512i *v512;
__m256i *v256;
uint64_t *u64;
};
typedef union _povly povly;
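/* Minimal usage sketch (illustration only, not part of the header): the
   union of pointers lets the same 512 bit block be addressed either as a
   whole __m512i or as its two 256 bit lane halves.

       __m512i block[3];
       povly p;
       p.v512 = block;              // whole 512 bit view
       __m256i lane0 = p.v256[0];   // low 256 bits of block[0]
       __m256i lane1 = p.v256[1];   // high 256 bits of block[0]
*/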
//---- Housekeeping
void initState_2way( uint64_t state[/*16*/] );
void initState_2way( uint64_t State[/*16*/] );
//---- Squeezes
void squeeze_2way( uint64_t *state, unsigned char *out, unsigned int len );
void squeeze_2way( uint64_t *State, unsigned char *out, unsigned int len );
void reducedSqueezeRow0_2way( uint64_t* state, uint64_t* row, uint64_t nCols );
//---- Absorbs
void absorbBlock_2way( uint64_t *state, const uint64_t *in );
void absorbBlockBlake2Safe_2way( uint64_t *state, const uint64_t *in,
void absorbBlock_2way( uint64_t *State, const uint64_t *In0,
const uint64_t *In1 );
void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In,
const uint64_t nBlocks, const uint64_t block_len );
//---- Duplexes
void reducedDuplexRow1_2way( uint64_t *state, uint64_t *rowIn,
void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
uint64_t *rowOut, uint64_t nCols);
void reducedDuplexRowSetup_2way( uint64_t *state, uint64_t *rowIn,
void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );
void reducedDuplexRow_2way(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
uint64_t *rowInOut0, uint64_t *rowInOut1,
uint64_t *rowOut, uint64_t nCols);
#endif

File diff suppressed because it is too large

View File

@@ -2,7 +2,10 @@
bool register_hmq1725_algo( algo_gate_t* gate )
{
#if defined(HMQ1725_4WAY)
#if defined(HMQ1725_8WAY)
gate->scanhash = (void*)&scanhash_hmq1725_8way;
gate->hash = (void*)&hmq1725_8way_hash;
#elif defined(HMQ1725_4WAY)
gate->scanhash = (void*)&scanhash_hmq1725_4way;
gate->hash = (void*)&hmq1725_4way_hash;
#else
@@ -10,7 +13,7 @@ bool register_hmq1725_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_hmq1725;
gate->hash = (void*)&hmq1725hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
opt_target_factor = 65536.0;
return true;
};

View File

@@ -4,13 +4,21 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
// #define HMQ1725_4WAY 1
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define HMQ1725_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define HMQ1725_4WAY 1
#endif
bool register_hmq1725_algo( algo_gate_t* gate );
#if defined(HMQ1725_4WAY)
#if defined(HMQ1725_8WAY)
void hmq1725_8way_hash( void *state, const void *input );
int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(HMQ1725_4WAY)
void hmq1725_4way_hash( void *state, const void *input );
int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,

View File

@@ -333,6 +333,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
if (((hash64[7]&0xFFFFFFFF)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
@@ -346,6 +347,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
if (((hash64[7]&0xFFFFFFF0)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
@@ -359,6 +361,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
if (((hash64[7]&0xFFFFFF00)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
@@ -372,6 +375,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
if (((hash64[7]&0xFFFFF000)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
@@ -386,6 +390,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
if (((hash64[7]&0xFFFF0000)==0) &&
fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);
@@ -399,6 +404,7 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
hmq1725hash(hash64, endiandata);
if (fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
work_set_target_ratio( work, hash64 );
return true;
}
} while (n < max_nonce && !work_restart[thr_id].restart);

View File

@@ -63,20 +63,6 @@ void quark_8way_hash( void *state, const void *input )
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
// AVX 512 cmpeq returns a bit mask instead of a vector mask.
// This should simplify things but the logic doesn't seem to be working.
// The problem appears to be related to the test to skip a hash if it isn't
// to be used. Skipping the test for all 8 way hashes seems to have
// fixed it. The hash selection blending works if the hash is produced
// but the hash wasn't being produced when it should.
// Both decisions are based on the same data, the __mmask8. It works
// as a blend mask but not in a logical comparison, maybe the type is the
// problem. Maybe a cast to int or movm is needed to make it work.
// It's now moot because the hash can only be skipped 1 in 256 iterations
// when hashing parallel 8 ways.
// The performance impact of the workaround should be negligible.
// It's a problem for another day.
vh_mask = _mm512_cmpeq_epi64_mask( _mm512_and_si512( vh[0], bit3_mask ),
zero );

View File

@@ -92,7 +92,6 @@ int scanhash_qubit_4way( struct work *work,uint32_t max_nonce,
{
uint32_t hash[4*8] __attribute__ ((aligned (128)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];

View File

@@ -56,7 +56,7 @@ typedef struct {
__m128i val[8];
uint32_t count_high, count_low;
bool initialized;
} sha256_4way_context;
} sha256_4way_context __attribute__ ((aligned (64)));
void sha256_4way_init( sha256_4way_context *sc );
void sha256_4way( sha256_4way_context *sc, const void *data, size_t len );
@@ -71,7 +71,7 @@ typedef struct {
__m256i val[8];
uint32_t count_high, count_low;
bool initialized;
} sha256_8way_context;
} sha256_8way_context __attribute__ ((aligned (128)));
void sha256_8way_init( sha256_8way_context *sc );
void sha256_8way( sha256_8way_context *sc, const void *data, size_t len );
@@ -86,30 +86,32 @@ typedef struct {
__m256i val[8];
uint64_t count;
bool initialized;
} sha512_4way_context;
} sha512_4way_context __attribute__ ((aligned (128)));
void sha512_4way_init( sha512_4way_context *sc);
void sha512_4way( sha512_4way_context *sc, const void *data, size_t len );
void sha512_4way_update( sha512_4way_context *sc, const void *data,
size_t len );
#define sha512_4way sha512_4way_update
void sha512_4way_close( sha512_4way_context *sc, void *dst );
// SHA-256 11 way hybrid
// Combines AVX2, MMX and scalar data to do 8 + 2 + 1 parallel.
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// SHA-512 8 way
typedef struct {
__m256i bufx[64>>2];
__m256i valx[8];
__m64 bufy[64>>2];
__m64 valy[8];
uint32_t bufz[64>>2];
uint32_t valz[8];
uint32_t count_high, count_low;
} sha256_11way_context;
__m512i buf[128>>3];
__m512i val[8];
uint64_t count;
bool initialized;
} sha512_8way_context __attribute__ ((aligned (128)));
void sha256_11way_init( sha256_11way_context *ctx );
void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
const void *datay, const void *dataz, size_t len );
void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dstyx,
void *dstz );
void sha512_8way_init( sha512_8way_context *sc);
void sha512_8way_update( sha512_8way_context *sc, const void *data,
size_t len );
void sha512_8way_close( sha512_8way_context *sc, void *dst );
#endif // AVX512
#endif // __AVX2__
#endif // __SSE2__
#endif // SHA256_4WAY_H__

View File

@@ -36,8 +36,6 @@
#include <string.h>
#include "sha-hash-4way.h"
// SHA-512 4 way 64 bit
/*
static const sph_u64 H512[8] = {
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
@@ -90,6 +88,236 @@ static const sph_u64 K512[80] = {
SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// SHA-512 8 way 64 bit
#define CH8W(X, Y, Z) \
_mm512_xor_si512( _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), Z )
#define MAJ8W(X, Y, Z) \
_mm512_or_si512( _mm512_and_si512( X, Y ), \
_mm512_and_si512( _mm512_or_si512( X, Y ), Z ) )
#define BSG8W_5_0(x) \
_mm512_xor_si512( _mm512_xor_si512( \
mm512_ror_64(x, 28), mm512_ror_64(x, 34) ), mm512_ror_64(x, 39) )
#define BSG8W_5_1(x) \
_mm512_xor_si512( _mm512_xor_si512( \
mm512_ror_64(x, 14), mm512_ror_64(x, 18) ), mm512_ror_64(x, 41) )
#define SSG8W_5_0(x) \
_mm512_xor_si512( _mm512_xor_si512( \
mm512_ror_64(x, 1), mm512_ror_64(x, 8) ), _mm512_srli_epi64(x, 7) )
#define SSG8W_5_1(x) \
_mm512_xor_si512( _mm512_xor_si512( \
mm512_ror_64(x, 19), mm512_ror_64(x, 61) ), _mm512_srli_epi64(x, 6) )
static inline __m512i ssg8w_512_add( __m512i w0, __m512i w1 )
{
__m512i w0a, w1a, w0b, w1b;
w0a = mm512_ror_64( w0, 1 );
w1a = mm512_ror_64( w1,19 );
w0b = mm512_ror_64( w0, 8 );
w1b = mm512_ror_64( w1,61 );
w0a = _mm512_xor_si512( w0a, w0b );
w1a = _mm512_xor_si512( w1a, w1b );
w0b = _mm512_srli_epi64( w0, 7 );
w1b = _mm512_srli_epi64( w1, 6 );
w0a = _mm512_xor_si512( w0a, w0b );
w1a = _mm512_xor_si512( w1a, w1b );
return _mm512_add_epi64( w0a, w1a );
}
#define SSG8W_512x2_0( w0, w1, i ) do \
{ \
__m512i X0a, X1a, X0b, X1b; \
X0a = mm512_ror_64( W[i-15], 1 ); \
X1a = mm512_ror_64( W[i-14], 1 ); \
X0b = mm512_ror_64( W[i-15], 8 ); \
X1b = mm512_ror_64( W[i-14], 8 ); \
X0a = _mm512_xor_si512( X0a, X0b ); \
X1a = _mm512_xor_si512( X1a, X1b ); \
X0b = _mm512_srli_epi64( W[i-15], 7 ); \
X1b = _mm512_srli_epi64( W[i-14], 7 ); \
w0 = _mm512_xor_si512( X0a, X0b ); \
w1 = _mm512_xor_si512( X1a, X1b ); \
} while(0)
#define SSG8W_512x2_1( w0, w1, i ) do \
{ \
__m512i X0a, X1a, X0b, X1b; \
X0a = mm512_ror_64( W[i-2],19 ); \
X1a = mm512_ror_64( W[i-1],19 ); \
X0b = mm512_ror_64( W[i-2],61 ); \
X1b = mm512_ror_64( W[i-1],61 ); \
X0a = _mm512_xor_si512( X0a, X0b ); \
X1a = _mm512_xor_si512( X1a, X1b ); \
X0b = _mm512_srli_epi64( W[i-2], 6 ); \
X1b = _mm512_srli_epi64( W[i-1], 6 ); \
w0 = _mm512_xor_si512( X0a, X0b ); \
w1 = _mm512_xor_si512( X1a, X1b ); \
} while(0)
#define SHA3_8WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
__m512i T1, T2; \
__m512i K = _mm512_set1_epi64( K512[ i ] ); \
T1 = _mm512_add_epi64( H, mm512_add4_64( BSG8W_5_1(E), CH8W(E, F, G), \
K, W[i] ) ); \
T2 = _mm512_add_epi64( BSG8W_5_0(A), MAJ8W(A, B, C) ); \
D = _mm512_add_epi64( D, T1 ); \
H = _mm512_add_epi64( T1, T2 ); \
} while (0)
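// Standard SHA-512 round applied to all 8 lanes at once:
// T1 = h + BSG1(e) + Ch(e,f,g) + K[i] + W[i], T2 = BSG0(a) + Maj(a,b,c),
// d += T1, h = T1 + T2.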
static void
sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] )
{
int i;
register __m512i A, B, C, D, E, F, G, H;
__m512i W[80];
mm512_block_bswap_64( W , in );
mm512_block_bswap_64( W+8, in+8 );
for ( i = 16; i < 80; i++ )
W[i] = _mm512_add_epi64( ssg8w_512_add( W[i-15], W[i-2] ),
_mm512_add_epi64( W[ i- 7 ], W[ i-16 ] ) );
if ( ctx->initialized )
{
A = r[0];
B = r[1];
C = r[2];
D = r[3];
E = r[4];
F = r[5];
G = r[6];
H = r[7];
}
else
{
A = m512_const1_64( 0x6A09E667F3BCC908 );
B = m512_const1_64( 0xBB67AE8584CAA73B );
C = m512_const1_64( 0x3C6EF372FE94F82B );
D = m512_const1_64( 0xA54FF53A5F1D36F1 );
E = m512_const1_64( 0x510E527FADE682D1 );
F = m512_const1_64( 0x9B05688C2B3E6C1F );
G = m512_const1_64( 0x1F83D9ABFB41BD6B );
H = m512_const1_64( 0x5BE0CD19137E2179 );
}
for ( i = 0; i < 80; i += 8 )
{
SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, i + 0 );
SHA3_8WAY_STEP( H, A, B, C, D, E, F, G, i + 1 );
SHA3_8WAY_STEP( G, H, A, B, C, D, E, F, i + 2 );
SHA3_8WAY_STEP( F, G, H, A, B, C, D, E, i + 3 );
SHA3_8WAY_STEP( E, F, G, H, A, B, C, D, i + 4 );
SHA3_8WAY_STEP( D, E, F, G, H, A, B, C, i + 5 );
SHA3_8WAY_STEP( C, D, E, F, G, H, A, B, i + 6 );
SHA3_8WAY_STEP( B, C, D, E, F, G, H, A, i + 7 );
}
if ( ctx->initialized )
{
r[0] = _mm512_add_epi64( r[0], A );
r[1] = _mm512_add_epi64( r[1], B );
r[2] = _mm512_add_epi64( r[2], C );
r[3] = _mm512_add_epi64( r[3], D );
r[4] = _mm512_add_epi64( r[4], E );
r[5] = _mm512_add_epi64( r[5], F );
r[6] = _mm512_add_epi64( r[6], G );
r[7] = _mm512_add_epi64( r[7], H );
}
else
{
ctx->initialized = true;
r[0] = _mm512_add_epi64( A, m512_const1_64( 0x6A09E667F3BCC908 ) );
r[1] = _mm512_add_epi64( B, m512_const1_64( 0xBB67AE8584CAA73B ) );
r[2] = _mm512_add_epi64( C, m512_const1_64( 0x3C6EF372FE94F82B ) );
r[3] = _mm512_add_epi64( D, m512_const1_64( 0xA54FF53A5F1D36F1 ) );
r[4] = _mm512_add_epi64( E, m512_const1_64( 0x510E527FADE682D1 ) );
r[5] = _mm512_add_epi64( F, m512_const1_64( 0x9B05688C2B3E6C1F ) );
r[6] = _mm512_add_epi64( G, m512_const1_64( 0x1F83D9ABFB41BD6B ) );
r[7] = _mm512_add_epi64( H, m512_const1_64( 0x5BE0CD19137E2179 ) );
}
}
void sha512_8way_init( sha512_8way_context *sc )
{
sc->initialized = false;
sc->count = 0;
}
void sha512_8way_update( sha512_8way_context *sc, const void *data, size_t len )
{
__m512i *vdata = (__m512i*)data;
size_t ptr;
const int buf_size = 128;
ptr = (unsigned)sc->count & (buf_size - 1U);
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_512( sc->buf + (ptr>>3), vdata, clen>>3 );
vdata = vdata + (clen>>3);
ptr += clen;
len -= clen;
if ( ptr == buf_size )
{
sha512_8way_round( sc, sc->buf, sc->val );
ptr = 0;
}
sc->count += clen;
}
}
void sha512_8way_close( sha512_8way_context *sc, void *dst )
{
unsigned ptr;
const int buf_size = 128;
const int pad = buf_size - 16;
const __m512i shuff_bswap64 = m512_const_64(
0x38393a3b3c3d3e3f, 0x3031323334353637,
0x28292a2b2c2d2e2f, 0x2021222324252627,
0x18191a1b1c1d1e1f, 0x1011121314151617,
0x08090a0b0c0d0e0f, 0x0001020304050607 );
ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = m512_const1_64( 0x80 );
ptr += 8;
if ( ptr > pad )
{
memset_zero_512( sc->buf + (ptr>>3), (buf_size - ptr) >> 3 );
sha512_8way_round( sc, sc->buf, sc->val );
memset_zero_512( sc->buf, pad >> 3 );
}
else
memset_zero_512( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
sc->buf[ pad >> 3 ] = _mm512_shuffle_epi8(
_mm512_set1_epi64( sc->count >> 61 ), shuff_bswap64 );
sc->buf[ ( pad+8 ) >> 3 ] = _mm512_shuffle_epi8(
_mm512_set1_epi64( sc->count << 3 ), shuff_bswap64 );
sha512_8way_round( sc, sc->buf, sc->val );
mm512_block_bswap_64( dst, sc->val );
}
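/* Minimal usage sketch (illustration only): hash eight independent 80 byte
   messages in parallel. The input must already be interleaved as 8 lanes of
   64 bit words (e.g. with intrlv_8x64) and the 512 bit digest of each lane
   comes back interleaved the same way; len is bytes per lane.

       __m512i vdata[10] __attribute__ ((aligned (64)));   // 8 x 80 bytes, interleaved
       __m512i vhash[8]  __attribute__ ((aligned (64)));   // 8 x 64 byte digests
       sha512_8way_context ctx;
       sha512_8way_init( &ctx );
       sha512_8way_update( &ctx, vdata, 80 );
       sha512_8way_close( &ctx, vhash );
*/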
#endif // AVX512
// SHA-512 4 way 64 bit
#define CH(X, Y, Z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
@@ -254,7 +482,7 @@ void sha512_4way_init( sha512_4way_context *sc )
sc->count = 0;
}
void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
void sha512_4way_update( sha512_4way_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
size_t ptr;

View File

@@ -33,7 +33,7 @@
#include <stddef.h>
#include <string.h>
#ifdef __AVX2__
#ifdef __SSE4_1__
#include "shabal-hash-4way.h"
#ifdef __cplusplus
@@ -58,6 +58,599 @@ extern "C"{
#define O2 9
#define O3 6
#if defined(__AVX2__)
#define DECL_STATE8 \
__m256i A00, A01, A02, A03, A04, A05, A06, A07, \
A08, A09, A0A, A0B; \
__m256i B0, B1, B2, B3, B4, B5, B6, B7, \
B8, B9, BA, BB, BC, BD, BE, BF; \
__m256i C0, C1, C2, C3, C4, C5, C6, C7, \
C8, C9, CA, CB, CC, CD, CE, CF; \
__m256i M0, M1, M2, M3, M4, M5, M6, M7, \
M8, M9, MA, MB, MC, MD, ME, MF; \
sph_u32 Wlow, Whigh;
#define READ_STATE8(state) do \
{ \
if ( (state)->state_loaded ) \
{ \
A00 = (state)->A[0]; \
A01 = (state)->A[1]; \
A02 = (state)->A[2]; \
A03 = (state)->A[3]; \
A04 = (state)->A[4]; \
A05 = (state)->A[5]; \
A06 = (state)->A[6]; \
A07 = (state)->A[7]; \
A08 = (state)->A[8]; \
A09 = (state)->A[9]; \
A0A = (state)->A[10]; \
A0B = (state)->A[11]; \
B0 = (state)->B[0]; \
B1 = (state)->B[1]; \
B2 = (state)->B[2]; \
B3 = (state)->B[3]; \
B4 = (state)->B[4]; \
B5 = (state)->B[5]; \
B6 = (state)->B[6]; \
B7 = (state)->B[7]; \
B8 = (state)->B[8]; \
B9 = (state)->B[9]; \
BA = (state)->B[10]; \
BB = (state)->B[11]; \
BC = (state)->B[12]; \
BD = (state)->B[13]; \
BE = (state)->B[14]; \
BF = (state)->B[15]; \
C0 = (state)->C[0]; \
C1 = (state)->C[1]; \
C2 = (state)->C[2]; \
C3 = (state)->C[3]; \
C4 = (state)->C[4]; \
C5 = (state)->C[5]; \
C6 = (state)->C[6]; \
C7 = (state)->C[7]; \
C8 = (state)->C[8]; \
C9 = (state)->C[9]; \
CA = (state)->C[10]; \
CB = (state)->C[11]; \
CC = (state)->C[12]; \
CD = (state)->C[13]; \
CE = (state)->C[14]; \
CF = (state)->C[15]; \
} \
else \
{ \
(state)->state_loaded = true; \
A00 = m256_const1_64( 0x20728DFD20728DFD ); \
A01 = m256_const1_64( 0x46C0BD5346C0BD53 ); \
A02 = m256_const1_64( 0xE782B699E782B699 ); \
A03 = m256_const1_64( 0x5530463255304632 ); \
A04 = m256_const1_64( 0x71B4EF9071B4EF90 ); \
A05 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \
A06 = m256_const1_64( 0xDBB930F1DBB930F1 ); \
A07 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \
A08 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \
A09 = m256_const1_64( 0x8BD144108BD14410 ); \
A0A = m256_const1_64( 0x76D2ADAC76D2ADAC ); \
A0B = m256_const1_64( 0x28ACAB7F28ACAB7F ); \
B0 = m256_const1_64( 0xC1099CB7C1099CB7 ); \
B1 = m256_const1_64( 0x07B385F307B385F3 ); \
B2 = m256_const1_64( 0xE7442C26E7442C26 ); \
B3 = m256_const1_64( 0xCC8AD640CC8AD640 ); \
B4 = m256_const1_64( 0xEB6F56C7EB6F56C7 ); \
B5 = m256_const1_64( 0x1EA81AA91EA81AA9 ); \
B6 = m256_const1_64( 0x73B9D31473B9D314 ); \
B7 = m256_const1_64( 0x1DE85D081DE85D08 ); \
B8 = m256_const1_64( 0x48910A5A48910A5A ); \
B9 = m256_const1_64( 0x893B22DB893B22DB ); \
BA = m256_const1_64( 0xC5A0DF44C5A0DF44 ); \
BB = m256_const1_64( 0xBBC4324EBBC4324E ); \
BC = m256_const1_64( 0x72D2F24072D2F240 ); \
BD = m256_const1_64( 0x75941D9975941D99 ); \
BE = m256_const1_64( 0x6D8BDE826D8BDE82 ); \
BF = m256_const1_64( 0xA1A7502BA1A7502B ); \
C0 = m256_const1_64( 0xD9BF68D1D9BF68D1 ); \
C1 = m256_const1_64( 0x58BAD75058BAD750 ); \
C2 = m256_const1_64( 0x56028CB256028CB2 ); \
C3 = m256_const1_64( 0x8134F3598134F359 ); \
C4 = m256_const1_64( 0xB5D469D8B5D469D8 ); \
C5 = m256_const1_64( 0x941A8CC2941A8CC2 ); \
C6 = m256_const1_64( 0x418B2A6E418B2A6E ); \
C7 = m256_const1_64( 0x0405278004052780 ); \
C8 = m256_const1_64( 0x7F07D7877F07D787 ); \
C9 = m256_const1_64( 0x5194358F5194358F ); \
CA = m256_const1_64( 0x3C60D6653C60D665 ); \
CB = m256_const1_64( 0xBE97D79ABE97D79A ); \
CC = m256_const1_64( 0x950C3434950C3434 ); \
CD = m256_const1_64( 0xAED9A06DAED9A06D ); \
CE = m256_const1_64( 0x2537DC8D2537DC8D ); \
CF = m256_const1_64( 0x7CDB59697CDB5969 ); \
} \
Wlow = (state)->Wlow; \
Whigh = (state)->Whigh; \
} while (0)
#define WRITE_STATE8(state) do { \
(state)->A[0] = A00; \
(state)->A[1] = A01; \
(state)->A[2] = A02; \
(state)->A[3] = A03; \
(state)->A[4] = A04; \
(state)->A[5] = A05; \
(state)->A[6] = A06; \
(state)->A[7] = A07; \
(state)->A[8] = A08; \
(state)->A[9] = A09; \
(state)->A[10] = A0A; \
(state)->A[11] = A0B; \
(state)->B[0] = B0; \
(state)->B[1] = B1; \
(state)->B[2] = B2; \
(state)->B[3] = B3; \
(state)->B[4] = B4; \
(state)->B[5] = B5; \
(state)->B[6] = B6; \
(state)->B[7] = B7; \
(state)->B[8] = B8; \
(state)->B[9] = B9; \
(state)->B[10] = BA; \
(state)->B[11] = BB; \
(state)->B[12] = BC; \
(state)->B[13] = BD; \
(state)->B[14] = BE; \
(state)->B[15] = BF; \
(state)->C[0] = C0; \
(state)->C[1] = C1; \
(state)->C[2] = C2; \
(state)->C[3] = C3; \
(state)->C[4] = C4; \
(state)->C[5] = C5; \
(state)->C[6] = C6; \
(state)->C[7] = C7; \
(state)->C[8] = C8; \
(state)->C[9] = C9; \
(state)->C[10] = CA; \
(state)->C[11] = CB; \
(state)->C[12] = CC; \
(state)->C[13] = CD; \
(state)->C[14] = CE; \
(state)->C[15] = CF; \
(state)->Wlow = Wlow; \
(state)->Whigh = Whigh; \
} while (0)
#define DECODE_BLOCK8 \
do { \
M0 = buf[ 0]; \
M1 = buf[ 1]; \
M2 = buf[ 2]; \
M3 = buf[ 3]; \
M4 = buf[ 4]; \
M5 = buf[ 5]; \
M6 = buf[ 6]; \
M7 = buf[ 7]; \
M8 = buf[ 8]; \
M9 = buf[ 9]; \
MA = buf[10]; \
MB = buf[11]; \
MC = buf[12]; \
MD = buf[13]; \
ME = buf[14]; \
MF = buf[15]; \
} while (0)
#define INPUT_BLOCK_ADD8 \
do { \
B0 = _mm256_add_epi32( B0, M0 );\
B1 = _mm256_add_epi32( B1, M1 );\
B2 = _mm256_add_epi32( B2, M2 );\
B3 = _mm256_add_epi32( B3, M3 );\
B4 = _mm256_add_epi32( B4, M4 );\
B5 = _mm256_add_epi32( B5, M5 );\
B6 = _mm256_add_epi32( B6, M6 );\
B7 = _mm256_add_epi32( B7, M7 );\
B8 = _mm256_add_epi32( B8, M8 );\
B9 = _mm256_add_epi32( B9, M9 );\
BA = _mm256_add_epi32( BA, MA );\
BB = _mm256_add_epi32( BB, MB );\
BC = _mm256_add_epi32( BC, MC );\
BD = _mm256_add_epi32( BD, MD );\
BE = _mm256_add_epi32( BE, ME );\
BF = _mm256_add_epi32( BF, MF );\
} while (0)
#define INPUT_BLOCK_SUB8 \
do { \
C0 = _mm256_sub_epi32( C0, M0 ); \
C1 = _mm256_sub_epi32( C1, M1 ); \
C2 = _mm256_sub_epi32( C2, M2 ); \
C3 = _mm256_sub_epi32( C3, M3 ); \
C4 = _mm256_sub_epi32( C4, M4 ); \
C5 = _mm256_sub_epi32( C5, M5 ); \
C6 = _mm256_sub_epi32( C6, M6 ); \
C7 = _mm256_sub_epi32( C7, M7 ); \
C8 = _mm256_sub_epi32( C8, M8 ); \
C9 = _mm256_sub_epi32( C9, M9 ); \
CA = _mm256_sub_epi32( CA, MA ); \
CB = _mm256_sub_epi32( CB, MB ); \
CC = _mm256_sub_epi32( CC, MC ); \
CD = _mm256_sub_epi32( CD, MD ); \
CE = _mm256_sub_epi32( CE, ME ); \
CF = _mm256_sub_epi32( CF, MF ); \
} while (0)
#define XOR_W8 \
do { \
A00 = _mm256_xor_si256( A00, _mm256_set1_epi32( Wlow ) ); \
A01 = _mm256_xor_si256( A01, _mm256_set1_epi32( Whigh ) ); \
} while (0)
#define SWAP_BC8 \
do { \
mm256_swap512_256( B0, C0 ); \
mm256_swap512_256( B1, C1 ); \
mm256_swap512_256( B2, C2 ); \
mm256_swap512_256( B3, C3 ); \
mm256_swap512_256( B4, C4 ); \
mm256_swap512_256( B5, C5 ); \
mm256_swap512_256( B6, C6 ); \
mm256_swap512_256( B7, C7 ); \
mm256_swap512_256( B8, C8 ); \
mm256_swap512_256( B9, C9 ); \
mm256_swap512_256( BA, CA ); \
mm256_swap512_256( BB, CB ); \
mm256_swap512_256( BC, CC ); \
mm256_swap512_256( BD, CD ); \
mm256_swap512_256( BE, CE ); \
mm256_swap512_256( BF, CF ); \
} while (0)
#define PERM_ELT8(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
do { \
xa0 = _mm256_xor_si256( xm, _mm256_xor_si256( xb1, _mm256_xor_si256( \
_mm256_andnot_si256( xb3, xb2 ), \
_mm256_mullo_epi32( _mm256_xor_si256( xa0, _mm256_xor_si256( xc, \
_mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), _mm256_set1_epi32(5UL) ) \
) ), _mm256_set1_epi32(3UL) ) ) ) ); \
xb0 = mm256_not( _mm256_xor_si256( xa0, mm256_rol_32( xb0, 1 ) ) ); \
} while (0)
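// Scalar reference for one permutation element (per 32-bit lane), shown only
// as a readability aid for the vector macro above:
//   xa0 = xm ^ xb1 ^ (~xb3 & xb2) ^ 3 * ( xa0 ^ xc ^ 5 * rotl32( xa1, 15 ) );
//   xb0 = ~( xa0 ^ rotl32( xb0, 1 ) );   // uses the updated xa0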
#define PERM_STEP_0_8 do { \
PERM_ELT8(A00, A0B, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A01, A00, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A02, A01, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A03, A02, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A04, A03, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A05, A04, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A06, A05, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A07, A06, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A08, A07, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A09, A08, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A0A, A09, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A0B, A0A, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A00, A0B, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A01, A00, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A02, A01, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A03, A02, BF, BC, B8, B5, C9, MF); \
} while (0)
#define PERM_STEP_1_8 do { \
PERM_ELT8(A04, A03, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A05, A04, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A06, A05, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A07, A06, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A08, A07, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A09, A08, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A0A, A09, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A0B, A0A, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A00, A0B, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A01, A00, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A02, A01, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A03, A02, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A04, A03, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A05, A04, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A06, A05, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A07, A06, BF, BC, B8, B5, C9, MF); \
} while (0)
#define PERM_STEP_2_8 do { \
PERM_ELT8(A08, A07, B0, BD, B9, B6, C8, M0); \
PERM_ELT8(A09, A08, B1, BE, BA, B7, C7, M1); \
PERM_ELT8(A0A, A09, B2, BF, BB, B8, C6, M2); \
PERM_ELT8(A0B, A0A, B3, B0, BC, B9, C5, M3); \
PERM_ELT8(A00, A0B, B4, B1, BD, BA, C4, M4); \
PERM_ELT8(A01, A00, B5, B2, BE, BB, C3, M5); \
PERM_ELT8(A02, A01, B6, B3, BF, BC, C2, M6); \
PERM_ELT8(A03, A02, B7, B4, B0, BD, C1, M7); \
PERM_ELT8(A04, A03, B8, B5, B1, BE, C0, M8); \
PERM_ELT8(A05, A04, B9, B6, B2, BF, CF, M9); \
PERM_ELT8(A06, A05, BA, B7, B3, B0, CE, MA); \
PERM_ELT8(A07, A06, BB, B8, B4, B1, CD, MB); \
PERM_ELT8(A08, A07, BC, B9, B5, B2, CC, MC); \
PERM_ELT8(A09, A08, BD, BA, B6, B3, CB, MD); \
PERM_ELT8(A0A, A09, BE, BB, B7, B4, CA, ME); \
PERM_ELT8(A0B, A0A, BF, BC, B8, B5, C9, MF); \
} while (0)
#define APPLY_P8 \
do { \
B0 = mm256_ror_32( B0, 15 ); \
B1 = mm256_ror_32( B1, 15 ); \
B2 = mm256_ror_32( B2, 15 ); \
B3 = mm256_ror_32( B3, 15 ); \
B4 = mm256_ror_32( B4, 15 ); \
B5 = mm256_ror_32( B5, 15 ); \
B6 = mm256_ror_32( B6, 15 ); \
B7 = mm256_ror_32( B7, 15 ); \
B8 = mm256_ror_32( B8, 15 ); \
B9 = mm256_ror_32( B9, 15 ); \
BA = mm256_ror_32( BA, 15 ); \
BB = mm256_ror_32( BB, 15 ); \
BC = mm256_ror_32( BC, 15 ); \
BD = mm256_ror_32( BD, 15 ); \
BE = mm256_ror_32( BE, 15 ); \
BF = mm256_ror_32( BF, 15 ); \
PERM_STEP_0_8; \
PERM_STEP_1_8; \
PERM_STEP_2_8; \
A0B = _mm256_add_epi32( A0B, C6 ); \
A0A = _mm256_add_epi32( A0A, C5 ); \
A09 = _mm256_add_epi32( A09, C4 ); \
A08 = _mm256_add_epi32( A08, C3 ); \
A07 = _mm256_add_epi32( A07, C2 ); \
A06 = _mm256_add_epi32( A06, C1 ); \
A05 = _mm256_add_epi32( A05, C0 ); \
A04 = _mm256_add_epi32( A04, CF ); \
A03 = _mm256_add_epi32( A03, CE ); \
A02 = _mm256_add_epi32( A02, CD ); \
A01 = _mm256_add_epi32( A01, CC ); \
A00 = _mm256_add_epi32( A00, CB ); \
A0B = _mm256_add_epi32( A0B, CA ); \
A0A = _mm256_add_epi32( A0A, C9 ); \
A09 = _mm256_add_epi32( A09, C8 ); \
A08 = _mm256_add_epi32( A08, C7 ); \
A07 = _mm256_add_epi32( A07, C6 ); \
A06 = _mm256_add_epi32( A06, C5 ); \
A05 = _mm256_add_epi32( A05, C4 ); \
A04 = _mm256_add_epi32( A04, C3 ); \
A03 = _mm256_add_epi32( A03, C2 ); \
A02 = _mm256_add_epi32( A02, C1 ); \
A01 = _mm256_add_epi32( A01, C0 ); \
A00 = _mm256_add_epi32( A00, CF ); \
A0B = _mm256_add_epi32( A0B, CE ); \
A0A = _mm256_add_epi32( A0A, CD ); \
A09 = _mm256_add_epi32( A09, CC ); \
A08 = _mm256_add_epi32( A08, CB ); \
A07 = _mm256_add_epi32( A07, CA ); \
A06 = _mm256_add_epi32( A06, C9 ); \
A05 = _mm256_add_epi32( A05, C8 ); \
A04 = _mm256_add_epi32( A04, C7 ); \
A03 = _mm256_add_epi32( A03, C6 ); \
A02 = _mm256_add_epi32( A02, C5 ); \
A01 = _mm256_add_epi32( A01, C4 ); \
A00 = _mm256_add_epi32( A00, C3 ); \
} while (0)
#define INCR_W8 do { \
if ((Wlow = T32(Wlow + 1)) == 0) \
Whigh = T32(Whigh + 1); \
} while (0)
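// W is Shabal's 64-bit block counter kept as two 32-bit halves; INCR_W8
// advances it once per 64-byte block and XOR_W8 folds it into A[0]/A[1].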
static void
shabal_8way_init( void *cc, unsigned size )
{
shabal_8way_context *sc = (shabal_8way_context*)cc;
if ( size == 512 )
{ // copy immediate constants directly to working registers later.
sc->state_loaded = false;
}
else
{ // No users
sc->state_loaded = true;
sc->A[ 0] = m256_const1_64( 0x52F8455252F84552 );
sc->A[ 1] = m256_const1_64( 0xE54B7999E54B7999 );
sc->A[ 2] = m256_const1_64( 0x2D8EE3EC2D8EE3EC );
sc->A[ 3] = m256_const1_64( 0xB9645191B9645191 );
sc->A[ 4] = m256_const1_64( 0xE0078B86E0078B86 );
sc->A[ 5] = m256_const1_64( 0xBB7C44C9BB7C44C9 );
sc->A[ 6] = m256_const1_64( 0xD2B5C1CAD2B5C1CA );
sc->A[ 7] = m256_const1_64( 0xB0D2EB8CB0D2EB8C );
sc->A[ 8] = m256_const1_64( 0x14CE5A4514CE5A45 );
sc->A[ 9] = m256_const1_64( 0x22AF50DC22AF50DC );
sc->A[10] = m256_const1_64( 0xEFFDBC6BEFFDBC6B );
sc->A[11] = m256_const1_64( 0xEB21B74AEB21B74A );
sc->B[ 0] = m256_const1_64( 0xB555C6EEB555C6EE );
sc->B[ 1] = m256_const1_64( 0x3E7105963E710596 );
sc->B[ 2] = m256_const1_64( 0xA72A652FA72A652F );
sc->B[ 3] = m256_const1_64( 0x9301515F9301515F );
sc->B[ 4] = m256_const1_64( 0xDA28C1FADA28C1FA );
sc->B[ 5] = m256_const1_64( 0x696FD868696FD868 );
sc->B[ 6] = m256_const1_64( 0x9CB6BF729CB6BF72 );
sc->B[ 7] = m256_const1_64( 0x0AFE40020AFE4002 );
sc->B[ 8] = m256_const1_64( 0xA6E03615A6E03615 );
sc->B[ 9] = m256_const1_64( 0x5138C1D45138C1D4 );
sc->B[10] = m256_const1_64( 0xBE216306BE216306 );
sc->B[11] = m256_const1_64( 0xB38B8890B38B8890 );
sc->B[12] = m256_const1_64( 0x3EA8B96B3EA8B96B );
sc->B[13] = m256_const1_64( 0x3299ACE43299ACE4 );
sc->B[14] = m256_const1_64( 0x30924DD430924DD4 );
sc->B[15] = m256_const1_64( 0x55CB34A555CB34A5 );
sc->C[ 0] = m256_const1_64( 0xB405F031B405F031 );
sc->C[ 1] = m256_const1_64( 0xC4233EBAC4233EBA );
sc->C[ 2] = m256_const1_64( 0xB3733979B3733979 );
sc->C[ 3] = m256_const1_64( 0xC0DD9D55C0DD9D55 );
sc->C[ 4] = m256_const1_64( 0xC51C28AEC51C28AE );
sc->C[ 5] = m256_const1_64( 0xA327B8E1A327B8E1 );
sc->C[ 6] = m256_const1_64( 0x56C5616756C56167 );
sc->C[ 7] = m256_const1_64( 0xED614433ED614433 );
sc->C[ 8] = m256_const1_64( 0x88B59D6088B59D60 );
sc->C[ 9] = m256_const1_64( 0x60E2CEBA60E2CEBA );
sc->C[10] = m256_const1_64( 0x758B4B8B758B4B8B );
sc->C[11] = m256_const1_64( 0x83E82A7F83E82A7F );
sc->C[12] = m256_const1_64( 0xBC968828BC968828 );
sc->C[13] = m256_const1_64( 0xE6E00BF7E6E00BF7 );
sc->C[14] = m256_const1_64( 0xBA839E55BA839E55 );
sc->C[15] = m256_const1_64( 0x9B491C609B491C60 );
}
sc->Wlow = 1;
sc->Whigh = 0;
sc->ptr = 0;
}
static void
shabal_8way_core( void *cc, const unsigned char *data, size_t len )
{
shabal_8way_context *sc = (shabal_8way_context*)cc;
__m256i *buf;
__m256i *vdata = (__m256i*)data;
const int buf_size = 64;
size_t ptr;
DECL_STATE8
buf = sc->buf;
ptr = sc->ptr;
if ( len < (buf_size - ptr ) )
{
memcpy_256( buf + (ptr>>2), vdata, len>>2 );
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE8( sc );
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_256( buf + (ptr>>2), vdata, clen>>2 );
ptr += clen;
vdata += clen>>2;
len -= clen;
if ( ptr == buf_size )
{
DECODE_BLOCK8;
INPUT_BLOCK_ADD8;
XOR_W8;
APPLY_P8;
INPUT_BLOCK_SUB8;
SWAP_BC8;
INCR_W8;
ptr = 0;
}
}
WRITE_STATE8(sc);
sc->ptr = ptr;
}
static void
shabal_8way_close( void *cc, unsigned ub, unsigned n, void *dst,
unsigned size_words )
{
shabal_8way_context *sc = (shabal_8way_context*)cc;
__m256i *buf;
const int buf_size = 64;
size_t ptr;
int i;
unsigned z, zz;
DECL_STATE8
buf = sc->buf;
ptr = sc->ptr;
z = 0x80 >> n;
zz = ((ub & -z) | z) & 0xFF;
buf[ptr>>2] = _mm256_set1_epi32( zz );
memset_zero_256( buf + (ptr>>2) + 1, ( (buf_size - ptr) >> 2 ) - 1 );
READ_STATE8(sc);
DECODE_BLOCK8;
INPUT_BLOCK_ADD8;
XOR_W8;
APPLY_P8;
for ( i = 0; i < 3; i ++ )
{
SWAP_BC8;
XOR_W8;
APPLY_P8;
}
__m256i *d = (__m256i*)dst;
if ( size_words == 16 ) // 512
{
d[ 0] = B0; d[ 1] = B1; d[ 2] = B2; d[ 3] = B3;
d[ 4] = B4; d[ 5] = B5; d[ 6] = B6; d[ 7] = B7;
d[ 8] = B8; d[ 9] = B9; d[10] = BA; d[11] = BB;
d[12] = BC; d[13] = BD; d[14] = BE; d[15] = BF;
}
else // 256
{
d[ 0] = B8; d[ 1] = B9; d[ 2] = BA; d[ 3] = BB;
d[ 4] = BC; d[ 5] = BD; d[ 6] = BE; d[ 7] = BF;
}
}
void
shabal256_8way_init( void *cc )
{
shabal_8way_init(cc, 256);
}
void
shabal256_8way_update( void *cc, const void *data, size_t len )
{
shabal_8way_core( cc, data, len );
}
void
shabal256_8way_close( void *cc, void *dst )
{
shabal_8way_close(cc, 0, 0, dst, 8);
}
void
shabal256_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst )
{
shabal_8way_close(cc, ub, n, dst, 8);
}
void
shabal512_8way_init(void *cc)
{
shabal_8way_init(cc, 512);
}
void
shabal512_8way_update(void *cc, const void *data, size_t len)
{
shabal_8way_core(cc, data, len);
}
void
shabal512_8way_close(void *cc, void *dst)
{
shabal_8way_close(cc, 0, 0, dst, 16);
}
void
shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
shabal_8way_close(cc, ub, n, dst, 16);
}
#endif // AVX2
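// Illustrative usage sketch (not part of the library): Shabal-512 over eight
// lanes. Input and output are in 8x32-bit interleaved form.
//
//   shabal512_8way_context c;
//   shabal512_8way_init( &c );
//   shabal512_8way_update( &c, vdata, 64 );
//   shabal512_8way_close( &c, vhash );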
/*
* We copy the state into local variables, so that the compiler knows
* that it can optimize them at will.
@@ -290,6 +883,8 @@ do { \
A00 = _mm_xor_si128( A00, _mm_set1_epi32( Wlow ) ); \
A01 = _mm_xor_si128( A01, _mm_set1_epi32( Whigh ) ); \
} while (0)
/*
#define SWAP(v1, v2) do { \
sph_u32 tmp = (v1); \
@@ -297,26 +892,39 @@ do { \
(v2) = tmp; \
} while (0)
*/
#define SWAP_BC \
do { \
mm128_swap128_256( B0, C0 ); \
mm128_swap128_256( B1, C1 ); \
mm128_swap128_256( B2, C2 ); \
mm128_swap128_256( B3, C3 ); \
mm128_swap128_256( B4, C4 ); \
mm128_swap128_256( B5, C5 ); \
mm128_swap128_256( B6, C6 ); \
mm128_swap128_256( B7, C7 ); \
mm128_swap128_256( B8, C8 ); \
mm128_swap128_256( B9, C9 ); \
mm128_swap128_256( BA, CA ); \
mm128_swap128_256( BB, CB ); \
mm128_swap128_256( BC, CC ); \
mm128_swap128_256( BD, CD ); \
mm128_swap128_256( BE, CE ); \
mm128_swap128_256( BF, CF ); \
mm128_swap256_128( B0, C0 ); \
mm128_swap256_128( B1, C1 ); \
mm128_swap256_128( B2, C2 ); \
mm128_swap256_128( B3, C3 ); \
mm128_swap256_128( B4, C4 ); \
mm128_swap256_128( B5, C5 ); \
mm128_swap256_128( B6, C6 ); \
mm128_swap256_128( B7, C7 ); \
mm128_swap256_128( B8, C8 ); \
mm128_swap256_128( B9, C9 ); \
mm128_swap256_128( BA, CA ); \
mm128_swap256_128( BB, CB ); \
mm128_swap256_128( BC, CC ); \
mm128_swap256_128( BD, CD ); \
mm128_swap256_128( BE, CE ); \
mm128_swap256_128( BF, CF ); \
} while (0)
/*
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
do { \
__m128i t1 = _mm_mullo_epi32( mm_rol_32( xa1, 15 ),\
_mm_set1_epi32(5UL) ) \
__m128i t2 = _mm_xor_si128( xa0, xc ); \
xb0 = mm_not( _mm_xor_si256( xa0, mm_rol_32( xb0, 1 ) ) ); \
xa0 = mm_xor4( xm, xb1, _mm_andnot_si128( xb3, xb2 ), \
_mm_xor_si128( t2, \
_mm_mullo_epi32( t1, _mm_set1_epi32(5UL) ) ) ) \
*/
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
do { \
xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \
@@ -706,7 +1314,7 @@ shabal256_4way_init( void *cc )
}
void
shabal256_4way( void *cc, const void *data, size_t len )
shabal256_4way_update( void *cc, const void *data, size_t len )
{
shabal_4way_core( cc, data, len );
}
@@ -731,7 +1339,7 @@ shabal512_4way_init(void *cc)
}
void
shabal512_4way(void *cc, const void *data, size_t len)
shabal512_4way_update(void *cc, const void *data, size_t len)
{
shabal_4way_core(cc, data, len);
}


@@ -36,7 +36,7 @@
#ifndef SHABAL_HASH_4WAY_H__
#define SHABAL_HASH_4WAY_H__ 1
#ifdef __AVX2__
#ifdef __SSE4_1__
#include <stddef.h>
#include "algo/sha/sph_types.h"
@@ -50,6 +50,34 @@ extern "C"{
#define SPH_SIZE_shabal512 512
#if defined(__AVX2__)
typedef struct {
__m256i buf[16];
__m256i A[12], B[16], C[16];
sph_u32 Whigh, Wlow;
size_t ptr;
bool state_loaded;
} shabal_8way_context __attribute__ ((aligned (64)));
typedef shabal_8way_context shabal256_8way_context;
typedef shabal_8way_context shabal512_8way_context;
void shabal256_8way_init( void *cc );
void shabal256_8way_update( void *cc, const void *data, size_t len );
void shabal256_8way_close( void *cc, void *dst );
void shabal256_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal512_8way_init( void *cc );
void shabal512_8way_update( void *cc, const void *data, size_t len );
void shabal512_8way_close( void *cc, void *dst );
void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
#endif
typedef struct {
__m128i buf[16] __attribute__ ((aligned (64)));
__m128i A[12], B[16], C[16];
@@ -62,13 +90,14 @@ typedef shabal_4way_context shabal256_4way_context;
typedef shabal_4way_context shabal512_4way_context;
void shabal256_4way_init( void *cc );
void shabal256_4way( void *cc, const void *data, size_t len );
void shabal256_4way_update( void *cc, const void *data, size_t len );
void shabal256_4way_close( void *cc, void *dst );
void shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
void shabal512_4way_init( void *cc );
void shabal512_4way( void *cc, const void *data, size_t len );
void shabal512_4way_update( void *cc, const void *data, size_t len );
#define shabal512_4way shabal512_4way_update
void shabal512_4way_close( void *cc, void *dst );
void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );


@@ -3,6 +3,12 @@
#include <stdio.h>
// This implementation is deprecated, superseded by VAES in Ice Lake
// which provides hardware-based 4-way AES.
// It was created for AVX2 to eliminate interleaving between the
// preceding and following functions.
// This code can be removed once current users have reverted to one-way.
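// For reference, a VAES-capable CPU performs the same two-lane AES round with
// a single instruction; a minimal sketch, assuming VAES is detected at build
// time (the helper name below matches the existing 2x128 emulation):
//
//   #if defined(__VAES__)
//   #define mm256_aesenc_2x128( x, k )   _mm256_aesenc_epi128( x, k )
//   #endif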
#if defined(__AVX2__)
@@ -16,8 +22,8 @@ static const uint32_t IV512[] =
#define mm256_ror2x256hi_1x32( a, b ) \
_mm256_blend_epi32( mm256_ror1x32_128( a ), \
mm256_ror1x32_128( b ), 0x88 )
_mm256_blend_epi32( mm256_ror128_32( a ), \
mm256_ror128_32( b ), 0x88 )
static void
c512_2way( shavite512_2way_context *ctx, const void *msg )
@@ -61,7 +67,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
{
// round 1, 5, 9
k00 = _mm256_xor_si256( k13, mm256_ror1x32_128(
k00 = _mm256_xor_si256( k13, mm256_ror128_32(
mm256_aesenc_2x128( k00, zero ) ) );
if ( r == 0 )
@@ -71,7 +77,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
k01 = _mm256_xor_si256( k00,
mm256_ror1x32_128( mm256_aesenc_2x128( k01, zero ) ) );
mm256_ror128_32( mm256_aesenc_2x128( k01, zero ) ) );
if ( r == 1 )
k01 = _mm256_xor_si256( k01, _mm256_set_epi32(
@@ -80,25 +86,25 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = _mm256_xor_si256( k01,
mm256_ror1x32_128( mm256_aesenc_2x128( k02, zero ) ) );
mm256_ror128_32( mm256_aesenc_2x128( k02, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
k03 = _mm256_xor_si256( k02,
mm256_ror1x32_128( mm256_aesenc_2x128( k03, zero ) ) );
mm256_ror128_32( mm256_aesenc_2x128( k03, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
p3 = _mm256_xor_si256( p3, x );
k10 = _mm256_xor_si256( k03,
mm256_ror1x32_128( mm256_aesenc_2x128( k10, zero ) ) );
mm256_ror128_32( mm256_aesenc_2x128( k10, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
k11 = _mm256_xor_si256( k10,
mm256_ror1x32_128( mm256_aesenc_2x128( k11, zero ) ) );
mm256_ror128_32( mm256_aesenc_2x128( k11, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = _mm256_xor_si256( k11,
mm256_ror1x32_128( mm256_aesenc_2x128( k12, zero ) ) );
mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = _mm256_xor_si256( k12,
mm256_ror1x32_128( mm256_aesenc_2x128( k13, zero ) ) );
mm256_ror128_32( mm256_aesenc_2x128( k13, zero ) ) );
if ( r == 2 )
k13 = _mm256_xor_si256( k13, _mm256_set_epi32(
@@ -134,31 +140,31 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
// round 3, 7, 11
k00 = _mm256_xor_si256( mm256_ror1x32_128(
k00 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k00, zero ) ), k13 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k00 ), zero );
k01 = _mm256_xor_si256( mm256_ror1x32_128(
k01 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k01, zero ) ), k00 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = _mm256_xor_si256( mm256_ror1x32_128(
k02 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k02, zero ) ), k01 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
k03 = _mm256_xor_si256( mm256_ror1x32_128(
k03 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k03, zero ) ), k02 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
p1 = _mm256_xor_si256( p1, x );
k10 = _mm256_xor_si256( mm256_ror1x32_128(
k10 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k10, zero ) ), k03 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k10 ), zero );
k11 = _mm256_xor_si256( mm256_ror1x32_128(
k11 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k11, zero ) ), k10 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = _mm256_xor_si256( mm256_ror1x32_128(
k12 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k12, zero ) ), k11 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = _mm256_xor_si256( mm256_ror1x32_128(
k13 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k13, zero ) ), k12 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
@@ -192,35 +198,35 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
// round 13
k00 = _mm256_xor_si256( mm256_ror1x32_128(
k00 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k00, zero ) ), k13 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
k01 = _mm256_xor_si256( mm256_ror1x32_128(
k01 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k01, zero ) ), k00 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
k02 = _mm256_xor_si256( mm256_ror1x32_128(
k02 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k02, zero ) ), k01 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
k03 = _mm256_xor_si256( mm256_ror1x32_128(
k03 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k03, zero ) ), k02 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
p3 = _mm256_xor_si256( p3, x );
k10 = _mm256_xor_si256( mm256_ror1x32_128(
k10 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k10, zero ) ), k03 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
k11 = _mm256_xor_si256( mm256_ror1x32_128(
k11 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k11, zero ) ), k10 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
k12 = mm256_ror1x32_128( mm256_aesenc_2x128( k12, zero ) );
k12 = mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) );
k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32(
~ctx->count2, ctx->count3, ctx->count0, ctx->count1,
~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
k13 = _mm256_xor_si256( mm256_ror1x32_128(
k13 = _mm256_xor_si256( mm256_ror128_32(
mm256_aesenc_2x128( k13, zero ) ), k12 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );


@@ -51,6 +51,8 @@ void init_c11_8way_ctx()
void c11_8way_hash( void *state, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -107,21 +109,18 @@ void c11_8way_hash( void *state, const void *input )
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
// Serial
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
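// Re-interleave the 8x64-bit lanes directly into two 4x128-bit groups
// (lanes 0-3 in vhash0, lanes 4-7 in vhash1) so Luffa and Cube can run 4-way
// on each half without a full deinterleave/reinterleave round trip.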
// 7 Luffa + 8 cube
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );


@@ -51,6 +51,8 @@ void init_x11_8way_ctx()
void x11_8way_hash( void *state, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -108,20 +110,18 @@ void x11_8way_hash( void *state, const void *input )
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
// Luffa + Cube
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );


@@ -1,7 +1,4 @@
#include "x12-gate.h"
#if defined(X12_4WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -14,11 +11,223 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
//#include "algo/fugue/sph_fugue.h"
#if defined(X12_8WAY)
typedef struct {
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
sph_shavite512_context shavite;
simd_4way_context simd;
hashState_echo echo;
hamsi512_8way_context hamsi;
} x12_8way_ctx_holder;
x12_8way_ctx_holder x12_8way_ctx __attribute__ ((aligned (64)));
void init_x12_8way_ctx()
{
blake512_8way_init( &x12_8way_ctx.blake );
bmw512_8way_init( &x12_8way_ctx.bmw );
init_groestl( &x12_8way_ctx.groestl, 64 );
skein512_8way_init( &x12_8way_ctx.skein );
jh512_8way_init( &x12_8way_ctx.jh );
keccak512_8way_init( &x12_8way_ctx.keccak );
luffa_4way_init( &x12_8way_ctx.luffa, 512 );
cube_4way_init( &x12_8way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x12_8way_ctx.shavite );
simd_4way_init( &x12_8way_ctx.simd, 512 );
init_echo( &x12_8way_ctx.echo, 512 );
hamsi512_8way_init( &x12_8way_ctx.hamsi );
};
void x12_8way_hash( void *state, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t hash4[8] __attribute__ ((aligned (64)));
uint64_t hash5[8] __attribute__ ((aligned (64)));
uint64_t hash6[8] __attribute__ ((aligned (64)));
uint64_t hash7[8] __attribute__ ((aligned (64)));
x12_8way_ctx_holder ctx;
memcpy( &ctx, &x12_8way_ctx, sizeof(x12_8way_ctx) );
blake512_8way_update( &ctx.blake, input, 80 );
blake512_8way_close( &ctx.blake, vhash );
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash4, 64 );
sph_shavite512_close( &ctx.shavite, hash4 );
memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash5, 64 );
sph_shavite512_close( &ctx.shavite, hash5 );
memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash6, 64 );
sph_shavite512_close( &ctx.shavite, hash6 );
memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash7, 64 );
sph_shavite512_close( &ctx.shavite, hash7 );
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash4,
(const BitSequence *) hash4, 512 );
memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash5,
(const BitSequence *) hash5, 512 );
memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash6,
(const BitSequence *) hash6, 512 );
memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash7,
(const BitSequence *) hash7, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, state );
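// Output is left in 8x64-bit interleaved form; scanhash extracts individual
// lanes with extr_lane_8x64.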
}
int scanhash_x12_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[16*8] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[49]);
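// In the 8x64-bit interleaved layout, 32-bit hash word 7 of lane 0 is the
// high half of 64-bit word 3 and sits at index 49; lane i's copy is then
// hash7[ i<<1 ].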
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
__m512i *noncev = (__m512i*)vdata + 9; // aligned
int thr_id = mythr->id;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do {
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
x12_8way_hash( hash, vdata );
for ( int lane = 0; lane < 8; lane++ )
if ( hash7[ lane<<1 ] < Htarg )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( ( n < max_nonce-8 ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(X12_4WAY)
typedef struct {
blake512_4way_context blake;
@@ -63,45 +272,13 @@ void x12_4way_hash( void *state, const void *input )
x12_4way_ctx_holder ctx;
memcpy( &ctx, &x12_4way_ctx, sizeof(x12_4way_ctx) );
// 1 Blake
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way 64 bit
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
intrlv_2x128( vhash, hash0, hash1, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
dintrlv_2x128( hash0, hash1, vhash, 512 );
@@ -110,7 +287,6 @@ void x12_4way_hash( void *state, const void *input )
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
dintrlv_2x128( hash2, hash3, vhash, 512 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
@@ -119,7 +295,6 @@ void x12_4way_hash( void *state, const void *input )
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x12_4way_ctx.shavite,
@@ -135,7 +310,6 @@ void x12_4way_hash( void *state, const void *input )
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
intrlv_2x128( vhash, hash0, hash1, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_2x128( hash0, hash1, vhash, 512 );
@@ -144,21 +318,25 @@ void x12_4way_hash( void *state, const void *input )
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_2x128( hash2, hash3, vhash, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// 12 Hamsi parallel 4way 32 bit
// Parallel 4way 64 bit
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );


@@ -2,7 +2,11 @@
bool register_x12_algo( algo_gate_t* gate )
{
#if defined (X12_4WAY)
#if defined (X12_8WAY)
init_x12_8way_ctx();
gate->scanhash = (void*)&scanhash_x12_8way;
gate->hash = (void*)&x12_8way_hash;
#elif defined (X12_4WAY)
init_x12_4way_ctx();
gate->scanhash = (void*)&scanhash_x12_4way;
gate->hash = (void*)&x12_4way_hash;
@@ -11,7 +15,7 @@ bool register_x12_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_x12;
gate->hash = (void*)&x12hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
return true;
};


@@ -4,29 +4,36 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define X12_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X12_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X12_4WAY 1
#endif
bool register_x12_algo( algo_gate_t* gate );
#if defined(X12_4WAY)
#if defined(X12_8WAY)
void x12_8way_hash( void *state, const void *input );
int scanhash_x12_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x12_8way_ctx();
#elif defined(X12_4WAY)
void x12_4way_hash( void *state, const void *input );
int scanhash_x12_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x12_4way_ctx();
#endif
#else
void x12hash( void *state, const void *input );
int scanhash_x12( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x12_ctx();
#endif
#endif


@@ -20,17 +20,17 @@
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c"
#include "algo/keccak/sse2/keccak.c"
#include "algo/skein/sse2/skein.c"
#include "algo/jh/sse2/jh_sse2_opt64.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#endif
typedef struct {
sph_blake512_context blake;
sph_bmw512_context bmw;
sph_skein512_context skein;
sph_jh512_context jh;
sph_keccak512_context keccak;
#if defined(__AES__)
hashState_groestl groestl;
hashState_echo echo;
@@ -49,6 +49,11 @@ x12_ctx_holder x12_ctx;
void init_x12_ctx()
{
sph_blake512_init( &x12_ctx.blake );
sph_bmw512_init( &x12_ctx.bmw );
sph_skein512_init( &x12_ctx.skein);
sph_jh512_init( &x12_ctx.jh);
sph_keccak512_init( &x12_ctx.keccak);
#if defined(__AES__)
init_echo( &x12_ctx.echo, 512 );
init_groestl (&x12_ctx.groestl, 64 );
@@ -65,89 +70,31 @@ void init_x12_ctx()
void x12hash(void *output, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (32)));
#define hashB hash+64
x12_ctx_holder ctx;
memcpy( &ctx, &x12_ctx, sizeof(x12_ctx) );
// X11 algos
sph_blake512(&ctx.blake, input, 80);
sph_blake512_close(&ctx.blake, hash);
unsigned char hashbuf[128];
size_t hashptr;
sph_u64 hashctA;
sph_u64 hashctB;
sph_bmw512(&ctx.bmw, hash, 64);
sph_bmw512_close(&ctx.bmw, hash);
//---blake1---
DECL_BLK;
BLK_I;
BLK_W;
BLK_C;
//---bmw2---
DECL_BMW;
BMW_I;
BMW_U;
#define M(x) sph_dec64le_aligned(data + 8 * (x))
#define H(x) (h[x])
#define dH(x) (dh[x])
BMW_C;
#undef M
#undef H
#undef dH
//---groetl----
#if defined(__AES__)
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#else
sph_groestl512 (&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#endif
//---skein4---
DECL_SKN;
SKN_I;
SKN_U;
SKN_C;
//---jh5------
DECL_JH;
JH_H;
//---keccak6---
DECL_KEC;
KEC_I;
KEC_U;
KEC_C;
//--- luffa7
update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
(const BitSequence*)hash, 64 );
// 8 Cube
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
(const byte*)hashB, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hashB);
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hashB, 512 );
//11---echo---
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hashB,
(const BitSequence *)hash, 512 );
@@ -156,11 +103,26 @@ void x12hash(void *output, const void *input)
sph_echo512_close(&ctx.echo, hashB);
#endif
// 12 Hamsi
#if defined(__AES__)
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#else
sph_groestl512 (&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#endif
sph_skein512(&ctx.skein, hash, 64);
sph_skein512_close(&ctx.skein, hash);
sph_jh512(&ctx.jh, hash, 64);
sph_jh512_close(&ctx.jh, hash);
sph_keccak512(&ctx.keccak, hash, 64);
sph_keccak512_close(&ctx.keccak, hash);
sph_hamsi512(&ctx.hamsi, hashB, 64);
sph_hamsi512_close(&ctx.hamsi, hash);
asm volatile ("emms");
memcpy(output, hashB, 32);
}


@@ -1,7 +1,4 @@
#include "x13-gate.h"
#if defined(X13_4WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -14,12 +11,270 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/fugue/sph_fugue.h"
#if defined(X13_8WAY)
typedef struct {
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
sph_shavite512_context shavite;
simd_4way_context simd;
hashState_echo echo;
hamsi512_8way_context hamsi;
sph_fugue512_context fugue;
} x13_8way_ctx_holder;
x13_8way_ctx_holder x13_8way_ctx;
void init_x13_8way_ctx()
{
blake512_8way_init( &x13_8way_ctx.blake );
bmw512_8way_init( &x13_8way_ctx.bmw );
init_groestl( &x13_8way_ctx.groestl, 64 );
skein512_8way_init( &x13_8way_ctx.skein );
jh512_8way_init( &x13_8way_ctx.jh );
keccak512_8way_init( &x13_8way_ctx.keccak );
luffa_4way_init( &x13_8way_ctx.luffa, 512 );
cube_4way_init( &x13_8way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x13_8way_ctx.shavite );
simd_4way_init( &x13_8way_ctx.simd, 512 );
init_echo( &x13_8way_ctx.echo, 512 );
hamsi512_8way_init( &x13_8way_ctx.hamsi );
sph_fugue512_init( &x13_8way_ctx.fugue );
}
void x13_8way_hash( void *state, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t hash4[8] __attribute__ ((aligned (64)));
uint64_t hash5[8] __attribute__ ((aligned (64)));
uint64_t hash6[8] __attribute__ ((aligned (64)));
uint64_t hash7[8] __attribute__ ((aligned (64)));
x13_8way_ctx_holder ctx;
memcpy( &ctx, &x13_8way_ctx, sizeof(x13_8way_ctx) );
blake512_8way_update( &ctx.blake, input, 80 );
blake512_8way_close( &ctx.blake, vhash );
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash4, 64 );
sph_shavite512_close( &ctx.shavite, hash4 );
memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash5, 64 );
sph_shavite512_close( &ctx.shavite, hash5 );
memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash6, 64 );
sph_shavite512_close( &ctx.shavite, hash6 );
memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash7, 64 );
sph_shavite512_close( &ctx.shavite, hash7 );
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash4,
(const BitSequence *) hash4, 512 );
memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash5,
(const BitSequence *) hash5, 512 );
memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash6,
(const BitSequence *) hash6, 512 );
memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash7,
(const BitSequence *) hash7, 512 );
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
// 13 Fugue serial
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash4, 64 );
sph_fugue512_close( &ctx.fugue, hash4 );
memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash5, 64 );
sph_fugue512_close( &ctx.fugue, hash5 );
memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash6, 64 );
sph_fugue512_close( &ctx.fugue, hash6 );
memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash7, 64 );
sph_fugue512_close( &ctx.fugue, hash7 );
memcpy( state, hash0, 32 );
memcpy( state+ 32, hash1, 32 );
memcpy( state+ 64, hash2, 32 );
memcpy( state+ 96, hash3, 32 );
memcpy( state+128, hash4, 32 );
memcpy( state+160, hash5, 32 );
memcpy( state+192, hash6, 32 );
memcpy( state+224, hash7, 32 );
}
int scanhash_x13_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const uint32_t last_nonce = max_nonce -8;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x13_8way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 8; i++ )
if ( ( hash+(i<<3) )[7] < Htarg
&& fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 8;
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(X13_4WAY)
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;


@@ -2,7 +2,11 @@
bool register_x13_algo( algo_gate_t* gate )
{
#if defined (X13_4WAY)
#if defined (X13_8WAY)
init_x13_8way_ctx();
gate->scanhash = (void*)&scanhash_x13_8way;
gate->hash = (void*)&x13_8way_hash;
#elif defined (X13_4WAY)
init_x13_4way_ctx();
gate->scanhash = (void*)&scanhash_x13_4way;
gate->hash = (void*)&x13_4way_hash;
@@ -11,7 +15,7 @@ bool register_x13_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_x13;
gate->hash = (void*)&x13hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
return true;
};

View File

@@ -4,29 +4,35 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define X13_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X13_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X13_4WAY 1
#endif
bool register_x13_algo( algo_gate_t* gate );
#if defined(X13_4WAY)
#if defined(X13_8WAY)
void x13_8way_hash( void *state, const void *input );
int scanhash_x13_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x13_8way_ctx();
#elif defined(X13_4WAY)
void x13_4way_hash( void *state, const void *input );
int scanhash_x13_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x13_4way_ctx();
#endif
#else
void x13hash( void *state, const void *input );
int scanhash_x13( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x13_ctx();
#endif
#endif

View File

@@ -1,7 +1,4 @@
#include "x14-gate.h"
#if defined(X14_4WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -13,6 +10,7 @@
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
@@ -22,6 +20,263 @@
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/shabal-hash-4way.h"
#if defined(X14_8WAY)
typedef struct {
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
sph_shavite512_context shavite;
simd_4way_context simd;
hashState_echo echo;
hamsi512_8way_context hamsi;
sph_fugue512_context fugue;
shabal512_8way_context shabal;
} x14_8way_ctx_holder;
x14_8way_ctx_holder x14_8way_ctx __attribute__ ((aligned (64)));
void init_x14_8way_ctx()
{
blake512_8way_init( &x14_8way_ctx.blake );
bmw512_8way_init( &x14_8way_ctx.bmw );
init_groestl( &x14_8way_ctx.groestl, 64 );
skein512_8way_init( &x14_8way_ctx.skein );
jh512_8way_init( &x14_8way_ctx.jh );
keccak512_8way_init( &x14_8way_ctx.keccak );
luffa_4way_init( &x14_8way_ctx.luffa, 512 );
cube_4way_init( &x14_8way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x14_8way_ctx.shavite );
simd_4way_init( &x14_8way_ctx.simd, 512 );
init_echo( &x14_8way_ctx.echo, 512 );
hamsi512_8way_init( &x14_8way_ctx.hamsi );
sph_fugue512_init( &x14_8way_ctx.fugue );
shabal512_8way_init( &x14_8way_ctx.shabal );
};
void x14_8way_hash( void *state, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t hash4[8] __attribute__ ((aligned (64)));
uint64_t hash5[8] __attribute__ ((aligned (64)));
uint64_t hash6[8] __attribute__ ((aligned (64)));
uint64_t hash7[8] __attribute__ ((aligned (64)));
x14_8way_ctx_holder ctx;
memcpy( &ctx, &x14_8way_ctx, sizeof(x14_8way_ctx) );
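// 1 Blake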
blake512_8way_update( &ctx.blake, input, 80 );
blake512_8way_close( &ctx.blake, vhash );
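// 2 Bmw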
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
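// 3 Groestl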
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
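// 4 Skein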
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
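// 5 JH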
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
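// 6 Keccak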
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
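// 7 Luffa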
luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
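// 8 Cubehash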
cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
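// 9 Shavite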
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash4, 64 );
sph_shavite512_close( &ctx.shavite, hash4 );
memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash5, 64 );
sph_shavite512_close( &ctx.shavite, hash5 );
memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash6, 64 );
sph_shavite512_close( &ctx.shavite, hash6 );
memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash7, 64 );
sph_shavite512_close( &ctx.shavite, hash7 );
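// 10 Simd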
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
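// 11 Echo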
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash4,
(const BitSequence *) hash4, 512 );
memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash5,
(const BitSequence *) hash5, 512 );
memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash6,
(const BitSequence *) hash6, 512 );
memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash7,
(const BitSequence *) hash7, 512 );
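// 12 Hamsi parallel 8 way 64 bit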
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
// 13 Fugue serial
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash4, 64 );
sph_fugue512_close( &ctx.fugue, hash4 );
memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash5, 64 );
sph_fugue512_close( &ctx.fugue, hash5 );
memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash6, 64 );
sph_fugue512_close( &ctx.fugue, hash6 );
memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash7, 64 );
sph_fugue512_close( &ctx.fugue, hash7 );
// 14 Shabal, parallel 32 bit
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
shabal512_8way_update( &ctx.shabal, vhash, 64 );
shabal512_8way_close( &ctx.shabal, state );
}
int scanhash_x14_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x14_8way_hash( hash, vdata );
pdata[19] = n;
uint32_t *hash7 = &(hash[7<<3]);
for ( int lane = 0; lane < 8; lane++ )
if ( hash7[ lane ] < Htarg )
{
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(X14_4WAY)
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
@@ -61,11 +316,11 @@ void init_x14_4way_ctx()
void x14_4way_hash( void *state, const void *input )
{
uint64_t vhash[8*4] __attribute__ ((aligned (128)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x14_4way_ctx_holder ctx;
memcpy( &ctx, &x14_4way_ctx, sizeof(x14_4way_ctx) );
@@ -184,33 +439,25 @@ void x14_4way_hash( void *state, const void *input )
// 14 Shabal, parallel 32 bit
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, state );
}
int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t hash[4*16] __attribute__ ((aligned (128)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 4;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
int thr_id = mythr->id;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
@@ -220,11 +467,9 @@ int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
pdata[19] = n;
uint32_t *hash7 = &(hash[7<<2]);
for ( int lane = 0; lane < 4; lane++ )
if ( ( hash7[ lane ] & mask ) == 0 )
if ( hash7[ lane ] < Htarg )
{
// deinterleave hash for lane
uint32_t lane_hash[8];
extr_lane_4x32( lane_hash, hash, lane, 256 );
@@ -235,10 +480,8 @@ int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
}
}
n += 4;
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
return 0;
}

View File

@@ -2,7 +2,11 @@
bool register_x14_algo( algo_gate_t* gate )
{
#if defined (X14_4WAY)
#if defined (X14_8WAY)
init_x14_8way_ctx();
gate->scanhash = (void*)&scanhash_x14_8way;
gate->hash = (void*)&x14_8way_hash;
#elif defined (X14_4WAY)
init_x14_4way_ctx();
gate->scanhash = (void*)&scanhash_x14_4way;
gate->hash = (void*)&x14_4way_hash;
@@ -11,7 +15,7 @@ bool register_x14_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_x14;
gate->hash = (void*)&x14hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
return true;
};

View File

@@ -4,20 +4,29 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define X14_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X14_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X14_4WAY 1
#endif
bool register_x14_algo( algo_gate_t* gate );
#if defined(X14_4WAY)
#if defined(X14_8WAY)
void x14_8way_hash( void *state, const void *input );
int scanhash_x14_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x14_8way_ctx();
#elif defined(X14_4WAY)
void x14_4way_hash( void *state, const void *input );
int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x14_4way_ctx();
#endif
#else
void x14hash( void *state, const void *input );
int scanhash_x14( struct work *work, uint32_t max_nonce,
@@ -26,3 +35,4 @@ void init_x14_ctx();
#endif
#endif

View File

@@ -1,7 +1,4 @@
#include "x15-gate.h"
#if defined(X15_4WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -14,6 +11,7 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -23,6 +21,309 @@
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#if defined(X15_8WAY)
typedef struct {
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
sph_shavite512_context shavite;
simd_4way_context simd;
hashState_echo echo;
hamsi512_8way_context hamsi;
sph_fugue512_context fugue;
shabal512_8way_context shabal;
sph_whirlpool_context whirlpool;
} x15_8way_ctx_holder;
x15_8way_ctx_holder x15_8way_ctx __attribute__ ((aligned (64)));
void init_x15_8way_ctx()
{
blake512_8way_init( &x15_8way_ctx.blake );
bmw512_8way_init( &x15_8way_ctx.bmw );
init_groestl( &x15_8way_ctx.groestl, 64 );
skein512_8way_init( &x15_8way_ctx.skein );
jh512_8way_init( &x15_8way_ctx.jh );
keccak512_8way_init( &x15_8way_ctx.keccak );
luffa_4way_init( &x15_8way_ctx.luffa, 512 );
cube_4way_init( &x15_8way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x15_8way_ctx.shavite );
simd_4way_init( &x15_8way_ctx.simd, 512 );
init_echo( &x15_8way_ctx.echo, 512 );
hamsi512_8way_init( &x15_8way_ctx.hamsi );
sph_fugue512_init( &x15_8way_ctx.fugue );
shabal512_8way_init( &x15_8way_ctx.shabal );
sph_whirlpool_init( &x15_8way_ctx.whirlpool );
};
void x15_8way_hash( void *state, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t hash4[8] __attribute__ ((aligned (64)));
uint64_t hash5[8] __attribute__ ((aligned (64)));
uint64_t hash6[8] __attribute__ ((aligned (64)));
uint64_t hash7[8] __attribute__ ((aligned (64)));
x15_8way_ctx_holder ctx;
memcpy( &ctx, &x15_8way_ctx, sizeof(x15_8way_ctx) );
// 1 Blake
blake512_8way_update( &ctx.blake, input, 80 );
blake512_8way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
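// 4 Skein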
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
// 5 JH
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
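// 7 Luffa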
luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
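// 8 Cubehash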
cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash4, 64 );
sph_shavite512_close( &ctx.shavite, hash4 );
memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash5, 64 );
sph_shavite512_close( &ctx.shavite, hash5 );
memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash6, 64 );
sph_shavite512_close( &ctx.shavite, hash6 );
memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash7, 64 );
sph_shavite512_close( &ctx.shavite, hash7 );
// 10 Simd
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash4,
(const BitSequence *) hash4, 512 );
memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash5,
(const BitSequence *) hash5, 512 );
memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash6,
(const BitSequence *) hash6, 512 );
memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash7,
(const BitSequence *) hash7, 512 );
// 12 Hamsi parallel 8 way 64 bit
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
// 13 Fugue
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash4, 64 );
sph_fugue512_close( &ctx.fugue, hash4 );
memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash5, 64 );
sph_fugue512_close( &ctx.fugue, hash5 );
memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash6, 64 );
sph_fugue512_close( &ctx.fugue, hash6 );
memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash7, 64 );
sph_fugue512_close( &ctx.fugue, hash7 );
// 14 Shabal, parallel 32 bit
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
shabal512_8way_update( &ctx.shabal, vhash, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
// 15 Whirlpool
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash4, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash4 );
memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash5, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash5 );
memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash6, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash6 );
memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool( &ctx.whirlpool, hash7, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash7 );
memcpy( state, hash0, 32 );
memcpy( state+ 32, hash1, 32 );
memcpy( state+ 64, hash2, 32 );
memcpy( state+ 96, hash3, 32 );
memcpy( state+128, hash4, 32 );
memcpy( state+160, hash5, 32 );
memcpy( state+192, hash6, 32 );
memcpy( state+224, hash7, 32 );
}
int scanhash_x15_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 8;
__m512i *noncev = (__m512i*)vdata + 9;
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x15_8way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 8; i++ )
if ( ( hash+(i<<3) )[7] < Htarg )
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 8;
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(X15_4WAY)
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
@@ -64,11 +365,11 @@ void init_x15_4way_ctx()
void x15_4way_hash( void *state, const void *input )
{
uint64_t vhash[8*4] __attribute__ ((aligned (128)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x15_4way_ctx_holder ctx;
memcpy( &ctx, &x15_4way_ctx, sizeof(x15_4way_ctx) );
@@ -187,7 +488,7 @@ void x15_4way_hash( void *state, const void *input )
// 14 Shabal, parallel 32 bit
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_update( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -216,27 +517,18 @@ void x15_4way_hash( void *state, const void *input )
int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t hash[4*8] __attribute__ ((aligned (128)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
__m256i *noncev = (__m256i*)vdata + 9; // aligned
uint32_t n = first_nonce;
const uint32_t last_nonce = max_nonce - 4;
__m256i *noncev = (__m256i*)vdata + 9;
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
int thr_id = mythr->id;
mm256_bswap32_intrlv80_4x64( vdata, pdata );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
@@ -246,18 +538,16 @@ int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( ( (hash+(i<<3))[7] & mask ) == 0 )
if ( ( hash+(i<<3) )[7] < Htarg )
if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
break;
}
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
*hashes_done = n - first_nonce;
return 0;
}

View File

@@ -2,7 +2,11 @@
bool register_x15_algo( algo_gate_t* gate )
{
#if defined (X15_4WAY)
#if defined (X15_8WAY)
init_x15_8way_ctx();
gate->scanhash = (void*)&scanhash_x15_8way;
gate->hash = (void*)&x15_8way_hash;
#elif defined (X15_4WAY)
init_x15_4way_ctx();
gate->scanhash = (void*)&scanhash_x15_4way;
gate->hash = (void*)&x15_4way_hash;
@@ -11,7 +15,7 @@ bool register_x15_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_x15;
gate->hash = (void*)&x15hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
return true;
};

View File

@@ -4,20 +4,30 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define X15_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X15_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X15_4WAY 1
#endif
bool register_x15_algo( algo_gate_t* gate );
#if defined(X15_4WAY)
#if defined(X15_8WAY)
void x15_8way_hash( void *state, const void *input );
int scanhash_x15_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x15_8way_ctx();
#elif defined(X15_4WAY)
void x15_4way_hash( void *state, const void *input );
int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_x15_4way_ctx();
#endif
#else
void x15hash( void *state, const void *input );
int scanhash_x15( struct work *work, uint32_t max_nonce,
@@ -26,3 +36,5 @@ void init_x15_ctx();
#endif
#endif

View File

@@ -5,9 +5,6 @@
* Optimized by JayDDee@github Jan 2018
*/
#include "x16r-gate.h"
#if defined (X16R_4WAY)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -20,6 +17,7 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -32,6 +30,392 @@
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
#if defined (X16R_8WAY)
union _x16r_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
sph_shavite512_context shavite;
simd_4way_context simd;
hashState_echo echo;
hamsi512_8way_context hamsi;
sph_fugue512_context fugue;
shabal512_8way_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
} __attribute__ ((aligned (64)));
typedef union _x16r_8way_context_overlay x16r_8way_context_overlay;
void x16r_8way_hash( void* output, const void* input )
{
uint32_t vhash[24*8] __attribute__ ((aligned (128)));
uint32_t hash0[24] __attribute__ ((aligned (64)));
uint32_t hash1[24] __attribute__ ((aligned (64)));
uint32_t hash2[24] __attribute__ ((aligned (64)));
uint32_t hash3[24] __attribute__ ((aligned (64)));
uint32_t hash4[24] __attribute__ ((aligned (64)));
uint32_t hash5[24] __attribute__ ((aligned (64)));
uint32_t hash6[24] __attribute__ ((aligned (64)));
uint32_t hash7[24] __attribute__ ((aligned (64)));
x16r_8way_context_overlay ctx;
void *in0 = (void*) hash0;
void *in1 = (void*) hash1;
void *in2 = (void*) hash2;
void *in3 = (void*) hash3;
void *in4 = (void*) hash4;
void *in5 = (void*) hash5;
void *in6 = (void*) hash6;
void *in7 = (void*) hash7;
int size = 80;
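// Only the first function in the chain hashes the 80 byte block header;
// every subsequent function hashes the 64 byte digest of the previous stage.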
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
input, 640 );
for ( int i = 0; i < 16; i++ )
{
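// Each hex digit of hashOrder selects one of the 16 hash functions.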
const char elem = hashOrder[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case BLAKE:
blake512_8way_init( &ctx.blake );
if ( i == 0 )
blake512_8way_update( &ctx.blake, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
blake512_8way_update( &ctx.blake, vhash, size );
}
blake512_8way_close( &ctx.blake, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case BMW:
bmw512_8way_init( &ctx.bmw );
if ( i == 0 )
bmw512_8way_update( &ctx.bmw, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
bmw512_8way_update( &ctx.bmw, vhash, size );
}
bmw512_8way_close( &ctx.bmw, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case GROESTL:
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(const char*)in0, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(const char*)in1, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(const char*)in2, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(const char*)in3, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash4,
(const char*)in4, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash5,
(const char*)in5, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash6,
(const char*)in6, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash7,
(const char*)in7, size<<3 );
break;
case SKEIN:
skein512_8way_init( &ctx.skein );
if ( i == 0 )
skein512_8way_update( &ctx.skein, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
skein512_8way_update( &ctx.skein, vhash, size );
}
skein512_8way_close( &ctx.skein, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case JH:
jh512_8way_init( &ctx.jh );
if ( i == 0 )
jh512_8way_update( &ctx.jh, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
jh512_8way_update( &ctx.jh, vhash, size );
}
jh512_8way_close( &ctx.jh, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case KECCAK:
keccak512_8way_init( &ctx.keccak );
if ( i == 0 )
keccak512_8way_update( &ctx.keccak, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
keccak512_8way_update( &ctx.keccak, vhash, size );
}
keccak512_8way_close( &ctx.keccak, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case LUFFA:
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, size );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, size);
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
break;
case CUBEHASH:
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
break;
case SHAVITE:
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in0, size );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in1, size );
sph_shavite512_close( &ctx.shavite, hash1 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in2, size );
sph_shavite512_close( &ctx.shavite, hash2 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in3, size );
sph_shavite512_close( &ctx.shavite, hash3 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in4, size );
sph_shavite512_close( &ctx.shavite, hash4 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in5, size );
sph_shavite512_close( &ctx.shavite, hash5 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in6, size );
sph_shavite512_close( &ctx.shavite, hash6 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in7, size );
sph_shavite512_close( &ctx.shavite, hash7 );
break;
case SIMD:
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
break;
case ECHO:
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash0,
(const BitSequence*)in0, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash1,
(const BitSequence*)in1, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash2,
(const BitSequence*)in2, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash3,
(const BitSequence*)in3, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash4,
(const BitSequence*)in4, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash5,
(const BitSequence*)in5, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash6,
(const BitSequence*)in6, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash7,
(const BitSequence*)in7, size<<3 );
break;
case HAMSI:
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, size );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case FUGUE:
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in0, size );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in1, size );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in2, size );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in3, size );
sph_fugue512_close( &ctx.fugue, hash3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in4, size );
sph_fugue512_close( &ctx.fugue, hash4 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in5, size );
sph_fugue512_close( &ctx.fugue, hash5 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in6, size );
sph_fugue512_close( &ctx.fugue, hash6 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in7, size );
sph_fugue512_close( &ctx.fugue, hash7 );
break;
case SHABAL:
intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, size );
shabal512_8way_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case WHIRLPOOL:
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in0, size );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in1, size );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in2, size );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in3, size );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in4, size );
sph_whirlpool_close( &ctx.whirlpool, hash4 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in5, size );
sph_whirlpool_close( &ctx.whirlpool, hash5 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in6, size );
sph_whirlpool_close( &ctx.whirlpool, hash6 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in7, size );
sph_whirlpool_close( &ctx.whirlpool, hash7 );
break;
case SHA_512:
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, size );
sha512_8way_close( &ctx.sha512, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
}
size = 64;
}
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
memcpy( output+128, hash4, 32 );
memcpy( output+160, hash5, 32 );
memcpy( output+192, hash6, 32 );
memcpy( output+224, hash7, 32 );
}
int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
int thr_id = mythr->id;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( opt_benchmark )
ptarget[7] = 0x0cff;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
const uint32_t ntime = bswap_32( pdata[17] );
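// The hash order depends on the previous block hash, so it only needs to be
// recalculated when the work changes, detected here by a change in ntime.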
if ( s_ntime != ntime )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
}
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x16r_8way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 8; i++ )
if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 8;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
*hashes_done = n - first_nonce;
return 0;
}
#elif defined (X16R_4WAY)
union _x16r_4way_context_overlay
{
blake512_4way_context blake;
@@ -50,16 +434,16 @@ union _x16r_4way_context_overlay
shabal512_4way_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
};
} __attribute__ ((aligned (64)));
typedef union _x16r_4way_context_overlay x16r_4way_context_overlay;
void x16r_4way_hash( void* output, const void* input )
{
uint32_t vhash[24*4] __attribute__ ((aligned (128)));
uint32_t hash0[24] __attribute__ ((aligned (64)));
uint32_t hash1[24] __attribute__ ((aligned (64)));
uint32_t hash2[24] __attribute__ ((aligned (64)));
uint32_t hash3[24] __attribute__ ((aligned (64)));
uint32_t vhash[24*4] __attribute__ ((aligned (64)));
x16r_4way_context_overlay ctx;
void *in0 = (void*) hash0;
void *in1 = (void*) hash1;
@@ -86,7 +470,7 @@ void x16r_4way_hash( void* output, const void* input )
blake512_4way( &ctx.blake, vhash, size );
}
blake512_4way_close( &ctx.blake, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case BMW:
bmw512_4way_init( &ctx.bmw );
@@ -98,7 +482,7 @@ void x16r_4way_hash( void* output, const void* input )
bmw512_4way( &ctx.bmw, vhash, size );
}
bmw512_4way_close( &ctx.bmw, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case GROESTL:
init_groestl( &ctx.groestl, 64 );
@@ -124,7 +508,7 @@ void x16r_4way_hash( void* output, const void* input )
skein512_4way( &ctx.skein, vhash, size );
}
skein512_4way_close( &ctx.skein, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case JH:
jh512_4way_init( &ctx.jh );
@@ -136,7 +520,7 @@ void x16r_4way_hash( void* output, const void* input )
jh512_4way( &ctx.jh, vhash, size );
}
jh512_4way_close( &ctx.jh, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case KECCAK:
keccak512_4way_init( &ctx.keccak );
@@ -148,17 +532,17 @@ void x16r_4way_hash( void* output, const void* input )
keccak512_4way( &ctx.keccak, vhash, size );
}
keccak512_4way_close( &ctx.keccak, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case LUFFA:
intrlv_2x128( vhash, in0, in1, size<<3 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, size );
dintrlv_2x128( hash0, hash1, vhash, 512 );
dintrlv_2x128_512( hash0, hash1, vhash );
intrlv_2x128( vhash, in2, in3, size<<3 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, size);
dintrlv_2x128( hash2, hash3, vhash, 512 );
dintrlv_2x128_512( hash2, hash3, vhash );
break;
case CUBEHASH:
cubehashInit( &ctx.cube, 512, 16, 32 );
@@ -192,11 +576,11 @@ void x16r_4way_hash( void* output, const void* input )
intrlv_2x128( vhash, in0, in1, size<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
dintrlv_2x128( hash0, hash1, vhash, 512 );
dintrlv_2x128_512( hash0, hash1, vhash );
intrlv_2x128( vhash, in2, in3, size<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
dintrlv_2x128( hash2, hash3, vhash, 512 );
dintrlv_2x128_512( hash2, hash3, vhash );
break;
case ECHO:
init_echo( &ctx.echo, 512 );
@@ -217,7 +601,7 @@ void x16r_4way_hash( void* output, const void* input )
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, size );
hamsi512_4way_close( &ctx.hamsi, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
case FUGUE:
sph_fugue512_init( &ctx.fugue );
@@ -238,7 +622,7 @@ void x16r_4way_hash( void* output, const void* input )
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, size );
shabal512_4way_close( &ctx.shabal, vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
break;
case WHIRLPOOL:
sph_whirlpool_init( &ctx.whirlpool );
@@ -259,7 +643,7 @@ void x16r_4way_hash( void* output, const void* input )
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, size );
sha512_4way_close( &ctx.sha512, vhash );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
break;
}
size = 64;
@@ -280,6 +664,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
int thr_id = mythr->id;
@@ -317,9 +702,9 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( likely( ( n < max_nonce ) && !(*restart) ) );
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
*hashes_done = n - first_nonce + 1;
*hashes_done = n - first_nonce;
return 0;
}

View File

@@ -34,14 +34,17 @@ void x16s_getAlgoString( const uint8_t* prevblock, char *output )
bool register_x16r_algo( algo_gate_t* gate )
{
#if defined (X16R_4WAY)
#if defined (X16R_8WAY)
gate->scanhash = (void*)&scanhash_x16r_8way;
gate->hash = (void*)&x16r_8way_hash;
#elif defined (X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->hash = (void*)&x16r_4way_hash;
#else
gate->scanhash = (void*)&scanhash_x16r;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
opt_target_factor = 256.0;
return true;
@@ -49,14 +52,17 @@ bool register_x16r_algo( algo_gate_t* gate )
bool register_x16rv2_algo( algo_gate_t* gate )
{
#if defined (X16R_4WAY)
#if defined (X16R_8WAY)
gate->scanhash = (void*)&scanhash_x16rv2_8way;
gate->hash = (void*)&x16rv2_8way_hash;
#elif defined (X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16rv2_4way;
gate->hash = (void*)&x16rv2_4way_hash;
#else
gate->scanhash = (void*)&scanhash_x16rv2;
gate->hash = (void*)&x16rv2_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
opt_target_factor = 256.0;
return true;
@@ -64,14 +70,17 @@ bool register_x16rv2_algo( algo_gate_t* gate )
bool register_x16s_algo( algo_gate_t* gate )
{
#if defined (X16R_4WAY)
#if defined (X16R_8WAY)
gate->scanhash = (void*)&scanhash_x16r_8way;
gate->hash = (void*)&x16r_8way_hash;
#elif defined (X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->hash = (void*)&x16r_4way_hash;
#else
gate->scanhash = (void*)&scanhash_x16r;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
opt_target_factor = 256.0;
return true;
@@ -196,28 +205,34 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
bool register_x16rt_algo( algo_gate_t* gate )
{
#if defined (X16R_4WAY)
#if defined (X16R_8WAY)
gate->scanhash = (void*)&scanhash_x16rt_8way;
gate->hash = (void*)&x16rt_8way_hash;
#elif defined (X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16rt_4way;
gate->hash = (void*)&x16rt_4way_hash;
#else
gate->scanhash = (void*)&scanhash_x16rt;
gate->hash = (void*)&x16rt_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
opt_target_factor = 256.0;
return true;
};
bool register_x16rt_veil_algo( algo_gate_t* gate )
{
#if defined (X16R_4WAY)
#if defined (X16R_8WAY)
gate->scanhash = (void*)&scanhash_x16rt_8way;
gate->hash = (void*)&x16rt_8way_hash;
#elif defined (X16R_4WAY)
gate->scanhash = (void*)&scanhash_x16rt_4way;
gate->hash = (void*)&x16rt_4way_hash;
#else
gate->scanhash = (void*)&scanhash_x16rt;
gate->hash = (void*)&x16rt_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
gate->build_extraheader = (void*)&veil_build_extraheader;
opt_target_factor = 256.0;
return true;
@@ -231,7 +246,7 @@ bool register_hex_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_hex;
gate->hash = (void*)&hex_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
opt_target_factor = 128.0;
return true;

View File

@@ -6,8 +6,10 @@
#include <stdint.h>
#include <unistd.h>
#if defined(__AVX2__) && defined(__AES__)
#define X16R_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X16R_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X16R_4WAY 1
#endif
enum x16r_Algo {
@@ -44,7 +46,20 @@ bool register_x16rt_algo( algo_gate_t* gate );
bool register_hex_algo( algo_gate_t* gate );
bool register_x21s_algo( algo_gate_t* gate );
#if defined(X16R_4WAY)
#if defined(X16R_8WAY)
void x16r_8way_hash( void *state, const void *input );
int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void x16rv2_8way_hash( void *state, const void *input );
int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void x16rt_8way_hash( void *state, const void *input );
int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X16R_4WAY)
void x16r_4way_hash( void *state, const void *input );
int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
@@ -58,12 +73,7 @@ void x16rt_4way_hash( void *state, const void *input );
int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void x21s_4way_hash( void *state, const void *input );
int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool x21s_4way_thread_init();
#endif
#else
void x16r_hash( void *state, const void *input );
int scanhash_x16r( struct work *work, uint32_t max_nonce,
@@ -77,9 +87,16 @@ void x16rt_hash( void *state, const void *input );
int scanhash_x16rt( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void hex_hash( void *state, const void *input );
int scanhash_hex( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(X16R_4WAY)
void x21s_4way_hash( void *state, const void *input );
int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool x21s_4way_thread_init();
#else
void x21s_hash( void *state, const void *input );
int scanhash_x21s( struct work *work, uint32_t max_nonce,
@@ -88,3 +105,9 @@ bool x21s_thread_init();
#endif
void hex_hash( void *state, const void *input );
int scanhash_hex( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif

View File

@@ -1,7 +1,4 @@
#include "x16r-gate.h"
#if defined (X16R_4WAY)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -15,6 +12,7 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
@@ -26,6 +24,391 @@
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
#if defined (X16R_8WAY)
union _x16rt_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
sph_shavite512_context shavite;
simd_4way_context simd;
hashState_echo echo;
hamsi512_8way_context hamsi;
sph_fugue512_context fugue;
shabal512_8way_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
} __attribute__ ((aligned (64)));
typedef union _x16rt_8way_context_overlay x16rt_8way_context_overlay;
void x16rt_8way_hash( void* output, const void* input )
{
uint32_t vhash[24*8] __attribute__ ((aligned (128)));
uint32_t hash0[24] __attribute__ ((aligned (64)));
uint32_t hash1[24] __attribute__ ((aligned (64)));
uint32_t hash2[24] __attribute__ ((aligned (64)));
uint32_t hash3[24] __attribute__ ((aligned (64)));
uint32_t hash4[24] __attribute__ ((aligned (64)));
uint32_t hash5[24] __attribute__ ((aligned (64)));
uint32_t hash6[24] __attribute__ ((aligned (64)));
uint32_t hash7[24] __attribute__ ((aligned (64)));
x16rt_8way_context_overlay ctx;
void *in0 = (void*) hash0;
void *in1 = (void*) hash1;
void *in2 = (void*) hash2;
void *in3 = (void*) hash3;
void *in4 = (void*) hash4;
void *in5 = (void*) hash5;
void *in6 = (void*) hash6;
void *in7 = (void*) hash7;
int size = 80;
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
input, 640 );
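// Run the 16 functions in the order selected by hashOrder. Round 0 hashes
// the 80 byte header, later rounds hash the previous 64 byte digests.
// 64 bit wide functions use the 8 lane interleaved data, 128 bit lane
// functions are split into two 4 lane groups, the rest run serially.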
for ( int i = 0; i < 16; i++ )
{
const char elem = hashOrder[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case BLAKE:
blake512_8way_init( &ctx.blake );
if ( i == 0 )
blake512_8way_update( &ctx.blake, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
blake512_8way_update( &ctx.blake, vhash, size );
}
blake512_8way_close( &ctx.blake, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case BMW:
bmw512_8way_init( &ctx.bmw );
if ( i == 0 )
bmw512_8way_update( &ctx.bmw, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
bmw512_8way_update( &ctx.bmw, vhash, size );
}
bmw512_8way_close( &ctx.bmw, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case GROESTL:
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(const char*)in0, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(const char*)in1, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(const char*)in2, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(const char*)in3, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash4,
(const char*)in4, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash5,
(const char*)in5, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash6,
(const char*)in6, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash7,
(const char*)in7, size<<3 );
break;
case SKEIN:
skein512_8way_init( &ctx.skein );
if ( i == 0 )
skein512_8way_update( &ctx.skein, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
skein512_8way_update( &ctx.skein, vhash, size );
}
skein512_8way_close( &ctx.skein, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case JH:
jh512_8way_init( &ctx.jh );
if ( i == 0 )
jh512_8way_update( &ctx.jh, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
jh512_8way_update( &ctx.jh, vhash, size );
}
jh512_8way_close( &ctx.jh, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case KECCAK:
keccak512_8way_init( &ctx.keccak );
if ( i == 0 )
keccak512_8way_update( &ctx.keccak, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
keccak512_8way_update( &ctx.keccak, vhash, size );
}
keccak512_8way_close( &ctx.keccak, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case LUFFA:
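// Luffa works on 128 bit lanes, so the 8 lanes are processed as two
// 4 way groups. Cubehash and SIMD below are handled the same way.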
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, size );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, size);
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
break;
case CUBEHASH:
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
break;
case SHAVITE:
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in0, size );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in1, size );
sph_shavite512_close( &ctx.shavite, hash1 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in2, size );
sph_shavite512_close( &ctx.shavite, hash2 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in3, size );
sph_shavite512_close( &ctx.shavite, hash3 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in4, size );
sph_shavite512_close( &ctx.shavite, hash4 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in5, size );
sph_shavite512_close( &ctx.shavite, hash5 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in6, size );
sph_shavite512_close( &ctx.shavite, hash6 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in7, size );
sph_shavite512_close( &ctx.shavite, hash7 );
break;
case SIMD:
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
break;
case ECHO:
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash0,
(const BitSequence*)in0, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash1,
(const BitSequence*)in1, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash2,
(const BitSequence*)in2, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash3,
(const BitSequence*)in3, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash4,
(const BitSequence*)in4, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash5,
(const BitSequence*)in5, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash6,
(const BitSequence*)in6, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash7,
(const BitSequence*)in7, size<<3 );
break;
case HAMSI:
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, size );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case FUGUE:
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in0, size );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in1, size );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in2, size );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in3, size );
sph_fugue512_close( &ctx.fugue, hash3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in4, size );
sph_fugue512_close( &ctx.fugue, hash4 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in5, size );
sph_fugue512_close( &ctx.fugue, hash5 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in6, size );
sph_fugue512_close( &ctx.fugue, hash6 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in7, size );
sph_fugue512_close( &ctx.fugue, hash7 );
break;
case SHABAL:
intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, size );
shabal512_8way_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case WHIRLPOOL:
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in0, size );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in1, size );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in2, size );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in3, size );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in4, size );
sph_whirlpool_close( &ctx.whirlpool, hash4 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in5, size );
sph_whirlpool_close( &ctx.whirlpool, hash5 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in6, size );
sph_whirlpool_close( &ctx.whirlpool, hash6 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in7, size );
sph_whirlpool_close( &ctx.whirlpool, hash7 );
break;
case SHA_512:
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, size );
sha512_8way_close( &ctx.sha512, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
}
size = 64;
}
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
memcpy( output+128, hash4, 32 );
memcpy( output+160, hash5, 32 );
memcpy( output+192, hash6, 32 );
memcpy( output+224, hash7, 32 );
}
int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) timeHash[8*8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
int thr_id = mythr->id;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( opt_benchmark )
ptarget[7] = 0x0cff;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
{
x16rt_getTimeHash( ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], hashOrder );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
hashOrder, ntime, timeHash );
}
do
{
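// Insert the next 8 nonces, byte swapped, into the nonce word of each
// interleaved lane, then hash all 8 candidates in one pass.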
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x16rt_8way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 8; i++ )
if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 8;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
*hashes_done = n - first_nonce;
return 0;
}
#elif defined (X16R_4WAY)
union _x16rt_4way_context_overlay
{
blake512_4way_context blake;

View File

@@ -5,9 +5,6 @@
* Optimized by JayDDee@github Jan 2018
*/
#include "x16r-gate.h"
#if defined (X16R_4WAY)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -21,6 +18,7 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
@@ -33,6 +31,477 @@
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
#if defined (X16R_8WAY)
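// x16rv2 is x16r with Tiger inserted ahead of Keccak, Luffa and SHA512,
// hence the extra tiger context in the overlay.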
union _x16rv2_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
sph_shavite512_context shavite;
simd_4way_context simd;
hashState_echo echo;
hamsi512_8way_context hamsi;
sph_fugue512_context fugue;
shabal512_8way_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
sph_tiger_context tiger;
} __attribute__ ((aligned (64)));
typedef union _x16rv2_8way_context_overlay x16rv2_8way_context_overlay;
void x16rv2_8way_hash( void* output, const void* input )
{
uint32_t vhash[24*8] __attribute__ ((aligned (128)));
uint32_t hash0[24] __attribute__ ((aligned (64)));
uint32_t hash1[24] __attribute__ ((aligned (64)));
uint32_t hash2[24] __attribute__ ((aligned (64)));
uint32_t hash3[24] __attribute__ ((aligned (64)));
uint32_t hash4[24] __attribute__ ((aligned (64)));
uint32_t hash5[24] __attribute__ ((aligned (64)));
uint32_t hash6[24] __attribute__ ((aligned (64)));
uint32_t hash7[24] __attribute__ ((aligned (64)));
x16rv2_8way_context_overlay ctx;
void *in0 = (void*) hash0;
void *in1 = (void*) hash1;
void *in2 = (void*) hash2;
void *in3 = (void*) hash3;
void *in4 = (void*) hash4;
void *in5 = (void*) hash5;
void *in6 = (void*) hash6;
void *in7 = (void*) hash7;
int size = 80;
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
input, 640 );
for ( int i = 0; i < 16; i++ )
{
const char elem = hashOrder[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case BLAKE:
blake512_8way_init( &ctx.blake );
if ( i == 0 )
blake512_8way_update( &ctx.blake, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
blake512_8way_update( &ctx.blake, vhash, size );
}
blake512_8way_close( &ctx.blake, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case BMW:
bmw512_8way_init( &ctx.bmw );
if ( i == 0 )
bmw512_8way_update( &ctx.bmw, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
bmw512_8way_update( &ctx.bmw, vhash, size );
}
bmw512_8way_close( &ctx.bmw, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case GROESTL:
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(const char*)in0, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(const char*)in1, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(const char*)in2, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(const char*)in3, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash4,
(const char*)in4, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash5,
(const char*)in5, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash6,
(const char*)in6, size<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash7,
(const char*)in7, size<<3 );
break;
case SKEIN:
skein512_8way_init( &ctx.skein );
if ( i == 0 )
skein512_8way_update( &ctx.skein, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
skein512_8way_update( &ctx.skein, vhash, size );
}
skein512_8way_close( &ctx.skein, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case JH:
jh512_8way_init( &ctx.jh );
if ( i == 0 )
jh512_8way_update( &ctx.jh, input, size );
else
{
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
jh512_8way_update( &ctx.jh, vhash, size );
}
jh512_8way_close( &ctx.jh, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case KECCAK:
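// Tiger pre-hash: the 24 byte Tiger digest is zero padded to 64 bytes
// before being fed to Keccak.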
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in0, size );
sph_tiger_close( &ctx.tiger, hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in1, size );
sph_tiger_close( &ctx.tiger, hash1 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in2, size );
sph_tiger_close( &ctx.tiger, hash2 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in3, size );
sph_tiger_close( &ctx.tiger, hash3 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in4, size );
sph_tiger_close( &ctx.tiger, hash4 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in5, size );
sph_tiger_close( &ctx.tiger, hash5 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in6, size );
sph_tiger_close( &ctx.tiger, hash6 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in7, size );
sph_tiger_close( &ctx.tiger, hash7 );
for ( int i = (24/4); i < (64/4); i++ )
hash0[i] = hash1[i] = hash2[i] = hash3[i] =
hash4[i] = hash5[i] = hash6[i] = hash7[i] = 0;
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
hash6, hash7 );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case LUFFA:
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in0, size );
sph_tiger_close( &ctx.tiger, hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in1, size );
sph_tiger_close( &ctx.tiger, hash1 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in2, size );
sph_tiger_close( &ctx.tiger, hash2 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in3, size );
sph_tiger_close( &ctx.tiger, hash3 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in4, size );
sph_tiger_close( &ctx.tiger, hash4 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in5, size );
sph_tiger_close( &ctx.tiger, hash5 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in6, size );
sph_tiger_close( &ctx.tiger, hash6 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in7, size );
sph_tiger_close( &ctx.tiger, hash7 );
for ( int i = (24/4); i < (64/4); i++ )
hash0[i] = hash1[i] = hash2[i] = hash3[i] =
hash4[i] = hash5[i] = hash6[i] = hash7[i] = 0;
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3);
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7);
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
break;
case CUBEHASH:
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
break;
case SHAVITE:
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in0, size );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in1, size );
sph_shavite512_close( &ctx.shavite, hash1 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in2, size );
sph_shavite512_close( &ctx.shavite, hash2 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in3, size );
sph_shavite512_close( &ctx.shavite, hash3 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in4, size );
sph_shavite512_close( &ctx.shavite, hash4 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in5, size );
sph_shavite512_close( &ctx.shavite, hash5 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in6, size );
sph_shavite512_close( &ctx.shavite, hash6 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, in7, size );
sph_shavite512_close( &ctx.shavite, hash7 );
break;
case SIMD:
intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
break;
case ECHO:
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash0,
(const BitSequence*)in0, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash1,
(const BitSequence*)in1, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash2,
(const BitSequence*)in2, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash3,
(const BitSequence*)in3, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash4,
(const BitSequence*)in4, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash5,
(const BitSequence*)in5, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash6,
(const BitSequence*)in6, size<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash7,
(const BitSequence*)in7, size<<3 );
break;
case HAMSI:
intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, size );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case FUGUE:
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in0, size );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in1, size );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in2, size );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in3, size );
sph_fugue512_close( &ctx.fugue, hash3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in4, size );
sph_fugue512_close( &ctx.fugue, hash4 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in5, size );
sph_fugue512_close( &ctx.fugue, hash5 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in6, size );
sph_fugue512_close( &ctx.fugue, hash6 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, in7, size );
sph_fugue512_close( &ctx.fugue, hash7 );
break;
case SHABAL:
intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
size<<3 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, size );
shabal512_8way_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
case WHIRLPOOL:
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in0, size );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in1, size );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in2, size );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in3, size );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in4, size );
sph_whirlpool_close( &ctx.whirlpool, hash4 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in5, size );
sph_whirlpool_close( &ctx.whirlpool, hash5 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in6, size );
sph_whirlpool_close( &ctx.whirlpool, hash6 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, in7, size );
sph_whirlpool_close( &ctx.whirlpool, hash7 );
break;
case SHA_512:
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in0, size );
sph_tiger_close( &ctx.tiger, hash0 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in1, size );
sph_tiger_close( &ctx.tiger, hash1 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in2, size );
sph_tiger_close( &ctx.tiger, hash2 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in3, size );
sph_tiger_close( &ctx.tiger, hash3 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in4, size );
sph_tiger_close( &ctx.tiger, hash4 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in5, size );
sph_tiger_close( &ctx.tiger, hash5 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in6, size );
sph_tiger_close( &ctx.tiger, hash6 );
sph_tiger_init( &ctx.tiger );
sph_tiger( &ctx.tiger, in7, size );
sph_tiger_close( &ctx.tiger, hash7 );
for ( int i = (24/4); i < (64/4); i++ )
hash0[i] = hash1[i] = hash2[i] = hash3[i] =
hash4[i] = hash5[i] = hash6[i] = hash7[i] = 0;
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
hash6, hash7 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, vhash );
break;
}
size = 64;
}
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
memcpy( output+128, hash4, 32 );
memcpy( output+160, hash5, 32 );
memcpy( output+192, hash6, 32 );
memcpy( output+224, hash7, 32 );
}
int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
int thr_id = mythr->id;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if ( opt_benchmark )
ptarget[7] = 0x0cff;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
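// The hash order is derived from the first 8 bytes of the previous block
// hash (header words 1 and 2, big endian).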
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
const uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
}
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x16rv2_8way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 8; i++ )
if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
{
pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i );
}
n += 8;
} while ( likely( ( n < last_nonce ) && !(*restart) ) );
*hashes_done = n - first_nonce;
return 0;
}
#elif defined (X16R_4WAY)
union _x16rv2_4way_context_overlay
{
blake512_4way_context blake;

File diff suppressed because it is too large

View File

@@ -2,8 +2,10 @@
bool register_sonoa_algo( algo_gate_t* gate )
{
#if defined (SONOA_4WAY)
// init_sonoa_4way_ctx();
#if defined (SONOA_8WAY)
gate->scanhash = (void*)&scanhash_sonoa_8way;
gate->hash = (void*)&sonoa_8way_hash;
#elif defined (SONOA_4WAY)
gate->scanhash = (void*)&scanhash_sonoa_4way;
gate->hash = (void*)&sonoa_4way_hash;
#else
@@ -11,7 +13,7 @@ bool register_sonoa_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_sonoa;
gate->hash = (void*)&sonoa_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
return true;
};

View File

@@ -4,29 +4,33 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define SONOA_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SONOA_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define SONOA_4WAY 1
#endif
bool register_sonoa_algo( algo_gate_t* gate );
#if defined(SONOA_4WAY)
#if defined(SONOA_8WAY)
void sonoa_8way_hash( void *state, const void *input );
int scanhash_sonoa_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(SONOA_4WAY)
void sonoa_4way_hash( void *state, const void *input );
int scanhash_sonoa_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
//void init_sonoa_4way_ctx();
#endif
#else
void sonoa_hash( void *state, const void *input );
int scanhash_sonoa( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_sonoa_ctx();
#endif
#endif

View File

@@ -1,7 +1,4 @@
#include "x17-gate.h"
#if defined(X17_4WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -14,6 +11,7 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -24,6 +22,309 @@
#include "algo/haval/haval-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
#if defined(X17_8WAY)
union _x17_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
sph_shavite512_context shavite;
simd_4way_context simd;
hashState_echo echo;
hamsi512_8way_context hamsi;
sph_fugue512_context fugue;
shabal512_8way_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_8way_context haval;
} __attribute__ ((aligned (64)));
typedef union _x17_8way_context_overlay x17_8way_context_overlay;
void x17_8way_hash( void *state, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
uint64_t vhash0[8*8] __attribute__ ((aligned (64)));
uint64_t vhash1[8*8] __attribute__ ((aligned (64)));
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t hash4[8] __attribute__ ((aligned (64)));
uint64_t hash5[8] __attribute__ ((aligned (64)));
uint64_t hash6[8] __attribute__ ((aligned (64)));
uint64_t hash7[8] __attribute__ ((aligned (64)));
x17_8way_context_overlay ctx;
// 1 Blake parallel 8 way 64 bit
blake512_8way_init( &ctx.blake );
blake512_8way_update( &ctx.blake, input, 80 );
blake512_8way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_8way_init( &ctx.bmw );
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
// Serialize
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
// 3 Groestl
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
// Parallelize
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
// 4 Skein parallel 8 way 64 bit
skein512_8way_init( &ctx.skein );
skein512_8way_update( &ctx.skein, vhash, 64 );
skein512_8way_close( &ctx.skein, vhash );
// 5 JH
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, 64 );
jh512_8way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, 64 );
keccak512_8way_close( &ctx.keccak, vhash );
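// Re-interleave from 8 lane 64 bit to two 4 lane 128 bit groups for the
// 128 bit lane functions Luffa and Cubehash.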
rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
// 7 Luffa
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
// 8 Cubehash
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
// 9 Shavite
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash4, 64 );
sph_shavite512_close( &ctx.shavite, hash4 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash5, 64 );
sph_shavite512_close( &ctx.shavite, hash5 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash6, 64 );
sph_shavite512_close( &ctx.shavite, hash6 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash7, 64 );
sph_shavite512_close( &ctx.shavite, hash7 );
// 10 Simd
intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
// 11 Echo serial
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash4,
(const BitSequence *) hash4, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash5,
(const BitSequence *) hash5, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash6,
(const BitSequence *) hash6, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash7,
(const BitSequence *) hash7, 512 );
// 12 Hamsi parallel 8 way 64 bit
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
// 13 Fugue serial
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash4, 64 );
sph_fugue512_close( &ctx.fugue, hash4 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash5, 64 );
sph_fugue512_close( &ctx.fugue, hash5 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash6, 64 );
sph_fugue512_close( &ctx.fugue, hash6 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash7, 64 );
sph_fugue512_close( &ctx.fugue, hash7 );
// 14 Shabal parallel 8 way 32 bit
intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, 64 );
shabal512_8way_close( &ctx.shabal, vhash );
dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );
// 15 Whirlpool serial
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash4, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash4 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash5, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash5 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash6, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash6 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash7, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash7 );
// 16 SHA512 parallel 64 bit
intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, 64 );
sha512_8way_close( &ctx.sha512, vhash );
// 17 Haval parallel 32 bit
rintrlv_8x64_8x32( vhash0, vhash, 512 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhash0, 64 );
haval256_5_8way_close( &ctx.haval, state );
}
int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t Htarg = ptarget[7];
mm512_bswap32_intrlv80_8x64( vdata, pdata );
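// Byte swap and interleave the 80 byte header once; only the nonce word
// is updated inside the scan loop.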
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x17_8way_hash( hash, vdata );
for ( int lane = 0; lane < 8; lane++ )
if unlikely( ( hash7[ lane ] <= Htarg ) )
{
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(X17_4WAY)
union _x17_4way_context_overlay
{
blake512_4way_context blake;

View File

@@ -2,14 +2,17 @@
bool register_x17_algo( algo_gate_t* gate )
{
#if defined (X17_4WAY)
#if defined (X17_8WAY)
gate->scanhash = (void*)&scanhash_x17_8way;
gate->hash = (void*)&x17_8way_hash;
#elif defined (X17_4WAY)
gate->scanhash = (void*)&scanhash_x17_4way;
gate->hash = (void*)&x17_4way_hash;
#else
gate->scanhash = (void*)&scanhash_x17;
gate->hash = (void*)&x17_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
return true;
};

View File

@@ -4,13 +4,20 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define X17_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define X17_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X17_4WAY 1
#endif
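// X17_8WAY needs full AVX512 (F, VL, DQ, BW); otherwise X17_4WAY is
// selected on CPUs with AVX2 and AES-NI.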
bool register_x17_algo( algo_gate_t* gate );
#if defined(X17_4WAY)
#if defined(X17_8WAY)
void x17_8way_hash( void *state, const void *input );
int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(X17_4WAY)
void x17_4way_hash( void *state, const void *input );
int scanhash_x17_4way( struct work *work, uint32_t max_nonce,

View File

@@ -1,7 +1,4 @@
#include "xevan-gate.h"
#if defined(XEVAN_4WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -15,6 +12,7 @@
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -25,6 +23,515 @@
#include "algo/sha/sha-hash-4way.h"
#include "algo/haval/haval-hash-4way.h"
#if defined(XEVAN_8WAY)
union _xevan_8way_context_overlay
{
blake512_8way_context blake;
bmw512_8way_context bmw;
hashState_groestl groestl;
skein512_8way_context skein;
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
cube_4way_context cube;
sph_shavite512_context shavite;
simd_4way_context simd;
hashState_echo echo;
hamsi512_8way_context hamsi;
sph_fugue512_context fugue;
shabal512_8way_context shabal;
sph_whirlpool_context whirlpool;
sha512_8way_context sha512;
haval256_5_8way_context haval;
} __attribute__ ((aligned (64)));
typedef union _xevan_8way_context_overlay xevan_8way_context_overlay;
void xevan_8way_hash( void *output, const void *input )
{
uint64_t vhash[16<<3] __attribute__ ((aligned (128)));
uint64_t vhashA[16<<3] __attribute__ ((aligned (64)));
uint64_t vhashB[16<<3] __attribute__ ((aligned (64)));
uint64_t hash0[16] __attribute__ ((aligned (64)));
uint64_t hash1[16] __attribute__ ((aligned (64)));
uint64_t hash2[16] __attribute__ ((aligned (64)));
uint64_t hash3[16] __attribute__ ((aligned (64)));
uint64_t hash4[16] __attribute__ ((aligned (64)));
uint64_t hash5[16] __attribute__ ((aligned (64)));
uint64_t hash6[16] __attribute__ ((aligned (64)));
uint64_t hash7[16] __attribute__ ((aligned (64)));
const int dataLen = 128;
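// Xevan hashes 128 byte blocks: each digest is zero extended to 128 bytes
// and the 17 algorithm chain is run twice.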
xevan_8way_context_overlay ctx __attribute__ ((aligned (64)));
blake512_8way_init( &ctx.blake );
blake512_8way_update( &ctx.blake, input, 80 );
blake512_8way_close( &ctx.blake, vhash );
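// Zero extend the 64 byte Blake digest to 128 bytes in every lane.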
memset( &vhash[8<<3], 0, 64<<3 );
bmw512_8way_init( &ctx.bmw );
bmw512_8way_update( &ctx.bmw, vhash, dataLen );
bmw512_8way_close( &ctx.bmw, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7,
dataLen<<3 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
skein512_8way_init( &ctx.skein );
skein512_8way_update( &ctx.skein, vhash, dataLen );
skein512_8way_close( &ctx.skein, vhash );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, dataLen );
jh512_8way_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, dataLen );
keccak512_8way_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, dataLen );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, dataLen );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash0, dataLen );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash1, dataLen );
sph_shavite512_close( &ctx.shavite, hash1 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash2, dataLen );
sph_shavite512_close( &ctx.shavite, hash2 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash3, dataLen );
sph_shavite512_close( &ctx.shavite, hash3 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash4, dataLen );
sph_shavite512_close( &ctx.shavite, hash4 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash5, dataLen );
sph_shavite512_close( &ctx.shavite, hash5 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash6, dataLen );
sph_shavite512_close( &ctx.shavite, hash6 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash7, dataLen );
sph_shavite512_close( &ctx.shavite, hash7 );
intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, dataLen<<3 );
intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, dataLen<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash4,
(const BitSequence *) hash4, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash5,
(const BitSequence *) hash5, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash6,
(const BitSequence *) hash6, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash7,
(const BitSequence *) hash7, dataLen<<3 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, dataLen );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, dataLen );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, dataLen );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, dataLen );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, dataLen );
sph_fugue512_close( &ctx.fugue, hash3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash4, dataLen );
sph_fugue512_close( &ctx.fugue, hash4 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash5, dataLen );
sph_fugue512_close( &ctx.fugue, hash5 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash6, dataLen );
sph_fugue512_close( &ctx.fugue, hash6 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash7, dataLen );
sph_fugue512_close( &ctx.fugue, hash7 );
intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, dataLen );
shabal512_8way_close( &ctx.shabal, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash4, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash4 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash5, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash5 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash6, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash6 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash7, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash7 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, dataLen );
sha512_8way_close( &ctx.sha512, vhash );
rintrlv_8x64_8x32( vhashA, vhash, dataLen<<3 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhashA, dataLen );
haval256_5_8way_close( &ctx.haval, vhashA );
rintrlv_8x32_8x64( vhash, vhashA, dataLen<<3 );
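// Second pass: zero extend the 32 byte Haval digest to 128 bytes and run
// the whole chain again.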
memset( &vhash[ 4<<3 ], 0, (dataLen-32) << 3 );
blake512_8way_init( &ctx.blake );
blake512_8way_update( &ctx.blake, vhash, dataLen );
blake512_8way_close(&ctx.blake, vhash);
bmw512_8way_init( &ctx.bmw );
bmw512_8way_update( &ctx.bmw, vhash, dataLen );
bmw512_8way_close( &ctx.bmw, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6,
dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7,
dataLen<<3 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
skein512_8way_init( &ctx.skein );
skein512_8way_update( &ctx.skein, vhash, dataLen );
skein512_8way_close( &ctx.skein, vhash );
jh512_8way_init( &ctx.jh );
jh512_8way_update( &ctx.jh, vhash, dataLen );
jh512_8way_close( &ctx.jh, vhash );
keccak512_8way_init( &ctx.keccak );
keccak512_8way_update( &ctx.keccak, vhash, dataLen );
keccak512_8way_close( &ctx.keccak, vhash );
rintrlv_8x64_4x128( vhashA, vhashB, vhash, dataLen<<3 );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
luffa_4way_init( &ctx.luffa, 512 );
luffa_4way_update_close( &ctx.luffa, vhashB, vhashB, dataLen );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
cube_4way_init( &ctx.cube, 512, 16, 32 );
cube_4way_update_close( &ctx.cube, vhashB, vhashB, dataLen );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash0, dataLen );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash1, dataLen );
sph_shavite512_close( &ctx.shavite, hash1 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash2, dataLen );
sph_shavite512_close( &ctx.shavite, hash2 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash3, dataLen );
sph_shavite512_close( &ctx.shavite, hash3 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash4, dataLen );
sph_shavite512_close( &ctx.shavite, hash4 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash5, dataLen );
sph_shavite512_close( &ctx.shavite, hash5 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash6, dataLen );
sph_shavite512_close( &ctx.shavite, hash6 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash7, dataLen );
sph_shavite512_close( &ctx.shavite, hash7 );
intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, dataLen<<3 );
intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, dataLen<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
simd_4way_init( &ctx.simd, 512 );
simd_4way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 );
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash4,
(const BitSequence *) hash4, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash5,
(const BitSequence *) hash5, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash6,
(const BitSequence *) hash6, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash7,
(const BitSequence *) hash7, dataLen<<3 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, dataLen );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, dataLen );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, dataLen );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, dataLen );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, dataLen );
sph_fugue512_close( &ctx.fugue, hash3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash4, dataLen );
sph_fugue512_close( &ctx.fugue, hash4 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash5, dataLen );
sph_fugue512_close( &ctx.fugue, hash5 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash6, dataLen );
sph_fugue512_close( &ctx.fugue, hash6 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash7, dataLen );
sph_fugue512_close( &ctx.fugue, hash7 );
intrlv_8x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
shabal512_8way_init( &ctx.shabal );
shabal512_8way_update( &ctx.shabal, vhash, dataLen );
shabal512_8way_close( &ctx.shabal, vhash );
dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash, dataLen<<3 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash4, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash4 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash5, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash5 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash6, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash6 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash7, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash7 );
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
hash7, dataLen<<3 );
sha512_8way_init( &ctx.sha512 );
sha512_8way_update( &ctx.sha512, vhash, dataLen );
sha512_8way_close( &ctx.sha512, vhash );
rintrlv_8x64_8x32( vhashA, vhash, dataLen<<3 );
haval256_5_8way_init( &ctx.haval );
haval256_5_8way_update( &ctx.haval, vhashA, dataLen );
haval256_5_8way_close( &ctx.haval, output );
}
int scanhash_xevan_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*16] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<3]);
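// Note: hash7 points at the row of the 8x32 interleaved output that holds
// word 7 (the most significant word) of every lane, so hash7[ lane ] <= Htarg
// is a cheap pre-filter before the full target test below.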
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const uint32_t Htarg = ptarget[7];
mm512_bswap32_intrlv80_8x64( vdata, pdata );
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
xevan_8way_hash( hash, vdata );
for ( int lane = 0; lane < 8; lane++ )
if unlikely( ( hash7[ lane ] <= Htarg ) )
{
extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(XEVAN_4WAY)
union _xevan_4way_context_overlay
{
blake512_4way_context blake;

View File

@@ -2,8 +2,10 @@
bool register_xevan_algo( algo_gate_t* gate )
{
#if defined (XEVAN_4WAY)
// init_xevan_4way_ctx();
#if defined (XEVAN_8WAY)
gate->scanhash = (void*)&scanhash_xevan_8way;
gate->hash = (void*)&xevan_8way_hash;
#elif defined (XEVAN_4WAY)
gate->scanhash = (void*)&scanhash_xevan_4way;
gate->hash = (void*)&xevan_4way_hash;
#else

View File

@@ -4,13 +4,21 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define XEVAN_4WAY
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define XEVAN_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define XEVAN_4WAY 1
#endif
bool register_xevan_algo( algo_gate_t* gate );
#if defined(XEVAN_4WAY)
#if defined(XEVAN_8WAY)
void xevan_8way_hash( void *state, const void *input );
int scanhash_xevan_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined(XEVAN_4WAY)
void xevan_4way_hash( void *state, const void *input );
@@ -19,7 +27,7 @@ int scanhash_xevan_4way( struct work *work, uint32_t max_nonce,
//void init_xevan_4way_ctx();
#endif
#else
void xevan_hash( void *state, const void *input );
@@ -30,3 +38,4 @@ void init_xevan_ctx();
#endif
#endif

View File

@@ -4,6 +4,8 @@
# during development. However, the information contained may provide compilation
# tips to users.
rm cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done

configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.2.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.5.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.10.2'
PACKAGE_STRING='cpuminer-opt 3.10.2'
PACKAGE_VERSION='3.10.5'
PACKAGE_STRING='cpuminer-opt 3.10.5'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.10.2 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.10.5 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.10.2:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.10.5:";;
esac
cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.10.2
cpuminer-opt configure 3.10.5
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.10.2, which was
It was created by cpuminer-opt $as_me 3.10.5, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.10.2'
VERSION='3.10.5'
cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.10.2, which was
This file was extended by cpuminer-opt $as_me 3.10.5, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.10.2
cpuminer-opt config.status 3.10.5
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.10.2])
AC_INIT([cpuminer-opt], [3.10.5])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -3410,39 +3410,39 @@ bool check_cpu_capability ()
printf(".\n");
#endif
printf("CPU features:");
printf("CPU features: ");
if ( cpu_has_avx512 ) printf( " AVX512" );
else if ( cpu_has_avx2 ) printf( " AVX2 " );
else if ( cpu_has_avx ) printf( " AVX " );
else if ( cpu_has_sse42 ) printf( " SSE4.2" );
else if ( cpu_has_sse2 ) printf( " SSE2 " );
if ( cpu_has_vaes ) printf( " VAES" );
else if ( cpu_has_aes ) printf( " AES" );
if ( cpu_has_sha ) printf( " SHA" );
if ( cpu_has_avx512 ) printf( " AVX512" );
else if ( cpu_has_avx2 ) printf( " AVX2" );
else if ( cpu_has_avx ) printf( " AVX" );
else if ( cpu_has_sse42 ) printf( " SSE4.2" );
else if ( cpu_has_sse2 ) printf( " SSE2" );
printf(".\nSW features:");
if ( sw_has_vaes ) printf( " VAES" );
else if ( sw_has_aes ) printf( " AES" );
if ( sw_has_sha ) printf( " SHA" );
printf("\nSW features: ");
if ( sw_has_avx512 ) printf( " AVX512" );
else if ( sw_has_avx2 ) printf( " AVX2" );
else if ( sw_has_avx ) printf( " AVX" );
else if ( sw_has_avx2 ) printf( " AVX2 " );
else if ( sw_has_avx ) printf( " AVX " );
else if ( sw_has_sse42 ) printf( " SSE4.2" );
else if ( sw_has_sse2 ) printf( " SSE2" );
else if ( sw_has_sse2 ) printf( " SSE2 " );
if ( sw_has_vaes ) printf( " VAES" );
else if ( sw_has_aes ) printf( " AES " );
if ( sw_has_sha ) printf( " SHA" );
printf(".\nAlgo features:");
printf("\nAlgo features:");
if ( algo_features == EMPTY_SET ) printf( " None" );
else
{
if ( algo_has_vaes ) printf( " VAES" );
else if ( algo_has_aes ) printf( " AES" );
if ( algo_has_sha ) printf( " SHA" );
if ( algo_has_avx512 ) printf( " AVX512" );
else if ( algo_has_avx2 ) printf( " AVX2" );
else if ( algo_has_avx2 ) printf( " AVX2 " );
else if ( algo_has_sse42 ) printf( " SSE4.2" );
else if ( algo_has_sse2 ) printf( " SSE2" );
else if ( algo_has_sse2 ) printf( " SSE2 " );
if ( algo_has_vaes ) printf( " VAES" );
else if ( algo_has_aes ) printf( " AES " );
if ( algo_has_sha ) printf( " SHA" );
}
printf(".\n");
printf("\n");
// Check for CPU and build incompatibilities
if ( !cpu_has_sse2 )
@@ -3483,19 +3483,19 @@ bool check_cpu_capability ()
use_sha || use_vaes );
// Display best options
printf( "Start mining with" );
printf( "\nStarting miner with" );
if ( use_none ) printf( " no optimizations" );
else
{
if ( use_vaes ) printf( " VAES" );
else if ( use_aes ) printf( " AES" );
if ( use_avx512 ) printf( " AVX512" );
else if ( use_avx2 ) printf( " AVX2" );
else if ( use_sse42 ) printf( " SSE4.2" );
else if ( use_sse2 ) printf( " SSE2" );
if ( use_vaes ) printf( " VAES" );
else if ( use_aes ) printf( " AES" );
if ( use_sha ) printf( " SHA" );
}
printf( ".\n\n" );
printf( "...\n\n" );
return true;
}

View File

@@ -897,7 +897,7 @@ static inline void intrlv_16x32_512( void *dst, const void *s00,
*( (uint32_t*)(d06) +(i) ) = s[ 6]; \
*( (uint32_t*)(d07) +(i) ) = s[ 7]; \
*( (uint32_t*)(d08) +(i) ) = s[ 8]; \
*( (uint32_t*)(d09) +(i) ) = s[ 0]; \
*( (uint32_t*)(d09) +(i) ) = s[ 9]; \
*( (uint32_t*)(d10) +(i) ) = s[10]; \
*( (uint32_t*)(d11) +(i) ) = s[11]; \
*( (uint32_t*)(d12) +(i) ) = s[12]; \
@@ -2075,9 +2075,6 @@ static inline void dintrlv_2x256( void *dst0, void *dst1,
d0[3] = s[6]; d1[3] = s[7];
}
#endif // AVX
///////////////////////////
@@ -2165,7 +2162,9 @@ static inline void rintrlv_4x32_4x64( void *dst,
d[ 5] = _mm_unpackhi_epi32( s[ 4], s[ 5] );
d[ 6] = _mm_unpacklo_epi32( s[ 6], s[ 7] );
d[ 7] = _mm_unpackhi_epi32( s[ 6], s[ 7] );
if ( bit_len <= 256 ) return;
d[ 8] = _mm_unpacklo_epi32( s[ 8], s[ 9] );
d[ 9] = _mm_unpackhi_epi32( s[ 8], s[ 9] );
d[10] = _mm_unpacklo_epi32( s[10], s[11] );
@@ -2174,16 +2173,21 @@ static inline void rintrlv_4x32_4x64( void *dst,
d[13] = _mm_unpackhi_epi32( s[12], s[13] );
d[14] = _mm_unpacklo_epi32( s[14], s[15] );
d[15] = _mm_unpackhi_epi32( s[14], s[15] );
if ( bit_len <= 512 ) return;
d[16] = _mm_unpacklo_epi32( s[16], s[17] );
d[17] = _mm_unpackhi_epi32( s[16], s[17] );
d[18] = _mm_unpacklo_epi32( s[18], s[19] );
d[19] = _mm_unpackhi_epi32( s[18], s[19] );
if ( bit_len <= 640 ) return;
d[20] = _mm_unpacklo_epi32( s[20], s[21] );
d[21] = _mm_unpackhi_epi32( s[20], s[21] );
d[22] = _mm_unpacklo_epi32( s[22], s[23] );
d[23] = _mm_unpackhi_epi32( s[22], s[23] );
d[24] = _mm_unpacklo_epi32( s[24], s[25] );
d[25] = _mm_unpackhi_epi32( s[24], s[25] );
d[26] = _mm_unpacklo_epi32( s[26], s[27] );
@@ -2194,6 +2198,93 @@ static inline void rintrlv_4x32_4x64( void *dst,
d[31] = _mm_unpackhi_epi32( s[30], s[31] );
}
// 8x32 -> 8x64
static inline void rintrlv_8x32_8x64( void *dst,
const void *src, const int bit_len )
{
__m128i *d = (__m128i*)dst;
const __m128i *s = (const __m128i*)src;
d[ 0] = _mm_unpacklo_epi32( s[ 0], s[ 2] );
d[ 1] = _mm_unpackhi_epi32( s[ 0], s[ 2] );
d[ 2] = _mm_unpacklo_epi32( s[ 1], s[ 3] );
d[ 3] = _mm_unpackhi_epi32( s[ 1], s[ 3] );
d[ 4] = _mm_unpacklo_epi32( s[ 4], s[ 6] );
d[ 5] = _mm_unpackhi_epi32( s[ 4], s[ 6] );
d[ 6] = _mm_unpacklo_epi32( s[ 5], s[ 7] );
d[ 7] = _mm_unpackhi_epi32( s[ 5], s[ 7] );
d[ 8] = _mm_unpacklo_epi32( s[ 8], s[10] );
d[ 9] = _mm_unpackhi_epi32( s[ 8], s[10] );
d[10] = _mm_unpacklo_epi32( s[ 9], s[11] );
d[11] = _mm_unpackhi_epi32( s[ 9], s[11] );
d[12] = _mm_unpacklo_epi32( s[12], s[14] );
d[13] = _mm_unpackhi_epi32( s[12], s[14] );
d[14] = _mm_unpacklo_epi32( s[13], s[15] );
d[15] = _mm_unpackhi_epi32( s[13], s[15] );
if ( bit_len <= 256 ) return;
d[16] = _mm_unpacklo_epi32( s[16], s[18] );
d[17] = _mm_unpackhi_epi32( s[16], s[18] );
d[18] = _mm_unpacklo_epi32( s[17], s[19] );
d[19] = _mm_unpackhi_epi32( s[17], s[19] );
d[20] = _mm_unpacklo_epi32( s[20], s[22] );
d[21] = _mm_unpackhi_epi32( s[20], s[22] );
d[22] = _mm_unpacklo_epi32( s[21], s[23] );
d[23] = _mm_unpackhi_epi32( s[21], s[23] );
d[24] = _mm_unpacklo_epi32( s[24], s[26] );
d[25] = _mm_unpackhi_epi32( s[24], s[26] );
d[26] = _mm_unpacklo_epi32( s[25], s[27] );
d[27] = _mm_unpackhi_epi32( s[25], s[27] );
d[28] = _mm_unpacklo_epi32( s[28], s[30] );
d[29] = _mm_unpackhi_epi32( s[28], s[30] );
d[30] = _mm_unpacklo_epi32( s[29], s[31] );
d[31] = _mm_unpackhi_epi32( s[29], s[31] );
if ( bit_len <= 512 ) return;
d[32] = _mm_unpacklo_epi32( s[32], s[34] );
d[33] = _mm_unpackhi_epi32( s[32], s[34] );
d[34] = _mm_unpacklo_epi32( s[33], s[35] );
d[35] = _mm_unpackhi_epi32( s[33], s[35] );
d[36] = _mm_unpacklo_epi32( s[36], s[38] );
d[37] = _mm_unpackhi_epi32( s[36], s[38] );
d[38] = _mm_unpacklo_epi32( s[37], s[39] );
d[39] = _mm_unpackhi_epi32( s[37], s[39] );
d[40] = _mm_unpacklo_epi32( s[40], s[42] );
d[41] = _mm_unpackhi_epi32( s[40], s[42] );
d[42] = _mm_unpacklo_epi32( s[41], s[43] );
d[43] = _mm_unpackhi_epi32( s[41], s[43] );
d[44] = _mm_unpacklo_epi32( s[44], s[46] );
d[45] = _mm_unpackhi_epi32( s[44], s[46] );
d[46] = _mm_unpacklo_epi32( s[45], s[47] );
d[47] = _mm_unpackhi_epi32( s[45], s[47] );
d[48] = _mm_unpacklo_epi32( s[48], s[50] );
d[49] = _mm_unpackhi_epi32( s[48], s[50] );
d[50] = _mm_unpacklo_epi32( s[49], s[51] );
d[51] = _mm_unpackhi_epi32( s[49], s[51] );
d[52] = _mm_unpacklo_epi32( s[52], s[54] );
d[53] = _mm_unpackhi_epi32( s[52], s[54] );
d[54] = _mm_unpacklo_epi32( s[53], s[55] );
d[55] = _mm_unpackhi_epi32( s[53], s[55] );
d[56] = _mm_unpacklo_epi32( s[56], s[58] );
d[57] = _mm_unpackhi_epi32( s[56], s[58] );
d[58] = _mm_unpacklo_epi32( s[57], s[59] );
d[59] = _mm_unpackhi_epi32( s[57], s[59] );
d[60] = _mm_unpacklo_epi32( s[60], s[62] );
d[61] = _mm_unpackhi_epi32( s[60], s[62] );
d[62] = _mm_unpacklo_epi32( s[61], s[63] );
d[63] = _mm_unpackhi_epi32( s[61], s[63] );
}
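// Typical use (illustrative sketch, buffer names are hypothetical): convert an
// 8 lane 32 bit interleaved 512 bit state to 8 lane 64 bit interleaving before
// handing it to a 64 bit algo in a chain:
//
//    rintrlv_8x32_8x64( vhash64, vhash32, 512 );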
/*
#define RLEAVE_4x32_4x64(i) do \
{ \
@@ -2225,7 +2316,6 @@ static inline void rintrlv_4x32_4x64( void *dst,
// 2x128 -> 4x64
static inline void rintrlv_2x128_4x64( void *dst, const void *src0,
const void *src1, const int bit_len )
{
@@ -2268,7 +2358,6 @@ static inline void rintrlv_2x128_4x64( void *dst, const void *src0,
d[31] = _mm_unpackhi_epi64( s1[14], s1[15] );
}
/*
#define RLEAVE_2x128_4x64( i ) do \
{ \
@@ -2339,7 +2428,6 @@ static inline void rintrlv_4x64_2x128( void *dst0, void *dst1,
d1[15] = _mm_unpackhi_epi64( s[29], s[31] );
}
/*
#define RLEAVE_4x64_2x128( i ) do \
{ \
@@ -2364,6 +2452,354 @@ static inline void rintrlv_4x64_2x128( void *dst0, void *dst1,
}
*/
// 4x128 -> 8x64
static inline void rintrlv_4x128_8x64( void *dst, const void *src0,
const void *src1, const int bit_len )
{
__m128i *d = (__m128i*)dst;
const __m128i *s0 = (const __m128i*)src0;
const __m128i *s1 = (const __m128i*)src1;
d[ 0] = _mm_unpacklo_epi64( s0[ 0], s0[ 1] );
d[ 1] = _mm_unpacklo_epi64( s0[ 2], s0[ 3] );
d[ 2] = _mm_unpacklo_epi64( s1[ 0], s1[ 1] );
d[ 3] = _mm_unpacklo_epi64( s1[ 2], s1[ 3] );
d[ 4] = _mm_unpackhi_epi64( s0[ 0], s0[ 1] );
d[ 5] = _mm_unpackhi_epi64( s0[ 2], s0[ 3] );
d[ 6] = _mm_unpackhi_epi64( s1[ 0], s1[ 1] );
d[ 7] = _mm_unpackhi_epi64( s1[ 2], s1[ 3] );
d[ 8] = _mm_unpacklo_epi64( s0[ 4], s0[ 5] );
d[ 9] = _mm_unpacklo_epi64( s0[ 6], s0[ 7] );
d[10] = _mm_unpacklo_epi64( s1[ 4], s1[ 5] );
d[11] = _mm_unpacklo_epi64( s1[ 6], s1[ 7] );
d[12] = _mm_unpackhi_epi64( s0[ 4], s0[ 5] );
d[13] = _mm_unpackhi_epi64( s0[ 6], s0[ 7] );
d[14] = _mm_unpackhi_epi64( s1[ 4], s1[ 5] );
d[15] = _mm_unpackhi_epi64( s1[ 6], s1[ 7] );
if ( bit_len <= 256 ) return;
d[16] = _mm_unpacklo_epi64( s0[ 8], s0[ 9] );
d[17] = _mm_unpacklo_epi64( s0[10], s0[11] );
d[18] = _mm_unpacklo_epi64( s1[ 8], s1[ 9] );
d[19] = _mm_unpacklo_epi64( s1[10], s1[11] );
d[20] = _mm_unpackhi_epi64( s0[ 8], s0[ 9] );
d[21] = _mm_unpackhi_epi64( s0[10], s0[11] );
d[22] = _mm_unpackhi_epi64( s1[ 8], s1[ 9] );
d[23] = _mm_unpackhi_epi64( s1[10], s1[11] );
d[24] = _mm_unpacklo_epi64( s0[12], s0[13] );
d[25] = _mm_unpacklo_epi64( s0[14], s0[15] );
d[26] = _mm_unpacklo_epi64( s1[12], s1[13] );
d[27] = _mm_unpacklo_epi64( s1[14], s1[15] );
d[28] = _mm_unpackhi_epi64( s0[12], s0[13] );
d[29] = _mm_unpackhi_epi64( s0[14], s0[15] );
d[30] = _mm_unpackhi_epi64( s1[12], s1[13] );
d[31] = _mm_unpackhi_epi64( s1[14], s1[15] );
if ( bit_len <= 512 ) return;
d[32] = _mm_unpacklo_epi64( s0[16], s0[17] );
d[33] = _mm_unpacklo_epi64( s0[18], s0[19] );
d[34] = _mm_unpacklo_epi64( s1[16], s1[17] );
d[35] = _mm_unpacklo_epi64( s1[18], s1[19] );
d[36] = _mm_unpackhi_epi64( s0[16], s0[17] );
d[37] = _mm_unpackhi_epi64( s0[18], s0[19] );
d[38] = _mm_unpackhi_epi64( s1[16], s1[17] );
d[39] = _mm_unpackhi_epi64( s1[18], s1[19] );
d[40] = _mm_unpacklo_epi64( s0[20], s0[21] );
d[41] = _mm_unpacklo_epi64( s0[22], s0[23] );
d[42] = _mm_unpacklo_epi64( s1[20], s1[21] );
d[43] = _mm_unpacklo_epi64( s1[22], s1[23] );
d[44] = _mm_unpackhi_epi64( s0[20], s0[21] );
d[45] = _mm_unpackhi_epi64( s0[22], s0[23] );
d[46] = _mm_unpackhi_epi64( s1[20], s1[21] );
d[47] = _mm_unpackhi_epi64( s1[22], s1[23] );
d[48] = _mm_unpacklo_epi64( s0[24], s0[25] );
d[49] = _mm_unpacklo_epi64( s0[26], s0[27] );
d[50] = _mm_unpacklo_epi64( s1[24], s1[25] );
d[51] = _mm_unpacklo_epi64( s1[26], s1[27] );
d[52] = _mm_unpackhi_epi64( s0[24], s0[25] );
d[53] = _mm_unpackhi_epi64( s0[26], s0[27] );
d[54] = _mm_unpackhi_epi64( s1[24], s1[25] );
d[55] = _mm_unpackhi_epi64( s1[26], s1[27] );
d[56] = _mm_unpacklo_epi64( s0[28], s0[29] );
d[57] = _mm_unpacklo_epi64( s0[30], s0[31] );
d[58] = _mm_unpacklo_epi64( s1[28], s1[29] );
d[59] = _mm_unpacklo_epi64( s1[30], s1[31] );
d[60] = _mm_unpackhi_epi64( s0[28], s0[29] );
d[61] = _mm_unpackhi_epi64( s0[30], s0[31] );
d[62] = _mm_unpackhi_epi64( s1[28], s1[29] );
d[63] = _mm_unpackhi_epi64( s1[30], s1[31] );
}
// 8x64 -> 4x128
static inline void rintrlv_8x64_4x128( void *dst0, void *dst1,
const void *src, const int bit_len )
{
__m128i *d0 = (__m128i*)dst0;
__m128i *d1 = (__m128i*)dst1;
const __m128i* s = (const __m128i*)src;
d0[ 0] = _mm_unpacklo_epi64( s[ 0], s[ 4] );
d0[ 1] = _mm_unpackhi_epi64( s[ 0], s[ 4] );
d1[ 0] = _mm_unpacklo_epi64( s[ 2], s[ 6] );
d1[ 1] = _mm_unpackhi_epi64( s[ 2], s[ 6] );
d0[ 2] = _mm_unpacklo_epi64( s[ 1], s[ 5] );
d0[ 3] = _mm_unpackhi_epi64( s[ 1], s[ 5] );
d1[ 2] = _mm_unpacklo_epi64( s[ 3], s[ 7] );
d1[ 3] = _mm_unpackhi_epi64( s[ 3], s[ 7] );
d0[ 4] = _mm_unpacklo_epi64( s[ 8], s[12] );
d0[ 5] = _mm_unpackhi_epi64( s[ 8], s[12] );
d1[ 4] = _mm_unpacklo_epi64( s[10], s[14] );
d1[ 5] = _mm_unpackhi_epi64( s[10], s[14] );
d0[ 6] = _mm_unpacklo_epi64( s[ 9], s[13] );
d0[ 7] = _mm_unpackhi_epi64( s[ 9], s[13] );
d1[ 6] = _mm_unpacklo_epi64( s[11], s[15] );
d1[ 7] = _mm_unpackhi_epi64( s[11], s[15] );
if ( bit_len <= 256 ) return;
d0[ 8] = _mm_unpacklo_epi64( s[16], s[20] );
d0[ 9] = _mm_unpackhi_epi64( s[16], s[20] );
d1[ 8] = _mm_unpacklo_epi64( s[18], s[22] );
d1[ 9] = _mm_unpackhi_epi64( s[18], s[22] );
d0[10] = _mm_unpacklo_epi64( s[17], s[21] );
d0[11] = _mm_unpackhi_epi64( s[17], s[21] );
d1[10] = _mm_unpacklo_epi64( s[19], s[23] );
d1[11] = _mm_unpackhi_epi64( s[19], s[23] );
d0[12] = _mm_unpacklo_epi64( s[24], s[28] );
d0[13] = _mm_unpackhi_epi64( s[24], s[28] );
d1[12] = _mm_unpacklo_epi64( s[26], s[30] );
d1[13] = _mm_unpackhi_epi64( s[26], s[30] );
d0[14] = _mm_unpacklo_epi64( s[25], s[29] );
d0[15] = _mm_unpackhi_epi64( s[25], s[29] );
d1[14] = _mm_unpacklo_epi64( s[27], s[31] );
d1[15] = _mm_unpackhi_epi64( s[27], s[31] );
if ( bit_len <= 512 ) return;
d0[16] = _mm_unpacklo_epi64( s[32], s[36] );
d0[17] = _mm_unpackhi_epi64( s[32], s[36] );
d1[16] = _mm_unpacklo_epi64( s[34], s[38] );
d1[17] = _mm_unpackhi_epi64( s[34], s[38] );
d0[18] = _mm_unpacklo_epi64( s[33], s[37] );
d0[19] = _mm_unpackhi_epi64( s[33], s[37] );
d1[18] = _mm_unpacklo_epi64( s[35], s[39] );
d1[19] = _mm_unpackhi_epi64( s[35], s[39] );
d0[20] = _mm_unpacklo_epi64( s[40], s[44] );
d0[21] = _mm_unpackhi_epi64( s[40], s[44] );
d1[20] = _mm_unpacklo_epi64( s[42], s[46] );
d1[21] = _mm_unpackhi_epi64( s[42], s[46] );
d0[22] = _mm_unpacklo_epi64( s[41], s[45] );
d0[23] = _mm_unpackhi_epi64( s[41], s[45] );
d1[22] = _mm_unpacklo_epi64( s[43], s[47] );
d1[23] = _mm_unpackhi_epi64( s[43], s[47] );
d0[24] = _mm_unpacklo_epi64( s[48], s[52] );
d0[25] = _mm_unpackhi_epi64( s[48], s[52] );
d1[24] = _mm_unpacklo_epi64( s[50], s[54] );
d1[25] = _mm_unpackhi_epi64( s[50], s[54] );
d0[26] = _mm_unpacklo_epi64( s[49], s[53] );
d0[27] = _mm_unpackhi_epi64( s[49], s[53] );
d1[26] = _mm_unpacklo_epi64( s[51], s[55] );
d1[27] = _mm_unpackhi_epi64( s[51], s[55] );
d0[28] = _mm_unpacklo_epi64( s[56], s[60] );
d0[29] = _mm_unpackhi_epi64( s[56], s[60] );
d1[28] = _mm_unpacklo_epi64( s[58], s[62] );
d1[29] = _mm_unpackhi_epi64( s[58], s[62] );
d0[30] = _mm_unpacklo_epi64( s[57], s[61] );
d0[31] = _mm_unpackhi_epi64( s[57], s[61] );
d1[30] = _mm_unpacklo_epi64( s[59], s[63] );
d1[31] = _mm_unpackhi_epi64( s[59], s[63] );
}
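// Typical use (illustrative sketch, buffer names are hypothetical): split an
// 8 lane 64 bit interleaved state into two 4 lane 128 bit interleaved buffers,
// e.g. before 4 way Luffa, Cube or SIMD:
//
//    rintrlv_8x64_4x128( vhashA, vhashB, vhash, 512 );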
// 8x64 -> 2x256
static inline void rintrlv_8x64_2x256( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src, const int bit_len )
{
__m128i *d0 = (__m128i*)dst0;
__m128i *d1 = (__m128i*)dst1;
__m128i *d2 = (__m128i*)dst2;
__m128i *d3 = (__m128i*)dst3;
const __m128i* s = (const __m128i*)src;
d0[ 0] = _mm_unpacklo_epi64( s[ 0], s[ 4] );
d1[ 0] = _mm_unpackhi_epi64( s[ 0], s[ 4] );
d2[ 0] = _mm_unpacklo_epi64( s[ 1], s[ 5] );
d3[ 0] = _mm_unpackhi_epi64( s[ 1], s[ 5] );
d0[ 1] = _mm_unpacklo_epi64( s[ 2], s[ 6] );
d1[ 1] = _mm_unpackhi_epi64( s[ 2], s[ 6] );
d2[ 1] = _mm_unpacklo_epi64( s[ 3], s[ 7] );
d3[ 1] = _mm_unpackhi_epi64( s[ 3], s[ 7] );
d0[ 2] = _mm_unpacklo_epi64( s[ 8], s[12] );
d1[ 2] = _mm_unpackhi_epi64( s[ 8], s[12] );
d2[ 2] = _mm_unpacklo_epi64( s[ 9], s[13] );
d3[ 2] = _mm_unpackhi_epi64( s[ 9], s[13] );
d0[ 3] = _mm_unpacklo_epi64( s[10], s[14] );
d1[ 3] = _mm_unpackhi_epi64( s[10], s[14] );
d2[ 3] = _mm_unpacklo_epi64( s[11], s[15] );
d3[ 3] = _mm_unpackhi_epi64( s[11], s[15] );
if ( bit_len <= 256 ) return;
d0[ 4] = _mm_unpacklo_epi64( s[16], s[20] );
d1[ 4] = _mm_unpackhi_epi64( s[16], s[20] );
d2[ 4] = _mm_unpacklo_epi64( s[17], s[21] );
d3[ 4] = _mm_unpackhi_epi64( s[17], s[21] );
d0[ 5] = _mm_unpacklo_epi64( s[18], s[22] );
d1[ 5] = _mm_unpackhi_epi64( s[18], s[22] );
d2[ 5] = _mm_unpacklo_epi64( s[19], s[23] );
d3[ 5] = _mm_unpackhi_epi64( s[19], s[23] );
d0[ 6] = _mm_unpacklo_epi64( s[24], s[28] );
d1[ 6] = _mm_unpackhi_epi64( s[24], s[28] );
d2[ 6] = _mm_unpacklo_epi64( s[25], s[29] );
d3[ 6] = _mm_unpackhi_epi64( s[25], s[29] );
d0[ 7] = _mm_unpacklo_epi64( s[26], s[30] );
d1[ 7] = _mm_unpackhi_epi64( s[26], s[30] );
d2[ 7] = _mm_unpacklo_epi64( s[27], s[31] );
d3[ 7] = _mm_unpackhi_epi64( s[27], s[31] );
if ( bit_len <= 512 ) return;
d0[ 8] = _mm_unpacklo_epi64( s[32], s[36] );
d1[ 8] = _mm_unpackhi_epi64( s[32], s[36] );
d2[ 8] = _mm_unpacklo_epi64( s[33], s[37] );
d3[ 8] = _mm_unpackhi_epi64( s[33], s[37] );
d0[ 9] = _mm_unpacklo_epi64( s[34], s[38] );
d1[ 9] = _mm_unpackhi_epi64( s[34], s[38] );
d2[ 9] = _mm_unpacklo_epi64( s[35], s[39] );
d3[ 9] = _mm_unpackhi_epi64( s[35], s[39] );
d0[10] = _mm_unpacklo_epi64( s[40], s[44] );
d1[10] = _mm_unpackhi_epi64( s[40], s[44] );
d2[10] = _mm_unpacklo_epi64( s[41], s[45] );
d3[10] = _mm_unpackhi_epi64( s[41], s[45] );
d0[11] = _mm_unpacklo_epi64( s[42], s[46] );
d1[11] = _mm_unpackhi_epi64( s[42], s[46] );
d2[11] = _mm_unpacklo_epi64( s[43], s[47] );
d3[11] = _mm_unpackhi_epi64( s[43], s[47] );
d0[12] = _mm_unpacklo_epi64( s[48], s[52] );
d1[12] = _mm_unpackhi_epi64( s[48], s[52] );
d2[12] = _mm_unpacklo_epi64( s[49], s[53] );
d3[12] = _mm_unpackhi_epi64( s[49], s[53] );
d0[13] = _mm_unpacklo_epi64( s[50], s[54] );
d1[13] = _mm_unpackhi_epi64( s[50], s[54] );
d2[13] = _mm_unpacklo_epi64( s[51], s[55] );
d3[13] = _mm_unpackhi_epi64( s[51], s[55] );
d0[14] = _mm_unpacklo_epi64( s[56], s[60] );
d1[14] = _mm_unpackhi_epi64( s[56], s[60] );
d2[14] = _mm_unpacklo_epi64( s[57], s[61] );
d3[14] = _mm_unpackhi_epi64( s[57], s[61] );
d0[15] = _mm_unpacklo_epi64( s[58], s[62] );
d1[15] = _mm_unpackhi_epi64( s[58], s[62] );
d2[15] = _mm_unpacklo_epi64( s[59], s[63] );
d3[15] = _mm_unpackhi_epi64( s[59], s[63] );
}
// 2x256 -> 8x64
static inline void rintrlv_2x256_8x64( void *dst, const void *src0,
const void *src1, const void *src2, const void *src3, const int bit_len )
{
__m128i *d = (__m128i*)dst;
__m128i *s0 = (__m128i*)src0;
__m128i *s1 = (__m128i*)src1;
__m128i *s2 = (__m128i*)src2;
__m128i *s3 = (__m128i*)src3;
d[ 0] = _mm_unpacklo_epi64( s0[0], s0[2] );
d[ 1] = _mm_unpacklo_epi64( s1[0], s1[2] );
d[ 2] = _mm_unpacklo_epi64( s2[0], s2[2] );
d[ 3] = _mm_unpacklo_epi64( s3[0], s3[2] );
d[ 4] = _mm_unpackhi_epi64( s0[0], s0[2] );
d[ 5] = _mm_unpackhi_epi64( s1[0], s1[2] );
d[ 6] = _mm_unpackhi_epi64( s2[0], s2[2] );
d[ 7] = _mm_unpackhi_epi64( s3[0], s3[2] );
d[ 8] = _mm_unpacklo_epi64( s0[1], s0[3] );
d[ 9] = _mm_unpacklo_epi64( s1[1], s1[3] );
d[10] = _mm_unpacklo_epi64( s2[1], s2[3] );
d[11] = _mm_unpacklo_epi64( s3[1], s3[3] );
d[12] = _mm_unpackhi_epi64( s0[1], s0[3] );
d[13] = _mm_unpackhi_epi64( s1[1], s1[3] );
d[14] = _mm_unpackhi_epi64( s2[1], s2[3] );
d[15] = _mm_unpackhi_epi64( s3[1], s3[3] );
if ( bit_len <= 256 ) return;
d[16] = _mm_unpacklo_epi64( s0[4], s0[6] );
d[17] = _mm_unpacklo_epi64( s1[4], s1[6] );
d[18] = _mm_unpacklo_epi64( s2[4], s2[6] );
d[19] = _mm_unpacklo_epi64( s3[4], s3[6] );
d[20] = _mm_unpackhi_epi64( s0[4], s0[6] );
d[21] = _mm_unpackhi_epi64( s1[4], s1[6] );
d[22] = _mm_unpackhi_epi64( s2[4], s2[6] );
d[23] = _mm_unpackhi_epi64( s3[4], s3[6] );
d[24] = _mm_unpacklo_epi64( s0[5], s0[7] );
d[25] = _mm_unpacklo_epi64( s1[5], s1[7] );
d[26] = _mm_unpacklo_epi64( s2[5], s2[7] );
d[27] = _mm_unpacklo_epi64( s3[5], s3[7] );
d[28] = _mm_unpackhi_epi64( s0[5], s0[7] );
d[29] = _mm_unpackhi_epi64( s1[5], s1[7] );
d[30] = _mm_unpackhi_epi64( s2[5], s2[7] );
d[31] = _mm_unpackhi_epi64( s3[5], s3[7] );
if ( bit_len <= 512 ) return;
d[32] = _mm_unpacklo_epi64( s0[8], s0[10] );
d[33] = _mm_unpacklo_epi64( s1[8], s1[10] );
d[34] = _mm_unpacklo_epi64( s2[8], s2[10] );
d[35] = _mm_unpacklo_epi64( s3[8], s3[10] );
d[36] = _mm_unpackhi_epi64( s0[8], s0[10] );
d[37] = _mm_unpackhi_epi64( s1[8], s1[10] );
d[38] = _mm_unpackhi_epi64( s2[8], s2[10] );
d[39] = _mm_unpackhi_epi64( s3[8], s3[10] );
d[40] = _mm_unpacklo_epi64( s0[9], s0[11] );
d[41] = _mm_unpacklo_epi64( s1[9], s1[11] );
d[42] = _mm_unpacklo_epi64( s2[9], s2[11] );
d[43] = _mm_unpacklo_epi64( s3[9], s3[11] );
d[44] = _mm_unpackhi_epi64( s0[9], s0[11] );
d[45] = _mm_unpackhi_epi64( s1[9], s1[11] );
d[46] = _mm_unpackhi_epi64( s2[9], s2[11] );
d[47] = _mm_unpackhi_epi64( s3[9], s3[11] );
d[48] = _mm_unpacklo_epi64( s0[12], s0[14] );
d[49] = _mm_unpacklo_epi64( s1[12], s1[14] );
d[50] = _mm_unpacklo_epi64( s2[12], s2[14] );
d[51] = _mm_unpacklo_epi64( s3[12], s3[14] );
d[52] = _mm_unpackhi_epi64( s0[12], s0[14] );
d[53] = _mm_unpackhi_epi64( s1[12], s1[14] );
d[54] = _mm_unpackhi_epi64( s2[12], s2[14] );
d[55] = _mm_unpackhi_epi64( s3[12], s3[14] );
d[56] = _mm_unpacklo_epi64( s0[13], s0[15] );
d[57] = _mm_unpacklo_epi64( s1[13], s1[15] );
d[58] = _mm_unpacklo_epi64( s2[13], s2[15] );
d[59] = _mm_unpacklo_epi64( s3[13], s3[15] );
d[60] = _mm_unpackhi_epi64( s0[13], s0[15] );
d[61] = _mm_unpackhi_epi64( s1[13], s1[15] );
d[62] = _mm_unpackhi_epi64( s2[13], s2[15] );
d[63] = _mm_unpackhi_epi64( s3[13], s3[15] );
}
//
// Some functions customized for mining.

View File

@@ -252,7 +252,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#else
#define mm128_ror_64 mm128_ror_var_64
#define mm128_rol_64 mm128_rol_var_64
#define mm128_ror_32 mm128_ror_var_32
@@ -274,6 +273,15 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_ror_1x32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm128_rol_1x32( v ) _mm_shuffle_epi32( v, 0x93 )
// Rotate 16 byte (128 bit) vector by c bytes.
// Less efficient using shift but more versatile. Use only for odd number
// byte rotations. Use shuffle above whenever possible.
#define mm128_ror_x8( v, c ) \
_mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) )
#define mm128_rol_x8( v, c ) \
_mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) )
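// Example (illustrative): rotate right by 3 bytes with the shift form; when
// the count is a multiple of 4 bytes the epi32 shuffle above is cheaper:
//
//    __m128i r = mm128_ror_x8( v, 3 );      // any byte count, 3 instructions
//    __m128i s = mm128_ror_1x32( v );       // 4 byte rotate, 1 instruction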
#if defined (__SSE3__)
// no SSE2 implementation, no current users
@@ -289,17 +297,21 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_rol_1x8( v ) \
_mm_shuffle_epi8( v, m128_const_64( 0x0e0d0c0b0a090807, \
0x060504030201000f ) )
#endif // SSE3
#else // SSE2
// Rotate 16 byte (128 bit) vector by c bytes.
// Less efficient using shift but more versatile. Use only for odd number
// byte rotations. Use shuffle above whenever possible.
#define mm128_bror( v, c ) \
_mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) )
#define mm128_ror_1x16( v ) \
_mm_or_si128( _mm_srli_si128( v, 2 ), _mm_slli_si128( v, 14 ) )
#define mm128_brol( v, c ) \
_mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) )
#define mm128_rol_1x16( v ) \
_mm_or_si128( _mm_slli_si128( v, 2 ), _mm_srli_si128( v, 14 ) )
#define mm128_ror_1x8( v ) \
_mm_or_si128( _mm_srli_si128( v, 1 ), _mm_slli_si128( v, 15 ) )
#define mm128_rol_1x8( v ) \
_mm_or_si128( _mm_slli_si128( v, 1 ), _mm_srli_si128( v, 15 ) )
#endif // SSE3 else SSE2
// Invert vector: {3,2,1,0} -> {0,1,2,3}
#define mm128_invert_32( v ) _mm_shuffle_epi32( v, 0x1b )
@@ -319,19 +331,24 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
//
// Rotate elements within lanes.
#define mm128_swap32_64( v ) _mm_shuffle_epi32( v, 0xb1 )
#define mm128_swap_64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
#define mm128_ror16_64( v ) \
_mm_shuffle_epi8( v, m128_const_64( 0x09080f0e0d0c0b0a, \
0x0100070605040302 ) )
#define mm128_rol64_8( v, c ) \
_mm_or_si128( _mm_slli_epi64( v, ( (c)<<3 ) ), \
_mm_srli_epi64( v, 64 - ( (c)<<3 ) ) )
#define mm128_rol16_64( v ) \
_mm_shuffle_epi8( v, m128_const_64( 0x0d0c0b0a09080f0e, \
0x0504030201000706 ) )
#define mm128_ror64_8( v, c ) \
_mm_or_si128( _mm_srli_epi64( v, ( (c)<<3 ) ), \
_mm_slli_epi64( v, 64 - ( (c)<<3 ) ) )
#define mm128_rol32_8( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, ( (c)<<3 ) ), \
_mm_srli_epi32( v, 32 - ( (c)<<3 ) ) )
#define mm128_ror32_8( v, c ) \
_mm_or_si128( _mm_srli_epi32( v, ( (c)<<3 ) ), \
_mm_slli_epi32( v, 32 - ( (c)<<3 ) ) )
#define mm128_swap16_32( v ) \
_mm_shuffle_epi8( v, m128_const_64( 0x0d0c0f0e09080b0a, \
0x0504070601000302 ) )
//
// Endian byte swap.
@@ -431,64 +448,65 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
// Swap 128 bit vectors.
#define mm128_swap128_256( v1, v2 ) \
#define mm128_swap256_128( v1, v2 ) \
v1 = _mm_xor_si128( v1, v2 ); \
v2 = _mm_xor_si128( v1, v2 ); \
v1 = _mm_xor_si128( v1, v2 );
// Concatenate v1 & v2 and rotate as one 256 bit vector.
#if defined(__SSE4_1__)
#define mm128_ror1x64_256( v1, v2 ) \
#define mm128_ror256_64( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
v1 = _mm_alignr_epi8( v2, v1, 8 ); \
v2 = t; \
} while(0)
#define mm128_rol1x64_256( v1, v2 ) \
#define mm128_rol256_64( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
v2 = _mm_alignr_epi8( v2, v1, 8 ); \
v1 = t; \
} while(0)
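// Worked example (illustrative): viewing v2:v1 as one 256 bit value with v2
// holding the high 128 bits, mm128_ror256_64 rotates the four qwords right by
// one position:
//
//    before: v2 = { B1, B0 }  v1 = { A1, A0 }   ( B1 B0 A1 A0 )
//    after : v2 = { A0, B1 }  v1 = { B0, A1 }   ( A0 B1 B0 A1 )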
#define mm128_ror1x32_256( v1, v2 ) \
#define mm128_ror256_32( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 4 ); \
v1 = _mm_alignr_epi8( v2, v1, 4 ); \
v2 = t; \
} while(0)
#define mm128_rol1x32_256( v1, v2 ) \
#define mm128_rol256_32( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 12 ); \
v2 = _mm_alignr_epi8( v2, v1, 12 ); \
v1 = t; \
} while(0)
#define mm128_ror1x16_256( v1, v2 ) \
#define mm128_ror256_16( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 2 ); \
v1 = _mm_alignr_epi8( v2, v1, 2 ); \
v2 = t; \
} while(0)
#define mm128_rol1x16_256( v1, v2 ) \
#define mm128_rol256_16( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 14 ); \
v2 = _mm_alignr_epi8( v2, v1, 14 ); \
v1 = t; \
} while(0)
#define mm128_ror1x8_256( v1, v2 ) \
#define mm128_ror256_8( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 1 ); \
v1 = _mm_alignr_epi8( v2, v1, 1 ); \
v2 = t; \
} while(0)
#define mm128_rol1x8_256( v1, v2 ) \
#define mm128_rol256_8( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 15 ); \
v2 = _mm_alignr_epi8( v2, v1, 15 ); \
@@ -497,7 +515,7 @@ do { \
#else // SSE2
#define mm128_ror1x64_256( v1, v2 ) \
#define mm128_ror256_64( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 8 ), \
_mm_slli_si128( v2, 8 ) ); \
@@ -506,7 +524,7 @@ do { \
v1 = t; \
} while(0)
#define mm128_rol1x64_256( v1, v2 ) \
#define mm128_rol256_64( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 8 ), \
_mm_srli_si128( v2, 8 ) ); \
@@ -515,7 +533,7 @@ do { \
v1 = t; \
} while(0)
#define mm128_ror1x32_256( v1, v2 ) \
#define mm128_ror256_32( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 4 ), \
_mm_slli_si128( v2, 12 ) ); \
@@ -524,7 +542,7 @@ do { \
v1 = t; \
} while(0)
#define mm128_rol1x32_256( v1, v2 ) \
#define mm128_rol256_32( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 4 ), \
_mm_srli_si128( v2, 12 ) ); \
@@ -533,7 +551,7 @@ do { \
v1 = t; \
} while(0)
#define mm128_ror1x16_256( v1, v2 ) \
#define mm128_ror256_16( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 2 ), \
_mm_slli_si128( v2, 14 ) ); \
@@ -542,7 +560,7 @@ do { \
v1 = t; \
} while(0)
#define mm128_rol1x16_256( v1, v2 ) \
#define mm128_rol256_16( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 2 ), \
_mm_srli_si128( v2, 14 ) ); \
@@ -551,7 +569,7 @@ do { \
v1 = t; \
} while(0)
#define mm128_ror1x8_256( v1, v2 ) \
#define mm128_ror256_8( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 1 ), \
_mm_slli_si128( v2, 15 ) ); \
@@ -560,7 +578,7 @@ do { \
v1 = t; \
} while(0)
#define mm128_rol1x8_256( v1, v2 ) \
#define mm128_rol256_8( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 1 ), \
_mm_srli_si128( v2, 15 ) ); \

View File

@@ -414,99 +414,71 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
//
// Rotate elements within lanes of 256 bit vector.
// Rotate elements within each 128 bit lane of 256 bit vector.
// Swap 64 bit elements in each 128 bit lane.
#define mm256_swap64_128( v ) _mm256_shuffle_epi32( v, 0x4e )
#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e )
// Rotate each 128 bit lane by one 32 bit element.
#define mm256_ror1x32_128( v ) _mm256_shuffle_epi32( v, 0x39 )
#define mm256_rol1x32_128( v ) _mm256_shuffle_epi32( v, 0x93 )
#define mm256_ror128_32( v ) _mm256_shuffle_epi32( v, 0x39 )
#define mm256_ror1x16_128( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x11101f1e1d1c1b1a, 0x1918171615141312, \
0x01000f0e0d0c0b0a, 0x0908070605040302 ) )
#define mm256_rol128_1x32( v ) _mm256_shuffle_epi32( v, 0x93 )
#define mm256_rol1x16_128( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x1d1c1b1a19181716, 0x1514131211101f1e, \
0x0d0c0b0a09080706, 0x0504030201000f0e ) )
#define mm256_ror1x8_128( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x101f1e1d1c1b1a19, 0x1817161514131211, \
0x000f0e0d0c0b0a09, 0x0807060504030201 ) )
#define mm256_rol1x8_128( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x1d1c1b1a19181f1e, 0x1514131211101716, \
0x0d0c0b0a09080f0e, 0x0504030201000706 ) )
// Rotate each 128 bit lane by c bytes.
#define mm256_bror_128( v, c ) \
// Rotate each 128 bit lane by c bytes.
#define mm256_ror128_8( v, c ) \
_mm256_or_si256( _mm256_bsrli_epi128( v, c ), \
_mm256_bslli_epi128( v, 16-(c) ) )
#define mm256_brol_128( v, c ) \
#define mm256_rol128_8( v, c ) \
_mm256_or_si256( _mm256_bslli_epi128( v, c ), \
_mm256_bsrli_epi128( v, 16-(c) ) )
// Swap 32 bit elements in each 64 bit lane
#define mm256_swap32_64( v ) _mm256_shuffle_epi32( v, 0xb1 )
// Rotate elements in each 64 bit lane
#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 )
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define mm256_rol1x16_64( v ) _mm256_rol_epi64( v, 16 )
#define mm256_ror1x16_64( v ) _mm256_ror_epi64( v, 16 )
#define mm256_rol64_8( v, c ) _mm256_rol_epi64( v, ((c)<<3) )
#define mm256_ror64_8( v, c ) _mm256_ror_epi64( v, ((c)<<3) )
#else
#define mm256_ror1x16_64( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x19181f1e1d1c1b1a, 0x1110171615141312, \
0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
#define mm256_rol64_8( v, c ) \
_mm256_or_si256( _mm256_slli_epi64( v, ( (c)<<3 ) ), \
_mm256_srli_epi64( v, 64 - ( (c)<<3 ) ) )
#define mm256_ror64_8( v, c ) \
_mm256_or_si256( _mm256_srli_epi64( v, ( (c)<<3 ) ), \
_mm256_slli_epi64( v, 64 - ( (c)<<3 ) ) )
#define mm256_rol1x16_64( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x1d1c1b1a19181f1e, 0x1514131211101716, \
0x0d0c0b0a09080f0e, 0x0504030201000706 ) )
#endif
#define mm256_ror1x8_64( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x181f1e1d1c1b1a19, 0x1017161514131211, \
0x080f0e0d0c0b0a09, 0x0007060504030201 ) )
#define mm256_rol1x8_64( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x1e1d1c1b1a19181f, 0x1615141312111017, \
0x0e0d0c0b0a09080f, 0x0605040302010007 ) )
#define mm256_ror3x8_64( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x1a19181f1e1d1c1b, 0x1211101716151413, \
0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
#define mm256_rol3x8_64( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x1c1b1a19181f1e1d, 0x1413121110171615, \
0x0c0b0a09080f0e0d, 0x0403020100070605 ) )
// Swap 16 bit elements in each 32 bit lane
// Rotate elements in each 32 bit lane
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define mm256_swap16_32( v ) _mm256_rol_epi32( v, 16 )
#define mm256_swap32_16( v ) _mm256_rol_epi32( v, 16 )
#define mm256_rol32_8( v ) _mm256_rol_epi32( v, 8 )
#define mm256_ror32_8( v ) _mm256_ror_epi32( v, 8 )
#else
#define mm256_swap16_32( v ) \
_mm256_shuffle_epi8( v, \
m256_const_64( 0x1b1a19181f1e1d1c, 0x1312111017161514, \
0x0b0a09080f0e0d0c, 0x0302010007060504 ) )
#define mm256_swap32_16( v ) \
_mm256_or_si256( _mm256_slli_epi32( v, 16 ), \
_mm256_srli_epi32( v, 16 ) )
#define mm256_rol32_8( v ) \
_mm256_or_si256( _mm256_slli_epi32( v, 8 ), \
_mm256_srli_epi32( v, 8 ) )
#define mm256_ror32_8( v ) \
_mm256_or_si256( _mm256_srli_epi32( v, 8 ), \
_mm256_slli_epi32( v, 8 ) )
#endif
//
// Swap bytes in vector elements, endian bswap.
#define mm256_bswap_64( v ) \
@@ -565,19 +537,19 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
// _mm256_alignr_epi64/32 are only available with AVX512 but AVX512 also
// makes these macros unnecessary.
#define mm256_swap256_512 (v1, v2) \
v1 = _mm256_xor_si256(v1, v2); \
v2 = _mm256_xor_si256(v1, v2); \
v1 = _mm256_xor_si256(v1, v2);
#define mm256_swap512_256( v1, v2 ) \
v1 = _mm256_xor_si256( v1, v2 ); \
v2 = _mm256_xor_si256( v1, v2 ); \
v1 = _mm256_xor_si256( v1, v2 );
#define mm256_ror1x128_512( v1, v2 ) \
#define mm256_ror512_128( v1, v2 ) \
do { \
__m256i t = _mm256_permute2x128_si256( v1, v2, 0x03 ); \
v1 = _mm256_permute2x128_si256( v2, v1, 0x21 ); \
v2 = t; \
} while(0)
#define mm256_rol1x128_512( v1, v2 ) \
#define mm256_rol512_128( v1, v2 ) \
do { \
__m256i t = _mm256_permute2x128_si256( v1, v2, 0x03 ); \
v2 = _mm256_permute2x128_si256( v2, v1, 0x21 ); \

View File

@@ -15,13 +15,13 @@
// AVX512 intrinsics have a few changes from previous conventions.
//
// Some instructions like cmp and blend use the mask registers now instead
// of a vector mask.
// The cmp instruction now returns a bitmask instead of a vector mask.
// This eliminates the need for the blendv instruction.
//
// The new rotate instructions require the count to be only an 8 bit
// immediate value. The documentation is the same as for shift and
// says variables are allowed; this is suspected to be a compiler issue
// but it still happens in GCC 9.
// The new rotate instructions require the count to be an 8 bit
// immediate value only. Compilation fails if a variable is used.
// The documentation reads the same as for shift, which does accept
// variables.
//
// _mm512_permutex_epi64 only shuffles within 256 bit lanes. Permute
// usually shuffles across all lanes.
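// Illustrative sketch (assumed example, not from the original source): the
// compare returns a bitmask in a mask register which feeds the masked blend,
// so no blendv is needed. Requires AVX512F; the function name is hypothetical.
// Note that _mm512_rol_epi64( v, 8 ) compiles while a variable count does not.

static inline __m512i m512_select_ge_example( __m512i a, __m512i b )
{
   __mmask16 m = _mm512_cmpge_epu32_mask( a, b );  // bitmask, not a vector mask
   return _mm512_mask_blend_epi32( m, b, a );      // a where a >= b, else b
}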
@@ -109,6 +109,11 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
#define m512_const2_64( i1, i0 ) \
m512_const1_128( m128_const_64( i1, i0 ) )
#define m512_const2_32( i1, i0 ) \
m512_const1_64( ( ( ( (uint64_t)(i1) << 32 ) ) \
| ( (uint64_t)(i0) & 0xffffffff ) ) )
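// e.g. (illustrative) m512_const2_32( 0x00000000, 0x000003ff ) yields the
// 64 bit constant 0x00000000000003ff broadcast to every 64 bit lane.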
static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
const uint64_t i1, const uint64_t i0 )
{
@@ -265,7 +270,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
0x28292a2b2c2d2e2f, 0x2021222324252627, \
0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ))
0x08090a0b0c0d0e0f, 0x0001020304050607 ) )
#define mm512_bswap_32( v ) \
_mm512_shuffle_epi8( v, \
@@ -304,8 +309,8 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
{ \
__m512i ctl = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
0x2c2d2e2f28292a2b, 0x2425262720212223, \
0x0c0d0e0f08090a0b, 0x0405060700010203, \
0x1c1d1e1f18191a1b, 0x1415161710111213 ); \
0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
@@ -320,8 +325,10 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
//
// Rotate elements in 512 bit vector.
#define mm512_swap_256( v ) _mm512_alignr_epi64( v, v, 4 )
// 1x64 notation used to distinguish from bit rotation.
#define mm512_ror_1x128( v ) _mm512_alignr_epi64( v, v, 2 )
#define mm512_rol_1x128( v ) _mm512_alignr_epi64( v, v, 6 )
@@ -401,51 +408,58 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
//
// Rotate elements within 256 bit lanes of 512 bit vector.
// Rename these for consistency. Element size is always last.
// mm<vectorsize>_<op><lanesize>_<elementsize>
// Swap hi & lo 128 bits in each 256 bit lane
#define mm512_swap128_256( v ) _mm512_permutex_epi64( v, 0x4e )
#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e )
// Rotate 256 bit lanes by one 64 bit element
#define mm512_ror1x64_256( v ) _mm512_permutex_epi64( v, 0x39 )
#define mm512_rol1x64_256( v ) _mm512_permutex_epi64( v, 0x93 )
#define mm512_ror256_64( v ) _mm512_permutex_epi64( v, 0x39 )
#define mm512_rol256_64( v ) _mm512_permutex_epi64( v, 0x93 )
// Rotate 256 bit lanes by one 32 bit element
#define mm512_ror1x32_256( v ) \
#define mm512_ror256_32( v ) \
_mm512_permutexvar_epi32( m512_const_64( \
0x000000080000000f, 0x0000000e0000000d, \
0x0000000c0000000b, 0x0000000a00000009, \
0x0000000000000007, 0x0000000600000005, \
0x0000000400000003, 0x0000000200000001 ), v )
#define mm512_rol1x32_256( v ) \
#define mm512_rol256_32( v ) \
_mm512_permutexvar_epi32( m512_const_64( \
0x0000000e0000000d, 0x0000000c0000000b, \
0x0000000a00000009, 0x000000080000000f, \
0x0000000600000005, 0x0000000400000003, \
0x0000000200000001, 0x0000000000000007 ), v )
#define mm512_ror1x16_256( v ) \
#define mm512_ror256_16( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x00100001001e001d, 0x001c001b001a0019, \
0x0018001700160015, 0x0014001300120011, \
0x0000000f000e000d, 0x000c000b000a0009, \
0x0008000700060005, 0x0004000300020001 ), v )
#define mm512_rol1x16_256( v ) \
#define mm512_rol256_16( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x001e001d001c001b, 0x001a001900180017, \
0x0016001500140013, 0x001200110010001f, \
0x000e000d000c000b, 0x000a000900080007, \
0x0006000500040003, 0x000200010000000f ), v )
#define mm512_ror1x8_256( v ) \
#define mm512_ror256_8( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \
0x203f3e3d3c3b3a39, 0x3837363534333231, \
0x302f2e2d2c2b2a29, 0x2827262524232221, \
0x001f1e1d1c1b1a19, 0x1817161514131211, \
0x100f0e0d0c0b0a09, 0x0807060504030201 ), v )
#define mm512_rol1x8_256( v ) \
#define mm512_rol256_8( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \
0x3e3d3c3b3a393837, 0x363534333231302f, \
0x2e2d2c2b2a292827, 0x262524232221203f, \
@@ -456,45 +470,19 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
// Rotate elements within 128 bit lanes of 512 bit vector.
// Swap hi & lo 64 bits in each 128 bit lane
#define mm512_swap64_128( v ) _mm512_shuffle_epi32( v, 0x4e )
#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
// Rotate 128 bit lanes by one 32 bit element
#define mm512_ror1x32_128( v ) _mm512_shuffle_epi32( v, 0x39 )
#define mm512_rol1x32_128( v ) _mm512_shuffle_epi32( v, 0x93 )
#define mm512_ror128_32( v ) _mm512_shuffle_epi32( v, 0x39 )
#define mm512_rol128_32( v ) _mm512_shuffle_epi32( v, 0x93 )
#define mm512_ror1x16_128( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x0018001f001e001d, 0x001c001b001a0019, \
0x0010001700160015, 0x0014001300120011, \
0x0008000f000e000d, 0x000c000b000a0009, \
0x0000000700060005, 0x0004000300020001 ), v )
#define mm512_rol1x16_128( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x001e001d001c001b, 0x001a00190018001f, \
0x0016001500140013, 0x0012001100100017, \
0x000e000d000c000b, 0x000a00090008000f, \
0x0006000500040003, 0x0002000100000007 ), v )
#define mm512_ror1x8_128( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \
0x303f3e3d3c3b3a39, 0x3837363534333231, \
0x202f2e2d2c2b2a29, 0x2827262524232221, \
0x101f1e1d1c1b1a19, 0x1817161514131211, \
0x000f0e0d0c0b0a09, 0x0807060504030201 ) )
#define mm512_rol1x8_128( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \
0x3e3d3c3b3a393837, 0x363534333231303f, \
0x2e2d2c2b2a292827, 0x262524232221202f, \
0x1e1d1c1b1a191817, 0x161514131211101f, \
0x0e0d0c0b0a090807, 0x060504030201000f ) )
// Rotate 128 bit lanes by c bytes.
#define mm512_bror_128( v, c ) \
// Rotate 128 bit lanes by c bytes, faster than building that monstrous
// constant above.
#define mm512_ror128_8( v, c ) \
_mm512_or_si512( _mm512_bsrli_epi128( v, c ), \
_mm512_bslli_epi128( v, 16-(c) ) )
#define mm512_brol_128( v, c ) \
#define mm512_rol128_8( v, c ) \
_mm512_or_si512( _mm512_bslli_epi128( v, c ), \
_mm512_bsrli_epi128( v, 16-(c) ) )
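// Example (illustrative): rotate every 128 bit lane right by 2 bytes; for
// multiples of 4 bytes prefer the mm512_ror128_32 shuffle above:
//
//    w = mm512_ror128_8( w, 2 );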
@@ -502,75 +490,23 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
//
// Rotate elements within 64 bit lanes.
#define mm512_rol64_x8( v, c ) _mm512_rol_epi64( v, ((c)<<3) )
#define mm512_ror64_x8( v, c ) _mm512_ror_epi64( v, ((c)<<3) )
// Swap 32 bit elements in each 64 bit lane
#define mm512_swap32_64( v ) _mm512_shuffle_epi32( v, 0xb1 )
#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 )
// Rotate each 64 bit lane by one 16 bit element.
#define mm512_ror1x16_64( v ) _mm512_ror_epi64( v, 16 )
#define mm512_rol1x16_64( v ) _mm512_rol_epi64( v, 16 )
#define mm512_ror1x8_64( v ) _mm512_ror_epi64( v, 8 )
#define mm512_rol1x8_64( v ) _mm512_rol_epi64( v, 8 )
/*
#define mm512_ror1x16_64( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x001c001f001e001d, 0x0018001b001a0019, \
0x0014001700160015, 0x0010001300120011, \
0x000c000f000e000d, 0x0008000b000a0009, \
0x0004000700060005, 0x0000000300020001, v )
#define mm512_rol1x16_64( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x001e001d001c001f, 0x001a00190018001b, \
0x0016001500140017, 0x0012001100100013, \
0x000e000d000c000f, 0x000a00090008000b, \
0x0006000500040007, 0x0002000100000003, v )
// Rotate each 64 bit lane by one byte.
#define mm512_ror1x8_64( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \
0x383F3E3D3C3B3A39, 0x3037363534333231, \
0x282F2E2D2C2B2A29, 0x2027262524232221, \
0x181F1E1D1C1B1A19, 0x1017161514131211, \
0x080F0E0D0C0B0A09, 0x0007060504030201 ) )
#define mm512_rol1x8_64( v ) \
_mm512_shuffle( v, m512_const_64( \
0x3E3D3C3B3A39383F, 0x3635343332313037, \
0x2E2D2C2B2A29282F, 0x2625242322212027, \
0x1E1D1C1B1A19181F, 0x1615141312111017, \
0x0E0D0C0B0A09080F, 0x0605040302010007 ) )
*/
#define mm512_ror64_16( v ) _mm512_ror_epi64( v, 16 )
#define mm512_rol64_16( v ) _mm512_rol_epi64( v, 16 )
#define mm512_ror64_8( v ) _mm512_ror_epi64( v, 8 )
#define mm512_rol64_8( v ) _mm512_rol_epi64( v, 8 )
//
// Rotate elements within 32 bit lanes.
#define mm512_swap16_32( v ) _mm512_ror_epi32( v, 16 )
#define mm512_ror1x8_32( v ) _mm512_ror_epi32( v, 8 )
#define mm512_rol1x8_32( v ) _mm512_rol_epi32( v, 8 )
/*
#define mm512_swap16_32( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x001e001f001c001d, 0x001a001b00180019, \
0x0016001700140015, 0x0012001300100011, \
0x000e000f000c000d, 0x000a000b00080009, \
0x0006000700040005, 0x0002000300000001 ), v )
#define mm512_ror1x8_32( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \
0x3C3F3E3D383B3A39, 0x3437363530333231, \
0x2C2F2E2D282B2A29, 0x2427262520232221, \
0x1C1F1E1D181B1A19, 0x1417161510131211, \
0x0C0F0E0D080B0A09, 0x0407060500030201 ))
#define mm512_rol1x8_32( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \
0x3E3D3C3F3A39383B, 0x3635343732313033, \
0x2E2D2C2F2A29282B, 0x2625242722212023, \
0x1E1D1C1F1A19181B, 0x1615141712111013, \
0x0E0D0C0F0A09080B, 0x0605040702010003 ) )
*/
#define mm512_rol32_x8( v, c )   _mm512_rol_epi32( v, ((c)<<3) )
#define mm512_ror32_x8( v, c )   _mm512_ror_epi32( v, ((c)<<3) )
//
@@ -579,61 +515,61 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
// These can all be done with 2 permutex2var instructions but they are
// slower than either xor or alignr and require AVX512VBMI.
#define mm512_swap512_1024(v1, v2) \
#define mm512_swap1024_512(v1, v2) \
v1 = _mm512_xor_si512(v1, v2); \
v2 = _mm512_xor_si512(v1, v2); \
v1 = _mm512_xor_si512(v1, v2);
#define mm512_ror1x256_1024( v1, v2 ) \
#define mm512_ror1024_256( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
v1 = _mm512_alignr_epi64( v2, v1, 4 ); \
v2 = t; \
} while(0)
#define mm512_rol1x256_1024( v1, v2 ) \
#define mm512_rol1024_256( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
v2 = _mm512_alignr_epi64( v2, v1, 4 ); \
v1 = t; \
} while(0)
#define mm512_ror1x128_1024( v1, v2 ) \
#define mm512_ror1024_128( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 2 ); \
v1 = _mm512_alignr_epi64( v2, v1, 2 ); \
v2 = t; \
} while(0)
#define mm512_rol1x128_1024( v1, v2 ) \
#define mm512_rol1024_128( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 6 ); \
v2 = _mm512_alignr_epi64( v2, v1, 6 ); \
v1 = t; \
} while(0)
#define mm512_ror1x64_1024( v1, v2 ) \
#define mm512_ror1024_64( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 1 ); \
v1 = _mm512_alignr_epi64( v2, v1, 1 ); \
v2 = t; \
} while(0)
#define mm512_rol1x64_1024( v1, v2 ) \
#define mm512_rol1024_64( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi64( v1, v2, 7 ); \
v2 = _mm512_alignr_epi64( v2, v1, 7 ); \
v1 = t; \
} while(0)
#define mm512_ror1x32_1024( v1, v2 ) \
#define mm512_ror1024_32( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi32( v1, v2, 1 ); \
v1 = _mm512_alignr_epi32( v2, v1, 1 ); \
v2 = t; \
} while(0)
#define mm512_rol1x32_1024( v1, v2 ) \
#define mm512_rol1024_32( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi32( v1, v2, 15 ); \
v2 = _mm512_alignr_epi32( v2, v1, 15 ); \