v3.10.4

2025-09-17 23:44:27 +00:00 · 2019-12-17 00:57:35 -05:00
51 changed files with 5473 additions and 911 deletions
--- a/71
+++ b/71
@@ -1,12 +1,14 @@
-Requirements:
+1. Requirements:
 ---------------
 Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
 supported.
 64 bit Linux operating system. Apple is not supported.
-Building on linux prerequisites:
+2. Building on linux prerequisites:
 -----------------------------------
 It is assumed users know how to install packages on their system and
 are able to compile standard source packages. This is basic Linux and
@@ -20,41 +22,74 @@ http://askubuntu.com/questions/457526/how-to-install-cpuminer-in-ubuntu
 Install any additional dependencies needed by cpuminer-opt. The list below
 are some of the ones that may not be in the default install and need to
-be installed manually. There may be others, read the error messages they
+be installed manually. There may be others, read the compiler error messages,
-will give a clue as to the missing package.
+they will give a clue as to the missing package.
 The following command should install everything you need on Debian based
 distributions such as Ubuntu. Fedora and other distributions may have similar
-but different package names.
+but different package names. 
-sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev
+$ sudo apt-get install build-essential automake libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev git
 SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
-openssl 1.1.0e or higher. Add one of the following, depending on the
+openssl 1.1.0e or higher. Add one of the following to CFLAGS for SHA
-compiler version, to CFLAGS:
+support depending on your CPU and compiler version:
-"-march=native" or "-march=znver1" or "-msha".
+
 "-march=native" is always the best choice
 "-march=znver1" for Ryzen 1000 & 2000 series, znver2 for 3000.
 "-msha"  Add SHA to other tuning options
 Additional instructions for static compilation can be found here:
 https://lxadm.com/Static_compilation_of_cpuminer
 Static builds should only be considered in a homogeneous HW and SW environment.
 Local builds will always have the best performance and compatibility.
-Extract cpuminer source.
+3. Download cpuminer-opt
 ------------------------
-tar xvzf cpuminer-opt-x.y.z.tar.gz
+Download the source code for the latest release from the official repository.
 cd cpuminer-opt-x.y.z
-Run ./build.sh to build on Linux or execute the following commands.
+https://github.com/JayDDee/cpuminer-opt/releases
-./autogen.sh
+Extract the source code.
 CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
 make
-Start mining.
+$ tar xvzf cpuminer-opt-x.y.z.tar.gz
 Alternatively it can be cloned from git.
 $ git clone https://github.com/JayDDee/cpuminer-opt.git
 4. Build cpuminer-opt
 ---------------------
 It is recommended to build with default options; this will usually
 produce the best results.
 $ ./build.sh to build on Linux or execute the following commands.
 or 
 $ ./autogen.sh
 $ CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
 $ make -j n
 n is the number of threads.
 5. Start mining.
 ----------------
 $ ./cpuminer -a algo -o url -u username -p password
 ./cpuminer -a algo -o url -u username -p password
 Windows
 -------
 See also INSTALL_WINDOWS
 The following procedure is obsolete and uses an old compiler.
 Precompiled Windows binaries are built on a Linux host using Mingw
 with a more recent compiler than the following Windows hosted procedure.
--- a/Makefile.am
+++ b/Makefile.am
@@ -124,6 +124,8 @@ cpuminer_SOURCES = \
  algo/luffa/luffa-hash-2way.c \
  algo/lyra2/lyra2.c \
  algo/lyra2/sponge.c \
  algo/lyra2/sponge-2way.c \
  algo/lyra2/lyra2-hash-2way.c \
  algo/lyra2/lyra2-gate.c \
  algo/lyra2/lyra2rev2.c \
  algo/lyra2/lyra2rev2-4way.c \
--- a/13
+++ b/13
@@ -1,6 +1,8 @@
 cpuminer-opt is a console program run from the command line using the
 keyboard, not the mouse.
 See also README.md for list of supported algorithms,
 Security warning
 ----------------
@@ -31,7 +33,16 @@ not supported. FreeBSD YMMV.
 Change Log
 ----------
-v3.10.2
+v3.10.4
 AVX512 for x16r, x16rv2, x16rt, x16s, x16rt-veil (veil).
 v3.10.3
 AVX512 for x12, x13, x14, x15.
 Fixed x12 AVX2 invalid shares.
 v3.10.2
 AVX512 added for bmw512, c11, phi1612 (phi), qubit, skunk, x11, x11gost (sib).
 Fixed c11 AVX2 invalid shares.
--- a/algo/argon2/argon2d/blake2/blamka-round-opt.h
+++ b/algo/argon2/argon2d/blake2/blamka-round-opt.h
@@ -184,10 +184,10 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
 #include <immintrin.h>
-#define  rotr32  mm256_swap32_64
+#define  rotr32( x )  mm256_ror_64( x, 32 )
-#define  rotr24  mm256_ror3x8_64
+#define  rotr24( x )  mm256_ror_64( x, 24 )
-#define  rotr16  mm256_ror1x16_64
+#define  rotr16( x )  mm256_ror_64( x, 16 )
-#define  rotr63( x ) mm256_rol_64( x, 1 )
+#define  rotr63( x )  mm256_rol_64( x,  1 )
 //#define rotr32(x)   _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
 //#define rotr24(x)   _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
--- a/algo/blake/blake-hash-4way.h
+++ b/algo/blake/blake-hash-4way.h
@@ -70,19 +70,22 @@ typedef struct {
 // Default 14 rounds
 typedef blake_4way_small_context blake256_4way_context;
 void blake256_4way_init(void *ctx);
-void blake256_4way(void *ctx, const void *data, size_t len);
+void blake256_4way_update(void *ctx, const void *data, size_t len);
 #define blake256_4way blake256_4way_update
 void blake256_4way_close(void *ctx, void *dst);
 // 14 rounds, blake, decred
 typedef blake_4way_small_context blake256r14_4way_context;
 void blake256r14_4way_init(void *cc);
-void blake256r14_4way(void *cc, const void *data, size_t len);
+void blake256r14_4way_update(void *cc, const void *data, size_t len);
 #define blake256r14_4way blake256r14_4way_update
 void blake256r14_4way_close(void *cc, void *dst);
 // 8 rounds, blakecoin, vanilla
 typedef blake_4way_small_context blake256r8_4way_context;
 void blake256r8_4way_init(void *cc);
-void blake256r8_4way(void *cc, const void *data, size_t len);
+void blake256r8_4way_update(void *cc, const void *data, size_t len);
 #define blake256r8_4way blake256r8_4way_update
 void blake256r8_4way_close(void *cc, void *dst);
 #ifdef __AVX2__
@@ -100,19 +103,21 @@ typedef struct {
 // Default 14 rounds
 typedef blake_8way_small_context blake256_8way_context;
 void blake256_8way_init(void *cc);
-void blake256_8way(void *cc, const void *data, size_t len);
+void blake256_8way_update(void *cc, const void *data, size_t len);
 #define blake256_8way blake256_8way_update
 void blake256_8way_close(void *cc, void *dst);
 // 14 rounds, blake, decred
 typedef blake_8way_small_context blake256r14_8way_context;
 void blake256r14_8way_init(void *cc);
-void blake256r14_8way(void *cc, const void *data, size_t len);
+void blake256r14_8way_update(void *cc, const void *data, size_t len);
 void blake256r14_8way_close(void *cc, void *dst);
 // 8 rounds, blakecoin, vanilla
 typedef blake_8way_small_context blake256r8_8way_context;
 void blake256r8_8way_init(void *cc);
-void blake256r8_8way(void *cc, const void *data, size_t len);
+void blake256r8_8way_update(void *cc, const void *data, size_t len);
 #define blake256r8_8way blake256r8_8way_update
 void blake256r8_8way_close(void *cc, void *dst);
 // Blake-512 4 way
--- a/algo/blake/blake256-hash-4way.c
+++ b/algo/blake/blake256-hash-4way.c
@@ -634,7 +634,7 @@ do { \
                              m256_const1_64( 0x082EFA98082EFA98 ) ); \
   VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
                              m256_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
-   shuf_bswap32 = m256_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203, \
+   shuf_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
                                 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
   M0 = _mm256_shuffle_epi8( * buf    , shuf_bswap32 ); \
   M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
@@ -1184,7 +1184,7 @@ blake256_16way_update(void *cc, const void *data, size_t len)
 }
 void
-blake256_16way_close_update(void *cc, void *dst)
+blake256_16way_close(void *cc, void *dst)
 {
        blake32_16way_close(cc, 0, 0, dst, 8);
 }
@@ -1259,7 +1259,7 @@ blake256_8way_init(void *cc)
 }
 void
-blake256_8way(void *cc, const void *data, size_t len)
+blake256_8way_update(void *cc, const void *data, size_t len)
 {
        blake32_8way(cc, data, len);
 }
@@ -1279,7 +1279,7 @@ void blake256r14_4way_init(void *cc)
 }
 void
-blake256r14_4way(void *cc, const void *data, size_t len)
+blake256r14_4way_update(void *cc, const void *data, size_t len)
 {
   blake32_4way(cc, data, len);
 }
@@ -1298,7 +1298,7 @@ void blake256r14_8way_init(void *cc)
 }
 void
-blake256r14_8way(void *cc, const void *data, size_t len)
+blake256r14_8way_update(void *cc, const void *data, size_t len)
 {
   blake32_8way(cc, data, len);
 }
@@ -1318,7 +1318,7 @@ void blake256r8_4way_init(void *cc)
 }
 void
-blake256r8_4way(void *cc, const void *data, size_t len)
+blake256r8_4way_update(void *cc, const void *data, size_t len)
 {
   blake32_4way(cc, data, len);
 }
@@ -1337,7 +1337,7 @@ void blake256r8_8way_init(void *cc)
 }
 void
-blake256r8_8way(void *cc, const void *data, size_t len)
+blake256r8_8way_update(void *cc, const void *data, size_t len)
 {
   blake32_8way(cc, data, len);
 }
--- a/algo/cubehash/cube-hash-2way.c
+++ b/algo/cubehash/cube-hash-2way.c
@@ -64,10 +64,10 @@ static void transform_4way( cube_4way_context *sp )
        x1 = _mm512_xor_si512( x1, x5 );
        x2 = _mm512_xor_si512( x2, x6 );
        x3 = _mm512_xor_si512( x3, x7 );
-        x4 = mm512_swap64_128( x4 );
+        x4 = mm512_swap128_64( x4 );
-        x5 = mm512_swap64_128( x5 );
+        x5 = mm512_swap128_64( x5 );
-        x6 = mm512_swap64_128( x6 );
+        x6 = mm512_swap128_64( x6 );
-        x7 = mm512_swap64_128( x7 );
+        x7 = mm512_swap128_64( x7 );
        x4 = _mm512_add_epi32( x0, x4 );
        x5 = _mm512_add_epi32( x1, x5 );
        x6 = _mm512_add_epi32( x2, x6 );
@@ -82,10 +82,10 @@ static void transform_4way( cube_4way_context *sp )
        x1 = _mm512_xor_si512( x1, x5 );
        x2 = _mm512_xor_si512( x2, x6 );
        x3 = _mm512_xor_si512( x3, x7 );
-        x4 = mm512_swap32_64( x4 );
+        x4 = mm512_swap64_32( x4 );
-        x5 = mm512_swap32_64( x5 );
+        x5 = mm512_swap64_32( x5 );
-        x6 = mm512_swap32_64( x6 );
+        x6 = mm512_swap64_32( x6 );
-        x7 = mm512_swap32_64( x7 );
+        x7 = mm512_swap64_32( x7 );
    }
    _mm512_store_si512( (__m512i*)sp->h,     x0 );
@@ -239,10 +239,10 @@ static void transform_2way( cube_2way_context *sp )
        x1 = _mm256_xor_si256( x1, x5 );
        x2 = _mm256_xor_si256( x2, x6 );
        x3 = _mm256_xor_si256( x3, x7 );
-        x4 = mm256_swap64_128( x4 );
+        x4 = mm256_swap128_64( x4 );
-        x5 = mm256_swap64_128( x5 );
+        x5 = mm256_swap128_64( x5 );
-        x6 = mm256_swap64_128( x6 );
+        x6 = mm256_swap128_64( x6 );
-        x7 = mm256_swap64_128( x7 );
+        x7 = mm256_swap128_64( x7 );
        x4 = _mm256_add_epi32( x0, x4 );
        x5 = _mm256_add_epi32( x1, x5 );
        x6 = _mm256_add_epi32( x2, x6 );
@@ -257,10 +257,10 @@ static void transform_2way( cube_2way_context *sp )
        x1 = _mm256_xor_si256( x1, x5 );
        x2 = _mm256_xor_si256( x2, x6 );
        x3 = _mm256_xor_si256( x3, x7 );
-        x4 = mm256_swap32_64( x4 );
+        x4 = mm256_swap64_32( x4 );
-        x5 = mm256_swap32_64( x5 );
+        x5 = mm256_swap64_32( x5 );
-        x6 = mm256_swap32_64( x6 );
+        x6 = mm256_swap64_32( x6 );
-        x7 = mm256_swap32_64( x7 );
+        x7 = mm256_swap64_32( x7 );
    }
    _mm256_store_si256( (__m256i*)sp->h,     x0 );
--- a/algo/cubehash/cubehash_sse2.c
+++ b/algo/cubehash/cubehash_sse2.c
@@ -39,8 +39,8 @@ static void transform( cubehashParam *sp )
        x1 = mm256_rol_32( y0, 7 );
        x0 = _mm256_xor_si256( x0, x2 );
        x1 = _mm256_xor_si256( x1, x3 );
-        x2 = mm256_swap64_128( x2 );
+        x2 = mm256_swap128_64( x2 );
-        x3 = mm256_swap64_128( x3 );
+        x3 = mm256_swap128_64( x3 );
        x2 = _mm256_add_epi32( x0, x2 );
        x3 = _mm256_add_epi32( x1, x3 );
        y0 = mm256_swap_128( x0 );
@@ -49,8 +49,8 @@ static void transform( cubehashParam *sp )
        x1 = mm256_rol_32( y1, 11 );
        x0 = _mm256_xor_si256( x0, x2 );
        x1 = _mm256_xor_si256( x1, x3 );
-        x2 = mm256_swap32_64( x2 );
+        x2 = mm256_swap64_32( x2 );
-        x3 = mm256_swap32_64( x3 );
+        x3 = mm256_swap64_32( x3 );
    }
    _mm256_store_si256( (__m256i*)sp->x,     x0 );
--- a/algo/hamsi/hamsi-hash-4way.c
+++ b/algo/hamsi/hamsi-hash-4way.c
@@ -528,6 +528,346 @@ static const sph_u32 T512[64][16] = {
 	  SPH_C32(0xe7e00a94) }
 };
 // Aliases for the 16-word working state s0..sF used by the round macros.
 // The state is not stored separately: each sX is simply another name for one
 // of the expanded message words m0..m7 or one of the chaining values c0..c7,
 // interleaved in the order listed here. Any code using sX therefore needs
 // both the mN and cN variables in scope.
 #define s0   m0
 #define s1   c0
 #define s2   m1
 #define s3   c1
 #define s4   c2
 #define s5   m2
 #define s6   c3
 #define s7   m3
 #define s8   m4
 #define s9   c4
 #define sA   m5
 #define sB   c5
 #define sC   c6
 #define sD   m6
 #define sE   c7
 #define sF   m7
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
 // Hamsi 8 way 
 // Message expansion for the 8-way Hamsi-512 compression.
 // Loads one vector from *buf and consumes its 64-bit lanes one bit at a
 // time, least-significant bit first (64 iterations, shifting right by one
 // after each). For every bit a mask dm is derived and used to conditionally
 // XOR the next eight 64-bit words of T512 (read as a flat uint64_t array,
 // i.e. one 16 x 32-bit row per iteration) into the accumulators m0..m7.
 // Requires `buf` and m0..m7 to be declared in the invoking scope.
 // NOTE(review): assumes m512_one_64 holds 1 in every 64-bit lane and that
 // mm512_negate_32 negates each 32-bit element, so dm becomes all-ones in
 // lanes whose current bit is set -- confirm against the SIMD utility header.
 #define INPUT_BIG8 \
 do { \
  __m512i db = *buf; \
  const uint64_t *tp = (uint64_t*)&T512[0][0];  \
  m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \
  for ( int u = 0; u < 64; u++ ) \
  { \
     __m512i dm = _mm512_and_si512( db, m512_one_64 ) ; \
     dm = mm512_negate_32( _mm512_or_si512( dm, \
                                          _mm512_slli_epi64( dm, 32 ) ) ); \
     m0 = _mm512_xor_si512( m0, _mm512_and_si512( dm, \
                                          m512_const1_64( tp[0] ) ) ); \
     m1 = _mm512_xor_si512( m1, _mm512_and_si512( dm, \
                                          m512_const1_64( tp[1] ) ) ); \
     m2 = _mm512_xor_si512( m2, _mm512_and_si512( dm, \
                                          m512_const1_64( tp[2] ) ) ); \
     m3 = _mm512_xor_si512( m3, _mm512_and_si512( dm, \
                                          m512_const1_64( tp[3] ) ) ); \
     m4 = _mm512_xor_si512( m4, _mm512_and_si512( dm, \
                                          m512_const1_64( tp[4] ) ) ); \
     m5 = _mm512_xor_si512( m5, _mm512_and_si512( dm, \
                                          m512_const1_64( tp[5] ) ) ); \
     m6 = _mm512_xor_si512( m6, _mm512_and_si512( dm, \
                                          m512_const1_64( tp[6] ) ) ); \
     m7 = _mm512_xor_si512( m7, _mm512_and_si512( dm, \
                                          m512_const1_64( tp[7] ) ) ); \
     tp += 8; \
     db = _mm512_srli_epi64( db, 1 ); \
  } \
 } while (0)
 // Substitution step of the round, applied to four state vectors at once and
 // expressed purely with vector AND / OR / XOR / NOT operations.
 // a, b, c and d are both inputs and outputs: each is read and reassigned
 // several times, so the arguments must be plain, side-effect-free lvalues.
 // The final four assignments permute the results back into a, b, c, d.
 #define SBOX8( a, b, c, d ) \
 do { \
  __m512i t; \
  t = a; \
  a = _mm512_and_si512( a, c ); \
  a = _mm512_xor_si512( a, d ); \
  c = _mm512_xor_si512( c, b ); \
  c = _mm512_xor_si512( c, a ); \
  d = _mm512_or_si512( d, t ); \
  d = _mm512_xor_si512( d, b ); \
  t = _mm512_xor_si512( t, c ); \
  b = d; \
  d = _mm512_or_si512( d, t ); \
  d = _mm512_xor_si512( d, a ); \
  a = _mm512_and_si512( a, b ); \
  t = _mm512_xor_si512( t, a ); \
  b = _mm512_xor_si512( b, d ); \
  b = _mm512_xor_si512( b, t ); \
  a = c; \
  c = b; \
  b = d; \
  d = mm512_not( t ); \
 } while (0)
 // Linear mixing step of the round on four state vectors, operating on
 // 32-bit elements: a and c are rotated (13, 3), folded into b and d together
 // with a shifted copy of a; b and d are rotated (1, 7) and folded back into
 // a and c together with a shifted copy of b; finally a and c are rotated
 // again (5, 22). All four arguments are updated in place and are evaluated
 // more than once, so they must be plain lvalues.
 // NOTE(review): mm512_rol_32 is presumably a per-32-bit rotate-left --
 // confirm against the SIMD utility header.
 #define L8( a, b, c, d ) \
 do { \
   a = mm512_rol_32( a, 13 ); \
   c = mm512_rol_32( c,  3 ); \
   b = _mm512_xor_si512( b, _mm512_xor_si512( a, c ) ); \
   d = _mm512_xor_si512( d, _mm512_xor_si512( c, \
                                              _mm512_slli_epi32( a, 3 ) ) ); \
   b = mm512_rol_32( b, 1 ); \
   d = mm512_rol_32( d, 7 ); \
   a = _mm512_xor_si512( a, _mm512_xor_si512( b, d ) ); \
   c = _mm512_xor_si512( c, _mm512_xor_si512( d, \
                                              _mm512_slli_epi32( b, 7 ) ) ); \
   a = mm512_rol_32( a,  5 ); \
   c = mm512_rol_32( c, 22 ); \
 } while (0)
 // Declares the eight chaining-value vectors c0..c7 as locals of the invoking
 // function. The expansion already carries its own terminating semicolon, so
 // it is used without one. The definition deliberately has no trailing line
 // continuation: it must end on its own line so it cannot absorb whatever
 // source line happens to follow it.
 #define DECL_STATE_BIG8 \
   __m512i c0, c1, c2, c3, c4, c5, c6, c7;
 // Loads the eight chaining-value vectors sc->h[0..7] into the locals c0..c7.
 // `sc` must be a pointer expression; c0..c7 must already be declared.
 #define READ_STATE_BIG8(sc) \
 do { \
   c0 = sc->h[0x0]; \
   c1 = sc->h[0x1]; \
   c2 = sc->h[0x2]; \
   c3 = sc->h[0x3]; \
   c4 = sc->h[0x4]; \
   c5 = sc->h[0x5]; \
   c6 = sc->h[0x6]; \
   c7 = sc->h[0x7]; \
 } while (0)
 // Stores the locals c0..c7 back into the context's chaining values
 // sc->h[0..7]; the counterpart of READ_STATE_BIG8.
 #define WRITE_STATE_BIG8(sc) \
 do { \
   sc->h[0x0] = c0; \
   sc->h[0x1] = c1; \
   sc->h[0x2] = c2; \
   sc->h[0x3] = c3; \
   sc->h[0x4] = c4; \
   sc->h[0x5] = c5; \
   sc->h[0x6] = c6; \
   sc->h[0x7] = c7; \
 } while (0)
 // One round of the 8-way permutation over the working state s0..sF.
 //   rc    - round number; placed in the upper 32 bits of a 64-bit word and
 //           XORed together with the first constant word into s0.
 //   alpha - round-constant table, read here as sixteen 64-bit words.
 // Sequence:
 //   1. XOR the sixteen constant words (broadcast to every lane) into s0..sF.
 //   2. Apply SBOX8 to the four groups (s0,s4,s8,sC) .. (s3,s7,sB,sF).
 //   3. Apply L8 six times. Before each call the 128-bit-lane byte shifts and
 //      32-bit masked blends (0xaaaa = odd elements from the second operand,
 //      0x5555 = even elements) assemble the operands in temporaries t0..t3;
 //      afterwards the matching blends scatter the results back into the
 //      state vectors they came from.
 // The statement order matters: later gathers read vectors updated by
 // earlier scatters.
 // NOTE(review): alpha is accessed through a uint64_t* cast; this presumably
 // relies on the table being an array of 32-bit words with suitable alignment
 // on a little-endian target -- confirm against the table's declaration.
 #define ROUND_BIG8(rc, alpha) \
 do { \
   __m512i t0, t1, t2, t3; \
   s0 = _mm512_xor_si512( s0, m512_const1_64( \
                   ( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \
   s1 = _mm512_xor_si512( s1, m512_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \
   s2 = _mm512_xor_si512( s2, m512_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \
   s3 = _mm512_xor_si512( s3, m512_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \
   s4 = _mm512_xor_si512( s4, m512_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \
   s5 = _mm512_xor_si512( s5, m512_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \
   s6 = _mm512_xor_si512( s6, m512_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \
   s7 = _mm512_xor_si512( s7, m512_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \
   s8 = _mm512_xor_si512( s8, m512_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \
   s9 = _mm512_xor_si512( s9, m512_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \
   sA = _mm512_xor_si512( sA, m512_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \
   sB = _mm512_xor_si512( sB, m512_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \
   sC = _mm512_xor_si512( sC, m512_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \
   sD = _mm512_xor_si512( sD, m512_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \
   sE = _mm512_xor_si512( sE, m512_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \
   sF = _mm512_xor_si512( sF, m512_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \
 \
  SBOX8( s0, s4, s8, sC ); \
  SBOX8( s1, s5, s9, sD ); \
  SBOX8( s2, s6, sA, sE ); \
  SBOX8( s3, s7, sB, sF ); \
 \
  t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), \
                                        _mm512_bslli_epi128( s5, 4 ) ); \
  t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sD, 4 ), \
                                        _mm512_bslli_epi128( sE, 4 ) ); \
  L8( s0, t1, s9, t3 ); \
  s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t1, 4 ) ); \
  s5 = _mm512_mask_blend_epi32( 0x5555, s5, _mm512_bsrli_epi128( t1, 4 ) ); \
  sD = _mm512_mask_blend_epi32( 0xaaaa, sD, _mm512_bslli_epi128( t3, 4 ) ); \
  sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t3, 4 ) ); \
 \
  t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \
                                        _mm512_bslli_epi128( s6, 4 ) ); \
  t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sE, 4 ), \
                                        _mm512_bslli_epi128( sF, 4 ) ); \
  L8( s1, t1, sA, t3 ); \
  s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \
  s6 = _mm512_mask_blend_epi32( 0x5555, s6, _mm512_bsrli_epi128( t1, 4 ) ); \
  sE = _mm512_mask_blend_epi32( 0xaaaa, sE, _mm512_bslli_epi128( t3, 4 ) ); \
  sF = _mm512_mask_blend_epi32( 0x5555, sF, _mm512_bsrli_epi128( t3, 4 ) ); \
 \
  t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s6, 4 ), \
                                        _mm512_bslli_epi128( s7, 4 ) ); \
  t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sF, 4 ), \
                                        _mm512_bslli_epi128( sC, 4 ) ); \
  L8( s2, t1, sB, t3 ); \
  s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( t1, 4 ) ); \
  s7 = _mm512_mask_blend_epi32( 0x5555, s7, _mm512_bsrli_epi128( t1, 4 ) ); \
  sF = _mm512_mask_blend_epi32( 0xaaaa, sF, _mm512_bslli_epi128( t3, 4 ) ); \
  sC = _mm512_mask_blend_epi32( 0x5555, sC, _mm512_bsrli_epi128( t3, 4 ) ); \
 \
  t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s7, 4 ), \
                                        _mm512_bslli_epi128( s4, 4 ) ); \
  t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sC, 4 ), \
                                        _mm512_bslli_epi128( sD, 4 ) ); \
  L8( s3, t1, s8, t3 ); \
  s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, _mm512_bslli_epi128( t1, 4 ) ); \
  s4 = _mm512_mask_blend_epi32( 0x5555, s4, _mm512_bsrli_epi128( t1, 4 ) ); \
  sC = _mm512_mask_blend_epi32( 0xaaaa, sC, _mm512_bslli_epi128( t3, 4 ) ); \
  sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t3, 4 ) ); \
 \
  t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, _mm512_bslli_epi128( s8, 4 ) ); \
  t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \
  t2 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s2, 4 ), sA ); \
  t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s3, 4 ), \
                                        _mm512_bslli_epi128( sB, 4 ) ); \
  L8( t0, t1, t2, t3 ); \
  s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \
  s8 = _mm512_mask_blend_epi32( 0x5555, s8, _mm512_bsrli_epi128( t0, 4 ) ); \
  s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \
  s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \
  s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, _mm512_bslli_epi128( t2, 4 ) ); \
  sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \
  s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, _mm512_bslli_epi128( t3, 4 ) ); \
  sB = _mm512_mask_blend_epi32( 0x5555, sB, _mm512_bsrli_epi128( t3, 4 ) ); \
 \
  t0 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), sC ); \
  t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \
                                        _mm512_bslli_epi128( sD, 4 ) ); \
  t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( sE, 4 ) ); \
  t3 = _mm512_mask_blend_epi32( 0xaaaa, s7, sF ); \
  L8( t0, t1, t2, t3 ); \
  s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t0, 4 ) ); \
  sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t0 ); \
  s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \
  sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t1, 4 ) ); \
  s6 = _mm512_mask_blend_epi32( 0x5555, s6, t2 ); \
  sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t2, 4 ) ); \
  s7 = _mm512_mask_blend_epi32( 0x5555, s7, t3 ); \
  sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \
 } while (0)
 // Regular permutation used for every message block except the last:
 // six rounds (round numbers 0..5) with the alpha_n constant table.
 #define P_BIG8 \
 do { \
   ROUND_BIG8(0, alpha_n); \
   ROUND_BIG8(1, alpha_n); \
   ROUND_BIG8(2, alpha_n); \
   ROUND_BIG8(3, alpha_n); \
   ROUND_BIG8(4, alpha_n); \
   ROUND_BIG8(5, alpha_n); \
 } while (0)
 // Final permutation, used only by hamsi_8way_big_final: twelve rounds
 // (round numbers 0..11) with the alpha_f constant table.
 #define PF_BIG8 \
 do { \
   ROUND_BIG8( 0, alpha_f); \
   ROUND_BIG8( 1, alpha_f); \
   ROUND_BIG8( 2, alpha_f); \
   ROUND_BIG8( 3, alpha_f); \
   ROUND_BIG8( 4, alpha_f); \
   ROUND_BIG8( 5, alpha_f); \
   ROUND_BIG8( 6, alpha_f); \
   ROUND_BIG8( 7, alpha_f); \
   ROUND_BIG8( 8, alpha_f); \
   ROUND_BIG8( 9, alpha_f); \
   ROUND_BIG8(10, alpha_f); \
   ROUND_BIG8(11, alpha_f); \
 } while (0)
 // Feed-forward after the permutation: XORs state words s0..s3 and s8..sB
 // into the stored chaining values sc->h[0..7] and reloads c0..c7 from the
 // result. Requires `sc` in the invoking scope.
 // The descending order is required because several sX names alias cN
 // variables (s1=c0, s3=c1, s9=c4, sB=c5): each cN must be read through its
 // alias before the assignment that overwrites it, e.g. c7 consumes sB (c5)
 // before c5 itself is reassigned.
 #define T_BIG8 \
 do { /* order is important */ \
   c7 = sc->h[ 0x7 ] = _mm512_xor_si512( sc->h[ 0x7 ], sB ); \
   c6 = sc->h[ 0x6 ] = _mm512_xor_si512( sc->h[ 0x6 ], sA ); \
   c5 = sc->h[ 0x5 ] = _mm512_xor_si512( sc->h[ 0x5 ], s9 ); \
   c4 = sc->h[ 0x4 ] = _mm512_xor_si512( sc->h[ 0x4 ], s8 ); \
   c3 = sc->h[ 0x3 ] = _mm512_xor_si512( sc->h[ 0x3 ], s3 ); \
   c2 = sc->h[ 0x2 ] = _mm512_xor_si512( sc->h[ 0x2 ], s2 ); \
   c1 = sc->h[ 0x1 ] = _mm512_xor_si512( sc->h[ 0x1 ], s1 ); \
   c0 = sc->h[ 0x0 ] = _mm512_xor_si512( sc->h[ 0x0 ], s0 ); \
 } while (0)
 // Compresses `num` consecutive message blocks into the context.
 //   sc  - hashing context; its bit counter and chaining values are updated.
 //   buf - `num` vectors, one block each (one 64-bit word per lane).
 //   num - number of blocks to process; zero only updates nothing.
 void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num )
 {
   DECL_STATE_BIG8
   // Each block contributes 64 bits to the message length counter.
   uint32_t tmp = num << 6;
   sc->count_low = SPH_T32( sc->count_low + tmp );
   // Bits of (num << 6) that do not fit in 32 bits go to the high word.
   sc->count_high += (sph_u32)( (num >> 13) >> 13 );
   // Carry out of the low word.
   if ( sc->count_low < tmp )
      sc->count_high++;
   READ_STATE_BIG8( sc );
   while ( num-- > 0 )
   {
      __m512i m0, m1, m2, m3, m4, m5, m6, m7;
      // Expand the block, permute (6 rounds), then feed forward into sc->h.
      INPUT_BIG8;
      P_BIG8;
      T_BIG8;
      buf++;
   }
   WRITE_STATE_BIG8( sc );
 }
 // Compresses the single closing block at *buf into the context. Identical
 // to one iteration of hamsi_8way_big except that the twelve-round final
 // permutation PF_BIG8 is used and the bit counter is left untouched.
 void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )
 {
   __m512i m0, m1, m2, m3, m4, m5, m6, m7;
   DECL_STATE_BIG8
   READ_STATE_BIG8( sc );
   INPUT_BIG8;
   PF_BIG8;
   T_BIG8;
   WRITE_STATE_BIG8( sc );
 }
 // Resets a context for a new hash: clears the buffered-byte count and the
 // 64-bit (high/low) bit counter, and loads the eight initial chaining
 // values into sc->h.
 // NOTE(review): m512_const1_64 presumably broadcasts its argument to every
 // 64-bit lane, giving all eight lanes the same starting state -- confirm.
 void hamsi512_8way_init( hamsi_8way_big_context *sc )
 {
   sc->partial_len = 0;
   sc->count_high = sc->count_low = 0;
   sc->h[0] = m512_const1_64( 0x6c70617273746565 );
   sc->h[1] = m512_const1_64( 0x656e62656b204172 );
   sc->h[2] = m512_const1_64( 0x302c206272672031 );
   sc->h[3] = m512_const1_64( 0x3434362c75732032 );
   sc->h[4] = m512_const1_64( 0x3030312020422d33 );
   sc->h[5] = m512_const1_64( 0x656e2d484c657576 );
   sc->h[6] = m512_const1_64( 0x6c65652c65766572 );
   sc->h[7] = m512_const1_64( 0x6769756d2042656c );
 }
 // Absorbs message data into the context.
 //   sc   - hashing context.
 //   data - input, accessed as an array of __m512i vectors.
 //   len  - length in bytes; len / 8 whole blocks are compressed immediately
 //          and the remainder (len % 8) is recorded in sc->partial_len.
 // NOTE(review): after `len &= 7` the copy count `len>>3` is always zero, so
 // no tail bytes are ever stored in sc->buf; only inputs whose length is a
 // multiple of 8 are fully absorbed. Confirm this restriction is intended.
 void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,
                           size_t len )
 {
   __m512i *vdata = (__m512i*)data;
   // Process all complete 8-byte blocks.
   hamsi_8way_big( sc, vdata, len>>3 );
   // Step past the blocks just consumed.
   vdata += ( (len& ~(size_t)7) >> 3 );
   len &= (size_t)7;
   memcpy_512( sc->buf, vdata, len>>3 );
   sc->partial_len = len;
 }
 // Finishes the hash and writes the digest to dst.
 //   sc  - hashing context; modified (not reusable without re-init).
 //   dst - output buffer, written as eight __m512i vectors.
 // The message bit length is captured first, then a padding block and the
 // length block are compressed, the latter with the final permutation.
 void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
 {
   __m512i pad[1];
   int ch, cl;
   // Big-endian encode the 64-bit bit count (including any recorded partial
   // bytes) before the padding block below advances the counter.
   sph_enc32be( &ch, sc->count_high );
   sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) );
   // Arguments run from the highest element down, so every 64-bit lane ends
   // up with ch in its low half and cl in its high half.
   pad[0] =  _mm512_set_epi32( cl, ch, cl, ch, cl, ch, cl, ch,
                               cl, ch, cl, ch, cl, ch, cl, ch );
 //   pad[0] =  m512_const2_32( cl, ch );
   // Padding block: 0x80 in each lane. This overwrites sc->buf[0] outright,
   // so any buffered tail bytes would be discarded here.
   sc->buf[0] = m512_const1_64( 0x80 );
   hamsi_8way_big( sc, sc->buf, 1 );
   hamsi_8way_big_final( sc, pad );
   // NOTE(review): presumably byte-swaps each 32-bit word of the eight state
   // vectors while copying them to dst -- confirm helper semantics.
   mm512_block_bswap_32( (__m512i*)dst, sc->h );
 }
 #endif // AVX512
 // Hamsi 4 way
 #define INPUT_BIG \
 do { \
@@ -627,6 +967,7 @@ do { \
   sc->h[0x7] = c7; \
 } while (0)
 /*
 #define s0   m0
 #define s1   c0
 #define s2   m1
@@ -643,42 +984,28 @@ do { \
 #define sD   m6
 #define sE   c7
 #define sF   m7
 */
 #define ROUND_BIG(rc, alpha) \
 do { \
   __m256i t0, t1, t2, t3; \
   s0 = _mm256_xor_si256( s0, m256_const1_64( \
-        ( ( (uint64_t)( (rc) ^ alpha[1] ) << 32 ) ) | (uint64_t)alpha[0] ) ); \
+                   ( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \
-   s1 = _mm256_xor_si256( s1, m256_const1_64( \
+   s1 = _mm256_xor_si256( s1, m256_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \
-        ( (uint64_t)alpha[ 3] << 32 ) | (uint64_t)alpha[ 2] ) ); \
+   s2 = _mm256_xor_si256( s2, m256_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \
-   s2 = _mm256_xor_si256( s2, m256_const1_64( \
+   s3 = _mm256_xor_si256( s3, m256_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \
-        ( (uint64_t)alpha[ 5] << 32 ) | (uint64_t)alpha[ 4] ) ); \
+   s4 = _mm256_xor_si256( s4, m256_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \
-   s3 = _mm256_xor_si256( s3, m256_const1_64( \
+   s5 = _mm256_xor_si256( s5, m256_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \
-        ( (uint64_t)alpha[ 7] << 32 ) | (uint64_t)alpha[ 6] ) ); \
+   s6 = _mm256_xor_si256( s6, m256_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \
-   s4 = _mm256_xor_si256( s4, m256_const1_64( \
+   s7 = _mm256_xor_si256( s7, m256_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \
-        ( (uint64_t)alpha[ 9] << 32 ) | (uint64_t)alpha[ 8] ) ); \
+   s8 = _mm256_xor_si256( s8, m256_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \
-   s5 = _mm256_xor_si256( s5, m256_const1_64( \
+   s9 = _mm256_xor_si256( s9, m256_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \
-        ( (uint64_t)alpha[11] << 32 ) | (uint64_t)alpha[10] ) ); \
+   sA = _mm256_xor_si256( sA, m256_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \
-   s6 = _mm256_xor_si256( s6, m256_const1_64( \
+   sB = _mm256_xor_si256( sB, m256_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \
-        ( (uint64_t)alpha[13] << 32 ) | (uint64_t)alpha[12] ) ); \
+   sC = _mm256_xor_si256( sC, m256_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \
-   s7 = _mm256_xor_si256( s7, m256_const1_64( \
+   sD = _mm256_xor_si256( sD, m256_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \
-        ( (uint64_t)alpha[15] << 32 ) | (uint64_t)alpha[14] ) ); \
+   sE = _mm256_xor_si256( sE, m256_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \
-   s8 = _mm256_xor_si256( s8, m256_const1_64( \
+   sF = _mm256_xor_si256( sF, m256_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \
        ( (uint64_t)alpha[17] << 32 ) | (uint64_t)alpha[16] ) ); \
   s9 = _mm256_xor_si256( s9, m256_const1_64( \
        ( (uint64_t)alpha[19] << 32 ) | (uint64_t)alpha[18] ) ); \
   sA = _mm256_xor_si256( sA, m256_const1_64( \
        ( (uint64_t)alpha[21] << 32 ) | (uint64_t)alpha[20] ) ); \
   sB = _mm256_xor_si256( sB, m256_const1_64( \
        ( (uint64_t)alpha[23] << 32 ) | (uint64_t)alpha[22] ) ); \
   sC = _mm256_xor_si256( sC, m256_const1_64( \
        ( (uint64_t)alpha[25] << 32 ) | (uint64_t)alpha[24] ) ); \
   sD = _mm256_xor_si256( sD, m256_const1_64( \
        ( (uint64_t)alpha[27] << 32 ) | (uint64_t)alpha[26] ) ); \
   sE = _mm256_xor_si256( sE, m256_const1_64( \
        ( (uint64_t)alpha[29] << 32 ) | (uint64_t)alpha[28] ) ); \
   sF = _mm256_xor_si256( sF, m256_const1_64( \
        ( (uint64_t)alpha[31] << 32 ) | (uint64_t)alpha[30] ) ); \
 \
  SBOX( s0, s4, s8, sC ); \
  SBOX( s1, s5, s9, sD ); \
--- a/algo/hamsi/hamsi-hash-4way.h
+++ b/algo/hamsi/hamsi-hash-4way.h
@@ -60,9 +60,32 @@ typedef struct {
 typedef hamsi_4way_big_context hamsi512_4way_context;
 void hamsi512_4way_init( hamsi512_4way_context *sc );
-void hamsi512_4way( hamsi512_4way_context *sc, const void *data, size_t len );
+void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data,
      size_t len );
 #define hamsi512_4way hamsi512_4way_update
 void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
 // Context for the 8-way (AVX-512) Hamsi-512 implementation.
 typedef struct {
   __m512i h[8];                   // chaining values, one vector per word
   __m512i buf[1];                 // single-block staging buffer
   size_t partial_len;             // leftover input bytes (< 8) from update
   sph_u32 count_high, count_low;  // message length in bits, split 32/32
 } hamsi_8way_big_context;
 typedef hamsi_8way_big_context hamsi512_8way_context;
 void hamsi512_8way_init( hamsi512_8way_context *sc );
 void hamsi512_8way_update( hamsi512_8way_context *sc, const void *data,
                           size_t len );
 void hamsi512_8way_close( hamsi512_8way_context *sc, void *dst );
 #endif
 #ifdef __cplusplus
 }
 #endif
--- a/algo/lyra2/lyra2-gate.c
+++ b/algo/lyra2/lyra2-gate.c
@@ -44,8 +44,13 @@ bool lyra2rev3_thread_init()
 {
   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
   int size = ROW_LEN_BYTES * 4; // nRows;
-   int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
+#if defined(LYRA2REV3_16WAY)
 //   l2v3_wholeMatrix = _mm_malloc( 2*size, 128 );
   l2v3_wholeMatrix = _mm_malloc( 2*size, 64 );
   init_lyra2rev3_16way_ctx();;
 #else
   l2v3_wholeMatrix = _mm_malloc( size, 64 );
 #if defined (LYRA2REV3_8WAY)
   init_lyra2rev3_8way_ctx();;
@@ -53,13 +58,17 @@ bool lyra2rev3_thread_init()
   init_lyra2rev3_4way_ctx();;
 #else
   init_lyra2rev3_ctx();
 #endif
 #endif
   return l2v3_wholeMatrix;
 }
 bool register_lyra2rev3_algo( algo_gate_t* gate )
 {
-#if defined (LYRA2REV3_8WAY)
+#if defined(LYRA2REV3_16WAY)
  gate->scanhash  = (void*)&scanhash_lyra2rev3_16way;
  gate->hash      = (void*)&lyra2rev3_16way_hash;
 #elif defined (LYRA2REV3_8WAY)
  gate->scanhash  = (void*)&scanhash_lyra2rev3_8way;
  gate->hash      = (void*)&lyra2rev3_8way_hash;
 #elif defined (LYRA2REV3_4WAY)
@@ -69,6 +78,7 @@ bool register_lyra2rev3_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_lyra2rev3;
  gate->hash      = (void*)&lyra2rev3_hash;
 #endif
 //  gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
  gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT;
  gate->miner_thread_init = (void*)&lyra2rev3_thread_init;
  opt_target_factor = 256.0;
--- a/algo/lyra2/lyra2-gate.h
+++ b/algo/lyra2/lyra2-gate.h
@@ -5,18 +5,29 @@
 #include <stdint.h>
 #include "lyra2.h"
 /*
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define LYRA2REV3_16WAY 1
 #elif defined(__AVX2__)
 */
 #if defined(__AVX2__)
-  #define LYRA2REV3_8WAY
+  #define LYRA2REV3_8WAY 1
-#endif
+#elif defined(__SSE2__)
-
+  #define LYRA2REV3_4WAY 1
 #if defined(__SSE2__)
  #define LYRA2REV3_4WAY
 #endif
 extern __thread uint64_t* l2v3_wholeMatrix;
 bool register_lyra2rev3_algo( algo_gate_t* gate );
-#if defined(LYRA2REV3_8WAY)
+
 #if defined(LYRA2REV3_16WAY)
 void lyra2rev3_16way_hash( void *state, const void *input );
 int scanhash_lyra2rev3_16way( struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr );
 bool init_lyra2rev3_16way_ctx();
 #elif defined(LYRA2REV3_8WAY)
 void lyra2rev3_8way_hash( void *state, const void *input );
 int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce,
--- a/algo/lyra2/lyra2-hash-2way.c
+++ b/algo/lyra2/lyra2-hash-2way.c
@@ -46,6 +46,7 @@
 * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
 */
 #if 0
 int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
               const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
               const uint64_t timeCost, const uint64_t nRows,
@@ -216,29 +217,55 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
   return 0;
 }
 #endif
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
 // This version is currently only used by REv3 and has some hard coding
 // specific to v3 such as input data size of 32 bytes.
 //
 // Similarly with REv2. The difference with REv3 isn't clear and maybe
 // they can be merged.
 //
 // RE is used by RE, allium. The main difference between RE and REv2
 // is in the matrix size.
 //
 // Z also needs to support 80 byte input as well as 32 byte, and odd
 // matrix sizes like 330 rows. It is used by lyra2z330, lyra2z, lyra2h.
 /////////////////////////////////////////////////
 // 2 way 256
 // drop salt, salt len arguments, hard code some others.
 // Data is interleaved 2x256.
 //int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
 //                    const void *pwd, uint64_t pwdlen, uint64_t timeCost,
 //                    uint64_t nRows, uint64_t nCols )
 // hard coded for 32 byte input as well as matrix size.
 // Other required versions include 80 byte input and different block
 // sizes.
 int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
      const void *pwd, const uint64_t pwdlen, const void *salt,
      const uint64_t saltlen, const uint64_t timeCost, const uint64_t nRows,
      const uint64_t nCols )
 {
   //====================== Basic variables ============================//
-   uint64_t _ALIGN(256) state[16];
+   uint64_t _ALIGN(256) state[32];
-   int64_t row = 2; //index of row to be processed
+   int64_t row = 2; 
-   int64_t prev = 1; //index of prev (last row ever computed/modified)
+   int64_t prev = 1;
-   int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
+   int64_t rowa0 = 0;
-   int64_t tau; //Time Loop iterator
+   int64_t rowa1 = 0;
-   int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
+   int64_t tau; 
-   int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
+   int64_t step = 1;
-   int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
+   int64_t window = 2;
   int64_t gap = 1; 
 //   int64_t i; //auxiliary iteration counter
-   int64_t v64; // 64bit var for memcpy
+//   int64_t v64; // 64bit var for memcpy
-   uint64_t instance0 = 0; // Seperate instance for each lane
+   uint64_t instance0 = 0;
   uint64_t instance1 = 0;
   //====================================================================/
@@ -248,7 +275,9 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
   uint64_t *ptrWord = wholeMatrix;
 //  2 way 256 rewrite. Salt always == password, and data is interleaved,
-//  need to build in parallel:
+//  need to build in parallel as pw is already interleaved.
 //  {   password,    (64 or 80 bytes)
 //      salt,        (64 or 80 bytes) =  same as password
 //      Klen,        (u64)  = 32 bytes
@@ -262,16 +291,45 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
 //      1            (byte)
 //   }
-//   memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
+// It's all u64 so don't use byte
 // input is usually 32 maybe 64, both are aligned to 256 bit vector.
 // 80 byte input is not aligned, complicating matters for lyra2z.
   int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
                              / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
   uint64_t *ptr = wholeMatrix;
   uint64_t *pw = (uint64_t*)pwd;
-   byte *ptrByte = (byte*) wholeMatrix;
+   memcpy( ptr, pw, 2*pwdlen ); // password 
   ptr += pwdlen>>2;
   memcpy( ptr, pw, 2*pwdlen ); // password lane 1
   ptr += pwdlen>>2;
 // now build the rest interleaving on the fly.
-   //Prepends the password
+   ptr[0] = ptr[ 4] = kLen;
-   memcpy(ptrByte, pwd, pwdlen);
+   ptr[1] = ptr[ 5] = pwdlen;
-   ptrByte += pwdlen;
+   ptr[2] = ptr[ 6] = pwdlen;   // saltlen
   ptr[3] = ptr[ 7] = timeCost;
   ptr[8] = ptr[12] = nRows;
   ptr[9] = ptr[13] = nCols;
   ptr[10] = ptr[14] = 0x80;
   ptr[11] = ptr[15] = 0x0100000000000000;
   ptr = wholeMatrix;
 /* 
   // do it the old way to compare.
   uint64_t pb[512];
   byte* ptrByte = (byte*)pb;
   //Prepends the password (use salt for testing)
   memcpy( ptrByte, salt, saltlen );
   ptrByte += saltlen;
   //Concatenates the salt
   memcpy(ptrByte, salt, saltlen);
@@ -280,55 +338,259 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
   memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
                       - (saltlen + pwdlen) );
-   //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
+   memcpy(ptrByte, &kLen, 8);
-   memcpy(ptrByte, &kLen, sizeof(int64_t));
+   ptrByte += 8;
-   ptrByte += sizeof(uint64_t);
+   memcpy(ptrByte, &pwdlen, 8);
-   v64 = pwdlen;
+   ptrByte += 8;
-   memcpy(ptrByte, &v64, sizeof(int64_t));
+   memcpy(ptrByte, &saltlen, 8);
-   ptrByte += sizeof(uint64_t);
+   ptrByte += 8;
-   v64 = saltlen;
+   memcpy(ptrByte, &timeCost, 8);
-   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += 8;
-   ptrByte += sizeof(uint64_t);
+   memcpy(ptrByte, &nRows, 8);
-   v64 = timeCost;
+   ptrByte += 8;
-   memcpy(ptrByte, &v64, sizeof(int64_t));
+   memcpy(ptrByte, &nCols, 8);
-   ptrByte += sizeof(uint64_t);
+   ptrByte += 8;
-   v64 = nRows;
+
   memcpy(ptrByte, &v64, sizeof(int64_t));
   ptrByte += sizeof(uint64_t);
   v64 = nCols;
   memcpy(ptrByte, &v64, sizeof(int64_t));
   ptrByte += sizeof(uint64_t);
   //Now comes the padding
   *ptrByte = 0x80; //first byte of padding: right after the password
-   ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
+   ptrByte = (byte*) pb; //resets the pointer to the start of the memory matrix
   ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
   *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
 */
 // display the data
 printf("LYRA2REV3 data, blocks= %d\n", nBlocksInput);
 /*
 uint64_t* m = (uint64_t*)wholeMatrix;
 printf("Lyra2v3 1: blocklensafe %d\n", BLOCK_LEN_BLAKE2_SAFE_BYTES);
 printf("pb: %016lx %016lx %016lx %016lx\n",pb[0],pb[1],pb[2],pb[3]);
 printf("pb: %016lx %016lx %016lx %016lx\n",pb[4],pb[5],pb[6],pb[7]);
 printf("pb: %016lx %016lx %016lx %016lx\n",pb[8],pb[8],pb[10],pb[11]);
 printf("pb: %016lx %016lx %016lx %016lx\n",pb[12],pb[13],pb[14],pb[15]);
 printf("data V:  %016lx %016lx %016lx %016lx\n",m[0],m[1],m[2],m[3]);
 printf("data V:  %016lx %016lx %016lx %016lx\n",m[4],m[5],m[6],m[7]);
 printf("data V:  %016lx %016lx %016lx %016lx\n",m[8],m[8],m[10],m[11]);
 printf("data V:  %016lx %016lx %016lx %016lx\n",m[12],m[13],m[14],m[15]);
 printf("data V:  %016lx %016lx %016lx %016lx\n",m[16],m[17],m[18],m[19]);
 printf("data V:  %016lx %016lx %016lx %016lx\n",m[20],m[21],m[22],m[23]);
 printf("data V:  %016lx %016lx %016lx %016lx\n",m[24],m[25],m[26],m[27]);
 printf("data V:  %016lx %016lx %016lx %016lx\n",m[28],m[29],m[30],m[31]);
 */
 // from here on it's all simd access to state and matrix
 // define vector pointers and adjust sizes and pointer offsets
 uint64_t _ALIGN(256) st[16];
   ptrWord = wholeMatrix;
-   absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
+   absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );
   reducedSqueezeRow0( state, &wholeMatrix[0], nCols );
-   reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
+uint64_t *p = wholeMatrix;
 printf("wholematrix[0]\n");
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
 p = &wholeMatrix[2*ROW_LEN_INT64];
 printf("wholematrix[1]\n");
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
 p = &wholeMatrix[4*ROW_LEN_INT64];
 printf("wholematrix[2]\n");
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
 p = &wholeMatrix[6*ROW_LEN_INT64];
 printf("wholematrix[3]\n");
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
 printf("SV1 M  %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
 //printf("SV1: %016lx %016lx %016lx %016lx\n",state[0],state[1],state[2],state[3]);
 /*
   absorbBlockBlake2Safe( st, pb, nBlocksInput, BLOCK_LEN );
 printf("SV: %016lx %016lx %016lx %016lx\n",state[0],state[1],state[2],state[3]);
 printf("SS: %016lx %016lx %016lx %016lx\n",st[0],st[1],st[2],st[3]);
 */
   reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols );
 // At this point the entire matrix should be filled but only col 0 is.
 // The others are unchanged or the display offsets are wrong.
 p = wholeMatrix;
 printf("wholematrix[0]   %x\n",wholeMatrix);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[32],p[33],p[34],p[35]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[36],p[37],p[38],p[39]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[40],p[41],p[42],p[43]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[44],p[45],p[46],p[47]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[48],p[49],p[50],p[51]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[52],p[53],p[54],p[55]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[56],p[57],p[58],p[59]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[60],p[61],p[62],p[63]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[64],p[65],p[66],p[67]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[68],p[69],p[70],p[71]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[72],p[73],p[74],p[75]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[76],p[77],p[78],p[79]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[80],p[81],p[82],p[83]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[84],p[85],p[86],p[87]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[88],p[89],p[90],p[91]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[92],p[93],p[94],p[95]);
 p = &wholeMatrix[2*ROW_LEN_INT64];
 printf("wholematrix[1]   %x\n", &wholeMatrix[2*ROW_LEN_INT64]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
 p = &wholeMatrix[4*ROW_LEN_INT64];
 printf("wholematrix[2]   %x\n",&wholeMatrix[4*ROW_LEN_INT64]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
 p = &wholeMatrix[6*ROW_LEN_INT64];
 printf("wholematrix[3]   %x\n",&wholeMatrix[6*ROW_LEN_INT64]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
 printf("SV2 M  %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
 //printf("SV2 %016lx %016lx %016lx %016lx\n",state[0],state[1],state[2],state[3]);
 /*
 printf("SV2 %016lx %016lx %016lx %016lx\n",state[0],state[1],state[2],state[3]);
 printf("SV2 %016lx %016lx %016lx %016lx\n",state[4],state[5],state[6],state[7]);
 printf("SV2 %016lx %016lx %016lx %016lx\n",state[8],state[9],state[10],state[11]);
 printf("SV2 %016lx %016lx %016lx %016lx\n",state[12],state[13],state[14],state[15]);
 printf("SV2 %016lx %016lx %016lx %016lx\n",state[16],state[17],state[18],state[19]);
 printf("SV2 %016lx %016lx %016lx %016lx\n",state[20],state[21],state[22],state[23]);
 printf("SV2 %016lx %016lx %016lx %016lx\n",state[24],state[25],state[26],state[27]);
 printf("SV2 %016lx %016lx %016lx %016lx\n",state[28],state[29],state[30],state[31]);
 */
   reducedDuplexRow1_2way( state, &wholeMatrix[0], &wholeMatrix[2*ROW_LEN_INT64],
                      nCols);
 //printf("SV3 %016lx %016lx %016lx %016lx\n",state[0],state[1],state[2],state[3]);
 /*
 printf("SV3 %016lx %016lx %016lx %016lx\n",state[0],state[1],state[2],state[3]);
 printf("SV3 %016lx %016lx %016lx %016lx\n",state[4],state[5],state[6],state[7]);
 printf("SV3 %016lx %016lx %016lx %016lx\n",state[8],state[9],state[10],state[11]);
 printf("SV3 %016lx %016lx %016lx %016lx\n",state[12],state[13],state[14],state[15]);
 printf("SV3 %016lx %016lx %016lx %016lx\n",state[16],state[17],state[18],state[19]);
 printf("SV3 %016lx %016lx %016lx %016lx\n",state[20],state[21],state[22],state[23]);
 printf("SV3 %016lx %016lx %016lx %016lx\n",state[24],state[25],state[26],state[27]);
 printf("SV3 %016lx %016lx %016lx %016lx\n",state[28],state[29],state[30],state[31]);
 */
 p = wholeMatrix;
 printf("wholematrix[0]\n");
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
 p = &wholeMatrix[2*ROW_LEN_INT64];
 printf("wholematrix[1]\n");
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
 p = &wholeMatrix[4*ROW_LEN_INT64];
 printf("wholematrix[2]\n");
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
 p = &wholeMatrix[6*ROW_LEN_INT64];
 printf("wholematrix[3]\n");
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
 printf("SV3 M  %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
   do
   {
-      reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
+      reducedDuplexRowSetup_2way( state, &wholeMatrix[2*prev*ROW_LEN_INT64],
-                             &wholeMatrix[rowa*ROW_LEN_INT64],
+                             &wholeMatrix[2*rowa0*ROW_LEN_INT64],
-                             &wholeMatrix[row*ROW_LEN_INT64], nCols );
+                             &wholeMatrix[2*row*ROW_LEN_INT64], nCols );
-      rowa = (rowa + step) & (window - 1);
+      rowa0 = (rowa0 + step) & (window - 1);
      prev = row;
      row++;
-      if (rowa == 0)
+      if (rowa0 == 0)
      {
         step = window + gap; //changes the step: approximately doubles its value
         window *= 2; //doubles the size of the re-visitation window
@@ -337,6 +599,80 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
   } while (row < nRows);
 p = wholeMatrix;
 printf("wholematrix[0]\n");
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
 p = &wholeMatrix[2*ROW_LEN_INT64];
 printf("wholematrix[1]\n");
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
 p = &wholeMatrix[4*ROW_LEN_INT64];
 printf("wholematrix[2]\n");
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
 p = &wholeMatrix[6*ROW_LEN_INT64];
 printf("wholematrix[3]\n");
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
 //printf("SV5 prev= %d\n",prev);
 /*
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
 printf("SV4 M  %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
 printf("SV4 S %016lx %016lx %016lx %016lx\n",state[0],state[1],state[2],state[3]);
 printf("SV4 S %016lx %016lx %016lx %016lx\n",state[4],state[5],state[6],state[7]);
 printf("SV4 S %016lx %016lx %016lx %016lx\n",state[8],state[9],state[10],state[11]);
 printf("SV4 S %016lx %016lx %016lx %016lx\n",state[12],state[13],state[14],state[15]);
 printf("SV4 S %016lx %016lx %016lx %016lx\n",state[16],state[17],state[18],state[19]);
 printf("SV4 S %016lx %016lx %016lx %016lx\n",state[20],state[21],state[22],state[23]);
 printf("SV4 S %016lx %016lx %016lx %016lx\n",state[24],state[25],state[26],state[27]);
 printf("SV4 S %016lx %016lx %016lx %016lx\n",state[28],state[29],state[30],state[31]);
 */        
 //printf("Lyra2v3 4\n");
 uint64_t *ptr0 = wholeMatrix;    // base address for each lane
 uint64_t *ptr1 = wholeMatrix + 4;
 // convert a simple offset to an index into interleaved data.
 // good for state and 4 row matrix. 
 // index = ( int( off / 4 ) * 2 ) + ( off mod 4 )
 #define offset_to_index( o ) \
   ( ( ( (uint64_t)( (o) & 0xf) / 4 ) * 8 ) + ( (o) % 4 ) )
   row = 0;
   for (tau = 1; tau <= timeCost; tau++)
   {
@@ -344,24 +680,79 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
      do
      {
        // This part is not parallel, rowa will be different for each lane.
-        // state (u64[16]) is interleaved 2x256, need to extract seperately.
+        // state (u64[16]) is interleaved 2x256, need to extract separately
        // and figure out where the data is when interleaved.
        // &state[0] (or matrix) is the start of lane 0, while &state[4]
        // is the start of lane 1. From there there are 4 consecutive elements
        // followed by 4 elements from the other lane that must be skipped.
-        // index = 2 * instance / 4 * 4 + instance % 4
+        povly ptr;
-        uint64_t index0 = ( ( (instance0 & 0xf) >> 3 ) << 2 )
+        ptr.u64 = wholeMatrix;
                           + ( instance0 & 0x3 )
        uint64_t index1 = ( ( (instance1 & 0xf) >> 3 ) << 2 )
                           + ( instance1 & 0x3 )
-        instance0 = state[ index0 ] & 0xf;
+/*        
-        instance1 = (state+4)[ index1 ] & 0xf;
+printf("SV4a %016lx %016lx %016lx %016lx\n",state[0],state[1],state[2],state[3]);
 printf("SV4a %016lx %016lx %016lx %016lx\n",state[4],state[5],state[6],state[7]);
 printf("SV4a %016lx %016lx %016lx %016lx\n",state[8],state[9],state[10],state[11]);
 printf("SV4a %016lx %016lx %016lx %016lx\n",state[12],state[13],state[14],state[15]);
 printf("SV4a %016lx %016lx %016lx %016lx\n",state[16],state[17],state[18],state[19]);
 printf("SV4a %016lx %016lx %016lx %016lx\n",state[20],state[21],state[22],state[23]);
 printf("SV4a %016lx %016lx %016lx %016lx\n",state[24],state[25],state[26],state[27]);
 printf("SV4a %016lx %016lx %016lx %016lx\n",state[28],state[29],state[30],state[31]);
 *        
 //printf("SV4a o to i %016lx = %016lx\n", instance0, offset_to_index( instance0 ) );
 */
        instance0 = state[ offset_to_index( instance0 ) ];
        instance1 = (&state[4])[ offset_to_index( instance1 ) ];
-        rowa0 = state[ instance0 ];
+printf("SV4b o to i %016lx = %016lx, state0 %016lx\n", instance0, offset_to_index( instance0 ), state[offset_to_index( instance0 )] );
-        rowa1 = (state+4)[ instance1 ];
+printf("SV4b o to i %016lx = %016lx, state1 %016lx\n", instance1, offset_to_index( instance1 ), (state+4)[offset_to_index( instance1 )] );
 //printf("SV4b lane 1 instance1 = %d, rowa1= %d\n",instance1,rowa1);
-        reducedDuplexRow_2way( state, &wholeMatrix[prev*ROW_LEN_INT64],
+        rowa0 = state[ offset_to_index( instance0 )  ]
-                                      &wholeMatrix[rowa0*ROW_LEN_INT64],
+                & (unsigned int)(nRows-1);
-                                      &wholeMatrix[rowa1*ROW_LEN_INT64],
+        rowa1 = (state+4)[ offset_to_index( instance1 ) ]
-                                      &wholeMatrix[row*ROW_LEN_INT64], nCols );
+                & (unsigned int)(nRows-1);
 // matrix[prev] ie row 0,  is messed up after rdr for row 1. ok after rdr 0
 //printf("SV5 lane 1 instance1= %016lx, rowa1= %d\n",instance1,rowa1);
 printf("SV5 row= %d, step= %d\n",row,step);         
 printf("SV5 instance0 %016lx, rowa0 %d, p0 %016lx\n",instance0,rowa0,ptr0[ 2* rowa0 * ROW_LEN_INT64 ]);
 printf("SV5 instance1 %016lx, rowa1 %d, p1 %016lx\n",instance1,rowa1,ptr1[ 2* rowa1 * ROW_LEN_INT64 ]);
 uint64_t *p = &wholeMatrix[2*rowa1*ROW_LEN_INT64];
 printf("SV5 prev= %d\n",prev); 
 /*
 printf("SV5 M  %016lx %016lx %016lx %016lx\n",p[0],p[1],p[2],p[3]);
 printf("SV5 M  %016lx %016lx %016lx %016lx\n",p[4],p[5],p[6],p[7]);
 printf("SV5 M  %016lx %016lx %016lx %016lx\n",p[8],p[9],p[10],p[11]);
 printf("SV5 M  %016lx %016lx %016lx %016lx\n",p[12],p[13],p[14],p[15]);
 printf("SV5 M  %016lx %016lx %016lx %016lx\n",p[16],p[17],p[18],p[19]);
 printf("SV5 M  %016lx %016lx %016lx %016lx\n",p[20],p[21],p[22],p[23]);
 printf("SV5 M  %016lx %016lx %016lx %016lx\n",p[24],p[25],p[26],p[27]);
 printf("SV5 M  %016lx %016lx %016lx %016lx\n",p[28],p[29],p[30],p[31]);
 */
        reducedDuplexRow_2way( state, ptr, prev, rowa0, rowa1, row, nCols );
 /*
        reducedDuplexRow_2way( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
                                      &ptr0[ 2* rowa0 * ROW_LEN_INT64 ],
                                      &ptr1[ 2* rowa1 * ROW_LEN_INT64 ],
                               &wholeMatrix[ 2* row*ROW_LEN_INT64], nCols );
 */
 /*
 printf("SV6 %016lx %016lx %016lx %016lx\n",state[0],state[1],state[2],state[3]);
 printf("SV6 %016lx %016lx %016lx %016lx\n",state[4],state[5],state[6],state[7]);
 printf("SV6 %016lx %016lx %016lx %016lx\n",state[8],state[9],state[10],state[11]);
 printf("SV6 %016lx %016lx %016lx %016lx\n",state[12],state[13],state[14],state[15]);
 printf("SV6 %016lx %016lx %016lx %016lx\n",state[16],state[17],state[18],state[19]);
 printf("SV6 %016lx %016lx %016lx %016lx\n",state[20],state[21],state[22],state[23]);
 printf("SV6 %016lx %016lx %016lx %016lx\n",state[24],state[25],state[26],state[271]);
 printf("SV6 %016lx %016lx %016lx %016lx\n",state[28],state[29],state[30],state[31]);
 */        
 /*
           instance = state[instance & 0xF];
           rowa = state[instance & 0xF] & (unsigned int)(nRows-1);
@@ -378,13 +769,22 @@ int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
       } while ( row != 0 );
   }
-   absorbBlock( state, &wholeMatrix[rowa*ROW_LEN_INT64] );
+printf("SV7 %016lx %016lx %016lx %016lx\n",state[0],state[1],state[2],state[3]);
-   squeeze( state, K, (unsigned int) kLen );
+
 // rowa mismatches here so need to do a split read
   absorbBlock_2way( state, &wholeMatrix[2*rowa0*ROW_LEN_INT64] );
   squeeze_2way( state, K, (unsigned int) kLen );
   return 0;
 }
 #undef offset_to_index
 #endif // AVX512
 #if 0
 //////////////////////////////////////////////////
 int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
@@ -713,3 +1113,4 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
   return 0;
 }
 #endif
--- a/algo/lyra2/lyra2.h
+++ b/algo/lyra2/lyra2.h
@@ -60,4 +60,15 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,
 int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
 int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
               uint64_t pwdlen, const void *salt, uint64_t saltlen,
               uint64_t timeCost, uint64_t nRows, uint64_t nCols );
 //int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
 //        uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
 #endif
 #endif /* LYRA2_H_ */
--- a/algo/lyra2/lyra2rev3-4way.c
+++ b/algo/lyra2/lyra2rev3-4way.c
@@ -4,8 +4,212 @@
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
 #include "algo/cubehash/cubehash_sse2.h" 
 #include "algo/cubehash/cube-hash-2way.h"
-#if defined (LYRA2REV3_8WAY)
+#if defined (LYRA2REV3_16WAY)
 // Bundle of the hash contexts used by lyra2rev3_16way_hash().
 // Blake and BMW use 16-way contexts while CubeHash uses a 4-way context,
 // which is why the hash function runs CubeHash in four separate passes.
 typedef struct {
   blake256_16way_context     blake;
   cube_4way_context          cube;
   bmw256_16way_context       bmw;
 } lyra2v3_16way_ctx_holder;
 // Per-thread (__thread) template instance. It is set up by
 // init_lyra2rev3_16way_ctx() and copied into a local holder at the start of
 // every lyra2rev3_16way_hash() call.
 static __thread lyra2v3_16way_ctx_holder l2v3_16way_ctx;
 bool init_lyra2rev3_16way_ctx()
 {
   blake256_16way_init( &l2v3_16way_ctx.blake );
   cube_4way_init( &l2v3_16way_ctx.cube, 256, 16, 32 );
   bmw256_16way_init( &l2v3_16way_ctx.bmw );
   return true;
 }
 // Hash 16 interleaved inputs: Blake-256 -> Lyra2REv3 -> CubeHash ->
 // (Lyra2REv3, currently disabled) -> BMW-256.
 //
 //   state : receives the 16x32 interleaved output of bmw256_16way_close().
 //   input : interleaved block data; only the bytes starting at offset
 //           64*16 are fed to Blake here.
 //
 // NOTE(review): this is work-in-progress code. Only lanes 0 and 1 go
 // through LYRA2REV3_2WAY in the first Lyra2 stage, the second Lyra2 stage is
 // commented out entirely, and debug output is printed on every call.
 void lyra2rev3_16way_hash( void *state, const void *input )
 {
   uint32_t vhash[16*8] __attribute__ ((aligned (128)));
   uint32_t hash0[8] __attribute__ ((aligned (64)));
   uint32_t hash1[8] __attribute__ ((aligned (32)));
   uint32_t hash2[8] __attribute__ ((aligned (32)));
   uint32_t hash3[8] __attribute__ ((aligned (32)));
   uint32_t hash4[8] __attribute__ ((aligned (32)));
   uint32_t hash5[8] __attribute__ ((aligned (32)));
   uint32_t hash6[8] __attribute__ ((aligned (32)));
   uint32_t hash7[8] __attribute__ ((aligned (32)));
   uint32_t hash8[8] __attribute__ ((aligned (64)));
   uint32_t hash9[8] __attribute__ ((aligned (32)));
   uint32_t hash10[8] __attribute__ ((aligned (32)));
   uint32_t hash11[8] __attribute__ ((aligned (32)));
   uint32_t hash12[8] __attribute__ ((aligned (32)));
   uint32_t hash13[8] __attribute__ ((aligned (32)));
   uint32_t hash14[8] __attribute__ ((aligned (32)));
   uint32_t hash15[8] __attribute__ ((aligned (32)));
   // Work on a private copy so the thread-local template stays reusable.
   lyra2v3_16way_ctx_holder ctx __attribute__ ((aligned (64)));
   memcpy( &ctx, &l2v3_16way_ctx, sizeof(l2v3_16way_ctx) );
   // NOTE(review): arithmetic on a `const void *` relies on the GNU C
   // extension that treats sizeof(void) as 1. The scanhash caller currently
   // has its midstate update commented out, so presumably only these final
   // bytes are hashed at the moment -- confirm before relying on the output.
   blake256_16way_update( &ctx.blake, input + (64*16), 16 );
   blake256_16way_close( &ctx.blake, vhash );
   // Split the 16x32 interleaved Blake output into one buffer per lane.
   dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
           hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
           vhash, 256 );
 //printf("Lyra1 lane 0\n");
   // First Lyra2 stage: lanes 0 and 1 only; the remaining pairs are in the
   // commented-out block below.
   intrlv_2x256( vhash, hash0, hash1, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash0, 32, 1, 4, 4 );
 // Debug: run the scalar LYRA2REV3 on lane 1 into h and print it next to
 // hash0 for manual comparison.
 uint32_t h[8];
   LYRA2REV3( l2v3_wholeMatrix, h, 32, hash1, 32, hash1, 32, 1, 4, 4 );
 printf("S: %08x %08x %08x %08x %08x %08x %08x %08x\n",hash0[0],hash0[1],hash0[2],hash0[3],hash0[4],hash0[5],hash0[6],hash0[7]);
 printf("V: %08x %08x %08x %08x %08x %08x %08x %08x\n",h[0],h[1],h[2],h[3],h[4],h[5],h[6],h[7]);
 printf("\n");
 //printf("Lyra1 lane 2\n");
   dintrlv_2x256( hash0, hash1, vhash, 256 );
 /*
   intrlv_2x256( vhash, hash2, hash3, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash2, 32, 1, 4, 4 );
   dintrlv_2x256( hash2, hash3, vhash, 256 );
   intrlv_2x256( vhash, hash4, hash5, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash4, 32, 1, 4, 4 );
   dintrlv_2x256( hash4, hash5, vhash, 256 );
   intrlv_2x256( vhash, hash6, hash7, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash6, 32, 1, 4, 4 );
   dintrlv_2x256( hash6, hash7, vhash, 256 );
   intrlv_2x256( vhash, hash8, hash9, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash8, 32, 1, 4, 4 );
   dintrlv_2x256( hash8, hash9, vhash, 256 );
   intrlv_2x256( vhash, hash10, hash11, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash10, 32, 1, 4, 4 );
   dintrlv_2x256( hash10, hash11, vhash, 256 );
   intrlv_2x256( vhash, hash12, hash13, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash12, 32, 1, 4, 4 );
   dintrlv_2x256( hash12, hash13, vhash, 256 );
   intrlv_2x256( vhash, hash14, hash15, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash14, 32, 1, 4, 4 );
   dintrlv_2x256( hash14, hash15, vhash, 256 );
 */
 //printf("cube\n");
   // CubeHash stage, four lanes per pass. The first pass uses the context
   // copied from the template; later passes re-initialise it first.
   intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 256 );
   cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
   dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 256 );
   intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 256 );
   cube_4way_init( &ctx.cube, 256, 16, 32 );
   cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
   dintrlv_4x128( hash4, hash5, hash6, hash7, vhash, 256 );
   intrlv_4x128( vhash, hash8, hash9, hash10, hash11, 256 );
   cube_4way_init( &ctx.cube, 256, 16, 32 );
   cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
   dintrlv_4x128( hash8, hash9, hash10, hash11, vhash, 256 );
   intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 );
   cube_4way_init( &ctx.cube, 256, 16, 32 );
   cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
   dintrlv_4x128( hash12, hash13, hash14, hash15, vhash, 256 );
 //printf("Lyra2...\n");
 // Second Lyra2 stage: disabled for all lanes in this revision.
 /*
   intrlv_2x256( vhash, hash0, hash1, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash0, 32, 1, 4, 4 );
   dintrlv_2x256( hash0, hash1, vhash, 256 );
   intrlv_2x256( vhash, hash2, hash3, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash2, 32, 1, 4, 4 );
   dintrlv_2x256( hash2, hash3, vhash, 256 );
   intrlv_2x256( vhash, hash4, hash5, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash4, 32, 1, 4, 4 );
   dintrlv_2x256( hash4, hash5, vhash, 256 );
   intrlv_2x256( vhash, hash6, hash7, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash6, 32, 1, 4, 4 );
   dintrlv_2x256( hash6, hash7, vhash, 256 );
   intrlv_2x256( vhash, hash8, hash9, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash8, 32, 1, 4, 4 );
   dintrlv_2x256( hash8, hash9, vhash, 256 );
   intrlv_2x256( vhash, hash10, hash11, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash10, 32, 1, 4, 4 );
   dintrlv_2x256( hash10, hash11, vhash, 256 );
   intrlv_2x256( vhash, hash12, hash13, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash12, 32, 1, 4, 4 );
   dintrlv_2x256( hash12, hash13, vhash, 256 );
   intrlv_2x256( vhash, hash14, hash15, 256 );
   LYRA2REV3_2WAY( l2v3_wholeMatrix, vhash, 32, vhash, 32, hash14, 32, 1, 4, 4 );
   dintrlv_2x256( hash14, hash15, vhash, 256 );
 */
   // Re-interleave all 16 lanes for the final 16-way BMW-256 pass.
   intrlv_16x32( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
             hash7, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
             hash15, 256 );
 //printf("bmw\n");
   bmw256_16way_update( &ctx.bmw, vhash, 32 );
   bmw256_16way_close( &ctx.bmw, state );
 //printf("done\n"); 
 }
 // Scan nonces 16 at a time with lyra2rev3_16way_hash().
 //
 //   work        : job; work->data[19] is the starting nonce and is updated
 //                 as the scan advances, work->target is the 8-word target.
 //   max_nonce   : scanning stops once n reaches max_nonce-16.
 //   hashes_done : receives the number of nonces covered by this call.
 //   mythr       : calling thread; its id selects the work_restart flag.
 //
 // Every lane whose top hash word passes the quick Htarg filter is extracted
 // and checked with fulltest(); passing lanes are submitted through
 // submit_lane_solution(). Always returns 0.
 int scanhash_lyra2rev3_16way( struct work *work, const uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[8*16] __attribute__ ((aligned (128)));
   uint32_t vdata[20*16] __attribute__ ((aligned (64)));
   // The hash output is 16x32 interleaved: word w of lane l lives at
   // hash[ w*16 + l ], so word 7 of all lanes starts at 7*16 (7<<4).
   // The previous 7<<3 offset is the 8-way layout and made the quick
   // filter below compare the wrong words.
   uint32_t *hash7 = &hash[7<<4];
   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   const uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   const uint32_t Htarg = ptarget[7];
   __m512i  *noncev = (__m512i*)vdata + 19;   // aligned
   const int thr_id = mythr->id;
   if ( opt_benchmark )  ( (uint32_t*)ptarget )[7] = 0x0000ff;
   mm512_bswap32_intrlv80_16x32( vdata, pdata );
   blake256_16way_init( &l2v3_16way_ctx.blake );
 //   blake256_16way_update( &l2v3_16way_ctx.blake, vdata, 64 );
   do
   {
      // Lane i hashes nonce n+i.
      *noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12,
                                                  n+11, n+10, n+ 9, n+ 8,
                                                  n+ 7, n+ 6, n+ 5, n+ 4,
                                                  n+ 3, n+ 2, n+ 1, n ) );
      lyra2rev3_16way_hash( hash, vdata );
      pdata[19] = n;
      for ( int lane = 0; lane < 16; lane++ )
      {
         if ( unlikely( hash7[lane] <= Htarg ) )
         {
            extr_lane_16x32( lane_hash, hash, lane, 256 );
            if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
            {
                pdata[19] = n + lane;
                submit_lane_solution( work, lane_hash, mythr, lane );
            }
         }
      }
      n += 16;
   } while ( likely( (n < max_nonce-16) && !work_restart[thr_id].restart ) );
   *hashes_done = n - first_nonce;
   return 0;
 }
 #elif defined (LYRA2REV3_8WAY)
 typedef struct {
   blake256_8way_context     blake;
--- a/algo/lyra2/sponge-2way.c
+++ b/algo/lyra2/sponge-2way.c
@@ -19,7 +19,7 @@
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
-#include "algo-gate.h"
+//#include "algo-gate.h"
 #include <string.h>
 #include <stdio.h>
 #include <time.h>
@@ -31,21 +31,31 @@
 inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
 {
-    const int len_m256i = len / 32;
+    const int fullBlocks = len / 32;
    const int fullBlocks = len_m256i / BLOCK_LEN_M256I;
    __m512i* state = (__m512i*)State;
    __m512i* out   = (__m512i*)Out;
    int i;
 //printf("squeeze 1, len= %d, full  %d\n", len,fullBlocks);
    //Squeezes full blocks
    for ( i = 0; i < fullBlocks; i++ )
    {
 //printf("squeeze 1, %d\n",i);
       memcpy_512( out, state, BLOCK_LEN_M256I*2 );
-       LYRA_ROUND_2WAY_AVX2( state[0], state[1], state[2], state[3] );
+
-       out += BLOCK_LEN_M256I*2;
+//printf("squeeze 2\n");
       LYRA_ROUND_2WAY_AVX512( state[0], state[1], state[2], state[3] );
 //printf("squeeze 2\n");
       out += BLOCK_LEN_M256I;
    }
    //Squeezes remaining bytes
-    memcpy_512( out, state, ( (len_m256i % BLOCK_LEN_M256I) * 2 ) );
+//    memcpy_512( out, state, ( (len * 2 ) );
 }
 inline void absorbBlock_2way( uint64_t *State, const uint64_t *In ) 
@@ -90,7 +100,7 @@ inline void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In,
    state1 = _mm512_xor_si512( state1, in[1] );
    LYRA_12_ROUNDS_2WAY_AVX512( state0, state1, state2, state3 );
-    In += block_len * 2;
+    In += block_len*2;
  }
  _mm512_store_si512( (__m512i*)State,     state0 );
@@ -109,7 +119,7 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
    register __m512i state0, state1, state2, state3;
-    __m512i* out   = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 );
+    __m512i* out   = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
    state0 = _mm512_load_si512( (__m512i*)State     );
    state1 = _mm512_load_si512( (__m512i*)State + 1 );
@@ -132,7 +142,7 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
       out[2] = state2;
       //Goes to next block (column) that will receive the squeezed data
-       out -= BLOCK_LEN_M256I * 2;
+       out -= BLOCK_LEN_M256I;
       LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
    }
@@ -143,15 +153,14 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
    _mm512_store_si512( (__m512i*)State + 3, state3 );
 }
 // This function has to deal with gathering 2 256 bit rowin vectors from
 // non-contiguous memory. Extra work and performance penalty.
 inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
                 uint64_t *rowOut, uint64_t nCols )
 {
    int i;
    register __m512i state0, state1, state2, state3;
-    __m512i *in = (__m256i*)rowIn;
+    __m512i *in = (__m512i*)rowIn;
    __m512i *out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
    state0 = _mm512_load_si512( (__m512i*)State     );
    state1 = _mm512_load_si512( (__m512i*)State + 1 );
@@ -171,17 +180,15 @@ inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
         out[2] = _mm512_xor_si512( state2, in[2] );
         //Input: next column (i.e., next block in sequence)
-         in0 += BLOCK_LEN_M256I;
+         in += BLOCK_LEN_M256I;
         in1 += BLOCK_LEN_M256I;
         //Output: goes to previous column
-         out -= BLOCK_LEN_M256I * 2;
+         out -= BLOCK_LEN_M256I;
    }
-    _mm512_store_si256( (__m512i*)State,     state0 );
+    _mm512_store_si512( (__m512i*)State,     state0 );
-    _mm512_store_si256( (__m512i*)State + 1, state1 );
+    _mm512_store_si512( (__m512i*)State + 1, state1 );
-    _mm512_store_si256( (__m512i*)State + 2, state2 );
+    _mm512_store_si512( (__m512i*)State + 2, state2 );
-    _mm512_store_si256( (__m512i*)State + 3, state3 );
+    _mm512_store_si512( (__m512i*)State + 3, state3 );
   }
 }
 inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
@@ -192,7 +199,7 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
    register __m512i state0, state1, state2, state3;
    __m512i* in    = (__m512i*)rowIn;
    __m512i* inout = (__m512i*)rowInOut;
-    __m512i* out   = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I * 2 );
+    __m512i* out   = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
    __m512i  t0, t1, t2;
    state0 = _mm512_load_si512( (__m512i*)State     );
@@ -209,7 +216,7 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
       state2 = _mm512_xor_si512( state2,
                                  _mm512_add_epi64( in[2], inout[2] ) );
-       LYRA_ROUND_2WAY AVX512( state0, state1, state2, state3 );
+       LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
       out[0] = _mm512_xor_si512( state0, in[0] );
       out[1] = _mm512_xor_si512( state1, in[1] );
@@ -221,17 +228,17 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
       t2 = _mm512_permutex_epi64( state2, 0x93 );
       inout[0] = _mm512_xor_si512( inout[0],
-                                 _mm512_mask_blend_epi32( t0, t2, 0x03 ) );
+                                 _mm512_mask_blend_epi32( 0x03, t0, t2 ) );
       inout[1] = _mm512_xor_si512( inout[1],
-                                 _mm512_mask_blend_epi32( t1, t0, 0x03 ) );
+                                 _mm512_mask_blend_epi32( 0x03, t1, t0 ) );
       inout[2] = _mm512_xor_si512( inout[2],
-                                 _mm512_mask_blend_epi32( t2, t1, 0x03 ) );
+                                 _mm512_mask_blend_epi32( 0x03, t2, t1 ) );
       //Inputs: next column (i.e., next block in sequence)
-       in    += BLOCK_LEN_M256I * 2;
+       in    += BLOCK_LEN_M256I;
-       inout += BLOCK_LEN_M256I * 2;
+       inout += BLOCK_LEN_M256I;
       //Output: goes to previous column
-       out   -= BLOCK_LEN_M256I * 2;
+       out   -= BLOCK_LEN_M256I;
    }
    _mm512_store_si512( (__m512i*)State,     state0 );
@@ -240,53 +247,99 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
    _mm512_store_si512( (__m512i*)State + 3, state3 );
 }
-inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn1,
+// big ugly workaround for pointer aliasing, use a union of pointers.
-                uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut,
+// Access matrix using m512i for in and out, m256i for inout
-                uint64_t nCols )
+inline void reducedDuplexRow_2way( uint64_t *State, povly matrix,
                                   uint64_t rowIn,
                                   uint64_t rowInOut0, uint64_t rowInOut1,
                                   uint64_t rowOut, uint64_t nCols )
 {
   int i;
-   register __m512i state0, state1, state2, state3;
+   const uint64_t ROW_LEN_M256I = BLOCK_LEN_INT64 * nCols / 4;
-    __m256i *in0 = (__m256i*)rowIn0;
+   __m512i state0, state1, state2, state3;
-    __m256i *in0 = (__m256i*)rowIn0;
+//   register __m512i state0, state1, state2, state3;
-    __m2512* in    = (__m512i*)rowIn;
+   __m512i *in = &matrix.v512[ rowIn * ROW_LEN_M256I ];
-    __m2512* inout = (__m512i*)rowInOut;
+   __m256i *inout0 = &matrix.v256[ 2 * rowInOut0 * ROW_LEN_M256I ];
-    __m512i* out   = (__m512i*)rowOut;
+   __m256i *inout1 = &matrix.v256[ 2 * rowInOut1 * ROW_LEN_M256I ];
-    __m512i  t0, t1, t2;
+   __m512i *out   = &matrix.v512[ rowOut * ROW_LEN_M256I ];
    __m512i io[3];
   povly inout;
   inout.v512 = &io[0];
    __m512i t0, t1, t2;
    _mm_prefetch( in0,     _MM_HINT_T0 );
    _mm_prefetch( in1,     _MM_HINT_T0 );
    _mm_prefetch( in0 + 2, _MM_HINT_T0 );
    _mm_prefetch( in1 + 2, _MM_HINT_T0 );
    _mm_prefetch( in0 + 4, _MM_HINT_T0 );
    _mm_prefetch( in1 + 4, _MM_HINT_T0 );
    _mm_prefetch( in0 + 6, _MM_HINT_T0 );
    _mm_prefetch( in1 + 6, _MM_HINT_T0 );
   state0 = _mm512_load_si512( (__m512i*)State     );
   state1 = _mm512_load_si512( (__m512i*)State + 1 );
   state2 = _mm512_load_si512( (__m512i*)State + 2 );
   state3 = _mm512_load_si512( (__m512i*)State + 3 );
    _mm_prefetch( in,     _MM_HINT_T0 );
    _mm_prefetch( inout0,     _MM_HINT_T0 );
    _mm_prefetch( inout1,     _MM_HINT_T0 );
    _mm_prefetch( in     + 2, _MM_HINT_T0 );
    _mm_prefetch( inout0 + 2, _MM_HINT_T0 );
    _mm_prefetch( inout1 + 2, _MM_HINT_T0 );
    _mm_prefetch( in     + 4, _MM_HINT_T0 );
    _mm_prefetch( inout0 + 4, _MM_HINT_T0 );
    _mm_prefetch( inout1 + 4, _MM_HINT_T0 );
    _mm_prefetch( in     + 6, _MM_HINT_T0 );
    _mm_prefetch( inout0 + 6, _MM_HINT_T0 );
    _mm_prefetch( inout1 + 6, _MM_HINT_T0 );
 //uint64_t *ii = (uint64_t*)in0;
 //printf("RDRV0 IO %016lx %016lx %016lx %016lx\n",ii[0],ii[1],ii[2],ii[3]);
    for ( i = 0; i < nCols; i++ )
    {
 /*       
 //printf("RDR: loop %d\n",i);
 uint64_t *io1 = (uint64_t*)inout1;
 printf("RDRV0 col= %d\n", i);
 printf("RDRV0 IO1 %016lx %016lx %016lx %016lx\n",io1[0],io1[1],io1[2],io1[3]);
 printf("RDRV0 IO1 %016lx %016lx %016lx %016lx\n",io1[4],io1[5],io1[6],io1[7]);
 printf("RDRV0 IO1 %016lx %016lx %016lx %016lx\n",io1[8],io1[9],io1[10],io1[11]);
 printf("RDRV0 IO1 %016lx %016lx %016lx %016lx\n",io1[12],io1[13],io1[14],io1[15]);
 */
      //Absorbing "M[prev] [+] M[row*]"
      inout.v256[0] = inout0[0];
      inout.v256[1] = inout1[1];
      inout.v256[2] = inout0[2];
      inout.v256[3] = inout1[3];
      inout.v256[4] = inout0[4];
      inout.v256[5] = inout1[5];
 /*      
 uint64_t *io = (uint64_t*)inout.u64;
 uint64_t *ii = (uint64_t*)in;
 printf("RDRV1 col= %d\n", i);
 printf("RDRV1 IO %016lx %016lx %016lx %016lx\n",io[0],io[1],io[2],io[3]);
 printf("RDRV1 IO %016lx %016lx %016lx %016lx\n",io[4],io[5],io[6],io[7]);
 printf("RDRV1 IO %016lx %016lx %016lx %016lx\n",io[8],io[9],io[10],io[11]);
 printf("RDRV1 IO %016lx %016lx %016lx %016lx\n",io[12],io[13],io[14],io[15]);
 printf("RDRV1 IN %016lx %016lx %016lx %016lx\n",ii[0],ii[1],ii[2],ii[3]);
 printf("RDRV1 IN %016lx %016lx %016lx %016lx\n",ii[4],ii[5],ii[6],ii[7]);
 printf("RDRV1 IN %016lx %016lx %016lx %016lx\n",ii[8],ii[9],ii[10],ii[11]);
 printf("RDRV1 IN %016lx %016lx %016lx %016lx\n",ii[12],ii[13],ii[14],ii[15]);
 */
 //         state0 = _mm512_xor_si512( state0, mm512_concat_256( in1[0], in0[0] );
 //         state1 = _mm512_xor_si512( state1, mm512_concat_256( in1[1], in0[1] );
 //         state2 = _mm512_xor_si512( state2, mm512_concat_256( in1[2], in0[2] );
      t0 = mm512_concat_256( in1[0], in0[0] );
      t1 = mm512_concat_256( in1[1], in0[1] );
      t2 = mm512_concat_256( in1[2], in0[2] );
      state0 = _mm512_xor_si512( state0,
-                                     _mm512_add_epi64( t0, inout[0] ) );
+                                 _mm512_add_epi64( in[0], inout.v512[0] ) );
      state1 = _mm512_xor_si512( state1,
-                                     _mm512_add_epi64( t1, inout[1] ) );
+                                 _mm512_add_epi64( in[1], inout.v512[1] ) );
      state2 = _mm512_xor_si512( state2,
-                                     _mm512_add_epi64( t2, inout[2] ) );
+                                 _mm512_add_epi64( in[2], inout.v512[2] ) );
 //printf("RDR: round\n");
      //Applies the reduced-round transformation f to the sponge's state
      LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
 //printf("RDR 3\n");
      //M[rowOut][col] = M[rowOut][col] XOR rand
      out[0] = _mm512_xor_si512( out[0], state0 );
      out[1] = _mm512_xor_si512( out[1], state1 );
@@ -296,18 +349,76 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn1,
      t0 = _mm512_permutex_epi64( state0, 0x93 );
      t1 = _mm512_permutex_epi64( state1, 0x93 );
      t2 = _mm512_permutex_epi64( state2, 0x93 );
 /*
 uint64_t *st = (uint64_t*)&state0;
 printf("RDRV2 %016lx %016lx %016lx %016lx\n",st[0],st[1],st[2],st[3]);
 printf("RDRv2 %016lx %016lx %016lx %016lx\n",st[4],st[5],st[6],st[7]);
 st = (uint64_t*)&state1;
 printf("RDRV2 %016lx %016lx %016lx %016lx\n",st[0],st[1],st[2],st[3]);
 printf("RDRv2 %016lx %016lx %016lx %016lx\n",st[4],st[5],st[6],st[7]);
 st = (uint64_t*)&state2;
 printf("RDRV2 %016lx %016lx %016lx %016lx\n",st[0],st[1],st[2],st[3]);
 printf("RDRv2 %016lx %016lx %016lx %016lx\n",st[4],st[5],st[6],st[7]);
-      inout[0] = _mm512_xor_si512( inout[0],
+st = (uint64_t*)&t0;
-                                   _mm512_mask_blend_epi32( t0, t2, 0x03 ) );
+printf("RDRV2 t0 %016lx %016lx %016lx %016lx\n",st[0],st[1],st[2],st[3]);
-      inout[1] = _mm512_xor_si512( inout[1],
+printf("RDRv2 t0 %016lx %016lx %016lx %016lx\n",st[4],st[5],st[6],st[7]);
-                                   _mm512_mask_blend_epi32( t1, t0, 0x03 ) );
+st = (uint64_t*)&t1;
-      inout[2] = _mm512_xor_si512( inout[2],
+printf("RDRV2 t1 %016lx %016lx %016lx %016lx\n",st[0],st[1],st[2],st[3]);
-                                   _mm512_mask_blend_epi32( t2, t1, 0x03 ) );
+printf("RDRv2 t1 %016lx %016lx %016lx %016lx\n",st[4],st[5],st[6],st[7]);
 st = (uint64_t*)&t2;
 printf("RDRV2 t2 %016lx %016lx %016lx %016lx\n",st[0],st[1],st[2],st[3]);
 printf("RDRv2 t2 %016lx %016lx %016lx %016lx\n",st[4],st[5],st[6],st[7]);
 */
 /*
 printf("RDRV2 %016lx %016lx %016lx %016lx\n",st[8],st[9],st[10],st[11]);
 printf("RDRV2 %016lx %016lx %016lx %016lx\n",st[12],st[13],st[14],st[15]);
 printf("RDRV2 %016lx %016lx %016lx %016lx\n",st[16],st[17],st[18],st[19]);
 printf("RDRV2 %016lx %016lx %016lx %016lx\n",st[20],st[21],st[22],st[23]);
 printf("RDRV2 %016lx %016lx %016lx %016lx\n",st[24],st[25],st[26],st[27]);
 printf("RDRV2 %016lx %016lx %016lx %016lx\n",st[28],st[29],st[30],st[31]);
 */
 //printf("RDR 4\n");    
 /*
 //uint64_t *io = (uint64_t*)&inout;
 printf("RDRV1 col= %d\n", i);
 printf("RDRV1 IO %016lx %016lx %016lx %016lx\n",io[0],io[1],io[2],io[3]);
 printf("RDRV1 IO %016lx %016lx %016lx %016lx\n",io[4],io[5],io[6],io[7]);
 printf("RDRV1 IO %016lx %016lx %016lx %016lx\n",io[8],io[9],io[10],io[11]);
 printf("RDRV1 IO %016lx %016lx %016lx %016lx\n",io[12],io[13],io[14],io[15]);
 */
 // need to split inout for write
      inout.v512[0] = _mm512_xor_si512( inout.v512[0],
                                   _mm512_mask_blend_epi32( 0x03, t0, t2 ) );
      inout.v512[1] = _mm512_xor_si512( inout.v512[1],
                                   _mm512_mask_blend_epi32( 0x03, t1, t0 ) );
      inout.v512[2] = _mm512_xor_si512( inout.v512[2],
                                   _mm512_mask_blend_epi32( 0x03, t2, t1 ) );
 /*
 printf("RDRV3 IO %016lx %016lx %016lx %016lx\n",io[0],io[1],io[2],io[3]);
 printf("RDRV3 IO %016lx %016lx %016lx %016lx\n",io[4],io[5],io[6],io[7]);
 printf("RDRV3 IO %016lx %016lx %016lx %016lx\n",io[8],io[9],io[10],io[11]);
 printf("RDRV3 IO %016lx %016lx %016lx %016lx\n",io[12],io[13],io[14],io[15]);
 */    
      inout0[0] = inout.v256[0];
      inout1[1] = inout.v256[1];
      inout0[2] = inout.v256[2];
      inout1[3] = inout.v256[3];
      inout0[4] = inout.v256[4];
      inout1[5] = inout.v256[5];
 //printf("RDR 5\n"); 
       //Goes to next block
-       in    += BLOCK_LEN_M256I * 2;
+       in     += BLOCK_LEN_M256I;
-       out   += BLOCK_LEN_M256I * 2;
+       inout0 += BLOCK_LEN_M256I * 2;
-       inout += BLOCK_LEN_M256I * 2;
+       inout1 += BLOCK_LEN_M256I * 2;
       out    += BLOCK_LEN_M256I;
   }
   _mm512_store_si512( (__m512i*)State,     state0 );
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -65,14 +65,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
   b = mm512_ror_64( _mm512_xor_si512( b, c ), 63 );
 #define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
-   G_4X64( s0, s1, s2, s3 ); \
+   G2W_4X64( s0, s1, s2, s3 ); \
-   s1 = mm512_ror_1x64( s1); \
+   s1 = mm512_ror256_64( s1); \
-   s2 = mm512_swap128_256( s2 ); \
+   s2 = mm512_swap256_128( s2 ); \
-   s3 = mm512_rol1x64_256( s3 ); \
+   s3 = mm512_rol256_64( s3 ); \
-   G_4X64( s0, s1, s2, s3 ); \
+   G2W_4X64( s0, s1, s2, s3 ); \
-   s1 = mm512_rol1x64_256( s1 ); \
+   s1 = mm512_rol256_64( s1 ); \
-   s2 = mm512_swap128_256( s2 ); \
+   s2 = mm512_swap256_128( s2 ); \
-   s3 = mm512_ror1x64_256( s3 );
+   s3 = mm512_ror256_64( s3 );
 #define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
@@ -148,14 +148,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
 #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   G_2X64( s0, s2, s4, s6 ); \
   G_2X64( s1, s3, s5, s7 ); \
-   mm128_ror1x64_256( s2, s3 ); \
+   mm128_ror256_64( s2, s3 ); \
-   mm128_swap128_256( s4, s5 ); \
+   mm128_swap256_128( s4, s5 ); \
-   mm128_rol1x64_256( s6, s7 ); \
+   mm128_rol256_64( s6, s7 ); \
   G_2X64( s0, s2, s4, s6 ); \
   G_2X64( s1, s3, s5, s7 ); \
-   mm128_rol1x64_256( s2, s3 ); \
+   mm128_rol256_64( s2, s3 ); \
-   mm128_swap128_256( s4, s5 ); \
+   mm128_swap256_128( s4, s5 ); \
-   mm128_ror1x64_256( s6, s7 );
+   mm128_ror256_64( s6, s7 );
 #define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
@@ -220,7 +220,23 @@ void reducedDuplexRow1_2way( uint64_t *state, uint64_t *rowIn,
                             uint64_t *rowOut, uint64_t nCols);
 void reducedDuplexRowSetup_2way( uint64_t *state, uint64_t *rowIn,
                    uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols );
-void reducedDuplexRow_2way(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn0, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
+/*
 void reducedDuplexRow_2way( uint64_t *state, uint64_t *rowIn,
                            uint64_t *rowInOut0, uint64_t *rowInOut1,
                            uint64_t *rowOut, uint64_t nCols);
 */
 // Pointer overlay: one address that can be viewed as a pointer to 512-bit
 // vectors, 256-bit vectors or 64-bit words without casting at each use.
 union _povly
 {
   __m512i *v512;
   __m256i *v256;
   uint64_t *u64;
 };
 typedef union _povly povly;
 // 2-way reduced duplex row. The matrix is passed as a pointer overlay and
 // the rows are given as row numbers (not pointers): one input row, two
 // in/out rows (rowInOut0, rowInOut1) and one output row.
 void reducedDuplexRow_2way( uint64_t *state, povly matrix, uint64_t rowIn,
                             uint64_t rowInOut0, uint64_t rowInOut1,
                             uint64_t rowOut, uint64_t nCols);
 #endif
--- a/algo/qubit/qubit-2way.c
+++ b/algo/qubit/qubit-2way.c
@@ -92,7 +92,6 @@ int scanhash_qubit_4way( struct work *work,uint32_t max_nonce,
 {
     uint32_t hash[4*8] __attribute__ ((aligned (128)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
     uint32_t endiandata[20] __attribute__((aligned(64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
--- a/algo/sha/sha-hash-4way.h
+++ b/algo/sha/sha-hash-4way.h
@@ -56,7 +56,7 @@ typedef struct {
   __m128i val[8];
   uint32_t count_high, count_low;
   bool initialized;
-} sha256_4way_context;
+} sha256_4way_context __attribute__ ((aligned (64)));
 void sha256_4way_init( sha256_4way_context *sc );
 void sha256_4way( sha256_4way_context *sc, const void *data, size_t len );
@@ -71,7 +71,7 @@ typedef struct {
   __m256i val[8];
   uint32_t count_high, count_low;
   bool initialized;
-} sha256_8way_context;
+} sha256_8way_context __attribute__ ((aligned (128)));
 void sha256_8way_init( sha256_8way_context *sc );
 void sha256_8way( sha256_8way_context *sc, const void *data, size_t len );
@@ -86,30 +86,32 @@ typedef struct {
   __m256i val[8];
   uint64_t count;
   bool initialized;
-} sha512_4way_context;
+} sha512_4way_context __attribute__ ((aligned (128)));
 void sha512_4way_init( sha512_4way_context *sc);
-void sha512_4way( sha512_4way_context *sc, const void *data, size_t len );
+void sha512_4way_update( sha512_4way_context *sc, const void *data,
                         size_t len );
 #define sha512_4way sha512_4way_update
 void sha512_4way_close( sha512_4way_context *sc, void *dst );
-// SHA-256 11 way hybrid
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-// Combines AVX2, MMX and scalar data to do 8 + 2 + 1 parallel.
+
 // SHA-512 8 way
 typedef struct {
-   __m256i  bufx[64>>2];
+   __m512i buf[128>>3];
-   __m256i  valx[8];
+   __m512i val[8];
-   __m64    bufy[64>>2];
+   uint64_t count;
-   __m64    valy[8];
+   bool initialized;
-   uint32_t bufz[64>>2];
+} sha512_8way_context __attribute__ ((aligned (128)));
   uint32_t valz[8];
   uint32_t count_high, count_low;
 } sha256_11way_context;
-void sha256_11way_init( sha256_11way_context *ctx );
+void sha512_8way_init( sha512_8way_context *sc);
-void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
+void sha512_8way_update( sha512_8way_context *sc, const void *data, 
-	                 const void *datay, const void *dataz, size_t len );
+                         size_t len );
-void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dstyx,
+void sha512_8way_close( sha512_8way_context *sc, void *dst );
 	                 void *dstz  );
 #endif  // AVX512
 #endif  // __AVX2__
 #endif  // __SSE2__
 #endif  // SHA256_4WAY_H__
--- a/algo/sha/sha512-hash-4way.c
+++ b/algo/sha/sha512-hash-4way.c
@@ -36,8 +36,6 @@
 #include <string.h>
 #include "sha-hash-4way.h"
 // SHA-512 4 way 64 bit
 /*
 static const sph_u64 H512[8] = {
        SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
@@ -90,6 +88,236 @@ static const sph_u64 K512[80] = {
 	SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
 };
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
 // SHA-512 8 way 64 bit
 // Bitwise "choose" on 512 bit vectors: each result bit is taken from Y where
 // the matching bit of X is set and from Z where it is clear.
 // Evaluated as ((Y ^ Z) & X) ^ Z.
 #define CH8W(X, Y, Z) \
    _mm512_xor_si512( \
       _mm512_and_si512( _mm512_xor_si512( Y, Z ), X ), \
       Z )
 // Bitwise "majority" on 512 bit vectors: (X & Y) | ((X | Y) & Z).
 #define MAJ8W(X, Y, Z) \
    _mm512_or_si512( \
       _mm512_and_si512( X, Y ), \
       _mm512_and_si512( _mm512_or_si512( X, Y ), Z ) )
 // SHA-512 sigma functions applied to a whole __m512i.
 // mm512_ror_64 is defined elsewhere; presumably a per 64 bit lane rotate
 // right - confirm against its definition.
 // Big sigma 0: ror 28 ^ ror 34 ^ ror 39.
 #define BSG8W_5_0(x) \
    _mm512_xor_si512( \
       _mm512_xor_si512( mm512_ror_64(x, 28), mm512_ror_64(x, 34) ), \
       mm512_ror_64(x, 39) )
 // Big sigma 1: ror 14 ^ ror 18 ^ ror 41.
 #define BSG8W_5_1(x) \
    _mm512_xor_si512( \
       _mm512_xor_si512( mm512_ror_64(x, 14), mm512_ror_64(x, 18) ), \
       mm512_ror_64(x, 41) )
 // Small sigma 0: ror 1 ^ ror 8 ^ (logical shift right 7).
 #define SSG8W_5_0(x) \
    _mm512_xor_si512( \
       _mm512_xor_si512( mm512_ror_64(x,  1), mm512_ror_64(x,  8) ), \
       _mm512_srli_epi64(x, 7) )
 // Small sigma 1: ror 19 ^ ror 61 ^ (logical shift right 6).
 #define SSG8W_5_1(x) \
    _mm512_xor_si512( \
       _mm512_xor_si512( mm512_ror_64(x, 19), mm512_ror_64(x, 61) ), \
       _mm512_srli_epi64(x, 6) )
 // Message schedule helper: returns
 //    ( ror(w0,1) ^ ror(w0,8) ^ (w0 >> 7) ) + ( ror(w1,19) ^ ror(w1,61) ^ (w1 >> 6) )
 // using 64 bit lane arithmetic, i.e. the same terms as SSG8W_5_0( w0 ) and
 // SSG8W_5_1( w1 ) added together.
 static inline __m512i ssg8w_512_add( __m512i w0, __m512i w1 )
 {
    const __m512i sig0 = _mm512_xor_si512(
                            _mm512_xor_si512( mm512_ror_64( w0, 1 ),
                                              mm512_ror_64( w0, 8 ) ),
                            _mm512_srli_epi64( w0, 7 ) );
    const __m512i sig1 = _mm512_xor_si512(
                            _mm512_xor_si512( mm512_ror_64( w1,19 ),
                                              mm512_ror_64( w1,61 ) ),
                            _mm512_srli_epi64( w1, 6 ) );
    return _mm512_add_epi64( sig0, sig1 );
 }
 // Small sigma 0 for two consecutive schedule words in one go:
 //    w0 = ror(W[i-15],1) ^ ror(W[i-15],8) ^ (W[i-15] >> 7)
 //    w1 = the same for W[i-14]
 // An array named W must be visible where the macro is expanded.
 // The index argument is parenthesised so that an expression with low
 // precedence operators (for example `j << 1`) still indexes correctly.
 #define SSG8W_512x2_0( w0, w1, i ) do \
 { \
   __m512i X0a, X1a, X0b, X1b; \
  X0a = mm512_ror_64( W[(i)-15], 1 ); \
  X1a = mm512_ror_64( W[(i)-14], 1 ); \
  X0b = mm512_ror_64( W[(i)-15], 8 ); \
  X1b = mm512_ror_64( W[(i)-14], 8 ); \
  X0a = _mm512_xor_si512( X0a, X0b ); \
  X1a = _mm512_xor_si512( X1a, X1b ); \
  X0b = _mm512_srli_epi64( W[(i)-15], 7 ); \
  X1b = _mm512_srli_epi64( W[(i)-14], 7 ); \
  w0  = _mm512_xor_si512( X0a, X0b ); \
  w1  = _mm512_xor_si512( X1a, X1b ); \
 } while(0)
 // Small sigma 1 for two consecutive schedule words in one go:
 //    w0 = ror(W[i-2],19) ^ ror(W[i-2],61) ^ (W[i-2] >> 6)
 //    w1 = the same for W[i-1]
 // An array named W must be visible where the macro is expanded.
 // The index argument is parenthesised so that an expression with low
 // precedence operators (for example `j << 1`) still indexes correctly.
 #define SSG8W_512x2_1( w0, w1, i ) do \
 { \
   __m512i X0a, X1a, X0b, X1b; \
  X0a = mm512_ror_64( W[(i)-2],19 ); \
  X1a = mm512_ror_64( W[(i)-1],19 ); \
  X0b = mm512_ror_64( W[(i)-2],61 ); \
  X1b = mm512_ror_64( W[(i)-1],61 ); \
  X0a = _mm512_xor_si512( X0a, X0b ); \
  X1a = _mm512_xor_si512( X1a, X1b ); \
  X0b = _mm512_srli_epi64( W[(i)-2], 6 ); \
  X1b = _mm512_srli_epi64( W[(i)-1], 6 ); \
  w0  = _mm512_xor_si512( X0a, X0b ); \
  w1  = _mm512_xor_si512( X1a, X1b ); \
 } while(0)
 // One SHA-512 round on 8 lanes (despite the SHA3 prefix in the name, this
 // uses the K512 table and the SHA-512 sigma macros above).
 //    T1 = H + BSG8W_5_1(E) + CH8W(E,F,G) + K512[i] + W[i]
 //    T2 = BSG8W_5_0(A) + MAJ8W(A,B,C)
 //    D += T1;  H = T1 + T2
 // Only D and H are written; the caller rotates the argument order between
 // rounds instead of moving values. Requires arrays K512 and W in scope.
 // mm512_add4_64 is defined elsewhere; presumably a 4 operand 64 bit add.
 #define SHA3_8WAY_STEP(A, B, C, D, E, F, G, H, i) \
 do { \
  __m512i T1, T2; \
  __m512i K = _mm512_set1_epi64( K512[ i ] ); \
  T1 = _mm512_add_epi64( H, mm512_add4_64( BSG8W_5_1(E), CH8W(E, F, G), \
                                           K, W[i] ) ); \
  T2 = _mm512_add_epi64( BSG8W_5_0(A), MAJ8W(A, B, C) ); \
  D  = _mm512_add_epi64( D, T1 ); \
  H  = _mm512_add_epi64( T1, T2 ); \
 } while (0)
 // Compress one 16 vector input block into the chaining value r[0..7].
 //   ctx : supplies and updates the `initialized` flag.
 //   in  : 16 vectors of message data, passed through mm512_block_bswap_64
 //         before use.
 //   r   : chaining value, updated in place. On the first call for a context
 //         (initialized == false) its previous content is ignored and the
 //         SHA-512 initial hash value is used instead.
 static void
 sha512_8way_round( sha512_8way_context *ctx,  __m512i *in, __m512i r[8] )
 {
    __m512i W[80];
    __m512i A, B, C, D, E, F, G, H;
    int t;
    // Message schedule: 16 words from the input, 64 expanded words.
    mm512_block_bswap_64( W  , in );
    mm512_block_bswap_64( W+8, in+8 );
    for ( t = 16; t < 80; t++ )
       W[t] = _mm512_add_epi64( ssg8w_512_add( W[t-15], W[t-2] ),
                                _mm512_add_epi64( W[ t- 7 ], W[ t-16 ] ) );
    // First block of a message: seed the chaining value with the initial
    // hash value so that a single code path serves every block.
    if ( !ctx->initialized )
    {
       r[0] = m512_const1_64( 0x6A09E667F3BCC908 );
       r[1] = m512_const1_64( 0xBB67AE8584CAA73B );
       r[2] = m512_const1_64( 0x3C6EF372FE94F82B );
       r[3] = m512_const1_64( 0xA54FF53A5F1D36F1 );
       r[4] = m512_const1_64( 0x510E527FADE682D1 );
       r[5] = m512_const1_64( 0x9B05688C2B3E6C1F );
       r[6] = m512_const1_64( 0x1F83D9ABFB41BD6B );
       r[7] = m512_const1_64( 0x5BE0CD19137E2179 );
       ctx->initialized = true;
    }
    A = r[0];   B = r[1];   C = r[2];   D = r[3];
    E = r[4];   F = r[5];   G = r[6];   H = r[7];
    // 80 rounds, unrolled by 8 so the working variables rotate by name.
    for ( t = 0; t < 80; t += 8 )
    {
       SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, t + 0 );
       SHA3_8WAY_STEP( H, A, B, C, D, E, F, G, t + 1 );
       SHA3_8WAY_STEP( G, H, A, B, C, D, E, F, t + 2 );
       SHA3_8WAY_STEP( F, G, H, A, B, C, D, E, t + 3 );
       SHA3_8WAY_STEP( E, F, G, H, A, B, C, D, t + 4 );
       SHA3_8WAY_STEP( D, E, F, G, H, A, B, C, t + 5 );
       SHA3_8WAY_STEP( C, D, E, F, G, H, A, B, t + 6 );
       SHA3_8WAY_STEP( B, C, D, E, F, G, H, A, t + 7 );
    }
    // Feed forward.
    r[0] = _mm512_add_epi64( r[0], A );
    r[1] = _mm512_add_epi64( r[1], B );
    r[2] = _mm512_add_epi64( r[2], C );
    r[3] = _mm512_add_epi64( r[3], D );
    r[4] = _mm512_add_epi64( r[4], E );
    r[5] = _mm512_add_epi64( r[5], F );
    r[6] = _mm512_add_epi64( r[6], G );
    r[7] = _mm512_add_epi64( r[7], H );
 }
 // Reset a context for a new message. The initial hash value is not stored
 // here; clearing `initialized` makes the first sha512_8way_round call
 // supply it.
 void sha512_8way_init( sha512_8way_context *sc )
 {
    sc->count       = 0;
    sc->initialized = false;
 }
 // Absorb message data into the context.
 //   data : source vectors; each __m512i supplies 8 bytes towards `len`.
 //   len  : byte count, measured against the 128 byte block size. Copying is
 //          done in whole vectors (len>>3), so len is assumed to be a
 //          multiple of 8 - TODO confirm with callers.
 // Every time the buffer reaches 128 bytes it is compressed into sc->val.
 void sha512_8way_update( sha512_8way_context *sc, const void *data, size_t len )
 {
    const int buf_size = 128;
    __m512i *src = (__m512i*)data;
    size_t fill = (unsigned)sc->count & (buf_size - 1U);
    while ( len > 0 )
    {
       size_t chunk = buf_size - fill;
       if ( chunk > len )
          chunk = len;
       memcpy_512( sc->buf + (fill>>3), src, chunk>>3 );
       src   = src + (chunk>>3);
       fill += chunk;
       len  -= chunk;
       if ( fill == buf_size )
       {
          // Buffer full: compress it and start over at offset 0.
          sha512_8way_round( sc, sc->buf, sc->val );
          fill = 0;
       }
       sc->count += chunk;
    }
 }
 // Finish the hash: append the 0x80 marker, zero padding and the 128 bit
 // message bit length, compress the final block(s) and write the result to
 // dst through mm512_block_bswap_64.
 void sha512_8way_close( sha512_8way_context *sc, void *dst )
 {
    const int buf_size = 128;
    const int len_pos = buf_size - 16;   // the length occupies the last 16 bytes
    // Byte shuffle control used to store the length words byte reversed.
    const __m512i bswap64_ctl = m512_const_64(
                                    0x38393a3b3c3d3e3f, 0x3031323334353637,
                                    0x28292a2b2c2d2e2f, 0x2021222324252627,
                                    0x18191a1b1c1d1e1f, 0x1011121314151617,
                                    0x08090a0b0c0d0e0f, 0x0001020304050607 );
    unsigned used = (unsigned)sc->count & (buf_size - 1U);
    sc->buf[ used>>3 ] = m512_const1_64( 0x80 );
    used += 8;
    if ( used <= len_pos )
       memset_zero_512( sc->buf + (used>>3), (len_pos - used) >> 3 );
    else
    {
       // No room left for the length: pad out this block, compress it and
       // continue with an all zero block.
       memset_zero_512( sc->buf + (used>>3), (buf_size - used) >> 3 );
       sha512_8way_round( sc, sc->buf, sc->val );
       memset_zero_512( sc->buf, len_pos >> 3 );
    }
    // sc->count is in bytes; count>>61 and count<<3 are the high and low
    // 64 bits of the bit length.
    sc->buf[ len_pos >> 3 ] = _mm512_shuffle_epi8(
                       _mm512_set1_epi64( sc->count >> 61 ), bswap64_ctl );
    sc->buf[ ( len_pos+8 ) >> 3 ] = _mm512_shuffle_epi8(
                       _mm512_set1_epi64( sc->count <<  3 ), bswap64_ctl );
    sha512_8way_round( sc, sc->buf, sc->val );
    mm512_block_bswap_64( dst, sc->val );
 }
 #endif   // AVX512
 // SHA-512 4 way 64 bit
 #define CH(X, Y, Z) \
   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) 
@@ -254,7 +482,7 @@ void sha512_4way_init( sha512_4way_context *sc )
   sc->count = 0;
 }
-void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
+void sha512_4way_update( sha512_4way_context *sc, const void *data, size_t len )
 {
   __m256i *vdata = (__m256i*)data;
   size_t ptr;
--- a/algo/shabal/shabal-hash-4way.c
+++ b/algo/shabal/shabal-hash-4way.c
@@ -33,7 +33,7 @@
 #include <stddef.h>
 #include <string.h>
-#ifdef __AVX2__
+#ifdef __SSE4_1__
 #include "shabal-hash-4way.h"
 #ifdef __cplusplus
@@ -58,6 +58,599 @@ extern "C"{
 #define O2    9
 #define O3    6
 #if defined(__AVX2__)
 // Declares the local working copy of the 8 way Shabal state: 12 A words,
 // 16 B words, 16 C words, 16 message words M and the two halves of the
 // block counter W. The other *8 macros refer to these exact names.
 #define DECL_STATE8   \
   __m256i A00, A01, A02, A03, A04, A05, \
           A06, A07, A08, A09, A0A, A0B; \
   __m256i B0, B1, B2, B3, B4, B5, B6, B7; \
   __m256i B8, B9, BA, BB, BC, BD, BE, BF; \
   __m256i C0, C1, C2, C3, C4, C5, C6, C7; \
   __m256i C8, C9, CA, CB, CC, CD, CE, CF; \
   __m256i M0, M1, M2, M3, M4, M5, M6, M7; \
   __m256i M8, M9, MA, MB, MC, MD, ME, MF; \
   sph_u32 Wlow, Whigh;
 // Load the Shabal state into the locals declared by DECL_STATE8.
 // If (state)->state_loaded is set, A/B/C are copied from the context.
 // Otherwise the flag is set and A/B/C are filled with broadcast constants
 // without touching the context arrays; shabal_8way_init leaves the flag
 // clear only for size 512, so these are the 512 bit initial values.
 // Wlow and Whigh are always read from the context.
 #define READ_STATE8(state) do \
 { \
   if ( (state)->state_loaded ) \
   { \
      A00 = (state)->A[0]; \
      A01 = (state)->A[1]; \
      A02 = (state)->A[2]; \
      A03 = (state)->A[3]; \
      A04 = (state)->A[4]; \
      A05 = (state)->A[5]; \
      A06 = (state)->A[6]; \
      A07 = (state)->A[7]; \
      A08 = (state)->A[8]; \
      A09 = (state)->A[9]; \
      A0A = (state)->A[10]; \
      A0B = (state)->A[11]; \
      B0 = (state)->B[0]; \
      B1 = (state)->B[1]; \
      B2 = (state)->B[2]; \
      B3 = (state)->B[3]; \
      B4 = (state)->B[4]; \
      B5 = (state)->B[5]; \
      B6 = (state)->B[6]; \
      B7 = (state)->B[7]; \
      B8 = (state)->B[8]; \
      B9 = (state)->B[9]; \
      BA = (state)->B[10]; \
      BB = (state)->B[11]; \
      BC = (state)->B[12]; \
      BD = (state)->B[13]; \
      BE = (state)->B[14]; \
      BF = (state)->B[15]; \
      C0 = (state)->C[0]; \
      C1 = (state)->C[1]; \
      C2 = (state)->C[2]; \
      C3 = (state)->C[3]; \
      C4 = (state)->C[4]; \
      C5 = (state)->C[5]; \
      C6 = (state)->C[6]; \
      C7 = (state)->C[7]; \
      C8 = (state)->C[8]; \
      C9 = (state)->C[9]; \
      CA = (state)->C[10]; \
      CB = (state)->C[11]; \
      CC = (state)->C[12]; \
      CD = (state)->C[13]; \
      CE = (state)->C[14]; \
      CF = (state)->C[15]; \
   } \
   else \
   { \
       (state)->state_loaded = true; \
       A00 = m256_const1_64( 0x20728DFD20728DFD ); \
       A01 = m256_const1_64( 0x46C0BD5346C0BD53 ); \
       A02 = m256_const1_64( 0xE782B699E782B699 ); \
       A03 = m256_const1_64( 0x5530463255304632 ); \
       A04 = m256_const1_64( 0x71B4EF9071B4EF90 ); \
       A05 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \
       A06 = m256_const1_64( 0xDBB930F1DBB930F1 ); \
       A07 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \
       A08 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \
       A09 = m256_const1_64( 0x8BD144108BD14410 ); \
       A0A = m256_const1_64( 0x76D2ADAC76D2ADAC ); \
       A0B = m256_const1_64( 0x28ACAB7F28ACAB7F ); \
       B0 = m256_const1_64( 0xC1099CB7C1099CB7 ); \
       B1 = m256_const1_64( 0x07B385F307B385F3 ); \
       B2 = m256_const1_64( 0xE7442C26E7442C26 ); \
       B3 = m256_const1_64( 0xCC8AD640CC8AD640 ); \
       B4 = m256_const1_64( 0xEB6F56C7EB6F56C7 ); \
       B5 = m256_const1_64( 0x1EA81AA91EA81AA9 ); \
       B6 = m256_const1_64( 0x73B9D31473B9D314 ); \
       B7 = m256_const1_64( 0x1DE85D081DE85D08 ); \
       B8 = m256_const1_64( 0x48910A5A48910A5A ); \
       B9 = m256_const1_64( 0x893B22DB893B22DB ); \
       BA = m256_const1_64( 0xC5A0DF44C5A0DF44 ); \
       BB = m256_const1_64( 0xBBC4324EBBC4324E ); \
       BC = m256_const1_64( 0x72D2F24072D2F240 ); \
       BD = m256_const1_64( 0x75941D9975941D99 ); \
       BE = m256_const1_64( 0x6D8BDE826D8BDE82 ); \
       BF = m256_const1_64( 0xA1A7502BA1A7502B ); \
       C0 = m256_const1_64( 0xD9BF68D1D9BF68D1 ); \
       C1 = m256_const1_64( 0x58BAD75058BAD750 ); \
       C2 = m256_const1_64( 0x56028CB256028CB2 ); \
       C3 = m256_const1_64( 0x8134F3598134F359 ); \
       C4 = m256_const1_64( 0xB5D469D8B5D469D8 ); \
       C5 = m256_const1_64( 0x941A8CC2941A8CC2 ); \
       C6 = m256_const1_64( 0x418B2A6E418B2A6E ); \
       C7 = m256_const1_64( 0x0405278004052780 ); \
       C8 = m256_const1_64( 0x7F07D7877F07D787 ); \
       C9 = m256_const1_64( 0x5194358F5194358F ); \
       CA = m256_const1_64( 0x3C60D6653C60D665 ); \
       CB = m256_const1_64( 0xBE97D79ABE97D79A ); \
       CC = m256_const1_64( 0x950C3434950C3434 ); \
       CD = m256_const1_64( 0xAED9A06DAED9A06D ); \
       CE = m256_const1_64( 0x2537DC8D2537DC8D ); \
       CF = m256_const1_64( 0x7CDB59697CDB5969 ); \
   } \
   Wlow = (state)->Wlow; \
   Whigh = (state)->Whigh; \
 } while (0)
 // Store the working locals (A00..A0B, B0..BF, C0..CF, Wlow, Whigh) back
 // into the context. The M words and the buffer are not written.
 #define WRITE_STATE8(state)   do { \
      (state)->A[0] = A00; \
      (state)->A[1] = A01; \
      (state)->A[2] = A02; \
      (state)->A[3] = A03; \
      (state)->A[4] = A04; \
      (state)->A[5] = A05; \
      (state)->A[6] = A06; \
      (state)->A[7] = A07; \
      (state)->A[8] = A08; \
      (state)->A[9] = A09; \
      (state)->A[10] = A0A; \
      (state)->A[11] = A0B; \
      (state)->B[0] = B0; \
      (state)->B[1] = B1; \
      (state)->B[2] = B2; \
      (state)->B[3] = B3; \
      (state)->B[4] = B4; \
      (state)->B[5] = B5; \
      (state)->B[6] = B6; \
      (state)->B[7] = B7; \
      (state)->B[8] = B8; \
      (state)->B[9] = B9; \
      (state)->B[10] = BA; \
      (state)->B[11] = BB; \
      (state)->B[12] = BC; \
      (state)->B[13] = BD; \
      (state)->B[14] = BE; \
      (state)->B[15] = BF; \
      (state)->C[0] = C0; \
      (state)->C[1] = C1; \
      (state)->C[2] = C2; \
      (state)->C[3] = C3; \
      (state)->C[4] = C4; \
      (state)->C[5] = C5; \
      (state)->C[6] = C6; \
      (state)->C[7] = C7; \
      (state)->C[8] = C8; \
      (state)->C[9] = C9; \
      (state)->C[10] = CA; \
      (state)->C[11] = CB; \
      (state)->C[12] = CC; \
      (state)->C[13] = CD; \
      (state)->C[14] = CE; \
      (state)->C[15] = CF; \
      (state)->Wlow = Wlow; \
      (state)->Whigh = Whigh; \
   } while (0)
 // Copy the 16 buffered message vectors into the locals M0..MF.
 // Expects a pointer or array named buf in the expanding scope. The words
 // are used as stored; no byte swapping is done here.
 #define DECODE_BLOCK8 \
 do { \
   M0 = buf[ 0]; \
   M1 = buf[ 1]; \
   M2 = buf[ 2]; \
   M3 = buf[ 3]; \
   M4 = buf[ 4]; \
   M5 = buf[ 5]; \
   M6 = buf[ 6]; \
   M7 = buf[ 7]; \
   M8 = buf[ 8]; \
   M9 = buf[ 9]; \
   MA = buf[10]; \
   MB = buf[11]; \
   MC = buf[12]; \
   MD = buf[13]; \
   ME = buf[14]; \
   MF = buf[15]; \
 } while (0)
 // B[k] += M[k] for k = 0..15, as 32 bit lane additions.
 #define INPUT_BLOCK_ADD8 \
 do { \
    B0 = _mm256_add_epi32( B0, M0 );\
    B1 = _mm256_add_epi32( B1, M1 );\
    B2 = _mm256_add_epi32( B2, M2 );\
    B3 = _mm256_add_epi32( B3, M3 );\
    B4 = _mm256_add_epi32( B4, M4 );\
    B5 = _mm256_add_epi32( B5, M5 );\
    B6 = _mm256_add_epi32( B6, M6 );\
    B7 = _mm256_add_epi32( B7, M7 );\
    B8 = _mm256_add_epi32( B8, M8 );\
    B9 = _mm256_add_epi32( B9, M9 );\
    BA = _mm256_add_epi32( BA, MA );\
    BB = _mm256_add_epi32( BB, MB );\
    BC = _mm256_add_epi32( BC, MC );\
    BD = _mm256_add_epi32( BD, MD );\
    BE = _mm256_add_epi32( BE, ME );\
    BF = _mm256_add_epi32( BF, MF );\
 } while (0)
 // C[k] -= M[k] for k = 0..15, as 32 bit lane subtractions.
 #define INPUT_BLOCK_SUB8 \
 do { \
    C0 = _mm256_sub_epi32( C0, M0 ); \
    C1 = _mm256_sub_epi32( C1, M1 ); \
    C2 = _mm256_sub_epi32( C2, M2 ); \
    C3 = _mm256_sub_epi32( C3, M3 ); \
    C4 = _mm256_sub_epi32( C4, M4 ); \
    C5 = _mm256_sub_epi32( C5, M5 ); \
    C6 = _mm256_sub_epi32( C6, M6 ); \
    C7 = _mm256_sub_epi32( C7, M7 ); \
    C8 = _mm256_sub_epi32( C8, M8 ); \
    C9 = _mm256_sub_epi32( C9, M9 ); \
    CA = _mm256_sub_epi32( CA, MA ); \
    CB = _mm256_sub_epi32( CB, MB ); \
    CC = _mm256_sub_epi32( CC, MC ); \
    CD = _mm256_sub_epi32( CD, MD ); \
    CE = _mm256_sub_epi32( CE, ME ); \
    CF = _mm256_sub_epi32( CF, MF ); \
 } while (0)
 // Mix the block counter into the state: A00 ^= Wlow and A01 ^= Whigh, with
 // each counter half broadcast to every 32 bit lane.
 #define XOR_W8 \
 do { \
   A00 = _mm256_xor_si256( A00, \
                           _mm256_set1_epi32( Wlow ) ); \
   A01 = _mm256_xor_si256( A01, \
                           _mm256_set1_epi32( Whigh ) ); \
 } while (0)
 // Applies mm256_swap512_256 to each (B[k], C[k]) pair, k = 0..15.
 // The helper is defined elsewhere; presumably it exchanges its two
 // arguments - confirm against its definition.
 #define SWAP_BC8 \
 do { \
    mm256_swap512_256( B0, C0 ); \
    mm256_swap512_256( B1, C1 ); \
    mm256_swap512_256( B2, C2 ); \
    mm256_swap512_256( B3, C3 ); \
    mm256_swap512_256( B4, C4 ); \
    mm256_swap512_256( B5, C5 ); \
    mm256_swap512_256( B6, C6 ); \
    mm256_swap512_256( B7, C7 ); \
    mm256_swap512_256( B8, C8 ); \
    mm256_swap512_256( B9, C9 ); \
    mm256_swap512_256( BA, CA ); \
    mm256_swap512_256( BB, CB ); \
    mm256_swap512_256( BC, CC ); \
    mm256_swap512_256( BD, CD ); \
    mm256_swap512_256( BE, CE ); \
    mm256_swap512_256( BF, CF ); \
 } while (0)
 // One element of the Shabal permutation, on 32 bit lanes:
 //    xa0 = xm ^ xb1 ^ (xb2 & ~xb3) ^ ( ( xa0 ^ xc ^ (rol(xa1,15) * 5) ) * 3 )
 //    xb0 = ~( xa0 ^ rol(xb0,1) )        (uses the new xa0)
 // xa0 and xb0 are both read and written; the other arguments are read only.
 // mm256_rol_32 and mm256_not are defined elsewhere; presumably a 32 bit
 // lane rotate left and a bitwise complement.
 #define PERM_ELT8(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
 do { \
   xa0 = _mm256_xor_si256( xm, _mm256_xor_si256( xb1, _mm256_xor_si256(  \
            _mm256_andnot_si256( xb3, xb2 ), \
            _mm256_mullo_epi32( _mm256_xor_si256( xa0, _mm256_xor_si256( xc, \
               _mm256_mullo_epi32(  mm256_rol_32( xa1, 15 ), _mm256_set1_epi32(5UL) ) \
                   ) ), _mm256_set1_epi32(3UL) ) ) ) ); \
   xb0 = mm256_not( _mm256_xor_si256( xa0, mm256_rol_32( xb0, 1 ) ) ); \
 } while (0)
 // First of three permutation passes over the 16 B words. The A word being
 // updated starts at A00 and cycles through the 12 A words; each element
 // takes the previously updated A word as its xa1 input.
 #define PERM_STEP_0_8   do { \
      PERM_ELT8(A00, A0B, B0, BD, B9, B6, C8, M0); \
      PERM_ELT8(A01, A00, B1, BE, BA, B7, C7, M1); \
      PERM_ELT8(A02, A01, B2, BF, BB, B8, C6, M2); \
      PERM_ELT8(A03, A02, B3, B0, BC, B9, C5, M3); \
      PERM_ELT8(A04, A03, B4, B1, BD, BA, C4, M4); \
      PERM_ELT8(A05, A04, B5, B2, BE, BB, C3, M5); \
      PERM_ELT8(A06, A05, B6, B3, BF, BC, C2, M6); \
      PERM_ELT8(A07, A06, B7, B4, B0, BD, C1, M7); \
      PERM_ELT8(A08, A07, B8, B5, B1, BE, C0, M8); \
      PERM_ELT8(A09, A08, B9, B6, B2, BF, CF, M9); \
      PERM_ELT8(A0A, A09, BA, B7, B3, B0, CE, MA); \
      PERM_ELT8(A0B, A0A, BB, B8, B4, B1, CD, MB); \
      PERM_ELT8(A00, A0B, BC, B9, B5, B2, CC, MC); \
      PERM_ELT8(A01, A00, BD, BA, B6, B3, CB, MD); \
      PERM_ELT8(A02, A01, BE, BB, B7, B4, CA, ME); \
      PERM_ELT8(A03, A02, BF, BC, B8, B5, C9, MF); \
   } while (0)
 // Second permutation pass: same B/C/M pattern as PERM_STEP_0_8, with the
 // A word sequence continuing from A04.
 #define PERM_STEP_1_8   do { \
      PERM_ELT8(A04, A03, B0, BD, B9, B6, C8, M0); \
      PERM_ELT8(A05, A04, B1, BE, BA, B7, C7, M1); \
      PERM_ELT8(A06, A05, B2, BF, BB, B8, C6, M2); \
      PERM_ELT8(A07, A06, B3, B0, BC, B9, C5, M3); \
      PERM_ELT8(A08, A07, B4, B1, BD, BA, C4, M4); \
      PERM_ELT8(A09, A08, B5, B2, BE, BB, C3, M5); \
      PERM_ELT8(A0A, A09, B6, B3, BF, BC, C2, M6); \
      PERM_ELT8(A0B, A0A, B7, B4, B0, BD, C1, M7); \
      PERM_ELT8(A00, A0B, B8, B5, B1, BE, C0, M8); \
      PERM_ELT8(A01, A00, B9, B6, B2, BF, CF, M9); \
      PERM_ELT8(A02, A01, BA, B7, B3, B0, CE, MA); \
      PERM_ELT8(A03, A02, BB, B8, B4, B1, CD, MB); \
      PERM_ELT8(A04, A03, BC, B9, B5, B2, CC, MC); \
      PERM_ELT8(A05, A04, BD, BA, B6, B3, CB, MD); \
      PERM_ELT8(A06, A05, BE, BB, B7, B4, CA, ME); \
      PERM_ELT8(A07, A06, BF, BC, B8, B5, C9, MF); \
   } while (0)
 // Third permutation pass: same B/C/M pattern as PERM_STEP_0_8, with the
 // A word sequence continuing from A08 and ending on A0B.
 #define PERM_STEP_2_8   do { \
      PERM_ELT8(A08, A07, B0, BD, B9, B6, C8, M0); \
      PERM_ELT8(A09, A08, B1, BE, BA, B7, C7, M1); \
      PERM_ELT8(A0A, A09, B2, BF, BB, B8, C6, M2); \
      PERM_ELT8(A0B, A0A, B3, B0, BC, B9, C5, M3); \
      PERM_ELT8(A00, A0B, B4, B1, BD, BA, C4, M4); \
      PERM_ELT8(A01, A00, B5, B2, BE, BB, C3, M5); \
      PERM_ELT8(A02, A01, B6, B3, BF, BC, C2, M6); \
      PERM_ELT8(A03, A02, B7, B4, B0, BD, C1, M7); \
      PERM_ELT8(A04, A03, B8, B5, B1, BE, C0, M8); \
      PERM_ELT8(A05, A04, B9, B6, B2, BF, CF, M9); \
      PERM_ELT8(A06, A05, BA, B7, B3, B0, CE, MA); \
      PERM_ELT8(A07, A06, BB, B8, B4, B1, CD, MB); \
      PERM_ELT8(A08, A07, BC, B9, B5, B2, CC, MC); \
      PERM_ELT8(A09, A08, BD, BA, B6, B3, CB, MD); \
      PERM_ELT8(A0A, A09, BE, BB, B7, B4, CA, ME); \
      PERM_ELT8(A0B, A0A, BF, BC, B8, B5, C9, MF); \
   } while (0)
 // The full Shabal permutation on the working locals:
 //   1. every B word goes through mm256_ror_32( ., 15 ) (helper defined
 //      elsewhere; presumably a 32 bit lane rotate right),
 //   2. the three PERM_STEP_n_8 passes run in order,
 //   3. 36 additions fold C words into the A words, three per A word.
 // The order of the steps matters; do not reorder.
 #define APPLY_P8 \
 do { \
    B0 = mm256_ror_32( B0, 15 ); \
    B1 = mm256_ror_32( B1, 15 ); \
    B2 = mm256_ror_32( B2, 15 ); \
    B3 = mm256_ror_32( B3, 15 ); \
    B4 = mm256_ror_32( B4, 15 ); \
    B5 = mm256_ror_32( B5, 15 ); \
    B6 = mm256_ror_32( B6, 15 ); \
    B7 = mm256_ror_32( B7, 15 ); \
    B8 = mm256_ror_32( B8, 15 ); \
    B9 = mm256_ror_32( B9, 15 ); \
    BA = mm256_ror_32( BA, 15 ); \
    BB = mm256_ror_32( BB, 15 ); \
    BC = mm256_ror_32( BC, 15 ); \
    BD = mm256_ror_32( BD, 15 ); \
    BE = mm256_ror_32( BE, 15 ); \
    BF = mm256_ror_32( BF, 15 ); \
    PERM_STEP_0_8; \
    PERM_STEP_1_8; \
    PERM_STEP_2_8; \
    A0B = _mm256_add_epi32( A0B, C6 ); \
    A0A = _mm256_add_epi32( A0A, C5 ); \
    A09 = _mm256_add_epi32( A09, C4 ); \
    A08 = _mm256_add_epi32( A08, C3 ); \
    A07 = _mm256_add_epi32( A07, C2 ); \
    A06 = _mm256_add_epi32( A06, C1 ); \
    A05 = _mm256_add_epi32( A05, C0 ); \
    A04 = _mm256_add_epi32( A04, CF ); \
    A03 = _mm256_add_epi32( A03, CE ); \
    A02 = _mm256_add_epi32( A02, CD ); \
    A01 = _mm256_add_epi32( A01, CC ); \
    A00 = _mm256_add_epi32( A00, CB ); \
    A0B = _mm256_add_epi32( A0B, CA ); \
    A0A = _mm256_add_epi32( A0A, C9 ); \
    A09 = _mm256_add_epi32( A09, C8 ); \
    A08 = _mm256_add_epi32( A08, C7 ); \
    A07 = _mm256_add_epi32( A07, C6 ); \
    A06 = _mm256_add_epi32( A06, C5 ); \
    A05 = _mm256_add_epi32( A05, C4 ); \
    A04 = _mm256_add_epi32( A04, C3 ); \
    A03 = _mm256_add_epi32( A03, C2 ); \
    A02 = _mm256_add_epi32( A02, C1 ); \
    A01 = _mm256_add_epi32( A01, C0 ); \
    A00 = _mm256_add_epi32( A00, CF ); \
    A0B = _mm256_add_epi32( A0B, CE ); \
    A0A = _mm256_add_epi32( A0A, CD ); \
    A09 = _mm256_add_epi32( A09, CC ); \
    A08 = _mm256_add_epi32( A08, CB ); \
    A07 = _mm256_add_epi32( A07, CA ); \
    A06 = _mm256_add_epi32( A06, C9 ); \
    A05 = _mm256_add_epi32( A05, C8 ); \
    A04 = _mm256_add_epi32( A04, C7 ); \
    A03 = _mm256_add_epi32( A03, C6 ); \
    A02 = _mm256_add_epi32( A02, C5 ); \
    A01 = _mm256_add_epi32( A01, C4 ); \
    A00 = _mm256_add_epi32( A00, C3 ); \
 } while (0)
 // Advance the block counter held in Wlow/Whigh by one; Whigh is bumped
 // when Wlow wraps to zero. T32 is defined elsewhere in this file.
 #define INCR_W8 \
 do { \
    Wlow = T32(Wlow + 1); \
    if ( Wlow == 0 ) \
       Whigh = T32(Whigh + 1); \
 } while (0)
 static void
 shabal_8way_init( void *cc, unsigned size )
 {
   shabal_8way_context *sc = (shabal_8way_context*)cc;
   if ( size == 512 )
   { // copy immediate constants directly to working registers later.
       sc->state_loaded = false;
   }
   else
   {  // No users
       sc->state_loaded = true;
       sc->A[ 0] = m256_const1_64( 0x52F8455252F84552 );
       sc->A[ 1] = m256_const1_64( 0xE54B7999E54B7999 );
       sc->A[ 2] = m256_const1_64( 0x2D8EE3EC2D8EE3EC );
       sc->A[ 3] = m256_const1_64( 0xB9645191B9645191 );
       sc->A[ 4] = m256_const1_64( 0xE0078B86E0078B86 );
       sc->A[ 5] = m256_const1_64( 0xBB7C44C9BB7C44C9 );
       sc->A[ 6] = m256_const1_64( 0xD2B5C1CAD2B5C1CA );
       sc->A[ 7] = m256_const1_64( 0xB0D2EB8CB0D2EB8C );
       sc->A[ 8] = m256_const1_64( 0x14CE5A4514CE5A45 );
       sc->A[ 9] = m256_const1_64( 0x22AF50DC22AF50DC );
       sc->A[10] = m256_const1_64( 0xEFFDBC6BEFFDBC6B );
       sc->A[11] = m256_const1_64( 0xEB21B74AEB21B74A );
       sc->B[ 0] = m256_const1_64( 0xB555C6EEB555C6EE );
       sc->B[ 1] = m256_const1_64( 0x3E7105963E710596 );
       sc->B[ 2] = m256_const1_64( 0xA72A652FA72A652F );
       sc->B[ 3] = m256_const1_64( 0x9301515F9301515F );
       sc->B[ 4] = m256_const1_64( 0xDA28C1FADA28C1FA );
       sc->B[ 5] = m256_const1_64( 0x696FD868696FD868 );
       sc->B[ 6] = m256_const1_64( 0x9CB6BF729CB6BF72 );
       sc->B[ 7] = m256_const1_64( 0x0AFE40020AFE4002 );
       sc->B[ 8] = m256_const1_64( 0xA6E03615A6E03615 );
       sc->B[ 9] = m256_const1_64( 0x5138C1D45138C1D4 );
       sc->B[10] = m256_const1_64( 0xBE216306BE216306 );
       sc->B[11] = m256_const1_64( 0xB38B8890B38B8890 );
       sc->B[12] = m256_const1_64( 0x3EA8B96B3EA8B96B );
       sc->B[13] = m256_const1_64( 0x3299ACE43299ACE4 );
       sc->B[14] = m256_const1_64( 0x30924DD430924DD4 );
       sc->B[15] = m256_const1_64( 0x55CB34A555CB34A5 );
       sc->C[ 0] = m256_const1_64( 0xB405F031B405F031 );
       sc->C[ 1] = m256_const1_64( 0xC4233EBAC4233EBA );
       sc->C[ 2] = m256_const1_64( 0xB3733979B3733979 );
       sc->C[ 3] = m256_const1_64( 0xC0DD9D55C0DD9D55 );
       sc->C[ 4] = m256_const1_64( 0xC51C28AEC51C28AE );
       sc->C[ 5] = m256_const1_64( 0xA327B8E1A327B8E1 );
       sc->C[ 6] = m256_const1_64( 0x56C5616756C56167 );
       sc->C[ 7] = m256_const1_64( 0xED614433ED614433 );
       sc->C[ 8] = m256_const1_64( 0x88B59D6088B59D60 );
       sc->C[ 9] = m256_const1_64( 0x60E2CEBA60E2CEBA );
       sc->C[10] = m256_const1_64( 0x758B4B8B758B4B8B );
       sc->C[11] = m256_const1_64( 0x83E82A7F83E82A7F );
       sc->C[12] = m256_const1_64( 0xBC968828BC968828 );
       sc->C[13] = m256_const1_64( 0xE6E00BF7E6E00BF7 );
       sc->C[14] = m256_const1_64( 0xBA839E55BA839E55 );
       sc->C[15] = m256_const1_64( 0x9B491C609B491C60 );
   }
    sc->Wlow = 1;
    sc->Whigh = 0;
    sc->ptr = 0;
 }
 static void
 shabal_8way_core( void *cc, const unsigned char *data, size_t len )
 {
   shabal_8way_context *sc = (shabal_8way_context*)cc;
    __m256i *buf;
    __m256i *vdata = (__m256i*)data;
   const int buf_size = 64;
   size_t ptr;
   DECL_STATE8
   buf = sc->buf;
   ptr = sc->ptr;
   if ( len < (buf_size - ptr ) )
   {
      memcpy_256( buf + (ptr>>2), vdata, len>>2 );
      ptr += len;
      sc->ptr = ptr;
      return;
   }
   READ_STATE8( sc );
   while ( len > 0 )
   {
      size_t clen;
      clen = buf_size - ptr;
      if ( clen > len )
         clen = len;
      memcpy_256( buf + (ptr>>2), vdata, clen>>2 );
      ptr += clen;
      vdata += clen>>2;
      len -= clen;
      if ( ptr == buf_size )
      {
         DECODE_BLOCK8;
         INPUT_BLOCK_ADD8;
         XOR_W8;
         APPLY_P8;
         INPUT_BLOCK_SUB8;
         SWAP_BC8;
         INCR_W8;
         ptr = 0;
      }
   }
   WRITE_STATE8(sc);
   sc->ptr = ptr;
 }
 static void
 shabal_8way_close( void *cc, unsigned ub, unsigned n, void *dst,
                   unsigned size_words )
 {
   shabal_8way_context *sc = (shabal_8way_context*)cc;
    __m256i *buf;
   const int buf_size = 64;
   size_t ptr;
   int i;
   unsigned z, zz;
   DECL_STATE8
   buf = sc->buf;
   ptr = sc->ptr;
   z = 0x80 >> n;
   zz = ((ub & -z) | z) & 0xFF;
   buf[ptr>>2] = _mm256_set1_epi32( zz );
   memset_zero_256( buf + (ptr>>2) + 1, ( (buf_size - ptr) >> 2 ) - 1 );
   READ_STATE8(sc);
   DECODE_BLOCK8;
   INPUT_BLOCK_ADD8;
   XOR_W8;
   APPLY_P8;
   for ( i = 0; i < 3; i ++ )
   {
      SWAP_BC8;
      XOR_W8;
      APPLY_P8;
   }
   __m256i *d = (__m256i*)dst;
   if ( size_words == 16 )   // 512
   {
      d[ 0] = B0; d[ 1] = B1; d[ 2] = B2; d[ 3] = B3;
      d[ 4] = B4; d[ 5] = B5; d[ 6] = B6; d[ 7] = B7;
      d[ 8] = B8; d[ 9] = B9; d[10] = BA; d[11] = BB;
      d[12] = BC; d[13] = BD; d[14] = BE; d[15] = BF;
   }
   else    // 256
   {
      d[ 0] = B8; d[ 1] = B9; d[ 2] = BA; d[ 3] = BB;
      d[ 4] = BC; d[ 5] = BD; d[ 6] = BE; d[ 7] = BF;
   }
 }
 // Set up an 8 way Shabal context for 256 bit output.
 void shabal256_8way_init( void *cc )
 {
    shabal_8way_init( cc, 256 );
 }
 // Feed `len` bytes per lane of message data into an 8 way Shabal-256
 // context.
 void shabal256_8way_update( void *cc, const void *data, size_t len )
 {
    shabal_8way_core( cc, data, len );
 }
 // Finish an 8 way Shabal-256 hash with no extra bits; dst receives
 // 8 vectors.
 void shabal256_8way_close( void *cc, void *dst )
 {
    shabal_8way_close( cc, 0, 0, dst, 8 );
 }
 // Finish an 8 way Shabal-256 hash, passing the extra-bit arguments ub and n
 // through to shabal_8way_close; dst receives 8 vectors.
 void shabal256_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
                                        void *dst )
 {
    shabal_8way_close( cc, ub, n, dst, 8 );
 }
 // Set up an 8 way Shabal context for 512 bit output.
 void shabal512_8way_init( void *cc )
 {
    shabal_8way_init( cc, 512 );
 }
 // Feed `len` bytes per lane of message data into an 8 way Shabal-512
 // context.
 void shabal512_8way_update( void *cc, const void *data, size_t len )
 {
    shabal_8way_core( cc, data, len );
 }
 // Finish an 8 way Shabal-512 hash with no extra bits; dst receives
 // 16 vectors.
 void shabal512_8way_close( void *cc, void *dst )
 {
    shabal_8way_close( cc, 0, 0, dst, 16 );
 }
 // Finish an 8 way Shabal-512 hash, passing the extra-bit arguments ub and n
 // through to shabal_8way_close; dst receives 16 vectors.
 void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
                                        void *dst )
 {
    shabal_8way_close( cc, ub, n, dst, 16 );
 }
 #endif  // AVX2
 /*
 * We copy the state into local variables, so that the compiler knows
 * that it can optimize them at will.
@@ -290,6 +883,8 @@ do { \
   A00 = _mm_xor_si128( A00, _mm_set1_epi32( Wlow ) ); \
   A01 = _mm_xor_si128( A01, _mm_set1_epi32( Whigh ) ); \
 } while (0)
 /*
 #define SWAP(v1, v2)   do { \
 		sph_u32 tmp = (v1); \
@@ -297,26 +892,39 @@ do { \
 		(v2) = tmp; \
 	} while (0)
 */
 #define SWAP_BC \
 do { \
-    mm128_swap128_256( B0, C0 ); \
+    mm128_swap256_128( B0, C0 ); \
-    mm128_swap128_256( B1, C1 ); \
+    mm128_swap256_128( B1, C1 ); \
-    mm128_swap128_256( B2, C2 ); \
+    mm128_swap256_128( B2, C2 ); \
-    mm128_swap128_256( B3, C3 ); \
+    mm128_swap256_128( B3, C3 ); \
-    mm128_swap128_256( B4, C4 ); \
+    mm128_swap256_128( B4, C4 ); \
-    mm128_swap128_256( B5, C5 ); \
+    mm128_swap256_128( B5, C5 ); \
-    mm128_swap128_256( B6, C6 ); \
+    mm128_swap256_128( B6, C6 ); \
-    mm128_swap128_256( B7, C7 ); \
+    mm128_swap256_128( B7, C7 ); \
-    mm128_swap128_256( B8, C8 ); \
+    mm128_swap256_128( B8, C8 ); \
-    mm128_swap128_256( B9, C9 ); \
+    mm128_swap256_128( B9, C9 ); \
-    mm128_swap128_256( BA, CA ); \
+    mm128_swap256_128( BA, CA ); \
-    mm128_swap128_256( BB, CB ); \
+    mm128_swap256_128( BB, CB ); \
-    mm128_swap128_256( BC, CC ); \
+    mm128_swap256_128( BC, CC ); \
-    mm128_swap128_256( BD, CD ); \
+    mm128_swap256_128( BD, CD ); \
-    mm128_swap128_256( BE, CE ); \
+    mm128_swap256_128( BE, CE ); \
-    mm128_swap128_256( BF, CF ); \
+    mm128_swap256_128( BF, CF ); \
 } while (0)
 /*
 #define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
 do { \
  __m128i t1 = _mm_mullo_epi32(  mm_rol_32( xa1, 15 ),\
                                   _mm_set1_epi32(5UL) ) \
  __m128i t2 = _mm_xor_si128( xa0, xc ); \
  xb0 = mm_not( _mm_xor_si256( xa0, mm_rol_32( xb0, 1 ) ) ); \
  xa0 = mm_xor4( xm, xb1, _mm_andnot_si128( xb3, xb2 ), \
              _mm_xor_si128( t2, \
                      _mm_mullo_epi32( t1, _mm_set1_epi32(5UL) ) ) ) \
 */
 #define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
 do { \
   xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128(  \
@@ -706,7 +1314,7 @@ shabal256_4way_init( void *cc )
 }
 void
-shabal256_4way( void *cc, const void *data, size_t len )
+shabal256_4way_update( void *cc, const void *data, size_t len )
 {
 	shabal_4way_core( cc, data, len );
 }
@@ -731,7 +1339,7 @@ shabal512_4way_init(void *cc)
 }
 void
-shabal512_4way(void *cc, const void *data, size_t len)
+shabal512_4way_update(void *cc, const void *data, size_t len)
 {
 	shabal_4way_core(cc, data, len);
 }
--- a/algo/shabal/shabal-hash-4way.h
+++ b/algo/shabal/shabal-hash-4way.h
@@ -36,7 +36,7 @@
 #ifndef SHABAL_HASH_4WAY_H__
 #define SHABAL_HASH_4WAY_H__ 1
-#ifdef __AVX2__
+#ifdef __SSE4_1__
 #include <stddef.h>
 #include "algo/sha/sph_types.h"
@@ -50,6 +50,34 @@ extern "C"{
 #define SPH_SIZE_shabal512   512
 #if defined(__AVX2__)
 // Context for the 8 way (__m256i based) Shabal implementation.
 typedef struct {
    __m256i buf[16];       // message block being assembled
    __m256i A[12];         // state words A
    __m256i B[16];         // state words B
    __m256i C[16];         // state words C
    sph_u32 Whigh;         // block counter, upper half
    sph_u32 Wlow;          // block counter, lower half
    size_t ptr;            // fill level of buf
    bool state_loaded;     // false until A/B/C hold valid state
 } shabal_8way_context __attribute__ ((aligned (64)));
 // Both output sizes share one context layout.
 typedef shabal_8way_context shabal256_8way_context;
 typedef shabal_8way_context shabal512_8way_context;
 // 8 way Shabal-256. cc points to a shabal256_8way_context.
 // Call order: init, update (any number of times), then one of the close
 // functions, which writes the digest vectors to dst.
 void shabal256_8way_init( void *cc );
 void shabal256_8way_update( void *cc, const void *data, size_t len );
 void shabal256_8way_close( void *cc, void *dst );
 // As close, with ub and n describing extra bits to append before padding.
 void shabal256_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
                                       void *dst );
 // 8 way Shabal-512. cc points to a shabal512_8way_context; same call order.
 void shabal512_8way_init( void *cc );
 void shabal512_8way_update( void *cc, const void *data, size_t len );
 void shabal512_8way_close( void *cc, void *dst );
 // As close, with ub and n describing extra bits to append before padding.
 void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
                                       void *dst );
 #endif
 typedef struct {
 	__m128i buf[16] __attribute__ ((aligned (64)));
 	__m128i A[12], B[16], C[16];
@@ -62,13 +90,14 @@ typedef shabal_4way_context shabal256_4way_context;
 typedef shabal_4way_context shabal512_4way_context;
 void shabal256_4way_init( void *cc );
-void shabal256_4way( void *cc, const void *data, size_t len );
+void shabal256_4way_update( void *cc, const void *data, size_t len );
 void shabal256_4way_close( void *cc, void *dst );
 void shabal256_4way_addbits_and_close(	void *cc, unsigned ub, unsigned n,
                                       void *dst );
 void shabal512_4way_init( void *cc );
-void shabal512_4way( void *cc, const void *data, size_t len );
+void shabal512_4way_update( void *cc, const void *data, size_t len );
 #define shabal512_4way shabal512_4way_update
 void shabal512_4way_close( void *cc, void *dst );
 void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
                                       void *dst );
--- a/algo/shavite/shavite-hash-2way.c
+++ b/algo/shavite/shavite-hash-2way.c
@@ -3,6 +3,12 @@
 #include <stdio.h>
 // This implementation is deprecated, superseded by VAES in Icelake
 // which provides HW based 4 way aes.
 // It was created for AVX2 to eliminate interleaving between the 
 // preceding and following function.
 // This code can be removed when current users have reverted to one way.
 #if defined(__AVX2__)
@@ -16,8 +22,8 @@ static const uint32_t IV512[] =
 #define mm256_ror2x256hi_1x32( a, b ) \
-   _mm256_blend_epi32( mm256_ror1x32_128( a ), \
+   _mm256_blend_epi32( mm256_ror128_32( a ), \
-                       mm256_ror1x32_128( b ), 0x88 )
+                       mm256_ror128_32( b ), 0x88 )
 static void
 c512_2way( shavite512_2way_context *ctx, const void *msg )
@@ -61,7 +67,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
   {
      // round 1, 5, 9
-     k00 = _mm256_xor_si256( k13, mm256_ror1x32_128(
+     k00 = _mm256_xor_si256( k13, mm256_ror128_32(
                                  mm256_aesenc_2x128( k00, zero ) ) );
     if ( r == 0 )
@@ -71,7 +77,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
     x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
     k01 = _mm256_xor_si256( k00,
-		     mm256_ror1x32_128( mm256_aesenc_2x128( k01, zero ) ) );
+		     mm256_ror128_32( mm256_aesenc_2x128( k01, zero ) ) );
     if ( r == 1 )
        k01 = _mm256_xor_si256( k01, _mm256_set_epi32(
@@ -80,25 +86,25 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
     k02 = _mm256_xor_si256( k01,
-		     mm256_ror1x32_128( mm256_aesenc_2x128( k02, zero ) ) );
+		     mm256_ror128_32( mm256_aesenc_2x128( k02, zero ) ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
     k03 = _mm256_xor_si256( k02,
-		     mm256_ror1x32_128( mm256_aesenc_2x128( k03, zero ) ) );
+		     mm256_ror128_32( mm256_aesenc_2x128( k03, zero ) ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
     p3 = _mm256_xor_si256( p3, x );
     k10 = _mm256_xor_si256( k03,
-		     mm256_ror1x32_128( mm256_aesenc_2x128( k10, zero ) ) );
+		     mm256_ror128_32( mm256_aesenc_2x128( k10, zero ) ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
     k11 = _mm256_xor_si256( k10,
-		     mm256_ror1x32_128( mm256_aesenc_2x128( k11, zero ) ) );
+		     mm256_ror128_32( mm256_aesenc_2x128( k11, zero ) ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
     k12 = _mm256_xor_si256( k11,
-		     mm256_ror1x32_128( mm256_aesenc_2x128( k12, zero ) ) );
+		     mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
     k13 = _mm256_xor_si256( k12,
-		     mm256_ror1x32_128( mm256_aesenc_2x128( k13, zero ) ) );
+		     mm256_ror128_32( mm256_aesenc_2x128( k13, zero ) ) );
     if ( r == 2 )
        k13 = _mm256_xor_si256( k13, _mm256_set_epi32(
@@ -134,31 +140,31 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
     // round 3, 7, 11
-     k00 = _mm256_xor_si256( mm256_ror1x32_128(
+     k00 = _mm256_xor_si256( mm256_ror128_32(
                                     mm256_aesenc_2x128( k00, zero ) ), k13 );
     x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k00 ), zero );
-     k01 = _mm256_xor_si256( mm256_ror1x32_128(
+     k01 = _mm256_xor_si256( mm256_ror128_32(
                                     mm256_aesenc_2x128( k01, zero ) ), k00 );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
-     k02 = _mm256_xor_si256( mm256_ror1x32_128(
+     k02 = _mm256_xor_si256( mm256_ror128_32(
                                     mm256_aesenc_2x128( k02, zero ) ), k01 );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
-     k03 = _mm256_xor_si256( mm256_ror1x32_128(
+     k03 = _mm256_xor_si256( mm256_ror128_32(
                                     mm256_aesenc_2x128( k03, zero ) ), k02 );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
     p1 = _mm256_xor_si256( p1, x );
-     k10 = _mm256_xor_si256( mm256_ror1x32_128(
+     k10 = _mm256_xor_si256( mm256_ror128_32(
                                     mm256_aesenc_2x128( k10, zero ) ), k03 );
     x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k10 ), zero );
-     k11 = _mm256_xor_si256( mm256_ror1x32_128(
+     k11 = _mm256_xor_si256( mm256_ror128_32(
                                     mm256_aesenc_2x128( k11, zero ) ), k10 );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
-     k12 = _mm256_xor_si256( mm256_ror1x32_128(
+     k12 = _mm256_xor_si256( mm256_ror128_32(
                                     mm256_aesenc_2x128( k12, zero ) ), k11 );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
-     k13 = _mm256_xor_si256( mm256_ror1x32_128(
+     k13 = _mm256_xor_si256( mm256_ror128_32(
                                     mm256_aesenc_2x128( k13, zero ) ), k12 );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
@@ -192,35 +198,35 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
   // round 13
-   k00 = _mm256_xor_si256( mm256_ror1x32_128(
+   k00 = _mm256_xor_si256( mm256_ror128_32(
 			             mm256_aesenc_2x128( k00, zero ) ), k13  );
   x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
-   k01 = _mm256_xor_si256( mm256_ror1x32_128(
+   k01 = _mm256_xor_si256( mm256_ror128_32(
 			             mm256_aesenc_2x128( k01, zero ) ), k00 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
-   k02 = _mm256_xor_si256( mm256_ror1x32_128(
+   k02 = _mm256_xor_si256( mm256_ror128_32(
 			             mm256_aesenc_2x128( k02, zero ) ), k01 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
-   k03 = _mm256_xor_si256( mm256_ror1x32_128(
+   k03 = _mm256_xor_si256( mm256_ror128_32(
 			             mm256_aesenc_2x128( k03, zero ) ), k02 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
   p3 = _mm256_xor_si256( p3, x );
-   k10 = _mm256_xor_si256( mm256_ror1x32_128(
+   k10 = _mm256_xor_si256( mm256_ror128_32(
 			             mm256_aesenc_2x128( k10, zero ) ), k03 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
-   k11 = _mm256_xor_si256( mm256_ror1x32_128(
+   k11 = _mm256_xor_si256( mm256_ror128_32(
 			             mm256_aesenc_2x128( k11, zero ) ), k10 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
-   k12 = mm256_ror1x32_128( mm256_aesenc_2x128( k12, zero ) );
+   k12 = mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) );
   k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32(
 	       ~ctx->count2, ctx->count3, ctx->count0, ctx->count1,
 	       ~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
-   k13 = _mm256_xor_si256( mm256_ror1x32_128(
+   k13 = _mm256_xor_si256( mm256_ror128_32(
 			             mm256_aesenc_2x128( k13, zero ) ), k12 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
--- a/algo/x11/c11-4way.c
+++ b/algo/x11/c11-4way.c
@@ -51,6 +51,8 @@ void init_c11_8way_ctx()
 void c11_8way_hash( void *state, const void *input )
 {
     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
     uint64_t vhash0[4*8] __attribute__ ((aligned (64)));     
     uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -107,21 +109,18 @@ void c11_8way_hash( void *state, const void *input )
     skein512_8way_update( &ctx.skein, vhash, 64 );
     skein512_8way_close( &ctx.skein, vhash );
-     // Serial
+     rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                   vhash );
-     // 7 Luffa + 8 cube
+     luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
     luffa_4way_init( &ctx.luffa, 512 );
     luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
     cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
     cube_4way_init( &ctx.cube, 512, 16, 32 );
-     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
-     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+
-     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
     // 9 Shavite
     sph_shavite512( &ctx.shavite, hash0, 64 );
--- a/algo/x11/x11-4way.c
+++ b/algo/x11/x11-4way.c
@@ -51,6 +51,8 @@ void init_x11_8way_ctx()
 void x11_8way_hash( void *state, const void *input )
 {
     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
     uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
     uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
@@ -108,20 +110,18 @@ void x11_8way_hash( void *state, const void *input )
     keccak512_8way_update( &ctx.keccak, vhash, 64 );
     keccak512_8way_close( &ctx.keccak, vhash );
-     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
+     rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
                   vhash );
-     // Luffa + Cube
+     luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
     luffa_4way_init( &ctx.luffa, 512 );
     luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
     cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
     cube_4way_init( &ctx.cube, 512, 16, 32 );
-     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
+     cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
-     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
+
-     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
+     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
     sph_shavite512( &ctx.shavite, hash0, 64 );
     sph_shavite512_close( &ctx.shavite, hash0 );
--- a/algo/x12/x12-4way.c
+++ b/algo/x12/x12-4way.c
@@ -1,7 +1,4 @@
 #include "x12-gate.h"
 #if defined(X12_4WAY)
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -14,11 +11,223 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/cubehash/cube-hash-2way.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
 #include "algo/hamsi/hamsi-hash-4way.h"
-//#include "algo/fugue/sph_fugue.h"
+
 #if defined(X12_8WAY)
 // One context per hash function used by x12_8way_hash().
 // blake, bmw, skein, jh, keccak and hamsi are 8-way contexts; luffa, cube
 // and simd are 4-way contexts (each is run twice per hash, once per half);
 // groestl, shavite and echo are single-lane contexts that x12_8way_hash()
 // applies to the lanes one after another.
 typedef struct {
    blake512_8way_context   blake;
    bmw512_8way_context     bmw;
    hashState_groestl       groestl;
    skein512_8way_context   skein;
    jh512_8way_context      jh;
    keccak512_8way_context  keccak;
    luffa_4way_context      luffa;
    cube_4way_context       cube;
    sph_shavite512_context  shavite;
    simd_4way_context       simd;
    hashState_echo          echo;
    hamsi512_8way_context   hamsi;
 } x12_8way_ctx_holder;
 // Template instance: filled in by init_x12_8way_ctx() and copied with
 // memcpy() by x12_8way_hash() instead of re-running the init functions.
 x12_8way_ctx_holder x12_8way_ctx __attribute__ ((aligned (64)));
 void init_x12_8way_ctx()
 {
     blake512_8way_init( &x12_8way_ctx.blake );
     bmw512_8way_init( &x12_8way_ctx.bmw );
     init_groestl( &x12_8way_ctx.groestl, 64 );
     skein512_8way_init( &x12_8way_ctx.skein );
     jh512_8way_init( &x12_8way_ctx.jh );
     keccak512_8way_init( &x12_8way_ctx.keccak );
     luffa_4way_init( &x12_8way_ctx.luffa, 512 );
     cube_4way_init( &x12_8way_ctx.cube, 512, 16, 32 );
     sph_shavite512_init( &x12_8way_ctx.shavite );
     simd_4way_init( &x12_8way_ctx.simd, 512 );
     init_echo( &x12_8way_ctx.echo, 512 );
     hamsi512_8way_init( &x12_8way_ctx.hamsi );
 };
 void x12_8way_hash( void *state, const void *input )
 {
     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
     uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
     uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
     uint64_t hash3[8] __attribute__ ((aligned (64)));
     uint64_t hash4[8] __attribute__ ((aligned (64)));
     uint64_t hash5[8] __attribute__ ((aligned (64)));
     uint64_t hash6[8] __attribute__ ((aligned (64)));
     uint64_t hash7[8] __attribute__ ((aligned (64)));
     x12_8way_ctx_holder ctx;
     memcpy( &ctx, &x12_8way_ctx, sizeof(x12_8way_ctx) );
     blake512_8way_update( &ctx.blake, input, 80 );
     blake512_8way_close( &ctx.blake, vhash );
     bmw512_8way_update( &ctx.bmw, vhash, 64 );
     bmw512_8way_close( &ctx.bmw, vhash );
     rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
     luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
     luffa_4way_init( &ctx.luffa, 512 );
     luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
     cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
     cube_4way_init( &ctx.cube, 512, 16, 32 );
     cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
     sph_shavite512( &ctx.shavite, hash0, 64 );
     sph_shavite512_close( &ctx.shavite, hash0 );
     memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash1, 64 );
     sph_shavite512_close( &ctx.shavite, hash1 );
     memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash2, 64 );
     sph_shavite512_close( &ctx.shavite, hash2 );
     memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash3, 64 );
     sph_shavite512_close( &ctx.shavite, hash3 );
     memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash4, 64 );
     sph_shavite512_close( &ctx.shavite, hash4 );
     memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash5, 64 );
     sph_shavite512_close( &ctx.shavite, hash5 );
     memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash6, 64 );
     sph_shavite512_close( &ctx.shavite, hash6 );
     memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash7, 64 );
     sph_shavite512_close( &ctx.shavite, hash7 );
     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
     simd_4way_init( &ctx.simd, 512 );
     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
     update_final_echo( &ctx.echo, (BitSequence *)hash1,
                       (const BitSequence *) hash1, 512 );
     memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash2,
                       (const BitSequence *) hash2, 512 );
     memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );
     memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash4,
                       (const BitSequence *) hash4, 512 );
     memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash5,
                       (const BitSequence *) hash5, 512 );
     memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash6,
                       (const BitSequence *) hash6, 512 );
     memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash7,
                       (const BitSequence *) hash7, 512 );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
     memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
     memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
     memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
     memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
     memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
     memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
     memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                  hash7 );
     skein512_8way_update( &ctx.skein, vhash, 64 );
     skein512_8way_close( &ctx.skein, vhash );
     jh512_8way_update( &ctx.jh, vhash, 64 );
     jh512_8way_close( &ctx.jh, vhash );
     keccak512_8way_update( &ctx.keccak, vhash, 64 );
     keccak512_8way_close( &ctx.keccak, vhash );
     hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
     hamsi512_8way_close( &ctx.hamsi, state );
 }
 // Scan nonces eight at a time, starting at work->data[19].
 // Each pass hashes nonces n .. n+7 with x12_8way_hash(). A lane whose
 // tested hash word is below target word 7 is extracted and checked with
 // fulltest(); a passing lane is submitted unless opt_benchmark is set.
 // The loop ends once n reaches max_nonce-8 or a work restart is flagged.
 // *hashes_done receives the number of nonces covered. Always returns 0.
 int scanhash_x12_8way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
 {
     uint32_t hash[16*8] __attribute__ ((aligned (128)));
     uint32_t vdata[24*8] __attribute__ ((aligned (64)));
     uint32_t lane_hash[8] __attribute__ ((aligned (64)));
     uint32_t *const pdata = work->data;
     uint32_t *const ptarget = work->target;
     const uint32_t first_nonce = pdata[19];
     const uint32_t Htarg = ptarget[7];
     const int thr_id = mythr->id;
     // Lane L's tested word is hash_w7[2*L]; presumably the upper half of
     // 64-bit word 3 in the 8x64 layout -- TODO confirm.
     uint32_t *const hash_w7 = hash + 49;
     __m512i *const noncev = (__m512i*)vdata + 9;   // aligned
     uint32_t n = first_nonce;
     mm512_bswap32_intrlv80_8x64( vdata, pdata );
     for (;;)
     {
        // Write the eight candidate nonces into the interleaved data.
        *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
               _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                                 n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );
        x12_8way_hash( hash, vdata );
        for ( int lane = 0; lane < 8; lane++ )
        {
           // Cheap one-word test first; skip lanes that cannot qualify.
           if ( hash_w7[ lane<<1 ] >= Htarg )
              continue;
           extr_lane_8x64( lane_hash, hash, lane, 256 );
           if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
           {
              pdata[19] = n + lane;
              submit_lane_solution( work, lane_hash, mythr, lane );
           }
        }
        n += 8;
        if ( ( n >= max_nonce-8 ) || work_restart[thr_id].restart )
           break;
     }
     *hashes_done = n - first_nonce;
     return 0;
 }
 #elif defined(X12_4WAY)
 typedef struct {
    blake512_4way_context   blake;
@@ -63,45 +272,13 @@ void x12_4way_hash( void *state, const void *input )
     x12_4way_ctx_holder ctx;
     memcpy( &ctx, &x12_4way_ctx, sizeof(x12_4way_ctx) );
     // 1 Blake
     blake512_4way( &ctx.blake, input, 80 );
     blake512_4way_close( &ctx.blake, vhash );
     // 2 Bmw
     bmw512_4way( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );
     // Serial
     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     // 3 Groestl
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
     memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
     memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
     memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
     // Parallel 4way 64 bit
     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     // 4 Skein
     skein512_4way( &ctx.skein, vhash, 64 );
     skein512_4way_close( &ctx.skein, vhash );
     // 5 JH
     jh512_4way( &ctx.jh, vhash, 64 );
     jh512_4way_close( &ctx.jh, vhash );
     // 6 Keccak
     keccak512_4way( &ctx.keccak, vhash, 64 );
     keccak512_4way_close( &ctx.keccak, vhash );
     // Serial
     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     // 7 Luffa
     intrlv_2x128( vhash, hash0, hash1, 512 );
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
     dintrlv_2x128( hash0, hash1, vhash, 512 );
@@ -110,7 +287,6 @@ void x12_4way_hash( void *state, const void *input )
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
     dintrlv_2x128( hash2, hash3, vhash, 512 );
     // 8 Cubehash
     cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
     cubehashInit( &ctx.cube, 512, 16, 32 );
     cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
@@ -119,7 +295,6 @@ void x12_4way_hash( void *state, const void *input )
     cubehashInit( &ctx.cube, 512, 16, 32 );
     cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
     // 9 Shavite
     sph_shavite512( &ctx.shavite, hash0, 64 );
     sph_shavite512_close( &ctx.shavite, hash0 );
     memcpy( &ctx.shavite, &x12_4way_ctx.shavite,
@@ -135,7 +310,6 @@ void x12_4way_hash( void *state, const void *input )
     sph_shavite512( &ctx.shavite, hash3, 64 );
     sph_shavite512_close( &ctx.shavite, hash3 );
     // 10 Simd
     intrlv_2x128( vhash, hash0, hash1, 512 );
     simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
     dintrlv_2x128( hash0, hash1, vhash, 512 );
@@ -144,21 +318,25 @@ void x12_4way_hash( void *state, const void *input )
     simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
     dintrlv_2x128( hash2, hash3, vhash, 512 );
-     // 11 Echo
+     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+     memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
-                       (const BitSequence *) hash0, 512 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
-     memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
+     memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
-     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
-                       (const BitSequence *) hash1, 512 );
+     memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
-     memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash2,
                       (const BitSequence *) hash2, 512 );
     memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );
-     // 12 Hamsi parallel 4way 32 bit
+     // Parallel 4way 64 bit
     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     skein512_4way( &ctx.skein, vhash, 64 );
     skein512_4way_close( &ctx.skein, vhash );
     jh512_4way( &ctx.jh, vhash, 64 );
     jh512_4way_close( &ctx.jh, vhash );
     keccak512_4way( &ctx.keccak, vhash, 64 );
     keccak512_4way_close( &ctx.keccak, vhash );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );
--- a/algo/x12/x12-gate.c
+++ b/algo/x12/x12-gate.c
@@ -2,7 +2,11 @@
 bool register_x12_algo( algo_gate_t* gate )
 {
-#if defined (X12_4WAY)
+#if defined (X12_8WAY)
  init_x12_8way_ctx();
  gate->scanhash  = (void*)&scanhash_x12_8way;
  gate->hash      = (void*)&x12_8way_hash;
 #elif defined (X12_4WAY)
  init_x12_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x12_4way;
  gate->hash      = (void*)&x12_4way_hash;
@@ -11,7 +15,7 @@ bool register_x12_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x12;
  gate->hash      = (void*)&x12hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };
--- a/algo/x12/x12-gate.h
+++ b/algo/x12/x12-gate.h
@@ -4,29 +4,36 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
-#if defined(__AVX2__) && defined(__AES__)
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-  #define X12_4WAY
+  #define X12_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
  #define X12_4WAY 1
 #endif
 bool register_x12_algo( algo_gate_t* gate );
-#if defined(X12_4WAY)
+#if defined(X12_8WAY)
 void x12_8way_hash( void *state, const void *input );
 int scanhash_x12_8way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_x12_8way_ctx();
 #elif defined(X12_4WAY)
 void x12_4way_hash( void *state, const void *input );
 int scanhash_x12_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_x12_4way_ctx();
-#endif
+#else
 void x12hash( void *state, const void *input );
 int scanhash_x12( struct work *work, uint32_t max_nonce,
                  uint64_t *hashes_done, struct thr_info *mythr );
 void init_x12_ctx();
 #endif
 #endif
--- a/algo/x12/x12.c
+++ b/algo/x12/x12.c
@@ -20,35 +20,40 @@
 #include "algo/luffa/luffa_for_sse2.h" 
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/nist.h"
 #include "algo/blake/sse2/blake.c"   
 #include "algo/bmw/sse2/bmw.c"
 #include "algo/keccak/sse2/keccak.c"
 #include "algo/skein/sse2/skein.c"
 #include "algo/jh/sse2/jh_sse2_opt64.h"
 #if defined(__AES__)
  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/echo/aes_ni/hash_api.h"
 #endif
 typedef struct {
   sph_blake512_context    blake;
   sph_bmw512_context      bmw;
   sph_skein512_context    skein;
   sph_jh512_context       jh;
   sph_keccak512_context   keccak;
 #if defined(__AES__)
-        hashState_groestl       groestl;
+   hashState_groestl       groestl;
-        hashState_echo          echo;
+   hashState_echo          echo;
 #else
-        sph_groestl512_context   groestl;
+   sph_groestl512_context   groestl;
-        sph_echo512_context      echo;
+   sph_echo512_context      echo;
 #endif
-        hashState_luffa         luffa;
+   hashState_luffa          luffa;
-        cubehashParam           cubehash;
+   cubehashParam            cubehash;
-        sph_shavite512_context  shavite;
+   sph_shavite512_context   shavite;
-        hashState_sd            simd;
+   hashState_sd             simd;
-        sph_hamsi512_context    hamsi;
+   sph_hamsi512_context     hamsi;
 } x12_ctx_holder;
 x12_ctx_holder x12_ctx;
 void init_x12_ctx()
 {
        sph_blake512_init( &x12_ctx.blake );
        sph_bmw512_init( &x12_ctx.bmw );
        sph_skein512_init( &x12_ctx.skein);
        sph_jh512_init( &x12_ctx.jh);
        sph_keccak512_init( &x12_ctx.keccak);
 #if defined(__AES__)
        init_echo( &x12_ctx.echo, 512 );
        init_groestl (&x12_ctx.groestl, 64 );
@@ -65,102 +70,59 @@ void init_x12_ctx()
 void x12hash(void *output, const void *input)
 {
 	unsigned char hash[128] __attribute__ ((aligned (32)));
 	#define hashB hash+64
-        x12_ctx_holder ctx;
+   x12_ctx_holder ctx;
-        memcpy( &ctx, &x12_ctx, sizeof(x12_ctx) );
+   memcpy( &ctx, &x12_ctx, sizeof(x12_ctx) );
-        // X11 algos
+   sph_blake512(&ctx.blake, input, 80);
   sph_blake512_close(&ctx.blake, hash);
-        unsigned char hashbuf[128];
+   sph_bmw512(&ctx.bmw, hash, 64);
-        size_t hashptr;
+   sph_bmw512_close(&ctx.bmw, hash);
        sph_u64 hashctA;
        sph_u64 hashctB;
-        //---blake1---
+   update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
                           (const BitSequence*)hash, 64 );
-        DECL_BLK;
+   cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
-        BLK_I;
+                         (const byte*)hashB, 64 );
        BLK_W;
        BLK_C;
-        //---bmw2---
+   sph_shavite512( &ctx.shavite, hash, 64);
   sph_shavite512_close( &ctx.shavite, hashB);
-        DECL_BMW;
+   update_final_sd( &ctx.simd, (BitSequence *)hash,
-        BMW_I;
+                    (const BitSequence *)hashB, 512 );
        BMW_U;
        #define M(x)    sph_dec64le_aligned(data + 8 * (x))
        #define H(x)    (h[x])
        #define dH(x)   (dh[x])
        BMW_C;
        #undef M
        #undef H
        #undef dH
        //---groetl----
 #if defined(__AES__)
-        update_and_final_groestl( &ctx.groestl, (char*)hash,
+   update_final_echo ( &ctx.echo, (BitSequence *)hashB,
                                  (const char*)hash, 512 );
 #else
        sph_groestl512 (&ctx.groestl, hash, 64);
        sph_groestl512_close(&ctx.groestl, hash);
 #endif
        //---skein4---
        DECL_SKN;
        SKN_I;
        SKN_U;
        SKN_C;
        //---jh5------
        DECL_JH;
        JH_H;
        //---keccak6---
        DECL_KEC;
        KEC_I;
        KEC_U;
        KEC_C;
        //--- luffa7
        update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
                                (const BitSequence*)hash, 64 );
        // 8 Cube
        cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
                              (const byte*)hashB, 64 );
        // 9 Shavite
        sph_shavite512( &ctx.shavite, hash, 64);
        sph_shavite512_close( &ctx.shavite, hashB);
        // 10 Simd
        update_final_sd( &ctx.simd, (BitSequence *)hash,
                         (const BitSequence *)hashB, 512 );
        //11---echo---
 #if defined(__AES__)
        update_final_echo ( &ctx.echo, (BitSequence *)hashB,
                            (const BitSequence *)hash, 512 );
 #else
-        sph_echo512(&ctx.echo, hash, 64);
+   sph_echo512(&ctx.echo, hash, 64);
-        sph_echo512_close(&ctx.echo, hashB);
+   sph_echo512_close(&ctx.echo, hashB);
 #endif
-        // 12 Hamsi
+#if defined(__AES__)
   update_and_final_groestl( &ctx.groestl, (char*)hash,
                                  (const char*)hash, 512 );
 #else
   sph_groestl512 (&ctx.groestl, hash, 64);
   sph_groestl512_close(&ctx.groestl, hash);
 #endif
   sph_skein512(&ctx.skein, hash, 64);
   sph_skein512_close(&ctx.skein, hash);
   sph_jh512(&ctx.jh, hash, 64);
   sph_jh512_close(&ctx.jh, hash);
   sph_keccak512(&ctx.keccak, hash, 64);
   sph_keccak512_close(&ctx.keccak, hash);
 	sph_hamsi512(&ctx.hamsi, hashB, 64);
 	sph_hamsi512_close(&ctx.hamsi, hash);
        asm volatile ("emms");
 	memcpy(output, hashB, 32);
 }
--- a/algo/x13/x13-4way.c
+++ b/algo/x13/x13-4way.c
@@ -1,7 +1,4 @@
 #include "x13-gate.h"
 #if defined(X13_4WAY)
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -14,12 +11,270 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/cubehash/cube-hash-2way.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
 #include "algo/hamsi/hamsi-hash-4way.h"
 #include "algo/fugue/sph_fugue.h"
 #if defined(X13_8WAY)
 // Hash contexts for every stage of the 8-way X13 chain, in chain order.
 // Members with an "_8way" type are driven once per hash call on the 8x64
 // interleaved buffer; "_4way" members are driven twice (x13_8way_hash
 // re-inits them between the two halves); the remaining members are
 // single-lane contexts that x13_8way_hash reuses for each of the 8 lanes.
 typedef struct {
    blake512_8way_context   blake;     // stage 1
    bmw512_8way_context     bmw;       // stage 2
    hashState_groestl       groestl;   // stage 3, one lane at a time
    skein512_8way_context   skein;     // stage 4
    jh512_8way_context      jh;        // stage 5
    keccak512_8way_context  keccak;    // stage 6
    luffa_4way_context      luffa;     // stage 7, four lanes at a time
    cube_4way_context       cube;      // stage 8, four lanes at a time
    sph_shavite512_context  shavite;   // stage 9, one lane at a time
    simd_4way_context       simd;      // stage 10, four lanes at a time
    hashState_echo          echo;      // stage 11, one lane at a time
    hamsi512_8way_context   hamsi;     // stage 12
    sph_fugue512_context    fugue;     // stage 13, one lane at a time
 } x13_8way_ctx_holder;
 // Template instance: filled in by init_x13_8way_ctx() and copied with
 // memcpy by x13_8way_hash() at the start of every call.
 x13_8way_ctx_holder x13_8way_ctx;
 void init_x13_8way_ctx()
 {
     blake512_8way_init( &x13_8way_ctx.blake );
     bmw512_8way_init( &x13_8way_ctx.bmw );
     init_groestl( &x13_8way_ctx.groestl, 64 );
     skein512_8way_init( &x13_8way_ctx.skein );
     jh512_8way_init( &x13_8way_ctx.jh );
     keccak512_8way_init( &x13_8way_ctx.keccak );
     luffa_4way_init( &x13_8way_ctx.luffa, 512 );
     cube_4way_init( &x13_8way_ctx.cube, 512, 16, 32 );
     sph_shavite512_init( &x13_8way_ctx.shavite );
     simd_4way_init( &x13_8way_ctx.simd, 512 );
     init_echo( &x13_8way_ctx.echo, 512 );
     hamsi512_8way_init( &x13_8way_ctx.hamsi );
     sph_fugue512_init( &x13_8way_ctx.fugue );
 }
 void x13_8way_hash( void *state, const void *input )
 {
     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
     uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
     uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
     uint64_t hash3[8] __attribute__ ((aligned (64)));
     uint64_t hash4[8] __attribute__ ((aligned (64)));
     uint64_t hash5[8] __attribute__ ((aligned (64)));
     uint64_t hash6[8] __attribute__ ((aligned (64)));
     uint64_t hash7[8] __attribute__ ((aligned (64)));
     x13_8way_ctx_holder ctx;
     memcpy( &ctx, &x13_8way_ctx, sizeof(x13_8way_ctx) );
     blake512_8way_update( &ctx.blake, input, 80 );
     blake512_8way_close( &ctx.blake, vhash );
     bmw512_8way_update( &ctx.bmw, vhash, 64 );
     bmw512_8way_close( &ctx.bmw, vhash );
     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                   vhash );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
     memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
     memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
     memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
     memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
     memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
     memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
     memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                  hash7 );
     skein512_8way_update( &ctx.skein, vhash, 64 );
     skein512_8way_close( &ctx.skein, vhash );
     jh512_8way_update( &ctx.jh, vhash, 64 );
     jh512_8way_close( &ctx.jh, vhash );
     keccak512_8way_update( &ctx.keccak, vhash, 64 );
     keccak512_8way_close( &ctx.keccak, vhash );
     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                   vhash );
     rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
     luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
     luffa_4way_init( &ctx.luffa, 512 );
     luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
     cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
     cube_4way_init( &ctx.cube, 512, 16, 32 );
     cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
     sph_shavite512( &ctx.shavite, hash0, 64 );
     sph_shavite512_close( &ctx.shavite, hash0 );
     memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash1, 64 );
     sph_shavite512_close( &ctx.shavite, hash1 );
     memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash2, 64 );
     sph_shavite512_close( &ctx.shavite, hash2 );
     memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash3, 64 );
     sph_shavite512_close( &ctx.shavite, hash3 );
     memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash4, 64 );
     sph_shavite512_close( &ctx.shavite, hash4 );
     memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash5, 64 );
     sph_shavite512_close( &ctx.shavite, hash5 );
     memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash6, 64 );
     sph_shavite512_close( &ctx.shavite, hash6 );
     memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash7, 64 );
     sph_shavite512_close( &ctx.shavite, hash7 );
     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
     simd_4way_init( &ctx.simd, 512 );
     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
                       (const BitSequence *) hash0, 512 );
     memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash1,
                       (const BitSequence *) hash1, 512 );
     memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash2,
                       (const BitSequence *) hash2, 512 );
     memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );
     memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash4,
                       (const BitSequence *) hash4, 512 );
     memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash5,
                       (const BitSequence *) hash5, 512 );
     memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash6,
                       (const BitSequence *) hash6, 512 );
     memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash7,
                       (const BitSequence *) hash7, 512 );
     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                      hash7 );
     hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
     hamsi512_8way_close( &ctx.hamsi, vhash );
     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                       vhash );
     // 13 Fugue serial
     sph_fugue512( &ctx.fugue, hash0, 64 );
     sph_fugue512_close( &ctx.fugue, hash0 );
     memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash1, 64 );
     sph_fugue512_close( &ctx.fugue, hash1 );
     memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash2, 64 );
     sph_fugue512_close( &ctx.fugue, hash2 );
     memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash3, 64 );
     sph_fugue512_close( &ctx.fugue, hash3 );
     memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash4, 64 );
     sph_fugue512_close( &ctx.fugue, hash4 );
     memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash5, 64 );
     sph_fugue512_close( &ctx.fugue, hash5 );
     memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash6, 64 );
     sph_fugue512_close( &ctx.fugue, hash6 );
     memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash7, 64 );
     sph_fugue512_close( &ctx.fugue, hash7 );
     memcpy( state,     hash0, 32 );
     memcpy( state+ 32, hash1, 32 );
     memcpy( state+ 64, hash2, 32 );
     memcpy( state+ 96, hash3, 32 );
     memcpy( state+128, hash4, 32 );
     memcpy( state+160, hash5, 32 );
     memcpy( state+192, hash6, 32 );
     memcpy( state+224, hash7, 32 );
 }
 int scanhash_x13_8way( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr )
 {
     uint32_t hash[8*8] __attribute__ ((aligned (128)));
     uint32_t vdata[24*8] __attribute__ ((aligned (64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     int thr_id = mythr->id;
     __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
     const uint32_t Htarg = ptarget[7];
     const uint32_t last_nonce = max_nonce -8;
     mm512_bswap32_intrlv80_8x64( vdata, pdata );
     do
     {
        *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
         _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                           n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
         x13_8way_hash( hash, vdata );
         pdata[19] = n;
         for ( int i = 0; i < 8; i++ )
         if ( ( hash+(i<<3) )[7] < Htarg
              && fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
         {
             pdata[19] = n+i;
             submit_lane_solution( work, hash+(i<<3), mythr, i );
         }
         n += 8;
     } while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
     *hashes_done = n - first_nonce;
     return 0;
 }
 #elif defined(X13_4WAY)
 typedef struct {
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
--- a/algo/x13/x13-gate.c
+++ b/algo/x13/x13-gate.c
@@ -2,7 +2,11 @@
 bool register_x13_algo( algo_gate_t* gate )
 {
-#if defined (X13_4WAY)
+#if defined (X13_8WAY)
  init_x13_8way_ctx();
  gate->scanhash  = (void*)&scanhash_x13_8way;
  gate->hash      = (void*)&x13_8way_hash;
 #elif defined (X13_4WAY)
  init_x13_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x13_4way;
  gate->hash      = (void*)&x13_4way_hash;
@@ -11,7 +15,7 @@ bool register_x13_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x13;
  gate->hash      = (void*)&x13hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };
--- a/algo/x13/x13-gate.h
+++ b/algo/x13/x13-gate.h
@@ -4,29 +4,35 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
-#if defined(__AVX2__) && defined(__AES__)
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-  #define X13_4WAY
+  #define X13_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
  #define X13_4WAY 1
 #endif
 bool register_x13_algo( algo_gate_t* gate );
-#if defined(X13_4WAY)
+#if defined(X13_8WAY)
 void x13_8way_hash( void *state, const void *input );
 int scanhash_x13_8way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_x13_8way_ctx();
 #elif defined(X13_4WAY)
 void x13_4way_hash( void *state, const void *input );
 int scanhash_x13_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_x13_4way_ctx();
-#endif
+#else
 void x13hash( void *state, const void *input );
 int scanhash_x13( struct work *work, uint32_t max_nonce,
                  uint64_t *hashes_done, struct thr_info *mythr );
 void init_x13_ctx();
 #endif
 #endif
--- a/algo/x14/x14-4way.c
+++ b/algo/x14/x14-4way.c
@@ -1,7 +1,4 @@
 #include "x14-gate.h"
 #if defined(X14_4WAY)
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -13,6 +10,7 @@
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cube-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
@@ -22,6 +20,263 @@
 #include "algo/fugue/sph_fugue.h"
 #include "algo/shabal/shabal-hash-4way.h"
 #if defined(X14_8WAY)
 // Hash contexts for every stage of the 8-way X14 chain, in chain order.
 // "_8way" members are driven once per hash call, "_4way" members twice
 // (x14_8way_hash re-inits them between the two 4-lane halves), and the
 // remaining members are single-lane contexts reused for each of the
 // 8 lanes.
 typedef struct {
    blake512_8way_context   blake;     // stage 1
    bmw512_8way_context     bmw;       // stage 2
    hashState_groestl       groestl;   // stage 3, one lane at a time
    skein512_8way_context   skein;     // stage 4
    jh512_8way_context      jh;        // stage 5
    keccak512_8way_context  keccak;    // stage 6
    luffa_4way_context      luffa;     // stage 7, four lanes at a time
    cube_4way_context       cube;      // stage 8, four lanes at a time
    sph_shavite512_context  shavite;   // stage 9, one lane at a time
    simd_4way_context       simd;      // stage 10, four lanes at a time
    hashState_echo          echo;      // stage 11, one lane at a time
    hamsi512_8way_context   hamsi;     // stage 12
    sph_fugue512_context    fugue;     // stage 13, one lane at a time
    shabal512_8way_context  shabal;    // stage 14
 } x14_8way_ctx_holder;
 // Template instance: filled in by init_x14_8way_ctx() and copied with
 // memcpy by x14_8way_hash() at the start of every call.
 x14_8way_ctx_holder x14_8way_ctx __attribute__ ((aligned (64)));
 void init_x14_8way_ctx()
 {
     blake512_8way_init( &x14_8way_ctx.blake );
     bmw512_8way_init( &x14_8way_ctx.bmw );
     init_groestl( &x14_8way_ctx.groestl, 64 );
     skein512_8way_init( &x14_8way_ctx.skein );
     jh512_8way_init( &x14_8way_ctx.jh );
     keccak512_8way_init( &x14_8way_ctx.keccak );
     luffa_4way_init( &x14_8way_ctx.luffa, 512 );
     cube_4way_init( &x14_8way_ctx.cube, 512, 16, 32 );
     sph_shavite512_init( &x14_8way_ctx.shavite );
     simd_4way_init( &x14_8way_ctx.simd, 512 );
     init_echo( &x14_8way_ctx.echo, 512 );
     hamsi512_8way_init( &x14_8way_ctx.hamsi );
     sph_fugue512_init( &x14_8way_ctx.fugue );
     shabal512_8way_init( &x14_8way_ctx.shabal );
 };
 void x14_8way_hash( void *state, const void *input )
 {
     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
     uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
     uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
     uint64_t hash3[8] __attribute__ ((aligned (64)));
     uint64_t hash4[8] __attribute__ ((aligned (64)));
     uint64_t hash5[8] __attribute__ ((aligned (64)));
     uint64_t hash6[8] __attribute__ ((aligned (64)));
     uint64_t hash7[8] __attribute__ ((aligned (64)));
     x14_8way_ctx_holder ctx;
     memcpy( &ctx, &x14_8way_ctx, sizeof(x14_8way_ctx) );
     blake512_8way_update( &ctx.blake, input, 80 );
     blake512_8way_close( &ctx.blake, vhash );
     bmw512_8way_update( &ctx.bmw, vhash, 64 );
     bmw512_8way_close( &ctx.bmw, vhash );
     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                   vhash );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
     memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
     memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
     memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
     memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
     memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
     memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
     memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                  hash7 );
     skein512_8way_update( &ctx.skein, vhash, 64 );
     skein512_8way_close( &ctx.skein, vhash );
     jh512_8way_update( &ctx.jh, vhash, 64 );
     jh512_8way_close( &ctx.jh, vhash );
     keccak512_8way_update( &ctx.keccak, vhash, 64 );
     keccak512_8way_close( &ctx.keccak, vhash );
     rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
     luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
     luffa_4way_init( &ctx.luffa, 512 );
     luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
     cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
     cube_4way_init( &ctx.cube, 512, 16, 32 );
     cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
     sph_shavite512( &ctx.shavite, hash0, 64 );
     sph_shavite512_close( &ctx.shavite, hash0 );
     memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash1, 64 );
     sph_shavite512_close( &ctx.shavite, hash1 );
     memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash2, 64 );
     sph_shavite512_close( &ctx.shavite, hash2 );
     memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash3, 64 );
     sph_shavite512_close( &ctx.shavite, hash3 );
     memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash4, 64 );
     sph_shavite512_close( &ctx.shavite, hash4 );
     memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash5, 64 );
     sph_shavite512_close( &ctx.shavite, hash5 );
     memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash6, 64 );
     sph_shavite512_close( &ctx.shavite, hash6 );
     memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash7, 64 );
     sph_shavite512_close( &ctx.shavite, hash7 );
     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
     simd_4way_init( &ctx.simd, 512 );
     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
                       (const BitSequence *) hash0, 512 );
     memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash1,
                       (const BitSequence *) hash1, 512 );
     memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash2,
                       (const BitSequence *) hash2, 512 );
     memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );
     memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash4,
                       (const BitSequence *) hash4, 512 );
     memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash5,
                       (const BitSequence *) hash5, 512 );
     memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash6,
                       (const BitSequence *) hash6, 512 );
     memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash7,
                       (const BitSequence *) hash7, 512 );
     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                      hash7 );
     hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
     hamsi512_8way_close( &ctx.hamsi, vhash );
     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                       vhash );
     // 13 Fugue serial
     sph_fugue512( &ctx.fugue, hash0, 64 );
     sph_fugue512_close( &ctx.fugue, hash0 );
     memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash1, 64 );
     sph_fugue512_close( &ctx.fugue, hash1 );
     memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash2, 64 );
     sph_fugue512_close( &ctx.fugue, hash2 );
     memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash3, 64 );
     sph_fugue512_close( &ctx.fugue, hash3 );
     memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash4, 64 );
     sph_fugue512_close( &ctx.fugue, hash4 );
     memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash5, 64 );
     sph_fugue512_close( &ctx.fugue, hash5 );
     memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash6, 64 );
     sph_fugue512_close( &ctx.fugue, hash6 );
     memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash7, 64 );
     sph_fugue512_close( &ctx.fugue, hash7 );
     // 14 Shabal, parallel 32 bit
     intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                      hash7 );
     shabal512_8way_update( &ctx.shabal, vhash, 64 );
     shabal512_8way_close( &ctx.shabal, state );
 }
 int scanhash_x14_8way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr )
 {
     uint32_t hash[8*16] __attribute__ ((aligned (64)));
     uint32_t vdata[24*8] __attribute__ ((aligned (64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     const uint32_t last_nonce = max_nonce - 8;
     __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
     const uint32_t Htarg = ptarget[7];
     int thr_id = mythr->id;
     mm512_bswap32_intrlv80_8x64( vdata, pdata );
     do
     {
        *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
              _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                                n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
        x14_8way_hash( hash, vdata );
        pdata[19] = n;
        uint32_t *hash7 = &(hash[7<<3]);
        for ( int lane = 0; lane < 8; lane++ )
        if ( hash7[ lane ] < Htarg )
        {
            uint32_t lane_hash[8] __attribute__ ((aligned (64)));
            extr_lane_8x32( lane_hash, hash, lane, 256 );
            if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
            {
                pdata[19] = n + lane;
                submit_lane_solution( work, lane_hash, mythr, lane );
            }
         }
         n += 8;
     } while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
     *hashes_done = n - first_nonce;
     return 0;
 }
 #elif defined(X14_4WAY)
 typedef struct {
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
@@ -61,11 +316,11 @@ void init_x14_4way_ctx()
 void x14_4way_hash( void *state, const void *input )
 {
     uint64_t vhash[8*4] __attribute__ ((aligned (128)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
     uint64_t hash3[8] __attribute__ ((aligned (64)));
     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
     x14_4way_ctx_holder ctx;
     memcpy( &ctx, &x14_4way_ctx, sizeof(x14_4way_ctx) );
@@ -184,61 +439,49 @@ void x14_4way_hash( void *state, const void *input )
     // 14 Shabal, parallel 32 bit
     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
-     shabal512_4way( &ctx.shabal, vhash, 64 );
+     shabal512_4way_update( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, state );
 }
 int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr )
 {
-     uint32_t hash[4*16] __attribute__ ((aligned (64)));
+     uint32_t hash[4*16] __attribute__ ((aligned (128)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     uint32_t n = first_nonce;
     const uint32_t last_nonce = max_nonce - 4;
     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
     const uint32_t Htarg = ptarget[7];
-     int thr_id = mythr->id;  // thr_id arg is deprecated
+     int thr_id = mythr->id;  
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                          0xFFFFF000, 0xFFFF0000,          0  };
     mm256_bswap32_intrlv80_4x64( vdata, pdata );
-     for ( int m=0; m < 6; m++ )
+     do
-       if ( Htarg <= htmax[m] )
+     {
       *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
             _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
       x14_4way_hash( hash, vdata );
       pdata[19] = n;
       uint32_t *hash7 = &(hash[7<<2]);
       for ( int lane = 0; lane < 4; lane++ )
       if ( hash7[ lane ] < Htarg )
       {
-         uint32_t mask = masks[m];
+           uint32_t lane_hash[8];
-         do
+           extr_lane_4x32( lane_hash, hash, lane, 256 );
         {
           *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
-            x14_4way_hash( hash, vdata );
+           if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
-            pdata[19] = n;
+           {
-
+               pdata[19] = n + lane;
-            uint32_t *hash7 = &(hash[7<<2]);
+               submit_lane_solution( work, lane_hash, mythr, lane );
-
+           }
-            for ( int lane = 0; lane < 4; lane++ )
+        }
-            if ( ( hash7[ lane ] & mask ) == 0 )
+        n += 4;
-            {
+     } while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
-               // deinterleave hash for lane
+     *hashes_done = n - first_nonce;
               uint32_t lane_hash[8];
               extr_lane_4x32( lane_hash, hash, lane, 256 );
               if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
               {
                  pdata[19] = n + lane;
                  submit_lane_solution( work, lane_hash, mythr, lane );
               }
            }
            n += 4;
         } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
         break;
       }
     *hashes_done = n - first_nonce + 1;
     return 0;
 }
--- a/algo/x14/x14-gate.c
+++ b/algo/x14/x14-gate.c
@@ -2,7 +2,11 @@
 bool register_x14_algo( algo_gate_t* gate )
 {
-#if defined (X14_4WAY)
+#if defined (X14_8WAY)
  init_x14_8way_ctx();
  gate->scanhash  = (void*)&scanhash_x14_8way;
  gate->hash      = (void*)&x14_8way_hash;
 #elif defined (X14_4WAY)
  init_x14_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x14_4way;
  gate->hash      = (void*)&x14_4way_hash;
@@ -11,7 +15,7 @@ bool register_x14_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x14;
  gate->hash      = (void*)&x14hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };
--- a/algo/x14/x14-gate.h
+++ b/algo/x14/x14-gate.h
@@ -4,20 +4,29 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
-#if defined(__AVX2__) && defined(__AES__)
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-  #define X14_4WAY
+  #define X14_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
  #define X14_4WAY 1
 #endif
 bool register_x14_algo( algo_gate_t* gate );
-#if defined(X14_4WAY)
+#if defined(X14_8WAY)
 void x14_8way_hash( void *state, const void *input );
 int scanhash_x14_8way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_x14_8way_ctx();
 #elif defined(X14_4WAY)
 void x14_4way_hash( void *state, const void *input );
 int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_x14_4way_ctx();
-#endif
+#else
 void x14hash( void *state, const void *input );
 int scanhash_x14( struct work *work, uint32_t max_nonce,
@@ -26,3 +35,4 @@ void init_x14_ctx();
 #endif
 #endif
--- a/algo/x15/x15-4way.c
+++ b/algo/x15/x15-4way.c
@@ -1,7 +1,4 @@
 #include "x15-gate.h"
 #if defined(X15_4WAY)
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -14,6 +11,7 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/cubehash/cube-hash-2way.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
@@ -23,6 +21,309 @@
 #include "algo/shabal/shabal-hash-4way.h"
 #include "algo/whirlpool/sph_whirlpool.h"
 #if defined(X15_8WAY)
 // One hashing context per stage of the 8-way X15 chain, in chain order.
 // Blake, BMW, Skein, JH, Keccak, Hamsi and Shabal use 8-way contexts;
 // Luffa, CubeHash and SIMD use 4-way contexts; Groestl, Shavite, Echo,
 // Fugue and Whirlpool use single-lane contexts.
 typedef struct {
    blake512_8way_context   blake;
    bmw512_8way_context     bmw;
    hashState_groestl       groestl;
    skein512_8way_context   skein;
    jh512_8way_context      jh;
    keccak512_8way_context  keccak;
    luffa_4way_context      luffa;
    cube_4way_context       cube;
    sph_shavite512_context  shavite;
    simd_4way_context       simd;
    hashState_echo          echo;
    hamsi512_8way_context   hamsi;
    sph_fugue512_context    fugue;
    shabal512_8way_context  shabal;
    sph_whirlpool_context   whirlpool;
 } x15_8way_ctx_holder;
 // Template instance: filled in by init_x15_8way_ctx() and copied into a
 // local working context at the start of every x15_8way_hash() call.
 x15_8way_ctx_holder x15_8way_ctx __attribute__ ((aligned (64)));
 // Initialise every stage context of the template x15_8way_ctx.
 // x15_8way_hash() copies this template instead of re-running the init
 // functions for each hash, so this must be called once before hashing
 // (register_x15_algo() does so when X15_8WAY is defined).
 void init_x15_8way_ctx()
 {
      blake512_8way_init( &x15_8way_ctx.blake );
      bmw512_8way_init( &x15_8way_ctx.bmw );
      init_groestl( &x15_8way_ctx.groestl, 64 );
      skein512_8way_init( &x15_8way_ctx.skein );
      jh512_8way_init( &x15_8way_ctx.jh );
      keccak512_8way_init( &x15_8way_ctx.keccak );
      luffa_4way_init( &x15_8way_ctx.luffa, 512 );
      cube_4way_init( &x15_8way_ctx.cube, 512, 16, 32 );
      sph_shavite512_init( &x15_8way_ctx.shavite );
      simd_4way_init( &x15_8way_ctx.simd, 512 );
      init_echo( &x15_8way_ctx.echo, 512 );
      hamsi512_8way_init( &x15_8way_ctx.hamsi );
      sph_fugue512_init( &x15_8way_ctx.fugue );
      shabal512_8way_init( &x15_8way_ctx.shabal );
      sph_whirlpool_init( &x15_8way_ctx.whirlpool );
 }
 void x15_8way_hash( void *state, const void *input )
 {
     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
     uint64_t vhash0[4*8] __attribute__ ((aligned (64)));
     uint64_t vhash1[4*8] __attribute__ ((aligned (64)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
     uint64_t hash3[8] __attribute__ ((aligned (64)));
     uint64_t hash4[8] __attribute__ ((aligned (64)));
     uint64_t hash5[8] __attribute__ ((aligned (64)));
     uint64_t hash6[8] __attribute__ ((aligned (64)));
     uint64_t hash7[8] __attribute__ ((aligned (64)));
     x15_8way_ctx_holder ctx;
     memcpy( &ctx, &x15_8way_ctx, sizeof(x15_8way_ctx) );
     // 1 Blake
     blake512_8way_update( &ctx.blake, input, 80 );
     blake512_8way_close( &ctx.blake, vhash );
     // 2 Bmw
     bmw512_8way_update( &ctx.bmw, vhash, 64 );
     bmw512_8way_close( &ctx.bmw, vhash );
     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                       vhash );
     // 3 Groestl
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
     memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
     memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
     memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
     memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
     memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
     memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
     memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                      hash7 );
     skein512_8way_update( &ctx.skein, vhash, 64 );
     skein512_8way_close( &ctx.skein, vhash );
     // 5 JH
     jh512_8way_update( &ctx.jh, vhash, 64 );
     jh512_8way_close( &ctx.jh, vhash );
     // 6 Keccak
     keccak512_8way_update( &ctx.keccak, vhash, 64 );
     keccak512_8way_close( &ctx.keccak, vhash );
     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                       vhash );
     rintrlv_8x64_4x128( vhash0, vhash1, vhash, 512 );
     luffa_4way_update_close( &ctx.luffa, vhash0, vhash0, 64 );
     luffa_4way_init( &ctx.luffa, 512 );
     luffa_4way_update_close( &ctx.luffa, vhash1, vhash1, 64 );
     cube_4way_update_close( &ctx.cube, vhash0, vhash0, 64 );
     cube_4way_init( &ctx.cube, 512, 16, 32 );
     cube_4way_update_close( &ctx.cube, vhash1, vhash1, 64 );
     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash0 );
     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash1 );
     // 9 Shavite
     sph_shavite512( &ctx.shavite, hash0, 64 );
     sph_shavite512_close( &ctx.shavite, hash0 );
     memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash1, 64 );
     sph_shavite512_close( &ctx.shavite, hash1 );
     memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash2, 64 );
     sph_shavite512_close( &ctx.shavite, hash2 );
     memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash3, 64 );
     sph_shavite512_close( &ctx.shavite, hash3 );
     memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash4, 64 );
     sph_shavite512_close( &ctx.shavite, hash4 );
     memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash5, 64 );
     sph_shavite512_close( &ctx.shavite, hash5 );
     memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash6, 64 );
     sph_shavite512_close( &ctx.shavite, hash6 );
     memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash7, 64 );
     sph_shavite512_close( &ctx.shavite, hash7 );
     // 10 Simd
     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
     simd_4way_init( &ctx.simd, 512 );
     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
     // 11 Echo
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
                       (const BitSequence *) hash0, 512 );
     memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash1,
                       (const BitSequence *) hash1, 512 );
     memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash2,
                       (const BitSequence *) hash2, 512 );
     memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );
     memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash4,
                       (const BitSequence *) hash4, 512 );
     memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash5,
                       (const BitSequence *) hash5, 512 );
     memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash6,
                       (const BitSequence *) hash6, 512 );
     memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash7,
                       (const BitSequence *) hash7, 512 );
     // 12 Hamsi parallel 4way 64 bit
     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                      hash7 );
     hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
     hamsi512_8way_close( &ctx.hamsi, vhash );
     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                       vhash );
     // 13 Fugue
     sph_fugue512( &ctx.fugue, hash0, 64 );
     sph_fugue512_close( &ctx.fugue, hash0 );
     memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash1, 64 );
     sph_fugue512_close( &ctx.fugue, hash1 );
     memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash2, 64 );
     sph_fugue512_close( &ctx.fugue, hash2 );
     memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash3, 64 );
     sph_fugue512_close( &ctx.fugue, hash3 );
     memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash4, 64 );
     sph_fugue512_close( &ctx.fugue, hash4 );
     memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash5, 64 );
     sph_fugue512_close( &ctx.fugue, hash5 );
     memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash6, 64 );
     sph_fugue512_close( &ctx.fugue, hash6 );
     memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash7, 64 );
     sph_fugue512_close( &ctx.fugue, hash7 );
     // 14 Shabal, parallel 32 bit
     intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                      hash7 );
     shabal512_8way_update( &ctx.shabal, vhash, 64 );
     shabal512_8way_close( &ctx.shabal, vhash );
     dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                       vhash );
     // 15 Whirlpool
     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash0 );
     memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
             sizeof(sph_whirlpool_context) );
     sph_whirlpool( &ctx.whirlpool, hash1, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash1 );
     memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
             sizeof(sph_whirlpool_context) );
     sph_whirlpool( &ctx.whirlpool, hash2, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash2 );
     memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
             sizeof(sph_whirlpool_context) );
     sph_whirlpool( &ctx.whirlpool, hash3, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash3 );
     memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
             sizeof(sph_whirlpool_context) );
     sph_whirlpool( &ctx.whirlpool, hash4, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash4 );
     memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
             sizeof(sph_whirlpool_context) );
     sph_whirlpool( &ctx.whirlpool, hash5, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash5 );
     memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
             sizeof(sph_whirlpool_context) );
     sph_whirlpool( &ctx.whirlpool, hash6, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash6 );
     memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
             sizeof(sph_whirlpool_context) );
     sph_whirlpool( &ctx.whirlpool, hash7, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash7 );
     memcpy( state,    hash0, 32 );
     memcpy( state+ 32, hash1, 32 );
     memcpy( state+ 64, hash2, 32 );
     memcpy( state+ 96, hash3, 32 );
     memcpy( state+128, hash4, 32 );
     memcpy( state+160, hash5, 32 );
     memcpy( state+192, hash6, 32 );
     memcpy( state+224, hash7, 32 );
 }
 // Scan nonces eight at a time, starting at work->data[19], until the nonce
 // reaches max_nonce - 8 or a work restart is flagged for this thread.
 // Lanes whose hash passes the quick ptarget[7] check and then fulltest()
 // are submitted, except in benchmark mode. The number of nonces tried is
 // stored in *hashes_done; the function always returns 0.
 int scanhash_x15_8way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr )
 {
      uint32_t hash[8*8] __attribute__ ((aligned (128)));
      uint32_t vdata[24*8] __attribute__ ((aligned (64)));
      uint32_t *pdata = work->data;
      uint32_t *ptarget = work->target;
      const uint32_t first_nonce = pdata[19];
      const uint32_t last_nonce = max_nonce - 8;
      const uint32_t Htarg = ptarget[7];
      const int thr_id = mythr->id;
      // Vector 9 of the interleaved header is the one rewritten with the
      // nonces on every iteration.
      __m512i *noncev = (__m512i*)vdata + 9;
      uint32_t n = first_nonce;
      mm512_bswap32_intrlv80_8x64( vdata, pdata );
      do
      {
         // Byte-swapped nonces n..n+7 are blended into alternate 32-bit
         // elements of the nonce vector.
         *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
            _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                              n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
         x15_8way_hash( hash, vdata );
         pdata[19] = n;
         for ( int lane = 0; lane < 8; lane++ )
         {
            // Each lane's result occupies eight consecutive 32-bit words.
            uint32_t *lane_hash = hash + ( lane << 3 );
            if ( lane_hash[7] >= Htarg )
               continue;
            if ( !fulltest( lane_hash, ptarget ) || opt_benchmark )
               continue;
            pdata[19] = n + lane;
            // NOTE(review): the base of hash is passed here, not lane_hash;
            // confirm that is what submit_lane_solution() expects.
            submit_lane_solution( work, hash, mythr, lane );
         }
         n += 8;
      } while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
      *hashes_done = n - first_nonce;
      return 0;
 }
 #elif defined(X15_4WAY)
 typedef struct {
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
@@ -64,11 +365,11 @@ void init_x15_4way_ctx()
 void x15_4way_hash( void *state, const void *input )
 {
     uint64_t vhash[8*4] __attribute__ ((aligned (128)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
     uint64_t hash3[8] __attribute__ ((aligned (64)));
     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
     x15_4way_ctx_holder ctx;
     memcpy( &ctx, &x15_4way_ctx, sizeof(x15_4way_ctx) );
@@ -187,7 +488,7 @@ void x15_4way_hash( void *state, const void *input )
     // 14 Shabal, parallel 32 bit
     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
-     shabal512_4way( &ctx.shabal, vhash, 64 );
+     shabal512_4way_update( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );
     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -216,48 +517,37 @@ void x15_4way_hash( void *state, const void *input )
 int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr )
 {
-     uint32_t hash[4*8] __attribute__ ((aligned (64)));
+     uint32_t hash[4*8] __attribute__ ((aligned (128)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
-     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+     uint32_t n = first_nonce;
     const uint32_t last_nonce = max_nonce - 4;
     __m256i  *noncev = (__m256i*)vdata + 9;
     const uint32_t Htarg = ptarget[7];
-     int thr_id = mythr->id;  // thr_id arg is deprecated
+     int thr_id = mythr->id;  
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                          0xFFFFF000, 0xFFFF0000,          0  };
     mm256_bswap32_intrlv80_4x64( vdata, pdata );
-     for ( int m=0; m < 6; m++ )
+     do
-       if ( Htarg <= htmax[m] )
+     {
-       {
+        *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-         uint32_t mask = masks[m];
+              _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
-         do
+
         x15_4way_hash( hash, vdata );
         pdata[19] = n;
         for ( int i = 0; i < 4; i++ )
         if ( ( hash+(i<<3) )[7] < Htarg )
         if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
         {
-           *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+            pdata[19] = n+i;
-                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
+            submit_lane_solution( work, hash, mythr, i );
         }
         n += 4;
     } while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
-            x15_4way_hash( hash, vdata );
+     *hashes_done = n - first_nonce;
            pdata[19] = n;
            for ( int i = 0; i < 4; i++ )
            if ( ( (hash+(i<<3))[7] & mask ) == 0 )
            if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
            {
               pdata[19] = n+i;
               submit_lane_solution( work, hash, mythr, i );
            }
            n += 4;
         } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
         break;
       }
     *hashes_done = n - first_nonce + 1;
     return 0;
 }
--- a/algo/x15/x15-gate.c
+++ b/algo/x15/x15-gate.c
@@ -2,7 +2,11 @@
 bool register_x15_algo( algo_gate_t* gate )
 {
-#if defined (X15_4WAY)
+#if defined (X15_8WAY)
  init_x15_8way_ctx();
  gate->scanhash  = (void*)&scanhash_x15_8way;
  gate->hash      = (void*)&x15_8way_hash;
 #elif defined (X15_4WAY)
  init_x15_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x15_4way;
  gate->hash      = (void*)&x15_4way_hash;
@@ -11,7 +15,7 @@ bool register_x15_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x15;
  gate->hash      = (void*)&x15hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };
--- a/algo/x15/x15-gate.h
+++ b/algo/x15/x15-gate.h
@@ -4,20 +4,30 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
-#if defined(__AVX2__) && defined(__AES__)
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-  #define X15_4WAY
+  #define X15_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
  #define X15_4WAY 1
 #endif
 bool register_x15_algo( algo_gate_t* gate );
-#if defined(X15_4WAY)
+#if defined(X15_8WAY)
 void x15_8way_hash( void *state, const void *input );
 int scanhash_x15_8way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_x15_8way_ctx();
 #elif defined(X15_4WAY)
 void x15_4way_hash( void *state, const void *input );
 int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_x15_4way_ctx();
-#endif
+#else
 void x15hash( void *state, const void *input );
 int scanhash_x15( struct work *work, uint32_t max_nonce,
@@ -26,3 +36,5 @@ void init_x15_ctx();
 #endif
 #endif
--- a/algo/x16/x16r-4way.c
+++ b/algo/x16/x16r-4way.c
@@ -5,9 +5,6 @@
 * Optimized by JayDDee@github Jan 2018
 */
 #include "x16r-gate.h"
 #if defined (X16R_4WAY)
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -20,6 +17,7 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cube-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
@@ -32,6 +30,392 @@
 static __thread uint32_t s_ntime = UINT32_MAX;
 static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
 #if defined (X16R_8WAY)
 // Working context for a single stage of the 8-way X16R chain. The stages
 // run one after another, and x16r_8way_hash() calls the selected
 // algorithm's init function before each use, so all contexts can share the
 // same storage in a union.
 union _x16r_8way_context_overlay
 {
    blake512_8way_context   blake;
    bmw512_8way_context     bmw;
    hashState_groestl       groestl;
    skein512_8way_context   skein;
    jh512_8way_context      jh;
    keccak512_8way_context  keccak;
    luffa_4way_context      luffa;
    cube_4way_context       cube;
    sph_shavite512_context  shavite;
    simd_4way_context       simd;
    hashState_echo          echo;
    hamsi512_8way_context   hamsi;
    sph_fugue512_context    fugue;
    shabal512_8way_context  shabal;
    sph_whirlpool_context   whirlpool;
    sha512_8way_context     sha512;
 } __attribute__ ((aligned (64)));
 // Convenience name for the union above.
 typedef union _x16r_8way_context_overlay x16r_8way_context_overlay;
 void x16r_8way_hash( void* output, const void* input )
 {
   uint32_t vhash[24*8] __attribute__ ((aligned (128)));
   uint32_t hash0[24] __attribute__ ((aligned (64)));
   uint32_t hash1[24] __attribute__ ((aligned (64)));
   uint32_t hash2[24] __attribute__ ((aligned (64)));
   uint32_t hash3[24] __attribute__ ((aligned (64)));
   uint32_t hash4[24] __attribute__ ((aligned (64)));
   uint32_t hash5[24] __attribute__ ((aligned (64)));
   uint32_t hash6[24] __attribute__ ((aligned (64)));
   uint32_t hash7[24] __attribute__ ((aligned (64)));
   x16r_8way_context_overlay ctx;
   void *in0 = (void*) hash0;
   void *in1 = (void*) hash1;
   void *in2 = (void*) hash2;
   void *in3 = (void*) hash3;
   void *in4 = (void*) hash4;
   void *in5 = (void*) hash5;
   void *in6 = (void*) hash6;
   void *in7 = (void*) hash7;
   int size = 80;
   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                 input, 640 );
   for ( int i = 0; i < 16; i++ )
   {
      const char elem = hashOrder[i];
      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
      switch ( algo )
      {
         case BLAKE:
            blake512_8way_init( &ctx.blake );
            if ( i == 0 )
               blake512_8way_update( &ctx.blake, input, size );
            else
            {
               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, 
                            size<<3 );
               blake512_8way_update( &ctx.blake, vhash, size );
            }
            blake512_8way_close( &ctx.blake, vhash );
            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                          hash7, vhash );
         break;
         case BMW:
            bmw512_8way_init( &ctx.bmw );
            if ( i == 0 )
               bmw512_8way_update( &ctx.bmw, input, size );
            else
            {
               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
                            size<<3 );
            bmw512_8way_update( &ctx.bmw, vhash, size );
            }
            bmw512_8way_close( &ctx.bmw, vhash );
            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                          hash7, vhash );
         break;
         case GROESTL:
               init_groestl( &ctx.groestl, 64 );
               update_and_final_groestl( &ctx.groestl, (char*)hash0,
                                                 (const char*)in0, size<<3 );
               init_groestl( &ctx.groestl, 64 );
               update_and_final_groestl( &ctx.groestl, (char*)hash1,
                                                 (const char*)in1, size<<3 );
               init_groestl( &ctx.groestl, 64 );
               update_and_final_groestl( &ctx.groestl, (char*)hash2,
                                                 (const char*)in2, size<<3 );
               init_groestl( &ctx.groestl, 64 );
               update_and_final_groestl( &ctx.groestl, (char*)hash3,
                                                 (const char*)in3, size<<3 );
               init_groestl( &ctx.groestl, 64 );
               update_and_final_groestl( &ctx.groestl, (char*)hash4,
                                                 (const char*)in4, size<<3 );
               init_groestl( &ctx.groestl, 64 );
               update_and_final_groestl( &ctx.groestl, (char*)hash5,
                                                 (const char*)in5, size<<3 );
               init_groestl( &ctx.groestl, 64 );
               update_and_final_groestl( &ctx.groestl, (char*)hash6,
                                                 (const char*)in6, size<<3 );
               init_groestl( &ctx.groestl, 64 );
               update_and_final_groestl( &ctx.groestl, (char*)hash7,
                                                 (const char*)in7, size<<3 );
         break;
         case SKEIN:
            skein512_8way_init( &ctx.skein );
            if ( i == 0 )
               skein512_8way_update( &ctx.skein, input, size );
            else
            {
               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
                            size<<3 );
               skein512_8way_update( &ctx.skein, vhash, size );
            }
            skein512_8way_close( &ctx.skein, vhash );
            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                          hash7, vhash );
         break;
         case JH:
            jh512_8way_init( &ctx.jh );
            if ( i == 0 )
               jh512_8way_update( &ctx.jh, input, size );
            else
            {
               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, 
                            size<<3 );
               jh512_8way_update( &ctx.jh, vhash, size );
            }
            jh512_8way_close( &ctx.jh, vhash );
            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                          hash7, vhash );
         break;
         case KECCAK:
            keccak512_8way_init( &ctx.keccak );
            if ( i == 0 )
               keccak512_8way_update( &ctx.keccak, input, size );
            else
            {
               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7, 
                            size<<3 );
               keccak512_8way_update( &ctx.keccak, vhash, size );
            }
            keccak512_8way_close( &ctx.keccak, vhash );
            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                          hash7, vhash );
         break;
         case LUFFA:
            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
            luffa_4way_init( &ctx.luffa, 512 );
            luffa_4way_update_close( &ctx.luffa, vhash, vhash, size );
            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
            luffa_4way_init( &ctx.luffa, 512 );
            luffa_4way_update_close( &ctx.luffa, vhash, vhash, size);
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
         break;
         case CUBEHASH:
            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
            cube_4way_init( &ctx.cube, 512, 16, 32 );
            cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
            cube_4way_init( &ctx.cube, 512, 16, 32 );
            cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
         break;
         case SHAVITE:
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, in0, size );
            sph_shavite512_close( &ctx.shavite, hash0 );
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, in1, size );
            sph_shavite512_close( &ctx.shavite, hash1 );
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, in2, size );
            sph_shavite512_close( &ctx.shavite, hash2 );
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, in3, size );
            sph_shavite512_close( &ctx.shavite, hash3 );
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, in4, size );
            sph_shavite512_close( &ctx.shavite, hash4 );
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, in5, size );
            sph_shavite512_close( &ctx.shavite, hash5 );
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, in6, size );
            sph_shavite512_close( &ctx.shavite, hash6 );
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, in7, size );
            sph_shavite512_close( &ctx.shavite, hash7 );
         break;
         case SIMD:
            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
            simd_4way_init( &ctx.simd, 512 );
            simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
            simd_4way_init( &ctx.simd, 512 );
            simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
         break;
         case ECHO:
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash0,
                                (const BitSequence*)in0, size<<3 );
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash1,
                                (const BitSequence*)in1, size<<3 );
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash2,
                                (const BitSequence*)in2, size<<3 );
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash3,
                                (const BitSequence*)in3, size<<3 );
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash4,
                                (const BitSequence*)in4, size<<3 );
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash5,
                                (const BitSequence*)in5, size<<3 );
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash6,
                                (const BitSequence*)in6, size<<3 );
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash7,
                                (const BitSequence*)in7, size<<3 );
         break;
         case HAMSI:
             intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
                            size<<3 );
             hamsi512_8way_init( &ctx.hamsi );
             hamsi512_8way_update( &ctx.hamsi, vhash, size );
             hamsi512_8way_close( &ctx.hamsi, vhash );
             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                          hash7, vhash );
             break;
         case FUGUE:
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in0, size );
             sph_fugue512_close( &ctx.fugue, hash0 );
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in1, size );
             sph_fugue512_close( &ctx.fugue, hash1 );
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in2, size );
             sph_fugue512_close( &ctx.fugue, hash2 );
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in3, size );
             sph_fugue512_close( &ctx.fugue, hash3 );
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in4, size );
             sph_fugue512_close( &ctx.fugue, hash4 );
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in5, size );
             sph_fugue512_close( &ctx.fugue, hash5 );
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in6, size );
             sph_fugue512_close( &ctx.fugue, hash6 );
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in7, size );
             sph_fugue512_close( &ctx.fugue, hash7 );
         break;
         case SHABAL:
             intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
                          size<<3 );
             shabal512_8way_init( &ctx.shabal );
             shabal512_8way_update( &ctx.shabal, vhash, size );
             shabal512_8way_close( &ctx.shabal, vhash );
             dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                          hash7, vhash );
         break;
         case WHIRLPOOL:
             sph_whirlpool_init( &ctx.whirlpool );
             sph_whirlpool( &ctx.whirlpool, in0, size );
             sph_whirlpool_close( &ctx.whirlpool, hash0 );
             sph_whirlpool_init( &ctx.whirlpool );
             sph_whirlpool( &ctx.whirlpool, in1, size );
             sph_whirlpool_close( &ctx.whirlpool, hash1 );
             sph_whirlpool_init( &ctx.whirlpool );
             sph_whirlpool( &ctx.whirlpool, in2, size );
             sph_whirlpool_close( &ctx.whirlpool, hash2 );
             sph_whirlpool_init( &ctx.whirlpool );
             sph_whirlpool( &ctx.whirlpool, in3, size );
             sph_whirlpool_close( &ctx.whirlpool, hash3 );
             sph_whirlpool_init( &ctx.whirlpool );
             sph_whirlpool( &ctx.whirlpool, in4, size );
             sph_whirlpool_close( &ctx.whirlpool, hash4 );
             sph_whirlpool_init( &ctx.whirlpool );
             sph_whirlpool( &ctx.whirlpool, in5, size );
             sph_whirlpool_close( &ctx.whirlpool, hash5 );
             sph_whirlpool_init( &ctx.whirlpool );
             sph_whirlpool( &ctx.whirlpool, in6, size );
             sph_whirlpool_close( &ctx.whirlpool, hash6 );
             sph_whirlpool_init( &ctx.whirlpool );
             sph_whirlpool( &ctx.whirlpool, in7, size );
             sph_whirlpool_close( &ctx.whirlpool, hash7 );
         break;
         case SHA_512:
             intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
                            size<<3 );
             sha512_8way_init( &ctx.sha512 );
             sha512_8way_update( &ctx.sha512, vhash, size );
             sha512_8way_close( &ctx.sha512, vhash );
             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                          hash7, vhash );
         break;
      }
      size = 64;
   }
   memcpy( output,     hash0, 32 );
   memcpy( output+32,  hash1, 32 );
   memcpy( output+64,  hash2, 32 );
   memcpy( output+96,  hash3, 32 );
   memcpy( output+128, hash4, 32 );
   memcpy( output+160, hash5, 32 );
   memcpy( output+192, hash6, 32 );
   memcpy( output+224, hash7, 32 );
 }
 int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr)
 {
   uint32_t hash[8*16] __attribute__ ((aligned (128)));
   uint32_t vdata[24*8] __attribute__ ((aligned (64)));
   uint32_t bedata1[2] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 8;
   uint32_t n = first_nonce;
    __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
   int thr_id = mythr->id;
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
   if ( opt_benchmark )
      ptarget[7] = 0x0cff;
   mm512_bswap32_intrlv80_8x64( vdata, pdata );
   bedata1[0] = bswap_32( pdata[1] );
   bedata1[1] = bswap_32( pdata[2] );
   const uint32_t ntime = bswap_32( pdata[17] );
   if ( s_ntime != ntime )
   {
      x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
      s_ntime = ntime;
      if ( opt_debug && !thr_id )
              applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
   }
   do
   {
      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
           _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                             n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
      x16r_8way_hash( hash, vdata );
      pdata[19] = n;
      for ( int i = 0; i < 8; i++ )
      if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
      if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
      {
         pdata[19] = n+i;
         submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
      n += 8;
   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
   *hashes_done = n - first_nonce;
   return 0;
 }
 #elif defined (X16R_4WAY)
 union _x16r_4way_context_overlay
 {
    blake512_4way_context   blake;
@@ -50,16 +434,16 @@ union _x16r_4way_context_overlay
    shabal512_4way_context  shabal;
    sph_whirlpool_context   whirlpool;
    sha512_4way_context     sha512;
-};
+} __attribute__ ((aligned (64)));
 typedef union _x16r_4way_context_overlay x16r_4way_context_overlay;
 void x16r_4way_hash( void* output, const void* input )
 {
   uint32_t vhash[24*4] __attribute__ ((aligned (128)));
   uint32_t hash0[24] __attribute__ ((aligned (64)));
   uint32_t hash1[24] __attribute__ ((aligned (64)));
   uint32_t hash2[24] __attribute__ ((aligned (64)));
   uint32_t hash3[24] __attribute__ ((aligned (64)));
   uint32_t vhash[24*4] __attribute__ ((aligned (64)));
   x16r_4way_context_overlay ctx;
   void *in0 = (void*) hash0;
   void *in1 = (void*) hash1;
@@ -86,7 +470,7 @@ void x16r_4way_hash( void* output, const void* input )
               blake512_4way( &ctx.blake, vhash, size );
            }
            blake512_4way_close( &ctx.blake, vhash );
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case BMW:
            bmw512_4way_init( &ctx.bmw );
@@ -98,7 +482,7 @@ void x16r_4way_hash( void* output, const void* input )
               bmw512_4way( &ctx.bmw, vhash, size );
            }
            bmw512_4way_close( &ctx.bmw, vhash );
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case GROESTL:
               init_groestl( &ctx.groestl, 64 );
@@ -124,7 +508,7 @@ void x16r_4way_hash( void* output, const void* input )
               skein512_4way( &ctx.skein, vhash, size );
            }
            skein512_4way_close( &ctx.skein, vhash );
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case JH:
            jh512_4way_init( &ctx.jh );
@@ -136,7 +520,7 @@ void x16r_4way_hash( void* output, const void* input )
               jh512_4way( &ctx.jh, vhash, size );
            }
            jh512_4way_close( &ctx.jh, vhash );
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case KECCAK:
            keccak512_4way_init( &ctx.keccak );
@@ -148,17 +532,17 @@ void x16r_4way_hash( void* output, const void* input )
               keccak512_4way( &ctx.keccak, vhash, size );
            }
            keccak512_4way_close( &ctx.keccak, vhash );
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+            dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case LUFFA:
            intrlv_2x128( vhash, in0, in1, size<<3 );
            luffa_2way_init( &ctx.luffa, 512 );
            luffa_2way_update_close( &ctx.luffa, vhash, vhash, size );
-            dintrlv_2x128( hash0, hash1, vhash, 512 );
+            dintrlv_2x128_512( hash0, hash1, vhash );
            intrlv_2x128( vhash, in2, in3, size<<3 );
            luffa_2way_init( &ctx.luffa, 512 );
            luffa_2way_update_close( &ctx.luffa, vhash, vhash, size);
-            dintrlv_2x128( hash2, hash3, vhash, 512 );
+            dintrlv_2x128_512( hash2, hash3, vhash );
         break;
         case CUBEHASH:
            cubehashInit( &ctx.cube, 512, 16, 32 );
@@ -192,11 +576,11 @@ void x16r_4way_hash( void* output, const void* input )
            intrlv_2x128( vhash, in0, in1, size<<3 );
            simd_2way_init( &ctx.simd, 512 );
            simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
-            dintrlv_2x128( hash0, hash1, vhash, 512 );
+            dintrlv_2x128_512( hash0, hash1, vhash );
            intrlv_2x128( vhash, in2, in3, size<<3 );
            simd_2way_init( &ctx.simd, 512 );
            simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
-            dintrlv_2x128( hash2, hash3, vhash, 512 );
+            dintrlv_2x128_512( hash2, hash3, vhash );
         break;
         case ECHO:
             init_echo( &ctx.echo, 512 );
@@ -217,7 +601,7 @@ void x16r_4way_hash( void* output, const void* input )
             hamsi512_4way_init( &ctx.hamsi );
             hamsi512_4way( &ctx.hamsi, vhash, size );
             hamsi512_4way_close( &ctx.hamsi, vhash );
-             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+             dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case FUGUE:
             sph_fugue512_init( &ctx.fugue );
@@ -238,7 +622,7 @@ void x16r_4way_hash( void* output, const void* input )
             shabal512_4way_init( &ctx.shabal );
             shabal512_4way( &ctx.shabal, vhash, size );
             shabal512_4way_close( &ctx.shabal, vhash );
-             dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+             dintrlv_4x32_512( hash0, hash1, hash2, hash3, vhash );
         break;
         case WHIRLPOOL:
             sph_whirlpool_init( &ctx.whirlpool );
@@ -259,7 +643,7 @@ void x16r_4way_hash( void* output, const void* input )
             sha512_4way_init( &ctx.sha512 );
             sha512_4way( &ctx.sha512, vhash, size );
             sha512_4way_close( &ctx.sha512, vhash );
-             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+             dintrlv_4x64_512( hash0, hash1, hash2, hash3, vhash );
         break;
      }
      size = 64;
@@ -280,6 +664,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 4;
   uint32_t n = first_nonce;
    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
   int thr_id = mythr->id;
@@ -317,9 +702,9 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
         submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
      n += 4;
-   } while ( likely( ( n < max_nonce ) && !(*restart) ) );
+   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
-   *hashes_done = n - first_nonce + 1;
+   *hashes_done = n - first_nonce;
   return 0;
 }
--- a/algo/x16/x16r-gate.c
+++ b/algo/x16/x16r-gate.c
@@ -34,14 +34,17 @@ void x16s_getAlgoString( const uint8_t* prevblock, char *output )
 bool register_x16r_algo( algo_gate_t* gate )
 {
-#if defined (X16R_4WAY)
+#if defined (X16R_8WAY)
  gate->scanhash  = (void*)&scanhash_x16r_8way;
  gate->hash      = (void*)&x16r_8way_hash;
 #elif defined (X16R_4WAY)
  gate->scanhash  = (void*)&scanhash_x16r_4way;
  gate->hash      = (void*)&x16r_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16r;
  gate->hash      = (void*)&x16r_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
  opt_target_factor = 256.0;
  return true;
@@ -49,14 +52,17 @@ bool register_x16r_algo( algo_gate_t* gate )
 bool register_x16rv2_algo( algo_gate_t* gate )
 {
-#if defined (X16R_4WAY)
+#if defined (X16R_8WAY)
  gate->scanhash  = (void*)&scanhash_x16rv2_8way;
  gate->hash      = (void*)&x16rv2_8way_hash;
 #elif defined (X16R_4WAY)
  gate->scanhash  = (void*)&scanhash_x16rv2_4way;
  gate->hash      = (void*)&x16rv2_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16rv2;
  gate->hash      = (void*)&x16rv2_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
  opt_target_factor = 256.0;
  return true;
@@ -64,14 +70,17 @@ bool register_x16rv2_algo( algo_gate_t* gate )
 bool register_x16s_algo( algo_gate_t* gate )
 {
-#if defined (X16R_4WAY)
+#if defined (X16R_8WAY)
  gate->scanhash  = (void*)&scanhash_x16r_8way;
  gate->hash      = (void*)&x16r_8way_hash;
 #elif defined (X16R_4WAY)
  gate->scanhash  = (void*)&scanhash_x16r_4way;
  gate->hash      = (void*)&x16r_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16r;
  gate->hash      = (void*)&x16r_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
  opt_target_factor = 256.0;
  return true;
@@ -196,28 +205,34 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
 bool register_x16rt_algo( algo_gate_t* gate )
 {
-#if defined (X16R_4WAY)
+#if defined (X16R_8WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_8way;
  gate->hash      = (void*)&x16rt_8way_hash;
 #elif defined (X16R_4WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_4way;
  gate->hash      = (void*)&x16rt_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16rt;
  gate->hash      = (void*)&x16rt_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  opt_target_factor = 256.0;
  return true;
 };
 bool register_x16rt_veil_algo( algo_gate_t* gate )
 {
-#if defined (X16R_4WAY)
+#if defined (X16R_8WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_8way;
  gate->hash      = (void*)&x16rt_8way_hash;
 #elif defined (X16R_4WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_4way;
  gate->hash      = (void*)&x16rt_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16rt;
  gate->hash      = (void*)&x16rt_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  gate->build_extraheader = (void*)&veil_build_extraheader;
  opt_target_factor = 256.0;
  return true;
@@ -231,7 +246,7 @@ bool register_hex_algo( algo_gate_t* gate )
 {
  gate->scanhash        = (void*)&scanhash_hex;
  gate->hash            = (void*)&hex_hash;
-  gate->optimizations   = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations   = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
  opt_target_factor = 128.0;
  return true;
--- a/algo/x16/x16r-gate.h
+++ b/algo/x16/x16r-gate.h
@@ -6,8 +6,10 @@
 #include <stdint.h>
 #include <unistd.h>
-#if defined(__AVX2__) && defined(__AES__)
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-  #define X16R_4WAY
+  #define X16R_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
  #define X16R_4WAY 1
 #endif
 enum x16r_Algo {
@@ -44,7 +46,20 @@ bool register_x16rt_algo( algo_gate_t* gate );
 bool register_hex__algo( algo_gate_t* gate );
 bool register_x21s__algo( algo_gate_t* gate );
-#if defined(X16R_4WAY)
+#if defined(X16R_8WAY)
 void x16r_8way_hash( void *state, const void *input );
 int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );
 void x16rv2_8way_hash( void *state, const void *input );
 int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );
 void x16rt_8way_hash( void *state, const void *input );
 int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );
 #elif defined(X16R_4WAY)
 void x16r_4way_hash( void *state, const void *input );
 int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
@@ -58,12 +73,7 @@ void x16rt_4way_hash( void *state, const void *input );
 int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );
-void x21s_4way_hash( void *state, const void *input );
+#else
 int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );
 bool x21s_4way_thread_init();
 #endif
 void x16r_hash( void *state, const void *input );
 int scanhash_x16r( struct work *work, uint32_t max_nonce,
@@ -77,9 +87,16 @@ void x16rt_hash( void *state, const void *input );
 int scanhash_x16rt( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr );
-void hex_hash( void *state, const void *input );
+#endif
-int scanhash_hex( struct work *work, uint32_t max_nonce,
+
-                  uint64_t *hashes_done, struct thr_info *mythr );
+#if defined(X16R_4WAY)
 void x21s_4way_hash( void *state, const void *input );
 int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );
 bool x21s_4way_thread_init();
 #else
 void x21s_hash( void *state, const void *input );
 int scanhash_x21s( struct work *work, uint32_t max_nonce,
@@ -88,3 +105,9 @@ bool x21s_thread_init();
 #endif
 void hex_hash( void *state, const void *input );
 int scanhash_hex( struct work *work, uint32_t max_nonce,
                  uint64_t *hashes_done, struct thr_info *mythr );
 #endif
--- a/algo/x16/x16rt-4way.c
+++ b/algo/x16/x16rt-4way.c
@@ -1,7 +1,4 @@
 #include "x16r-gate.h"
 #if defined (X16R_4WAY)
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -15,6 +12,7 @@
 #include "algo/shavite/sph_shavite.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/cubehash/cube-hash-2way.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
 #include "algo/hamsi/hamsi-hash-4way.h"
@@ -26,6 +24,391 @@
 static __thread uint32_t s_ntime = UINT32_MAX;
 static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
 #if defined (X16R_8WAY)
 // Scratch context for x16rt_8way_hash: one member per hash function in the
 // chain. The members share storage; each stage initialises its own member
 // before using it, so only one is live at any time.
 typedef union _x16rt_8way_context_overlay
 {
    blake512_8way_context   blake;
    bmw512_8way_context     bmw;
    hashState_groestl       groestl;
    skein512_8way_context   skein;
    jh512_8way_context      jh;
    keccak512_8way_context  keccak;
    luffa_4way_context      luffa;
    cube_4way_context       cube;
    sph_shavite512_context  shavite;
    simd_4way_context       simd;
    hashState_echo          echo;
    hamsi512_8way_context   hamsi;
    sph_fugue512_context    fugue;
    shabal512_8way_context  shabal;
    sph_whirlpool_context   whirlpool;
    sha512_8way_context     sha512;
 } __attribute__ ((aligned (64))) x16rt_8way_context_overlay;
 // Compute the x16rt hash of 8 block headers in parallel.
 //
 // input  - eight 80 byte headers, interleaved 8x64.
 // output - receives eight 32 byte results back to back, lane k at
 //          output + 32*k.
 //
 // The 16 stages are selected by the thread local hashOrder string, one hex
 // digit per stage. The first stage consumes the 80 byte header, every later
 // stage consumes the 64 byte digest of the stage before it. Stages that have
 // a vectorised implementation re-interleave the per-lane buffers, the rest
 // are run once per lane.
 void x16rt_8way_hash( void* output, const void* input )
 {
   uint32_t vhash[24*8] __attribute__ ((aligned (128)));
   uint32_t hash0[24] __attribute__ ((aligned (64)));
   uint32_t hash1[24] __attribute__ ((aligned (64)));
   uint32_t hash2[24] __attribute__ ((aligned (64)));
   uint32_t hash3[24] __attribute__ ((aligned (64)));
   uint32_t hash4[24] __attribute__ ((aligned (64)));
   uint32_t hash5[24] __attribute__ ((aligned (64)));
   uint32_t hash6[24] __attribute__ ((aligned (64)));
   uint32_t hash7[24] __attribute__ ((aligned (64)));
   x16rt_8way_context_overlay ctx;
   // Each lane is hashed in place: stage input and output share a buffer.
   void *in0 = (void*) hash0;
   void *in1 = (void*) hash1;
   void *in2 = (void*) hash2;
   void *in3 = (void*) hash3;
   void *in4 = (void*) hash4;
   void *in5 = (void*) hash5;
   void *in6 = (void*) hash6;
   void *in7 = (void*) hash7;
   int size = 80;   // bytes fed to the current stage
   // Per-lane copies of the header for stages that cannot use the
   // 8x64 interleaved input directly.
   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                 input, 640 );
   for ( int i = 0; i < 16; i++ )
   {
      // Decode one hex digit ('0'-'9', 'A'-'F') into the stage index.
      const char elem = hashOrder[i];
      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
      switch ( algo )
      {
         case BLAKE:
            blake512_8way_init( &ctx.blake );
            if ( i == 0 )
               // First stage: input is already 8x64 interleaved.
               blake512_8way_update( &ctx.blake, input, size );
            else
            {
               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
                            size<<3 );
               blake512_8way_update( &ctx.blake, vhash, size );
            }
            blake512_8way_close( &ctx.blake, vhash );
            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                          hash7, vhash );
         break;
         case BMW:
            bmw512_8way_init( &ctx.bmw );
            if ( i == 0 )
               bmw512_8way_update( &ctx.bmw, input, size );
            else
            {
               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
                            size<<3 );
               bmw512_8way_update( &ctx.bmw, vhash, size );
            }
            bmw512_8way_close( &ctx.bmw, vhash );
            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                          hash7, vhash );
         break;
         case GROESTL:
               init_groestl( &ctx.groestl, 64 );
               update_and_final_groestl( &ctx.groestl, (char*)hash0,
                                                 (const char*)in0, size<<3 );
               init_groestl( &ctx.groestl, 64 );
               update_and_final_groestl( &ctx.groestl, (char*)hash1,
                                                 (const char*)in1, size<<3 );
               init_groestl( &ctx.groestl, 64 );
               update_and_final_groestl( &ctx.groestl, (char*)hash2,
                                                 (const char*)in2, size<<3 );
               init_groestl( &ctx.groestl, 64 );
               update_and_final_groestl( &ctx.groestl, (char*)hash3,
                                                 (const char*)in3, size<<3 );
               init_groestl( &ctx.groestl, 64 );
               update_and_final_groestl( &ctx.groestl, (char*)hash4,
                                                 (const char*)in4, size<<3 );
               init_groestl( &ctx.groestl, 64 );
               update_and_final_groestl( &ctx.groestl, (char*)hash5,
                                                 (const char*)in5, size<<3 );
               init_groestl( &ctx.groestl, 64 );
               update_and_final_groestl( &ctx.groestl, (char*)hash6,
                                                 (const char*)in6, size<<3 );
               init_groestl( &ctx.groestl, 64 );
               update_and_final_groestl( &ctx.groestl, (char*)hash7,
                                                 (const char*)in7, size<<3 );
         break;
         case SKEIN:
            skein512_8way_init( &ctx.skein );
            if ( i == 0 )
               skein512_8way_update( &ctx.skein, input, size );
            else
            {
               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
                            size<<3 );
               skein512_8way_update( &ctx.skein, vhash, size );
            }
            skein512_8way_close( &ctx.skein, vhash );
            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                          hash7, vhash );
         break;
         case JH:
            jh512_8way_init( &ctx.jh );
            if ( i == 0 )
               jh512_8way_update( &ctx.jh, input, size );
            else
            {
               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
                            size<<3 );
               jh512_8way_update( &ctx.jh, vhash, size );
            }
            jh512_8way_close( &ctx.jh, vhash );
            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                          hash7, vhash );
         break;
         case KECCAK:
            keccak512_8way_init( &ctx.keccak );
            if ( i == 0 )
               keccak512_8way_update( &ctx.keccak, input, size );
            else
            {
               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
                            size<<3 );
               keccak512_8way_update( &ctx.keccak, vhash, size );
            }
            keccak512_8way_close( &ctx.keccak, vhash );
            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                          hash7, vhash );
         break;
         case LUFFA:
            // 4-way implementation: lanes 0-3 first, then lanes 4-7.
            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
            luffa_4way_init( &ctx.luffa, 512 );
            luffa_4way_update_close( &ctx.luffa, vhash, vhash, size );
            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
            luffa_4way_init( &ctx.luffa, 512 );
            luffa_4way_update_close( &ctx.luffa, vhash, vhash, size);
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
         break;
         case CUBEHASH:
            // The length must follow size: a fixed 64 dropped the last 16
            // header bytes (nonce included) when CubeHash was the first stage.
            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
            cube_4way_init( &ctx.cube, 512, 16, 32 );
            cube_4way_update_close( &ctx.cube, vhash, vhash, size );
            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
            cube_4way_init( &ctx.cube, 512, 16, 32 );
            cube_4way_update_close( &ctx.cube, vhash, vhash, size );
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
         break;
         case SHAVITE:
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, in0, size );
            sph_shavite512_close( &ctx.shavite, hash0 );
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, in1, size );
            sph_shavite512_close( &ctx.shavite, hash1 );
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, in2, size );
            sph_shavite512_close( &ctx.shavite, hash2 );
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, in3, size );
            sph_shavite512_close( &ctx.shavite, hash3 );
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, in4, size );
            sph_shavite512_close( &ctx.shavite, hash4 );
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, in5, size );
            sph_shavite512_close( &ctx.shavite, hash5 );
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, in6, size );
            sph_shavite512_close( &ctx.shavite, hash6 );
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, in7, size );
            sph_shavite512_close( &ctx.shavite, hash7 );
         break;
         case SIMD:
            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
            simd_4way_init( &ctx.simd, 512 );
            simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
            simd_4way_init( &ctx.simd, 512 );
            simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
         break;
         case ECHO:
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash0,
                                (const BitSequence*)in0, size<<3 );
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash1,
                                (const BitSequence*)in1, size<<3 );
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash2,
                                (const BitSequence*)in2, size<<3 );
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash3,
                                (const BitSequence*)in3, size<<3 );
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash4,
                                (const BitSequence*)in4, size<<3 );
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash5,
                                (const BitSequence*)in5, size<<3 );
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash6,
                                (const BitSequence*)in6, size<<3 );
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash7,
                                (const BitSequence*)in7, size<<3 );
         break;
         case HAMSI:
             intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
                            size<<3 );
             hamsi512_8way_init( &ctx.hamsi );
             hamsi512_8way_update( &ctx.hamsi, vhash, size );
             hamsi512_8way_close( &ctx.hamsi, vhash );
             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                          hash7, vhash );
         break;
         case FUGUE:
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in0, size );
             sph_fugue512_close( &ctx.fugue, hash0 );
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in1, size );
             sph_fugue512_close( &ctx.fugue, hash1 );
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in2, size );
             sph_fugue512_close( &ctx.fugue, hash2 );
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in3, size );
             sph_fugue512_close( &ctx.fugue, hash3 );
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in4, size );
             sph_fugue512_close( &ctx.fugue, hash4 );
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in5, size );
             sph_fugue512_close( &ctx.fugue, hash5 );
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in6, size );
             sph_fugue512_close( &ctx.fugue, hash6 );
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in7, size );
             sph_fugue512_close( &ctx.fugue, hash7 );
         break;
         case SHABAL:
             // Shabal is the only stage that uses 8x32 interleaving.
             intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
                          size<<3 );
             shabal512_8way_init( &ctx.shabal );
             shabal512_8way_update( &ctx.shabal, vhash, size );
             shabal512_8way_close( &ctx.shabal, vhash );
             dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                          hash7, vhash );
         break;
         case WHIRLPOOL:
             sph_whirlpool_init( &ctx.whirlpool );
             sph_whirlpool( &ctx.whirlpool, in0, size );
             sph_whirlpool_close( &ctx.whirlpool, hash0 );
             sph_whirlpool_init( &ctx.whirlpool );
             sph_whirlpool( &ctx.whirlpool, in1, size );
             sph_whirlpool_close( &ctx.whirlpool, hash1 );
             sph_whirlpool_init( &ctx.whirlpool );
             sph_whirlpool( &ctx.whirlpool, in2, size );
             sph_whirlpool_close( &ctx.whirlpool, hash2 );
             sph_whirlpool_init( &ctx.whirlpool );
             sph_whirlpool( &ctx.whirlpool, in3, size );
             sph_whirlpool_close( &ctx.whirlpool, hash3 );
             sph_whirlpool_init( &ctx.whirlpool );
             sph_whirlpool( &ctx.whirlpool, in4, size );
             sph_whirlpool_close( &ctx.whirlpool, hash4 );
             sph_whirlpool_init( &ctx.whirlpool );
             sph_whirlpool( &ctx.whirlpool, in5, size );
             sph_whirlpool_close( &ctx.whirlpool, hash5 );
             sph_whirlpool_init( &ctx.whirlpool );
             sph_whirlpool( &ctx.whirlpool, in6, size );
             sph_whirlpool_close( &ctx.whirlpool, hash6 );
             sph_whirlpool_init( &ctx.whirlpool );
             sph_whirlpool( &ctx.whirlpool, in7, size );
             sph_whirlpool_close( &ctx.whirlpool, hash7 );
         break;
         case SHA_512:
             intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
                            size<<3 );
             sha512_8way_init( &ctx.sha512 );
             sha512_8way_update( &ctx.sha512, vhash, size );
             sha512_8way_close( &ctx.sha512, vhash );
             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                          hash7, vhash );
         break;
      }
      // Every stage after the first consumes a 64 byte digest.
      size = 64;
   }
   // Only the first 32 bytes of each lane's final digest are returned.
   memcpy( output,     hash0, 32 );
   memcpy( output+32,  hash1, 32 );
   memcpy( output+64,  hash2, 32 );
   memcpy( output+96,  hash3, 32 );
   memcpy( output+128, hash4, 32 );
   memcpy( output+160, hash5, 32 );
   memcpy( output+192, hash6, 32 );
   memcpy( output+224, hash7, 32 );
 }
 // Scan a nonce range for x16rt, hashing 8 nonces per iteration.
 //
 // work        - block header (work->data) and target (work->target);
 //               work->data[19] is updated with the nonce being tested.
 // max_nonce   - upper bound of the range; scanning stops 8 short of it.
 // hashes_done - receives the number of nonces consumed by this call.
 // mythr       - calling thread; its id selects the work_restart flag.
 //
 // Always returns 0; candidates are reported via submit_lane_solution().
 int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr)
 {
   uint32_t hash[8*16] __attribute__ ((aligned (128)));
   uint32_t vdata[24*8] __attribute__ ((aligned (64)));
   uint32_t _ALIGN(64) timeHash[8*8];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 8;
   uint32_t n = first_nonce;
   // 64-bit word 9 of every lane, i.e. header words 18 and 19 (the nonce).
    __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
   int thr_id = mythr->id;
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
   if ( opt_benchmark )
      ptarget[7] = 0x0cff;
   mm512_bswap32_intrlv80_8x64( vdata, pdata );
   // The hash order depends only on ntime; rebuild it when ntime changes.
   uint32_t ntime = bswap_32( pdata[17] );
   if ( s_ntime != ntime )
   {
      x16rt_getTimeHash( ntime, &timeHash );
      x16rt_getAlgoString( &timeHash[0], hashOrder );
      s_ntime = ntime;
      if ( opt_debug && !thr_id )
          // %08x needs an integer: log the first word of the time hash,
          // not the array itself (which decays to a pointer).
          applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
                               hashOrder, ntime, timeHash[0] );
   }
   do
   {
      // Eight consecutive nonces, one in the upper half of each 64-bit lane.
      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
           _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                             n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
      x16rt_8way_hash( hash, vdata );
      pdata[19] = n;
      for ( int i = 0; i < 8; i++ )
      // Cheap top-word test first, full target comparison second.
      if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
      if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
      {
         pdata[19] = n+i;
         submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
      n += 8;
   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
   *hashes_done = n - first_nonce;
   return 0;
 }
 #elif defined (X16R_4WAY)
 union _x16rt_4way_context_overlay
 {
    blake512_4way_context   blake;
--- a/algo/x16/x16rv2-4way.c
+++ b/algo/x16/x16rv2-4way.c
@@ -5,9 +5,6 @@
 * Optimized by JayDDee@github Jan 2018
 */
 #include "x16r-gate.h"
 #if defined (X16R_4WAY)
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -21,6 +18,7 @@
 #include "algo/shavite/sph_shavite.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/cubehash/cube-hash-2way.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
 #include "algo/hamsi/hamsi-hash-4way.h"
@@ -33,6 +31,477 @@
 // Per-thread ntime for which hashOrder was last computed. The scanhash
 // function compares it with the current work's ntime and refreshes
 // hashOrder on a mismatch. Starts at UINT32_MAX.
 static __thread uint32_t s_ntime = UINT32_MAX;
 // Per-thread hash order string, one character per stage. The hash function
 // decodes '0'-'9' as 0-9 and 'A' onwards as 10 and up to select the
 // algorithm for each stage.
 static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
 #if defined (X16R_8WAY)
 // Scratch context shared by every stage of the 8-way x16rv2 chain.
 // x16rv2_8way_hash initialises the member it needs before each stage, so
 // all the hash contexts can overlay the same 64-byte-aligned storage.
 typedef union _x16rv2_8way_context_overlay
 {
    // 8-lane (8x64 / 8x32) vectorised contexts
    blake512_8way_context   blake;
    bmw512_8way_context     bmw;
    // single-lane context, run once per lane
    hashState_groestl       groestl;
    skein512_8way_context   skein;
    jh512_8way_context      jh;
    keccak512_8way_context  keccak;
    // 4-lane (4x128) contexts, run twice for 8 lanes
    luffa_4way_context      luffa;
    cube_4way_context       cube;
    sph_shavite512_context  shavite;
    simd_4way_context       simd;
    hashState_echo          echo;
    hamsi512_8way_context   hamsi;
    sph_fugue512_context    fugue;
    shabal512_8way_context  shabal;
    sph_whirlpool_context   whirlpool;
    sha512_8way_context     sha512;
    // Tiger pre-hash used ahead of Keccak, Luffa and SHA-512
    sph_tiger_context       tiger;
 } __attribute__ ((aligned (64))) x16rv2_8way_context_overlay;
 void x16rv2_8way_hash( void* output, const void* input )
 {
   uint32_t vhash[24*8] __attribute__ ((aligned (128)));
   uint32_t hash0[24] __attribute__ ((aligned (64)));
   uint32_t hash1[24] __attribute__ ((aligned (64)));
   uint32_t hash2[24] __attribute__ ((aligned (64)));
   uint32_t hash3[24] __attribute__ ((aligned (64)));
   uint32_t hash4[24] __attribute__ ((aligned (64)));
   uint32_t hash5[24] __attribute__ ((aligned (64)));
   uint32_t hash6[24] __attribute__ ((aligned (64)));
   uint32_t hash7[24] __attribute__ ((aligned (64)));
   x16rv2_8way_context_overlay ctx;
   void *in0 = (void*) hash0;
   void *in1 = (void*) hash1;
   void *in2 = (void*) hash2;
   void *in3 = (void*) hash3;
   void *in4 = (void*) hash4;
   void *in5 = (void*) hash5;
   void *in6 = (void*) hash6;
   void *in7 = (void*) hash7;
   int size = 80;
   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                 input, 640 );
   for ( int i = 0; i < 16; i++ )
   {
      const char elem = hashOrder[i];
      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
      switch ( algo )
      {
         case BLAKE:
            blake512_8way_init( &ctx.blake );
            if ( i == 0 )
               blake512_8way_update( &ctx.blake, input, size );
            else
            {
               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
                            size<<3 );
               blake512_8way_update( &ctx.blake, vhash, size );
            }
            blake512_8way_close( &ctx.blake, vhash );
            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                          hash7, vhash );
         break;
         case BMW:
            bmw512_8way_init( &ctx.bmw );
            if ( i == 0 )
               bmw512_8way_update( &ctx.bmw, input, size );
            else
            {
               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
                            size<<3 );
            bmw512_8way_update( &ctx.bmw, vhash, size );
            }
            bmw512_8way_close( &ctx.bmw, vhash );
            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                          hash7, vhash );
         break;
         case GROESTL:
               init_groestl( &ctx.groestl, 64 );
               update_and_final_groestl( &ctx.groestl, (char*)hash0,
                                                 (const char*)in0, size<<3 );
               init_groestl( &ctx.groestl, 64 );
               update_and_final_groestl( &ctx.groestl, (char*)hash1,
                                                 (const char*)in1, size<<3 );
               init_groestl( &ctx.groestl, 64 );
               update_and_final_groestl( &ctx.groestl, (char*)hash2,
                                                 (const char*)in2, size<<3 );
               init_groestl( &ctx.groestl, 64 );
               update_and_final_groestl( &ctx.groestl, (char*)hash3,
                                                 (const char*)in3, size<<3 );
               init_groestl( &ctx.groestl, 64 );
               update_and_final_groestl( &ctx.groestl, (char*)hash4,
                                                 (const char*)in4, size<<3 );
               init_groestl( &ctx.groestl, 64 );
               update_and_final_groestl( &ctx.groestl, (char*)hash5,
                                                 (const char*)in5, size<<3 );
               init_groestl( &ctx.groestl, 64 );
               update_and_final_groestl( &ctx.groestl, (char*)hash6,
                                                 (const char*)in6, size<<3 );
               init_groestl( &ctx.groestl, 64 );
               update_and_final_groestl( &ctx.groestl, (char*)hash7,
                                                 (const char*)in7, size<<3 );
         break;
         case SKEIN:
            skein512_8way_init( &ctx.skein );
            if ( i == 0 )
               skein512_8way_update( &ctx.skein, input, size );
            else
            {
               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
                            size<<3 );
               skein512_8way_update( &ctx.skein, vhash, size );
            }
            skein512_8way_close( &ctx.skein, vhash );
            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                          hash7, vhash );
         break;
         case JH:
            jh512_8way_init( &ctx.jh );
            if ( i == 0 )
               jh512_8way_update( &ctx.jh, input, size );
            else
            {
               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
                            size<<3 );
               jh512_8way_update( &ctx.jh, vhash, size );
            }
            jh512_8way_close( &ctx.jh, vhash );
            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                          hash7, vhash );
         break;
         case KECCAK:
             sph_tiger_init( &ctx.tiger );
             sph_tiger( &ctx.tiger, in0, size );
             sph_tiger_close( &ctx.tiger, hash0 );
             sph_tiger_init( &ctx.tiger );
             sph_tiger( &ctx.tiger, in1, size );
             sph_tiger_close( &ctx.tiger, hash1 );
             sph_tiger_init( &ctx.tiger );
             sph_tiger( &ctx.tiger, in2, size );
             sph_tiger_close( &ctx.tiger, hash2 );
             sph_tiger_init( &ctx.tiger );
             sph_tiger( &ctx.tiger, in3, size );
             sph_tiger_close( &ctx.tiger, hash3 );
             sph_tiger_init( &ctx.tiger );
             sph_tiger( &ctx.tiger, in4, size );
             sph_tiger_close( &ctx.tiger, hash4 );
             sph_tiger_init( &ctx.tiger );
             sph_tiger( &ctx.tiger, in5, size );
             sph_tiger_close( &ctx.tiger, hash5 );
             sph_tiger_init( &ctx.tiger );
             sph_tiger( &ctx.tiger, in6, size );
             sph_tiger_close( &ctx.tiger, hash6 );
             sph_tiger_init( &ctx.tiger );
             sph_tiger( &ctx.tiger, in7, size );
             sph_tiger_close( &ctx.tiger, hash7 );
             for ( int i = (24/4); i < (64/4); i++ )
                hash0[i] = hash1[i] = hash2[i] = hash3[i] =
                hash4[i] = hash5[i] = hash6[i] = hash7[i] = 0;
             intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
                          hash6, hash7 );
             keccak512_8way_init( &ctx.keccak );
             keccak512_8way_update( &ctx.keccak, vhash, 64 );
             keccak512_8way_close( &ctx.keccak, vhash );
             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                          hash7, vhash );
         break;
         case LUFFA:
             sph_tiger_init( &ctx.tiger );
             sph_tiger( &ctx.tiger, in0, size );
             sph_tiger_close( &ctx.tiger, hash0 );
             sph_tiger_init( &ctx.tiger );
             sph_tiger( &ctx.tiger, in1, size );
             sph_tiger_close( &ctx.tiger, hash1 );
             sph_tiger_init( &ctx.tiger );
             sph_tiger( &ctx.tiger, in2, size );
             sph_tiger_close( &ctx.tiger, hash2 );
             sph_tiger_init( &ctx.tiger );
             sph_tiger( &ctx.tiger, in3, size );
             sph_tiger_close( &ctx.tiger, hash3 );
             sph_tiger_init( &ctx.tiger );
             sph_tiger( &ctx.tiger, in4, size );
             sph_tiger_close( &ctx.tiger, hash4 );
             sph_tiger_init( &ctx.tiger );
             sph_tiger( &ctx.tiger, in5, size );
             sph_tiger_close( &ctx.tiger, hash5 );
             sph_tiger_init( &ctx.tiger );
             sph_tiger( &ctx.tiger, in6, size );
             sph_tiger_close( &ctx.tiger, hash6 );
             sph_tiger_init( &ctx.tiger );
             sph_tiger( &ctx.tiger, in7, size );
             sph_tiger_close( &ctx.tiger, hash7 );
             for ( int i = (24/4); i < (64/4); i++ )
                hash0[i] = hash1[i] = hash2[i] = hash3[i] = 
                hash4[i] = hash5[i] = hash6[i] = hash7[i] = 0;
            intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3);
            luffa_4way_init( &ctx.luffa, 512 );
            luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
            intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7);
            luffa_4way_init( &ctx.luffa, 512 );
            luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
         break;
         case CUBEHASH:
            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
            cube_4way_init( &ctx.cube, 512, 16, 32 );
            cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
            cube_4way_init( &ctx.cube, 512, 16, 32 );
            cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
         break;
         case SHAVITE:
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, in0, size );
            sph_shavite512_close( &ctx.shavite, hash0 );
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, in1, size );
            sph_shavite512_close( &ctx.shavite, hash1 );
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, in2, size );
            sph_shavite512_close( &ctx.shavite, hash2 );
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, in3, size );
            sph_shavite512_close( &ctx.shavite, hash3 );
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, in4, size );
            sph_shavite512_close( &ctx.shavite, hash4 );
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, in5, size );
            sph_shavite512_close( &ctx.shavite, hash5 );
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, in6, size );
            sph_shavite512_close( &ctx.shavite, hash6 );
            sph_shavite512_init( &ctx.shavite );
            sph_shavite512( &ctx.shavite, in7, size );
            sph_shavite512_close( &ctx.shavite, hash7 );
         break;
         case SIMD:
            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
            simd_4way_init( &ctx.simd, 512 );
            simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
            simd_4way_init( &ctx.simd, 512 );
            simd_4way_update_close( &ctx.simd, vhash, vhash, size<<3 );
            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
         break;
         case ECHO:
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash0,
                                (const BitSequence*)in0, size<<3 );
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash1,
                                (const BitSequence*)in1, size<<3 );
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash2,
                                (const BitSequence*)in2, size<<3 );
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash3,
                                (const BitSequence*)in3, size<<3 );
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash4,
                                (const BitSequence*)in4, size<<3 );
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash5,
                                (const BitSequence*)in5, size<<3 );
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash6,
                                (const BitSequence*)in6, size<<3 );
             init_echo( &ctx.echo, 512 );
             update_final_echo ( &ctx.echo, (BitSequence *)hash7,
                                (const BitSequence*)in7, size<<3 );
         break;
         case HAMSI:
             intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
                            size<<3 );
             hamsi512_8way_init( &ctx.hamsi );
             hamsi512_8way_update( &ctx.hamsi, vhash, size );
             hamsi512_8way_close( &ctx.hamsi, vhash );
             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                          hash7, vhash );
             break;
         case FUGUE:
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in0, size );
             sph_fugue512_close( &ctx.fugue, hash0 );
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in1, size );
             sph_fugue512_close( &ctx.fugue, hash1 );
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in2, size );
             sph_fugue512_close( &ctx.fugue, hash2 );
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in3, size );
             sph_fugue512_close( &ctx.fugue, hash3 );
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in4, size );
             sph_fugue512_close( &ctx.fugue, hash4 );
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in5, size );
             sph_fugue512_close( &ctx.fugue, hash5 );
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in6, size );
             sph_fugue512_close( &ctx.fugue, hash6 );
             sph_fugue512_init( &ctx.fugue );
             sph_fugue512( &ctx.fugue, in7, size );
             sph_fugue512_close( &ctx.fugue, hash7 );
         break;
         case SHABAL:
             intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
                          size<<3 );
             shabal512_8way_init( &ctx.shabal );
             shabal512_8way_update( &ctx.shabal, vhash, size );
             shabal512_8way_close( &ctx.shabal, vhash );
             dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                          hash7, vhash );
         break;
         case WHIRLPOOL:
             sph_whirlpool_init( &ctx.whirlpool );
             sph_whirlpool( &ctx.whirlpool, in0, size );
             sph_whirlpool_close( &ctx.whirlpool, hash0 );
             sph_whirlpool_init( &ctx.whirlpool );
             sph_whirlpool( &ctx.whirlpool, in1, size );
             sph_whirlpool_close( &ctx.whirlpool, hash1 );
             sph_whirlpool_init( &ctx.whirlpool );
             sph_whirlpool( &ctx.whirlpool, in2, size );
             sph_whirlpool_close( &ctx.whirlpool, hash2 );
             sph_whirlpool_init( &ctx.whirlpool );
             sph_whirlpool( &ctx.whirlpool, in3, size );
             sph_whirlpool_close( &ctx.whirlpool, hash3 );
             sph_whirlpool_init( &ctx.whirlpool );
             sph_whirlpool( &ctx.whirlpool, in4, size );
             sph_whirlpool_close( &ctx.whirlpool, hash4 );
             sph_whirlpool_init( &ctx.whirlpool );
             sph_whirlpool( &ctx.whirlpool, in5, size );
             sph_whirlpool_close( &ctx.whirlpool, hash5 );
             sph_whirlpool_init( &ctx.whirlpool );
             sph_whirlpool( &ctx.whirlpool, in6, size );
             sph_whirlpool_close( &ctx.whirlpool, hash6 );
             sph_whirlpool_init( &ctx.whirlpool );
             sph_whirlpool( &ctx.whirlpool, in7, size );
             sph_whirlpool_close( &ctx.whirlpool, hash7 );
         break;
         case SHA_512:
             sph_tiger_init( &ctx.tiger );
             sph_tiger( &ctx.tiger, in0, size );
             sph_tiger_close( &ctx.tiger, hash0 );
             sph_tiger_init( &ctx.tiger );
             sph_tiger( &ctx.tiger, in1, size );
             sph_tiger_close( &ctx.tiger, hash1 );
             sph_tiger_init( &ctx.tiger );
             sph_tiger( &ctx.tiger, in2, size );
             sph_tiger_close( &ctx.tiger, hash2 );
             sph_tiger_init( &ctx.tiger );
             sph_tiger( &ctx.tiger, in3, size );
             sph_tiger_close( &ctx.tiger, hash3 );
             sph_tiger_init( &ctx.tiger );
             sph_tiger( &ctx.tiger, in4, size );
             sph_tiger_close( &ctx.tiger, hash4 );
             sph_tiger_init( &ctx.tiger );
             sph_tiger( &ctx.tiger, in5, size );
             sph_tiger_close( &ctx.tiger, hash5 );
             sph_tiger_init( &ctx.tiger );
             sph_tiger( &ctx.tiger, in6, size );
             sph_tiger_close( &ctx.tiger, hash6 );
             sph_tiger_init( &ctx.tiger );
             sph_tiger( &ctx.tiger, in7, size );
             sph_tiger_close( &ctx.tiger, hash7 );
             for ( int i = (24/4); i < (64/4); i++ )
                hash0[i] = hash1[i] = hash2[i] = hash3[i] =
                hash4[i] = hash5[i] = hash6[i] = hash7[i] = 0;
             intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5,
                          hash6, hash7 );
             sha512_8way_init( &ctx.sha512 );
             sha512_8way_update( &ctx.sha512, vhash, 64 );
             sha512_8way_close( &ctx.sha512, vhash );
             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                          hash7, vhash );
         break;
      }
      size = 64;
   }
   memcpy( output,     hash0, 32 );
   memcpy( output+32,  hash1, 32 );
   memcpy( output+64,  hash2, 32 );
   memcpy( output+96,  hash3, 32 );
   memcpy( output+128, hash4, 32 );
   memcpy( output+160, hash5, 32 );
   memcpy( output+192, hash6, 32 );
   memcpy( output+224, hash7, 32 );
 }
 int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr)
 {
   uint32_t hash[8*16] __attribute__ ((aligned (128)));
   uint32_t vdata[24*8] __attribute__ ((aligned (64)));
   uint32_t bedata1[2] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 8;
   uint32_t n = first_nonce;
    __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
   int thr_id = mythr->id;
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
   if ( opt_benchmark )
      ptarget[7] = 0x0cff;
   mm512_bswap32_intrlv80_8x64( vdata, pdata );
   bedata1[0] = bswap_32( pdata[1] );
   bedata1[1] = bswap_32( pdata[2] );
   const uint32_t ntime = bswap_32( pdata[17] );
   if ( s_ntime != ntime )
   {
      x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
      s_ntime = ntime;
      if ( opt_debug && !thr_id )
              applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
   }
   do
   {
      *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
           _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                             n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
      x16rv2_8way_hash( hash, vdata );
      pdata[19] = n;
      for ( int i = 0; i < 8; i++ )
      if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
      if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
      {
         pdata[19] = n+i;
         submit_lane_solution( work, hash+(i<<3), mythr, i );
      }
      n += 8;
   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
   *hashes_done = n - first_nonce;
   return 0;
 }
 #elif defined (X16R_4WAY)
 union _x16rv2_4way_context_overlay
 {
    blake512_4way_context   blake;
--- a/build-allarch.sh
+++ b/build-allarch.sh
@@ -4,6 +4,8 @@
 # during development. However the information contained may provide compilation
 # tips to users.
 rm cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen 
 make distclean || echo clean
 rm -f config.status
 ./autogen.sh || echo done
--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.2.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.4.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.10.2'
+PACKAGE_VERSION='3.10.4'
-PACKAGE_STRING='cpuminer-opt 3.10.2'
+PACKAGE_STRING='cpuminer-opt 3.10.4'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.10.2 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.10.4 to adapt to many kinds of systems.
 Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.10.2:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.10.4:";;
   esac
  cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.10.2
+cpuminer-opt configure 3.10.4
 generated by GNU Autoconf 2.69
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
-It was created by cpuminer-opt $as_me 3.10.2, which was
+It was created by cpuminer-opt $as_me 3.10.4, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
  $ $0 $@
@@ -2993,7 +2993,7 @@ fi
 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.10.2'
+ VERSION='3.10.4'
 cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.10.2, which was
+This file was extended by cpuminer-opt $as_me 3.10.4, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
  CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.10.2
+cpuminer-opt config.status 3.10.4
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.10.2])
+AC_INIT([cpuminer-opt], [3.10.4])
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -3410,39 +3410,39 @@ bool check_cpu_capability ()
        printf(".\n");
     #endif
-     printf("CPU features:");
+     printf("CPU features: ");
     if      ( cpu_has_vaes   )    printf( " VAES"   );
     else if ( cpu_has_aes    )    printf( " AES"    );
     if      ( cpu_has_sha    )    printf( " SHA"    );
     if      ( cpu_has_avx512 )    printf( " AVX512" );
-     else if ( cpu_has_avx2   )    printf( " AVX2"   );
+     else if ( cpu_has_avx2   )    printf( " AVX2  " );
-     else if ( cpu_has_avx    )    printf( " AVX"    );
+     else if ( cpu_has_avx    )    printf( " AVX   " );
     else if ( cpu_has_sse42  )    printf( " SSE4.2" );
-     else if ( cpu_has_sse2   )    printf( " SSE2"   );
+     else if ( cpu_has_sse2   )    printf( " SSE2  " );
     if      ( cpu_has_vaes   )    printf( " VAES"   );
     else if ( cpu_has_aes    )    printf( "  AES"   );
     if      ( cpu_has_sha    )    printf( " SHA"    );
-     printf(".\nSW features:");
+     printf("\nSW features:  ");
     if      ( sw_has_vaes   )    printf( " VAES"   );
     else if ( sw_has_aes    )    printf( " AES"    );
     if      ( sw_has_sha    )    printf( " SHA"    );
     if      ( sw_has_avx512 )    printf( " AVX512" );
-     else if ( sw_has_avx2   )    printf( " AVX2"   );
+     else if ( sw_has_avx2   )    printf( " AVX2  " );
-     else if ( sw_has_avx    )    printf( " AVX"    );
+     else if ( sw_has_avx    )    printf( " AVX   " );
     else if ( sw_has_sse42  )    printf( " SSE4.2" );
-     else if ( sw_has_sse2   )    printf( " SSE2"   );
+     else if ( sw_has_sse2   )    printf( " SSE2  " );
     if      ( sw_has_vaes   )    printf( " VAES"   );
     else if ( sw_has_aes    )    printf( " AES "   );
     if      ( sw_has_sha    )    printf( " SHA"    );
-     printf(".\nAlgo features:");
+     printf("\nAlgo features:");
     if ( algo_features == EMPTY_SET ) printf( " None" );
     else
     {
        if      ( algo_has_vaes   )    printf( " VAES"   );
        else if ( algo_has_aes    )    printf( " AES"    );
        if      ( algo_has_sha    )    printf( " SHA"    );
        if      ( algo_has_avx512 )    printf( " AVX512" );
-        else if ( algo_has_avx2   )    printf( " AVX2"   );
+        else if ( algo_has_avx2   )    printf( " AVX2  " );
        else if ( algo_has_sse42  )    printf( " SSE4.2" );
-        else if ( algo_has_sse2   )    printf( " SSE2"   );
+        else if ( algo_has_sse2   )    printf( " SSE2  " );
        if      ( algo_has_vaes   )    printf( " VAES"   );
        else if ( algo_has_aes    )    printf( " AES "   );
        if      ( algo_has_sha    )    printf( " SHA"    );
     }
-     printf(".\n");
+     printf("\n");
     // Check for CPU and build incompatibilities
     if ( !cpu_has_sse2 )
@@ -3483,19 +3483,19 @@ bool check_cpu_capability ()
                   use_sha || use_vaes );
     // Display best options
-     printf( "Start mining with" );
+     printf( "\nStarting miner with" );
     if         ( use_none ) printf( " no optimizations" );
     else
     {
        if      ( use_vaes   ) printf( " VAES"   );
        else if ( use_aes    ) printf( " AES"    );
        if      ( use_avx512 ) printf( " AVX512" );
        else if ( use_avx2   ) printf( " AVX2"   );
        else if ( use_sse42  ) printf( " SSE4.2" );
        else if ( use_sse2   ) printf( " SSE2"   );
        if      ( use_vaes   ) printf( " VAES"   );
        else if ( use_aes    ) printf( " AES"    );
        if      ( use_sha    ) printf( " SHA"    );
     }
-     printf( ".\n\n" );
+     printf( "...\n\n" );
     return true;
 }
--- a/simd-utils/intrlv.h
+++ b/simd-utils/intrlv.h
@@ -2075,9 +2075,6 @@ static inline void dintrlv_2x256( void *dst0, void *dst1,
   d0[3] = s[6];      d1[3] = s[7];
 }
 #endif // AVX
 ///////////////////////////
@@ -2225,7 +2222,6 @@ static inline void rintrlv_4x32_4x64( void *dst,
 // 2x128 -> 4x64
 static inline void rintrlv_2x128_4x64( void *dst, const void *src0,
                                       const void *src1, const int bit_len )
 {
@@ -2268,7 +2264,6 @@ static inline void rintrlv_2x128_4x64( void *dst, const void *src0,
   d[31] = _mm_unpackhi_epi64( s1[14], s1[15] );
 }
 /*
 #define RLEAVE_2x128_4x64( i ) do \
 { \
@@ -2339,7 +2334,6 @@ static inline void rintrlv_4x64_2x128( void *dst0, void *dst1,
   d1[15] = _mm_unpackhi_epi64( s[29], s[31] );
 }
 /*
 #define RLEAVE_4x64_2x128( i ) do \
 { \
@@ -2364,6 +2358,354 @@ static inline void rintrlv_4x64_2x128( void *dst0, void *dst1,
 }
 */
 // 4x128 -> 8x64
 // Combines two source buffers (src0, src1) into one destination buffer.
 // Every pass reads four vectors from each source and writes eight: first the
 // low 64 bit halves of each adjacent pair (src0 pairs, then src1 pairs),
 // then the high 64 bit halves in the same order.
 // bit_len selects the amount of data: 2 passes when bit_len <= 256,
 // 4 passes when bit_len <= 512, otherwise 8 passes.
 static inline void rintrlv_4x128_8x64( void *dst, const void *src0,
                                       const void *src1, const int bit_len )
 {
   __m128i *out = (__m128i*)dst;
   const __m128i *in0 = (const __m128i*)src0;
   const __m128i *in1 = (const __m128i*)src1;
   const int passes = ( bit_len <= 256 ) ? 2 : ( ( bit_len <= 512 ) ? 4 : 8 );
   for ( int p = 0; p < passes; p++ )
   {
     const __m128i *a = in0 + 4*p;
     const __m128i *b = in1 + 4*p;
     __m128i *o = out + 8*p;
     // Low halves of each adjacent source pair.
     o[0] = _mm_unpacklo_epi64( a[0], a[1] );
     o[1] = _mm_unpacklo_epi64( a[2], a[3] );
     o[2] = _mm_unpacklo_epi64( b[0], b[1] );
     o[3] = _mm_unpacklo_epi64( b[2], b[3] );
     // High halves of the same pairs.
     o[4] = _mm_unpackhi_epi64( a[0], a[1] );
     o[5] = _mm_unpackhi_epi64( a[2], a[3] );
     o[6] = _mm_unpackhi_epi64( b[0], b[1] );
     o[7] = _mm_unpackhi_epi64( b[2], b[3] );
   }
 }
 // 8x64 -> 4x128
 // Splits one source buffer into two destination buffers (dst0, dst1).
 // Every pass reads eight source vectors and writes four vectors to each
 // destination by pairing vector k with vector k+4 and taking the low and
 // high 64 bit halves. Source vectors 0 and 1 of the pass feed dst0,
 // vectors 2 and 3 feed dst1.
 // bit_len selects the amount of data: 2 passes when bit_len <= 256,
 // 4 passes when bit_len <= 512, otherwise 8 passes.
 static inline void rintrlv_8x64_4x128( void *dst0, void *dst1,
                                       const void *src, const int bit_len )
 {
   __m128i *out0 = (__m128i*)dst0;
   __m128i *out1 = (__m128i*)dst1;
   const __m128i *in = (const __m128i*)src;
   const int passes = ( bit_len <= 256 ) ? 2 : ( ( bit_len <= 512 ) ? 4 : 8 );
   for ( int p = 0; p < passes; p++ )
   {
     const __m128i *s = in + 8*p;
     __m128i *a = out0 + 4*p;
     __m128i *b = out1 + 4*p;
     a[0] = _mm_unpacklo_epi64( s[0], s[4] );
     a[1] = _mm_unpackhi_epi64( s[0], s[4] );
     b[0] = _mm_unpacklo_epi64( s[2], s[6] );
     b[1] = _mm_unpackhi_epi64( s[2], s[6] );
     a[2] = _mm_unpacklo_epi64( s[1], s[5] );
     a[3] = _mm_unpackhi_epi64( s[1], s[5] );
     b[2] = _mm_unpacklo_epi64( s[3], s[7] );
     b[3] = _mm_unpackhi_epi64( s[3], s[7] );
   }
 }
 // 8x64 -> 2x256
 // Splits one source buffer into four destination buffers (dst0..dst3).
 // Output vector j of every destination is built from source vectors
 // base and base+4 (dst0: low halves, dst1: high halves) and base+1 and
 // base+5 (dst2: low halves, dst3: high halves), where
 // base = 8*(j/2) + 2*(j%2).
 // bit_len selects the amount of data: 4 output vectors per destination when
 // bit_len <= 256, 8 when bit_len <= 512, otherwise 16.
 // NOTE(review): rintrlv_2x256_8x64 pairs its source vectors k and k+2, so
 // it does not read back exactly what this routine writes -- confirm the
 // intended 2x256 layout with the callers.
 static inline void rintrlv_8x64_2x256( void *dst0, void *dst1, void *dst2,
                          void *dst3,  const void *src, const int bit_len )
 {
   __m128i *out0 = (__m128i*)dst0;
   __m128i *out1 = (__m128i*)dst1;
   __m128i *out2 = (__m128i*)dst2;
   __m128i *out3 = (__m128i*)dst3;
   const __m128i *in = (const __m128i*)src;
   const int count = ( bit_len <= 256 ) ? 4 : ( ( bit_len <= 512 ) ? 8 : 16 );
   for ( int j = 0; j < count; j++ )
   {
     const __m128i *s = in + 8*( j >> 1 ) + 2*( j & 1 );
     out0[j] = _mm_unpacklo_epi64( s[0], s[4] );
     out1[j] = _mm_unpackhi_epi64( s[0], s[4] );
     out2[j] = _mm_unpacklo_epi64( s[1], s[5] );
     out3[j] = _mm_unpackhi_epi64( s[1], s[5] );
   }
 }
 // 2x256 -> 8x64
 // Combines four source buffers (src0..src3) into one destination buffer.
 // Every pass reads four vectors from each source and writes sixteen:
 //   low  halves of vectors 0 and 2 of src0..src3,
 //   high halves of vectors 0 and 2 of src0..src3,
 //   low  halves of vectors 1 and 3 of src0..src3,
 //   high halves of vectors 1 and 3 of src0..src3.
 // bit_len selects the amount of data: 1 pass when bit_len <= 256,
 // 2 passes when bit_len <= 512, otherwise 4 passes.
 static inline void rintrlv_2x256_8x64( void *dst, const void *src0,
      const void *src1, const void *src2, const void *src3, const int bit_len )
 {
   __m128i *out = (__m128i*)dst;
   const __m128i *in[4] = { (const __m128i*)src0, (const __m128i*)src1,
                            (const __m128i*)src2, (const __m128i*)src3 };
   const int passes = ( bit_len <= 256 ) ? 1 : ( ( bit_len <= 512 ) ? 2 : 4 );
   for ( int p = 0; p < passes; p++ )
   {
     __m128i *o = out + 16*p;
     const int base = 4*p;
     int k;
     for ( k = 0; k < 4; k++ )
       o[k]      = _mm_unpacklo_epi64( in[k][base],   in[k][base+2] );
     for ( k = 0; k < 4; k++ )
       o[4 + k]  = _mm_unpackhi_epi64( in[k][base],   in[k][base+2] );
     for ( k = 0; k < 4; k++ )
       o[8 + k]  = _mm_unpacklo_epi64( in[k][base+1], in[k][base+3] );
     for ( k = 0; k < 4; k++ )
       o[12 + k] = _mm_unpackhi_epi64( in[k][base+1], in[k][base+3] );
   }
 }
 //
 // Some functions customized for mining.
--- a/simd-utils/simd-128.h
+++ b/simd-utils/simd-128.h
@@ -252,7 +252,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 #else
 #define mm128_ror_64   mm128_ror_var_64
 #define mm128_rol_64   mm128_rol_var_64
 #define mm128_ror_32   mm128_ror_var_32
@@ -274,6 +273,15 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 #define mm128_ror_1x32( v )   _mm_shuffle_epi32( v, 0x39 )
 #define mm128_rol_1x32( v )   _mm_shuffle_epi32( v, 0x93 )
 // Rotate 16 byte (128 bit) vector by c bytes.
 // Less efficient using shift but more versatile. Use only for odd number
 // byte rotations. Use shuffle above whenever possible.
 #define mm128_ror_x8( v, c ) \
   _mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) )
 #define mm128_rol_x8( v, c ) \
   _mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) )
 #if defined (__SSE3__)
 // no SSE2 implementation, no current users
@@ -289,17 +297,21 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 #define mm128_rol_1x8( v ) \
   _mm_shuffle_epi8( v, m128_const_64( 0x0e0d0c0b0a090807, \
                                       0x060504030201000f ) )
-#endif  // SSE3
+#else  // SSE2
-// Rotate 16 byte (128 bit) vector by c bytes.
+#define mm128_ror_1x16( v ) \
-// Less efficient using shift but more versatile. Use only for odd number
+   _mm_or_si128( _mm_srli_si128( v, 2 ), _mm_slli_si128( v, 14 ) )
 // byte rotations. Use shuffle above whenever possible.
 #define mm128_bror( v, c ) \
   _mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) )
-#define mm128_brol( v, c ) \
+#define mm128_rol_1x16( v ) \
-   _mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) )
+   _mm_or_si128( _mm_slli_si128( v, 2 ), _mm_srli_si128( v, 14 ) )
 #define mm128_ror_1x8( v ) \
   _mm_or_si128( _mm_srli_si128( v, 1 ), _mm_slli_si128( v, 15 ) )
 #define mm128_rol_1x8( v ) \
   _mm_or_si128( _mm_slli_si128( v, 1 ), _mm_srli_si128( v, 15 ) )
 #endif   // SSE3 else SSE2
 // Invert vector: {3,2,1,0} -> {0,1,2,3}
 #define mm128_invert_32( v ) _mm_shuffle_epi32( v, 0x1b )
@@ -319,19 +331,24 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 //
 // Rotate elements within lanes.
-#define mm128_swap32_64( v )  _mm_shuffle_epi32( v, 0xb1 )
+#define mm128_swap_64_32( v )  _mm_shuffle_epi32( v, 0xb1 )
-#define mm128_ror16_64( v ) \
+#define mm128_rol64_8( v, c ) \
-   _mm_shuffle_epi8( v, m128_const_64( 0x09080f0e0d0c0b0a, \
+     _mm_or_si128( _mm_slli_epi64( v, ( ( (c)<<3 ) ), \
-                                       0x0100070605040302 )
+                   _mm_srli_epi64( v, ( ( 64 - ( (c)<<3 ) ) ) )
-#define mm128_rol16_64( v ) \
+#define mm128_ror64_8( v, c ) \
-   _mm_shuffle_epi8( v, m128_const_64( 0x0d0c0b0a09080f0e, \
+     _mm_or_si128( _mm_srli_epi64( v, ( ( (c)<<3 ) ), \
-                                       0x0504030201000706 )
+                   _mm_slli_epi64( v, ( ( 64 - ( (c)<<3 ) ) ) )
-#define mm128_swap16_32( v ) \
+#define mm128_rol32_8( v, c ) \
-   _mm_shuffle_epi8( v, m128_const_64( 0x0d0c0f0e09080b0a, \
+     _mm_or_si128( _mm_slli_epi32( v, ( ( (c)<<3 ) ), \
-                                       0x0504070601000302 )
+                   _mm_srli_epi32( v, ( ( 32 - ( (c)<<3 ) ) ) )
 // Rotate each 32 bit element of v right by c bytes: the right shift count
 // is c*8 bits and the complementary left shift is 32 - c*8 bits.
 // v is expanded twice, so pass an expression without side effects.
 // The previous definition never closed the _mm_srli_epi32( call, so any
 // use of the macro failed to compile.
 #define mm128_ror32_8( v, c ) \
     _mm_or_si128( _mm_srli_epi32( v, ( (c)<<3 ) ), \
                   _mm_slli_epi32( v, ( 32 - ( (c)<<3 ) ) ) )
 //
 // Endian byte swap.
@@ -431,64 +448,65 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
 // Swap 128 bit vectors.
-#define mm128_swap128_256( v1, v2 ) \
+#define mm128_swap256_128( v1, v2 ) \
   v1 = _mm_xor_si128( v1, v2 ); \
   v2 = _mm_xor_si128( v1, v2 ); \
   v1 = _mm_xor_si128( v1, v2 );
 // Concatenate v1 & v2 and rotate as one 256 bit vector.
 #if defined(__SSE4_1__)
-#define mm128_ror1x64_256( v1, v2 ) \
+#define mm128_ror256_64( v1, v2 ) \
 do { \
   __m128i t  = _mm_alignr_epi8( v1, v2, 8 ); \
           v1 = _mm_alignr_epi8( v2, v1, 8 ); \
           v2 = t; \
 } while(0)
-#define mm128_rol1x64_256( v1, v2 ) \
+#define mm128_rol256_64( v1, v2 ) \
 do { \
   __m128i t  = _mm_alignr_epi8( v1, v2, 8 ); \
           v2 = _mm_alignr_epi8( v2, v1, 8 ); \
           v1 = t; \
 } while(0)
-#define mm128_ror1x32_256( v1, v2 ) \
+#define mm128_ror256_32( v1, v2 ) \
 do { \
   __m128i t  = _mm_alignr_epi8( v1, v2, 4 ); \
           v1 = _mm_alignr_epi8( v2, v1, 4 ); \
           v2 = t; \
 } while(0)
-#define mm128_rol1x32_256( v1, v2 ) \
+#define mm128_rol256_32( v1, v2 ) \
 do { \
   __m128i t  = _mm_alignr_epi8( v1, v2, 12 ); \
           v2 = _mm_alignr_epi8( v2, v1, 12 ); \
           v1 = t; \
 } while(0)
-#define mm128_ror1x16_256( v1, v2 ) \
+#define mm128_ror256_16( v1, v2 ) \
 do { \
   __m128i t  = _mm_alignr_epi8( v1, v2, 2 ); \
           v1 = _mm_alignr_epi8( v2, v1, 2 ); \
           v2 = t; \
 } while(0)
-#define mm128_rol1x16_256( v1, v2 ) \
+#define mm128_rol256_16( v1, v2 ) \
 do { \
   __m128i t  = _mm_alignr_epi8( v1, v2, 14 ); \
           v2 = _mm_alignr_epi8( v2, v1, 14 ); \
           v1 = t; \
 } while(0)
-#define mm128_ror1x8_256( v1, v2 ) \
+#define mm128_ror256_8( v1, v2 ) \
 do { \
   __m128i t  = _mm_alignr_epi8( v1, v2, 1 ); \
           v1 = _mm_alignr_epi8( v2, v1, 1 ); \
           v2 = t; \
 } while(0)
-#define mm128_rol1x8_256( v1, v2 ) \
+#define mm128_rol256_8( v1, v2 ) \
 do { \
   __m128i t  = _mm_alignr_epi8( v1, v2, 15 ); \
           v2 = _mm_alignr_epi8( v2, v1, 15 ); \
@@ -497,7 +515,7 @@ do { \
 #else  // SSE2
-#define mm128_ror1x64_256( v1, v2 ) \
+#define mm128_ror256_64( v1, v2 ) \
 do { \
   __m128i t  = _mm_or_si128( _mm_srli_si128( v1, 8 ), \
                              _mm_slli_si128( v2, 8 ) ); \
@@ -506,7 +524,7 @@ do { \
           v1 = t; \
 } while(0)
-#define mm128_rol1x64_256( v1, v2 ) \
+#define mm128_rol256_64( v1, v2 ) \
 do { \
   __m128i t  = _mm_or_si128( _mm_slli_si128( v1, 8 ), \
                              _mm_srli_si128( v2, 8 ) ); \
@@ -515,7 +533,7 @@ do { \
           v1 = t; \
 } while(0)
-#define mm128_ror1x32_256( v1, v2 ) \
+#define mm128_ror256_32( v1, v2 ) \
 do { \
   __m128i t  = _mm_or_si128( _mm_srli_si128( v1, 4 ), \
                              _mm_slli_si128( v2, 12 ) ); \
@@ -524,7 +542,7 @@ do { \
           v1 = t; \
 } while(0)
-#define mm128_rol1x32_256( v1, v2 ) \
+#define mm128_rol256_32( v1, v2 ) \
 do { \
   __m128i t  = _mm_or_si128( _mm_slli_si128( v1, 4 ), \
                              _mm_srli_si128( v2, 12 ) ); \
@@ -533,7 +551,7 @@ do { \
           v1 = t; \
 } while(0)
-#define mm128_ror1x16_256( v1, v2 ) \
+#define mm128_ror256_16( v1, v2 ) \
 do { \
   __m128i t  = _mm_or_si128( _mm_srli_si128( v1, 2 ), \
                              _mm_slli_si128( v2, 14 ) ); \
@@ -542,7 +560,7 @@ do { \
           v1 = t; \
 } while(0)
-#define mm128_rol1x16_256( v1, v2 ) \
+#define mm128_rol256_16( v1, v2 ) \
 do { \
   __m128i t  = _mm_or_si128( _mm_slli_si128( v1, 2 ), \
                              _mm_srli_si128( v2, 14 ) ); \
@@ -551,7 +569,7 @@ do { \
           v1 = t; \
 } while(0)
-#define mm128_ror1x8_256( v1, v2 ) \
+#define mm128_ror256_8( v1, v2 ) \
 do { \
   __m128i t  = _mm_or_si128( _mm_srli_si128( v1, 1 ), \
                              _mm_slli_si128( v2, 15 ) ); \
@@ -560,7 +578,7 @@ do { \
           v1 = t; \
 } while(0)
-#define mm128_rol1x8_256( v1, v2 ) \
+#define mm128_rol256_8( v1, v2 ) \
 do { \
   __m128i t  = _mm_or_si128( _mm_slli_si128( v1, 1 ), \
                              _mm_srli_si128( v2, 15 ) ); \
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -414,99 +414,71 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
 //
-// Rotate elements within lanes of 256 bit vector.
+// Rotate elements within each 128 bit lane of 256 bit vector.
-// Swap 64 bit elements in each 128 bit lane.
+#define mm256_swap128_64( v )   _mm256_shuffle_epi32( v, 0x4e )
 #define mm256_swap64_128( v )   _mm256_shuffle_epi32( v, 0x4e )
-// Rotate each 128 bit lane by one 32 bit element.
+#define mm256_ror128_32( v )  _mm256_shuffle_epi32( v, 0x39 )
 #define mm256_ror1x32_128( v )  _mm256_shuffle_epi32( v, 0x39 )
 #define mm256_rol1x32_128( v )  _mm256_shuffle_epi32( v, 0x93 )
-#define mm256_ror1x16_128( v ) \
+#define mm256_rol128_1x32( v )  _mm256_shuffle_epi32( v, 0x93 )
   _mm256_shuffle_epi8( v, \
         m256_const_64( 0x11101f1e1d1c1b1a, 0x1918171615141312, \
                        0x01000f0e0d0c0b0a, 0x0908070605040302 ) )
-#define mm256_rol1x16_128( v ) \
+// Rotate each 128 bit lane by c elements.
-   _mm256_shuffle_epi8( v, \
+#define mm256_ror128_8( v, c ) \
         m256_const_64( 0x1d1c1b1a19181716, 0x1514131211101f1e, \
                        0x0d0c0b0a09080706, 0x0504030201000f0e ) )
 #define mm256_ror1x8_128( v ) \
   _mm256_shuffle_epi8( v, \
         m256_const_64( 0x101f1e1d1c1b1a19, 0x1817161514131211, \
                        0x000f0e0d0c0b0a09, 0x0807060504030201 ) )
 #define mm256_rol1x8_128( v ) \
   _mm256_shuffle_epi8( v, \
         m256_const_64( 0x1d1c1b1a19181f1e, 0x1514131211101716, \
                        0x0d0c0b0a09080f0e, 0x0504030201000706 ) )
 // Rotate each 128 bit lane by c bytes.
 #define mm256_bror_128( v, c ) \
  _mm256_or_si256( _mm256_bsrli_epi128( v, c ), \
                   _mm256_bslli_epi128( v, 16-(c) ) )
-#define mm256_brol_128( v, c ) \
+#define mm256_rol128_8( v, c ) \
  _mm256_or_si256( _mm256_bslli_epi128( v, c ), \
                   _mm256_bsrli_epi128( v, 16-(c) ) )
-// Swap 32 bit elements in each 64 bit lane
+
-#define mm256_swap32_64( v )    _mm256_shuffle_epi32( v, 0xb1 )
+// Rotate elements in each 64 bit lane
 #define mm256_swap64_32( v )    _mm256_shuffle_epi32( v, 0xb1 )
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-#define mm256_rol1x16_64( v )   _mm256_rol_epi64( v, 16 )
+#define mm256_rol64_8( v, c )   _mm256_rol_epi64( v, ((c)<<3) ) 
-#define mm256_ror1x16_64( v )   _mm256_ror_epi64( v, 16 )
+#define mm256_ror64_8( v, c )   _mm256_ror_epi64( v, ((c)<<3) ) 
 #else
-#define mm256_ror1x16_64( v ) \
+#define mm256_rol64_8( v, c ) \
-   _mm256_shuffle_epi8( v, \
+     _mm256_or_si256( _mm256_slli_epi64( v, ( ( (c)<<3 ) ), \
-        m256_const_64( 0x19181f1e1d1c1b1a, 0x1110171615141312, \
+                      _mm256_srli_epi64( v, ( ( 64 - ( (c)<<3 ) ) ) )
-                       0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
+
 // Rotate each 64 bit element of v right by c bytes: the right shift count
 // is c*8 bits and the complementary left shift is 64 - c*8 bits.
 // v is expanded twice, so pass an expression without side effects.
 // The previous definition never closed the _mm256_srli_epi64( call, so any
 // use of the macro failed to compile.
 #define mm256_ror64_8( v, c ) \
     _mm256_or_si256( _mm256_srli_epi64( v, ( (c)<<3 ) ), \
                      _mm256_slli_epi64( v, ( 64 - ( (c)<<3 ) ) ) )
 #define mm256_rol1x16_64( v ) \
   _mm256_shuffle_epi8( v, \
        m256_const_64( 0x1d1c1b1a19181f1e, 0x1514131211101716, \
                       0x0d0c0b0a09080f0e, 0x0504030201000706 ) )
 #endif
 #define mm256_ror1x8_64( v ) \
   _mm256_shuffle_epi8( v, \
        m256_const_64( 0x181f1e1d1c1b1a19, 0x1017161514131211, \
                       0x080f0e0d0c0b0a09, 0x0007060504030201 ) )
-#define mm256_rol1x8_64( v ) \
+// Rotate elements in each 32 bit lane
   _mm256_shuffle_epi8( v, \
        m256_const_64( 0x1e1d1c1b1a19181f, 0x1615141312111017, \
                       0x0e0d0c0b0a09080f, 0x0605040302010007 ) )
 #define mm256_ror3x8_64( v ) \
   _mm256_shuffle_epi8( v, \
        m256_const_64( 0x1a19181f1e1d1c1b, 0x1211101716151413, \
                       0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
 #define mm256_rol3x8_64( v ) \
   _mm256_shuffle_epi8( v, \
        m256_const_64( 0x1c1b1a19181f1e1d, 0x1413121110171615, \
                       0x0c0b0a09080f0e0d, 0x0403020100070605 ) )
 // Swap 16 bit elements in each 32 bit lane
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-#define mm256_swap16_32( v )   _mm256_rol_epi32( v, 16 )
+#define mm256_swap32_16( v ) _mm256_rol_epi32( v, 16 )
 #define mm256_rol32_8( v )   _mm256_rol_epi32( v, 8 )
 #define mm256_ror32_8( v )   _mm256_ror_epi32( v, 8 )
 #else
-#define mm256_swap16_32( v ) \
+#define mm256_swap32_16( v ) \
-   _mm256_shuffle_epi8( v, \
+     _mm256_or_si256( _mm256_slli_epi32( v, 16 ), \
-         m256_const_64( 0x1b1a19181f1e1d1c, 0x1312111017161514, \
+                      _mm256_srli_epi32( v, 16 ) )
-                        0x0b0a09080f0e0d0c, 0x0302010007060504 ) )
+
 // Rotate each 32 bit element of v left by 8 bits without AVX512: shift left
 // by 8 and OR in the 8 bits that fell off the top, recovered by shifting
 // right by 32 - 8 = 24. The previous right shift count of 8 did not
 // produce a rotation.
 // v is expanded twice, so pass an expression without side effects.
 #define mm256_rol32_8( v ) \
     _mm256_or_si256( _mm256_slli_epi32( v, 8 ), \
                      _mm256_srli_epi32( v, 24 ) )
 // Rotate each 32 bit element of v right by 8 bits without AVX512: shift
 // right by 8 and OR in the low 8 bits moved to the top by shifting left by
 // 32 - 8 = 24. The previous left shift count of 8 did not produce a
 // rotation.
 // NOTE(review): parameter c is accepted but not used by this expansion,
 // while the AVX512 variant above takes only ( v ) -- confirm which
 // signature callers expect.
 // v is expanded twice, so pass an expression without side effects.
 #define mm256_ror32_8( v, c ) \
     _mm256_or_si256( _mm256_srli_epi32( v, 8 ), \
                      _mm256_slli_epi32( v, 24 ) )
 #endif
 //
 // Swap bytes in vector elements, endian bswap.
 #define mm256_bswap_64( v ) \
@@ -565,19 +537,19 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
 //  _mm256_alignr_epi 64/32 are only available with AVX512 but AVX512 also
 //  makes these macros unnecessary.
-#define mm256_swap256_512 (v1, v2) \
+#define mm256_swap512_256( v1, v2 ) \
-   v1 = _mm256_xor_si256(v1, v2); \
+   v1 = _mm256_xor_si256( v1, v2 ); \
-   v2 = _mm256_xor_si256(v1, v2); \
+   v2 = _mm256_xor_si256( v1, v2 ); \
-   v1 = _mm256_xor_si256(v1, v2);
+   v1 = _mm256_xor_si256( v1, v2 );
-#define mm256_ror1x128_512( v1, v2 ) \
+#define mm256_ror512_128( v1, v2 ) \
 do { \
   __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \
   v1 = _mm256__mm256_permute2x128( v2, v1, 0x21 ); \
   v2 = t; \
 } while(0)
-#define mm256_rol1x128_512( v1, v2 ) \
+#define mm256_rol512_128( v1, v2 ) \
 do { \
   __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \
   v2 = _mm256__mm256_permute2x128( v2, v1, 0x21 ); \
--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -15,13 +15,13 @@
 //  AVX512 intrinsics have a few changes from previous conventions.
 //
-//    Some instructions like cmp and blend use the mask regsiters now instead
+//    cmp instruction now returns a bitmask instead of a vector mask.
-//    a vector mask.
+//    This eliminates the need for the blendv instruction.
 //
-//    The new rotate instructions require the count to be only an 8 bit
+//    The new rotate instructions require the count to be an 8 bit
-//    immediate value. The documentation is the same as for shift and
+//    immediate value only. Compilation fails if a variable is used.
-//    it allows variables. Suspect a compiler issue but it still happens
+//    The documentation is the same as for shift and it works with
-//    in GCC9.
+//    variables.
 //
 //    _mm512_permutex_epi64 only shuffles within 256 bit lanes. Permute
 //    usually shuffles across all lanes.
@@ -109,6 +109,11 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
 #define m512_const2_64( i1, i0 ) \
   m512_const1_128( m128_const_64( i1, i0 ) )
 #define m512_const2_32( i1, i0 ) \
   m512_const1_64( ( ( ( (uint64_t)(i1) << 32 ) ) \
                     | ( (uint64_t)(i0) & 0xffffffff ) ) )
 static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
                                      const uint64_t i1, const uint64_t i0 )
 {
@@ -265,7 +270,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
               m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
                              0x28292a2b2c2d2e2f, 0x2021222324252627, \
                              0x18191a1b1c1d1e1f, 0x1011121314151617, \
-                              0x08090a0b0c0d0e0f, 0x0001020304050607 ))
+                              0x08090a0b0c0d0e0f, 0x0001020304050607 ) )
 #define mm512_bswap_32( v ) \
   _mm512_shuffle_epi8( v, \
@@ -304,8 +309,8 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 { \
  __m512i ctl = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
                               0x2c2d2e2f28292a2b, 0x2425262720212223, \
-                               0x0c0d0e0f08090a0b, 0x0405060700010203, \
+                               0x1c1d1e1f18191a1b, 0x1415161710111213, \
-                               0x1c1d1e1f18191a1b, 0x1415161710111213 ); \
+                               0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
  casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
  casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
  casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
@@ -320,8 +325,10 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 //
 // Rotate elements in 512 bit vector.
 #define mm512_swap_256( v )        _mm512_alignr_epi64( v, v, 4 )
 // 1x64 notation used to distinguish from bit rotation.
 #define mm512_ror_1x128( v )       _mm512_alignr_epi64( v, v, 2 )
 #define mm512_rol_1x128( v )       _mm512_alignr_epi64( v, v, 6 )
@@ -401,51 +408,58 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 //
 // Rotate elements within 256 bit lanes of 512 bit vector.
 // Rename these for consistency. Element size is always last.
 // mm<vectorsize>_<op><lanesize>_<elementsize>
 // Swap hi & lo 128 bits in each 256 bit lane
-#define mm512_swap128_256( v )   _mm512_permutex_epi64( v, 0x4e )
+
 #define mm512_swap256_128( v )   _mm512_permutex_epi64( v, 0x4e )
 // Rotate 256 bit lanes by one 64 bit element
-#define mm512_ror1x64_256( v )   _mm512_permutex_epi64( v, 0x39 )
+
-#define mm512_rol1x64_256( v )   _mm512_permutex_epi64( v, 0x93 )
+#define mm512_ror256_64( v )   _mm512_permutex_epi64( v, 0x39 )
 #define mm512_rol256_64( v )   _mm512_permutex_epi64( v, 0x93 )
 // Rotate 256 bit lanes by one 32 bit element
-#define mm512_ror1x32_256( v ) \
+
 // Index table for _mm512_permutexvar_epi32: inside each group of eight 32 bit
 // elements, destination i reads source i+1 and the last destination wraps to
 // the group's first element (7 -> 0, 15 -> 8).
 // Assumes m512_const_64 takes its arguments most significant first - confirm.
 #define mm512_ror256_32( v ) \
   _mm512_permutexvar_epi32( m512_const_64( \
                      0x000000080000000f, 0x0000000e0000000d, \
                      0x0000000c0000000b, 0x0000000a00000009, \
                      0x0000000000000007, 0x0000000600000005, \
                      0x0000000400000003, 0x0000000200000001 ), v )
-#define mm512_rol1x32_256( v ) \
+// Index table for _mm512_permutexvar_epi32: inside each group of eight 32 bit
+// elements, destination i reads source i-1 and the first destination wraps to
+// the group's last element (0 <- 7, 8 <- 15).
+// Assumes m512_const_64 takes its arguments most significant first - confirm.
+#define mm512_rol256_32( v ) \
   _mm512_permutexvar_epi32( m512_const_64( \
                      0x0000000e0000000d, 0x0000000c0000000b, \
                      0x0000000a00000009, 0x000000080000000f, \
                      0x0000000600000005, 0x0000000400000003, \
                      0x0000000200000001, 0x0000000000000007 ), v )
-#define mm512_ror1x16_256( v ) \
+// Rotate each 256 bit lane right by one 16 bit element.
+// The table holds, for every destination element, the index of the source
+// element one position higher, wrapping inside its own group of 16
+// (destination 0x0f reads 0x00, destination 0x1f reads 0x10).
+// The top quadword previously held 0x0001 where 0x001f belongs, which made
+// destination element 30 read source element 1 instead of 31.
+// Assumes m512_const_64 takes its arguments most significant first - confirm.
+#define mm512_ror256_16( v ) \
   _mm512_permutexvar_epi16( m512_const_64( \
-                    0x00100001001e001d, 0x001c001b001a0019, \
+                    0x0010001f001e001d, 0x001c001b001a0019, \
                     0x0018001700160015, 0x0014001300120011, \
                     0x0000000f000e000d, 0x000c000b000a0009, \
                     0x0008000700060005, 0x0004000300020001 ), v )
-#define mm512_rol1x16_256( v ) \
+// Index table for _mm512_permutexvar_epi16: inside each group of sixteen
+// 16 bit elements, destination i reads source i-1 and the first destination
+// wraps to the group's last element (0x00 <- 0x0f, 0x10 <- 0x1f).
+// Assumes m512_const_64 takes its arguments most significant first - confirm.
+#define mm512_rol256_16( v ) \
   _mm512_permutexvar_epi16( m512_const_64( \
                     0x001e001d001c001b, 0x001a001900180017, \
                     0x0016001500140013, 0x001200110010001f, \
                     0x000e000d000c000b, 0x000a000900080007, \
                     0x0006000500040003, 0x000200010000000f ), v )
-#define mm512_ror1x8_256( v ) \
+// Rotate each 256 bit lane right by one byte.
+// The table holds absolute byte indices 0..63: every destination byte reads
+// the source byte one position higher, wrapping inside its own group of 32
+// (destination 0x1f reads 0x00, destination 0x3f reads 0x20).
+// _mm512_shuffle_epi8 cannot do this: it takes two arguments and only moves
+// bytes inside 128 bit lanes, so a full byte permute is used instead.
+// NOTE: _mm512_permutexvar_epi8 requires AVX512VBMI.
+// Assumes m512_const_64 takes its arguments most significant first - confirm.
+#define mm512_ror256_8( v ) \
-  _mm512_shuffle_epi8( v, m512_const_64( \
+  _mm512_permutexvar_epi8( m512_const_64( \
                     0x203f3e3d3c3b3a39, 0x3837363534333231, \
                     0x302f2e2d2c2b2a29, 0x2827262524232221, \
                     0x001f1e1d1c1b1a19, 0x1817161514131211, \
                     0x100f0e0d0c0b0a09, 0x0807060504030201 ), v )
-#define mm512_rol1x8_256( v ) \
+#define mm512_rol256_8( v ) \
   _mm512_shuffle_epi8( v, m512_const_64( \
                     0x3e3d3c3b3a393837, 0x363534333231302f, \
                     0x2e2d2c2b2a292827, 0x262524232221203f, \
@@ -456,45 +470,19 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 // Rotate elements within 128 bit lanes of 512 bit vector.
 // Swap hi & lo 64 bits in each 128 bit lane
-#define mm512_swap64_128( v )    _mm512_shuffle_epi32( v, 0x4e )
+#define mm512_swap128_64( v )    _mm512_shuffle_epi32( v, 0x4e )
 // Rotate 128 bit lanes by one 32 bit element
-#define mm512_ror1x32_128( v )   _mm512_shuffle_epi32( v, 0x39 )
+#define mm512_ror128_32( v )   _mm512_shuffle_epi32( v, 0x39 )
-#define mm512_rol1x32_128( v )   _mm512_shuffle_epi32( v, 0x93 )
+#define mm512_rol128_32( v )   _mm512_shuffle_epi32( v, 0x93 )
 // Index table for _mm512_permutexvar_epi16: inside each group of eight 16 bit
 // elements (one 128 bit lane), destination i reads source i+1 and the last
 // destination wraps to the group's first element (7 -> 0, 15 -> 8, ...).
 // Assumes m512_const_64 takes its arguments most significant first - confirm.
 #define mm512_ror1x16_128( v ) \
    _mm512_permutexvar_epi16( m512_const_64( \
                     0x0018001f001e001d, 0x001c001b001a0019, \
                     0x0010001700160015, 0x0014001300120011, \
                     0x0008000f000e000d, 0x000c000b000a0009, \
                     0x0000000700060005, 0x0004000300020001 ), v ) 
-#define mm512_rol1x16_128( v ) \
+// Rotate 128 bit lanes by c bytes, faster than building that monstrous 
-    _mm512_permutexvar_epi16( m512_const_64( \
+// constant above.  
-                     0x001e001d001c001b, 0x001a00190018001f, \
+#define mm512_ror128_8( v, c ) \
                     0x0016001500140013, 0x0012001100100017, \
                     0x000e000d000c000b, 0x000a00090008000f, \
                     0x0006000500040003, 0x0002000100000007 ), v ) 
 // Byte shuffle control for _mm512_shuffle_epi8, which selects bytes inside
 // each 128 bit lane from the low 4 bits of every control byte. Per lane the
 // table reads 1, 2, ..., 15, 0, so destination byte i takes source byte i+1
 // and the top byte wraps to the lane's byte 0.
 #define mm512_ror1x8_128( v ) \
    _mm512_shuffle_epi8( v, m512_const_64( \
                     0x303f3e3d3c3b3a39, 0x3837363534333231, \
                     0x202f2e2d2c2b2a29, 0x2827262524232221, \
                     0x101f1e1d1c1b1a19, 0x1817161514131211, \
                     0x000f0e0d0c0b0a09, 0x0807060504030201 ) )
 // Byte shuffle control for _mm512_shuffle_epi8, which selects bytes inside
 // each 128 bit lane from the low 4 bits of every control byte. Per lane the
 // table reads 15, 0, 1, ..., 14, so destination byte i takes source byte i-1
 // and byte 0 wraps to the lane's byte 15.
 #define mm512_rol1x8_128( v ) \
    _mm512_shuffle_epi8( v, m512_const_64( \
                     0x3e3d3c3b3a393837, 0x363534333231303f, \
                     0x2e2d2c2b2a292827, 0x262524232221202f, \
                     0x1e1d1c1b1a191817, 0x161514131211101f, \
                     0x0e0d0c0b0a090807, 0x060504030201000f ) )
 // Rotate each 128 bit lane right by c bytes: the lane shifted right by c
 // bytes is OR-ed with the same lane shifted left by 16-c bytes.
 // v is expanded twice, and c is passed straight to the byte shift intrinsics,
 // which take an immediate count.
 #define mm512_bror_128( v, c ) \
   _mm512_or_si512( _mm512_bsrli_epi128( v, c ), \
                    _mm512_bslli_epi128( v, 16-(c) ) )
-#define mm512_brol_128( v, c ) \
+// Rotate each 128 bit lane left by c bytes: the lane shifted left by c bytes
+// is OR-ed with the same lane shifted right by 16-c bytes.
+// v is expanded twice, and c is passed straight to the byte shift intrinsics,
+// which take an immediate count.
+#define mm512_rol128_8( v, c ) \
   _mm512_or_si512( _mm512_bslli_epi128( v, c ), \
                    _mm512_bsrli_epi128( v, 16-(c) ) )
@@ -502,75 +490,23 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 //
 // Rotate elements within 64 bit lanes.
 // Rotate each 64 bit lane by c bytes; ((c)<<3) turns the byte count into the
 // bit count that _mm512_rol_epi64 / _mm512_ror_epi64 take.
 #define mm512_rol64_x8( v, c )   _mm512_rol_epi64( v, ((c)<<3) )
 #define mm512_ror64_x8( v, c )   _mm512_ror_epi64( v, ((c)<<3) )
 // Swap 32 bit elements in each 64 bit lane
-#define mm512_swap32_64( v )      _mm512_shuffle_epi32( v, 0xb1 )
+#define mm512_swap64_32( v )      _mm512_shuffle_epi32( v, 0xb1 )
 // Rotate each 64 bit lane by one 16 bit element.
-#define mm512_ror1x16_64( v )   _mm512_ror_epi64( v, 16 )
+#define mm512_ror64_16( v )   _mm512_ror_epi64( v, 16 )
-#define mm512_rol1x16_64( v )   _mm512_rol_epi64( v, 16 )
+#define mm512_rol64_16( v )   _mm512_rol_epi64( v, 16 )
-#define mm512_ror1x8_64( v )    _mm512_ror_epi64( v, 8 )
+#define mm512_ror64_8( v )    _mm512_ror_epi64( v, 8 )
-#define mm512_rol1x8_64( v )    _mm512_rol_epi64( v, 8 )
+#define mm512_rol64_8( v )    _mm512_rol_epi64( v, 8 )
 /*
 #define mm512_ror1x16_64( v ) \
    _mm512_permutexvar_epi16( m512_const_64( \
                      0x001c001f001e001d, 0x0018001b001a0019, \
                      0x0014001700160015, 0x0010001300120011, \
                      0x000c000f000e000d, 0x0008000b000a0009, \
                      0x0004000700060005, 0x0000000300020001, v )
 #define mm512_rol1x16_64( v ) \
    _mm512_permutexvar_epi16( m512_const_64( \
                      0x001e001d001c001f, 0x001a00190018001b, \
                      0x0016001500140017, 0x0012001100100013, \
                      0x000e000d000c000f, 0x000a00090008000b, \
                      0x0006000500040007, 0x0002000100000003, v )
 // Rotate each 64 bit lane by one byte.
 #define mm512_ror1x8_64( v ) \
    _mm512_shuffle_epi8( v, m512_const_64( \
                      0x383F3E3D3C3B3A39, 0x3037363534333231, \
                      0x282F2E2D2C2B2A29, 0x2027262524232221, \
                      0x181F1E1D1C1B1A19, 0x1017161514131211, \
                      0x080F0E0D0C0B0A09, 0x0007060504030201 ) )
 #define mm512_rol1x8_64( v ) \
    _mm512_shuffle( v, m512_const_64( \
                       0x3E3D3C3B3A39383F, 0x3635343332313037, \
                       0x2E2D2C2B2A29282F, 0x2625242322212027, \
                       0x1E1D1C1B1A19181F, 0x1615141312111017, \
                       0x0E0D0C0B0A09080F, 0x0605040302010007 ) )
 */
 //
 // Rotate elements within 32 bit lanes.
-#define mm512_swap16_32( v )   _mm512_ror_epi32( v, 16 )
+// Rotate each 32 bit lane by c bytes; ((c)<<3) turns the byte count into the
+// bit count that _mm512_rol_epi32 / _mm512_ror_epi32 take, matching
+// mm512_rol64_x8 / mm512_ror64_x8. With c = 2 this is the 16 bit swap and
+// with c = 1 the one byte rotate that these macros replace.
+#define mm512_rol32_x8( v, c )   _mm512_rol_epi32( v, ((c)<<3) )
-#define mm512_ror1x8_32( v )   _mm512_ror_epi32( v, 8 )
+#define mm512_ror32_x8( v, c )   _mm512_ror_epi32( v, ((c)<<3) )
 #define mm512_rol1x8_32( v )   _mm512_rol_epi32( v, 8 )
 /*
 #define mm512_swap16_32( v ) \
   _mm512_permutexvar_epi16( m512_const_64( \
                       0x001e001f001c001d, 0x001a001b00180019, \
                       0x0016001700140015, 0x0012001300100011, \
                       0x000e000f000c000d, 0x000a000b00080009, \
                       0x0006000700040005, 0x0002000300000001 ), v )
 #define mm512_ror1x8_32( v ) \
   _mm512_shuffle_epi8( v, m512_const_64( \
                       0x3C3F3E3D383B3A39, 0x3437363530333231, \
                       0x2C2F2E2D282B2A29, 0x2427262520232221, \
                       0x1C1F1E1D181B1A19, 0x1417161510131211, \
                       0x0C0F0E0D080B0A09, 0x0407060500030201 ))
 #define mm512_rol1x8_32( v ) \
   _mm512_shuffle_epi8( v, m512_const_64( \
                       0x3E3D3C3F3A39383B, 0x3635343732313033, \
                       0x2E2D2C2F2A29282B, 0x2625242722212023, \
                       0x1E1D1C1F1A19181B, 0x1615141712111013, \
                       0x0E0D0C0F0A09080B, 0x0605040702010003 ) )
 */
 //
@@ -579,61 +515,61 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 //  These can all be done with 2 permutex2var instructions but they are
 //  slower than either xor or alignr and require AVX512VBMI.
-#define mm512_swap512_1024(v1, v2) \
+// Exchange the contents of v1 and v2 in place with three XORs, no temporary.
+// Both arguments must be assignable. If v1 and v2 name the same object the
+// first XOR zeroes it and the result is zero, not the original value.
+// The expansion is three separate statements, the last already terminated
+// by a semicolon, so it must not be used as the body of an unbraced if/else.
+#define mm512_swap1024_512(v1, v2) \
   v1 = _mm512_xor_si512(v1, v2); \
   v2 = _mm512_xor_si512(v1, v2); \
   v1 = _mm512_xor_si512(v1, v2);
-#define mm512_ror1x256_1024( v1, v2 ) \
+// Rotate the 1024 bit pair v1,v2 right by 256 bits, in place.
+// In 64 bit elements, lowest first: v1 becomes { v1[4..7], v2[0..3] } and
+// v2 becomes { v2[4..7], v1[0..3] }. The temporary keeps the new v2 until
+// v1 has been rebuilt from the original values. v1 and v2 must be assignable.
+#define mm512_ror1024_256( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
   v1 = _mm512_alignr_epi64( v2, v1, 4 ); \
   v2 = t; \
 } while(0)
-#define mm512_rol1x256_1024( v1, v2 ) \
+// Rotate the 1024 bit pair v1,v2 left by 256 bits, in place.
+// In 64 bit elements, lowest first: v1 becomes { v2[4..7], v1[0..3] } and
+// v2 becomes { v1[4..7], v2[0..3] }. The temporary keeps the new v1 until
+// v2 has been rebuilt from the original values. v1 and v2 must be assignable.
+#define mm512_rol1024_256( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
   v2 = _mm512_alignr_epi64( v2, v1, 4 ); \
   v1 = t; \
 } while(0)
-#define mm512_ror1x128_1024( v1, v2 ) \
+// Rotate the 1024 bit pair v1,v2 right by 128 bits, in place.
+// In 64 bit elements, lowest first: v1 becomes { v1[2..7], v2[0..1] } and
+// v2 becomes { v2[2..7], v1[0..1] }. v1 and v2 must be assignable.
+#define mm512_ror1024_128( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 2 ); \
   v1 = _mm512_alignr_epi64( v2, v1, 2 ); \
   v2 = t; \
 } while(0)
-#define mm512_rol1x128_1024( v1, v2 ) \
+// Rotate the 1024 bit pair v1,v2 left by 128 bits, in place (an alignr by
+// 6 = 8 - 2 elements). In 64 bit elements, lowest first: v1 becomes
+// { v2[6..7], v1[0..5] } and v2 becomes { v1[6..7], v2[0..5] }.
+// v1 and v2 must be assignable.
+#define mm512_rol1024_128( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 6 ); \
   v2 = _mm512_alignr_epi64( v2, v1, 6 ); \
   v1 = t; \
 } while(0)
-#define mm512_ror1x64_1024( v1, v2 ) \
+// Rotate the 1024 bit pair v1,v2 right by 64 bits, in place.
+// In 64 bit elements, lowest first: v1 becomes { v1[1..7], v2[0] } and
+// v2 becomes { v2[1..7], v1[0] }. v1 and v2 must be assignable.
+#define mm512_ror1024_64( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 1 ); \
   v1 = _mm512_alignr_epi64( v2, v1, 1 ); \
   v2 = t; \
 } while(0)
-#define mm512_rol1x64_1024( v1, v2 ) \
+// Rotate the 1024 bit pair v1,v2 left by 64 bits, in place (an alignr by
+// 7 = 8 - 1 elements). In 64 bit elements, lowest first: v1 becomes
+// { v2[7], v1[0..6] } and v2 becomes { v1[7], v2[0..6] }.
+// v1 and v2 must be assignable.
+#define mm512_rol1024_64( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 7 ); \
   v2 = _mm512_alignr_epi64( v2, v1, 7 ); \
   v1 = t; \
 } while(0)
-#define mm512_ror1x32_1024( v1, v2 ) \
+// Rotate the 1024 bit pair v1,v2 right by 32 bits, in place.
+// In 32 bit elements, lowest first: v1 becomes { v1[1..15], v2[0] } and
+// v2 becomes { v2[1..15], v1[0] }. v1 and v2 must be assignable.
+#define mm512_ror1024_32( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi32( v1, v2, 1 ); \
   v1 = _mm512_alignr_epi32( v2, v1, 1 ); \
   v2 = t; \
 } while(0)
-#define mm512_rol1x32_1024( v1, v2 ) \
+#define mm512_rol1024_32( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi32( v1, v2, 15 ); \
   v2 = _mm512_alignr_epi32( v2, v1, 15 ); \