v3.10.3

2025-09-17 23:44:27 +00:00 · 2019-12-14 01:01:54 -05:00
42 changed files with 2656 additions and 1407 deletions
--- a/71
+++ b/71
@@ -1,12 +1,14 @@
-Requirements:
+1. Requirements:
 ---------------
 Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
 supported.
 64 bit Linux operating system. Apple is not supported.
-Building on linux prerequisites:
+2. Building on linux prerequisites:
 -----------------------------------
 It is assumed users know how to install packages on their system and
 be able to compile standard source packages. This is basic Linux and
@@ -20,41 +22,74 @@ http://askubuntu.com/questions/457526/how-to-install-cpuminer-in-ubuntu
 Install any additional dependencies needed by cpuminer-opt. The list below
 are some of the ones that may not be in the default install and need to
-be installed manually. There may be others, read the error messages they
+be installed manually. There may be others, read the compiler error messages,
-will give a clue as to the missing package.
+they will give a clue as to the missing package.
 The following command should install everything you need on Debian based
 distributions such as Ubuntu. Fedora and other distributions may have similar
-but different package names.
+but different package names. 
-sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev
+$ sudo apt-get install build-essential automake libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev zlib1g-dev git
 SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
-openssl 1.1.0e or higher. Add one of the following, depending on the
+openssl 1.1.0e or higher. Add one of the following to CFLAGS for SHA
-compiler version, to CFLAGS:
+support depending on your CPU and compiler version:
-"-march=native" or "-march=znver1" or "-msha".
+
 "-march=native" is always the best choice
 "-march=znver1" for Ryzen 1000 & 2000 series, znver2 for 3000.
 "-msha"  Add SHA to other tuning options
 Additional instructions for static compilation can be found here:
 https://lxadm.com/Static_compilation_of_cpuminer
 Static builds should only be considered in a homogeneous HW and SW environment.
 Local builds will always have the best performance and compatibility.
-Extract cpuminer source.
+3. Download cpuminer-opt
 ------------------------
-tar xvzf cpuminer-opt-x.y.z.tar.gz
+Download the source code for the latest release from the official repository.
 cd cpuminer-opt-x.y.z
-Run ./build.sh to build on Linux or execute the following commands.
+https://github.com/JayDDee/cpuminer-opt/releases
-./autogen.sh
+Extract the source code.
 CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
 make
-Start mining.
+$ tar xvzf cpuminer-opt-x.y.z.tar.gz
 Alternatively it can be cloned from git.
 $ git clone https://github.com/JayDDee/cpuminer-opt.git
 4. Build cpuminer-opt
 ---------------------
 It is recommended to build with default options; this will usually
 produce the best results.
 $ ./build.sh to build on Linux or execute the following commands.
 or 
 $ ./autogen.sh
 $ CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
 $ make -j n
 n is the number of threads.
 5. Start mining.
 ----------------
 $ ./cpuminer -a algo -o url -u username -p password
 ./cpuminer -a algo -o url -u username -p password
 Windows
 -------
 See also INSTALL_WINDOWS
 The following procedure is obsolete and uses an old compiler.
 Precompiled Windows binaries are built on a Linux host using Mingw
 with a more recent compiler than the following Windows hosted procedure.
--- a/Makefile.am
+++ b/Makefile.am
@@ -124,6 +124,7 @@ cpuminer_SOURCES = \
  algo/luffa/luffa-hash-2way.c \
  algo/lyra2/lyra2.c \
  algo/lyra2/sponge.c \
  algo/lyra2/sponge-2way.c \
  algo/lyra2/lyra2-gate.c \
  algo/lyra2/lyra2rev2.c \
  algo/lyra2/lyra2rev2-4way.c \
--- a/README.md
+++ b/README.md
@@ -126,11 +126,11 @@ Supported Algorithms
                          x16rv2        Ravencoin (RVN)
                          x16rt         Gincoin (GIN)
                          x16rt-veil    Veil (VEIL)
-                          x16s          Pigeoncoin (PGN)
+                          x16s          
                          x17
-                          x21s
+                          x21s          Pigeoncoin (PGN)
-                          x22i
+                          x22i          
-                          x25x
+                          x25x          Sinovative (SIN)
                          xevan         Bitsend (BSD)
                          yescrypt      Globalboost-Y (BSTY)
                          yescryptr8    BitZeny (ZNY)
--- a/7
+++ b/7
@@ -31,7 +31,12 @@ not supported. FreeBSD YMMV.
 Change Log
 ----------
-v3.10.2
+v3.10.3
 AVX512 for x12, x13, x14, x15.
 Fixed x12 AVX2 invalid shares.
 v3.10.2
 AVX512 added for bmw512, c11, phi1612 (phi), qubit, skunk, x11, x11gost (sib).
 Fixed c11 AVX2 invalid shares.
--- a/algo/argon2/argon2d/blake2/blamka-round-opt.h
+++ b/algo/argon2/argon2d/blake2/blamka-round-opt.h
@@ -184,10 +184,10 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
 #include <immintrin.h>
-#define  rotr32  mm256_swap32_64
+#define  rotr32( x )  mm256_ror_64( x, 32 )
-#define  rotr24  mm256_ror3x8_64
+#define  rotr24( x )  mm256_ror_64( x, 24 )
-#define  rotr16  mm256_ror1x16_64
+#define  rotr16( x )  mm256_ror_64( x, 16 )
-#define  rotr63( x ) mm256_rol_64( x, 1 )
+#define  rotr63( x )  mm256_rol_64( x,  1 )
 //#define rotr32(x)   _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
 //#define rotr24(x)   _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
--- a/algo/blake/blake-hash-4way.h
+++ b/algo/blake/blake-hash-4way.h
@@ -70,19 +70,22 @@ typedef struct {
 // Default 14 rounds
 typedef blake_4way_small_context blake256_4way_context;
 void blake256_4way_init(void *ctx);
-void blake256_4way(void *ctx, const void *data, size_t len);
+void blake256_4way_update(void *ctx, const void *data, size_t len);
 #define blake256_4way blake256_4way_update
 void blake256_4way_close(void *ctx, void *dst);
 // 14 rounds, blake, decred
 typedef blake_4way_small_context blake256r14_4way_context;
 void blake256r14_4way_init(void *cc);
-void blake256r14_4way(void *cc, const void *data, size_t len);
+void blake256r14_4way_update(void *cc, const void *data, size_t len);
 #define blake256r14_4way blake256r14_4way_update
 void blake256r14_4way_close(void *cc, void *dst);
 // 8 rounds, blakecoin, vanilla
 typedef blake_4way_small_context blake256r8_4way_context;
 void blake256r8_4way_init(void *cc);
-void blake256r8_4way(void *cc, const void *data, size_t len);
+void blake256r8_4way_update(void *cc, const void *data, size_t len);
 #define blake256r8_4way blake256r8_4way_update
 void blake256r8_4way_close(void *cc, void *dst);
 #ifdef __AVX2__
@@ -100,19 +103,21 @@ typedef struct {
 // Default 14 rounds
 typedef blake_8way_small_context blake256_8way_context;
 void blake256_8way_init(void *cc);
-void blake256_8way(void *cc, const void *data, size_t len);
+void blake256_8way_update(void *cc, const void *data, size_t len);
 #define blake256_8way blake256_8way_update
 void blake256_8way_close(void *cc, void *dst);
 // 14 rounds, blake, decred
 typedef blake_8way_small_context blake256r14_8way_context;
 void blake256r14_8way_init(void *cc);
-void blake256r14_8way(void *cc, const void *data, size_t len);
+void blake256r14_8way_update(void *cc, const void *data, size_t len);
 void blake256r14_8way_close(void *cc, void *dst);
 // 8 rounds, blakecoin, vanilla
 typedef blake_8way_small_context blake256r8_8way_context;
 void blake256r8_8way_init(void *cc);
-void blake256r8_8way(void *cc, const void *data, size_t len);
+void blake256r8_8way_update(void *cc, const void *data, size_t len);
 #define blake256r8_8way blake256r8_8way_update
 void blake256r8_8way_close(void *cc, void *dst);
 // Blake-512 4 way
--- a/algo/blake/blake256-hash-4way.c
+++ b/algo/blake/blake256-hash-4way.c
@@ -634,7 +634,7 @@ do { \
                              m256_const1_64( 0x082EFA98082EFA98 ) ); \
   VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
                              m256_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
-   shuf_bswap32 = m256_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203, \
+   shuf_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
                                 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
   M0 = _mm256_shuffle_epi8( * buf    , shuf_bswap32 ); \
   M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
@@ -1184,7 +1184,7 @@ blake256_16way_update(void *cc, const void *data, size_t len)
 }
 void
-blake256_16way_close_update(void *cc, void *dst)
+blake256_16way_close(void *cc, void *dst)
 {
        blake32_16way_close(cc, 0, 0, dst, 8);
 }
@@ -1259,7 +1259,7 @@ blake256_8way_init(void *cc)
 }
 void
-blake256_8way(void *cc, const void *data, size_t len)
+blake256_8way_update(void *cc, const void *data, size_t len)
 {
        blake32_8way(cc, data, len);
 }
@@ -1279,7 +1279,7 @@ void blake256r14_4way_init(void *cc)
 }
 void
-blake256r14_4way(void *cc, const void *data, size_t len)
+blake256r14_4way_update(void *cc, const void *data, size_t len)
 {
   blake32_4way(cc, data, len);
 }
@@ -1298,7 +1298,7 @@ void blake256r14_8way_init(void *cc)
 }
 void
-blake256r14_8way(void *cc, const void *data, size_t len)
+blake256r14_8way_update(void *cc, const void *data, size_t len)
 {
   blake32_8way(cc, data, len);
 }
@@ -1318,7 +1318,7 @@ void blake256r8_4way_init(void *cc)
 }
 void
-blake256r8_4way(void *cc, const void *data, size_t len)
+blake256r8_4way_update(void *cc, const void *data, size_t len)
 {
   blake32_4way(cc, data, len);
 }
@@ -1337,7 +1337,7 @@ void blake256r8_8way_init(void *cc)
 }
 void
-blake256r8_8way(void *cc, const void *data, size_t len)
+blake256r8_8way_update(void *cc, const void *data, size_t len)
 {
   blake32_8way(cc, data, len);
 }
--- a/algo/cubehash/cube-hash-2way.c
+++ b/algo/cubehash/cube-hash-2way.c
@@ -64,10 +64,10 @@ static void transform_4way( cube_4way_context *sp )
        x1 = _mm512_xor_si512( x1, x5 );
        x2 = _mm512_xor_si512( x2, x6 );
        x3 = _mm512_xor_si512( x3, x7 );
-        x4 = mm512_swap64_128( x4 );
+        x4 = mm512_swap128_64( x4 );
-        x5 = mm512_swap64_128( x5 );
+        x5 = mm512_swap128_64( x5 );
-        x6 = mm512_swap64_128( x6 );
+        x6 = mm512_swap128_64( x6 );
-        x7 = mm512_swap64_128( x7 );
+        x7 = mm512_swap128_64( x7 );
        x4 = _mm512_add_epi32( x0, x4 );
        x5 = _mm512_add_epi32( x1, x5 );
        x6 = _mm512_add_epi32( x2, x6 );
@@ -82,10 +82,10 @@ static void transform_4way( cube_4way_context *sp )
        x1 = _mm512_xor_si512( x1, x5 );
        x2 = _mm512_xor_si512( x2, x6 );
        x3 = _mm512_xor_si512( x3, x7 );
-        x4 = mm512_swap32_64( x4 );
+        x4 = mm512_swap64_32( x4 );
-        x5 = mm512_swap32_64( x5 );
+        x5 = mm512_swap64_32( x5 );
-        x6 = mm512_swap32_64( x6 );
+        x6 = mm512_swap64_32( x6 );
-        x7 = mm512_swap32_64( x7 );
+        x7 = mm512_swap64_32( x7 );
    }
    _mm512_store_si512( (__m512i*)sp->h,     x0 );
@@ -239,10 +239,10 @@ static void transform_2way( cube_2way_context *sp )
        x1 = _mm256_xor_si256( x1, x5 );
        x2 = _mm256_xor_si256( x2, x6 );
        x3 = _mm256_xor_si256( x3, x7 );
-        x4 = mm256_swap64_128( x4 );
+        x4 = mm256_swap128_64( x4 );
-        x5 = mm256_swap64_128( x5 );
+        x5 = mm256_swap128_64( x5 );
-        x6 = mm256_swap64_128( x6 );
+        x6 = mm256_swap128_64( x6 );
-        x7 = mm256_swap64_128( x7 );
+        x7 = mm256_swap128_64( x7 );
        x4 = _mm256_add_epi32( x0, x4 );
        x5 = _mm256_add_epi32( x1, x5 );
        x6 = _mm256_add_epi32( x2, x6 );
@@ -257,10 +257,10 @@ static void transform_2way( cube_2way_context *sp )
        x1 = _mm256_xor_si256( x1, x5 );
        x2 = _mm256_xor_si256( x2, x6 );
        x3 = _mm256_xor_si256( x3, x7 );
-        x4 = mm256_swap32_64( x4 );
+        x4 = mm256_swap64_32( x4 );
-        x5 = mm256_swap32_64( x5 );
+        x5 = mm256_swap64_32( x5 );
-        x6 = mm256_swap32_64( x6 );
+        x6 = mm256_swap64_32( x6 );
-        x7 = mm256_swap32_64( x7 );
+        x7 = mm256_swap64_32( x7 );
    }
    _mm256_store_si256( (__m256i*)sp->h,     x0 );
--- a/algo/cubehash/cubehash_sse2.c
+++ b/algo/cubehash/cubehash_sse2.c
@@ -39,8 +39,8 @@ static void transform( cubehashParam *sp )
        x1 = mm256_rol_32( y0, 7 );
        x0 = _mm256_xor_si256( x0, x2 );
        x1 = _mm256_xor_si256( x1, x3 );
-        x2 = mm256_swap64_128( x2 );
+        x2 = mm256_swap128_64( x2 );
-        x3 = mm256_swap64_128( x3 );
+        x3 = mm256_swap128_64( x3 );
        x2 = _mm256_add_epi32( x0, x2 );
        x3 = _mm256_add_epi32( x1, x3 );
        y0 = mm256_swap_128( x0 );
@@ -49,8 +49,8 @@ static void transform( cubehashParam *sp )
        x1 = mm256_rol_32( y1, 11 );
        x0 = _mm256_xor_si256( x0, x2 );
        x1 = _mm256_xor_si256( x1, x3 );
-        x2 = mm256_swap32_64( x2 );
+        x2 = mm256_swap64_32( x2 );
-        x3 = mm256_swap32_64( x3 );
+        x3 = mm256_swap64_32( x3 );
    }
    _mm256_store_si256( (__m256i*)sp->x,     x0 );
--- a/algo/hamsi/hamsi-hash-4way.c
+++ b/algo/hamsi/hamsi-hash-4way.c
@@ -528,6 +528,346 @@ static const sph_u32 T512[64][16] = {
 	  SPH_C32(0xe7e00a94) }
 };
 // Working-state aliases: the sixteen names s0..sF used by the round macros
 // are mapped onto the message words m0..m7 and the chaining words c0..c7.
 // Any code that writes an s-name therefore writes the aliased m/c variable.
 #define s0   m0
 #define s1   c0
 #define s2   m1
 #define s3   c1
 #define s4   c2
 #define s5   m2
 #define s6   c3
 #define s7   m3
 #define s8   m4
 #define s9   c4
 #define sA   m5
 #define sB   c5
 #define sC   c6
 #define sD   m6
 #define sE   c7
 #define sF   m7
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
 // Hamsi 8 way 
 // Message expansion for one input vector.
 // Reads *buf and accumulates into m0..m7 (both captured by name from the
 // expansion site; m0..m7 must already be declared there).
 // For each of 64 iterations it isolates the lowest bit of every 64-bit
 // element of db, turns it into a mask (dm | dm << 32, then mm512_negate_32;
 // presumably all-ones or all-zero per 32-bit element - confirm against the
 // helper), and XORs the masked T512 words tp[0..7] into m0..m7. tp then
 // advances by eight 64-bit words and db is shifted right by one bit.
 // NOTE(review): T512 is declared as sph_u32 but is read through a
 // uint64_t pointer here; confirm alignment/aliasing assumptions.
 #define INPUT_BIG8 \
 do { \
  __m512i db = *buf; \
  const uint64_t *tp = (uint64_t*)&T512[0][0];  \
  m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m512_zero; \
  for ( int u = 0; u < 64; u++ ) \
  { \
     __m512i dm = _mm512_and_si512( db, m512_one_64 ) ; \
     dm = mm512_negate_32( _mm512_or_si512( dm, \
                                          _mm512_slli_epi64( dm, 32 ) ) ); \
     m0 = _mm512_xor_si512( m0, _mm512_and_si512( dm, \
                                          m512_const1_64( tp[0] ) ) ); \
     m1 = _mm512_xor_si512( m1, _mm512_and_si512( dm, \
                                          m512_const1_64( tp[1] ) ) ); \
     m2 = _mm512_xor_si512( m2, _mm512_and_si512( dm, \
                                          m512_const1_64( tp[2] ) ) ); \
     m3 = _mm512_xor_si512( m3, _mm512_and_si512( dm, \
                                          m512_const1_64( tp[3] ) ) ); \
     m4 = _mm512_xor_si512( m4, _mm512_and_si512( dm, \
                                          m512_const1_64( tp[4] ) ) ); \
     m5 = _mm512_xor_si512( m5, _mm512_and_si512( dm, \
                                          m512_const1_64( tp[5] ) ) ); \
     m6 = _mm512_xor_si512( m6, _mm512_and_si512( dm, \
                                          m512_const1_64( tp[6] ) ) ); \
     m7 = _mm512_xor_si512( m7, _mm512_and_si512( dm, \
                                          m512_const1_64( tp[7] ) ) ); \
     tp += 8; \
     db = _mm512_srli_epi64( db, 1 ); \
  } \
 } while (0)
 // Bitwise substitution step on four 512-bit vectors, built only from
 // and / or / xor / not. All four arguments are modified in place and each
 // is evaluated several times, so they must be plain lvalues without side
 // effects. The statement order is part of the function being computed.
 #define SBOX8( a, b, c, d ) \
 do { \
  __m512i t; \
  t = a; \
  a = _mm512_and_si512( a, c ); \
  a = _mm512_xor_si512( a, d ); \
  c = _mm512_xor_si512( c, b ); \
  c = _mm512_xor_si512( c, a ); \
  d = _mm512_or_si512( d, t ); \
  d = _mm512_xor_si512( d, b ); \
  t = _mm512_xor_si512( t, c ); \
  b = d; \
  d = _mm512_or_si512( d, t ); \
  d = _mm512_xor_si512( d, a ); \
  a = _mm512_and_si512( a, b ); \
  t = _mm512_xor_si512( t, a ); \
  b = _mm512_xor_si512( b, d ); \
  b = _mm512_xor_si512( b, t ); \
  a = c; \
  c = b; \
  b = d; \
  d = mm512_not( t ); \
 } while (0)
 // Linear mixing step on four 512-bit vectors treated as 32-bit elements:
 // rotates a and c, folds them (and a left-shifted copy) into b and d,
 // rotates b and d, folds those back into a and c, then rotates a and c
 // again. All four arguments are updated in place and evaluated more than
 // once, so they must be side-effect-free lvalues.
 #define L8( a, b, c, d ) \
 do { \
   a = mm512_rol_32( a, 13 ); \
   c = mm512_rol_32( c,  3 ); \
   b = _mm512_xor_si512( b, _mm512_xor_si512( a, c ) ); \
   d = _mm512_xor_si512( d, _mm512_xor_si512( c, \
                                              _mm512_slli_epi32( a, 3 ) ) ); \
   b = mm512_rol_32( b, 1 ); \
   d = mm512_rol_32( d, 7 ); \
   a = _mm512_xor_si512( a, _mm512_xor_si512( b, d ) ); \
   c = _mm512_xor_si512( c, _mm512_xor_si512( d, \
                                              _mm512_slli_epi32( b, 7 ) ) ); \
   a = mm512_rol_32( a,  5 ); \
   c = mm512_rol_32( c, 22 ); \
 } while (0)
 // Declares the eight chaining-value vectors c0..c7 used by the 8-way
 // state macros. The expansion already ends with a semicolon, so it is
 // used as a bare line without one.
 // The definition must end on its last line: a trailing backslash here
 // would splice whatever line follows into this macro's body.
 #define DECL_STATE_BIG8 \
   __m512i c0, c1, c2, c3, c4, c5, c6, c7;
 // Load the chaining value h[0..7] of the context pointed to by sc into
 // the local words c0..c7, which must already be declared at the
 // expansion site.
 #define READ_STATE_BIG8(sc) \
 do { \
   c0 = (sc)->h[0];   c1 = (sc)->h[1]; \
   c2 = (sc)->h[2];   c3 = (sc)->h[3]; \
   c4 = (sc)->h[4];   c5 = (sc)->h[5]; \
   c6 = (sc)->h[6];   c7 = (sc)->h[7]; \
 } while (0)
 // Store the local chaining words c0..c7 back into h[0..7] of the context
 // pointed to by sc.
 #define WRITE_STATE_BIG8(sc) \
 do { \
   (sc)->h[0] = c0;   (sc)->h[1] = c1; \
   (sc)->h[2] = c2;   (sc)->h[3] = c3; \
   (sc)->h[4] = c4;   (sc)->h[5] = c5; \
   (sc)->h[6] = c6;   (sc)->h[7] = c7; \
 } while (0)
 // One round over the sixteen working words s0..sF (aliases of m0..m7 and
 // c0..c7, which must be in scope at the expansion site).
 //   1. Constant addition: alpha is read as an array of sixteen 64-bit
 //      words; word i is XORed into s<i>, and for s0 the round number rc is
 //      additionally XORed into the upper 32 bits.
 //   2. Substitution: SBOX8 on the four groups (s0,s4,s8,sC), (s1,s5,s9,sD),
 //      (s2,s6,sA,sE), (s3,s7,sB,sF).
 //   3. Diffusion: six L8 steps. Operands that straddle two state words are
 //      gathered into temporaries with 32-bit-element blends (masks 0xaaaa /
 //      0x5555 select odd / even elements) and 4-byte shifts within each
 //      128-bit lane, then scattered back with the complementary blends.
 // NOTE(review): alpha is cast to uint64_t*; the caller-supplied tables
 // (alpha_n / alpha_f, defined outside this view) must be suitably aligned
 // and sized for sixteen 64-bit reads - confirm.
 #define ROUND_BIG8(rc, alpha) \
 do { \
   __m512i t0, t1, t2, t3; \
   s0 = _mm512_xor_si512( s0, m512_const1_64( \
                   ( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \
   s1 = _mm512_xor_si512( s1, m512_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \
   s2 = _mm512_xor_si512( s2, m512_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \
   s3 = _mm512_xor_si512( s3, m512_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \
   s4 = _mm512_xor_si512( s4, m512_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \
   s5 = _mm512_xor_si512( s5, m512_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \
   s6 = _mm512_xor_si512( s6, m512_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \
   s7 = _mm512_xor_si512( s7, m512_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \
   s8 = _mm512_xor_si512( s8, m512_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \
   s9 = _mm512_xor_si512( s9, m512_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \
   sA = _mm512_xor_si512( sA, m512_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \
   sB = _mm512_xor_si512( sB, m512_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \
   sC = _mm512_xor_si512( sC, m512_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \
   sD = _mm512_xor_si512( sD, m512_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \
   sE = _mm512_xor_si512( sE, m512_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \
   sF = _mm512_xor_si512( sF, m512_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \
 \
  SBOX8( s0, s4, s8, sC ); \
  SBOX8( s1, s5, s9, sD ); \
  SBOX8( s2, s6, sA, sE ); \
  SBOX8( s3, s7, sB, sF ); \
 \
  t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), \
                                        _mm512_bslli_epi128( s5, 4 ) ); \
  t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sD, 4 ), \
                                        _mm512_bslli_epi128( sE, 4 ) ); \
  L8( s0, t1, s9, t3 ); \
  s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t1, 4 ) ); \
  s5 = _mm512_mask_blend_epi32( 0x5555, s5, _mm512_bsrli_epi128( t1, 4 ) ); \
  sD = _mm512_mask_blend_epi32( 0xaaaa, sD, _mm512_bslli_epi128( t3, 4 ) ); \
  sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t3, 4 ) ); \
 \
  t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \
                                        _mm512_bslli_epi128( s6, 4 ) ); \
  t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sE, 4 ), \
                                        _mm512_bslli_epi128( sF, 4 ) ); \
  L8( s1, t1, sA, t3 ); \
  s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \
  s6 = _mm512_mask_blend_epi32( 0x5555, s6, _mm512_bsrli_epi128( t1, 4 ) ); \
  sE = _mm512_mask_blend_epi32( 0xaaaa, sE, _mm512_bslli_epi128( t3, 4 ) ); \
  sF = _mm512_mask_blend_epi32( 0x5555, sF, _mm512_bsrli_epi128( t3, 4 ) ); \
 \
  t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s6, 4 ), \
                                        _mm512_bslli_epi128( s7, 4 ) ); \
  t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sF, 4 ), \
                                        _mm512_bslli_epi128( sC, 4 ) ); \
  L8( s2, t1, sB, t3 ); \
  s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( t1, 4 ) ); \
  s7 = _mm512_mask_blend_epi32( 0x5555, s7, _mm512_bsrli_epi128( t1, 4 ) ); \
  sF = _mm512_mask_blend_epi32( 0xaaaa, sF, _mm512_bslli_epi128( t3, 4 ) ); \
  sC = _mm512_mask_blend_epi32( 0x5555, sC, _mm512_bsrli_epi128( t3, 4 ) ); \
 \
  t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s7, 4 ), \
                                        _mm512_bslli_epi128( s4, 4 ) ); \
  t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( sC, 4 ), \
                                        _mm512_bslli_epi128( sD, 4 ) ); \
  L8( s3, t1, s8, t3 ); \
  s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, _mm512_bslli_epi128( t1, 4 ) ); \
  s4 = _mm512_mask_blend_epi32( 0x5555, s4, _mm512_bsrli_epi128( t1, 4 ) ); \
  sC = _mm512_mask_blend_epi32( 0xaaaa, sC, _mm512_bslli_epi128( t3, 4 ) ); \
  sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t3, 4 ) ); \
 \
  t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, _mm512_bslli_epi128( s8, 4 ) ); \
  t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \
  t2 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s2, 4 ), sA ); \
  t3 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s3, 4 ), \
                                        _mm512_bslli_epi128( sB, 4 ) ); \
  L8( t0, t1, t2, t3 ); \
  s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \
  s8 = _mm512_mask_blend_epi32( 0x5555, s8, _mm512_bsrli_epi128( t0, 4 ) ); \
  s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \
  s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \
  s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, _mm512_bslli_epi128( t2, 4 ) ); \
  sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \
  s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, _mm512_bslli_epi128( t3, 4 ) ); \
  sB = _mm512_mask_blend_epi32( 0x5555, sB, _mm512_bsrli_epi128( t3, 4 ) ); \
 \
  t0 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s4, 4 ), sC ); \
  t1 = _mm512_mask_blend_epi32( 0xaaaa, _mm512_bsrli_epi128( s5, 4 ), \
                                        _mm512_bslli_epi128( sD, 4 ) ); \
  t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, _mm512_bslli_epi128( sE, 4 ) ); \
  t3 = _mm512_mask_blend_epi32( 0xaaaa, s7, sF ); \
  L8( t0, t1, t2, t3 ); \
  s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, _mm512_bslli_epi128( t0, 4 ) ); \
  sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t0 ); \
  s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, _mm512_bslli_epi128( t1, 4 ) ); \
  sD = _mm512_mask_blend_epi32( 0x5555, sD, _mm512_bsrli_epi128( t1, 4 ) ); \
  s6 = _mm512_mask_blend_epi32( 0x5555, s6, t2 ); \
  sE = _mm512_mask_blend_epi32( 0x5555, sE, _mm512_bsrli_epi128( t2, 4 ) ); \
  s7 = _mm512_mask_blend_epi32( 0x5555, s7, t3 ); \
  sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \
 } while (0)
 // Regular permutation: six ROUND_BIG8 rounds numbered 0..5 using the
 // alpha_n constant table (defined outside this view).
 #define P_BIG8 \
 do { \
   ROUND_BIG8(0, alpha_n); \
   ROUND_BIG8(1, alpha_n); \
   ROUND_BIG8(2, alpha_n); \
   ROUND_BIG8(3, alpha_n); \
   ROUND_BIG8(4, alpha_n); \
   ROUND_BIG8(5, alpha_n); \
 } while (0)
 // Final permutation: twelve ROUND_BIG8 rounds numbered 0..11 using the
 // alpha_f constant table (defined outside this view).
 #define PF_BIG8 \
 do { \
   ROUND_BIG8( 0, alpha_f); \
   ROUND_BIG8( 1, alpha_f); \
   ROUND_BIG8( 2, alpha_f); \
   ROUND_BIG8( 3, alpha_f); \
   ROUND_BIG8( 4, alpha_f); \
   ROUND_BIG8( 5, alpha_f); \
   ROUND_BIG8( 6, alpha_f); \
   ROUND_BIG8( 7, alpha_f); \
   ROUND_BIG8( 8, alpha_f); \
   ROUND_BIG8( 9, alpha_f); \
   ROUND_BIG8(10, alpha_f); \
   ROUND_BIG8(11, alpha_f); \
 } while (0)
 // Truncation / feed-forward: XORs s0..s3 and s8..sB into sc->h[0..7] and
 // mirrors each result into the matching local c0..c7.
 // The assignments run from c7 down to c0 because several s-names alias
 // c-names (s1 is c0, s3 is c1, s9 is c4, sB is c5): each aliased c word is
 // read as an s word before the statement that overwrites it.
 #define T_BIG8 \
 do { /* order is important */ \
   c7 = sc->h[ 0x7 ] = _mm512_xor_si512( sc->h[ 0x7 ], sB ); \
   c6 = sc->h[ 0x6 ] = _mm512_xor_si512( sc->h[ 0x6 ], sA ); \
   c5 = sc->h[ 0x5 ] = _mm512_xor_si512( sc->h[ 0x5 ], s9 ); \
   c4 = sc->h[ 0x4 ] = _mm512_xor_si512( sc->h[ 0x4 ], s8 ); \
   c3 = sc->h[ 0x3 ] = _mm512_xor_si512( sc->h[ 0x3 ], s3 ); \
   c2 = sc->h[ 0x2 ] = _mm512_xor_si512( sc->h[ 0x2 ], s2 ); \
   c1 = sc->h[ 0x1 ] = _mm512_xor_si512( sc->h[ 0x1 ], s1 ); \
   c0 = sc->h[ 0x0 ] = _mm512_xor_si512( sc->h[ 0x0 ], s0 ); \
 } while (0)
 // Absorbs num consecutive 512-bit vectors starting at buf into the context
 // sc: advances the 64-bit counter (count_high:count_low) by 64 per vector,
 // then for every vector runs message expansion (INPUT_BIG8), the six-round
 // permutation (P_BIG8) and the feed-forward (T_BIG8).
 // The local names sc, buf, m0..m7 and c0..c7 are referenced by those
 // macros and must not be renamed.
 void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num )
 {
   DECL_STATE_BIG8
   // Low 32 bits of num * 64.
   uint32_t tmp = num << 6;
   sc->count_low = SPH_T32( sc->count_low + tmp );
   // Bits of num * 64 that do not fit in the low 32-bit word.
   sc->count_high += (sph_u32)( (num >> 13) >> 13 );
   // The low word wrapped around: propagate the carry.
   if ( sc->count_low < tmp )
      sc->count_high++;
   READ_STATE_BIG8( sc );
   while ( num-- > 0 )
   {
      __m512i m0, m1, m2, m3, m4, m5, m6, m7;
      INPUT_BIG8;
      P_BIG8;
      T_BIG8;
      buf++;
   }
   WRITE_STATE_BIG8( sc );
 }
 // Processes one last 512-bit vector at buf with the twelve-round final
 // permutation (PF_BIG8) instead of P_BIG8, and writes the resulting
 // chaining value back to sc->h. Unlike hamsi_8way_big it does not touch
 // the counters. The local names sc, buf, m0..m7 and c0..c7 are referenced
 // by the macros and must not be renamed.
 void hamsi_8way_big_final( hamsi_8way_big_context *sc, __m512i *buf )
 {
   __m512i m0, m1, m2, m3, m4, m5, m6, m7;
   DECL_STATE_BIG8
   READ_STATE_BIG8( sc );
   INPUT_BIG8;
   PF_BIG8;
   T_BIG8;
   WRITE_STATE_BIG8( sc );
 }
 // Resets an 8-way Hamsi-512 context: zeroes the partial-length field and
 // both counter words, then loads the eight initial chaining words, each
 // expanded to a full vector by m512_const1_64.
 void hamsi512_8way_init( hamsi_8way_big_context *sc )
 {
   static const uint64_t initial_h[8] =
   {
      0x6c70617273746565, 0x656e62656b204172,
      0x302c206272672031, 0x3434362c75732032,
      0x3030312020422d33, 0x656e2d484c657576,
      0x6c65652c65766572, 0x6769756d2042656c
   };
   sc->count_low   = 0;
   sc->count_high  = 0;
   sc->partial_len = 0;
   for ( int i = 0; i < 8; i++ )
      sc->h[i] = m512_const1_64( initial_h[i] );
 }
 // Feeds len bytes (per lane) of interleaved input at data into the
 // context. Every complete 8-byte block is compressed immediately through
 // hamsi_8way_big; the remainder length (len mod 8) is recorded in
 // sc->partial_len.
 // NOTE(review): the remainder is always below 8, so the count passed to
 // memcpy_512 (remainder >> 3) is always 0; confirm whether buffering of a
 // partial block was intended. Callers appear to rely on len being a
 // multiple of 8.
 void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,
                           size_t len )
 {
   const size_t full_blocks = len >> 3;
   const size_t tail        = len & (size_t)7;
   __m512i *vdata = (__m512i*)data;
   hamsi_8way_big( sc, vdata, full_blocks );
   memcpy_512( sc->buf, vdata + full_blocks, tail >> 3 );
   sc->partial_len = tail;
 }
 // Finalizes the hash and writes the digest for all lanes to dst.
 // Steps: encode the two counter words with sph_enc32be (the low word
 // includes partial_len * 8), replicate the pair into every 64-bit element
 // of the length block, compress a padding block whose 64-bit elements are
 // 0x80, run the final permutation on the length block, and store sc->h
 // to dst through mm512_block_bswap_32.
 void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
 {
   int len_hi, len_lo;
   __m512i length_block[1];
   sph_enc32be( &len_hi, sc->count_high );
   sph_enc32be( &len_lo, sc->count_low + ( sc->partial_len << 3 ) );
   length_block[0] = _mm512_set_epi32( len_lo, len_hi, len_lo, len_hi,
                                       len_lo, len_hi, len_lo, len_hi,
                                       len_lo, len_hi, len_lo, len_hi,
                                       len_lo, len_hi, len_lo, len_hi );
   sc->buf[0] = m512_const1_64( 0x80 );
   hamsi_8way_big( sc, sc->buf, 1 );
   hamsi_8way_big_final( sc, length_block );
   mm512_block_bswap_32( (__m512i*)dst, sc->h );
 }
 #endif // AVX512
 // Hamsi 4 way
 #define INPUT_BIG \
 do { \
@@ -627,6 +967,7 @@ do { \
   sc->h[0x7] = c7; \
 } while (0)
 /*
 #define s0   m0
 #define s1   c0
 #define s2   m1
@@ -643,42 +984,28 @@ do { \
 #define sD   m6
 #define sE   c7
 #define sF   m7
 */
 #define ROUND_BIG(rc, alpha) \
 do { \
   __m256i t0, t1, t2, t3; \
   s0 = _mm256_xor_si256( s0, m256_const1_64( \
-        ( ( (uint64_t)( (rc) ^ alpha[1] ) << 32 ) ) | (uint64_t)alpha[0] ) ); \
+                   ( (uint64_t)(rc) << 32 ) ^ ( (uint64_t*)(alpha) )[ 0] ) ); \
-   s1 = _mm256_xor_si256( s1, m256_const1_64( \
+   s1 = _mm256_xor_si256( s1, m256_const1_64( ( (uint64_t*)(alpha) )[ 1] ) ); \
-        ( (uint64_t)alpha[ 3] << 32 ) | (uint64_t)alpha[ 2] ) ); \
+   s2 = _mm256_xor_si256( s2, m256_const1_64( ( (uint64_t*)(alpha) )[ 2] ) ); \
-   s2 = _mm256_xor_si256( s2, m256_const1_64( \
+   s3 = _mm256_xor_si256( s3, m256_const1_64( ( (uint64_t*)(alpha) )[ 3] ) ); \
-        ( (uint64_t)alpha[ 5] << 32 ) | (uint64_t)alpha[ 4] ) ); \
+   s4 = _mm256_xor_si256( s4, m256_const1_64( ( (uint64_t*)(alpha) )[ 4] ) ); \
-   s3 = _mm256_xor_si256( s3, m256_const1_64( \
+   s5 = _mm256_xor_si256( s5, m256_const1_64( ( (uint64_t*)(alpha) )[ 5] ) ); \
-        ( (uint64_t)alpha[ 7] << 32 ) | (uint64_t)alpha[ 6] ) ); \
+   s6 = _mm256_xor_si256( s6, m256_const1_64( ( (uint64_t*)(alpha) )[ 6] ) ); \
-   s4 = _mm256_xor_si256( s4, m256_const1_64( \
+   s7 = _mm256_xor_si256( s7, m256_const1_64( ( (uint64_t*)(alpha) )[ 7] ) ); \
-        ( (uint64_t)alpha[ 9] << 32 ) | (uint64_t)alpha[ 8] ) ); \
+   s8 = _mm256_xor_si256( s8, m256_const1_64( ( (uint64_t*)(alpha) )[ 8] ) ); \
-   s5 = _mm256_xor_si256( s5, m256_const1_64( \
+   s9 = _mm256_xor_si256( s9, m256_const1_64( ( (uint64_t*)(alpha) )[ 9] ) ); \
-        ( (uint64_t)alpha[11] << 32 ) | (uint64_t)alpha[10] ) ); \
+   sA = _mm256_xor_si256( sA, m256_const1_64( ( (uint64_t*)(alpha) )[10] ) ); \
-   s6 = _mm256_xor_si256( s6, m256_const1_64( \
+   sB = _mm256_xor_si256( sB, m256_const1_64( ( (uint64_t*)(alpha) )[11] ) ); \
-        ( (uint64_t)alpha[13] << 32 ) | (uint64_t)alpha[12] ) ); \
+   sC = _mm256_xor_si256( sC, m256_const1_64( ( (uint64_t*)(alpha) )[12] ) ); \
-   s7 = _mm256_xor_si256( s7, m256_const1_64( \
+   sD = _mm256_xor_si256( sD, m256_const1_64( ( (uint64_t*)(alpha) )[13] ) ); \
-        ( (uint64_t)alpha[15] << 32 ) | (uint64_t)alpha[14] ) ); \
+   sE = _mm256_xor_si256( sE, m256_const1_64( ( (uint64_t*)(alpha) )[14] ) ); \
-   s8 = _mm256_xor_si256( s8, m256_const1_64( \
+   sF = _mm256_xor_si256( sF, m256_const1_64( ( (uint64_t*)(alpha) )[15] ) ); \
        ( (uint64_t)alpha[17] << 32 ) | (uint64_t)alpha[16] ) ); \
   s9 = _mm256_xor_si256( s9, m256_const1_64( \
        ( (uint64_t)alpha[19] << 32 ) | (uint64_t)alpha[18] ) ); \
   sA = _mm256_xor_si256( sA, m256_const1_64( \
        ( (uint64_t)alpha[21] << 32 ) | (uint64_t)alpha[20] ) ); \
   sB = _mm256_xor_si256( sB, m256_const1_64( \
        ( (uint64_t)alpha[23] << 32 ) | (uint64_t)alpha[22] ) ); \
   sC = _mm256_xor_si256( sC, m256_const1_64( \
        ( (uint64_t)alpha[25] << 32 ) | (uint64_t)alpha[24] ) ); \
   sD = _mm256_xor_si256( sD, m256_const1_64( \
        ( (uint64_t)alpha[27] << 32 ) | (uint64_t)alpha[26] ) ); \
   sE = _mm256_xor_si256( sE, m256_const1_64( \
        ( (uint64_t)alpha[29] << 32 ) | (uint64_t)alpha[28] ) ); \
   sF = _mm256_xor_si256( sF, m256_const1_64( \
        ( (uint64_t)alpha[31] << 32 ) | (uint64_t)alpha[30] ) ); \
 \
  SBOX( s0, s4, s8, sC ); \
  SBOX( s1, s5, s9, sD ); \
--- a/algo/hamsi/hamsi-hash-4way.h
+++ b/algo/hamsi/hamsi-hash-4way.h
@@ -60,9 +60,32 @@ typedef struct {
 typedef hamsi_4way_big_context hamsi512_4way_context;
 void hamsi512_4way_init( hamsi512_4way_context *sc );
-void hamsi512_4way( hamsi512_4way_context *sc, const void *data, size_t len );
+void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data,
      size_t len );
 #define hamsi512_4way hamsi512_4way_update
 void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
 typedef struct {
   __m512i h[8];
   __m512i buf[1];
   size_t partial_len;
   sph_u32 count_high, count_low;
 } hamsi_8way_big_context;
 typedef hamsi_8way_big_context hamsi512_8way_context;
 void hamsi512_8way_init( hamsi512_8way_context *sc );
 void hamsi512_8way_update( hamsi512_8way_context *sc, const void *data,
                           size_t len );
 void hamsi512_8way_close( hamsi512_8way_context *sc, void *dst );
 #endif
 #ifdef __cplusplus
 }
 #endif
--- a/algo/lyra2/lyra2-gate.c
+++ b/algo/lyra2/lyra2-gate.c
@@ -44,8 +44,13 @@ bool lyra2rev3_thread_init()
 {
   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
   int size = ROW_LEN_BYTES * 4; // nRows;
-   int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
+#if defined(LYRA2REV3_16WAY)
 //   l2v3_wholeMatrix = _mm_malloc( 2*size, 128 );
   l2v3_wholeMatrix = _mm_malloc( 2*size, 64 );
   init_lyra2rev3_16way_ctx();;
 #else
   l2v3_wholeMatrix = _mm_malloc( size, 64 );
 #if defined (LYRA2REV3_8WAY)
   init_lyra2rev3_8way_ctx();;
@@ -53,13 +58,17 @@ bool lyra2rev3_thread_init()
   init_lyra2rev3_4way_ctx();;
 #else
   init_lyra2rev3_ctx();
 #endif
 #endif
   return l2v3_wholeMatrix;
 }
 bool register_lyra2rev3_algo( algo_gate_t* gate )
 {
-#if defined (LYRA2REV3_8WAY)
+#if defined(LYRA2REV3_16WAY)
  gate->scanhash  = (void*)&scanhash_lyra2rev3_16way;
  gate->hash      = (void*)&lyra2rev3_16way_hash;
 #elif defined (LYRA2REV3_8WAY)
  gate->scanhash  = (void*)&scanhash_lyra2rev3_8way;
  gate->hash      = (void*)&lyra2rev3_8way_hash;
 #elif defined (LYRA2REV3_4WAY)
@@ -69,6 +78,7 @@ bool register_lyra2rev3_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_lyra2rev3;
  gate->hash      = (void*)&lyra2rev3_hash;
 #endif
 //  gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
  gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT;
  gate->miner_thread_init = (void*)&lyra2rev3_thread_init;
  opt_target_factor = 256.0;
--- a/algo/lyra2/lyra2-gate.h
+++ b/algo/lyra2/lyra2-gate.h
@@ -5,18 +5,27 @@
 #include <stdint.h>
 #include "lyra2.h"
 //#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
 //  #define LYRA2REV3_16WAY 1
 //#elif defined(__AVX2__)
 #if defined(__AVX2__)
-  #define LYRA2REV3_8WAY
+  #define LYRA2REV3_8WAY 1
-#endif
+#elif defined(__SSE2__)
-
+  #define LYRA2REV3_4WAY 1
 #if defined(__SSE2__)
  #define LYRA2REV3_4WAY
 #endif
 extern __thread uint64_t* l2v3_wholeMatrix;
 bool register_lyra2rev3_algo( algo_gate_t* gate );
-#if defined(LYRA2REV3_8WAY)
+
 #if defined(LYRA2REV3_16WAY)
 void lyra2rev3_16way_hash( void *state, const void *input );
 int scanhash_lyra2rev3_16way( struct work *work, uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr );
 bool init_lyra2rev3_16way_ctx();
 #elif defined(LYRA2REV3_8WAY)
 void lyra2rev3_8way_hash( void *state, const void *input );
 int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce,
--- a/algo/lyra2/lyra2-hash-2way.c
+++ b/algo/lyra2/lyra2-hash-2way.c
@@ -1,715 +0,0 @@
 /**
 * Implementation of the Lyra2 Password Hashing Scheme (PHS).
 *
 * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
 *
 * This software is hereby placed in the public domain.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
 #include <mm_malloc.h>
 #include "compat.h"
 #include "lyra2.h"
 #include "sponge.h"
 /**
 * Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
 * whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits,
 * where "b" is the underlying sponge's bitrate). In this implementation, the "basil" is composed by all
 * integer parameters (treated as type "unsigned int") in the order they are provided, plus the value
 * of nCols, (i.e., basil = kLen || pwdlen || saltlen || timeCost || nRows || nCols).
 *
 * @param K The derived key to be output by the algorithm
 * @param kLen Desired key length
 * @param pwd User password
 * @param pwdlen Password length
 * @param salt Salt
 * @param saltlen Salt length
 * @param timeCost Parameter to determine the processing time (T)
 * @param nRows Number of rows of the memory matrix (R)
 * @param nCols Number of columns of the memory matrix (C)
 *
 * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
 */
 int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
               const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
               const uint64_t timeCost, const uint64_t nRows,
               const uint64_t nCols )
 {
   //====================== Basic variables ============================//
   uint64_t _ALIGN(256) state[16];
   int64_t row = 2; //index of row to be processed
   int64_t prev = 1; //index of prev (last row ever computed/modified)
   int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
   int64_t tau; //Time Loop iterator
   int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
   int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
   int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
 //   int64_t i; //auxiliary iteration counter
   int64_t v64; // 64bit var for memcpy
   //====================================================================/
   //=== Initializing the Memory Matrix and pointers to it =============//
   //Tries to allocate enough space for the whole memory matrix
   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
 //   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
   // for Lyra2REv2, nCols = 4, v1 was using 8
   const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;
   uint64_t *ptrWord = wholeMatrix;
 //   memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
   //=== Getting the password + salt + basil padded with 10*1 ==========//
   //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
   //but this ensures that the password copied locally will be overwritten as soon as possible
   //First, we clean enough blocks for the password, salt, basil and padding
   int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
                              / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
   byte *ptrByte = (byte*) wholeMatrix;
   //Prepends the password
   memcpy(ptrByte, pwd, pwdlen);
   ptrByte += pwdlen;
   //Concatenates the salt
   memcpy(ptrByte, salt, saltlen);
   ptrByte += saltlen;
   memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
                       - (saltlen + pwdlen) );
   //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
   memcpy(ptrByte, &kLen, sizeof(int64_t));
   ptrByte += sizeof(uint64_t);
   v64 = pwdlen;
   memcpy(ptrByte, &v64, sizeof(int64_t));
   ptrByte += sizeof(uint64_t);
   v64 = saltlen;
   memcpy(ptrByte, &v64, sizeof(int64_t));
   ptrByte += sizeof(uint64_t);
   v64 = timeCost;
   memcpy(ptrByte, &v64, sizeof(int64_t));
   ptrByte += sizeof(uint64_t);
   v64 = nRows;
   memcpy(ptrByte, &v64, sizeof(int64_t));
   ptrByte += sizeof(uint64_t);
   v64 = nCols;
   memcpy(ptrByte, &v64, sizeof(int64_t));
   ptrByte += sizeof(uint64_t);
   //Now comes the padding
   *ptrByte = 0x80; //first byte of padding: right after the password
   ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
   ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
   *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
 // from here on it's all simd acces to state and matrix
 // define vector pointers and adjust sizes and pointer offsets
   //================= Initializing the Sponge State ====================//
   //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
 //   initState( state );
   //========================= Setup Phase =============================//
   //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
   ptrWord = wholeMatrix;
   absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
 /*
   for (i = 0; i < nBlocksInput; i++)
   {
       absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
       ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
   }
 */
   //Initializes M[0] and M[1]
   reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
   reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
                      nCols);
   do
   {
      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
      reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
                             &wholeMatrix[rowa*ROW_LEN_INT64],
                             &wholeMatrix[row*ROW_LEN_INT64], nCols );
      //updates the value of row* (deterministically picked during Setup))
      rowa = (rowa + step) & (window - 1);
      //update prev: it now points to the last row ever computed
      prev = row;
      //updates row: goes to the next row to be computed
      row++;
      //Checks if all rows in the window where visited.
      if (rowa == 0)
      {
         step = window + gap; //changes the step: approximately doubles its value
         window *= 2; //doubles the size of the re-visitation window
         gap = -gap; //inverts the modifier to the step
      }
   } while (row < nRows);
   //===================== Wandering Phase =============================//
   row = 0; //Resets the visitation to the first row of the memory matrix
   for (tau = 1; tau <= timeCost; tau++)
   {
       //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
       step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
       do
       {
           //Selects a pseudorandom index row*
           //-----------------------------------------------
           rowa = state[0] & (unsigned int)(nRows-1);  //(USE THIS IF nRows IS A POWER OF 2)
           //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
           //-------------------------------------------
           //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
           reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
                             &wholeMatrix[rowa*ROW_LEN_INT64],
                             &wholeMatrix[row*ROW_LEN_INT64], nCols );
           //update prev: it now points to the last row ever computed
           prev = row;
           //updates row: goes to the next row to be computed
           //----------------------------------------------------
           row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
           //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
           //----------------------------------------------------
       } while (row != 0);
   }
   //===================== Wrap-up Phase ===============================//
   //Absorbs the last block of the memory matrix
   absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
   //Squeezes the key
   squeeze(state, K, (unsigned int) kLen);
   return 0;
 }
 /////////////////////////////////////////////////
 // 2 way 256
 // drop salt, salt len arguments, hard code some others.
 // Data is interleaved 2x256.
 // Maps a per-lane state word number to an offset inside that lane's part of
 // the interleaved state, using the formula the wandering loop already had.
 // NOTE(review): the accompanying comment in the original reads
 // "index = 2 * instance / 4 * 4 + instance % 4", which this expression does
 // not obviously implement - verify against the real 2x256 layout.
 static inline uint64_t lyra2_2way_lane_index( const uint64_t instance )
 {
   return ( ( ( instance & 0xf ) >> 3 ) << 2 ) + ( instance & 0x3 );
 }
 // Two-lane variant of Lyra2REv3 (work in progress).
 //
 // Builds pad( pwd || salt || basil ) at the start of wholeMatrix, absorbs
 // it, runs the Setup phase, then a Wandering phase in which each lane picks
 // its own row*, and finally squeezes kLen bytes into K. Always returns 0.
 //
 // Fixes relative to the previous revision:
 //  - the index0/index1 initialisers were missing their terminating ';'
 //  - rowa0/rowa1 were used without being declared
 //  - rowa0/rowa1 are now reduced with (nRows-1), as the single-lane code
 //    does, instead of indexing the matrix with a raw 64-bit state word
 //  - the second state lookup goes through the same lane index mapping as
 //    the first, so it can no longer read past the 16-word state array
 //
 // NOTE(review): still open - state holds only 16 words although the
 // comments describe it as interleaved 2x256, the input block is built for a
 // single lane, and the wrap-up absorbs the row selected by `rowa`, which the
 // wandering phase no longer updates.
 int LYRA2REV3_2WAY( uint64_t* wholeMatrix, void *K, uint64_t kLen,
      const void *pwd, const uint64_t pwdlen, const void *salt,
      const uint64_t saltlen, const uint64_t timeCost, const uint64_t nRows,
      const uint64_t nCols )
 {
   //====================== Basic variables ============================//
   uint64_t _ALIGN(256) state[16];
   int64_t row = 2; //index of row to be processed
   int64_t prev = 1; //index of prev (last row ever computed/modified)
   int64_t rowa = 0; //index of row* as left by the Setup phase
   int64_t tau; //Time Loop iterator
   int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
   int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
   int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
   int64_t v64; // 64bit var for memcpy
   uint64_t instance0 = 0; // Separate instance for each lane
   uint64_t instance1 = 0;
   //====================================================================/
   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
   const int64_t BLOCK_LEN = BLOCK_LEN_BLAKE2_SAFE_INT64;
   // Mask that keeps a row index inside the matrix; only valid when nRows
   // is a power of two (same assumption as the single-lane code).
   const uint64_t rowMask = (unsigned int)(nRows-1);
   //  Input layout built below:
   //  {   password,    (pwdlen bytes)
   //      salt,        (saltlen bytes)
   //      Klen,        (u64)
   //      pwdlen,      (u64)
   //      saltlen,     (u64)
   //      timecost,    (u64)
   //      nrows,       (u64)
   //      ncols,       (u64)
   //      0x80,        (byte)
   //      { 0 .. 0 },
   //      1            (byte)
   //   }
   int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
                              / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
   byte *ptrByte = (byte*) wholeMatrix;
   //Prepends the password
   memcpy(ptrByte, pwd, pwdlen);
   ptrByte += pwdlen;
   //Concatenates the salt
   memcpy(ptrByte, salt, saltlen);
   ptrByte += saltlen;
   //Clears the rest of the input blocks before the basil and padding go in
   memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
                       - (saltlen + pwdlen) );
   //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
   memcpy(ptrByte, &kLen, sizeof(int64_t));
   ptrByte += sizeof(uint64_t);
   v64 = pwdlen;
   memcpy(ptrByte, &v64, sizeof(int64_t));
   ptrByte += sizeof(uint64_t);
   v64 = saltlen;
   memcpy(ptrByte, &v64, sizeof(int64_t));
   ptrByte += sizeof(uint64_t);
   v64 = timeCost;
   memcpy(ptrByte, &v64, sizeof(int64_t));
   ptrByte += sizeof(uint64_t);
   v64 = nRows;
   memcpy(ptrByte, &v64, sizeof(int64_t));
   ptrByte += sizeof(uint64_t);
   v64 = nCols;
   memcpy(ptrByte, &v64, sizeof(int64_t));
   ptrByte += sizeof(uint64_t);
   //Now comes the padding
   *ptrByte = 0x80; //first byte of padding: right after the basil
   ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
   ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //end of the last input block
   *ptrByte ^= 0x01; //last byte of padding
   //========================= Setup Phase =============================//
   absorbBlockBlake2Safe( state, wholeMatrix, nBlocksInput, BLOCK_LEN );
   reducedSqueezeRow0( state, &wholeMatrix[0], nCols );
   reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
                      nCols);
   do
   {
      reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
                             &wholeMatrix[rowa*ROW_LEN_INT64],
                             &wholeMatrix[row*ROW_LEN_INT64], nCols );
      rowa = (rowa + step) & (window - 1);
      prev = row;
      row++;
      if (rowa == 0)
      {
         step = window + gap; //changes the step: approximately doubles its value
         window *= 2; //doubles the size of the re-visitation window
         gap = -gap; //inverts the modifier to the step
      }
   } while (row < nRows);
   //===================== Wandering Phase =============================//
   row = 0;
   for (tau = 1; tau <= timeCost; tau++)
   {
      step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1;
      do
      {
        // This part is not parallel: row* differs per lane, so each lane's
        // selector is read from the state separately. Lane 1 is addressed
        // through (state+4).
        const uint64_t index0 = lyra2_2way_lane_index( instance0 );
        const uint64_t index1 = lyra2_2way_lane_index( instance1 );
        instance0 = state[ index0 ] & 0xf;
        instance1 = (state+4)[ index1 ] & 0xf;
        // Single-lane equivalent of the next two lines:
        //    rowa = state[instance & 0xF] & (unsigned int)(nRows-1);
        const uint64_t rowa0 =
               state[ lyra2_2way_lane_index( instance0 ) ] & rowMask;
        const uint64_t rowa1 =
               (state+4)[ lyra2_2way_lane_index( instance1 ) ] & rowMask;
        reducedDuplexRow_2way( state, &wholeMatrix[prev*ROW_LEN_INT64],
                                      &wholeMatrix[rowa0*ROW_LEN_INT64],
                                      &wholeMatrix[rowa1*ROW_LEN_INT64],
                                      &wholeMatrix[row*ROW_LEN_INT64], nCols );
        // End of divergence.
        prev = row;
        row = (row + step) & (unsigned int)(nRows-1);
      } while ( row != 0 );
   }
   //===================== Wrap-up Phase ===============================//
   // NOTE(review): rowa still holds the Setup-phase value here; decide which
   // lane's row* (or both) must be absorbed.
   absorbBlock( state, &wholeMatrix[rowa*ROW_LEN_INT64] );
   squeeze( state, K, (unsigned int) kLen );
   return 0;
 }
 //////////////////////////////////////////////////
 int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
            const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
            const uint64_t timeCost, const uint64_t nRows,
            const uint64_t nCols )
 {
    //========================== Basic variables ============================//
    uint64_t _ALIGN(256) state[16];
    int64_t row = 2; //index of row to be processed
    int64_t prev = 1; //index of prev (last row ever computed/modified)
    int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
    int64_t tau; //Time Loop iterator
    int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
    int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
    int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
 //    int64_t i; //auxiliary iteration counter
    //=======================================================================/
    //======= Initializing the Memory Matrix and pointers to it =============//
    //Tries to allocate enough space for the whole memory matrix
    const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
 //    const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
 //    memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
    //==== Getting the password + salt + basil padded with 10*1 ============//
    //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
    //but this ensures that the password copied locally will be overwritten as soon as possible
    //First, we clean enough blocks for the password, salt, basil and padding
    uint64_t nBlocksInput = ( ( saltlen + pwdlen + 6 *
                       sizeof (uint64_t) ) / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
    byte *ptrByte = (byte*) wholeMatrix;
    memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES );
    //Prepends the password
    memcpy(ptrByte, pwd, pwdlen);
    ptrByte += pwdlen;
    //Concatenates the salt
    memcpy(ptrByte, salt, saltlen);
    ptrByte += saltlen;
    //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
    memcpy(ptrByte, &kLen, sizeof (uint64_t));
    ptrByte += sizeof (uint64_t);
    memcpy(ptrByte, &pwdlen, sizeof (uint64_t));
    ptrByte += sizeof (uint64_t);
    memcpy(ptrByte, &saltlen, sizeof (uint64_t));
    ptrByte += sizeof (uint64_t);
    memcpy(ptrByte, &timeCost, sizeof (uint64_t));
    ptrByte += sizeof (uint64_t);
    memcpy(ptrByte, &nRows, sizeof (uint64_t));
    ptrByte += sizeof (uint64_t);
    memcpy(ptrByte, &nCols, sizeof (uint64_t));
    ptrByte += sizeof (uint64_t);
    //Now comes the padding
    *ptrByte = 0x80; //first byte of padding: right after the password
    ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
    ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
    *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
    //=================== Initializing the Sponge State ====================//
    //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
 //        uint64_t *state = _mm_malloc(16 * sizeof(uint64_t), 32);
 //        if (state == NULL) {
 //                return -1;
 //        }
 //    initState( state );
    //============================== Setup Phase =============================//
    //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
    uint64_t *ptrWord = wholeMatrix;
    absorbBlockBlake2Safe( state, ptrWord, nBlocksInput,
                           BLOCK_LEN_BLAKE2_SAFE_INT64 );
 /*
    for ( i = 0; i < nBlocksInput; i++ )
    {
      absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
      ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil)
    }
 */
    //Initializes M[0] and M[1]
        reducedSqueezeRow0(state, &wholeMatrix[0], nCols); //The locally copied password is most likely overwritten here
        reducedDuplexRow1(state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], nCols);
        do {
                //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
                reducedDuplexRowSetup(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);
                //updates the value of row* (deterministically picked during Setup))
                rowa = (rowa + step) & (window - 1);
                //update prev: it now points to the last row ever computed
                prev = row;
                //updates row: goes to the next row to be computed
                row++;
                //Checks if all rows in the window where visited.
                if (rowa == 0) {
                        step = window + gap; //changes the step: approximately doubles its value
                        window *= 2; //doubles the size of the re-visitation window
                        gap = -gap; //inverts the modifier to the step
                }
        } while (row < nRows);
    //======================== Wandering Phase =============================//
    row = 0; //Resets the visitation to the first row of the memory matrix
    for ( tau = 1; tau <= timeCost; tau++ )
    {
        //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
        step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
        do {
        //Selects a pseudorandom index row*
        //----------------------------------------------------------------------
        //rowa = ((unsigned int)state[0]) & (nRows-1);  //(USE THIS IF nRows IS A POWER OF 2)
        rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
        //-----------------------------------------------------------------
        //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
                reducedDuplexRow(state, &wholeMatrix[prev*ROW_LEN_INT64], &wholeMatrix[rowa*ROW_LEN_INT64], &wholeMatrix[row*ROW_LEN_INT64], nCols);
        //update prev: it now points to the last row ever computed
        prev = row;
        //updates row: goes to the next row to be computed
        //---------------------------------------------------------------
        //row = (row + step) & (nRows-1);       //(USE THIS IF nRows IS A POWER OF 2)
        row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
        //--------------------------------------------------------------------
      } while (row != 0);
    }
    //========================= Wrap-up Phase ===============================//
    //Absorbs the last block of the memory matrix
    absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
    //Squeezes the key
    squeeze( state, K, kLen );
    return 0;
 }
 // Lyra2RE doesn't like the new wholeMatrix implementation
 int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
             const void *salt, const uint64_t saltlen, const uint64_t timeCost,
             const uint64_t nRows, const uint64_t nCols )
 {
   //====================== Basic variables ============================//
   uint64_t _ALIGN(256) state[16];
   int64_t row = 2; //index of row to be processed
   int64_t prev = 1; //index of prev (last row ever computed/modified)
   int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
   int64_t tau; //Time Loop iterator
   int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
   int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
   int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
   int64_t i; //auxiliary iteration counter
   int64_t v64; // 64bit var for memcpy
   //====================================================================/
   //=== Initializing the Memory Matrix and pointers to it =============//
   //Tries to allocate enough space for the whole memory matrix
   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
   // for Lyra2REv2, nCols = 4, v1 was using 8
   const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;
   i = (int64_t)ROW_LEN_BYTES * nRows;
   uint64_t *wholeMatrix = _mm_malloc( i, 64 );
   if (wholeMatrix == NULL)
      return -1;
 #if defined(__AVX2__)
   memset_zero_256( (__m256i*)wholeMatrix, i>>5 );
 #elif defined(__SSE2__)
   memset_zero_128( (__m128i*)wholeMatrix, i>>4 );   
 #else
   memset( wholeMatrix, 0, i );
 #endif
   uint64_t *ptrWord = wholeMatrix;
   //=== Getting the password + salt + basil padded with 10*1 ==========//
   //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
   //but this ensures that the password copied locally will be overwritten as soon as possible
   //First, we clean enough blocks for the password, salt, basil and padding
   int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
                              / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
   byte *ptrByte = (byte*) wholeMatrix;
   //Prepends the password
   memcpy(ptrByte, pwd, pwdlen);
   ptrByte += pwdlen;
   //Concatenates the salt
   memcpy(ptrByte, salt, saltlen);
   ptrByte += saltlen;
 //   memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
 //                       - (saltlen + pwdlen) );
   //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
   memcpy(ptrByte, &kLen, sizeof(int64_t));
   ptrByte += sizeof(uint64_t);
   v64 = pwdlen;
   memcpy(ptrByte, &v64, sizeof(int64_t));
   ptrByte += sizeof(uint64_t);
   v64 = saltlen;
   memcpy(ptrByte, &v64, sizeof(int64_t));
   ptrByte += sizeof(uint64_t);
   v64 = timeCost;
   memcpy(ptrByte, &v64, sizeof(int64_t));
   ptrByte += sizeof(uint64_t);
   v64 = nRows;
   memcpy(ptrByte, &v64, sizeof(int64_t));
   ptrByte += sizeof(uint64_t);
   v64 = nCols;
   memcpy(ptrByte, &v64, sizeof(int64_t));
   ptrByte += sizeof(uint64_t);
   //Now comes the padding
   *ptrByte = 0x80; //first byte of padding: right after the password
   ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
   ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
   *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
   //================= Initializing the Sponge State ====================//
   //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
 //   initState( state );
   //========================= Setup Phase =============================//
   //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
   ptrWord = wholeMatrix;
   absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
 /*
   for (i = 0; i < nBlocksInput; i++)
   {
       absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
       ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
   }
 */
   //Initializes M[0] and M[1]
   reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
   reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
                      nCols);
   do
   {
      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
      reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
                             &wholeMatrix[rowa*ROW_LEN_INT64],
                             &wholeMatrix[row*ROW_LEN_INT64], nCols );
      //updates the value of row* (deterministically picked during Setup))
      rowa = (rowa + step) & (window - 1);
      //update prev: it now points to the last row ever computed
      prev = row;
      //updates row: goes to the next row to be computed
      row++;
      //Checks if all rows in the window where visited.
      if (rowa == 0)
      {
         step = window + gap; //changes the step: approximately doubles its value
         window *= 2; //doubles the size of the re-visitation window
         gap = -gap; //inverts the modifier to the step
      }
   } while (row < nRows);
   //===================== Wandering Phase =============================//
   row = 0; //Resets the visitation to the first row of the memory matrix
   for (tau = 1; tau <= timeCost; tau++)
   {
       //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
       step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
       do
       {
           //Selects a pseudorandom index row*
           //-----------------------------------------------
           rowa = state[0] & (unsigned int)(nRows-1);  //(USE THIS IF nRows IS A POWER OF 2)
           //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
           //-------------------------------------------
           //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
           reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
                             &wholeMatrix[rowa*ROW_LEN_INT64],
                             &wholeMatrix[row*ROW_LEN_INT64], nCols );
           //update prev: it now points to the last row ever computed
           prev = row;
           //updates row: goes to the next row to be computed
           //----------------------------------------------------
           row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
           //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
           //----------------------------------------------------
       } while (row != 0);
   }
   //===================== Wrap-up Phase ===============================//
   //Absorbs the last block of the memory matrix
   absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
   //Squeezes the key
   squeeze(state, K, (unsigned int) kLen);
   //================== Freeing the memory =============================//
   _mm_free(wholeMatrix);
   return 0;
 }
--- a/algo/lyra2/lyra2.h
+++ b/algo/lyra2/lyra2.h
@@ -60,4 +60,15 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,
 int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
 int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
               uint64_t pwdlen, const void *salt, uint64_t saltlen,
               uint64_t timeCost, uint64_t nRows, uint64_t nCols );
 //int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
 //        uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );
 #endif
 #endif /* LYRA2_H_ */
--- a/algo/lyra2/sponge-2way.c
+++ b/algo/lyra2/sponge-2way.c
@@ -19,7 +19,7 @@
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
-#include "algo-gate.h"
+#include "algo-gate-api.h"
 #include <string.h>
 #include <stdio.h>
 #include <time.h>
@@ -27,7 +27,8 @@
 #include "sponge.h"
 #include "lyra2.h"
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if 0
 //#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
 inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
 {
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -65,14 +65,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
   b = mm512_ror_64( _mm512_xor_si512( b, c ), 63 );
 #define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
-   G_4X64( s0, s1, s2, s3 ); \
+   G2W_4X64( s0, s1, s2, s3 ); \
-   s1 = mm512_ror_1x64( s1); \
+   s1 = mm512_ror256_64( s1); \
-   s2 = mm512_swap128_256( s2 ); \
+   s2 = mm512_swap256_128( s2 ); \
-   s3 = mm512_rol1x64_256( s3 ); \
+   s3 = mm512_rol256_64( s3 ); \
-   G_4X64( s0, s1, s2, s3 ); \
+   G2W_4X64( s0, s1, s2, s3 ); \
-   s1 = mm512_rol1x64_256( s1 ); \
+   s1 = mm512_rol256_64( s1 ); \
-   s2 = mm512_swap128_256( s2 ); \
+   s2 = mm512_swap256_128( s2 ); \
-   s3 = mm512_ror1x64_256( s3 );
+   s3 = mm512_ror256_64( s3 );
 #define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
@@ -148,14 +148,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
 #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   G_2X64( s0, s2, s4, s6 ); \
   G_2X64( s1, s3, s5, s7 ); \
-   mm128_ror1x64_256( s2, s3 ); \
+   mm128_ror256_64( s2, s3 ); \
-   mm128_swap128_256( s4, s5 ); \
+   mm128_swap256_128( s4, s5 ); \
-   mm128_rol1x64_256( s6, s7 ); \
+   mm128_rol256_64( s6, s7 ); \
   G_2X64( s0, s2, s4, s6 ); \
   G_2X64( s1, s3, s5, s7 ); \
-   mm128_rol1x64_256( s2, s3 ); \
+   mm128_rol256_64( s2, s3 ); \
-   mm128_swap128_256( s4, s5 ); \
+   mm128_swap256_128( s4, s5 ); \
-   mm128_ror1x64_256( s6, s7 );
+   mm128_ror256_64( s6, s7 );
 #define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
--- a/algo/qubit/qubit-2way.c
+++ b/algo/qubit/qubit-2way.c
@@ -92,7 +92,6 @@ int scanhash_qubit_4way( struct work *work,uint32_t max_nonce,
 {
     uint32_t hash[4*8] __attribute__ ((aligned (128)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
     uint32_t endiandata[20] __attribute__((aligned(64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
--- a/algo/shabal/shabal-hash-4way.c
+++ b/algo/shabal/shabal-hash-4way.c
@@ -33,7 +33,7 @@
 #include <stddef.h>
 #include <string.h>
-#ifdef __AVX2__
+#ifdef __SSE4_1__
 #include "shabal-hash-4way.h"
 #ifdef __cplusplus
@@ -58,6 +58,599 @@ extern "C"{
 #define O2    9
 #define O3    6
 #if defined(__AVX2__)
 // Declares the local working variables that all the *8 macros below operate
 // on: twelve A words, sixteen B words, sixteen C words and sixteen message
 // words M, each a __m256i, plus the two 32-bit counter halves Wlow/Whigh.
 // The expansion already ends with a semicolon and is not wrapped in
 // do { } while (0), so it is used as a bare declaration line.
 #define DECL_STATE8   \
   __m256i A00, A01, A02, A03, A04, A05, A06, A07, \
           A08, A09, A0A, A0B; \
   __m256i B0, B1, B2, B3, B4, B5, B6, B7, \
           B8, B9, BA, BB, BC, BD, BE, BF; \
   __m256i C0, C1, C2, C3, C4, C5, C6, C7, \
           C8, C9, CA, CB, CC, CD, CE, CF; \
   __m256i M0, M1, M2, M3, M4, M5, M6, M7, \
           M8, M9, MA, MB, MC, MD, ME, MF; \
   sph_u32 Wlow, Whigh;
 // Loads the hash state of a context into the locals declared by DECL_STATE8.
 //   - state_loaded set:   A/B/C are copied from the context arrays.
 //   - state_loaded clear: the flag is set and A/B/C are filled with the
 //     immediate constants below instead. shabal_8way_init clears the flag
 //     only for size == 512, so these are the starting values of that variant.
 //     Each literal repeats one 32-bit word twice inside a 64-bit value;
 //     presumably m256_const1_64 replicates it so that every 32-bit element
 //     receives the same word -- confirm against its definition.
 // The counter halves Wlow/Whigh are always taken from the context.
 // Note: despite its name this macro writes (state)->state_loaded.
 #define READ_STATE8(state) do \
 { \
   if ( (state)->state_loaded ) \
   { \
      A00 = (state)->A[0]; \
      A01 = (state)->A[1]; \
      A02 = (state)->A[2]; \
      A03 = (state)->A[3]; \
      A04 = (state)->A[4]; \
      A05 = (state)->A[5]; \
      A06 = (state)->A[6]; \
      A07 = (state)->A[7]; \
      A08 = (state)->A[8]; \
      A09 = (state)->A[9]; \
      A0A = (state)->A[10]; \
      A0B = (state)->A[11]; \
      B0 = (state)->B[0]; \
      B1 = (state)->B[1]; \
      B2 = (state)->B[2]; \
      B3 = (state)->B[3]; \
      B4 = (state)->B[4]; \
      B5 = (state)->B[5]; \
      B6 = (state)->B[6]; \
      B7 = (state)->B[7]; \
      B8 = (state)->B[8]; \
      B9 = (state)->B[9]; \
      BA = (state)->B[10]; \
      BB = (state)->B[11]; \
      BC = (state)->B[12]; \
      BD = (state)->B[13]; \
      BE = (state)->B[14]; \
      BF = (state)->B[15]; \
      C0 = (state)->C[0]; \
      C1 = (state)->C[1]; \
      C2 = (state)->C[2]; \
      C3 = (state)->C[3]; \
      C4 = (state)->C[4]; \
      C5 = (state)->C[5]; \
      C6 = (state)->C[6]; \
      C7 = (state)->C[7]; \
      C8 = (state)->C[8]; \
      C9 = (state)->C[9]; \
      CA = (state)->C[10]; \
      CB = (state)->C[11]; \
      CC = (state)->C[12]; \
      CD = (state)->C[13]; \
      CE = (state)->C[14]; \
      CF = (state)->C[15]; \
   } \
   else \
   { \
       (state)->state_loaded = true; \
       A00 = m256_const1_64( 0x20728DFD20728DFD ); \
       A01 = m256_const1_64( 0x46C0BD5346C0BD53 ); \
       A02 = m256_const1_64( 0xE782B699E782B699 ); \
       A03 = m256_const1_64( 0x5530463255304632 ); \
       A04 = m256_const1_64( 0x71B4EF9071B4EF90 ); \
       A05 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \
       A06 = m256_const1_64( 0xDBB930F1DBB930F1 ); \
       A07 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \
       A08 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \
       A09 = m256_const1_64( 0x8BD144108BD14410 ); \
       A0A = m256_const1_64( 0x76D2ADAC76D2ADAC ); \
       A0B = m256_const1_64( 0x28ACAB7F28ACAB7F ); \
       B0 = m256_const1_64( 0xC1099CB7C1099CB7 ); \
       B1 = m256_const1_64( 0x07B385F307B385F3 ); \
       B2 = m256_const1_64( 0xE7442C26E7442C26 ); \
       B3 = m256_const1_64( 0xCC8AD640CC8AD640 ); \
       B4 = m256_const1_64( 0xEB6F56C7EB6F56C7 ); \
       B5 = m256_const1_64( 0x1EA81AA91EA81AA9 ); \
       B6 = m256_const1_64( 0x73B9D31473B9D314 ); \
       B7 = m256_const1_64( 0x1DE85D081DE85D08 ); \
       B8 = m256_const1_64( 0x48910A5A48910A5A ); \
       B9 = m256_const1_64( 0x893B22DB893B22DB ); \
       BA = m256_const1_64( 0xC5A0DF44C5A0DF44 ); \
       BB = m256_const1_64( 0xBBC4324EBBC4324E ); \
       BC = m256_const1_64( 0x72D2F24072D2F240 ); \
       BD = m256_const1_64( 0x75941D9975941D99 ); \
       BE = m256_const1_64( 0x6D8BDE826D8BDE82 ); \
       BF = m256_const1_64( 0xA1A7502BA1A7502B ); \
       C0 = m256_const1_64( 0xD9BF68D1D9BF68D1 ); \
       C1 = m256_const1_64( 0x58BAD75058BAD750 ); \
       C2 = m256_const1_64( 0x56028CB256028CB2 ); \
       C3 = m256_const1_64( 0x8134F3598134F359 ); \
       C4 = m256_const1_64( 0xB5D469D8B5D469D8 ); \
       C5 = m256_const1_64( 0x941A8CC2941A8CC2 ); \
       C6 = m256_const1_64( 0x418B2A6E418B2A6E ); \
       C7 = m256_const1_64( 0x0405278004052780 ); \
       C8 = m256_const1_64( 0x7F07D7877F07D787 ); \
       C9 = m256_const1_64( 0x5194358F5194358F ); \
       CA = m256_const1_64( 0x3C60D6653C60D665 ); \
       CB = m256_const1_64( 0xBE97D79ABE97D79A ); \
       CC = m256_const1_64( 0x950C3434950C3434 ); \
       CD = m256_const1_64( 0xAED9A06DAED9A06D ); \
       CE = m256_const1_64( 0x2537DC8D2537DC8D ); \
       CF = m256_const1_64( 0x7CDB59697CDB5969 ); \
   } \
   Wlow = (state)->Wlow; \
   Whigh = (state)->Whigh; \
 } while (0)
 // Stores the working A/B/C words and both counter halves back into the
 // context arrays. state_loaded is not touched here.
 #define WRITE_STATE8(state)   do { \
      (state)->A[0] = A00; \
      (state)->A[1] = A01; \
      (state)->A[2] = A02; \
      (state)->A[3] = A03; \
      (state)->A[4] = A04; \
      (state)->A[5] = A05; \
      (state)->A[6] = A06; \
      (state)->A[7] = A07; \
      (state)->A[8] = A08; \
      (state)->A[9] = A09; \
      (state)->A[10] = A0A; \
      (state)->A[11] = A0B; \
      (state)->B[0] = B0; \
      (state)->B[1] = B1; \
      (state)->B[2] = B2; \
      (state)->B[3] = B3; \
      (state)->B[4] = B4; \
      (state)->B[5] = B5; \
      (state)->B[6] = B6; \
      (state)->B[7] = B7; \
      (state)->B[8] = B8; \
      (state)->B[9] = B9; \
      (state)->B[10] = BA; \
      (state)->B[11] = BB; \
      (state)->B[12] = BC; \
      (state)->B[13] = BD; \
      (state)->B[14] = BE; \
      (state)->B[15] = BF; \
      (state)->C[0] = C0; \
      (state)->C[1] = C1; \
      (state)->C[2] = C2; \
      (state)->C[3] = C3; \
      (state)->C[4] = C4; \
      (state)->C[5] = C5; \
      (state)->C[6] = C6; \
      (state)->C[7] = C7; \
      (state)->C[8] = C8; \
      (state)->C[9] = C9; \
      (state)->C[10] = CA; \
      (state)->C[11] = CB; \
      (state)->C[12] = CC; \
      (state)->C[13] = CD; \
      (state)->C[14] = CE; \
      (state)->C[15] = CF; \
      (state)->Wlow = Wlow; \
      (state)->Whigh = Whigh; \
   } while (0)
 // Copies the sixteen buffered message vectors buf[0..15] into M0..MF.
 // Takes no argument: a local named `buf` (indexable as __m256i) must be in
 // scope where the macro is expanded.
 #define DECODE_BLOCK8 \
 do { \
   M0 = buf[ 0]; \
   M1 = buf[ 1]; \
   M2 = buf[ 2]; \
   M3 = buf[ 3]; \
   M4 = buf[ 4]; \
   M5 = buf[ 5]; \
   M6 = buf[ 6]; \
   M7 = buf[ 7]; \
   M8 = buf[ 8]; \
   M9 = buf[ 9]; \
   MA = buf[10]; \
   MB = buf[11]; \
   MC = buf[12]; \
   MD = buf[13]; \
   ME = buf[14]; \
   MF = buf[15]; \
 } while (0)
 // Adds message word Mn to state word Bn for n = 0..F, as independent
 // 32-bit element additions (_mm256_add_epi32).
 #define INPUT_BLOCK_ADD8 \
 do { \
    B0 = _mm256_add_epi32( B0, M0 );\
    B1 = _mm256_add_epi32( B1, M1 );\
    B2 = _mm256_add_epi32( B2, M2 );\
    B3 = _mm256_add_epi32( B3, M3 );\
    B4 = _mm256_add_epi32( B4, M4 );\
    B5 = _mm256_add_epi32( B5, M5 );\
    B6 = _mm256_add_epi32( B6, M6 );\
    B7 = _mm256_add_epi32( B7, M7 );\
    B8 = _mm256_add_epi32( B8, M8 );\
    B9 = _mm256_add_epi32( B9, M9 );\
    BA = _mm256_add_epi32( BA, MA );\
    BB = _mm256_add_epi32( BB, MB );\
    BC = _mm256_add_epi32( BC, MC );\
    BD = _mm256_add_epi32( BD, MD );\
    BE = _mm256_add_epi32( BE, ME );\
    BF = _mm256_add_epi32( BF, MF );\
 } while (0)
 // Subtracts message word Mn from state word Cn for n = 0..F, as independent
 // 32-bit element subtractions (_mm256_sub_epi32).
 #define INPUT_BLOCK_SUB8 \
 do { \
    C0 = _mm256_sub_epi32( C0, M0 ); \
    C1 = _mm256_sub_epi32( C1, M1 ); \
    C2 = _mm256_sub_epi32( C2, M2 ); \
    C3 = _mm256_sub_epi32( C3, M3 ); \
    C4 = _mm256_sub_epi32( C4, M4 ); \
    C5 = _mm256_sub_epi32( C5, M5 ); \
    C6 = _mm256_sub_epi32( C6, M6 ); \
    C7 = _mm256_sub_epi32( C7, M7 ); \
    C8 = _mm256_sub_epi32( C8, M8 ); \
    C9 = _mm256_sub_epi32( C9, M9 ); \
    CA = _mm256_sub_epi32( CA, MA ); \
    CB = _mm256_sub_epi32( CB, MB ); \
    CC = _mm256_sub_epi32( CC, MC ); \
    CD = _mm256_sub_epi32( CD, MD ); \
    CE = _mm256_sub_epi32( CE, ME ); \
    CF = _mm256_sub_epi32( CF, MF ); \
 } while (0)
 // Mixes the block counter into the state: the low counter half is broadcast
 // to every 32-bit element and XORed into A00, the high half into A01.
 #define XOR_W8 \
 do { \
    const __m256i w_lo_bcast = _mm256_set1_epi32( Wlow ); \
    const __m256i w_hi_bcast = _mm256_set1_epi32( Whigh ); \
    A00 = _mm256_xor_si256( A00, w_lo_bcast ); \
    A01 = _mm256_xor_si256( A01, w_hi_bcast ); \
 } while (0)
 // Applies mm256_swap512_256 to every (Bn, Cn) pair, n = 0..F.
 // NOTE(review): going by this macro's name the helper exchanges the two
 // vectors, i.e. the B and C halves of the state trade places; the helper is
 // defined outside this file -- confirm.
 #define SWAP_BC8 \
 do { \
    mm256_swap512_256( B0, C0 ); \
    mm256_swap512_256( B1, C1 ); \
    mm256_swap512_256( B2, C2 ); \
    mm256_swap512_256( B3, C3 ); \
    mm256_swap512_256( B4, C4 ); \
    mm256_swap512_256( B5, C5 ); \
    mm256_swap512_256( B6, C6 ); \
    mm256_swap512_256( B7, C7 ); \
    mm256_swap512_256( B8, C8 ); \
    mm256_swap512_256( B9, C9 ); \
    mm256_swap512_256( BA, CA ); \
    mm256_swap512_256( BB, CB ); \
    mm256_swap512_256( BC, CC ); \
    mm256_swap512_256( BD, CD ); \
    mm256_swap512_256( BE, CE ); \
    mm256_swap512_256( BF, CF ); \
 } while (0)
 // One element step of the permutation, on 32-bit elements:
 //   xa0 = xm ^ xb1 ^ (~xb3 & xb2) ^ ( ( xa0 ^ xc ^ ( rol(xa1,15) * 5 ) ) * 3 )
 //   xb0 = not( xa0 ^ rol(xb0,1) )          -- uses the NEW xa0
 // Statement order matters: xb0 must be computed after xa0 is updated.
 // xa0 and xb0 are both read and written; the other arguments are read only.
 // mm256_rol_32 / mm256_not are defined elsewhere; by name they are a 32-bit
 // rotate-left and a bitwise complement -- confirm.
 #define PERM_ELT8(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
 do { \
   xa0 = _mm256_xor_si256( xm, _mm256_xor_si256( xb1, _mm256_xor_si256(  \
            _mm256_andnot_si256( xb3, xb2 ), \
            _mm256_mullo_epi32( _mm256_xor_si256( xa0, _mm256_xor_si256( xc, \
               _mm256_mullo_epi32(  mm256_rol_32( xa1, 15 ), _mm256_set1_epi32(5UL) ) \
                   ) ), _mm256_set1_epi32(3UL) ) ) ) ); \
   xb0 = mm256_not( _mm256_xor_si256( xa0, mm256_rol_32( xb0, 1 ) ) ); \
 } while (0)
 // First of three permutation passes: sixteen PERM_ELT8 steps, one per B/M
 // word n = 0..F. For step n the operands are
 //   xb0 = B[n], xb1 = B[n+13], xb2 = B[n+9], xb3 = B[n+6]  (indices mod 16),
 //   xc  = C[8-n] (mod 16), xm = M[n].
 // The A word being updated walks A00..A0B and wraps (index mod 12), starting
 // at A00 in this pass; xa1 is always the A word written by the previous step
 // (A0B for the very first). The steps are order-dependent.
 #define PERM_STEP_0_8   do { \
      PERM_ELT8(A00, A0B, B0, BD, B9, B6, C8, M0); \
      PERM_ELT8(A01, A00, B1, BE, BA, B7, C7, M1); \
      PERM_ELT8(A02, A01, B2, BF, BB, B8, C6, M2); \
      PERM_ELT8(A03, A02, B3, B0, BC, B9, C5, M3); \
      PERM_ELT8(A04, A03, B4, B1, BD, BA, C4, M4); \
      PERM_ELT8(A05, A04, B5, B2, BE, BB, C3, M5); \
      PERM_ELT8(A06, A05, B6, B3, BF, BC, C2, M6); \
      PERM_ELT8(A07, A06, B7, B4, B0, BD, C1, M7); \
      PERM_ELT8(A08, A07, B8, B5, B1, BE, C0, M8); \
      PERM_ELT8(A09, A08, B9, B6, B2, BF, CF, M9); \
      PERM_ELT8(A0A, A09, BA, B7, B3, B0, CE, MA); \
      PERM_ELT8(A0B, A0A, BB, B8, B4, B1, CD, MB); \
      PERM_ELT8(A00, A0B, BC, B9, B5, B2, CC, MC); \
      PERM_ELT8(A01, A00, BD, BA, B6, B3, CB, MD); \
      PERM_ELT8(A02, A01, BE, BB, B7, B4, CA, ME); \
      PERM_ELT8(A03, A02, BF, BC, B8, B5, C9, MF); \
   } while (0)
 // Second permutation pass: the same sixteen B/C/M operand patterns as the
 // first pass, but the A word being updated starts at A04 (continuing the
 // mod-12 walk) and ends at A07. The steps are order-dependent.
 #define PERM_STEP_1_8   do { \
      PERM_ELT8(A04, A03, B0, BD, B9, B6, C8, M0); \
      PERM_ELT8(A05, A04, B1, BE, BA, B7, C7, M1); \
      PERM_ELT8(A06, A05, B2, BF, BB, B8, C6, M2); \
      PERM_ELT8(A07, A06, B3, B0, BC, B9, C5, M3); \
      PERM_ELT8(A08, A07, B4, B1, BD, BA, C4, M4); \
      PERM_ELT8(A09, A08, B5, B2, BE, BB, C3, M5); \
      PERM_ELT8(A0A, A09, B6, B3, BF, BC, C2, M6); \
      PERM_ELT8(A0B, A0A, B7, B4, B0, BD, C1, M7); \
      PERM_ELT8(A00, A0B, B8, B5, B1, BE, C0, M8); \
      PERM_ELT8(A01, A00, B9, B6, B2, BF, CF, M9); \
      PERM_ELT8(A02, A01, BA, B7, B3, B0, CE, MA); \
      PERM_ELT8(A03, A02, BB, B8, B4, B1, CD, MB); \
      PERM_ELT8(A04, A03, BC, B9, B5, B2, CC, MC); \
      PERM_ELT8(A05, A04, BD, BA, B6, B3, CB, MD); \
      PERM_ELT8(A06, A05, BE, BB, B7, B4, CA, ME); \
      PERM_ELT8(A07, A06, BF, BC, B8, B5, C9, MF); \
   } while (0)
 // Third permutation pass: the same sixteen B/C/M operand patterns again,
 // with the A word being updated starting at A08 and ending at A0B, which
 // completes the mod-12 walk (48 steps over 12 A words). Order-dependent.
 #define PERM_STEP_2_8   do { \
      PERM_ELT8(A08, A07, B0, BD, B9, B6, C8, M0); \
      PERM_ELT8(A09, A08, B1, BE, BA, B7, C7, M1); \
      PERM_ELT8(A0A, A09, B2, BF, BB, B8, C6, M2); \
      PERM_ELT8(A0B, A0A, B3, B0, BC, B9, C5, M3); \
      PERM_ELT8(A00, A0B, B4, B1, BD, BA, C4, M4); \
      PERM_ELT8(A01, A00, B5, B2, BE, BB, C3, M5); \
      PERM_ELT8(A02, A01, B6, B3, BF, BC, C2, M6); \
      PERM_ELT8(A03, A02, B7, B4, B0, BD, C1, M7); \
      PERM_ELT8(A04, A03, B8, B5, B1, BE, C0, M8); \
      PERM_ELT8(A05, A04, B9, B6, B2, BF, CF, M9); \
      PERM_ELT8(A06, A05, BA, B7, B3, B0, CE, MA); \
      PERM_ELT8(A07, A06, BB, B8, B4, B1, CD, MB); \
      PERM_ELT8(A08, A07, BC, B9, B5, B2, CC, MC); \
      PERM_ELT8(A09, A08, BD, BA, B6, B3, CB, MD); \
      PERM_ELT8(A0A, A09, BE, BB, B7, B4, CA, ME); \
      PERM_ELT8(A0B, A0A, BF, BC, B8, B5, C9, MF); \
   } while (0)
 // The full permutation applied once per block, in three stages:
 //   1. every B word goes through mm256_ror_32( .., 15 ) (by name a rotate
 //      right by 15 of each 32-bit element -- helper defined elsewhere,
 //      confirm);
 //   2. the three PERM_STEP_*_8 passes run in order;
 //   3. thirty-six 32-bit element additions fold C words into A, three C
 //      words per A word (for example A0B receives C6, CA and CE).
 // Stage order and the order of the passes must be preserved.
 #define APPLY_P8 \
 do { \
    B0 = mm256_ror_32( B0, 15 ); \
    B1 = mm256_ror_32( B1, 15 ); \
    B2 = mm256_ror_32( B2, 15 ); \
    B3 = mm256_ror_32( B3, 15 ); \
    B4 = mm256_ror_32( B4, 15 ); \
    B5 = mm256_ror_32( B5, 15 ); \
    B6 = mm256_ror_32( B6, 15 ); \
    B7 = mm256_ror_32( B7, 15 ); \
    B8 = mm256_ror_32( B8, 15 ); \
    B9 = mm256_ror_32( B9, 15 ); \
    BA = mm256_ror_32( BA, 15 ); \
    BB = mm256_ror_32( BB, 15 ); \
    BC = mm256_ror_32( BC, 15 ); \
    BD = mm256_ror_32( BD, 15 ); \
    BE = mm256_ror_32( BE, 15 ); \
    BF = mm256_ror_32( BF, 15 ); \
    PERM_STEP_0_8; \
    PERM_STEP_1_8; \
    PERM_STEP_2_8; \
    A0B = _mm256_add_epi32( A0B, C6 ); \
    A0A = _mm256_add_epi32( A0A, C5 ); \
    A09 = _mm256_add_epi32( A09, C4 ); \
    A08 = _mm256_add_epi32( A08, C3 ); \
    A07 = _mm256_add_epi32( A07, C2 ); \
    A06 = _mm256_add_epi32( A06, C1 ); \
    A05 = _mm256_add_epi32( A05, C0 ); \
    A04 = _mm256_add_epi32( A04, CF ); \
    A03 = _mm256_add_epi32( A03, CE ); \
    A02 = _mm256_add_epi32( A02, CD ); \
    A01 = _mm256_add_epi32( A01, CC ); \
    A00 = _mm256_add_epi32( A00, CB ); \
    A0B = _mm256_add_epi32( A0B, CA ); \
    A0A = _mm256_add_epi32( A0A, C9 ); \
    A09 = _mm256_add_epi32( A09, C8 ); \
    A08 = _mm256_add_epi32( A08, C7 ); \
    A07 = _mm256_add_epi32( A07, C6 ); \
    A06 = _mm256_add_epi32( A06, C5 ); \
    A05 = _mm256_add_epi32( A05, C4 ); \
    A04 = _mm256_add_epi32( A04, C3 ); \
    A03 = _mm256_add_epi32( A03, C2 ); \
    A02 = _mm256_add_epi32( A02, C1 ); \
    A01 = _mm256_add_epi32( A01, C0 ); \
    A00 = _mm256_add_epi32( A00, CF ); \
    A0B = _mm256_add_epi32( A0B, CE ); \
    A0A = _mm256_add_epi32( A0A, CD ); \
    A09 = _mm256_add_epi32( A09, CC ); \
    A08 = _mm256_add_epi32( A08, CB ); \
    A07 = _mm256_add_epi32( A07, CA ); \
    A06 = _mm256_add_epi32( A06, C9 ); \
    A05 = _mm256_add_epi32( A05, C8 ); \
    A04 = _mm256_add_epi32( A04, C7 ); \
    A03 = _mm256_add_epi32( A03, C6 ); \
    A02 = _mm256_add_epi32( A02, C5 ); \
    A01 = _mm256_add_epi32( A01, C4 ); \
    A00 = _mm256_add_epi32( A00, C3 ); \
 } while (0)
 // Advances the block counter, which is kept as two 32-bit halves: the low
 // half is incremented and, when it wraps to zero, the carry is propagated
 // into the high half. T32 truncates each result to 32 bits.
 #define INCR_W8 \
 do { \
    Wlow = T32( Wlow + 1 ); \
    if ( Wlow == 0 ) \
    { \
       Whigh = T32( Whigh + 1 ); \
    } \
 } while (0)
 static void
 shabal_8way_init( void *cc, unsigned size )
 {
   shabal_8way_context *sc = (shabal_8way_context*)cc;
   if ( size == 512 )
   { // copy immediate constants directly to working registers later.
       sc->state_loaded = false;
   }
   else
   {  // No users
       sc->state_loaded = true;
       sc->A[ 0] = m256_const1_64( 0x52F8455252F84552 );
       sc->A[ 1] = m256_const1_64( 0xE54B7999E54B7999 );
       sc->A[ 2] = m256_const1_64( 0x2D8EE3EC2D8EE3EC );
       sc->A[ 3] = m256_const1_64( 0xB9645191B9645191 );
       sc->A[ 4] = m256_const1_64( 0xE0078B86E0078B86 );
       sc->A[ 5] = m256_const1_64( 0xBB7C44C9BB7C44C9 );
       sc->A[ 6] = m256_const1_64( 0xD2B5C1CAD2B5C1CA );
       sc->A[ 7] = m256_const1_64( 0xB0D2EB8CB0D2EB8C );
       sc->A[ 8] = m256_const1_64( 0x14CE5A4514CE5A45 );
       sc->A[ 9] = m256_const1_64( 0x22AF50DC22AF50DC );
       sc->A[10] = m256_const1_64( 0xEFFDBC6BEFFDBC6B );
       sc->A[11] = m256_const1_64( 0xEB21B74AEB21B74A );
       sc->B[ 0] = m256_const1_64( 0xB555C6EEB555C6EE );
       sc->B[ 1] = m256_const1_64( 0x3E7105963E710596 );
       sc->B[ 2] = m256_const1_64( 0xA72A652FA72A652F );
       sc->B[ 3] = m256_const1_64( 0x9301515F9301515F );
       sc->B[ 4] = m256_const1_64( 0xDA28C1FADA28C1FA );
       sc->B[ 5] = m256_const1_64( 0x696FD868696FD868 );
       sc->B[ 6] = m256_const1_64( 0x9CB6BF729CB6BF72 );
       sc->B[ 7] = m256_const1_64( 0x0AFE40020AFE4002 );
       sc->B[ 8] = m256_const1_64( 0xA6E03615A6E03615 );
       sc->B[ 9] = m256_const1_64( 0x5138C1D45138C1D4 );
       sc->B[10] = m256_const1_64( 0xBE216306BE216306 );
       sc->B[11] = m256_const1_64( 0xB38B8890B38B8890 );
       sc->B[12] = m256_const1_64( 0x3EA8B96B3EA8B96B );
       sc->B[13] = m256_const1_64( 0x3299ACE43299ACE4 );
       sc->B[14] = m256_const1_64( 0x30924DD430924DD4 );
       sc->B[15] = m256_const1_64( 0x55CB34A555CB34A5 );
       sc->C[ 0] = m256_const1_64( 0xB405F031B405F031 );
       sc->C[ 1] = m256_const1_64( 0xC4233EBAC4233EBA );
       sc->C[ 2] = m256_const1_64( 0xB3733979B3733979 );
       sc->C[ 3] = m256_const1_64( 0xC0DD9D55C0DD9D55 );
       sc->C[ 4] = m256_const1_64( 0xC51C28AEC51C28AE );
       sc->C[ 5] = m256_const1_64( 0xA327B8E1A327B8E1 );
       sc->C[ 6] = m256_const1_64( 0x56C5616756C56167 );
       sc->C[ 7] = m256_const1_64( 0xED614433ED614433 );
       sc->C[ 8] = m256_const1_64( 0x88B59D6088B59D60 );
       sc->C[ 9] = m256_const1_64( 0x60E2CEBA60E2CEBA );
       sc->C[10] = m256_const1_64( 0x758B4B8B758B4B8B );
       sc->C[11] = m256_const1_64( 0x83E82A7F83E82A7F );
       sc->C[12] = m256_const1_64( 0xBC968828BC968828 );
       sc->C[13] = m256_const1_64( 0xE6E00BF7E6E00BF7 );
       sc->C[14] = m256_const1_64( 0xBA839E55BA839E55 );
       sc->C[15] = m256_const1_64( 0x9B491C609B491C60 );
   }
    sc->Wlow = 1;
    sc->Whigh = 0;
    sc->ptr = 0;
 }
 static void
 shabal_8way_core( void *cc, const unsigned char *data, size_t len )
 {
   shabal_8way_context *sc = (shabal_8way_context*)cc;
    __m256i *buf;
    __m256i *vdata = (__m256i*)data;
   const int buf_size = 64;
   size_t ptr;
   DECL_STATE8
   buf = sc->buf;
   ptr = sc->ptr;
   if ( len < (buf_size - ptr ) )
   {
      memcpy_256( buf + (ptr>>2), vdata, len>>2 );
      ptr += len;
      sc->ptr = ptr;
      return;
   }
   READ_STATE8( sc );
   while ( len > 0 )
   {
      size_t clen;
      clen = buf_size - ptr;
      if ( clen > len )
         clen = len;
      memcpy_256( buf + (ptr>>2), vdata, clen>>2 );
      ptr += clen;
      vdata += clen>>2;
      len -= clen;
      if ( ptr == buf_size )
      {
         DECODE_BLOCK8;
         INPUT_BLOCK_ADD8;
         XOR_W8;
         APPLY_P8;
         INPUT_BLOCK_SUB8;
         SWAP_BC8;
         INCR_W8;
         ptr = 0;
      }
   }
   WRITE_STATE8(sc);
   sc->ptr = ptr;
 }
 static void
 shabal_8way_close( void *cc, unsigned ub, unsigned n, void *dst,
                   unsigned size_words )
 {
   shabal_8way_context *sc = (shabal_8way_context*)cc;
    __m256i *buf;
   const int buf_size = 64;
   size_t ptr;
   int i;
   unsigned z, zz;
   DECL_STATE8
   buf = sc->buf;
   ptr = sc->ptr;
   z = 0x80 >> n;
   zz = ((ub & -z) | z) & 0xFF;
   buf[ptr>>2] = _mm256_set1_epi32( zz );
   memset_zero_256( buf + (ptr>>2) + 1, ( (buf_size - ptr) >> 2 ) - 1 );
   READ_STATE8(sc);
   DECODE_BLOCK8;
   INPUT_BLOCK_ADD8;
   XOR_W8;
   APPLY_P8;
   for ( i = 0; i < 3; i ++ )
   {
      SWAP_BC8;
      XOR_W8;
      APPLY_P8;
   }
   __m256i *d = (__m256i*)dst;
   if ( size_words == 16 )   // 512
   {
      d[ 0] = B0; d[ 1] = B1; d[ 2] = B2; d[ 3] = B3;
      d[ 4] = B4; d[ 5] = B5; d[ 6] = B6; d[ 7] = B7;
      d[ 8] = B8; d[ 9] = B9; d[10] = BA; d[11] = BB;
      d[12] = BC; d[13] = BD; d[14] = BE; d[15] = BF;
   }
   else    // 256
   {
      d[ 0] = B8; d[ 1] = B9; d[ 2] = BA; d[ 3] = BB;
      d[ 4] = BC; d[ 5] = BD; d[ 6] = BE; d[ 7] = BF;
   }
 }
 // Starts a new 8-way hash with a 256-bit digest in context cc.
 void shabal256_8way_init( void *cc )
 {
    const unsigned digest_bits = 256;
    shabal_8way_init( cc, digest_bits );
 }
 // Feeds len bytes per lane of input into the 256-bit 8-way context cc.
 void shabal256_8way_update( void *cc, const void *data, size_t len )
 {
    const unsigned char *bytes = (const unsigned char*)data;
    shabal_8way_core( cc, bytes, len );
 }
 // Finishes the 256-bit 8-way hash with no extra bits and writes eight
 // vectors to dst.
 void shabal256_8way_close( void *cc, void *dst )
 {
    const unsigned out_words = 8;
    shabal_8way_close( cc, 0, 0, dst, out_words );
 }
 // Finishes the 256-bit 8-way hash after appending n extra bits taken from
 // ub, and writes eight vectors to dst.
 void shabal256_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
                                        void *dst )
 {
    const unsigned out_words = 8;
    shabal_8way_close( cc, ub, n, dst, out_words );
 }
 // Starts a new 8-way hash with a 512-bit digest in context cc.
 void shabal512_8way_init( void *cc )
 {
    const unsigned digest_bits = 512;
    shabal_8way_init( cc, digest_bits );
 }
 // Feeds len bytes per lane of input into the 512-bit 8-way context cc.
 void shabal512_8way_update( void *cc, const void *data, size_t len )
 {
    const unsigned char *bytes = (const unsigned char*)data;
    shabal_8way_core( cc, bytes, len );
 }
 // Finishes the 512-bit 8-way hash with no extra bits and writes sixteen
 // vectors to dst.
 void shabal512_8way_close( void *cc, void *dst )
 {
    const unsigned out_words = 16;
    shabal_8way_close( cc, 0, 0, dst, out_words );
 }
 // Finishes the 512-bit 8-way hash after appending n extra bits taken from
 // ub, and writes sixteen vectors to dst.
 void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
                                        void *dst )
 {
    const unsigned out_words = 16;
    shabal_8way_close( cc, ub, n, dst, out_words );
 }
 #endif  // AVX2
 /*
 * We copy the state into local variables, so that the compiler knows
 * that it can optimize them at will.
@@ -290,6 +883,8 @@ do { \
   A00 = _mm_xor_si128( A00, _mm_set1_epi32( Wlow ) ); \
   A01 = _mm_xor_si128( A01, _mm_set1_epi32( Whigh ) ); \
 } while (0)
 /*
 #define SWAP(v1, v2)   do { \
 		sph_u32 tmp = (v1); \
@@ -297,26 +892,39 @@ do { \
 		(v2) = tmp; \
 	} while (0)
 */
 #define SWAP_BC \
 do { \
-    mm128_swap128_256( B0, C0 ); \
+    mm128_swap256_128( B0, C0 ); \
-    mm128_swap128_256( B1, C1 ); \
+    mm128_swap256_128( B1, C1 ); \
-    mm128_swap128_256( B2, C2 ); \
+    mm128_swap256_128( B2, C2 ); \
-    mm128_swap128_256( B3, C3 ); \
+    mm128_swap256_128( B3, C3 ); \
-    mm128_swap128_256( B4, C4 ); \
+    mm128_swap256_128( B4, C4 ); \
-    mm128_swap128_256( B5, C5 ); \
+    mm128_swap256_128( B5, C5 ); \
-    mm128_swap128_256( B6, C6 ); \
+    mm128_swap256_128( B6, C6 ); \
-    mm128_swap128_256( B7, C7 ); \
+    mm128_swap256_128( B7, C7 ); \
-    mm128_swap128_256( B8, C8 ); \
+    mm128_swap256_128( B8, C8 ); \
-    mm128_swap128_256( B9, C9 ); \
+    mm128_swap256_128( B9, C9 ); \
-    mm128_swap128_256( BA, CA ); \
+    mm128_swap256_128( BA, CA ); \
-    mm128_swap128_256( BB, CB ); \
+    mm128_swap256_128( BB, CB ); \
-    mm128_swap128_256( BC, CC ); \
+    mm128_swap256_128( BC, CC ); \
-    mm128_swap128_256( BD, CD ); \
+    mm128_swap256_128( BD, CD ); \
-    mm128_swap128_256( BE, CE ); \
+    mm128_swap256_128( BE, CE ); \
-    mm128_swap128_256( BF, CF ); \
+    mm128_swap256_128( BF, CF ); \
 } while (0)
 /*
 #define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
 do { \
  __m128i t1 = _mm_mullo_epi32(  mm_rol_32( xa1, 15 ),\
                                   _mm_set1_epi32(5UL) ) \
  __m128i t2 = _mm_xor_si128( xa0, xc ); \
  xb0 = mm_not( _mm_xor_si256( xa0, mm_rol_32( xb0, 1 ) ) ); \
  xa0 = mm_xor4( xm, xb1, _mm_andnot_si128( xb3, xb2 ), \
              _mm_xor_si128( t2, \
                      _mm_mullo_epi32( t1, _mm_set1_epi32(5UL) ) ) ) \
 */
 #define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
 do { \
   xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128(  \
@@ -706,7 +1314,7 @@ shabal256_4way_init( void *cc )
 }
 void
-shabal256_4way( void *cc, const void *data, size_t len )
+shabal256_4way_update( void *cc, const void *data, size_t len )
 {
 	shabal_4way_core( cc, data, len );
 }
@@ -731,7 +1339,7 @@ shabal512_4way_init(void *cc)
 }
 void
-shabal512_4way(void *cc, const void *data, size_t len)
+shabal512_4way_update(void *cc, const void *data, size_t len)
 {
 	shabal_4way_core(cc, data, len);
 }
--- a/algo/shabal/shabal-hash-4way.h
+++ b/algo/shabal/shabal-hash-4way.h
@@ -36,7 +36,7 @@
 #ifndef SHABAL_HASH_4WAY_H__
 #define SHABAL_HASH_4WAY_H__ 1
-#ifdef __AVX2__
+#ifdef __SSE4_1__
 #include <stddef.h>
 #include "algo/sha/sph_types.h"
@@ -50,6 +50,34 @@ extern "C"{
 #define SPH_SIZE_shabal512   512
 #if defined(__AVX2__)
 // Context for the 8-way Shabal implementation in shabal-hash-4way.c.
 // Each __m256i is processed as 32-bit elements by that code.
 typedef struct {
   __m256i buf[16];              // pending input block, one vector per 32-bit word position
   __m256i A[12], B[16], C[16];  // saved state words; valid only while state_loaded is true
   sph_u32 Whigh, Wlow;          // block counter as two 32-bit halves (init sets Wlow = 1, Whigh = 0)
   size_t ptr;                   // bytes currently buffered (0..63); buf is indexed with ptr >> 2
   bool state_loaded;            // false: READ_STATE8 substitutes built-in constants for A/B/C
 } shabal_8way_context __attribute__ ((aligned (64)));
 // Both digest sizes share one context layout.
 typedef shabal_8way_context shabal256_8way_context;
 typedef shabal_8way_context shabal512_8way_context;
 // 256-bit variant: init resets the context, update absorbs len bytes per
 // lane, close pads and writes eight __m256i to dst. addbits_and_close
 // additionally appends the top n bits of ub before padding.
 void shabal256_8way_init( void *cc );
 void shabal256_8way_update( void *cc, const void *data, size_t len );
 void shabal256_8way_close( void *cc, void *dst );
 void shabal256_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
                                       void *dst );
 // 512-bit variant: same calling pattern, close writes sixteen __m256i.
 void shabal512_8way_init( void *cc );
 void shabal512_8way_update( void *cc, const void *data, size_t len );
 void shabal512_8way_close( void *cc, void *dst );
 void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
                                       void *dst );
 #endif
 typedef struct {
 	__m128i buf[16] __attribute__ ((aligned (64)));
 	__m128i A[12], B[16], C[16];
@@ -62,13 +90,14 @@ typedef shabal_4way_context shabal256_4way_context;
 typedef shabal_4way_context shabal512_4way_context;
 void shabal256_4way_init( void *cc );
-void shabal256_4way( void *cc, const void *data, size_t len );
+void shabal256_4way_update( void *cc, const void *data, size_t len );
 void shabal256_4way_close( void *cc, void *dst );
 void shabal256_4way_addbits_and_close(	void *cc, unsigned ub, unsigned n,
                                       void *dst );
 void shabal512_4way_init( void *cc );
-void shabal512_4way( void *cc, const void *data, size_t len );
+void shabal512_4way_update( void *cc, const void *data, size_t len );
 #define shabal512_4way shabal512_4way_update
 void shabal512_4way_close( void *cc, void *dst );
 void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
                                       void *dst );
--- a/algo/shavite/shavite-hash-2way.c
+++ b/algo/shavite/shavite-hash-2way.c
@@ -16,8 +16,8 @@ static const uint32_t IV512[] =
 #define mm256_ror2x256hi_1x32( a, b ) \
-   _mm256_blend_epi32( mm256_ror1x32_128( a ), \
+   _mm256_blend_epi32( mm256_ror128_32( a ), \
-                       mm256_ror1x32_128( b ), 0x88 )
+                       mm256_ror128_32( b ), 0x88 )
 static void
 c512_2way( shavite512_2way_context *ctx, const void *msg )
@@ -61,7 +61,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
   {
      // round 1, 5, 9
-     k00 = _mm256_xor_si256( k13, mm256_ror1x32_128(
+     k00 = _mm256_xor_si256( k13, mm256_ror128_32(
                                  mm256_aesenc_2x128( k00, zero ) ) );
     if ( r == 0 )
@@ -71,7 +71,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
     x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
     k01 = _mm256_xor_si256( k00,
-		     mm256_ror1x32_128( mm256_aesenc_2x128( k01, zero ) ) );
+		     mm256_ror128_32( mm256_aesenc_2x128( k01, zero ) ) );
     if ( r == 1 )
        k01 = _mm256_xor_si256( k01, _mm256_set_epi32(
@@ -80,25 +80,25 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
     k02 = _mm256_xor_si256( k01,
-		     mm256_ror1x32_128( mm256_aesenc_2x128( k02, zero ) ) );
+		     mm256_ror128_32( mm256_aesenc_2x128( k02, zero ) ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
     k03 = _mm256_xor_si256( k02,
-		     mm256_ror1x32_128( mm256_aesenc_2x128( k03, zero ) ) );
+		     mm256_ror128_32( mm256_aesenc_2x128( k03, zero ) ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
     p3 = _mm256_xor_si256( p3, x );
     k10 = _mm256_xor_si256( k03,
-		     mm256_ror1x32_128( mm256_aesenc_2x128( k10, zero ) ) );
+		     mm256_ror128_32( mm256_aesenc_2x128( k10, zero ) ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
     k11 = _mm256_xor_si256( k10,
-		     mm256_ror1x32_128( mm256_aesenc_2x128( k11, zero ) ) );
+		     mm256_ror128_32( mm256_aesenc_2x128( k11, zero ) ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
     k12 = _mm256_xor_si256( k11,
-		     mm256_ror1x32_128( mm256_aesenc_2x128( k12, zero ) ) );
+		     mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) ) );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
     k13 = _mm256_xor_si256( k12,
-		     mm256_ror1x32_128( mm256_aesenc_2x128( k13, zero ) ) );
+		     mm256_ror128_32( mm256_aesenc_2x128( k13, zero ) ) );
     if ( r == 2 )
        k13 = _mm256_xor_si256( k13, _mm256_set_epi32(
@@ -134,31 +134,31 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
     // round 3, 7, 11
-     k00 = _mm256_xor_si256( mm256_ror1x32_128(
+     k00 = _mm256_xor_si256( mm256_ror128_32(
                                     mm256_aesenc_2x128( k00, zero ) ), k13 );
     x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k00 ), zero );
-     k01 = _mm256_xor_si256( mm256_ror1x32_128(
+     k01 = _mm256_xor_si256( mm256_ror128_32(
                                     mm256_aesenc_2x128( k01, zero ) ), k00 );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
-     k02 = _mm256_xor_si256( mm256_ror1x32_128(
+     k02 = _mm256_xor_si256( mm256_ror128_32(
                                     mm256_aesenc_2x128( k02, zero ) ), k01 );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
-     k03 = _mm256_xor_si256( mm256_ror1x32_128(
+     k03 = _mm256_xor_si256( mm256_ror128_32(
                                     mm256_aesenc_2x128( k03, zero ) ), k02 );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
     p1 = _mm256_xor_si256( p1, x );
-     k10 = _mm256_xor_si256( mm256_ror1x32_128(
+     k10 = _mm256_xor_si256( mm256_ror128_32(
                                     mm256_aesenc_2x128( k10, zero ) ), k03 );
     x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k10 ), zero );
-     k11 = _mm256_xor_si256( mm256_ror1x32_128(
+     k11 = _mm256_xor_si256( mm256_ror128_32(
                                     mm256_aesenc_2x128( k11, zero ) ), k10 );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
-     k12 = _mm256_xor_si256( mm256_ror1x32_128(
+     k12 = _mm256_xor_si256( mm256_ror128_32(
                                     mm256_aesenc_2x128( k12, zero ) ), k11 );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
-     k13 = _mm256_xor_si256( mm256_ror1x32_128(
+     k13 = _mm256_xor_si256( mm256_ror128_32(
                                     mm256_aesenc_2x128( k13, zero ) ), k12 );
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
@@ -192,35 +192,35 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
   // round 13
-   k00 = _mm256_xor_si256( mm256_ror1x32_128(
+   k00 = _mm256_xor_si256( mm256_ror128_32(
 			             mm256_aesenc_2x128( k00, zero ) ), k13  );
   x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
-   k01 = _mm256_xor_si256( mm256_ror1x32_128(
+   k01 = _mm256_xor_si256( mm256_ror128_32(
 			             mm256_aesenc_2x128( k01, zero ) ), k00 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
-   k02 = _mm256_xor_si256( mm256_ror1x32_128(
+   k02 = _mm256_xor_si256( mm256_ror128_32(
 			             mm256_aesenc_2x128( k02, zero ) ), k01 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ), zero );
-   k03 = _mm256_xor_si256( mm256_ror1x32_128(
+   k03 = _mm256_xor_si256( mm256_ror128_32(
 			             mm256_aesenc_2x128( k03, zero ) ), k02 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ), zero );
   p3 = _mm256_xor_si256( p3, x );
-   k10 = _mm256_xor_si256( mm256_ror1x32_128(
+   k10 = _mm256_xor_si256( mm256_ror128_32(
 			             mm256_aesenc_2x128( k10, zero ) ), k03 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ), zero );
-   k11 = _mm256_xor_si256( mm256_ror1x32_128(
+   k11 = _mm256_xor_si256( mm256_ror128_32(
 			             mm256_aesenc_2x128( k11, zero ) ), k10 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );
-   k12 = mm256_ror1x32_128( mm256_aesenc_2x128( k12, zero ) );
+   k12 = mm256_ror128_32( mm256_aesenc_2x128( k12, zero ) );
   k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32(
 	       ~ctx->count2, ctx->count3, ctx->count0, ctx->count1,
 	       ~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
-   k13 = _mm256_xor_si256( mm256_ror1x32_128(
+   k13 = _mm256_xor_si256( mm256_ror128_32(
 			             mm256_aesenc_2x128( k13, zero ) ), k12 );
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
--- a/algo/x12/x12-4way.c
+++ b/algo/x12/x12-4way.c
@@ -1,7 +1,4 @@
 #include "x12-gate.h"
 #if defined(X12_4WAY)
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -14,11 +11,223 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/cubehash/cube-hash-2way.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
 #include "algo/hamsi/hamsi-hash-4way.h"
-//#include "algo/fugue/sph_fugue.h"
+
 #if defined(X12_8WAY)
 // One context per hash function used by the 8-lane X12 chain.
 // As used by x12_8way_hash(): the *_8way contexts are run once over all
 // eight lanes, the *_4way contexts (luffa, cube, simd) are run twice (lanes
 // 0-3, then lanes 4-7), and groestl/shavite/echo are run once per lane.
 typedef struct {
    blake512_8way_context   blake;
    bmw512_8way_context     bmw;
    hashState_groestl       groestl;
    skein512_8way_context   skein;
    jh512_8way_context      jh;
    keccak512_8way_context  keccak;
    luffa_4way_context      luffa;
    cube_4way_context       cube;
    sph_shavite512_context  shavite;
    simd_4way_context       simd;
    hashState_echo          echo;
    hamsi512_8way_context   hamsi;
 } x12_8way_ctx_holder;
 // Pre-initialised template: filled in by init_x12_8way_ctx() and copied
 // (whole, or member by member) by x12_8way_hash() to reset its working
 // contexts.
 x12_8way_ctx_holder x12_8way_ctx __attribute__ ((aligned (64)));
 // Initialise every member of the shared x12_8way_ctx template.
 // register_x12_algo() calls this before installing x12_8way_hash(), which
 // copies the template instead of re-running these init routines.
 void init_x12_8way_ctx()
 {
     blake512_8way_init( &x12_8way_ctx.blake );
     bmw512_8way_init( &x12_8way_ctx.bmw );
     init_groestl( &x12_8way_ctx.groestl, 64 );
     skein512_8way_init( &x12_8way_ctx.skein );
     jh512_8way_init( &x12_8way_ctx.jh );
     keccak512_8way_init( &x12_8way_ctx.keccak );
     luffa_4way_init( &x12_8way_ctx.luffa, 512 );
     cube_4way_init( &x12_8way_ctx.cube, 512, 16, 32 );
     sph_shavite512_init( &x12_8way_ctx.shavite );
     simd_4way_init( &x12_8way_ctx.simd, 512 );
     init_echo( &x12_8way_ctx.echo, 512 );
     hamsi512_8way_init( &x12_8way_ctx.hamsi );
 }
 // Run the X12 hash chain on eight lanes at once.
 //
 // Stage order: Blake, BMW, Luffa, Cubehash, Shavite, Simd, Echo, Groestl,
 // Skein, JH, Keccak, Hamsi.
 //
 //   state  receives whatever the final Hamsi stage writes; presumably the
 //          8x64-interleaved digests, since scanhash_x12_8way() extracts
 //          lanes from it with extr_lane_8x64() - confirm against
 //          hamsi512_8way_close().
 //   input  block data fed to the Blake stage, 80 bytes per lane; presumably
 //          8x64 interleaved as produced by mm512_bswap32_intrlv80_8x64().
 void x12_8way_hash( void *state, const void *input )
 {
     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
     uint64_t hash3[8] __attribute__ ((aligned (64)));
     uint64_t hash4[8] __attribute__ ((aligned (64)));
     uint64_t hash5[8] __attribute__ ((aligned (64)));
     uint64_t hash6[8] __attribute__ ((aligned (64)));
     uint64_t hash7[8] __attribute__ ((aligned (64)));
     x12_8way_ctx_holder ctx;

     // Start from the pre-initialised template instead of re-initialising
     // every context on each call.
     memcpy( &ctx, &x12_8way_ctx, sizeof(x12_8way_ctx) );

     // Blake, BMW: all eight lanes in one pass.
     blake512_8way_update( &ctx.blake, input, 80 );
     blake512_8way_close( &ctx.blake, vhash );
     bmw512_8way_update( &ctx.bmw, vhash, 64 );
     bmw512_8way_close( &ctx.bmw, vhash );

     // Split into one buffer per lane for the 4-way and serial stages.
     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                      hash7, vhash );

     // Luffa + Cube: four lanes per pass, lanes 0-3 first.
     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );

     // Lanes 4-7: both contexts were consumed above, so re-initialise them.
     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
     luffa_4way_init( &ctx.luffa, 512 );
     cube_4way_init( &ctx.cube, 512, 16, 32 );
     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );

     // Shavite: one lane at a time; the context is restored from the
     // template between lanes.
     sph_shavite512( &ctx.shavite, hash0, 64 );
     sph_shavite512_close( &ctx.shavite, hash0 );
     memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash1, 64 );
     sph_shavite512_close( &ctx.shavite, hash1 );
     memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash2, 64 );
     sph_shavite512_close( &ctx.shavite, hash2 );
     memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash3, 64 );
     sph_shavite512_close( &ctx.shavite, hash3 );
     memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash4, 64 );
     sph_shavite512_close( &ctx.shavite, hash4 );
     memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash5, 64 );
     sph_shavite512_close( &ctx.shavite, hash5 );
     memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash6, 64 );
     sph_shavite512_close( &ctx.shavite, hash6 );
     memcpy( &ctx.shavite, &x12_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash7, 64 );
     sph_shavite512_close( &ctx.shavite, hash7 );

     // Simd: four lanes per pass, re-initialised for the second half.
     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
     simd_4way_init( &ctx.simd, 512 );
     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );

     // Echo: one lane at a time. Lane 0 must be hashed like every other
     // lane, otherwise its result skips a stage of the chain.
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
                       (const BitSequence *) hash0, 512 );
     memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash1,
                       (const BitSequence *) hash1, 512 );
     memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash2,
                       (const BitSequence *) hash2, 512 );
     memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );
     memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash4,
                       (const BitSequence *) hash4, 512 );
     memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash5,
                       (const BitSequence *) hash5, 512 );
     memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash6,
                       (const BitSequence *) hash6, 512 );
     memcpy( &ctx.echo, &x12_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash7,
                       (const BitSequence *) hash7, 512 );

     // Groestl: one lane at a time, context restored between lanes.
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
     memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
     memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
     memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
     memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
     memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
     memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
     memcpy( &ctx.groestl, &x12_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );

     // Back to the 8-lane layout for the remaining stages.
     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                  hash7 );

     // Skein, JH, Keccak, Hamsi: all eight lanes in one pass.
     skein512_8way_update( &ctx.skein, vhash, 64 );
     skein512_8way_close( &ctx.skein, vhash );
     jh512_8way_update( &ctx.jh, vhash, 64 );
     jh512_8way_close( &ctx.jh, vhash );
     keccak512_8way_update( &ctx.keccak, vhash, 64 );
     keccak512_8way_close( &ctx.keccak, vhash );
     hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
     // The final stage writes straight into the caller's buffer.
     hamsi512_8way_close( &ctx.hamsi, state );
 }
 // Scan nonces eight at a time with x12_8way_hash(), starting at
 // work->data[19], until the nonce counter reaches max_nonce - 8 or a work
 // restart is flagged for this thread. Lanes whose selected hash word is
 // below the target word are extracted, re-checked with fulltest() and, when
 // not benchmarking, submitted. Writes the number of nonces tried to
 // *hashes_done and always returns 0.
 int scanhash_x12_8way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
 {
     uint32_t hash[16*8] __attribute__ ((aligned (128)));
     uint32_t vdata[24*8] __attribute__ ((aligned (64)));
     uint32_t lane_hash[8] __attribute__ ((aligned (64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     const uint32_t first_nonce = pdata[19];
     const uint32_t last_nonce = max_nonce - 8;
     const uint32_t Htarg = ptarget[7];
     const int thr_id = mythr->id;
     // Word 49 of the hash buffer, then every second word, is compared to
     // the target; presumably the top 32 bits of each lane's 256-bit hash
     // in the 8x64 layout - confirm.
     uint32_t *hash7 = hash + 49;
     __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
     uint32_t n = first_nonce;

     mm512_bswap32_intrlv80_8x64( vdata, pdata );

     do
     {
        // Drop nonces n .. n+7 into the block data for the eight lanes.
        *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
               _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                                 n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );

        x12_8way_hash( hash, vdata );

        for ( int lane = 0; lane < 8; lane++ )
        {
           // Cheap single-word filter before the full comparison.
           if ( hash7[ 2*lane ] >= Htarg )
              continue;

           extr_lane_8x64( lane_hash, hash, lane, 256 );
           if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
           {
              pdata[19] = n + lane;
              submit_lane_solution( work, lane_hash, mythr, lane );
           }
        }

        n += 8;
     } while ( ( n < last_nonce ) && !work_restart[thr_id].restart );

     *hashes_done = n - first_nonce;
     return 0;
 }
 #elif defined(X12_4WAY)
 typedef struct {
    blake512_4way_context   blake;
@@ -63,45 +272,13 @@ void x12_4way_hash( void *state, const void *input )
     x12_4way_ctx_holder ctx;
     memcpy( &ctx, &x12_4way_ctx, sizeof(x12_4way_ctx) );
     // 1 Blake
     blake512_4way( &ctx.blake, input, 80 );
     blake512_4way_close( &ctx.blake, vhash );
     // 2 Bmw
     bmw512_4way( &ctx.bmw, vhash, 64 );
     bmw512_4way_close( &ctx.bmw, vhash );
     // Serial
     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     // 3 Groestl
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
     memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
     memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
     memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
     // Parallel 4way 64 bit
     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     // 4 Skein
     skein512_4way( &ctx.skein, vhash, 64 );
     skein512_4way_close( &ctx.skein, vhash );
     // 5 JH
     jh512_4way( &ctx.jh, vhash, 64 );
     jh512_4way_close( &ctx.jh, vhash );
     // 6 Keccak
     keccak512_4way( &ctx.keccak, vhash, 64 );
     keccak512_4way_close( &ctx.keccak, vhash );
     // Serial
     dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
     // 7 Luffa
     intrlv_2x128( vhash, hash0, hash1, 512 );
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
     dintrlv_2x128( hash0, hash1, vhash, 512 );
@@ -110,7 +287,6 @@ void x12_4way_hash( void *state, const void *input )
     luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
     dintrlv_2x128( hash2, hash3, vhash, 512 );
     // 8 Cubehash
     cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
     cubehashInit( &ctx.cube, 512, 16, 32 );
     cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
@@ -119,7 +295,6 @@ void x12_4way_hash( void *state, const void *input )
     cubehashInit( &ctx.cube, 512, 16, 32 );
     cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
     // 9 Shavite
     sph_shavite512( &ctx.shavite, hash0, 64 );
     sph_shavite512_close( &ctx.shavite, hash0 );
     memcpy( &ctx.shavite, &x12_4way_ctx.shavite,
@@ -135,7 +310,6 @@ void x12_4way_hash( void *state, const void *input )
     sph_shavite512( &ctx.shavite, hash3, 64 );
     sph_shavite512_close( &ctx.shavite, hash3 );
     // 10 Simd
     intrlv_2x128( vhash, hash0, hash1, 512 );
     simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
     dintrlv_2x128( hash0, hash1, vhash, 512 );
@@ -144,21 +318,25 @@ void x12_4way_hash( void *state, const void *input )
     simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
     dintrlv_2x128( hash2, hash3, vhash, 512 );
-     // 11 Echo
+     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
-     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+     memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
-                       (const BitSequence *) hash0, 512 );
+     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
-     memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
+     memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
-     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
-                       (const BitSequence *) hash1, 512 );
+     memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
-     memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
     update_final_echo( &ctx.echo, (BitSequence *)hash2,
                       (const BitSequence *) hash2, 512 );
     memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );
-     // 12 Hamsi parallel 4way 32 bit
+     // Parallel 4way 64 bit
     intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
     skein512_4way( &ctx.skein, vhash, 64 );
     skein512_4way_close( &ctx.skein, vhash );
     jh512_4way( &ctx.jh, vhash, 64 );
     jh512_4way_close( &ctx.jh, vhash );
     keccak512_4way( &ctx.keccak, vhash, 64 );
     keccak512_4way_close( &ctx.keccak, vhash );
     hamsi512_4way( &ctx.hamsi, vhash, 64 );
     hamsi512_4way_close( &ctx.hamsi, vhash );
--- a/algo/x12/x12-gate.c
+++ b/algo/x12/x12-gate.c
@@ -2,7 +2,11 @@
 bool register_x12_algo( algo_gate_t* gate )
 {
-#if defined (X12_4WAY)
+#if defined (X12_8WAY)
  init_x12_8way_ctx();
  gate->scanhash  = (void*)&scanhash_x12_8way;
  gate->hash      = (void*)&x12_8way_hash;
 #elif defined (X12_4WAY)
  init_x12_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x12_4way;
  gate->hash      = (void*)&x12_4way_hash;
@@ -11,7 +15,7 @@ bool register_x12_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x12;
  gate->hash      = (void*)&x12hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };
--- a/algo/x12/x12-gate.h
+++ b/algo/x12/x12-gate.h
@@ -4,29 +4,36 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
-#if defined(__AVX2__) && defined(__AES__)
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-  #define X12_4WAY
+  #define X12_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
  #define X12_4WAY 1
 #endif
 bool register_x12_algo( algo_gate_t* gate );
-#if defined(X12_4WAY)
+#if defined(X12_8WAY)
 void x12_8way_hash( void *state, const void *input );
 int scanhash_x12_8way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_x12_8way_ctx();
 #elif defined(X12_4WAY)
 void x12_4way_hash( void *state, const void *input );
 int scanhash_x12_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_x12_4way_ctx();
-#endif
+#else
 void x12hash( void *state, const void *input );
 int scanhash_x12( struct work *work, uint32_t max_nonce,
                  uint64_t *hashes_done, struct thr_info *mythr );
 void init_x12_ctx();
 #endif
 #endif
--- a/algo/x12/x12.c
+++ b/algo/x12/x12.c
@@ -20,35 +20,40 @@
 #include "algo/luffa/luffa_for_sse2.h" 
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/nist.h"
 #include "algo/blake/sse2/blake.c"   
 #include "algo/bmw/sse2/bmw.c"
 #include "algo/keccak/sse2/keccak.c"
 #include "algo/skein/sse2/skein.c"
 #include "algo/jh/sse2/jh_sse2_opt64.h"
 #if defined(__AES__)
  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/echo/aes_ni/hash_api.h"
 #endif
 typedef struct {
   sph_blake512_context    blake;
   sph_bmw512_context      bmw;
   sph_skein512_context    skein;
   sph_jh512_context       jh;
   sph_keccak512_context   keccak;
 #if defined(__AES__)
-        hashState_groestl       groestl;
+   hashState_groestl       groestl;
-        hashState_echo          echo;
+   hashState_echo          echo;
 #else
-        sph_groestl512_context   groestl;
+   sph_groestl512_context   groestl;
-        sph_echo512_context      echo;
+   sph_echo512_context      echo;
 #endif
-        hashState_luffa         luffa;
+   hashState_luffa          luffa;
-        cubehashParam           cubehash;
+   cubehashParam            cubehash;
-        sph_shavite512_context  shavite;
+   sph_shavite512_context   shavite;
-        hashState_sd            simd;
+   hashState_sd             simd;
-        sph_hamsi512_context    hamsi;
+   sph_hamsi512_context     hamsi;
 } x12_ctx_holder;
 x12_ctx_holder x12_ctx;
 void init_x12_ctx()
 {
        sph_blake512_init( &x12_ctx.blake );
        sph_bmw512_init( &x12_ctx.bmw );
        sph_skein512_init( &x12_ctx.skein);
        sph_jh512_init( &x12_ctx.jh);
        sph_keccak512_init( &x12_ctx.keccak);
 #if defined(__AES__)
        init_echo( &x12_ctx.echo, 512 );
        init_groestl (&x12_ctx.groestl, 64 );
@@ -65,102 +70,59 @@ void init_x12_ctx()
 void x12hash(void *output, const void *input)
 {
 	unsigned char hash[128] __attribute__ ((aligned (32)));
 	#define hashB hash+64
-        x12_ctx_holder ctx;
+   x12_ctx_holder ctx;
-        memcpy( &ctx, &x12_ctx, sizeof(x12_ctx) );
+   memcpy( &ctx, &x12_ctx, sizeof(x12_ctx) );
-        // X11 algos
+   sph_blake512(&ctx.blake, input, 80);
   sph_blake512_close(&ctx.blake, hash);
-        unsigned char hashbuf[128];
+   sph_bmw512(&ctx.bmw, hash, 64);
-        size_t hashptr;
+   sph_bmw512_close(&ctx.bmw, hash);
        sph_u64 hashctA;
        sph_u64 hashctB;
-        //---blake1---
+   update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
                           (const BitSequence*)hash, 64 );
-        DECL_BLK;
+   cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
-        BLK_I;
+                         (const byte*)hashB, 64 );
        BLK_W;
        BLK_C;
-        //---bmw2---
+   sph_shavite512( &ctx.shavite, hash, 64);
   sph_shavite512_close( &ctx.shavite, hashB);
-        DECL_BMW;
+   update_final_sd( &ctx.simd, (BitSequence *)hash,
-        BMW_I;
+                    (const BitSequence *)hashB, 512 );
        BMW_U;
        #define M(x)    sph_dec64le_aligned(data + 8 * (x))
        #define H(x)    (h[x])
        #define dH(x)   (dh[x])
        BMW_C;
        #undef M
        #undef H
        #undef dH
        //---groetl----
 #if defined(__AES__)
-        update_and_final_groestl( &ctx.groestl, (char*)hash,
+   update_final_echo ( &ctx.echo, (BitSequence *)hashB,
                                  (const char*)hash, 512 );
 #else
        sph_groestl512 (&ctx.groestl, hash, 64);
        sph_groestl512_close(&ctx.groestl, hash);
 #endif
        //---skein4---
        DECL_SKN;
        SKN_I;
        SKN_U;
        SKN_C;
        //---jh5------
        DECL_JH;
        JH_H;
        //---keccak6---
        DECL_KEC;
        KEC_I;
        KEC_U;
        KEC_C;
        //--- luffa7
        update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
                                (const BitSequence*)hash, 64 );
        // 8 Cube
        cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
                              (const byte*)hashB, 64 );
        // 9 Shavite
        sph_shavite512( &ctx.shavite, hash, 64);
        sph_shavite512_close( &ctx.shavite, hashB);
        // 10 Simd
        update_final_sd( &ctx.simd, (BitSequence *)hash,
                         (const BitSequence *)hashB, 512 );
        //11---echo---
 #if defined(__AES__)
        update_final_echo ( &ctx.echo, (BitSequence *)hashB,
                            (const BitSequence *)hash, 512 );
 #else
-        sph_echo512(&ctx.echo, hash, 64);
+   sph_echo512(&ctx.echo, hash, 64);
-        sph_echo512_close(&ctx.echo, hashB);
+   sph_echo512_close(&ctx.echo, hashB);
 #endif
-        // 12 Hamsi
+#if defined(__AES__)
   update_and_final_groestl( &ctx.groestl, (char*)hash,
                                  (const char*)hash, 512 );
 #else
   sph_groestl512 (&ctx.groestl, hash, 64);
   sph_groestl512_close(&ctx.groestl, hash);
 #endif
   sph_skein512(&ctx.skein, hash, 64);
   sph_skein512_close(&ctx.skein, hash);
   sph_jh512(&ctx.jh, hash, 64);
   sph_jh512_close(&ctx.jh, hash);
   sph_keccak512(&ctx.keccak, hash, 64);
   sph_keccak512_close(&ctx.keccak, hash);
 	sph_hamsi512(&ctx.hamsi, hashB, 64);
 	sph_hamsi512_close(&ctx.hamsi, hash);
        asm volatile ("emms");
 	memcpy(output, hashB, 32);
 }
--- a/algo/x13/x13-4way.c
+++ b/algo/x13/x13-4way.c
@@ -1,7 +1,4 @@
 #include "x13-gate.h"
 #if defined(X13_4WAY)
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -14,12 +11,267 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/cubehash/cube-hash-2way.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
 #include "algo/hamsi/hamsi-hash-4way.h"
 #include "algo/fugue/sph_fugue.h"
 #if defined(X13_8WAY)
 // One context per hash function used by the 8-lane X13 chain.
 // As used by x13_8way_hash(): the *_8way contexts are run once over all
 // eight lanes, the *_4way contexts (luffa, cube, simd) are run twice (lanes
 // 0-3, then lanes 4-7), and groestl/shavite/echo/fugue are run once per
 // lane.
 typedef struct {
    blake512_8way_context   blake;
    bmw512_8way_context     bmw;
    hashState_groestl       groestl;
    skein512_8way_context   skein;
    jh512_8way_context      jh;
    keccak512_8way_context  keccak;
    luffa_4way_context      luffa;
    cube_4way_context       cube;
    sph_shavite512_context  shavite;
    simd_4way_context       simd;
    hashState_echo          echo;
    hamsi512_8way_context   hamsi;
    sph_fugue512_context    fugue;
 } x13_8way_ctx_holder;
 // Pre-initialised template: filled in by init_x13_8way_ctx() and copied
 // (whole, or member by member) by x13_8way_hash() to reset its working
 // contexts.
 // NOTE(review): unlike x12_8way_ctx this instance carries no explicit
 // aligned(64) attribute - confirm the member types provide the alignment
 // the vector code needs.
 x13_8way_ctx_holder x13_8way_ctx;
 // Initialise every member of the shared x13_8way_ctx template.
 // register_x13_algo() calls this before installing x13_8way_hash(), which
 // copies the template instead of re-running these init routines.
 void init_x13_8way_ctx()
 {
     blake512_8way_init( &x13_8way_ctx.blake );
     bmw512_8way_init( &x13_8way_ctx.bmw );
     init_groestl( &x13_8way_ctx.groestl, 64 );
     skein512_8way_init( &x13_8way_ctx.skein );
     jh512_8way_init( &x13_8way_ctx.jh );
     keccak512_8way_init( &x13_8way_ctx.keccak );
     luffa_4way_init( &x13_8way_ctx.luffa, 512 );
     cube_4way_init( &x13_8way_ctx.cube, 512, 16, 32 );
     sph_shavite512_init( &x13_8way_ctx.shavite );
     simd_4way_init( &x13_8way_ctx.simd, 512 );
     init_echo( &x13_8way_ctx.echo, 512 );
     hamsi512_8way_init( &x13_8way_ctx.hamsi );
     sph_fugue512_init( &x13_8way_ctx.fugue );
 }
 // Run the X13 hash chain on eight lanes at once.
 //
 // Stage order: Blake, BMW, Groestl, Skein, JH, Keccak, Luffa, Cubehash,
 // Shavite, Simd, Echo, Hamsi, Fugue.
 //
 //   state  receives 8 x 32 bytes: the first 32 bytes of each lane's final
 //          digest, lane 0 first, stored back to back (not interleaved).
 //   input  block data fed to the Blake stage, 80 bytes per lane; presumably
 //          8x64 interleaved as produced by mm512_bswap32_intrlv80_8x64().
 void x13_8way_hash( void *state, const void *input )
 {
     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
     uint64_t hash3[8] __attribute__ ((aligned (64)));
     uint64_t hash4[8] __attribute__ ((aligned (64)));
     uint64_t hash5[8] __attribute__ ((aligned (64)));
     uint64_t hash6[8] __attribute__ ((aligned (64)));
     uint64_t hash7[8] __attribute__ ((aligned (64)));
     // Byte view of the output buffer: arithmetic on a void pointer is a
     // GNU extension, not ISO C.
     uint8_t *out = (uint8_t*)state;
     x13_8way_ctx_holder ctx;

     // Start from the pre-initialised template instead of re-initialising
     // every context on each call.
     memcpy( &ctx, &x13_8way_ctx, sizeof(x13_8way_ctx) );

     // Blake, BMW: all eight lanes in one pass.
     blake512_8way_update( &ctx.blake, input, 80 );
     blake512_8way_close( &ctx.blake, vhash );
     bmw512_8way_update( &ctx.bmw, vhash, 64 );
     bmw512_8way_close( &ctx.bmw, vhash );

     // Split into one buffer per lane for the serial Groestl stage.
     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                   vhash );

     // Groestl: one lane at a time, context restored between lanes.
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
     memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
     memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
     memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
     memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
     memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
     memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
     memcpy( &ctx.groestl, &x13_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );

     // Skein, JH, Keccak: back to the 8-lane layout, one pass each.
     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                  hash7 );
     skein512_8way_update( &ctx.skein, vhash, 64 );
     skein512_8way_close( &ctx.skein, vhash );
     jh512_8way_update( &ctx.jh, vhash, 64 );
     jh512_8way_close( &ctx.jh, vhash );
     keccak512_8way_update( &ctx.keccak, vhash, 64 );
     keccak512_8way_close( &ctx.keccak, vhash );
     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                   vhash );

     // Luffa + Cube: four lanes per pass, lanes 0-3 first.
     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );

     // Lanes 4-7: both contexts were consumed above, so re-initialise them.
     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
     luffa_4way_init( &ctx.luffa, 512 );
     cube_4way_init( &ctx.cube, 512, 16, 32 );
     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );

     // Shavite: one lane at a time, context restored between lanes.
     sph_shavite512( &ctx.shavite, hash0, 64 );
     sph_shavite512_close( &ctx.shavite, hash0 );
     memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash1, 64 );
     sph_shavite512_close( &ctx.shavite, hash1 );
     memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash2, 64 );
     sph_shavite512_close( &ctx.shavite, hash2 );
     memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash3, 64 );
     sph_shavite512_close( &ctx.shavite, hash3 );
     memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash4, 64 );
     sph_shavite512_close( &ctx.shavite, hash4 );
     memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash5, 64 );
     sph_shavite512_close( &ctx.shavite, hash5 );
     memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash6, 64 );
     sph_shavite512_close( &ctx.shavite, hash6 );
     memcpy( &ctx.shavite, &x13_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash7, 64 );
     sph_shavite512_close( &ctx.shavite, hash7 );

     // Simd: four lanes per pass, re-initialised for the second half.
     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
     simd_4way_init( &ctx.simd, 512 );
     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );

     // Echo: one lane at a time, context restored between lanes.
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
                       (const BitSequence *) hash0, 512 );
     memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash1,
                       (const BitSequence *) hash1, 512 );
     memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash2,
                       (const BitSequence *) hash2, 512 );
     memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );
     memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash4,
                       (const BitSequence *) hash4, 512 );
     memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash5,
                       (const BitSequence *) hash5, 512 );
     memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash6,
                       (const BitSequence *) hash6, 512 );
     memcpy( &ctx.echo, &x13_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash7,
                       (const BitSequence *) hash7, 512 );

     // Hamsi: all eight lanes in one pass, then split again for Fugue.
     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                      hash7 );
     hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
     hamsi512_8way_close( &ctx.hamsi, vhash );
     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                       vhash );

     // 13 Fugue serial
     sph_fugue512( &ctx.fugue, hash0, 64 );
     sph_fugue512_close( &ctx.fugue, hash0 );
     memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash1, 64 );
     sph_fugue512_close( &ctx.fugue, hash1 );
     memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash2, 64 );
     sph_fugue512_close( &ctx.fugue, hash2 );
     memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash3, 64 );
     sph_fugue512_close( &ctx.fugue, hash3 );
     memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash4, 64 );
     sph_fugue512_close( &ctx.fugue, hash4 );
     memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash5, 64 );
     sph_fugue512_close( &ctx.fugue, hash5 );
     memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash6, 64 );
     sph_fugue512_close( &ctx.fugue, hash6 );
     memcpy( &ctx.fugue, &x13_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash7, 64 );
     sph_fugue512_close( &ctx.fugue, hash7 );

     // Only the first 32 bytes of each 64-byte digest are handed back.
     memcpy( out,       hash0, 32 );
     memcpy( out +  32, hash1, 32 );
     memcpy( out +  64, hash2, 32 );
     memcpy( out +  96, hash3, 32 );
     memcpy( out + 128, hash4, 32 );
     memcpy( out + 160, hash5, 32 );
     memcpy( out + 192, hash6, 32 );
     memcpy( out + 224, hash7, 32 );
 }
 // Scan nonces eight at a time with x13_8way_hash(), starting at
 // work->data[19], until the nonce counter reaches max_nonce - 8 or a work
 // restart is flagged for this thread. Each lane's 8-word result is checked
 // against the target and, when not benchmarking, submitted. Writes the
 // number of nonces tried to *hashes_done and always returns 0.
 int scanhash_x13_8way( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr )
 {
     uint32_t hash[8*8] __attribute__ ((aligned (128)));
     uint32_t vdata[24*8] __attribute__ ((aligned (64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     const uint32_t first_nonce = pdata[19];
     const uint32_t last_nonce = max_nonce - 8;
     const uint32_t Htarg = ptarget[7];
     const int thr_id = mythr->id;
     __m512i  *noncev = (__m512i*)vdata + 9;   // aligned
     uint32_t n = first_nonce;

     mm512_bswap32_intrlv80_8x64( vdata, pdata );

     do
     {
        // Drop nonces n .. n+7 into the block data for the eight lanes.
        *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
         _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                           n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );

        x13_8way_hash( hash, vdata );
        pdata[19] = n;

        for ( int i = 0; i < 8; i++ )
        {
           // Lane i's result occupies 8 consecutive words of hash[].
           uint32_t *lane_hash = hash + 8*i;

           // Cheap single-word filter first, then the full comparison.
           if ( lane_hash[7] < Htarg
                && fulltest( lane_hash, ptarget ) && !opt_benchmark )
           {
              pdata[19] = n + i;
              submit_lane_solution( work, lane_hash, mythr, i );
           }
        }

        n += 8;
     } while ( ( n < last_nonce ) && !work_restart[thr_id].restart );

     *hashes_done = n - first_nonce;
     return 0;
 }
 #elif defined(X13_4WAY)
 typedef struct {
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
--- a/algo/x13/x13-gate.c
+++ b/algo/x13/x13-gate.c
@@ -2,7 +2,11 @@
 bool register_x13_algo( algo_gate_t* gate )
 {
-#if defined (X13_4WAY)
+#if defined (X13_8WAY)
  init_x13_8way_ctx();
  gate->scanhash  = (void*)&scanhash_x13_8way;
  gate->hash      = (void*)&x13_8way_hash;
 #elif defined (X13_4WAY)
  init_x13_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x13_4way;
  gate->hash      = (void*)&x13_4way_hash;
@@ -11,7 +15,7 @@ bool register_x13_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x13;
  gate->hash      = (void*)&x13hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };
--- a/algo/x13/x13-gate.h
+++ b/algo/x13/x13-gate.h
@@ -4,29 +4,35 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
-#if defined(__AVX2__) && defined(__AES__)
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-  #define X13_4WAY
+  #define X13_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
  #define X13_4WAY 1
 #endif
 bool register_x13_algo( algo_gate_t* gate );
-#if defined(X13_4WAY)
+#if defined(X13_8WAY)
 void x13_8way_hash( void *state, const void *input );
 int scanhash_x13_8way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_x13_8way_ctx();
 #elif defined(X13_4WAY)
 void x13_4way_hash( void *state, const void *input );
 int scanhash_x13_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_x13_4way_ctx();
-#endif
+#else
 void x13hash( void *state, const void *input );
 int scanhash_x13( struct work *work, uint32_t max_nonce,
                  uint64_t *hashes_done, struct thr_info *mythr );
 void init_x13_ctx();
 #endif
 #endif
--- a/algo/x14/x14-4way.c
+++ b/algo/x14/x14-4way.c
@@ -1,7 +1,4 @@
 #include "x14-gate.h"
 #if defined(X14_4WAY)
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -13,6 +10,7 @@
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cube-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
@@ -22,6 +20,263 @@
 #include "algo/fugue/sph_fugue.h"
 #include "algo/shabal/shabal-hash-4way.h"
 #if defined(X14_8WAY)
 // One hashing context per X14 stage, sized for an 8 lane pass.
 // x14_8way_hash drives the *_8way members on all eight lanes at once, the
 // *_4way members (luffa, cube, simd) on four lanes per call, and the
 // remaining members (groestl, shavite, echo, fugue) on one lane at a time.
 typedef struct {
    blake512_8way_context   blake;
    bmw512_8way_context     bmw;
    hashState_groestl       groestl;
    skein512_8way_context   skein;
    jh512_8way_context      jh;
    keccak512_8way_context  keccak;
    luffa_4way_context      luffa;
    cube_4way_context       cube;
    sph_shavite512_context  shavite;
    simd_4way_context       simd;
    hashState_echo          echo;
    hamsi512_8way_context   hamsi;
    sph_fugue512_context    fugue;
    shabal512_8way_context  shabal;
 } x14_8way_ctx_holder;
 // Template instance filled in by init_x14_8way_ctx(). x14_8way_hash() copies
 // it into a local holder at the start of every call and copies individual
 // members again to reset the single lane contexts between lanes.
 x14_8way_ctx_holder x14_8way_ctx __attribute__ ((aligned (64)));
 void init_x14_8way_ctx()
 {
     blake512_8way_init( &x14_8way_ctx.blake );
     bmw512_8way_init( &x14_8way_ctx.bmw );
     init_groestl( &x14_8way_ctx.groestl, 64 );
     skein512_8way_init( &x14_8way_ctx.skein );
     jh512_8way_init( &x14_8way_ctx.jh );
     keccak512_8way_init( &x14_8way_ctx.keccak );
     luffa_4way_init( &x14_8way_ctx.luffa, 512 );
     cube_4way_init( &x14_8way_ctx.cube, 512, 16, 32 );
     sph_shavite512_init( &x14_8way_ctx.shavite );
     simd_4way_init( &x14_8way_ctx.simd, 512 );
     init_echo( &x14_8way_ctx.echo, 512 );
     hamsi512_8way_init( &x14_8way_ctx.hamsi );
     sph_fugue512_init( &x14_8way_ctx.fugue );
     shabal512_8way_init( &x14_8way_ctx.shabal );
 };
 void x14_8way_hash( void *state, const void *input )
 {
     uint64_t vhash[8*8] __attribute__ ((aligned (128)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
     uint64_t hash3[8] __attribute__ ((aligned (64)));
     uint64_t hash4[8] __attribute__ ((aligned (64)));
     uint64_t hash5[8] __attribute__ ((aligned (64)));
     uint64_t hash6[8] __attribute__ ((aligned (64)));
     uint64_t hash7[8] __attribute__ ((aligned (64)));
     x14_8way_ctx_holder ctx;
     memcpy( &ctx, &x14_8way_ctx, sizeof(x14_8way_ctx) );
     blake512_8way_update( &ctx.blake, input, 80 );
     blake512_8way_close( &ctx.blake, vhash );
     bmw512_8way_update( &ctx.bmw, vhash, 64 );
     bmw512_8way_close( &ctx.bmw, vhash );
     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                   vhash );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
     memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
     memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
     memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
     memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
     memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
     memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
     memcpy( &ctx.groestl, &x14_8way_ctx.groestl,
             sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                  hash7 );
     skein512_8way_update( &ctx.skein, vhash, 64 );
     skein512_8way_close( &ctx.skein, vhash );
     jh512_8way_update( &ctx.jh, vhash, 64 );
     jh512_8way_close( &ctx.jh, vhash );
     keccak512_8way_update( &ctx.keccak, vhash, 64 );
     keccak512_8way_close( &ctx.keccak, vhash );
     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                   vhash );
     // Luffa + Cube
     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
     luffa_4way_init( &ctx.luffa, 512 );
     cube_4way_init( &ctx.cube, 512, 16, 32 );
     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
     sph_shavite512( &ctx.shavite, hash0, 64 );
     sph_shavite512_close( &ctx.shavite, hash0 );
     memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash1, 64 );
     sph_shavite512_close( &ctx.shavite, hash1 );
     memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash2, 64 );
     sph_shavite512_close( &ctx.shavite, hash2 );
     memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash3, 64 );
     sph_shavite512_close( &ctx.shavite, hash3 );
     memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash4, 64 );
     sph_shavite512_close( &ctx.shavite, hash4 );
     memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash5, 64 );
     sph_shavite512_close( &ctx.shavite, hash5 );
     memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash6, 64 );
     sph_shavite512_close( &ctx.shavite, hash6 );
     memcpy( &ctx.shavite, &x14_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash7, 64 );
     sph_shavite512_close( &ctx.shavite, hash7 );
     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
     simd_4way_init( &ctx.simd, 512 );
     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
                       (const BitSequence *) hash0, 512 );
     memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash1,
                       (const BitSequence *) hash1, 512 );
     memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash2,
                       (const BitSequence *) hash2, 512 );
     memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );
     memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash4,
                       (const BitSequence *) hash4, 512 );
     memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash5,
                       (const BitSequence *) hash5, 512 );
     memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash6,
                       (const BitSequence *) hash6, 512 );
     memcpy( &ctx.echo, &x14_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash7,
                       (const BitSequence *) hash7, 512 );
     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                      hash7 );
     hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
     hamsi512_8way_close( &ctx.hamsi, vhash );
     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                       vhash );
     // 13 Fugue serial
     sph_fugue512( &ctx.fugue, hash0, 64 );
     sph_fugue512_close( &ctx.fugue, hash0 );
     memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash1, 64 );
     sph_fugue512_close( &ctx.fugue, hash1 );
     memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash2, 64 );
     sph_fugue512_close( &ctx.fugue, hash2 );
     memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash3, 64 );
     sph_fugue512_close( &ctx.fugue, hash3 );
     memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash4, 64 );
     sph_fugue512_close( &ctx.fugue, hash4 );
     memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash5, 64 );
     sph_fugue512_close( &ctx.fugue, hash5 );
     memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash6, 64 );
     sph_fugue512_close( &ctx.fugue, hash6 );
     memcpy( &ctx.fugue, &x14_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash7, 64 );
     sph_fugue512_close( &ctx.fugue, hash7 );
     // 14 Shabal, parallel 32 bit
     intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                      hash7 );
     shabal512_8way_update( &ctx.shabal, vhash, 64 );
     shabal512_8way_close( &ctx.shabal, state );
 }
 // Scan the nonce range for X14 solutions, hashing eight consecutive nonces
 // per pass.
 //   work:        supplies the header words (work->data) and work->target.
 //   max_nonce:   upper bound of the range; scanning stops once the next
 //                batch would start at or beyond max_nonce - 8.
 //   hashes_done: receives the number of nonces that were tried.
 //   mythr:       calling thread, used for its id and for submissions.
 // Always returns 0; candidates are handed to submit_lane_solution.
 int scanhash_x14_8way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr )
 {
     uint32_t hash[8*16] __attribute__ ((aligned (64)));
     uint32_t vdata[24*8] __attribute__ ((aligned (64)));
     uint32_t lane_hash[8] __attribute__ ((aligned (64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     const uint32_t first_nonce = pdata[19];
     const uint32_t last_nonce = max_nonce - 8;
     const uint32_t Htarg = ptarget[7];
     const int thr_id = mythr->id;
     // Row of the interleaved result read as word 7 of each lane.
     const uint32_t *word7 = hash + 7*8;
     // Tenth 512 bit vector of the interleaved header, where the nonces go.
     __m512i *noncev = (__m512i*)vdata + 9;
     uint32_t nonce = first_nonce;
     mm512_bswap32_intrlv80_8x64( vdata, pdata );
     do
     {
         *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
             _mm512_set_epi32( nonce+7, 0, nonce+6, 0, nonce+5, 0, nonce+4, 0,
                               nonce+3, 0, nonce+2, 0, nonce+1, 0, nonce,   0 ) ),
                                          *noncev );
         x14_8way_hash( hash, vdata );
         pdata[19] = nonce;
         for ( int lane = 0; lane < 8; lane++ )
         {
             // Cheap test on word 7 before extracting the whole lane.
             if ( word7[ lane ] >= Htarg )
                 continue;
             extr_lane_8x32( lane_hash, hash, lane, 256 );
             if ( !fulltest( lane_hash, ptarget ) )
                 continue;
             if ( opt_benchmark )
                 continue;
             pdata[19] = nonce + lane;
             submit_lane_solution( work, lane_hash, mythr, lane );
         }
         nonce += 8;
     } while ( ( nonce < last_nonce ) && !work_restart[thr_id].restart );
     *hashes_done = nonce - first_nonce;
     return 0;
 }
 #elif defined(X14_4WAY)
 typedef struct {
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
@@ -61,11 +316,11 @@ void init_x14_4way_ctx()
 void x14_4way_hash( void *state, const void *input )
 {
     uint64_t vhash[8*4] __attribute__ ((aligned (128)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
     uint64_t hash3[8] __attribute__ ((aligned (64)));
     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
     x14_4way_ctx_holder ctx;
     memcpy( &ctx, &x14_4way_ctx, sizeof(x14_4way_ctx) );
@@ -184,61 +439,49 @@ void x14_4way_hash( void *state, const void *input )
     // 14 Shabal, parallel 32 bit
     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
-     shabal512_4way( &ctx.shabal, vhash, 64 );
+     shabal512_4way_update( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, state );
 }
 int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr )
 {
-     uint32_t hash[4*16] __attribute__ ((aligned (64)));
+     uint32_t hash[4*16] __attribute__ ((aligned (128)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     uint32_t n = first_nonce;
     const uint32_t last_nonce = max_nonce - 4;
     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
     const uint32_t Htarg = ptarget[7];
-     int thr_id = mythr->id;  // thr_id arg is deprecated
+     int thr_id = mythr->id;  
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                          0xFFFFF000, 0xFFFF0000,          0  };
     mm256_bswap32_intrlv80_4x64( vdata, pdata );
-     for ( int m=0; m < 6; m++ )
+     do
-       if ( Htarg <= htmax[m] )
+     {
       *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
             _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
       x14_4way_hash( hash, vdata );
       pdata[19] = n;
       uint32_t *hash7 = &(hash[7<<2]);
       for ( int lane = 0; lane < 4; lane++ )
       if ( hash7[ lane ] < Htarg )
       {
-         uint32_t mask = masks[m];
+           uint32_t lane_hash[8];
-         do
+           extr_lane_4x32( lane_hash, hash, lane, 256 );
         {
           *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
-            x14_4way_hash( hash, vdata );
+           if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
-            pdata[19] = n;
+           {
-
+               pdata[19] = n + lane;
-            uint32_t *hash7 = &(hash[7<<2]);
+               submit_lane_solution( work, lane_hash, mythr, lane );
-
+           }
-            for ( int lane = 0; lane < 4; lane++ )
+        }
-            if ( ( hash7[ lane ] & mask ) == 0 )
+        n += 4;
-            {
+     } while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
-               // deinterleave hash for lane
+     *hashes_done = n - first_nonce;
               uint32_t lane_hash[8];
               extr_lane_4x32( lane_hash, hash, lane, 256 );
               if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
               {
                  pdata[19] = n + lane;
                  submit_lane_solution( work, lane_hash, mythr, lane );
               }
            }
            n += 4;
         } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
         break;
       }
     *hashes_done = n - first_nonce + 1;
     return 0;
 }
--- a/algo/x14/x14-gate.c
+++ b/algo/x14/x14-gate.c
@@ -2,7 +2,11 @@
 bool register_x14_algo( algo_gate_t* gate )
 {
-#if defined (X14_4WAY)
+#if defined (X14_8WAY)
  init_x14_8way_ctx();
  gate->scanhash  = (void*)&scanhash_x14_8way;
  gate->hash      = (void*)&x14_8way_hash;
 #elif defined (X14_4WAY)
  init_x14_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x14_4way;
  gate->hash      = (void*)&x14_4way_hash;
@@ -11,7 +15,7 @@ bool register_x14_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x14;
  gate->hash      = (void*)&x14hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };
--- a/algo/x14/x14-gate.h
+++ b/algo/x14/x14-gate.h
@@ -4,20 +4,29 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
-#if defined(__AVX2__) && defined(__AES__)
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-  #define X14_4WAY
+  #define X14_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
  #define X14_4WAY 1
 #endif
 bool register_x14_algo( algo_gate_t* gate );
-#if defined(X14_4WAY)
+#if defined(X14_8WAY)
 void x14_8way_hash( void *state, const void *input );
 int scanhash_x14_8way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_x14_8way_ctx();
 #elif defined(X14_4WAY)
 void x14_4way_hash( void *state, const void *input );
 int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_x14_4way_ctx();
-#endif
+#else
 void x14hash( void *state, const void *input );
 int scanhash_x14( struct work *work, uint32_t max_nonce,
@@ -26,3 +35,4 @@ void init_x14_ctx();
 #endif
 #endif
--- a/algo/x15/x15-4way.c
+++ b/algo/x15/x15-4way.c
@@ -1,7 +1,4 @@
 #include "x15-gate.h"
 #if defined(X15_4WAY)
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -14,6 +11,7 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa-hash-2way.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/cubehash/cube-hash-2way.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
@@ -23,6 +21,306 @@
 #include "algo/shabal/shabal-hash-4way.h"
 #include "algo/whirlpool/sph_whirlpool.h"
 #if defined(X15_8WAY)
 // One hashing context per X15 stage, sized for an 8 lane pass.
 // x15_8way_hash drives the *_8way members on all eight lanes at once, the
 // *_4way members (luffa, cube, simd) on four lanes per call, and the
 // remaining members (groestl, shavite, echo, fugue, whirlpool) on one lane
 // at a time.
 typedef struct {
    blake512_8way_context   blake;
    bmw512_8way_context     bmw;
    hashState_groestl       groestl;
    skein512_8way_context   skein;
    jh512_8way_context      jh;
    keccak512_8way_context  keccak;
    luffa_4way_context      luffa;
    cube_4way_context       cube;
    sph_shavite512_context  shavite;
    simd_4way_context       simd;
    hashState_echo          echo;
    hamsi512_8way_context   hamsi;
    sph_fugue512_context    fugue;
    shabal512_8way_context  shabal;
    sph_whirlpool_context   whirlpool;
 } x15_8way_ctx_holder;
 // Template instance filled in by init_x15_8way_ctx(). x15_8way_hash() copies
 // it into a local holder at the start of every call and copies individual
 // members again to reset the single lane contexts between lanes.
 x15_8way_ctx_holder x15_8way_ctx __attribute__ ((aligned (64)));
 void init_x15_8way_ctx()
 {
     blake512_8way_init( &x15_8way_ctx.blake );
     bmw512_8way_init( &x15_8way_ctx.bmw );
     init_groestl( &x15_8way_ctx.groestl, 64 );
     skein512_8way_init( &x15_8way_ctx.skein );
     jh512_8way_init( &x15_8way_ctx.jh );
     keccak512_8way_init( &x15_8way_ctx.keccak );
     luffa_4way_init( &x15_8way_ctx.luffa, 512 );
     cube_4way_init( &x15_8way_ctx.cube, 512, 16, 32 );
     sph_shavite512_init( &x15_8way_ctx.shavite );
     simd_4way_init( &x15_8way_ctx.simd, 512 );
     init_echo( &x15_8way_ctx.echo, 512 );
     hamsi512_8way_init( &x15_8way_ctx.hamsi );
     sph_fugue512_init( &x15_8way_ctx.fugue );
     shabal512_8way_init( &x15_8way_ctx.shabal );
     sph_whirlpool_init( &x15_8way_ctx.whirlpool );
 };
 void x15_8way_hash( void *state, const void *input )
 {
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
     uint64_t hash3[8] __attribute__ ((aligned (64)));
     uint64_t hash4[8] __attribute__ ((aligned (64)));
     uint64_t hash5[8] __attribute__ ((aligned (64)));
     uint64_t hash6[8] __attribute__ ((aligned (64)));
     uint64_t hash7[8] __attribute__ ((aligned (64)));
     uint64_t vhash[8*8] __attribute__ ((aligned (64)));
     x15_8way_ctx_holder ctx;
     memcpy( &ctx, &x15_8way_ctx, sizeof(x15_8way_ctx) );
     // 1 Blake
     blake512_8way_update( &ctx.blake, input, 80 );
     blake512_8way_close( &ctx.blake, vhash );
     // 2 Bmw
     bmw512_8way_update( &ctx.bmw, vhash, 64 );
     bmw512_8way_close( &ctx.bmw, vhash );
     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                       vhash );
     // 3 Groestl
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
     memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
     memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
     memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
     memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash4, (char*)hash4, 512 );
     memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash5, (char*)hash5, 512 );
     memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash6, (char*)hash6, 512 );
     memcpy( &ctx.groestl, &x15_8way_ctx.groestl, sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash7, (char*)hash7, 512 );
     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                      hash7 );
     skein512_8way_update( &ctx.skein, vhash, 64 );
     skein512_8way_close( &ctx.skein, vhash );
     // 5 JH
     jh512_8way_update( &ctx.jh, vhash, 64 );
     jh512_8way_close( &ctx.jh, vhash );
     // 6 Keccak
     keccak512_8way_update( &ctx.keccak, vhash, 64 );
     keccak512_8way_close( &ctx.keccak, vhash );
     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                       vhash );
     // Luffa + Cube
     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
     luffa_4way_init( &ctx.luffa, 512 );
     cube_4way_init( &ctx.cube, 512, 16, 32 );
     luffa_4way_update_close( &ctx.luffa, vhash, vhash, 64 );
     cube_4way_update_close( &ctx.cube, vhash, vhash, 64 );
     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
     // 9 Shavite
     sph_shavite512( &ctx.shavite, hash0, 64 );
     sph_shavite512_close( &ctx.shavite, hash0 );
     memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash1, 64 );
     sph_shavite512_close( &ctx.shavite, hash1 );
     memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash2, 64 );
     sph_shavite512_close( &ctx.shavite, hash2 );
     memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash3, 64 );
     sph_shavite512_close( &ctx.shavite, hash3 );
     memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash4, 64 );
     sph_shavite512_close( &ctx.shavite, hash4 );
     memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash5, 64 );
     sph_shavite512_close( &ctx.shavite, hash5 );
     memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash6, 64 );
     sph_shavite512_close( &ctx.shavite, hash6 );
     memcpy( &ctx.shavite, &x15_8way_ctx.shavite,
             sizeof(sph_shavite512_context) );
     sph_shavite512( &ctx.shavite, hash7, 64 );
     sph_shavite512_close( &ctx.shavite, hash7 );
     // 10 Simd
     intrlv_4x128_512( vhash, hash0, hash1, hash2, hash3 );
     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
     dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
     intrlv_4x128_512( vhash, hash4, hash5, hash6, hash7 );
     simd_4way_init( &ctx.simd, 512 );
     simd_4way_update_close( &ctx.simd, vhash, vhash, 512 );
     dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
     // 11 Echo
     update_final_echo( &ctx.echo, (BitSequence *)hash0,
                       (const BitSequence *) hash0, 512 );
     memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash1,
                       (const BitSequence *) hash1, 512 );
     memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash2,
                       (const BitSequence *) hash2, 512 );
     memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );
     memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash4,
                       (const BitSequence *) hash4, 512 );
     memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash5,
                       (const BitSequence *) hash5, 512 );
     memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash6,
                       (const BitSequence *) hash6, 512 );
     memcpy( &ctx.echo, &x15_8way_ctx.echo, sizeof(hashState_echo) );
     update_final_echo( &ctx.echo, (BitSequence *)hash7,
                       (const BitSequence *) hash7, 512 );
     // 12 Hamsi parallel 4way 64 bit
     intrlv_8x64_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                      hash7 );
     hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
     hamsi512_8way_close( &ctx.hamsi, vhash );
     dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                       vhash );
     // 13 Fugue
     sph_fugue512( &ctx.fugue, hash0, 64 );
     sph_fugue512_close( &ctx.fugue, hash0 );
     memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash1, 64 );
     sph_fugue512_close( &ctx.fugue, hash1 );
     memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash2, 64 );
     sph_fugue512_close( &ctx.fugue, hash2 );
     memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash3, 64 );
     sph_fugue512_close( &ctx.fugue, hash3 );
     memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash4, 64 );
     sph_fugue512_close( &ctx.fugue, hash4 );
     memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash5, 64 );
     sph_fugue512_close( &ctx.fugue, hash5 );
     memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash6, 64 );
     sph_fugue512_close( &ctx.fugue, hash6 );
     memcpy( &ctx.fugue, &x15_8way_ctx.fugue, sizeof(sph_fugue512_context) );
     sph_fugue512( &ctx.fugue, hash7, 64 );
     sph_fugue512_close( &ctx.fugue, hash7 );
     // 14 Shabal, parallel 32 bit
     intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                      hash7 );
     shabal512_8way_update( &ctx.shabal, vhash, 64 );
     shabal512_8way_close( &ctx.shabal, vhash );
     dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                       vhash );
     // 15 Whirlpool
     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash0 );
     memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
             sizeof(sph_whirlpool_context) );
     sph_whirlpool( &ctx.whirlpool, hash1, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash1 );
     memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
             sizeof(sph_whirlpool_context) );
     sph_whirlpool( &ctx.whirlpool, hash2, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash2 );
     memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
             sizeof(sph_whirlpool_context) );
     sph_whirlpool( &ctx.whirlpool, hash3, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash3 );
     memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
             sizeof(sph_whirlpool_context) );
     sph_whirlpool( &ctx.whirlpool, hash4, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash4 );
     memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
             sizeof(sph_whirlpool_context) );
     sph_whirlpool( &ctx.whirlpool, hash5, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash5 );
     memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
             sizeof(sph_whirlpool_context) );
     sph_whirlpool( &ctx.whirlpool, hash6, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash6 );
     memcpy( &ctx.whirlpool, &x15_8way_ctx.whirlpool,
             sizeof(sph_whirlpool_context) );
     sph_whirlpool( &ctx.whirlpool, hash7, 64 );
     sph_whirlpool_close( &ctx.whirlpool, hash7 );
     memcpy( state,    hash0, 32 );
     memcpy( state+ 32, hash1, 32 );
     memcpy( state+ 64, hash2, 32 );
     memcpy( state+ 96, hash3, 32 );
     memcpy( state+128, hash4, 32 );
     memcpy( state+160, hash5, 32 );
     memcpy( state+192, hash6, 32 );
     memcpy( state+224, hash7, 32 );
 }
 // Scan the nonce range for X15 solutions, hashing eight consecutive nonces
 // per pass.
 //   work:        supplies the header words (work->data) and work->target.
 //   max_nonce:   upper bound of the range; scanning stops once the next
 //                batch would start at or beyond max_nonce - 8.
 //   hashes_done: receives the number of nonces that were tried.
 //   mythr:       calling thread, used for its id and for submissions.
 // Always returns 0; candidates are handed to submit_lane_solution.
 int scanhash_x15_8way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr )
 {
     uint32_t hash[8*8] __attribute__ ((aligned (128)));
     uint32_t vdata[24*8] __attribute__ ((aligned (64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     const uint32_t first_nonce = pdata[19];
     uint32_t n = first_nonce;
     const uint32_t last_nonce = max_nonce - 8;
     // Tenth 512 bit vector of the interleaved header, where the nonces go.
     __m512i  *noncev = (__m512i*)vdata + 9;
     const uint32_t Htarg = ptarget[7];
     int thr_id = mythr->id;
     mm512_bswap32_intrlv80_8x64( vdata, pdata );
     do
     {
        *noncev = mm512_intrlv_blend_32( mm512_bswap_32(
           _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                             n+3, 0, n+2, 0, n+1, 0, n,   0 ) ), *noncev );
        x15_8way_hash( hash, vdata );
        pdata[19] = n;
        for ( int i = 0; i < 8; i++ )
        {
           // x15_8way_hash stores lane i's 256 bit result at hash + 8*i.
           uint32_t *lane_hash = hash + (i<<3);
           // Cheap test on the top word before the full comparison.
           if ( lane_hash[7] < Htarg
                && fulltest( lane_hash, ptarget ) && !opt_benchmark )
           {
              pdata[19] = n+i;
              // Submit the hash of the lane that passed the test rather
              // than the start of the buffer, which is always lane 0.
              submit_lane_solution( work, lane_hash, mythr, i );
           }
        }
        n += 8;
     } while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
     *hashes_done = n - first_nonce;
     return 0;
 }
 #elif defined(X15_4WAY)
 typedef struct {
    blake512_4way_context   blake;
    bmw512_4way_context     bmw;
@@ -64,11 +362,11 @@ void init_x15_4way_ctx()
 void x15_4way_hash( void *state, const void *input )
 {
     uint64_t vhash[8*4] __attribute__ ((aligned (128)));
     uint64_t hash0[8] __attribute__ ((aligned (64)));
     uint64_t hash1[8] __attribute__ ((aligned (64)));
     uint64_t hash2[8] __attribute__ ((aligned (64)));
     uint64_t hash3[8] __attribute__ ((aligned (64)));
     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
     x15_4way_ctx_holder ctx;
     memcpy( &ctx, &x15_4way_ctx, sizeof(x15_4way_ctx) );
@@ -187,7 +485,7 @@ void x15_4way_hash( void *state, const void *input )
     // 14 Shabal, parallel 32 bit
     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
-     shabal512_4way( &ctx.shabal, vhash, 64 );
+     shabal512_4way_update( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );
     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
@@ -216,48 +514,37 @@ void x15_4way_hash( void *state, const void *input )
 int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr )
 {
-     uint32_t hash[4*8] __attribute__ ((aligned (64)));
+     uint32_t hash[4*8] __attribute__ ((aligned (128)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
-     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+     uint32_t n = first_nonce;
     const uint32_t last_nonce = max_nonce - 4;
     __m256i  *noncev = (__m256i*)vdata + 9;
     const uint32_t Htarg = ptarget[7];
-     int thr_id = mythr->id;  // thr_id arg is deprecated
+     int thr_id = mythr->id;  
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                          0xFFFFF000, 0xFFFF0000,          0  };
     mm256_bswap32_intrlv80_4x64( vdata, pdata );
-     for ( int m=0; m < 6; m++ )
+     do
-       if ( Htarg <= htmax[m] )
+     {
-       {
+        *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-         uint32_t mask = masks[m];
+              _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
-         do
+
         x15_4way_hash( hash, vdata );
         pdata[19] = n;
         for ( int i = 0; i < 4; i++ )
         if ( ( hash+(i<<3) )[7] < Htarg )
         if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
         {
-           *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+            pdata[19] = n+i;
-                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
+            submit_lane_solution( work, hash, mythr, i );
         }
         n += 4;
     } while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
-            x15_4way_hash( hash, vdata );
+     *hashes_done = n - first_nonce;
            pdata[19] = n;
            for ( int i = 0; i < 4; i++ )
            if ( ( (hash+(i<<3))[7] & mask ) == 0 )
            if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
            {
               pdata[19] = n+i;
               submit_lane_solution( work, hash, mythr, i );
            }
            n += 4;
         } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
         break;
       }
     *hashes_done = n - first_nonce + 1;
     return 0;
 }
--- a/algo/x15/x15-gate.c
+++ b/algo/x15/x15-gate.c
@@ -2,7 +2,11 @@
 bool register_x15_algo( algo_gate_t* gate )
 {
-#if defined (X15_4WAY)
+#if defined (X15_8WAY)
  init_x15_8way_ctx();
  gate->scanhash  = (void*)&scanhash_x15_8way;
  gate->hash      = (void*)&x15_8way_hash;
 #elif defined (X15_4WAY)
  init_x15_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x15_4way;
  gate->hash      = (void*)&x15_4way_hash;
@@ -11,7 +15,7 @@ bool register_x15_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x15;
  gate->hash      = (void*)&x15hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 };
--- a/algo/x15/x15-gate.h
+++ b/algo/x15/x15-gate.h
@@ -4,20 +4,30 @@
 #include "algo-gate-api.h"
 #include <stdint.h>
-#if defined(__AVX2__) && defined(__AES__)
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-  #define X15_4WAY
+  #define X15_8WAY 1
 #elif defined(__AVX2__) && defined(__AES__)
  #define X15_4WAY 1
 #endif
 bool register_x15_algo( algo_gate_t* gate );
-#if defined(X15_4WAY)
+#if defined(X15_8WAY)
 void x15_8way_hash( void *state, const void *input );
 int scanhash_x15_8way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_x15_8way_ctx();
 #elif defined(X15_4WAY)
 void x15_4way_hash( void *state, const void *input );
 int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr );
 void init_x15_4way_ctx();
-#endif
+#else
 void x15hash( void *state, const void *input );
 int scanhash_x15( struct work *work, uint32_t max_nonce,
@@ -26,3 +36,5 @@ void init_x15_ctx();
 #endif
 #endif
--- a/build-allarch.sh
+++ b/build-allarch.sh
@@ -4,6 +4,8 @@
 # during development. However the information contained may provide compilation
 # tips to users.
 rm cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen 
 make distclean || echo clean
 rm -f config.status
 ./autogen.sh || echo done
--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.2.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.10.3.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.10.2'
+PACKAGE_VERSION='3.10.3'
-PACKAGE_STRING='cpuminer-opt 3.10.2'
+PACKAGE_STRING='cpuminer-opt 3.10.3'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.10.2 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.10.3 to adapt to many kinds of systems.
 Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.10.2:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.10.3:";;
   esac
  cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.10.2
+cpuminer-opt configure 3.10.3
 generated by GNU Autoconf 2.69
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
-It was created by cpuminer-opt $as_me 3.10.2, which was
+It was created by cpuminer-opt $as_me 3.10.3, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
  $ $0 $@
@@ -2993,7 +2993,7 @@ fi
 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.10.2'
+ VERSION='3.10.3'
 cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.10.2, which was
+This file was extended by cpuminer-opt $as_me 3.10.3, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
  CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.10.2
+cpuminer-opt config.status 3.10.3
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.10.2])
+AC_INIT([cpuminer-opt], [3.10.3])
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -3410,39 +3410,39 @@ bool check_cpu_capability ()
        printf(".\n");
     #endif
-     printf("CPU features:");
+     printf("CPU features: ");
     if      ( cpu_has_vaes   )    printf( " VAES"   );
     else if ( cpu_has_aes    )    printf( " AES"    );
     if      ( cpu_has_sha    )    printf( " SHA"    );
     if      ( cpu_has_avx512 )    printf( " AVX512" );
-     else if ( cpu_has_avx2   )    printf( " AVX2"   );
+     else if ( cpu_has_avx2   )    printf( " AVX2  " );
-     else if ( cpu_has_avx    )    printf( " AVX"    );
+     else if ( cpu_has_avx    )    printf( " AVX   " );
     else if ( cpu_has_sse42  )    printf( " SSE4.2" );
-     else if ( cpu_has_sse2   )    printf( " SSE2"   );
+     else if ( cpu_has_sse2   )    printf( " SSE2  " );
     if      ( cpu_has_vaes   )    printf( " VAES"   );
     else if ( cpu_has_aes    )    printf( "  AES"   );
     if      ( cpu_has_sha    )    printf( " SHA"    );
-     printf(".\nSW features:");
+     printf("\nSW features:  ");
     if      ( sw_has_vaes   )    printf( " VAES"   );
     else if ( sw_has_aes    )    printf( " AES"    );
     if      ( sw_has_sha    )    printf( " SHA"    );
     if      ( sw_has_avx512 )    printf( " AVX512" );
-     else if ( sw_has_avx2   )    printf( " AVX2"   );
+     else if ( sw_has_avx2   )    printf( " AVX2  " );
-     else if ( sw_has_avx    )    printf( " AVX"    );
+     else if ( sw_has_avx    )    printf( " AVX   " );
     else if ( sw_has_sse42  )    printf( " SSE4.2" );
-     else if ( sw_has_sse2   )    printf( " SSE2"   );
+     else if ( sw_has_sse2   )    printf( " SSE2  " );
     if      ( sw_has_vaes   )    printf( " VAES"   );
     else if ( sw_has_aes    )    printf( " AES "   );
     if      ( sw_has_sha    )    printf( " SHA"    );
-     printf(".\nAlgo features:");
+     printf("\nAlgo features:");
     if ( algo_features == EMPTY_SET ) printf( " None" );
     else
     {
        if      ( algo_has_vaes   )    printf( " VAES"   );
        else if ( algo_has_aes    )    printf( " AES"    );
        if      ( algo_has_sha    )    printf( " SHA"    );
        if      ( algo_has_avx512 )    printf( " AVX512" );
-        else if ( algo_has_avx2   )    printf( " AVX2"   );
+        else if ( algo_has_avx2   )    printf( " AVX2  " );
        else if ( algo_has_sse42  )    printf( " SSE4.2" );
-        else if ( algo_has_sse2   )    printf( " SSE2"   );
+        else if ( algo_has_sse2   )    printf( " SSE2  " );
        if      ( algo_has_vaes   )    printf( " VAES"   );
        else if ( algo_has_aes    )    printf( " AES "   );
        if      ( algo_has_sha    )    printf( " SHA"    );
     }
-     printf(".\n");
+     printf("\n");
     // Check for CPU and build incompatibilities
     if ( !cpu_has_sse2 )
@@ -3483,19 +3483,19 @@ bool check_cpu_capability ()
                   use_sha || use_vaes );
     // Display best options
-     printf( "Start mining with" );
+     printf( "\nStarting miner with" );
     if         ( use_none ) printf( " no optimizations" );
     else
     {
        if      ( use_vaes   ) printf( " VAES"   );
        else if ( use_aes    ) printf( " AES"    );
        if      ( use_avx512 ) printf( " AVX512" );
        else if ( use_avx2   ) printf( " AVX2"   );
        else if ( use_sse42  ) printf( " SSE4.2" );
        else if ( use_sse2   ) printf( " SSE2"   );
        if      ( use_vaes   ) printf( " VAES"   );
        else if ( use_aes    ) printf( " AES"    );
        if      ( use_sha    ) printf( " SHA"    );
     }
-     printf( ".\n\n" );
+     printf( "...\n\n" );
     return true;
 }
--- a/miner.h
+++ b/miner.h
@@ -874,9 +874,9 @@ Options:\n\
                          x16rt-veil    Veil (VEIL)\n\
                          x16s\n\
                          x17\n\
-                          x21s\n\
+                          x21s          Pigeoncoin (PGN)\n\
                          x22i\n\
-                          x25x\n\
+                          x25x          Sinovative (SIN)\n\
                          xevan         Bitsend (BSD)\n\
                          yescrypt      Globalboost-Y (BSTY)\n\
                          yescryptr8    BitZeny (ZNY)\n\
--- a/simd-utils/simd-128.h
+++ b/simd-utils/simd-128.h
@@ -252,7 +252,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 #else
 #define mm128_ror_64   mm128_ror_var_64
 #define mm128_rol_64   mm128_rol_var_64
 #define mm128_ror_32   mm128_ror_var_32
@@ -274,6 +273,15 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 #define mm128_ror_1x32( v )   _mm_shuffle_epi32( v, 0x39 )
 #define mm128_rol_1x32( v )   _mm_shuffle_epi32( v, 0x93 )
 // Rotate 16 byte (128 bit) vector by c bytes.
 // Less efficient using shift but more versatile. Use only for odd number
 // byte rotations. Use shuffle above whenever possible.
 #define mm128_ror_x8( v, c ) \
   _mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) )
 #define mm128_rol_x8( v, c ) \
   _mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) )
 #if defined (__SSE3__)
 // no SSE2 implementation, no current users
@@ -289,17 +297,21 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 #define mm128_rol_1x8( v ) \
   _mm_shuffle_epi8( v, m128_const_64( 0x0e0d0c0b0a090807, \
                                       0x060504030201000f ) )
-#endif  // SSE3
+#else  // SSE2
-// Rotate 16 byte (128 bit) vector by c bytes.
+#define mm128_ror_1x16( v ) \
-// Less efficient using shift but more versatile. Use only for odd number
+   _mm_or_si128( _mm_srli_si128( v, 2 ), _mm_slli_si128( v, 14 ) )
 // byte rotations. Use shuffle above whenever possible.
 #define mm128_bror( v, c ) \
   _mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) )
-#define mm128_brol( v, c ) \
+#define mm128_rol_1x16( v ) \
-   _mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) )
+   _mm_or_si128( _mm_slli_si128( v, 2 ), _mm_srli_si128( v, 14 ) )
 #define mm128_ror_1x8( v ) \
   _mm_or_si128( _mm_srli_si128( v, 1 ), _mm_slli_si128( v, 15 ) )
 #define mm128_rol_1x8( v ) \
   _mm_or_si128( _mm_slli_si128( v, 1 ), _mm_srli_si128( v, 15 ) )
 #endif   // SSE3 else SSE2
 // Invert vector: {3,2,1,0} -> {0,1,2,3}
 #define mm128_invert_32( v ) _mm_shuffle_epi32( v, 0x1b )
@@ -319,19 +331,24 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 //
 // Rotate elements within lanes.
-#define mm128_swap32_64( v )  _mm_shuffle_epi32( v, 0xb1 )
+#define mm128_swap_64_32( v )  _mm_shuffle_epi32( v, 0xb1 )
-#define mm128_ror16_64( v ) \
+#define mm128_rol64_8( v, c ) \
-   _mm_shuffle_epi8( v, m128_const_64( 0x09080f0e0d0c0b0a, \
+     _mm_or_si128( _mm_slli_epi64( v, ( ( (c)<<3 ) ), \
-                                       0x0100070605040302 )
+                   _mm_srli_epi64( v, ( ( 64 - ( (c)<<3 ) ) ) )
-#define mm128_rol16_64( v ) \
+#define mm128_ror64_8( v, c ) \
-   _mm_shuffle_epi8( v, m128_const_64( 0x0d0c0b0a09080f0e, \
+     _mm_or_si128( _mm_srli_epi64( v, ( ( (c)<<3 ) ), \
-                                       0x0504030201000706 )
+                   _mm_slli_epi64( v, ( ( 64 - ( (c)<<3 ) ) ) )
-#define mm128_swap16_32( v ) \
+#define mm128_rol32_8( v, c ) \
-   _mm_shuffle_epi8( v, m128_const_64( 0x0d0c0f0e09080b0a, \
+     _mm_or_si128( _mm_slli_epi32( v, ( ( (c)<<3 ) ), \
-                                       0x0504070601000302 )
+                   _mm_srli_epi32( v, ( ( 32 - ( (c)<<3 ) ) ) )
 // Rotate each 32 bit element right by c bytes (c*8 bits).
 // c must be 1..3; the shift counts are built from c, so pass a constant.
 #define mm128_ror32_8( v, c ) \
     _mm_or_si128( _mm_srli_epi32( v, ( (c)<<3 ) ), \
                   _mm_slli_epi32( v, ( 32 - ( (c)<<3 ) ) ) )
 //
 // Endian byte swap.
@@ -431,64 +448,65 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
 // Swap 128 bit vectors.
-#define mm128_swap128_256( v1, v2 ) \
+#define mm128_swap256_128( v1, v2 ) \
   v1 = _mm_xor_si128( v1, v2 ); \
   v2 = _mm_xor_si128( v1, v2 ); \
   v1 = _mm_xor_si128( v1, v2 );
 // Concatenate v1 & v2 and rotate as one 256 bit vector.
 #if defined(__SSE4_1__)
-#define mm128_ror1x64_256( v1, v2 ) \
+#define mm128_ror256_64( v1, v2 ) \
 do { \
   __m128i t  = _mm_alignr_epi8( v1, v2, 8 ); \
           v1 = _mm_alignr_epi8( v2, v1, 8 ); \
           v2 = t; \
 } while(0)
-#define mm128_rol1x64_256( v1, v2 ) \
+#define mm128_rol256_64( v1, v2 ) \
 do { \
   __m128i t  = _mm_alignr_epi8( v1, v2, 8 ); \
           v2 = _mm_alignr_epi8( v2, v1, 8 ); \
           v1 = t; \
 } while(0)
-#define mm128_ror1x32_256( v1, v2 ) \
+#define mm128_ror256_32( v1, v2 ) \
 do { \
   __m128i t  = _mm_alignr_epi8( v1, v2, 4 ); \
           v1 = _mm_alignr_epi8( v2, v1, 4 ); \
           v2 = t; \
 } while(0)
-#define mm128_rol1x32_256( v1, v2 ) \
+#define mm128_rol256_32( v1, v2 ) \
 do { \
   __m128i t  = _mm_alignr_epi8( v1, v2, 12 ); \
           v2 = _mm_alignr_epi8( v2, v1, 12 ); \
           v1 = t; \
 } while(0)
-#define mm128_ror1x16_256( v1, v2 ) \
+#define mm128_ror256_16( v1, v2 ) \
 do { \
   __m128i t  = _mm_alignr_epi8( v1, v2, 2 ); \
           v1 = _mm_alignr_epi8( v2, v1, 2 ); \
           v2 = t; \
 } while(0)
-#define mm128_rol1x16_256( v1, v2 ) \
+#define mm128_rol256_16( v1, v2 ) \
 do { \
   __m128i t  = _mm_alignr_epi8( v1, v2, 14 ); \
           v2 = _mm_alignr_epi8( v2, v1, 14 ); \
           v1 = t; \
 } while(0)
-#define mm128_ror1x8_256( v1, v2 ) \
+#define mm128_ror256_8( v1, v2 ) \
 do { \
   __m128i t  = _mm_alignr_epi8( v1, v2, 1 ); \
           v1 = _mm_alignr_epi8( v2, v1, 1 ); \
           v2 = t; \
 } while(0)
-#define mm128_rol1x8_256( v1, v2 ) \
+#define mm128_rol256_8( v1, v2 ) \
 do { \
   __m128i t  = _mm_alignr_epi8( v1, v2, 15 ); \
           v2 = _mm_alignr_epi8( v2, v1, 15 ); \
@@ -497,7 +515,7 @@ do { \
 #else  // SSE2
-#define mm128_ror1x64_256( v1, v2 ) \
+#define mm128_ror256_64( v1, v2 ) \
 do { \
   __m128i t  = _mm_or_si128( _mm_srli_si128( v1, 8 ), \
                              _mm_slli_si128( v2, 8 ) ); \
@@ -506,7 +524,7 @@ do { \
           v1 = t; \
 } while(0)
-#define mm128_rol1x64_256( v1, v2 ) \
+#define mm128_rol256_64( v1, v2 ) \
 do { \
   __m128i t  = _mm_or_si128( _mm_slli_si128( v1, 8 ), \
                              _mm_srli_si128( v2, 8 ) ); \
@@ -515,7 +533,7 @@ do { \
           v1 = t; \
 } while(0)
-#define mm128_ror1x32_256( v1, v2 ) \
+#define mm128_ror256_32( v1, v2 ) \
 do { \
   __m128i t  = _mm_or_si128( _mm_srli_si128( v1, 4 ), \
                              _mm_slli_si128( v2, 12 ) ); \
@@ -524,7 +542,7 @@ do { \
           v1 = t; \
 } while(0)
-#define mm128_rol1x32_256( v1, v2 ) \
+#define mm128_rol256_32( v1, v2 ) \
 do { \
   __m128i t  = _mm_or_si128( _mm_slli_si128( v1, 4 ), \
                              _mm_srli_si128( v2, 12 ) ); \
@@ -533,7 +551,7 @@ do { \
           v1 = t; \
 } while(0)
-#define mm128_ror1x16_256( v1, v2 ) \
+#define mm128_ror256_16( v1, v2 ) \
 do { \
   __m128i t  = _mm_or_si128( _mm_srli_si128( v1, 2 ), \
                              _mm_slli_si128( v2, 14 ) ); \
@@ -542,7 +560,7 @@ do { \
           v1 = t; \
 } while(0)
-#define mm128_rol1x16_256( v1, v2 ) \
+#define mm128_rol256_16( v1, v2 ) \
 do { \
   __m128i t  = _mm_or_si128( _mm_slli_si128( v1, 2 ), \
                              _mm_srli_si128( v2, 14 ) ); \
@@ -551,7 +569,7 @@ do { \
           v1 = t; \
 } while(0)
-#define mm128_ror1x8_256( v1, v2 ) \
+#define mm128_ror256_8( v1, v2 ) \
 do { \
   __m128i t  = _mm_or_si128( _mm_srli_si128( v1, 1 ), \
                              _mm_slli_si128( v2, 15 ) ); \
@@ -560,7 +578,7 @@ do { \
           v1 = t; \
 } while(0)
-#define mm128_rol1x8_256( v1, v2 ) \
+#define mm128_rol256_8( v1, v2 ) \
 do { \
   __m128i t  = _mm_or_si128( _mm_slli_si128( v1, 1 ), \
                              _mm_srli_si128( v2, 15 ) ); \
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -414,99 +414,71 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
 //
-// Rotate elements within lanes of 256 bit vector.
+// Rotate elements within each 128 bit lane of 256 bit vector.
-// Swap 64 bit elements in each 128 bit lane.
+#define mm256_swap128_64( v )   _mm256_shuffle_epi32( v, 0x4e )
 #define mm256_swap64_128( v )   _mm256_shuffle_epi32( v, 0x4e )
-// Rotate each 128 bit lane by one 32 bit element.
+#define mm256_ror128_32( v )  _mm256_shuffle_epi32( v, 0x39 )
 #define mm256_ror1x32_128( v )  _mm256_shuffle_epi32( v, 0x39 )
 #define mm256_rol1x32_128( v )  _mm256_shuffle_epi32( v, 0x93 )
-#define mm256_ror1x16_128( v ) \
+#define mm256_rol128_1x32( v )  _mm256_shuffle_epi32( v, 0x93 )
   _mm256_shuffle_epi8( v, \
         m256_const_64( 0x11101f1e1d1c1b1a, 0x1918171615141312, \
                        0x01000f0e0d0c0b0a, 0x0908070605040302 ) )
-#define mm256_rol1x16_128( v ) \
+// Rotate each 128 bit lane by c elements.
-   _mm256_shuffle_epi8( v, \
+#define mm256_ror128_8( v, c ) \
         m256_const_64( 0x1d1c1b1a19181716, 0x1514131211101f1e, \
                        0x0d0c0b0a09080706, 0x0504030201000f0e ) )
 #define mm256_ror1x8_128( v ) \
   _mm256_shuffle_epi8( v, \
         m256_const_64( 0x101f1e1d1c1b1a19, 0x1817161514131211, \
                        0x000f0e0d0c0b0a09, 0x0807060504030201 ) )
 #define mm256_rol1x8_128( v ) \
   _mm256_shuffle_epi8( v, \
         m256_const_64( 0x1d1c1b1a19181f1e, 0x1514131211101716, \
                        0x0d0c0b0a09080f0e, 0x0504030201000706 ) )
 // Rotate each 128 bit lane by c bytes.
 #define mm256_bror_128( v, c ) \
  _mm256_or_si256( _mm256_bsrli_epi128( v, c ), \
                   _mm256_bslli_epi128( v, 16-(c) ) )
-#define mm256_brol_128( v, c ) \
+#define mm256_rol128_8( v, c ) \
  _mm256_or_si256( _mm256_bslli_epi128( v, c ), \
                   _mm256_bsrli_epi128( v, 16-(c) ) )
-// Swap 32 bit elements in each 64 bit lane
+
-#define mm256_swap32_64( v )    _mm256_shuffle_epi32( v, 0xb1 )
+// Rotate elements in each 64 bit lane
 #define mm256_swap64_32( v )    _mm256_shuffle_epi32( v, 0xb1 )
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-#define mm256_rol1x16_64( v )   _mm256_rol_epi64( v, 16 )
+#define mm256_rol64_8( v, c )   _mm256_rol_epi64( v, ((c)<<3) ) 
-#define mm256_ror1x16_64( v )   _mm256_ror_epi64( v, 16 )
+#define mm256_ror64_8( v, c )   _mm256_ror_epi64( v, ((c)<<3) ) 
 #else
-#define mm256_ror1x16_64( v ) \
+#define mm256_rol64_8( v, c ) \
-   _mm256_shuffle_epi8( v, \
+     _mm256_or_si256( _mm256_slli_epi64( v, ( ( (c)<<3 ) ), \
-        m256_const_64( 0x19181f1e1d1c1b1a, 0x1110171615141312, \
+                      _mm256_srli_epi64( v, ( ( 64 - ( (c)<<3 ) ) ) )
-                       0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
+
 // Rotate each 64 bit element right by c bytes (c*8 bits).
 // c must be 1..7; the shift counts are built from c, so pass a constant.
 #define mm256_ror64_8( v, c ) \
     _mm256_or_si256( _mm256_srli_epi64( v, ( (c)<<3 ) ), \
                      _mm256_slli_epi64( v, ( 64 - ( (c)<<3 ) ) ) )
 #define mm256_rol1x16_64( v ) \
   _mm256_shuffle_epi8( v, \
        m256_const_64( 0x1d1c1b1a19181f1e, 0x1514131211101716, \
                       0x0d0c0b0a09080f0e, 0x0504030201000706 ) )
 #endif
 #define mm256_ror1x8_64( v ) \
   _mm256_shuffle_epi8( v, \
        m256_const_64( 0x181f1e1d1c1b1a19, 0x1017161514131211, \
                       0x080f0e0d0c0b0a09, 0x0007060504030201 ) )
-#define mm256_rol1x8_64( v ) \
+// Rotate elements in each 32 bit lane
   _mm256_shuffle_epi8( v, \
        m256_const_64( 0x1e1d1c1b1a19181f, 0x1615141312111017, \
                       0x0e0d0c0b0a09080f, 0x0605040302010007 ) )
 #define mm256_ror3x8_64( v ) \
   _mm256_shuffle_epi8( v, \
        m256_const_64( 0x1a19181f1e1d1c1b, 0x1211101716151413, \
                       0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
 #define mm256_rol3x8_64( v ) \
   _mm256_shuffle_epi8( v, \
        m256_const_64( 0x1c1b1a19181f1e1d, 0x1413121110171615, \
                       0x0c0b0a09080f0e0d, 0x0403020100070605 ) )
 // Swap 16 bit elements in each 32 bit lane
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-#define mm256_swap16_32( v )   _mm256_rol_epi32( v, 16 )
+#define mm256_swap32_16( v ) _mm256_rol_epi32( v, 16 )
 #define mm256_rol32_8( v )   _mm256_rol_epi32( v, 8 )
 #define mm256_ror32_8( v )   _mm256_ror_epi32( v, 8 )
 #else
-#define mm256_swap16_32( v ) \
+#define mm256_swap32_16( v ) \
-   _mm256_shuffle_epi8( v, \
+     _mm256_or_si256( _mm256_slli_epi32( v, 16 ), \
-         m256_const_64( 0x1b1a19181f1e1d1c, 0x1312111017161514, \
+                      _mm256_srli_epi32( v, 16 ) )
-                        0x0b0a09080f0e0d0c, 0x0302010007060504 ) )
+
 // Rotate each 32 bit element left by 8 bits. The bits shifted out on the
 // left re-enter on the right, so the complementary shift is 32 - 8 = 24.
 #define mm256_rol32_8( v ) \
     _mm256_or_si256( _mm256_slli_epi32( v, 8 ), \
                      _mm256_srli_epi32( v, 24 ) )
 // Rotate each 32 bit element right by 8 bits. The bits shifted out on the
 // right re-enter on the left, so the complementary shift is 32 - 8 = 24.
 // The c argument is not used; it is kept so existing callers still compile.
 #define mm256_ror32_8( v, c ) \
     _mm256_or_si256( _mm256_srli_epi32( v, 8 ), \
                      _mm256_slli_epi32( v, 24 ) )
 #endif
 //
 // Swap bytes in vector elements, endian bswap.
 #define mm256_bswap_64( v ) \
@@ -565,19 +537,19 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
 //  _mm256_alignr_epi 64/32 are only available with AVX512 but AVX512 also
 //  makes these macros unnecessary.
-#define mm256_swap256_512 (v1, v2) \
+#define mm256_swap512_256( v1, v2 ) \
-   v1 = _mm256_xor_si256(v1, v2); \
+   v1 = _mm256_xor_si256( v1, v2 ); \
-   v2 = _mm256_xor_si256(v1, v2); \
+   v2 = _mm256_xor_si256( v1, v2 ); \
-   v1 = _mm256_xor_si256(v1, v2);
+   v1 = _mm256_xor_si256( v1, v2 );
-#define mm256_ror1x128_512( v1, v2 ) \
+#define mm256_ror512_128( v1, v2 ) \
 do { \
   __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \
   v1 = _mm256__mm256_permute2x128( v2, v1, 0x21 ); \
   v2 = t; \
 } while(0)
-#define mm256_rol1x128_512( v1, v2 ) \
+#define mm256_rol512_128( v1, v2 ) \
 do { \
   __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \
   v2 = _mm256__mm256_permute2x128( v2, v1, 0x21 ); \
--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -15,13 +15,13 @@
 //  AVX512 intrinsics have a few changes from previous conventions.
 //
-//    Some instructions like cmp and blend use the mask regsiters now instead
+//    cmp instruction now returns a bitmask instead of a vector mask.
-//    a vector mask.
+//    This eliminates the need for the blendv instruction.
 //
-//    The new rotate instructions require the count to be only an 8 bit
+//    The new rotate instructions require the count to be an 8 bit
-//    immediate value. The documentation is the same as for shift and
+//    immediate value only. Compilation fails if a variable is used.
-//    it allows variables. Suspect a compiler issue but it still happens
+//    The documentation is the same as for shift and it works with
-//    in GCC9.
+//    variables.
 //
 //    _mm512_permutex_epi64 only shuffles within 256 bit lanes. Permute
 //    usually shuffles across all lanes.
@@ -109,6 +109,11 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
 #define m512_const2_64( i1, i0 ) \
   m512_const1_128( m128_const_64( i1, i0 ) )
 // Pack two 32 bit values into one 64 bit value, i1 in the upper half and
 // i0 (masked to 32 bits) in the lower half, and pass it to m512_const1_64.
 // NOTE(review): m512_const1_64 is presumably a 64 bit broadcast - confirm.
 #define m512_const2_32( i1, i0 ) \
   m512_const1_64( ( ( ( (uint64_t)(i1) << 32 ) ) \
                     | ( (uint64_t)(i0) & 0xffffffff ) ) )
 static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
                                      const uint64_t i1, const uint64_t i0 )
 {
@@ -304,8 +309,8 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 { \
  __m512i ctl = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
                               0x2c2d2e2f28292a2b, 0x2425262720212223, \
-                               0x0c0d0e0f08090a0b, 0x0405060700010203, \
+                               0x1c1d1e1f18191a1b, 0x1415161710111213, \
-                               0x1c1d1e1f18191a1b, 0x1415161710111213 ); \
+                               0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
  casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
  casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
  casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
@@ -320,8 +325,10 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 //
 // Rotate elements in 512 bit vector.
 #define mm512_swap_256( v )        _mm512_alignr_epi64( v, v, 4 )
 // 1x64 notation used to distinguish from bit rotation.
 #define mm512_ror_1x128( v )       _mm512_alignr_epi64( v, v, 2 )
 #define mm512_rol_1x128( v )       _mm512_alignr_epi64( v, v, 6 )
@@ -401,51 +408,58 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 //
 // Rotate elements within 256 bit lanes of 512 bit vector.
 // Rename these for consistency. Element size is always last.
 // mm<vectorsize>_<op><lanesize>_<elementsize>
 // Swap hi & lo 128 bits in each 256 bit lane
-#define mm512_swap128_256( v )   _mm512_permutex_epi64( v, 0x4e )
+
 #define mm512_swap256_128( v )   _mm512_permutex_epi64( v, 0x4e )
 // Rotate 256 bit lanes by one 64 bit element
-#define mm512_ror1x64_256( v )   _mm512_permutex_epi64( v, 0x39 )
+
-#define mm512_rol1x64_256( v )   _mm512_permutex_epi64( v, 0x93 )
+#define mm512_ror256_64( v )   _mm512_permutex_epi64( v, 0x39 )
 #define mm512_rol256_64( v )   _mm512_permutex_epi64( v, 0x93 )
 // Rotate 256 bit lanes by one 32 bit element
-#define mm512_ror1x32_256( v ) \
+
 #define mm512_ror256_32( v ) \
   _mm512_permutexvar_epi32( m512_const_64( \
                      0x000000080000000f, 0x0000000e0000000d, \
                      0x0000000c0000000b, 0x0000000a00000009, \
                      0x0000000000000007, 0x0000000600000005, \
                      0x0000000400000003, 0x0000000200000001 ), v )
-#define mm512_rol1x32_256( v ) \
+#define mm512_rol256_32( v ) \
   _mm512_permutexvar_epi32( m512_const_64( \
                      0x0000000e0000000d, 0x0000000c0000000b, \
                      0x0000000a00000009, 0x000000080000000f, \
                      0x0000000600000005, 0x0000000400000003, \
                      0x0000000200000001, 0x0000000000000007 ), v )
-#define mm512_ror1x16_256( v ) \
+#define mm512_ror256_16( v ) \
   _mm512_permutexvar_epi16( m512_const_64( \
                     0x00100001001e001d, 0x001c001b001a0019, \
                     0x0018001700160015, 0x0014001300120011, \
                     0x0000000f000e000d, 0x000c000b000a0009, \
                     0x0008000700060005, 0x0004000300020001 ), v )
-#define mm512_rol1x16_256( v ) \
+#define mm512_rol256_16( v ) \
   _mm512_permutexvar_epi16( m512_const_64( \
                     0x001e001d001c001b, 0x001a001900180017, \
                     0x0016001500140013, 0x001200110010001f, \
                     0x000e000d000c000b, 0x000a000900080007, \
                     0x0006000500040003, 0x000200010000000f ), v )
-#define mm512_ror1x8_256( v ) \
+#define mm512_ror256_8( v ) \
   _mm512_shuffle_epi8( v, m512_const_64( \
                     0x203f3e3d3c3b3a39, 0x3837363534333231, \
                     0x302f2e2d2c2b2a29, 0x2827262524232221, \
                     0x001f1e1d1c1b1a19, 0x1817161514131211, \
                     0x100f0e0d0c0b0a09, 0x0807060504030201 ), v )
-#define mm512_rol1x8_256( v ) \
+#define mm512_rol256_8( v ) \
   _mm512_shuffle_epi8( v, m512_const_64( \
                     0x3e3d3c3b3a393837, 0x363534333231302f, \
                     0x2e2d2c2b2a292827, 0x262524232221203f, \
@@ -456,45 +470,19 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 // Rotate elements within 128 bit lanes of 512 bit vector.
 // Swap hi & lo 64 bits in each 128 bit lane
-#define mm512_swap64_128( v )    _mm512_shuffle_epi32( v, 0x4e )
+#define mm512_swap128_64( v )    _mm512_shuffle_epi32( v, 0x4e )  // 0x4e: result dwords 0..3 <- source dwords 2,3,0,1 of the same lane
 // Rotate 128 bit lanes by one 32 bit element
-#define mm512_ror1x32_128( v )   _mm512_shuffle_epi32( v, 0x39 )
+#define mm512_ror128_32( v )   _mm512_shuffle_epi32( v, 0x39 )  // 0x39: result dwords 0..3 <- source dwords 1,2,3,0 of the same lane
-#define mm512_rol1x32_128( v )   _mm512_shuffle_epi32( v, 0x93 )
+#define mm512_rol128_32( v )   _mm512_shuffle_epi32( v, 0x93 )  // 0x93: result dwords 0..3 <- source dwords 3,0,1,2 of the same lane
 // Rotate each 128 bit lane of v right by one 16 bit element: within each
 // group of eight words, result element i is read from element i+1 and the
 // top element wraps to the bottom one (7 <- 0, 15 <- 8, ...).
 // NOTE(review): the table is read assuming m512_const_64 takes its qwords
 // most significant first -- confirm against its definition.
 #define mm512_ror1x16_128( v ) \
    _mm512_permutexvar_epi16( m512_const_64( \
                     0x0018001f001e001d, 0x001c001b001a0019, \
                     0x0010001700160015, 0x0014001300120011, \
                     0x0008000f000e000d, 0x000c000b000a0009, \
                     0x0000000700060005, 0x0004000300020001 ), v ) 
-#define mm512_rol1x16_128( v ) \
+// Rotate 128 bit lanes by c bytes, faster than building that monstrous 
-    _mm512_permutexvar_epi16( m512_const_64( \
+// constant above.  
-                     0x001e001d001c001b, 0x001a00190018001f, \
+#define mm512_ror128_8( v, c ) \
                     0x0016001500140013, 0x0012001100100017, \
                     0x000e000d000c000b, 0x000a00090008000f, \
                     0x0006000500040003, 0x0002000100000007 ), v ) 
 // Rotate each 128 bit lane of v right by one byte: result byte i is read
 // from byte i+1 of the same lane and byte 15 is read from byte 0.
 // _mm512_shuffle_epi8 selects within each 128 bit lane using only the low
 // 4 bits of every index byte, so the high nibbles in the table (0x1_, 0x2_,
 // 0x3_) merely label the lane and do not affect the result.
 #define mm512_ror1x8_128( v ) \
    _mm512_shuffle_epi8( v, m512_const_64( \
                     0x303f3e3d3c3b3a39, 0x3837363534333231, \
                     0x202f2e2d2c2b2a29, 0x2827262524232221, \
                     0x101f1e1d1c1b1a19, 0x1817161514131211, \
                     0x000f0e0d0c0b0a09, 0x0807060504030201 ) )
 // Rotate each 128 bit lane of v left by one byte: result byte i is read
 // from byte i-1 of the same lane and byte 0 is read from byte 15.
 // _mm512_shuffle_epi8 selects within each 128 bit lane using only the low
 // 4 bits of every index byte, so the high nibbles in the table merely
 // label the lane and do not affect the result.
 #define mm512_rol1x8_128( v ) \
    _mm512_shuffle_epi8( v, m512_const_64( \
                     0x3e3d3c3b3a393837, 0x363534333231303f, \
                     0x2e2d2c2b2a292827, 0x262524232221202f, \
                     0x1e1d1c1b1a191817, 0x161514131211101f, \
                     0x0e0d0c0b0a090807, 0x060504030201000f ) )
 // Rotate each 128 bit lane of v right by c bytes: the lane is shifted
 // right by c bytes and OR-ed with the same lane shifted left by 16-c
 // bytes, which supplies the bytes that wrapped around.
 // c is forwarded to byte-shift intrinsics that take an immediate, so it
 // must be a compile time constant. v is expanded twice.
 #define mm512_bror_128( v, c ) \
   _mm512_or_si512( _mm512_bsrli_epi128( v, c ), \
                    _mm512_bslli_epi128( v, 16-(c) ) )
-#define mm512_brol_128( v, c ) \
 // Rotate each 128 bit lane of v left by c bytes: the lane is shifted left
 // by c bytes and OR-ed with the same lane shifted right by 16-c bytes,
 // which supplies the bytes that wrapped around.
 // c is forwarded to byte-shift intrinsics that take an immediate, so it
 // must be a compile time constant. v is expanded twice.
+#define mm512_rol128_8( v, c ) \
   _mm512_or_si512( _mm512_bslli_epi128( v, c ), \
                    _mm512_bsrli_epi128( v, 16-(c) ) )
@@ -502,75 +490,23 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 //
 // Rotate elements within 64 bit lanes.
 // Rotate each 64 bit lane left / right by c*8 bits (c<<3), i.e. by c bytes.
 #define mm512_rol64_x8( v, c )   _mm512_rol_epi64( v, ((c)<<3) )
 #define mm512_ror64_x8( v, c )   _mm512_ror_epi64( v, ((c)<<3) )
 // Swap the two 32 bit elements in each 64 bit lane.
 // 0xb1 selects source dwords 1,0,3,2 for result dwords 0..3 of every
 // 128 bit lane, which exchanges the halves of both 64 bit lanes in it.
-#define mm512_swap32_64( v )      _mm512_shuffle_epi32( v, 0xb1 )
+#define mm512_swap64_32( v )      _mm512_shuffle_epi32( v, 0xb1 )
 // Rotate each 64 bit lane by one 16 bit element, done as a 16 bit rotate
 // of the whole 64 bit lane.
-#define mm512_ror1x16_64( v )   _mm512_ror_epi64( v, 16 )
+#define mm512_ror64_16( v )   _mm512_ror_epi64( v, 16 )
-#define mm512_rol1x16_64( v )   _mm512_rol_epi64( v, 16 )
+#define mm512_rol64_16( v )   _mm512_rol_epi64( v, 16 )
 // Rotate each 64 bit lane by one byte, done as an 8 bit rotate of the
 // whole 64 bit lane.
-#define mm512_ror1x8_64( v )    _mm512_ror_epi64( v, 8 )
+#define mm512_ror64_8( v )    _mm512_ror_epi64( v, 8 )
-#define mm512_rol1x8_64( v )    _mm512_rol_epi64( v, 8 )
+#define mm512_rol64_8( v )    _mm512_rol_epi64( v, 8 )
 /*
 #define mm512_ror1x16_64( v ) \
    _mm512_permutexvar_epi16( m512_const_64( \
                      0x001c001f001e001d, 0x0018001b001a0019, \
                      0x0014001700160015, 0x0010001300120011, \
                      0x000c000f000e000d, 0x0008000b000a0009, \
                      0x0004000700060005, 0x0000000300020001 ), v )
 #define mm512_rol1x16_64( v ) \
    _mm512_permutexvar_epi16( m512_const_64( \
                      0x001e001d001c001f, 0x001a00190018001b, \
                      0x0016001500140017, 0x0012001100100013, \
                      0x000e000d000c000f, 0x000a00090008000b, \
                      0x0006000500040007, 0x0002000100000003 ), v )
 // Rotate each 64 bit lane by one byte.
 #define mm512_ror1x8_64( v ) \
    _mm512_shuffle_epi8( v, m512_const_64( \
                      0x383F3E3D3C3B3A39, 0x3037363534333231, \
                      0x282F2E2D2C2B2A29, 0x2027262524232221, \
                      0x181F1E1D1C1B1A19, 0x1017161514131211, \
                      0x080F0E0D0C0B0A09, 0x0007060504030201 ) )
 #define mm512_rol1x8_64( v ) \
    _mm512_shuffle_epi8( v, m512_const_64( \
                       0x3E3D3C3B3A39383F, 0x3635343332313037, \
                       0x2E2D2C2B2A29282F, 0x2625242322212027, \
                       0x1E1D1C1B1A19181F, 0x1615141312111017, \
                       0x0E0D0C0B0A09080F, 0x0605040302010007 ) )
 */
 //
 // Rotate elements within 32 bit lanes.
-#define mm512_swap16_32( v )   _mm512_ror_epi32( v, 16 )
+#define mm512_rol32_x8( v, c )   _mm512_rol_epi32( v, ((c)<<3) )  // rotate each 32 bit lane left by c bytes = c*8 bits, as in mm512_rol64_x8
-#define mm512_ror1x8_32( v )   _mm512_ror_epi32( v, 8 )
+#define mm512_ror32_x8( v, c )   _mm512_ror_epi32( v, ((c)<<3) )  // rotate each 32 bit lane right by c bytes = c*8 bits, as in mm512_ror64_x8
 #define mm512_rol1x8_32( v )   _mm512_rol_epi32( v, 8 )
 /*
 #define mm512_swap16_32( v ) \
   _mm512_permutexvar_epi16( m512_const_64( \
                       0x001e001f001c001d, 0x001a001b00180019, \
                       0x0016001700140015, 0x0012001300100011, \
                       0x000e000f000c000d, 0x000a000b00080009, \
                       0x0006000700040005, 0x0002000300000001 ), v )
 #define mm512_ror1x8_32( v ) \
   _mm512_shuffle_epi8( v, m512_const_64( \
                       0x3C3F3E3D383B3A39, 0x3437363530333231, \
                       0x2C2F2E2D282B2A29, 0x2427262520232221, \
                       0x1C1F1E1D181B1A19, 0x1417161510131211, \
                       0x0C0F0E0D080B0A09, 0x0407060500030201 ))
 #define mm512_rol1x8_32( v ) \
   _mm512_shuffle_epi8( v, m512_const_64( \
                       0x3E3D3C3F3A39383B, 0x3635343732313033, \
                       0x2E2D2C2F2A29282B, 0x2625242722212023, \
                       0x1E1D1C1F1A19181B, 0x1615141712111013, \
                       0x0E0D0C0F0A09080B, 0x0605040702010003 ) )
 */
 //
@@ -579,61 +515,61 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 //  These can all be done with 2 permutex2var instructions but they are
 //  slower than either xor or alignr and require AVX512VBMI.
-#define mm512_swap512_1024(v1, v2) \
+#define mm512_swap1024_512(v1, v2) \
   v1 = _mm512_xor_si512(v1, v2); \
   v2 = _mm512_xor_si512(v1, v2); \
   v1 = _mm512_xor_si512(v1, v2);
-#define mm512_ror1x256_1024( v1, v2 ) \
 // Rotate the 1024 bit value v2:v1 (v1 is the low half) right by 256 bits,
 // i.e. every 64 bit element moves four positions toward index 0 and the
 // lowest four wrap to the top. Both arguments are updated in place and
 // must be modifiable lvalues.
 // The temporary holds the new v2 because the new v1 is also built from
 // the original v1 and v2. It is named t, so do not pass a variable named t.
+#define mm512_ror1024_256( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
   v1 = _mm512_alignr_epi64( v2, v1, 4 ); \
   v2 = t; \
 } while(0)
-#define mm512_rol1x256_1024( v1, v2 ) \
 // Rotate the 1024 bit value v2:v1 (v1 is the low half) left by 256 bits,
 // i.e. every 64 bit element moves four positions away from index 0 and
 // the highest four wrap to the bottom. Both arguments are updated in
 // place and must be modifiable lvalues.
 // The temporary holds the new v1 because the new v2 is also built from
 // the original v1 and v2. It is named t, so do not pass a variable named t.
+#define mm512_rol1024_256( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
   v2 = _mm512_alignr_epi64( v2, v1, 4 ); \
   v1 = t; \
 } while(0)
-#define mm512_ror1x128_1024( v1, v2 ) \
 // Rotate the 1024 bit value v2:v1 (v1 is the low half) right by 128 bits:
 // every 64 bit element moves two positions toward index 0 and the lowest
 // two wrap to the top. Both arguments are updated in place and must be
 // modifiable lvalues; the temporary is named t, so do not pass a variable
 // named t.
+#define mm512_ror1024_128( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 2 ); \
   v1 = _mm512_alignr_epi64( v2, v1, 2 ); \
   v2 = t; \
 } while(0)
-#define mm512_rol1x128_1024( v1, v2 ) \
 // Rotate the 1024 bit value v2:v1 (v1 is the low half) left by 128 bits.
 // A left rotate by two 64 bit elements is expressed as an alignr shift of
 // 6 (= 8 - 2) with the roles of the two results exchanged. Both arguments
 // are updated in place and must be modifiable lvalues; the temporary is
 // named t, so do not pass a variable named t.
+#define mm512_rol1024_128( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 6 ); \
   v2 = _mm512_alignr_epi64( v2, v1, 6 ); \
   v1 = t; \
 } while(0)
-#define mm512_ror1x64_1024( v1, v2 ) \
 // Rotate the 1024 bit value v2:v1 (v1 is the low half) right by 64 bits:
 // every 64 bit element moves one position toward index 0 and the lowest
 // one wraps to the top. Both arguments are updated in place and must be
 // modifiable lvalues; the temporary is named t, so do not pass a variable
 // named t.
+#define mm512_ror1024_64( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 1 ); \
   v1 = _mm512_alignr_epi64( v2, v1, 1 ); \
   v2 = t; \
 } while(0)
-#define mm512_rol1x64_1024( v1, v2 ) \
 // Rotate the 1024 bit value v2:v1 (v1 is the low half) left by 64 bits.
 // A left rotate by one 64 bit element is expressed as an alignr shift of
 // 7 (= 8 - 1) with the roles of the two results exchanged. Both arguments
 // are updated in place and must be modifiable lvalues; the temporary is
 // named t, so do not pass a variable named t.
+#define mm512_rol1024_64( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 7 ); \
   v2 = _mm512_alignr_epi64( v2, v1, 7 ); \
   v1 = t; \
 } while(0)
-#define mm512_ror1x32_1024( v1, v2 ) \
 // Rotate the 1024 bit value v2:v1 (v1 is the low half) right by 32 bits:
 // every 32 bit element moves one position toward index 0 and the lowest
 // one wraps to the top. Uses the 32 bit alignr, unlike the wider rotates.
 // Both arguments are updated in place and must be modifiable lvalues; the
 // temporary is named t, so do not pass a variable named t.
+#define mm512_ror1024_32( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi32( v1, v2, 1 ); \
   v1 = _mm512_alignr_epi32( v2, v1, 1 ); \
   v2 = t; \
 } while(0)
-#define mm512_rol1x32_1024( v1, v2 ) \
+#define mm512_rol1024_32( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi32( v1, v2, 15 ); \
   v2 = _mm512_alignr_epi32( v2, v1, 15 ); \